# TRAINING I3W


# A) Create Environment, Vehicles, etc.

### General Parameter

In [1]:
# Length of one rollout. Defined once here so that every later cell uses the
# same value.
# Note from the original author: use 103 as the maximum horizon if the rollout
# should be cut off before leaving (presumably before the vehicles leave the
# network -- confirm); the default was 500.
HORIZON = 500

# name of the experiment
experiment_name = "IntersectionExample"

# scenario class: pick one of the names listed by the print below
import flow.scenarios as scenarios
print("Available scenarios:", scenarios.__all__, sep="\n")
scenario_name = "IntersectionTWScenario_2"

# environment class: pick one of the names listed by the print below
import flow.multiagent_envs as flowenvs
print("\nAvailable environments:", flowenvs.__all__, sep="\n")
env_name = "MultiAgentIntersectionEnv_sharedPolicy_2veh"

Available scenarios:
['Scenario', 'BayBridgeScenario', 'BayBridgeTollScenario', 'BottleneckScenario', 'Figure8Scenario', 'SimpleGridScenario', 'HighwayScenario', 'LoopScenario', 'MergeScenario', 'TwoLoopsOneMergingScenario', 'MultiLoopScenario', 'IntersectionScenarioTW', 'TenaciousDScenario', 'IntersectionTWScenario_2']

Available environments:
['MultiEnv', 'MultiAgentAccelEnv', 'MultiWaveAttenuationPOEnv', 'MultiAgentIntersectionEnv', 'MultiAgentTeamSpiritIntersectionEnv', 'MultiAgentIntersectionEnv_baseline_1', 'MultiAgentIntersectionEnv_baseline_2', 'MultiAgentIntersectionEnv_baseline_3', 'MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit', 'MultiTenaciousDEnv', 'MultiAgentIntersectionEnv_sharedPolicy_2veh']


### Net Parameter

In [2]:
from flow.core.params import NetParams
from flow.scenarios.intersection import ADDITIONAL_NET_PARAMS

# Scenario-specific network settings, passed through NetParams.additional_params.
additionalNetParams = dict(
    edge_length=80,
    lanes=1,
    speed_limit=30,
)

net_params = NetParams(
    # default: True -- set to False so that the intersections are not skipped
    no_internal_links=False,
    # default: None -- the scenario-specific settings defined above
    additional_params=additionalNetParams,
    # the remaining arguments are left at their default (None)
    inflows=None,
    osm_path=None,
    netfile=None,
)

### InitialConfig Parameter

In [3]:
from flow.core.params import InitialConfig

# Initial placement of the vehicles at the start of / on reset of a rollout.
initial_config = InitialConfig(
    # --- values that deviate from the defaults -------------------------------
    shuffle=True,                       # default: False
    spacing="custom",                   # default: "uniform"
    perturbation=29.99,                 # default: 0.0
    min_gap=10,                         # default: 0
    # --- values equal to the defaults, spelled out for reference -------------
    x0=0,                               # default: 0
    bunching=0,                         # default: 0
    lanes_distribution=float("inf"),    # default: float("inf")
    edges_distribution="all",           # default: "all"
    additional_params=None,             # default: None
)

### SUMO Parameter

In [4]:
from flow.core.params import SumoParams

# SUMO simulator settings. Every argument is written out explicitly; according
# to the defaults noted next to each one, all values equal the default.
# NOTE(review): seed=None -- presumably the simulation is not seeded, so runs
# may not be reproducible; confirm and pin a seed if reproducibility matters.
sumo_params = SumoParams(
    # simulator process / connection
    sumo_binary=None,           # default: None
    port=None,                  # default: None
    num_clients=1,              # default: 1
    restart_instance=False,     # default: False
    seed=None,                  # default: None
    # stepping and vehicle behaviour
    sim_step=0.1,               # default: 0.1
    lateral_resolution=None,    # default: None
    overtake_right=False,       # default: False
    teleport_time=-1,           # default: -1
    # rendering
    render=False,               # default: False
    save_render=False,          # default: False
    sight_radius=25,            # default: 25
    show_radius=False,          # default: False
    pxpm=2,                     # default: 2
    # logging / output
    emission_path=None,         # default: None
    no_step_log=True,           # default: True
    print_warnings=True,        # default: True
)

### Environment Parameter

In [5]:
from flow.core.params import EnvParams

# Environment-specific settings, passed through EnvParams.additional_params.
additionalEnvParams = dict(
    max_accel=3,            # maximum acceleration of autonomous vehicles
    max_decel=3,            # maximum deceleration of autonomous vehicles
    target_velocity=30,
)

env_params = EnvParams(
    # rollout length, shared with the training configuration via HORIZON
    horizon=HORIZON,                            # default: 500
    additional_params=additionalEnvParams,      # default: None
    # values equal to the defaults
    warmup_steps=0,                             # default: 0
    sims_per_step=1,                            # default: 1
    evaluate=False,                             # default: False
)

### Vehicles Parameter

In [6]:
# Container that collects the vehicle types placed in the network.
from flow.core.params import VehicleParams

# import vehicles dynamics models
# (the two commented-out imports are the controllers named as defaults in the
#  vehicles.add(...) comments of the next cell; they are not needed here)
#from flow.controllers import SumoCarFollowingController
from flow.controllers import ContinuousRouter
#from flow.controllers.lane_change_controllers import SumoLaneChangeController
from flow.controllers.lane_change_controllers import StaticLaneChanger
from flow.controllers import RLController
from flow.core.params import SumoLaneChangeParams
from flow.core.params import SumoCarFollowingParams
# NOTE(review): wildcard import -- it pulls every public name of `random`
# (e.g. `seed`, `shuffle`, `choice`) into the notebook namespace and nothing in
# the visible cells uses it. Prefer `import random` once it is confirmed that
# no later cell relies on the bare names.
from random import *

# Empty vehicle container; the following cell fills it via vehicles.add(...).
vehicles = VehicleParams()

#### Add RL-Agent controlled vehicles 

In [7]:
# Start from an empty container so that this cell is idempotent: vehicles.add()
# mutates the container, and re-running the cell on the container left over
# from a previous execution would add a second batch of "rl" vehicles.
vehicles = VehicleParams()

# car following parameters, default: None
cf_parameter = SumoCarFollowingParams(speed_mode="aggressive")
# lane change parameters, default: None (currently not passed to vehicles.add)
lc_parameter = None

vehicles.add(
    # name of the vehicle
    veh_id="rl",
    # acceleration controller, default: (SumoCarFollowingController, {})
    acceleration_controller=(RLController, {}),
    # lane change controller, default: (SumoLaneChangeController, {})
    lane_change_controller=(StaticLaneChanger, {}),
    # routing controller, default: None
    routing_controller=(ContinuousRouter, {}),
    # initial speed, default: 0
    initial_speed=0,
    # number of vehicles, default: 1
    num_vehicles=2,
    # car following parameters (carries speed_mode="aggressive", see above)
    car_following_params=cf_parameter,
    # Disabled keyword arguments kept from the original cell for reference:
    #   speed_mode="aggressive"                 (default: "right_of_way")
    #   lane_change_mode="aggressive"           (default: "no_lat_collide")
    #   sumo_car_following_params=cf_parameter  (default: None)
    #   sumo_lc_params=lc_parameter             (default: None)
)

### Flow Parameter

In [8]:
# Bundle everything defined above into the flow_params dictionary.
# The keys must be spelled exactly as below.
flow_params = {
    # name of the experiment
    "exp_tag": experiment_name,
    # name of the flow environment the experiment is running on
    "env_name": env_name,
    # name of the scenario class the experiment uses
    "scenario": scenario_name,
    # simulator that is used by the experiment
    "simulator": 'traci',
    # sumo-related parameters (see flow.core.params.SumoParams)
    "sim": sumo_params,
    # environment related parameters (see flow.core.params.EnvParams)
    "env": env_params,
    # network-related parameters (see flow.core.params.NetParams and the
    # scenario's documentation or ADDITIONAL_NET_PARAMS component)
    "net": net_params,
    # vehicles to be placed in the network at the start of a rollout
    # (the flow.core.params.VehicleParams container built above)
    "veh": vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    "initial": initial_config,
}

# B) Training

In [9]:
import json

import ray
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph

In [10]:
# number of rollouts per training iteration
N_ROLLOUTS = 20
# number of parallel workers
N_CPUS = 2

# Request one CPU more than there are rollout workers (presumably for the
# trainer/driver process -- confirm). Kept as the last expression so that the
# connection info returned by ray.init is shown as the cell output.
ray.init(num_cpus=N_CPUS + 1, redirect_output=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-30_22-50-45_23679/logs.
Waiting for redis server at 127.0.0.1:31122 to respond...
Waiting for redis server at 127.0.0.1:16830 to respond...
Starting the Plasma object store with 6.554658406 GB memory using /dev/shm.

View the web UI at http://localhost:8888/notebooks/ray_ui.ipynb?token=4e6dc2980ee4b41c8417906b374bf2f6993af1f858e799ce



{'node_ip_address': '172.16.123.117',
 'object_store_addresses': ['/tmp/ray/session_2019-04-30_22-50-45_23679/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-30_22-50-45_23679/sockets/raylet'],
 'redis_address': '172.16.123.117:31122',
 'webui_url': 'http://localhost:8888/notebooks/ray_ui.ipynb?token=4e6dc2980ee4b41c8417906b374bf2f6993af1f858e799ce'}

In [11]:
from copy import deepcopy

# The algorithm or model to train. This may refer to the name of a built-in
# algorithm (e.g. RLlib's DQN or PPO), or a user-defined trainable function or
# class registered in the tune registry.
alg_run = "PPO"

agent_cls = get_agent_class(alg_run)
# Deep copy on purpose: the nested dicts ("model", "env_config") are modified
# below, and a shallow .copy() would write those changes into the agent
# class's shared default configuration.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS  # number of parallel workers
config["train_batch_size"] = HORIZON * N_ROLLOUTS  # batch size
config["gamma"] = 0.999  # discount rate, default 0.999
config["model"].update({"fcnet_hiddens": [100, 50, 25]})  # hidden layer sizes, default 64 32
config["use_gae"] = True  # using generalized advantage estimation
config["lambda"] = 0.97
# Disabled settings kept for reference:
#config["sgd_minibatch_size"] = min(16 * 1024, config["train_batch_size"])  # stochastic gradient descent
#config["sample_batch_size"] = config["train_batch_size"]/config["num_workers"] # 200 default, still too high?
config["kl_target"] = 0.02  # target KL divergence
config["num_sgd_iter"] = 10  # number of SGD iterations
config["horizon"] = HORIZON  # rollout horizon

# save the flow params for replay
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)  # generating a string version of flow_params
config['env_config']['flow_params'] = flow_json  # adding the flow_params to config dict
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register as rllib env with Gym
register_env(gym_name, create_env)

In [12]:
# Multi-agent policy mapping.
# An environment instance is created only to read its observation and action
# spaces.
# NOTE(review): test_env is never closed in this cell, and the cell output
# shows a SUMO instance being started -- check whether it should be terminated
# once the spaces have been read.
test_env = create_env()
obs_space = test_env.observation_space
act_space = test_env.action_space


def gen_policy():
    """Return the policy spec tuple: (policy class, obs space, action space, extra config)."""
    return (PPOPolicyGraph, obs_space, act_space, {})


def policy_mapping_fn(agent_id):
    """Map every agent id to the single shared policy 'rl_0'."""
    return 'rl_0'


# A single policy graph, 'rl_0', shared by all agents.
policy_graphs = {'rl_0': gen_policy()}

config['multiagent'] = {
    'policy_graphs': policy_graphs,
    'policy_mapping_fn': tune.function(policy_mapping_fn),
    'policies_to_train': ['rl_0'],
}

 Starting SUMO on port 52923


New Teamspirit:
-0.3921303351670222
0.7482375742940759
[('bottom_intersection', 2.6936290634791717), ('top_intersection', 18.952837148955812)]


In [None]:
# Launch the training run; the experiment is keyed by its tag from flow_params.
trials = run_experiments({
    flow_params["exp_tag"]: dict(
        # RL algorithm to run
        run=alg_run,
        # environment name generated earlier
        env=gym_name,
        # configuration params (must match the "run" value); passed as a
        # shallow copy of the config dict assembled above
        config=dict(config),
        # number of iterations between checkpoints
        checkpoint_freq=1,
        max_failures=999,
        # stopping conditions: number of iterations to stop after
        stop=dict(training_iteration=1000),
    ),
})

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.9/16.4 GB

Created LogSyncer for /home/thorsten/ray_results/IntersectionExample/PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0_2019-04-30_22-50-48tqduo587 -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_22-51-58
  done: false
  episode_len_mean: 493.1578947368421
  episode_reward_max: 256.0580112210727
  episode_reward_mean: 105.42489979261951
  episode_reward_min: -129.17730171050277
  episodes_this_iter: 19
  episodes_total: 19
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 6218.388
    load

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 12.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 145 s, 5 iter, 50000 ts, 398 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_22-54-23
  done: false
  episode_len_mean: 448.9
  episode_reward_max: 1267.4407887131674
  episode_reward_mean: 488.03434709904917
  episode_reward_min: -186.23258887717174
  episodes_this_iter: 23
  episodes_total: 129
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 5364.887
    load_time_ms: 10.624
    num_steps_sampled: 60000
    num_steps_trained: 60000
    rl_0:
      cur_kl_coeff: 0.012500002980232239
      cur_lr: 4.999999873689376e-05
      entropy: 1.3977876901626587
      kl: 0.006946011912077665
      policy_loss:

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_22-56-58
  done: false
  episode_len_mean: 384.66
  episode_reward_max: 1415.8143422949574
  episode_reward_mean: 907.3666804869396
  episode_reward_min: -181.37813296845314
  episodes_this_iter: 25
  episodes_total: 255
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 5611.125
    load_time_ms: 1.719
    num_steps_sampled: 110000
    num_steps_trained: 110000
    rl_0:
      cur_kl_coeff: 0.0007812501862645149
      cur_lr: 4.999999873689376e-05
      entropy: 1.402116060256958
      kl: 0.019181285053491592
      policy_loss: -0.005713180173188448
      total_loss: 1813.5765380859375
      vf_explained_var: 0.020336559042334557
      vf_loss: 1813.58251953125
    sample_time_ms: 24279.698
    update_time_ms: 10.152
  iterations_since_restore: 11
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mea

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 12.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 575 s, 16 iter, 160000 ts, 930 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-01-38
  done: false
  episode_len_mean: 379.09
  episode_reward_max: 1419.8472652069586
  episode_reward_mean: 933.2460354557579
  episode_reward_min: -185.59256079457674
  episodes_this_iter: 23
  episodes_total: 410
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 7667.34
    load_time_ms: 2.186
    num_steps_sampled: 170000
    num_steps_trained: 170000
    rl_0:
      cur_kl_coeff: 2.441406832076609e-05
      cur_lr: 4.999999873689376e-05
      entropy: 1.4476912021636963
      kl: 0.004889502190053463
      policy_lo

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-04-48
  done: false
  episode_len_mean: 352.46
  episode_reward_max: 1404.181516767624
  episode_reward_mean: 927.7338719412378
  episode_reward_min: -185.08902667741282
  episodes_this_iter: 30
  episodes_total: 554
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 7461.164
    load_time_ms: 2.104
    num_steps_sampled: 220000
    num_steps_trained: 220000
    rl_0:
      cur_kl_coeff: 7.629396350239404e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.4571689367294312
      kl: 0.008601880632340908
      policy_loss: -0.0029612986836582422
      total_loss: 1803.2359619140625
      vf_explained_var: 0.2399851530790329
      vf_loss: 1803.23876953125
    sample_time_ms: 32484.311
    update_time_ms: 11.282
  iterations_since_restore: 22
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mean

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 12.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 1183 s, 27 iter, 270000 ts, 943 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-12-04
  done: false
  episode_len_mean: 330.09
  episode_reward_max: 1403.572553558213
  episode_reward_mean: 858.0223877016609
  episode_reward_min: -183.7874063330836
  episodes_this_iter: 32
  episodes_total: 735
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 13218.12
    load_time_ms: 2.677
    num_steps_sampled: 280000
    num_steps_trained: 280000
    rl_0:
      cur_kl_coeff: 4.768372718899627e-08
      cur_lr: 4.999999873689376e-05
      entropy: 1.4970407485961914
      kl: 0.0073963371105492115
      policy_l

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-17-22
  done: false
  episode_len_mean: 325.02
  episode_reward_max: 1432.8538469976413
  episode_reward_mean: 969.5582871279697
  episode_reward_min: -185.0878862523558
  episodes_this_iter: 30
  episodes_total: 887
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 15931.966
    load_time_ms: 2.784
    num_steps_sampled: 330000
    num_steps_trained: 330000
    rl_0:
      cur_kl_coeff: 1.4901164746561335e-09
      cur_lr: 4.999999873689376e-05
      entropy: 1.3800606727600098
      kl: 0.014826163649559021
      policy_loss: -0.0029540355317294598
      total_loss: 2202.331787109375
      vf_explained_var: 0.281764417886734
      vf_loss: 2202.33447265625
    sample_time_ms: 53252.461
    update_time_ms: 22.234
  iterations_since_restore: 33
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mean

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 12.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 2013 s, 38 iter, 380000 ts, 1.05e+03 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-27-39
  done: false
  episode_len_mean: 327.73
  episode_reward_max: 1415.4428854134142
  episode_reward_mean: 1017.7501406003424
  episode_reward_min: -186.88681449710285
  episodes_this_iter: 33
  episodes_total: 1073
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 17027.158
    load_time_ms: 5.512
    num_steps_sampled: 390000
    num_steps_trained: 390000
    rl_0:
      cur_kl_coeff: 4.656613983300417e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.4174734354019165
      kl: 0.004987653810530901
     

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-35-25
  done: false
  episode_len_mean: 313.14
  episode_reward_max: 1412.6848037593577
  episode_reward_mean: 921.6453144207762
  episode_reward_min: -189.75005959371057
  episodes_this_iter: 32
  episodes_total: 1230
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 18233.375
    load_time_ms: 7.788
    num_steps_sampled: 440000
    num_steps_trained: 440000
    rl_0:
      cur_kl_coeff: 2.9103837395627608e-12
      cur_lr: 4.999999873689376e-05
      entropy: 1.4886537790298462
      kl: 0.011486330069601536
      policy_loss: -0.00255565601401031
      total_loss: 1858.00439453125
      vf_explained_var: 0.508164644241333
      vf_loss: 1858.0072021484375
    sample_time_ms: 86038.402
    update_time_ms: 27.675
  iterations_since_restore: 44
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mea

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 13.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 2966 s, 49 iter, 490000 ts, 965 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-42-36
  done: false
  episode_len_mean: 328.02
  episode_reward_max: 1404.1359614623707
  episode_reward_mean: 922.230092766831
  episode_reward_min: -187.41308860637884
  episodes_this_iter: 30
  episodes_total: 1407
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 13058.884
    load_time_ms: 4.747
    num_steps_sampled: 500000
    num_steps_trained: 500000
    rl_0:
      cur_kl_coeff: 9.094949186133627e-14
      cur_lr: 4.999999873689376e-05
      entropy: 1.5205734968185425
      kl: 0.004076680168509483
      policy

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-49-21
  done: false
  episode_len_mean: 346.1
  episode_reward_max: 1427.6098325521411
  episode_reward_mean: 920.193538162321
  episode_reward_min: -180.41745559601415
  episodes_this_iter: 30
  episodes_total: 1553
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 15421.135
    load_time_ms: 3.165
    num_steps_sampled: 550000
    num_steps_trained: 550000
    rl_0:
      cur_kl_coeff: 5.684343241333517e-15
      cur_lr: 4.999999873689376e-05
      entropy: 1.6023622751235962
      kl: 0.007021479308605194
      policy_loss: -0.001999141648411751
      total_loss: 1401.7830810546875
      vf_explained_var: 0.667628288269043
      vf_loss: 1401.7850341796875
    sample_time_ms: 64110.997
    update_time_ms: 24.126
  iterations_since_restore: 55
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mea

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 13.2/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 3801 s, 60 iter, 600000 ts, 1.03e+03 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-55-23
  done: false
  episode_len_mean: 371.54
  episode_reward_max: 1410.6454575477906
  episode_reward_mean: 1063.1141648253533
  episode_reward_min: -182.86768180815815
  episodes_this_iter: 27
  episodes_total: 1714
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 11801.11
    load_time_ms: 3.006
    num_steps_sampled: 610000
    num_steps_trained: 610000
    rl_0:
      cur_kl_coeff: 8.88178631458362e-17
      cur_lr: 4.999999873689376e-05
      entropy: 1.4391510486602783
      kl: 0.013606453314423561
      p

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-04-30_23-57-36
  done: false
  episode_len_mean: 342.15
  episode_reward_max: 1412.192791516768
  episode_reward_mean: 1032.0866740939186
  episode_reward_min: -180.75067701806694
  episodes_this_iter: 33
  episodes_total: 1858
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 6646.288
    load_time_ms: 1.907
    num_steps_sampled: 660000
    num_steps_trained: 660000
    rl_0:
      cur_kl_coeff: 1.1102232893229526e-17
      cur_lr: 4.999999873689376e-05
      entropy: 1.3602583408355713
      kl: 0.008110272698104382
      policy_loss: -0.001963422866538167
      total_loss: 1646.193359375
      vf_explained_var: 0.7004605531692505
      vf_loss: 1646.1954345703125
    sample_time_ms: 34594.866
    update_time_ms: 9.689
  iterations_since_restore: 66
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mean:

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 13.5/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:	RUNNING [pid=23730], 4149 s, 71 iter, 710000 ts, 972 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-05-01_00-01-12
  done: false
  episode_len_mean: 365.67
  episode_reward_max: 1422.601035267312
  episode_reward_mean: 992.7474310816921
  episode_reward_min: -182.64855334205726
  episodes_this_iter: 26
  episodes_total: 2029
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 5638.951
    load_time_ms: 1.615
    num_steps_sampled: 720000
    num_steps_trained: 720000
    rl_0:
      cur_kl_coeff: 3.4694477791342267e-19
      cur_lr: 4.999999873689376e-05
      entropy: 1.4356529712677002
      kl: 0.007350943051278591
      policy

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_2veh-v0_0:
  custom_metrics: {}
  date: 2019-05-01_00-03-32
  done: false
  episode_len_mean: 343.55
  episode_reward_max: 1415.7198725960566
  episode_reward_mean: 1038.4177213681658
  episode_reward_min: -184.143513019882
  episodes_this_iter: 30
  episodes_total: 2172
  experiment_id: 9b6f4aa7d04949df9d58e63adfb6a4fc
  hostname: Gandalf
  info:
    grad_time_ms: 5288.401
    load_time_ms: 1.552
    num_steps_sampled: 770000
    num_steps_trained: 770000
    rl_0:
      cur_kl_coeff: 2.1684048619588917e-20
      cur_lr: 4.999999873689376e-05
      entropy: 1.164542555809021
      kl: 0.007867121137678623
      policy_loss: -0.000619006110355258
      total_loss: 1556.715087890625
      vf_explained_var: 0.7087563872337341
      vf_loss: 1556.7156982421875
    sample_time_ms: 26993.806
    update_time_ms: 8.761
  iterations_since_restore: 77
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 23730
  policy_reward_mean