# TRAINING I3W


# A) Create Environment, Vehicles, etc.

### General Parameter

In [1]:
# Length of one rollout, kept in a single variable so that every cell of the
# notebook uses the same value.
# Author's note: 103 is the maximum horizon if the rollout should be cut off
# before leaving (presumably before the vehicles leave the network -- confirm);
# the default was 500.
HORIZON = 500

# Tag under which the experiment is stored (used as `exp_tag` further down).
experiment_name = "IntersectionExample"

# Scenario class: list what the package exports, then pick one by name.
# NOTE(review): the output recorded below lists 'IntersectionScenarioTW', not
# the name chosen here -- confirm the class is importable under this name.
import flow.scenarios as scenarios
print("Available scenarios:", scenarios.__all__, sep="\n")
scenario_name = "IntersectionTWScenario"

# Environment class: same procedure for the multi-agent environments.
import flow.multiagent_envs as flowenvs
print("\nAvailable environments:", flowenvs.__all__, sep="\n")
env_name = "MultiAgentIntersectionEnv"

Available scenarios:
['Scenario', 'BayBridgeScenario', 'BayBridgeTollScenario', 'BottleneckScenario', 'Figure8Scenario', 'SimpleGridScenario', 'HighwayScenario', 'LoopScenario', 'MergeScenario', 'TwoLoopsOneMergingScenario', 'MultiLoopScenario', 'IntersectionScenarioTW']

Available environments:
['MultiEnv', 'MultiAgentAccelEnv', 'MultiWaveAttenuationPOEnv', 'MultiAgentIntersectionEnv', 'MultiAgentTeamSpiritIntersectionEnv', 'MultiAgentIntersectionEnv_baseline_1', 'MultiAgentIntersectionEnv_baseline_2', 'MultiAgentIntersectionEnv_baseline_3']


### Net Parameter

In [2]:
from flow.core.params import NetParams
from flow.scenarios.intersection import ADDITIONAL_NET_PARAMS

# Scenario-specific network settings, handed to NetParams as additional_params.
additionalNetParams = dict(edge_length=40, lanes=1, speed_limit=30)

# The "noted default" remarks are the author's notes on each argument's
# default; arguments that deviate from it are marked with "!!".
net_params = NetParams(
    # noted default: True   !! set to False so that intersections are not skipped
    no_internal_links=False,
    # noted default: None
    inflows=None,
    # noted default: None
    osm_path=None,
    # noted default: None
    netfile=None,
    # noted default: None   !!
    additional_params=additionalNetParams,
)

### InitialConfig Parameter

In [3]:
from flow.core.params import InitialConfig

# Initial placement of the vehicles at the start of a rollout / on reset.
# The "noted default" remarks are the author's notes on each argument's
# default; arguments that deviate from it are marked with "!!".
initial_config = InitialConfig(
    shuffle=True,                     # noted default: False      !!
    spacing="custom",                 # noted default: "uniform"  !!
    min_gap=10,                       # noted default: 0
    perturbation=29.99,               # noted default: 0.0        !!
    x0=0,                             # noted default: 0
    bunching=0,                       # noted default: 0
    lanes_distribution=float("inf"),  # noted default: float("inf")
    edges_distribution="all",         # noted default: "all"
    additional_params=None,           # noted default: None
)

### SUMO Parameter

In [4]:
from flow.core.params import SumoParams

# Simulator settings, collected in a dict first and then unpacked into
# SumoParams. According to the author's notes every value equals the default.
# NOTE(review): seed=None means no fixed simulator seed is set here.
sumo_settings = {
    "port": None,
    "sim_step": 0.1,
    "emission_path": None,
    "lateral_resolution": None,
    "no_step_log": True,
    "render": False,
    "save_render": False,
    "sight_radius": 25,
    "show_radius": False,
    "pxpm": 2,
    "overtake_right": False,
    "seed": None,
    "restart_instance": False,
    "print_warnings": True,
    "teleport_time": -1,
    "num_clients": 1,
    "sumo_binary": None,
}

sumo_params = SumoParams(**sumo_settings)

### Environment Parameter

In [5]:
from flow.core.params import EnvParams

# Environment-specific settings, handed to EnvParams as additional_params.
additionalEnvParams = dict(
    max_accel=3,          # maximum acceleration of autonomous vehicles
    max_decel=3,          # maximum deceleration of autonomous vehicles
    target_velocity=30,
)

# The "noted default" remarks are the author's notes on each argument's
# default; arguments the author flagged are marked with "!!".
env_params = EnvParams(
    additional_params=additionalEnvParams,  # noted default: None   !!
    horizon=HORIZON,                        # noted default: 500    !!
    warmup_steps=0,                         # noted default: 0
    sims_per_step=1,                        # noted default: 1
    evaluate=False,                         # noted default: False
)

### Vehicles Parameter

In [6]:
from flow.core.params import VehicleParams

# Import the controller classes and SUMO parameter containers used to describe
# the vehicles. The two commented-out imports are the SUMO-based car-following
# and lane-change controllers, which this notebook does not use.
#from flow.controllers import SumoCarFollowingController
from flow.controllers import ContinuousRouter
#from flow.controllers.lane_change_controllers import SumoLaneChangeController
from flow.controllers.lane_change_controllers import StaticLaneChanger
from flow.controllers import RLController
# NOTE(review): SumoLaneChangeParams is not referenced in the visible cells.
from flow.core.params import SumoLaneChangeParams
from flow.core.params import SumoCarFollowingParams
# NOTE(review): wildcard import; no name from `random` is used in the visible
# cells -- confirm whether it is still needed.
from random import *

# Empty vehicle container; vehicle types are registered on it via `.add(...)`.
vehicles = VehicleParams()

#### Add RL-Agent controlled vehicles 

In [7]:
# Car-following parameters for the RL vehicles; only the speed mode is
# overridden (the author's notes give "right_of_way" as the default).
cf_parameter = SumoCarFollowingParams(speed_mode="aggressive")
# Lane-change parameters are left unset; this name is not passed to
# `vehicles.add` below.
lc_parameter = None

# Start from an empty container so that this cell is idempotent: `add` mutates
# the container, and re-running the cell on the old object would register the
# "rl" vehicles a second time.
vehicles = VehicleParams()

vehicles.add(
    # name of the vehicle type
    veh_id="rl",
    # acceleration is controlled by the RL agent
    acceleration_controller=(RLController, {}),
    # lane-change controller
    lane_change_controller=(StaticLaneChanger, {}),
    # routing controller
    routing_controller=(ContinuousRouter, {}),
    # initial speed
    initial_speed=0,
    # number of vehicles of this type
    num_vehicles=2,
    # car-following parameters defined above
    car_following_params=cf_parameter,
)

### Flow Parameter

In [8]:
# Bundle everything defined above into the flow_params dictionary.
# The keys must be spelled exactly as below.
flow_params = {
    # name of the experiment
    "exp_tag": experiment_name,
    # name of the flow environment the experiment is running on
    "env_name": env_name,
    # name of the scenario class the experiment uses
    "scenario": scenario_name,
    # simulator that is used by the experiment
    "simulator": 'traci',
    # sumo-related parameters (see flow.core.params.SumoParams)
    "sim": sumo_params,
    # environment-related parameters (see flow.core.params.EnvParams)
    "env": env_params,
    # network-related parameters (see flow.core.params.NetParams and the
    # scenario's documentation or ADDITIONAL_NET_PARAMS component)
    "net": net_params,
    # vehicles to be placed in the network at the start of a rollout
    # (see flow.core.params.VehicleParams)
    "veh": vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    "initial": initial_config,
}

# B) Training

In [9]:
import json

import ray
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph

In [10]:
# Rollout workers that sample in parallel.
N_CPUS = 2
# Rollouts collected per training iteration.
N_ROLLOUTS = 20

# Give Ray one CPU more than there are workers; the status output further down
# shows the trial requesting all 3 CPUs (presumably 2 workers plus the driver).
ray.init(num_cpus=N_CPUS + 1, redirect_output=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-03-20_18-43-52_5179/logs.
Waiting for redis server at 127.0.0.1:58595 to respond...
Waiting for redis server at 127.0.0.1:27052 to respond...
Starting the Plasma object store with 6.554658406 GB memory using /dev/shm.

View the web UI at http://localhost:8889/notebooks/ray_ui.ipynb?token=2ddcf65e490eb487441c0b0eb3c5c1d690941bb451a99eb2



{'node_ip_address': '192.168.2.102',
 'object_store_addresses': ['/tmp/ray/session_2019-03-20_18-43-52_5179/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-03-20_18-43-52_5179/sockets/raylet'],
 'redis_address': '192.168.2.102:58595',
 'webui_url': 'http://localhost:8889/notebooks/ray_ui.ipynb?token=2ddcf65e490eb487441c0b0eb3c5c1d690941bb451a99eb2'}

In [11]:
from copy import deepcopy

# The algorithm or model to train. This may be the name of a built-in
# algorithm (e.g. RLlib's DQN or PPO) or a user-defined trainable function or
# class registered in the tune registry.
alg_run = "PPO"

agent_cls = get_agent_class(alg_run)
# Deep-copy the defaults. A shallow `.copy()` shares the nested "model" and
# "env_config" dicts with the agent class, so the in-place updates below would
# silently modify the class-level default configuration.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS  # number of parallel workers
config["train_batch_size"] = HORIZON * N_ROLLOUTS  # batch size
config["gamma"] = 0.999  # discount rate (author's note: default 0.999)
# sizes of the hidden layers in the network (author's note: default 64 32)
config["model"].update({"fcnet_hiddens": [100, 50, 25]})
config["use_gae"] = True  # use generalized advantage estimation
config["lambda"] = 0.97
# Disabled by the author:
#config["sgd_minibatch_size"] = min(16 * 1024, config["train_batch_size"])  # stochastic gradient descent
#config["sample_batch_size"] = config["train_batch_size"]/config["num_workers"] # 200 default, still too high?
config["kl_target"] = 0.02  # target KL divergence
config["num_sgd_iter"] = 10  # number of SGD iterations
config["horizon"] = HORIZON  # rollout horizon

# Save the flow params for replay: serialize flow_params to a JSON string and
# store it, together with the algorithm name, in the env_config.
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)
config['env_config']['flow_params'] = flow_json
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to obtain the environment
# constructor and its gym name for this experiment.
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register the environment with RLlib under that gym name.
register_env(gym_name, create_env)

In [12]:
# Multi-agent policy mapping.
# Build one environment instance only to read its observation and action
# spaces. The cell output shows that this starts a SUMO process.
test_env = create_env()
obs_space, act_space = test_env.observation_space, test_env.action_space


def gen_policy():
    """Return the policy spec tuple: (policy class, obs space, action space, config)."""
    return (PPOPolicyGraph, obs_space, act_space, {})


def policy_mapping_fn(agent_id):
    """Map an agent to the policy that has the same name as the agent id."""
    return agent_id


# One separate PPO policy graph per agent id.
policy_graphs = {policy_id: gen_policy() for policy_id in ('rl_0', 'rl_1')}

config['multiagent'] = {
    'policy_graphs': policy_graphs,
    'policy_mapping_fn': tune.function(policy_mapping_fn),
}

 Starting SUMO on port 55263


23.147038256975808
6.886284052593682


In [None]:
# Specification of the single experiment to run.
experiment_spec = {
    # RL algorithm to run
    "run": alg_run,
    # environment name generated earlier
    "env": gym_name,
    # configuration params (must match the "run" value); shallow copy of config
    "config": dict(config),
    # number of iterations between checkpoints
    "checkpoint_freq": 1,
    "max_failures": 999,
    # stopping condition: number of training iterations to stop after
    "stop": {"training_iteration": 1000},
}

# Launch training; the experiment is keyed by its tag.
trials = run_experiments({flow_params["exp_tag"]: experiment_spec})

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/3 CPUs, 0/1 GPUs
Memory usage on this node: 2.7/16.4 GB

Created LogSyncer for /home/thorsten/ray_results/IntersectionExample/PPO_MultiAgentIntersectionEnv-v0_0_2019-03-20_18-43-55ipg4b7cp -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 2.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-44-54
  done: false
  episode_len_mean: 481.7
  episode_reward_max: 157.82593584178935
  episode_reward_mean: 55.08088457653795
  episode_reward_min: -146.92512367743936
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 5043.743
    load_time_ms: 151.02
    num_steps_sampled: 10000
    num_steps_trained:

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-46-37
  done: false
  episode_len_mean: 453.67
  episode_reward_max: 329.05035258097655
  episode_reward_mean: 89.71178834960858
  episode_reward_min: -158.17230433684242
  episodes_this_iter: 23
  episodes_total: 109
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4164.82
    load_time_ms: 32.191
    num_steps_sampled: 50000
    num_steps_trained: 50000
    rl_0:
      cur_kl_coeff: 0.012500000186264515
      cur_lr: 4.999999873689376e-05
      entropy: 1.4128468036651611
      kl: 0.007679771166294813
      policy_loss: -0.003738576779142022
      total_loss: 123.07553100585938
      vf_explained_var: 0.28636640310287476
      vf_loss: 123.07918548583984
    rl_1:
      cur_kl_coeff: 0.012500000186264515
      cur_lr: 4.999999873689376e-05
      entropy: 1.4362616539001465
      kl: 0.007967169396579266
      policy_loss: -0.004602141212671995
    

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 234 s, 9 iter, 90000 ts, 212 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-48-46
  done: false
  episode_len_mean: 253.02
  episode_reward_max: 389.73737098256055
  episode_reward_mean: 185.773258089166
  episode_reward_min: -161.4407383781322
  episodes_this_iter: 43
  episodes_total: 275
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4077.173
    load_time_ms: 17.367
    num_steps_sampled: 100000
    num_steps_trained: 100000
    rl_0:
      cur_kl_coeff: 0.0003906250058207661
      cur_lr: 4.999999873689376e-05
      entropy: 1.3957222700119019
      kl: 0.006072198040783405
      policy_loss: -0.001429747324436903
      total_lo

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 364 s, 14 iter, 140000 ts, 270 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-50-56
  done: false
  episode_len_mean: 195.04
  episode_reward_max: 318.46425023204347
  episode_reward_mean: 286.4556144171527
  episode_reward_min: -141.1893024718516
  episodes_this_iter: 54
  episodes_total: 522
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3981.342
    load_time_ms: 2.681
    num_steps_sampled: 150000
    num_steps_trained: 150000
    rl_0:
      cur_kl_coeff: 9.765625145519152e-05
      cur_lr: 4.999999873689376e-05
      entropy: 1.3504679203033447
      kl: 0.00482891546562314
      policy_loss: -0.0024597151204943657
      total_

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 494 s, 19 iter, 190000 ts, 284 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-53-06
  done: false
  episode_len_mean: 170.39
  episode_reward_max: 316.67231133066366
  episode_reward_mean: 283.61520014457415
  episode_reward_min: -162.2134245326086
  episodes_this_iter: 61
  episodes_total: 808
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3973.565
    load_time_ms: 2.631
    num_steps_sampled: 200000
    num_steps_trained: 200000
    rl_0:
      cur_kl_coeff: 6.10351571594947e-06
      cur_lr: 4.999999873689376e-05
      entropy: 1.346618890762329
      kl: 0.0012807153398171067
      policy_loss: -0.0005645955679938197
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 624 s, 24 iter, 240000 ts, 293 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-55-17
  done: false
  episode_len_mean: 151.59
  episode_reward_max: 319.9274829244105
  episode_reward_mean: 291.85763393406916
  episode_reward_min: 268.2487837672008
  episodes_this_iter: 66
  episodes_total: 1126
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3972.124
    load_time_ms: 2.483
    num_steps_sampled: 250000
    num_steps_trained: 250000
    rl_0:
      cur_kl_coeff: 7.629394644936838e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.294982671737671
      kl: 0.007186948321759701
      policy_loss: -0.002544490620493889
      total_l

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 756 s, 29 iter, 290000 ts, 299 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-57-29
  done: false
  episode_len_mean: 141.9
  episode_reward_max: 323.8150314448601
  episode_reward_mean: 294.2938026461126
  episode_reward_min: -143.97634785214416
  episodes_this_iter: 70
  episodes_total: 1472
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3967.348
    load_time_ms: 2.448
    num_steps_sampled: 300000
    num_steps_trained: 300000
    rl_0:
      cur_kl_coeff: 1.9073486612342094e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.2537062168121338
      kl: 0.008926457725465298
      policy_loss: -0.001633352367207408
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 888 s, 34 iter, 340000 ts, 297 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_18-59-42
  done: false
  episode_len_mean: 133.53
  episode_reward_max: 326.440354313734
  episode_reward_mean: 299.8713941891076
  episode_reward_min: 266.1321898789706
  episodes_this_iter: 75
  episodes_total: 1841
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3969.933
    load_time_ms: 2.397
    num_steps_sampled: 350000
    num_steps_trained: 350000
    rl_0:
      cur_kl_coeff: 2.3841858265427618e-08
      cur_lr: 4.999999873689376e-05
      entropy: 1.2152904272079468
      kl: 0.024016350507736206
      policy_loss: -0.004219915252178907
      total_l

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1020 s, 39 iter, 390000 ts, 301 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-01-54
  done: false
  episode_len_mean: 128.61
  episode_reward_max: 335.4744230892069
  episode_reward_mean: 304.22473987737595
  episode_reward_min: 279.5589996498334
  episodes_this_iter: 78
  episodes_total: 2227
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3968.504
    load_time_ms: 2.508
    num_steps_sampled: 400000
    num_steps_trained: 400000
    rl_0:
      cur_kl_coeff: 2.3841858265427618e-08
      cur_lr: 4.999999873689376e-05
      entropy: 1.1923147439956665
      kl: 0.010978172533214092
      policy_loss: -0.002688758773729205
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1150 s, 44 iter, 440000 ts, 307 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-04-04
  done: false
  episode_len_mean: 126.87
  episode_reward_max: 335.38664536305646
  episode_reward_mean: 308.22175349108625
  episode_reward_min: 277.66634458836796
  episodes_this_iter: 78
  episodes_total: 2621
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3988.533
    load_time_ms: 2.576
    num_steps_sampled: 450000
    num_steps_trained: 450000
    rl_0:
      cur_kl_coeff: 2.9802322831784522e-09
      cur_lr: 4.999999873689376e-05
      entropy: 1.1338139772415161
      kl: 0.006261985283344984
      policy_loss: -0.0012959465384483337
      t

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1282 s, 49 iter, 490000 ts, 307 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-06-16
  done: false
  episode_len_mean: 127.22
  episode_reward_max: 339.52538810699366
  episode_reward_mean: 307.756711164097
  episode_reward_min: 280.1893425185198
  episodes_this_iter: 78
  episodes_total: 3015
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3982.77
    load_time_ms: 2.591
    num_steps_sampled: 500000
    num_steps_trained: 500000
    rl_0:
      cur_kl_coeff: 1.8626451769865326e-10
      cur_lr: 4.999999873689376e-05
      entropy: 1.1091989278793335
      kl: 0.005224085412919521
      policy_loss: -0.0016164240660145879
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1413 s, 54 iter, 540000 ts, 307 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-08-28
  done: false
  episode_len_mean: 128.36
  episode_reward_max: 336.87518172015155
  episode_reward_mean: 311.4263268044252
  episode_reward_min: 282.79548390614434
  episodes_this_iter: 78
  episodes_total: 3409
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3966.698
    load_time_ms: 2.623
    num_steps_sampled: 550000
    num_steps_trained: 550000
    rl_0:
      cur_kl_coeff: 4.6566129424663316e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.0694153308868408
      kl: 0.022866085171699524
      policy_loss: -0.00474423635751009
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1547 s, 59 iter, 590000 ts, 313 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-10-41
  done: false
  episode_len_mean: 126.75
  episode_reward_max: 337.3335233162005
  episode_reward_mean: 313.9677604160704
  episode_reward_min: 285.9449847212062
  episodes_this_iter: 79
  episodes_total: 3798
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3991.144
    load_time_ms: 2.712
    num_steps_sampled: 600000
    num_steps_trained: 600000
    rl_0:
      cur_kl_coeff: 3.492458752751837e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.021341323852539
      kl: 0.004757656715810299
      policy_loss: -0.0011287376983091235
      total_

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1677 s, 64 iter, 640000 ts, 313 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-12-52
  done: false
  episode_len_mean: 127.62
  episode_reward_max: 343.84229607092345
  episode_reward_mean: 314.59039749270573
  episode_reward_min: 288.09913082354535
  episodes_this_iter: 79
  episodes_total: 4191
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3983.722
    load_time_ms: 2.505
    num_steps_sampled: 650000
    num_steps_trained: 650000
    rl_0:
      cur_kl_coeff: 2.182786720469898e-12
      cur_lr: 4.999999873689376e-05
      entropy: 1.0306626558303833
      kl: 0.09084219485521317
      policy_loss: -0.013080928474664688
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1805 s, 69 iter, 690000 ts, 314 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-15-00
  done: false
  episode_len_mean: 126.16
  episode_reward_max: 343.86777322187356
  episode_reward_mean: 314.95610407563726
  episode_reward_min: 285.9071696508532
  episodes_this_iter: 80
  episodes_total: 4586
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3960.61
    load_time_ms: 2.245
    num_steps_sampled: 700000
    num_steps_trained: 700000
    rl_0:
      cur_kl_coeff: 8.185453725419178e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.9801864624023438
      kl: 0.017307456582784653
      policy_loss: -0.002263941802084446
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 1933 s, 74 iter, 740000 ts, 317 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-17-08
  done: false
  episode_len_mean: 127.31
  episode_reward_max: 343.36666107580265
  episode_reward_mean: 316.4495050645326
  episode_reward_min: 288.4124468469957
  episodes_this_iter: 78
  episodes_total: 4979
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3874.219
    load_time_ms: 2.266
    num_steps_sampled: 750000
    num_steps_trained: 750000
    rl_0:
      cur_kl_coeff: 4.092726862709589e-13
      cur_lr: 4.999999873689376e-05
      entropy: 1.0325734615325928
      kl: 0.053923968225717545
      policy_loss: -0.005850648041814566
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 3.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2059 s, 79 iter, 790000 ts, 319 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-19-16
  done: false
  episode_len_mean: 128.86
  episode_reward_max: 355.5098202940819
  episode_reward_mean: 322.98591480389416
  episode_reward_min: 283.986900164007
  episodes_this_iter: 77
  episodes_total: 5367
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3786.329
    load_time_ms: 2.3
    num_steps_sampled: 800000
    num_steps_trained: 800000
    rl_0:
      cur_kl_coeff: 6.139089345387483e-13
      cur_lr: 4.999999873689376e-05
      entropy: 1.0430421829223633
      kl: 0.003180508967489004
      policy_loss: -0.0010115040931850672
      total_l

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 4.3/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2201 s, 84 iter, 840000 ts, 319 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-21-38
  done: false
  episode_len_mean: 128.68
  episode_reward_max: 344.7411777257442
  episode_reward_mean: 319.50852447140466
  episode_reward_min: 287.9140577442376
  episodes_this_iter: 77
  episodes_total: 5756
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3997.915
    load_time_ms: 2.838
    num_steps_sampled: 850000
    num_steps_trained: 850000
    rl_0:
      cur_kl_coeff: 1.5347723363468707e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.9942646622657776
      kl: 0.029206542298197746
      policy_loss: -0.0036265396047383547
      tot

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 4.2/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2350 s, 89 iter, 890000 ts, 316 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-24-09
  done: false
  episode_len_mean: 128.11
  episode_reward_max: 343.94588628869906
  episode_reward_mean: 315.5294128588941
  episode_reward_min: 284.48180633340047
  episodes_this_iter: 78
  episodes_total: 6146
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4339.884
    load_time_ms: 2.985
    num_steps_sampled: 900000
    num_steps_trained: 900000
    rl_0:
      cur_kl_coeff: 7.673861681734354e-14
      cur_lr: 4.999999873689376e-05
      entropy: 0.9858578443527222
      kl: 0.017128022387623787
      policy_loss: -0.0008773556328378618
      tot

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 4.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2507 s, 94 iter, 940000 ts, 317 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-26-55
  done: false
  episode_len_mean: 129.53
  episode_reward_max: 348.8433309265173
  episode_reward_mean: 318.4045750568198
  episode_reward_min: 291.5630529516937
  episodes_this_iter: 76
  episodes_total: 6534
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4758.687
    load_time_ms: 2.515
    num_steps_sampled: 950000
    num_steps_trained: 950000
    rl_0:
      cur_kl_coeff: 1.7266192002627495e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.9967941641807556
      kl: 0.02916465699672699
      policy_loss: -0.0026028503198176622
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 4.4/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2675 s, 99 iter, 990000 ts, 317 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-29-31
  done: false
  episode_len_mean: 128.71
  episode_reward_max: 352.9137416256068
  episode_reward_mean: 316.5973185232589
  episode_reward_min: 285.5909001425004
  episodes_this_iter: 77
  episodes_total: 6922
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4782.247
    load_time_ms: 2.468
    num_steps_sampled: 1000000
    num_steps_trained: 1000000
    rl_0:
      cur_kl_coeff: 8.633096001313748e-14
      cur_lr: 4.999999873689376e-05
      entropy: 0.9263541102409363
      kl: 0.05226404219865799
      policy_loss: -0.0072237527929246426
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 4.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2813 s, 104 iter, 1040000 ts, 316 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-31-53
  done: false
  episode_len_mean: 130.24
  episode_reward_max: 357.5697073244343
  episode_reward_mean: 318.19832664439355
  episode_reward_min: 285.70879047232376
  episodes_this_iter: 77
  episodes_total: 7309
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4407.686
    load_time_ms: 2.387
    num_steps_sampled: 1050000
    num_steps_trained: 1050000
    rl_0:
      cur_kl_coeff: 1.942445770203305e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.9897105097770691
      kl: 0.013970807194709778
      policy_loss: 0.001819548080675304
      t

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.3/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 2958 s, 109 iter, 1090000 ts, 318 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-34-17
  done: false
  episode_len_mean: 129.88
  episode_reward_max: 349.44062446455996
  episode_reward_mean: 316.3569859460219
  episode_reward_min: 287.05610646393296
  episodes_this_iter: 79
  episodes_total: 7695
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4282.461
    load_time_ms: 2.302
    num_steps_sampled: 1100000
    num_steps_trained: 1100000
    rl_0:
      cur_kl_coeff: 1.942445770203305e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.969071626663208
      kl: 0.03832007199525833
      policy_loss: 0.0044335476122796535
      to

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.2/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3092 s, 114 iter, 1140000 ts, 319 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-36-29
  done: false
  episode_len_mean: 129.82
  episode_reward_max: 357.20607358667985
  episode_reward_mean: 318.2465743290783
  episode_reward_min: 287.712851657592
  episodes_this_iter: 77
  episodes_total: 8080
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4049.946
    load_time_ms: 2.363
    num_steps_sampled: 1150000
    num_steps_trained: 1150000
    rl_0:
      cur_kl_coeff: 6.55575582968887e-13
      cur_lr: 4.999999873689376e-05
      entropy: 0.9858455061912537
      kl: 0.6027173399925232
      policy_loss: 0.008352587930858135
      total_

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.4/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3226 s, 119 iter, 1190000 ts, 314 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-38-46
  done: false
  episode_len_mean: 131.29
  episode_reward_max: 357.4026047830284
  episode_reward_mean: 318.110139606049
  episode_reward_min: 286.1292357325848
  episodes_this_iter: 76
  episodes_total: 8461
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 3988.011
    load_time_ms: 2.34
    num_steps_sampled: 1200000
    num_steps_trained: 1200000
    rl_0:
      cur_kl_coeff: 4.9782769052930664e-12
      cur_lr: 4.999999873689376e-05
      entropy: 0.9514537453651428
      kl: 0.06857185065746307
      policy_loss: 0.007316289935261011
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3373 s, 124 iter, 1240000 ts, 319 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-41-12
  done: false
  episode_len_mean: 131.87
  episode_reward_max: 353.61779540916535
  episode_reward_mean: 320.5914727124063
  episode_reward_min: 284.12892641104185
  episodes_this_iter: 76
  episodes_total: 8841
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4168.102
    load_time_ms: 2.666
    num_steps_sampled: 1250000
    num_steps_trained: 1250000
    rl_0:
      cur_kl_coeff: 3.780378787787697e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.051352858543396
      kl: 0.05802324414253235
      policy_loss: 0.0265983734279871
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3518 s, 129 iter, 1290000 ts, 315 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-43-40
  done: false
  episode_len_mean: 132.39
  episode_reward_max: 354.8927913247348
  episode_reward_mean: 320.93286567174147
  episode_reward_min: 284.41337949951765
  episodes_this_iter: 75
  episodes_total: 9217
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4137.466
    load_time_ms: 2.631
    num_steps_sampled: 1300000
    num_steps_trained: 1300000
    rl_0:
      cur_kl_coeff: 2.8707250465664913e-10
      cur_lr: 4.999999873689376e-05
      entropy: 1.1752846240997314
      kl: 1.948203682899475
      policy_loss: 0.04155068099498749
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.4/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3672 s, 134 iter, 1340000 ts, 323 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-46-10
  done: false
  episode_len_mean: 133.82
  episode_reward_max: 355.1348537333608
  episode_reward_mean: 321.1550449536408
  episode_reward_min: 284.46579027567714
  episodes_this_iter: 75
  episodes_total: 9593
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4131.262
    load_time_ms: 2.403
    num_steps_sampled: 1350000
    num_steps_trained: 1350000
    rl_0:
      cur_kl_coeff: 2.179957547809863e-09
      cur_lr: 4.999999873689376e-05
      entropy: 1.1955946683883667
      kl: 0.8664892315864563
      policy_loss: 0.04997507855296135
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.4/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3802 s, 139 iter, 1390000 ts, 323 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-48-20
  done: false
  episode_len_mean: 133.66
  episode_reward_max: 361.9929737938629
  episode_reward_mean: 323.4147801924515
  episode_reward_min: 292.95101870066685
  episodes_this_iter: 75
  episodes_total: 9967
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4005.104
    load_time_ms: 2.467
    num_steps_sampled: 1400000
    num_steps_trained: 1400000
    rl_0:
      cur_kl_coeff: 1.6554050574768553e-08
      cur_lr: 4.999999873689376e-05
      entropy: 1.2452291250228882
      kl: 0.1082029864192009
      policy_loss: 0.055277615785598755
      tot

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.6/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 3934 s, 144 iter, 1440000 ts, 324 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-50-41
  done: false
  episode_len_mean: 133.97
  episode_reward_max: 357.44382796766627
  episode_reward_mean: 325.0266802015775
  episode_reward_min: 288.02189747176305
  episodes_this_iter: 75
  episodes_total: 10339
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 4051.808
    load_time_ms: 2.308
    num_steps_sampled: 1450000
    num_steps_trained: 1450000
    rl_0:
      cur_kl_coeff: 1.257073307669998e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.3699363470077515
      kl: 362.6254577636719
      policy_loss: 0.40337973833084106
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 4149 s, 149 iter, 1490000 ts, 325 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-54-28
  done: false
  episode_len_mean: 135.2
  episode_reward_max: 361.2902715503085
  episode_reward_mean: 323.06011220079296
  episode_reward_min: 175.4995890002329
  episodes_this_iter: 73
  episodes_total: 10711
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 5290.545
    load_time_ms: 2.948
    num_steps_sampled: 1500000
    num_steps_trained: 1500000
    rl_0:
      cur_kl_coeff: 9.545901775709353e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.389479398727417
      kl: 1100.154296875
      policy_loss: 0.3395327925682068
      total_loss:

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 4421 s, 154 iter, 1540000 ts, 325 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_19-58-57
  done: false
  episode_len_mean: 136.05
  episode_reward_max: 365.39965761360526
  episode_reward_mean: 325.3923051364666
  episode_reward_min: 292.1026151221282
  episodes_this_iter: 73
  episodes_total: 11078
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 6689.416
    load_time_ms: 3.586
    num_steps_sampled: 1550000
    num_steps_trained: 1550000
    rl_0:
      cur_kl_coeff: 7.2489151534682605e-06
      cur_lr: 4.999999873689376e-05
      entropy: 1.4288119077682495
      kl: 149.0294647216797
      policy_loss: 0.36122432351112366
      tota

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.2/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 4705 s, 159 iter, 1590000 ts, 325 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_20-03-44
  done: false
  episode_len_mean: 139.72
  episode_reward_max: 383.09491621233127
  episode_reward_mean: 326.4128368254589
  episode_reward_min: 130.87510551995848
  episodes_this_iter: 72
  episodes_total: 11441
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 8502.193
    load_time_ms: 3.948
    num_steps_sampled: 1600000
    num_steps_trained: 1600000
    rl_0:
      cur_kl_coeff: 5.504645378096029e-05
      cur_lr: 4.999999873689376e-05
      entropy: 1.571427822113037
      kl: 60.51083755493164
      policy_loss: 0.30252689123153687
      total

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.4/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 4954 s, 164 iter, 1640000 ts, 324 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_20-07-56
  done: false
  episode_len_mean: 138.82
  episode_reward_max: 367.7990810222922
  episode_reward_mean: 323.2182565767242
  episode_reward_min: 292.6989462584217
  episodes_this_iter: 72
  episodes_total: 11799
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 8472.181
    load_time_ms: 4.397
    num_steps_sampled: 1650000
    num_steps_trained: 1650000
    rl_0:
      cur_kl_coeff: 0.00041800900362432003
      cur_lr: 4.999999873689376e-05
      entropy: 1.5089560747146606
      kl: 0.06328733265399933
      policy_loss: 0.020327607169747353
      to

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv-v0_0:	RUNNING [pid=5225], 5110 s, 169 iter, 1690000 ts, 323 rew

Result for PPO_MultiAgentIntersectionEnv-v0_0:
  custom_metrics: {}
  date: 2019-03-20_20-10-11
  done: false
  episode_len_mean: 136.6
  episode_reward_max: 380.4360744755644
  episode_reward_mean: 326.44772270785273
  episode_reward_min: 294.8658171489943
  episodes_this_iter: 73
  episodes_total: 12163
  experiment_id: d4702f73002545e9a3fbdb44c083d921
  hostname: Gandalf
  info:
    grad_time_ms: 5417.15
    load_time_ms: 3.324
    num_steps_sampled: 1700000
    num_steps_trained: 1700000
    rl_0:
      cur_kl_coeff: 0.0021161711774766445
      cur_lr: 4.999999873689376e-05
      entropy: 1.4829151630401611
      kl: 0.058438241481781006
      policy_loss: 0.01931939460337162
      tota