# TRAINING I3W


# A) Create Environment, Vehicles, etc.

### General Parameter

In [1]:
# Length of one rollout in simulation steps; defined once so that every cell of
# the notebook uses the same value.
# Author's note (translated): 103 is the maximum horizon if the rollout should
# end before leaving (presumably before the vehicles leave the network);
# the default was 500.
HORIZON = 500

# Tag under which the experiment is registered.
experiment_name = "IntersectionExample"

# Scenario class: show the names exported by flow.scenarios, then pick one.
# NOTE(review): the printed list contains 'IntersectionScenarioTW', not the
# name used here; the later cells ran with this value, so it is kept as is.
import flow.scenarios as scenarios
print("Available scenarios:", scenarios.__all__, sep="\n")
scenario_name = "IntersectionTWScenario"

# Environment class: show the names exported by flow.multiagent_envs, then pick one.
import flow.multiagent_envs as flowenvs
print("\nAvailable environments:", flowenvs.__all__, sep="\n")
env_name = "MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit"

Available scenarios:
['Scenario', 'BayBridgeScenario', 'BayBridgeTollScenario', 'BottleneckScenario', 'Figure8Scenario', 'SimpleGridScenario', 'HighwayScenario', 'LoopScenario', 'MergeScenario', 'TwoLoopsOneMergingScenario', 'MultiLoopScenario', 'IntersectionScenarioTW']

Available environments:
['MultiEnv', 'MultiAgentAccelEnv', 'MultiWaveAttenuationPOEnv', 'MultiAgentIntersectionEnv', 'MultiAgentTeamSpiritIntersectionEnv', 'MultiAgentIntersectionEnv_baseline_1', 'MultiAgentIntersectionEnv_baseline_2', 'MultiAgentIntersectionEnv_baseline_3', 'MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit']


### Net Parameter

In [2]:
from flow.core.params import NetParams
from flow.scenarios.intersection import ADDITIONAL_NET_PARAMS

# Scenario-specific network settings, handed over via `additional_params`.
additionalNetParams = dict(edge_length=40, lanes=1, speed_limit=30)

# Network parameters. The trailing comments record the library defaults as
# noted by the original author; only the first two arguments deviate from them.
net_params = NetParams(
    additional_params=additionalNetParams,  # default: None
    no_internal_links=False,  # default: True; False so that intersections are not skipped
    inflows=None,  # default: None
    osm_path=None,  # default: None
    netfile=None,  # default: None
)

### InitialConfig Parameter

In [3]:
from flow.core.params import InitialConfig

# Settings for the initial placement of vehicles. The trailing comments record
# the library defaults as noted by the original author.
initial_config_kwargs = {
    "shuffle": True,                          # default: False
    "spacing": "custom",                      # default: "uniform"
    "min_gap": 10,                            # default: 0
    "perturbation": 29.99,                    # default: 0.0
    "x0": 0,                                  # default: 0
    "bunching": 0,                            # default: 0
    "lanes_distribution": float("inf"),       # default: float("inf")
    "edges_distribution": "all",              # default: "all"
    "additional_params": None,                # default: None
}

initial_config = InitialConfig(**initial_config_kwargs)

### SUMO Parameter

In [4]:
from flow.core.params import SumoParams

# Simulator settings. According to the original author's notes every value
# below equals the library default; they are spelled out so that each one can
# be tuned in place.
sumo_params = SumoParams(
    # connection / process
    port=None,
    num_clients=1,
    sumo_binary=None,
    restart_instance=False,
    # simulation stepping and behaviour
    sim_step=0.1,
    seed=None,
    lateral_resolution=None,
    overtake_right=False,
    teleport_time=-1,
    # logging and output
    emission_path=None,
    no_step_log=True,
    print_warnings=True,
    # rendering
    render=False,
    save_render=False,
    sight_radius=25,
    show_radius=False,
    pxpm=2,
)

### Environment Parameter

In [5]:
from flow.core.params import EnvParams

# Environment-specific settings, handed over via `additional_params`.
additionalEnvParams = dict(
    max_accel=3,                  # maximum acceleration of autonomous vehicles
    max_decel=3,                  # maximum deceleration of autonomous vehicles
    target_velocity=30,           # desired velocity for all vehicles in the network, in m/s
    ap_teamspirit_0=-1,           # initial team spirit (first value)
    ap_teamspirit_1=-1,           # initial team spirit (second value)
    ap_teamspirit_shuffle=False,  # whether to shuffle the team spirit
)

# Environment parameters. The trailing comments record the library defaults as
# noted by the original author.
env_params = EnvParams(
    horizon=HORIZON,                         # default: 500
    additional_params=additionalEnvParams,   # default: None
    warmup_steps=0,                          # default: 0
    sims_per_step=1,                         # default: 1
    evaluate=False,                          # default: False
)

### Vehicles Parameter

In [6]:
# Container that collects the vehicle types used in the simulation.
from flow.core.params import VehicleParams

# import vehicles dynamics models
# (the two commented-out imports are the SUMO-driven controllers; the RL
# vehicles added below use RLController and StaticLaneChanger instead)
#from flow.controllers import SumoCarFollowingController
from flow.controllers import ContinuousRouter
#from flow.controllers.lane_change_controllers import SumoLaneChangeController
from flow.controllers.lane_change_controllers import StaticLaneChanger
from flow.controllers import RLController
# NOTE(review): SumoLaneChangeParams is not used in the cells visible here.
from flow.core.params import SumoLaneChangeParams
from flow.core.params import SumoCarFollowingParams
# NOTE(review): wildcard import; no name from `random` is used in the cells
# visible here. Prefer `import random` if it is needed elsewhere.
from random import *

# Empty vehicle collection; vehicle types are registered with vehicles.add(...).
vehicles = VehicleParams()

#### Add RL-Agent controlled vehicles 

In [7]:
# Car-following parameters for the RL vehicles (library default: None).
cf_parameter = SumoCarFollowingParams(speed_mode="aggressive")
# Lane-change parameters (library default: None); none are defined here.
lc_parameter = None

# Specification of the RL-controlled vehicle type. Defaults are given as noted
# by the original author.
rl_vehicle_spec = {
    # name of the vehicle type
    "veh_id": "rl",
    # acceleration controller, default: (SumoCarFollowingController, {})
    "acceleration_controller": (RLController, {}),
    # lane change controller, default: (SumoLaneChangeController, {})
    "lane_change_controller": (StaticLaneChanger, {}),
    # routing controller, default: None
    "routing_controller": (ContinuousRouter, {}),
    # initial speed, default: 0
    "initial_speed": 0,
    # number of vehicles, default: 1
    "num_vehicles": 2,
    # car-following parameters (carry the speed mode)
    "car_following_params": cf_parameter,
}
# Keyword arguments tried earlier and left disabled by the author:
#   speed_mode="aggressive"            (default: "right_of_way")
#   lane_change_mode="aggressive"      (default: "no_lat_collide")
#   sumo_car_following_params=cf_parameter
#   sumo_lc_params=lc_parameter

vehicles.add(**rl_vehicle_spec)

### Flow Parameter

In [8]:
# Bundle everything defined above into the flow_params dictionary.
# The keys must be spelled exactly as shown.
flow_params = {
    # name of the experiment
    "exp_tag": experiment_name,
    # name of the flow environment the experiment is running on
    "env_name": env_name,
    # name of the scenario class the experiment uses
    "scenario": scenario_name,
    # simulator that is used by the experiment
    "simulator": 'traci',
    # sumo-related parameters (see flow.core.params.SumoParams)
    "sim": sumo_params,
    # environment-related parameters (see flow.core.params.EnvParams)
    "env": env_params,
    # network-related parameters (see flow.core.params.NetParams and the
    # scenario's documentation or ADDITIONAL_NET_PARAMS component)
    "net": net_params,
    # vehicles to be placed in the network at the start of a rollout
    "veh": vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    "initial": initial_config,
}

# B) Training

In [9]:
import json

import ray
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph

In [10]:
# How many rollout workers run in parallel.
N_CPUS = 2
# How many rollouts are collected per training iteration.
N_ROLLOUTS = 20

# Start Ray with one CPU more than the number of workers
# (presumably for the driver/trainer process -- TODO confirm).
ray.init(num_cpus=N_CPUS + 1, redirect_output=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-09_13-11-00_30240/logs.
Waiting for redis server at 127.0.0.1:24259 to respond...
Waiting for redis server at 127.0.0.1:62896 to respond...
Starting the Plasma object store with 6.554658406 GB memory using /dev/shm.

View the web UI at http://localhost:8889/notebooks/ray_ui.ipynb?token=1c71038f05007dc71caeb68d84db6ade856d9270dd1ec1a0



{'node_ip_address': '172.16.123.117',
 'object_store_addresses': ['/tmp/ray/session_2019-04-09_13-11-00_30240/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-09_13-11-00_30240/sockets/raylet'],
 'redis_address': '172.16.123.117:24259',
 'webui_url': 'http://localhost:8889/notebooks/ray_ui.ipynb?token=1c71038f05007dc71caeb68d84db6ade856d9270dd1ec1a0'}

In [11]:
from copy import deepcopy

# The algorithm or model to train. This may be the name of a built-in
# algorithm (e.g. RLlib's DQN or PPO), or a user-defined trainable function or
# class registered in the Tune registry.
alg_run = "PPO"

agent_cls = get_agent_class(alg_run)
# Deep-copy the defaults. A shallow `.copy()` shares the nested "model" and
# "env_config" dicts with the class-level default config, so the in-place
# updates below would leak into the agent class defaults (and into every
# other config derived from them in this kernel).
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS  # number of parallel workers
config["train_batch_size"] = HORIZON * N_ROLLOUTS  # batch size
config["gamma"] = 0.999  # discount rate
config["model"].update({"fcnet_hiddens": [100, 50, 25]})  # sizes of the hidden layers
config["use_gae"] = True  # use generalized advantage estimation
config["lambda"] = 0.97  # GAE lambda
# Disabled experiments kept from the original cell:
# config["sgd_minibatch_size"] = min(16 * 1024, config["train_batch_size"])  # SGD minibatch size
# config["sample_batch_size"] = config["train_batch_size"]/config["num_workers"]  # 200 default, still too high?
config["kl_target"] = 0.02  # target KL divergence
config["num_sgd_iter"] = 10  # number of SGD iterations
config["horizon"] = HORIZON  # rollout horizon

# Save the flow params for replay: serialize flow_params to a JSON string and
# store it, together with the algorithm name, in the env config.
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)
config['env_config']['flow_params'] = flow_json
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register as rllib env with Gym
register_env(gym_name, create_env)

In [12]:
# Multi-agent policy mapping.
# A throwaway environment is created only to read the observation and action
# spaces. Creating it starts a simulator instance (see "Starting SUMO on port
# ..." in the cell output), so it is terminated as soon as the spaces have
# been read instead of staying alive for the whole training run.
test_env = create_env()
try:
    obs_space = test_env.observation_space
    act_space = test_env.action_space
finally:
    test_env.terminate()


def gen_policy():
    """Return the policy spec tuple: (policy graph class, obs space, action space, config)."""
    return (PPOPolicyGraph, obs_space, act_space, {})


# A single policy graph, 'rl_0', is defined and shared by all agents.
policy_graphs = {'rl_0': gen_policy()}


def policy_mapping_fn(agent_id):
    """Map every agent id to the single shared policy 'rl_0'."""
    return 'rl_0'


config.update({
        'multiagent': {
            'policy_graphs': policy_graphs,
            'policy_mapping_fn': tune.function(policy_mapping_fn),
            'policies_to_train': ['rl_0']
        }
    })

 Starting SUMO on port 52973


New Teamspirit:
0.976419485470094
-0.4442784097254695


In [None]:
# Tune experiment specification for this run.
experiment_spec = {
    # RL algorithm to run
    "run": alg_run,
    # environment name generated earlier
    "env": gym_name,
    # configuration params (must match the "run" value); passed as a shallow copy
    "config": dict(config),
    # number of iterations between checkpoints
    "checkpoint_freq": 1,
    "max_failures": 999,
    # stopping conditions: number of iterations to stop after
    "stop": {"training_iteration": 1000},
}

# Launch the experiment under the tag stored in flow_params.
trials = run_experiments({flow_params["exp_tag"]: experiment_spec})

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.0/16.4 GB

Created LogSyncer for /home/thorsten/ray_results/IntersectionExample/PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0_2019-04-09_13-11-02cdaml41i -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.0/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-11-57
  done: false
  episode_len_mean: 485.95
  episode_reward_max: 123.0041521056342
  episode_reward_mean: -8.499962313751194
  episode_reward_min: -200.90302216972958
  episodes_this_iter: 20
  episodes_total: 20
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 6407.958
   

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 135 s, 5 iter, 50000 ts, -50.7 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-14-11
  done: false
  episode_len_mean: 297.34
  episode_reward_max: 157.88026843530776
  episode_reward_mean: -50.07914276241572
  episode_reward_min: -212.38164665177027
  episodes_this_iter: 41
  episodes_total: 167
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4877.544
    load_time_ms: 10.471
    num_steps_sampled: 60000
    num_steps_trained: 60000
    rl_0:
      cur_kl_coeff: 0.012500002980232239
      cur_lr: 4.999999873689376e-05
      entropy: 1.380710482597351
      kl: 0.008414514362812042
     

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-16-21
  done: false
  episode_len_mean: 161.61
  episode_reward_max: 100.78826665440077
  episode_reward_mean: -52.440004880118934
  episode_reward_min: -208.23114062178718
  episodes_this_iter: 64
  episodes_total: 447
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4445.614
    load_time_ms: 1.73
    num_steps_sampled: 110000
    num_steps_trained: 110000
    rl_0:
      cur_kl_coeff: 0.00039062509313225746
      cur_lr: 4.999999873689376e-05
      entropy: 1.3647818565368652
      kl: 0.00386752188205719
      policy_loss: -0.0005341600044630468
      total_loss: 734.5462646484375
      vf_explained_var: 0.3651774525642395
      vf_loss: 734.5467529296875
    sample_time_ms: 21887.172
    update_time_ms: 5.91
  iterations_since_restore: 11
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_rewar

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 420 s, 16 iter, 160000 ts, -74.8 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-18-56
  done: false
  episode_len_mean: 131.66
  episode_reward_max: 70.42767800017589
  episode_reward_mean: -75.5627476129271
  episode_reward_min: -205.24244449241382
  episodes_this_iter: 76
  episodes_total: 862
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4299.872
    load_time_ms: 1.774
    num_steps_sampled: 170000
    num_steps_trained: 170000
    rl_0:
      cur_kl_coeff: 6.103517080191523e-06
      cur_lr: 4.999999873689376e-05
      entropy: 1.3468657732009888
      kl: 0.004225405398756266
  

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-21-21
  done: false
  episode_len_mean: 117.63
  episode_reward_max: 62.73081598320505
  episode_reward_mean: -71.7889069665056
  episode_reward_min: -206.84527056337703
  episodes_this_iter: 81
  episodes_total: 1259
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4631.72
    load_time_ms: 1.819
    num_steps_sampled: 220000
    num_steps_trained: 220000
    rl_0:
      cur_kl_coeff: 1.907349087559851e-07
      cur_lr: 4.999999873689376e-05
      entropy: 1.329553484916687
      kl: 0.0080961799249053
      policy_loss: -0.0028062909841537476
      total_loss: 991.7139282226562
      vf_explained_var: 0.45929795503616333
      vf_loss: 991.7167358398438
    sample_time_ms: 22530.555
    update_time_ms: 6.415
  iterations_since_restore: 22
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward_m

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.7/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 729 s, 27 iter, 270000 ts, -69.9 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-24-05
  done: false
  episode_len_mean: 110.98
  episode_reward_max: 52.018286963408
  episode_reward_mean: -63.471653389786134
  episode_reward_min: -203.09993689942326
  episodes_this_iter: 90
  episodes_total: 1806
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 5012.303
    load_time_ms: 1.784
    num_steps_sampled: 280000
    num_steps_trained: 280000
    rl_0:
      cur_kl_coeff: 2.980232949312267e-09
      cur_lr: 4.999999873689376e-05
      entropy: 1.2882546186447144
      kl: 0.00465485779568553
  

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-26-16
  done: false
  episode_len_mean: 103.78
  episode_reward_max: 60.768172368314126
  episode_reward_mean: -67.23992727357248
  episode_reward_min: -202.27829937848736
  episodes_this_iter: 96
  episodes_total: 2277
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4503.441
    load_time_ms: 1.766
    num_steps_sampled: 330000
    num_steps_trained: 330000
    rl_0:
      cur_kl_coeff: 9.313227966600834e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.2451201677322388
      kl: 0.0038572573103010654
      policy_loss: -0.0020013332832604647
      total_loss: 1148.1849365234375
      vf_explained_var: 0.523297905921936
      vf_loss: 1148.18701171875
    sample_time_ms: 22277.217
    update_time_ms: 5.962
  iterations_since_restore: 33
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_rew

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 1027 s, 38 iter, 380000 ts, -57.8 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-29-04
  done: false
  episode_len_mean: 106.39
  episode_reward_max: 73.30477179951754
  episode_reward_mean: -53.15155035871268
  episode_reward_min: -203.53502362976653
  episodes_this_iter: 94
  episodes_total: 2856
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4732.046
    load_time_ms: 1.752
    num_steps_sampled: 390000
    num_steps_trained: 390000
    rl_0:
      cur_kl_coeff: 1.4551918697813804e-12
      cur_lr: 4.999999873689376e-05
      entropy: 1.196708083152771
      kl: 0.004398655146360397

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-31-13
  done: false
  episode_len_mean: 107.98
  episode_reward_max: 78.88430261738985
  episode_reward_mean: -40.688788269471154
  episode_reward_min: -205.90763808229485
  episodes_this_iter: 93
  episodes_total: 3326
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4696.059
    load_time_ms: 1.772
    num_steps_sampled: 440000
    num_steps_trained: 440000
    rl_0:
      cur_kl_coeff: 4.5474745930668137e-14
      cur_lr: 4.999999873689376e-05
      entropy: 1.1462059020996094
      kl: 0.004547902848571539
      policy_loss: -0.0016135374316945672
      total_loss: 1210.069091796875
      vf_explained_var: 0.4250567555427551
      vf_loss: 1210.0706787109375
    sample_time_ms: 22085.148
    update_time_ms: 6.299
  iterations_since_restore: 44
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_r

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 1316 s, 49 iter, 490000 ts, -52.1 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-33-54
  done: false
  episode_len_mean: 103.3
  episode_reward_max: 63.46602818085465
  episode_reward_mean: -59.91273278871914
  episode_reward_min: -204.18160275508856
  episodes_this_iter: 98
  episodes_total: 3910
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4342.332
    load_time_ms: 1.821
    num_steps_sampled: 500000
    num_steps_trained: 500000
    rl_0:
      cur_kl_coeff: 7.105429051666896e-16
      cur_lr: 4.999999873689376e-05
      entropy: 1.0888195037841797
      kl: 0.006299504078924656


Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-36-07
  done: false
  episode_len_mean: 103.97
  episode_reward_max: 49.343749531203066
  episode_reward_mean: -43.081170708716535
  episode_reward_min: -203.6717731693624
  episodes_this_iter: 96
  episodes_total: 4406
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4333.217
    load_time_ms: 1.74
    num_steps_sampled: 550000
    num_steps_trained: 550000
    rl_0:
      cur_kl_coeff: 2.220446578645905e-17
      cur_lr: 4.999999873689376e-05
      entropy: 1.054858684539795
      kl: 0.005582212004810572
      policy_loss: -0.0005996286636218429
      total_loss: 1320.48681640625
      vf_explained_var: 0.4547688066959381
      vf_loss: 1320.4874267578125
    sample_time_ms: 22498.957
    update_time_ms: 5.466
  iterations_since_restore: 55
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_rewar

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.8/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 1614 s, 60 iter, 600000 ts, -45.7 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-38-53
  done: false
  episode_len_mean: 99.15841584158416
  episode_reward_max: 45.143505880240504
  episode_reward_mean: -54.198587135743416
  episode_reward_min: -203.49129793454688
  episodes_this_iter: 101
  episodes_total: 4996
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4491.573
    load_time_ms: 1.595
    num_steps_sampled: 610000
    num_steps_trained: 610000
    rl_0:
      cur_kl_coeff: 3.4694477791342267e-19
      cur_lr: 4.999999873689376e-05
      entropy: 0.9955657720565796
      kl: 0.004

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-41-03
  done: false
  episode_len_mean: 94.97115384615384
  episode_reward_max: 31.830785021328516
  episode_reward_mean: -68.80393622429686
  episode_reward_min: -203.68874930578795
  episodes_this_iter: 104
  episodes_total: 5515
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4541.04
    load_time_ms: 1.583
    num_steps_sampled: 660000
    num_steps_trained: 660000
    rl_0:
      cur_kl_coeff: 1.0842024309794459e-20
      cur_lr: 4.999999873689376e-05
      entropy: 0.9410213828086853
      kl: 0.0038620501291006804
      policy_loss: -0.0009526694193482399
      total_loss: 1386.31005859375
      vf_explained_var: 0.47105103731155396
      vf_loss: 1386.3111572265625
    sample_time_ms: 22194.964
    update_time_ms: 6.238
  iterations_since_restore: 66
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 3030

  custom_metrics: {}
  date: 2019-04-09_13-43-06
  done: false
  episode_len_mean: 94.65094339622641
  episode_reward_max: 30.22451712628606
  episode_reward_mean: -65.2384583380912
  episode_reward_min: -202.51230409914288
  episodes_this_iter: 106
  episodes_total: 6035
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4285.764
    load_time_ms: 1.613
    num_steps_sampled: 710000
    num_steps_trained: 710000
    rl_0:
      cur_kl_coeff: 3.3881325968107683e-22
      cur_lr: 4.999999873689376e-05
      entropy: 0.9223096966743469
      kl: 0.005640849936753511
      policy_loss: -0.0015313720796257257
      total_loss: 1391.4886474609375
      vf_explained_var: 0.42455512285232544
      vf_loss: 1391.490234375
    sample_time_ms: 21025.348
    update_time_ms: 5.675
  iterations_since_restore: 71
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward_mean:
    rl_0: -32.61922916904559
  time_since_restore: 

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 2020 s, 76 iter, 760000 ts, -77.5 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-45-38
  done: false
  episode_len_mean: 93.4392523364486
  episode_reward_max: 20.174919777410935
  episode_reward_mean: -68.14959590190334
  episode_reward_min: -202.37220308616924
  episodes_this_iter: 107
  episodes_total: 6673
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4247.27
    load_time_ms: 1.667
    num_steps_sampled: 770000
    num_steps_trained: 770000
    rl_0:
      cur_kl_coeff: 5.2939571825168255e-24
      cur_lr: 4.999999873689376e-05
      entropy: 0.8554137945175171
      kl: 0.005289

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-47-49
  done: false
  episode_len_mean: 97.7843137254902
  episode_reward_max: 23.786416796915425
  episode_reward_mean: -58.363244948499194
  episode_reward_min: -203.70146304790006
  episodes_this_iter: 102
  episodes_total: 7202
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4463.251
    load_time_ms: 1.792
    num_steps_sampled: 820000
    num_steps_trained: 820000
    rl_0:
      cur_kl_coeff: 1.654361619536508e-25
      cur_lr: 4.999999873689376e-05
      entropy: 0.8472520709037781
      kl: 0.0048398603685200214
      policy_loss: -0.0009178169420920312
      total_loss: 1396.052978515625
      vf_explained_var: 0.37273576855659485
      vf_loss: 1396.053955078125
    sample_time_ms: 21145.761
    update_time_ms: 7.027
  iterations_since_restore: 82
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 3030

  custom_metrics: {}
  date: 2019-04-09_13-50-07
  done: false
  episode_len_mean: 92.70370370370371
  episode_reward_max: 26.391072902676996
  episode_reward_mean: -72.28958742809962
  episode_reward_min: -202.8307724919642
  episodes_this_iter: 108
  episodes_total: 7723
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4590.027
    load_time_ms: 1.693
    num_steps_sampled: 870000
    num_steps_trained: 870000
    rl_0:
      cur_kl_coeff: 5.1698800610515874e-27
      cur_lr: 4.999999873689376e-05
      entropy: 0.8248167634010315
      kl: 0.004779214505106211
      policy_loss: -0.0021821013651788235
      total_loss: 1322.5924072265625
      vf_explained_var: 0.4541299343109131
      vf_loss: 1322.5946044921875
    sample_time_ms: 22250.036
    update_time_ms: 7.906
  iterations_since_restore: 87
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward_mean:
    rl_0: -36.144793714049804
  time_since_rest

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 5.9/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 2439 s, 92 iter, 920000 ts, -59 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-52-37
  done: false
  episode_len_mean: 94.93396226415095
  episode_reward_max: 20.871463203140422
  episode_reward_mean: -66.91782162817799
  episode_reward_min: -201.6255191237148
  episodes_this_iter: 106
  episodes_total: 8347
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4383.603
    load_time_ms: 1.56
    num_steps_sampled: 930000
    num_steps_trained: 930000
    rl_0:
      cur_kl_coeff: 8.077937595393105e-29
      cur_lr: 4.999999873689376e-05
      entropy: 0.8112401962280273
      kl: 0.005716491

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-54-43
  done: false
  episode_len_mean: 90.35454545454546
  episode_reward_max: 15.335283187907388
  episode_reward_mean: -77.65931072168493
  episode_reward_min: -202.18043071138104
  episodes_this_iter: 110
  episodes_total: 8882
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4297.831
    load_time_ms: 1.669
    num_steps_sampled: 980000
    num_steps_trained: 980000
    rl_0:
      cur_kl_coeff: 2.5243554985603454e-30
      cur_lr: 4.999999873689376e-05
      entropy: 0.7727394104003906
      kl: 0.005413609091192484
      policy_loss: -0.001954768318682909
      total_loss: 1461.2982177734375
      vf_explained_var: 0.41504189372062683
      vf_loss: 1461.300537109375
    sample_time_ms: 20295.61
    update_time_ms: 6.647
  iterations_since_restore: 98
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302

  custom_metrics: {}
  date: 2019-04-09_13-56-56
  done: false
  episode_len_mean: 94.84761904761905
  episode_reward_max: 17.29877167097174
  episode_reward_mean: -67.0170028654003
  episode_reward_min: -201.17333283759478
  episodes_this_iter: 105
  episodes_total: 9430
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4639.564
    load_time_ms: 1.644
    num_steps_sampled: 1030000
    num_steps_trained: 1030000
    rl_0:
      cur_kl_coeff: 7.888610933001079e-32
      cur_lr: 4.999999873689376e-05
      entropy: 0.7307636141777039
      kl: 0.004998000804334879
      policy_loss: -0.0006135260919108987
      total_loss: 1418.1708984375
      vf_explained_var: 0.44998258352279663
      vf_loss: 1418.171630859375
    sample_time_ms: 21186.633
    update_time_ms: 6.871
  iterations_since_restore: 103
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward_mean:
    rl_0: -33.50850143270015
  time_since_restore

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.0/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 2858 s, 108 iter, 1080000 ts, -98.5 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_13-59-36
  done: false
  episode_len_mean: 91.85321100917432
  episode_reward_max: 12.887953478583832
  episode_reward_mean: -75.56262462262679
  episode_reward_min: -201.09537817012693
  episodes_this_iter: 109
  episodes_total: 10086
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4849.177
    load_time_ms: 1.674
    num_steps_sampled: 1090000
    num_steps_trained: 1090000
    rl_0:
      cur_kl_coeff: 1.2325954582814187e-33
      cur_lr: 4.999999873689376e-05
      entropy: 0.6948843002319336
      kl: 0

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.0/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 2980 s, 113 iter, 1130000 ts, -85.6 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-01-38
  done: false
  episode_len_mean: 89.54954954954955
  episode_reward_max: 10.073178500607742
  episode_reward_mean: -79.03862179997014
  episode_reward_min: -200.9305255220726
  episodes_this_iter: 111
  episodes_total: 10636
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4291.066
    load_time_ms: 1.784
    num_steps_sampled: 1140000
    num_steps_trained: 1140000
    rl_0:
      cur_kl_coeff: 3.8518608071294333e-35
      cur_lr: 4.999999873689376e-05
      entropy: 0.6684631109237671
      kl: 0.

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 3103 s, 118 iter, 1180000 ts, -76.6 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-03-42
  done: false
  episode_len_mean: 90.74774774774775
  episode_reward_max: 12.25806354058453
  episode_reward_mean: -79.86433540043275
  episode_reward_min: -200.96411593953837
  episodes_this_iter: 111
  episodes_total: 11182
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4179.566
    load_time_ms: 1.774
    num_steps_sampled: 1190000
    num_steps_trained: 1190000
    rl_0:
      cur_kl_coeff: 1.2037065022279479e-36
      cur_lr: 4.999999873689376e-05
      entropy: 0.6238279938697815
      kl: 0.

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 3223 s, 123 iter, 1230000 ts, -69.2 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-05-42
  done: false
  episode_len_mean: 89.19469026548673
  episode_reward_max: 8.905845322412418
  episode_reward_mean: -82.83973063948339
  episode_reward_min: -200.57095371952303
  episodes_this_iter: 113
  episodes_total: 11735
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4172.982
    load_time_ms: 1.75
    num_steps_sampled: 1240000
    num_steps_trained: 1240000
    rl_0:
      cur_kl_coeff: 3.761582819462337e-38
      cur_lr: 4.999999873689376e-05
      entropy: 0.5763477087020874
      kl: 0.00

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-07-43
  done: false
  episode_len_mean: 99.43
  episode_reward_max: 7.2591075337985735
  episode_reward_mean: -57.04698642384553
  episode_reward_min: -200.7021090440061
  episodes_this_iter: 100
  episodes_total: 12291
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4227.969
    load_time_ms: 1.636
    num_steps_sampled: 1290000
    num_steps_trained: 1290000
    rl_0:
      cur_kl_coeff: 1.1754946310819804e-39
      cur_lr: 4.999999873689376e-05
      entropy: 0.5695739388465881
      kl: 0.007411550264805555
      policy_loss: -0.0025158654898405075
      total_loss: 1437.813720703125
      vf_explained_var: 0.41832923889160156
      vf_loss: 1437.816162109375
    sample_time_ms: 19802.261
    update_time_ms: 5.67
  iterations_since_restore: 129
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy

  custom_metrics: {}
  date: 2019-04-09_14-09-45
  done: false
  episode_len_mean: 89.75
  episode_reward_max: 7.461549153822463
  episode_reward_mean: -83.86948904324979
  episode_reward_min: -200.6686016920333
  episodes_this_iter: 112
  episodes_total: 12852
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4319.073
    load_time_ms: 1.711
    num_steps_sampled: 1340000
    num_steps_trained: 1340000
    rl_0:
      cur_kl_coeff: 3.6733637943810755e-41
      cur_lr: 4.999999873689376e-05
      entropy: 0.5185015797615051
      kl: 0.008375751785933971
      policy_loss: -0.0020104916766285896
      total_loss: 1385.0006103515625
      vf_explained_var: 0.49867844581604004
      vf_loss: 1385.0025634765625
    sample_time_ms: 19911.089
    update_time_ms: 5.69
  iterations_since_restore: 134
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward_mean:
    rl_0: -41.93474452162489
  time_since_restore: 3489.

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 3617 s, 139 iter, 1390000 ts, -86.7 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-12-17
  done: false
  episode_len_mean: 86.94017094017094
  episode_reward_max: 6.9403164476752295
  episode_reward_mean: -88.98763519562172
  episode_reward_min: -200.317571291991
  episodes_this_iter: 117
  episodes_total: 13528
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4322.889
    load_time_ms: 1.775
    num_steps_sampled: 1400000
    num_steps_trained: 1400000
    rl_0:
      cur_kl_coeff: 5.74532370373175e-43
      cur_lr: 4.999999873689376e-05
      entropy: 0.48776695132255554
      kl: 0.00

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.1/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 3739 s, 144 iter, 1440000 ts, -87.8 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-14-21
  done: false
  episode_len_mean: 94.19626168224299
  episode_reward_max: 7.744306652348826
  episode_reward_mean: -68.74162317977482
  episode_reward_min: -200.22193907564665
  episodes_this_iter: 107
  episodes_total: 14088
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4217.123
    load_time_ms: 1.58
    num_steps_sampled: 1450000
    num_steps_trained: 1450000
    rl_0:
      cur_kl_coeff: 1.8216880036222622e-44
      cur_lr: 4.999999873689376e-05
      entropy: 0.45043256878852844
      kl: 0.

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.2/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 3871 s, 149 iter, 1490000 ts, -80.4 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-16-32
  done: false
  episode_len_mean: 83.13333333333334
  episode_reward_max: 7.904371470771968
  episode_reward_mean: -98.1322138507479
  episode_reward_min: -200.18534041383816
  episodes_this_iter: 120
  episodes_total: 14662
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4329.632
    load_time_ms: 1.567
    num_steps_sampled: 1500000
    num_steps_trained: 1500000
    rl_0:
      cur_kl_coeff: 0.0
      cur_lr: 4.999999873689376e-05
      entropy: 0.40174999833106995
      kl: 0.006670753005892038


Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-18-34
  done: false
  episode_len_mean: 89.23214285714286
  episode_reward_max: 7.228048956325394
  episode_reward_mean: -82.66931904659292
  episode_reward_min: -200.21733781737663
  episodes_this_iter: 112
  episodes_total: 15219
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4327.182
    load_time_ms: 1.686
    num_steps_sampled: 1550000
    num_steps_trained: 1550000
    rl_0:
      cur_kl_coeff: 0.0
      cur_lr: 4.999999873689376e-05
      entropy: 0.3707539141178131
      kl: 0.00557762011885643
      policy_loss: -0.0013121970696374774
      total_loss: 1370.0377197265625
      vf_explained_var: 0.5012130737304688
      vf_loss: 1370.038818359375
    sample_time_ms: 20918.954
    update_time_ms: 5.926
  iterations_since_restore: 155
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 30302
  policy_reward

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 6.3/16.4 GB
Result logdir: /home/thorsten/ray_results/IntersectionExample
RUNNING trials:
 - PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:	RUNNING [pid=30302], 4138 s, 160 iter, 1600000 ts, -91.5 rew

Result for PPO_MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit-v0_0:
  custom_metrics: {}
  date: 2019-04-09_14-21-03
  done: false
  episode_len_mean: 87.56521739130434
  episode_reward_max: 7.174711101959087
  episode_reward_mean: -86.8148458888353
  episode_reward_min: -200.1712515667041
  episodes_this_iter: 115
  episodes_total: 15915
  experiment_id: 22b1cc9446844e61a011e26f0fdde742
  hostname: Gandalf
  info:
    grad_time_ms: 4187.136
    load_time_ms: 1.7
    num_steps_sampled: 1610000
    num_steps_trained: 1610000
    rl_0:
      cur_kl_coeff: 0.0
      cur_lr: 4.999999873689376e-05
      entropy: 0.32506662607192993
      kl: 0.007866689004004002
   