# TRAINING I3W


# A) Create Environment, Vehicles, etc.

### General Parameters

In [1]:
# Length of one rollout, kept in a single constant so it is used consistently
# across the notebook.
# (Original note: 103 is the maximum horizon if a rollout should be aborted
# before leaving; the default was 500.)
HORIZON = 250

# name of the experiment
experiment_name = "TenaciousD"

# scenario class: list what Flow offers, then pick one by name
import flow.scenarios as scenarios
print("Available scenarios:", scenarios.__all__, sep="\n")
scenario_name = "TenaciousDScenario"

# environment class: list the multi-agent environments, then pick one by name
import flow.multiagent_envs as flowenvs
print("\nAvailable environments:", flowenvs.__all__, sep="\n")
env_name = "MultiTenaciousDEnv"

Available scenarios:
['Scenario', 'BayBridgeScenario', 'BayBridgeTollScenario', 'BottleneckScenario', 'Figure8Scenario', 'SimpleGridScenario', 'HighwayScenario', 'LoopScenario', 'MergeScenario', 'TwoLoopsOneMergingScenario', 'MultiLoopScenario', 'IntersectionScenarioTW', 'TenaciousDScenario']

Available environments:
['MultiEnv', 'MultiAgentAccelEnv', 'MultiWaveAttenuationPOEnv', 'MultiAgentIntersectionEnv', 'MultiAgentTeamSpiritIntersectionEnv', 'MultiAgentIntersectionEnv_baseline_1', 'MultiAgentIntersectionEnv_baseline_2', 'MultiAgentIntersectionEnv_baseline_3', 'MultiAgentIntersectionEnv_sharedPolicy_TeamSpirit', 'MultiTenaciousDEnv']


### Net Parameters

In [2]:
from flow.core.params import NetParams
from flow.scenarios.intersection import ADDITIONAL_NET_PARAMS

# Scenario-specific network settings passed through NetParams.
additionalNetParams = dict(
    length=230,
    lanes=1,
    speed_limit=15,
    resolution=40,
)

net_params = NetParams(
    # default: True  !! (set to False so that junctions are not skipped)
    no_internal_links=False,
    # default: None
    inflows=None,
    # default: None
    osm_path=None,
    # default: None
    netfile=None,
    # default: None  !!
    additional_params=additionalNetParams,
)

### InitialConfig Parameters

In [3]:
from flow.core.params import InitialConfig

# Edges passed to InitialConfig as `edges_distribution`.
initialEdges = ["right_upper", "right_lower", "left_upper", "left_lower"]
print(initialEdges)

initial_config = InitialConfig(
    shuffle=True,                     # default: False          !!
    spacing="custom",                 # default: "uniform"      !!
    min_gap=10,                       # default: 0
    perturbation=0.0,                 # default: 0.0            !!
    x0=0,                             # default: 0
    bunching=0,                       # default: 0
    lanes_distribution=float("inf"),  # default: float("inf")
    edges_distribution=initialEdges,  # default: "all"          !!
    additional_params=None,           # default: None
)

['right_upper', 'right_lower', 'left_upper', 'left_lower']


### SUMO Parameters

In [4]:
from flow.core.params import SumoParams

# Simulator settings; the trailing comments record each argument's default
# as noted by the notebook author.
sumo_params = SumoParams(
    port=None,                # default: None
    sim_step=0.1,             # default: 0.1
    emission_path=None,       # default: None
    lateral_resolution=None,  # default: None
    no_step_log=True,         # default: True
    render=False,             # default: False !!
    save_render=False,        # default: False
    sight_radius=25,          # default: 25
    show_radius=False,        # default: False
    pxpm=2,                   # default: 2
    overtake_right=False,     # default: False
    seed=None,                # default: None
    restart_instance=False,   # default: False
    print_warnings=True,      # default: True
    teleport_time=-1,         # default: -1
    num_clients=1,            # default: 1
    sumo_binary=None,         # default: None
)

### Environment Parameters

In [5]:
from flow.core.params import EnvParams

# Environment-specific settings passed through EnvParams.
additionalEnvParams = dict(
    max_accel=1,
    max_decel=1,
    ring_length=[230, 230],
    target_velocity=4,
)

env_params = EnvParams(
    additional_params=additionalEnvParams,  # default: None    !!
    horizon=HORIZON,                        # default: 500     !!
    warmup_steps=0,                         # default: 0
    sims_per_step=1,                        # default: 1
    evaluate=False,                         # default: False
)

### Vehicle Parameters

In [6]:
from flow.core.params import VehicleParams

# import vehicles dynamics models
#from flow.controllers import SumoCarFollowingController
from flow.controllers import ContinuousRouter
from flow.controllers import TenaciousDRouter
#from flow.controllers.lane_change_controllers import SumoLaneChangeController
from flow.controllers.lane_change_controllers import StaticLaneChanger
from flow.controllers import RLController
from flow.core.params import SumoLaneChangeParams
from flow.core.params import SumoCarFollowingParams
# NOTE(review): wildcard import; no name from `random` is used in the visible
# cells -- prefer an explicit import, or drop it if nothing later depends on it.
from random import *

# Container that collects the vehicle types added in the following cells.
vehicles = VehicleParams()

#### Add RL-Agent controlled vehicles 

In [7]:
# Start from an empty container so that re-running this cell does not register
# the "rl" vehicles a second time: every vehicles.add call adds to the existing
# container, so a repeated run would otherwise accumulate vehicles.
vehicles = VehicleParams()

# car following parameters, default: None
cf_parameter = SumoCarFollowingParams(speed_mode="aggressive")
# lane change parameters, default: None (kept for reference; not passed below)
lc_parameter = None

vehicles.add(
    # name of the vehicle
    veh_id="rl",
    # acceleration controller, default: (SumoCarFollowingController, {})
    acceleration_controller=(RLController, {}),
    # lane_change_controller, default: (SumoLaneChangeController, {})
    lane_change_controller=(StaticLaneChanger, {}),
    # routing controller, default: None
    routing_controller=(TenaciousDRouter, {}),
    # initial speed, default: 0
    initial_speed=0,
    # number of vehicles, default: 1
    num_vehicles=2,
    # car following parameters; the speed mode ("aggressive") is set here
    car_following_params=cf_parameter,
)

### Flow Parameters

In [8]:
# Bundle everything defined above into the flow_params dictionary.
# The keys must be spelled exactly as below.
flow_params = {
    # name of the experiment
    "exp_tag": experiment_name,
    # name of the flow environment the experiment is running on
    "env_name": env_name,
    # name of the scenario class the experiment uses
    "scenario": scenario_name,
    # simulator that is used by the experiment
    "simulator": 'traci',
    # sumo-related parameters (see flow.core.params.SumoParams)
    "sim": sumo_params,
    # environment related parameters (see flow.core.params.EnvParams)
    "env": env_params,
    # network-related parameters (see flow.core.params.NetParams and
    # the scenario's documentation or ADDITIONAL_NET_PARAMS component)
    "net": net_params,
    # vehicles to be placed in the network at the start of a rollout
    # (see flow.core.params.VehicleParams)
    "veh": vehicles,
    # (optional) parameters affecting the positioning of vehicles upon
    # initialization/reset (see flow.core.params.InitialConfig)
    "initial": initial_config,
}

# B) Training

In [9]:
import json

import ray
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph

In [10]:
# number of parallel workers
N_CPUS = 2
# number of rollouts per training iteration
N_ROLLOUTS = 20

# Ray gets one CPU more than the number of rollout workers (presumably for the
# process that drives training -- the status output reports 3/3 CPUs in use).
# ignore_reinit_error=True lets this cell be re-run in a live kernel instead of
# raising because Ray has already been initialised.
ray.init(redirect_output=True, num_cpus=N_CPUS + 1, ignore_reinit_error=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-30_21-06-55_21411/logs.
Waiting for redis server at 127.0.0.1:56915 to respond...
Waiting for redis server at 127.0.0.1:19476 to respond...
Starting the Plasma object store with 6.554658406 GB memory using /dev/shm.

View the web UI at http://localhost:8888/notebooks/ray_ui.ipynb?token=19cd5e26f99c4fded114ee1d7c03c3b6fc72b1be25a567c0



{'node_ip_address': '172.16.123.117',
 'object_store_addresses': ['/tmp/ray/session_2019-04-30_21-06-55_21411/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-30_21-06-55_21411/sockets/raylet'],
 'redis_address': '172.16.123.117:56915',
 'webui_url': 'http://localhost:8888/notebooks/ray_ui.ipynb?token=19cd5e26f99c4fded114ee1d7c03c3b6fc72b1be25a567c0'}

In [11]:
from copy import deepcopy

# The algorithm or model to train. This may refer to the name of a built-in
# algorithm (e.g. RLlib's DQN or PPO), or to a user-defined trainable function
# or class registered in the tune registry.
alg_run = "PPO"

agent_cls = get_agent_class(alg_run)
# Deep copy on purpose: the nested "model" and "env_config" dicts are modified
# below, and a shallow .copy() would write those changes into the agent class's
# shared default configuration.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = N_CPUS  # number of parallel workers
config["train_batch_size"] = HORIZON * N_ROLLOUTS  # batch size
config["gamma"] = 0.999  # discount rate, default 0.999
config["model"].update({"fcnet_hiddens": [100, 50, 25]})  # hidden layer sizes (author's note: default 64, 32)
config["use_gae"] = True  # using generalized advantage estimation
config["lambda"] = 0.97
# Alternatives left disabled by the author:
#config["sgd_minibatch_size"] = min(16 * 1024, config["train_batch_size"])  # stochastic gradient descent
#config["sample_batch_size"] = config["train_batch_size"]/config["num_workers"] # 200 default, still too high?
config["kl_target"] = 0.02  # target KL divergence
config["num_sgd_iter"] = 10  # number of SGD iterations
config["horizon"] = HORIZON  # rollout horizon

# save the flow params for replay
flow_json = json.dumps(flow_params, cls=FlowParamsEncoder, sort_keys=True,
                       indent=4)  # generating a string version of flow_params
config['env_config']['flow_params'] = flow_json  # adding the flow_params to config dict
config['env_config']['run'] = alg_run

# Call the utility function make_create_env to be able to
# register the Flow env for this experiment
create_env, gym_name = make_create_env(params=flow_params, version=0)

# Register as rllib env with Gym
register_env(gym_name, create_env)

In [12]:
# Multi-agent policy mapping: build one environment instance to read the
# observation and action spaces used by the policy definition below.
# NOTE(review): creating the environment starts a SUMO process (see the cell
# output) that is not shut down in this cell -- confirm whether it should be.
test_env = create_env()
obs_space, act_space = test_env.observation_space, test_env.action_space

def gen_policy():
    """Return the policy spec tuple: (policy class, obs space, action space, config)."""
    policy_config = {}
    return (PPOPolicyGraph, obs_space, act_space, policy_config)

# Policy graphs keyed by policy id; a single PPO policy, 'rl_0', is defined.
policy_graphs = dict(rl_0=gen_policy())
    
def policy_mapping_fn(agent_id):
    """Return the policy id for ``agent_id``; every agent uses ``'rl_0'``."""
    shared_policy_id = 'rl_0'
    return shared_policy_id

# Multi-agent settings: the available policies, the agent-to-policy mapping,
# and the policies that are trained.
config['multiagent'] = {
    'policy_graphs': policy_graphs,
    'policy_mapping_fn': tune.function(policy_mapping_fn),
    'policies_to_train': ['rl_0'],
}

 Starting SUMO on port 55157


New Teamspirit:
-0.5307131062447403
0.19154012965531098
[('left_lower', 17.9513335971836), ('right_lower', 2.693978064496614)]


In [None]:
# Launch training through Tune; the experiment is keyed by its tag.
trials = run_experiments({
    flow_params["exp_tag"]: dict(
        run=alg_run,              # RL algorithm to run
        env=gym_name,             # environment name generated earlier
        config=dict(config),      # configuration params (must match "run" value)
        checkpoint_freq=1,        # number of iterations between checkpoints
        max_failures=999,
        # stopping conditions: number of iterations to stop after
        stop=dict(training_iteration=1000),
    ),
})

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/3 CPUs, 0/1 GPUs
Memory usage on this node: 9.7/16.4 GB

Created LogSyncer for /home/thorsten/ray_results/TenaciousD/PPO_MultiTenaciousDEnv-v0_0_2019-04-30_21-06-58ub1zbzov -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 9.8/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-08-07
  done: false
  episode_len_mean: 250.0
  episode_reward_max: 72.02327294833994
  episode_reward_mean: 35.444386482561875
  episode_reward_min: 12.833629996117578
  episodes_this_iter: 19
  episodes_total: 19
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 3545.259
    load_time_ms: 48.157
    num_steps_sampled: 5000
    num_steps_trained: 5000
    rl_0:
      cur_kl_coeff: 0.200

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-09-23
  done: false
  episode_len_mean: 236.37
  episode_reward_max: 277.3762538356018
  episode_reward_mean: 101.89552162606361
  episode_reward_min: -158.01989317867452
  episodes_this_iter: 24
  episodes_total: 125
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2889.696
    load_time_ms: 9.447
    num_steps_sampled: 30000
    num_steps_trained: 30000
    rl_0:
      cur_kl_coeff: 0.012500000186264515
      cur_lr: 4.999999873689376e-05
      entropy: 1.3667470216751099
      kl: 0.008067733608186245
      policy_loss: -0.0027757249772548676
      total_loss: 265.32000732421875
      vf_explained_var: 0.033978354185819626
      vf_loss: 265.3227233886719
    sample_time_ms: 13344.062
    update_time_ms: 174.761
  iterations_since_restore: 6
  node_ip: 172.16.123.117
  num_metric_batches_dropped: 0
  pid: 21459
  policy_reward_mean:
    rl_0: 50.947760813

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.7/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 176 s, 11 iter, 55000 ts, 156 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-10-55
  done: false
  episode_len_mean: 179.31
  episode_reward_max: 306.4172341068853
  episode_reward_mean: 160.96080367274035
  episode_reward_min: -166.9140972576301
  episodes_this_iter: 29
  episodes_total: 286
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2802.743
    load_time_ms: 1.545
    num_steps_sampled: 60000
    num_steps_trained: 60000
    rl_0:
      cur_kl_coeff: 0.00019531250291038305
      cur_lr: 4.999999873689376e-05
      entropy: 1.380293846130371
      kl: 0.004193977452814579
      policy_loss: -0.0009535656427033246
      total_loss: 339.9620056152344

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.7/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 263 s, 17 iter, 85000 ts, 142 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-12-22
  done: false
  episode_len_mean: 145.27
  episode_reward_max: 311.1265128906813
  episode_reward_mean: 140.2829821501003
  episode_reward_min: -163.58790706156336
  episodes_this_iter: 35
  episodes_total: 482
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2692.655
    load_time_ms: 1.437
    num_steps_sampled: 90000
    num_steps_trained: 90000
    rl_0:
      cur_kl_coeff: 3.051757857974735e-06
      cur_lr: 4.999999873689376e-05
      entropy: 1.3559229373931885
      kl: 0.00841008685529232
      policy_loss: -0.0018155574798583984
      total_loss: 529.7063598632812


== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.8/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 350 s, 23 iter, 115000 ts, 166 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-13-51
  done: false
  episode_len_mean: 126.66
  episode_reward_max: 304.454894030646
  episode_reward_mean: 169.342560132717
  episode_reward_min: -160.37879003935444
  episodes_this_iter: 39
  episodes_total: 718
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2678.945
    load_time_ms: 1.526
    num_steps_sampled: 120000
    num_steps_trained: 120000
    rl_0:
      cur_kl_coeff: 4.7683716530855236e-08
      cur_lr: 4.999999873689376e-05
      entropy: 1.3121503591537476
      kl: 0.0035427070688456297
      policy_loss: -0.0017069587484002113
      total_loss: 655.9857788085

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.8/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 443 s, 29 iter, 145000 ts, 150 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-15-24
  done: false
  episode_len_mean: 112.34
  episode_reward_max: 311.47946095321464
  episode_reward_mean: 116.4063307115126
  episode_reward_min: -164.8855975359034
  episodes_this_iter: 48
  episodes_total: 973
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2752.712
    load_time_ms: 1.578
    num_steps_sampled: 150000
    num_steps_trained: 150000
    rl_0:
      cur_kl_coeff: 7.450580707946131e-10
      cur_lr: 4.999999873689376e-05
      entropy: 1.2481536865234375
      kl: 0.007066559977829456
      policy_loss: -0.004497542977333069
      total_loss: 1119.2570800781

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.8/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 532 s, 35 iter, 175000 ts, 121 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-16-54
  done: false
  episode_len_mean: 106.87
  episode_reward_max: 309.05449782799656
  episode_reward_mean: 100.91186300908365
  episode_reward_min: -169.09757844243563
  episodes_this_iter: 48
  episodes_total: 1242
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2688.385
    load_time_ms: 1.532
    num_steps_sampled: 180000
    num_steps_trained: 180000
    rl_0:
      cur_kl_coeff: 1.1641532356165829e-11
      cur_lr: 4.999999873689376e-05
      entropy: 1.1761587858200073
      kl: 0.004308720584958792
      policy_loss: -0.0014069664757698774
      total_loss: 1046.68774

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.9/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 624 s, 41 iter, 205000 ts, 131 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-18-26
  done: false
  episode_len_mean: 99.44
  episode_reward_max: 300.77836624764893
  episode_reward_mean: 105.3422814494574
  episode_reward_min: -165.0138887978151
  episodes_this_iter: 50
  episodes_total: 1526
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2723.58
    load_time_ms: 1.474
    num_steps_sampled: 210000
    num_steps_trained: 210000
    rl_0:
      cur_kl_coeff: 3.6379788613018216e-13
      cur_lr: 4.999999873689376e-05
      entropy: 1.166243553161621
      kl: 0.003026565769687295
      policy_loss: -0.0022669711615890265
      total_loss: 990.4140625
   

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 10.9/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 713 s, 47 iter, 235000 ts, 120 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-19-55
  done: false
  episode_len_mean: 99.52
  episode_reward_max: 306.0177255628034
  episode_reward_mean: 117.55834648655934
  episode_reward_min: -166.25178118147278
  episodes_this_iter: 50
  episodes_total: 1821
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2782.136
    load_time_ms: 1.56
    num_steps_sampled: 240000
    num_steps_trained: 240000
    rl_0:
      cur_kl_coeff: 5.684341970784096e-15
      cur_lr: 4.999999873689376e-05
      entropy: 1.1504676342010498
      kl: 0.00628162594512105
      policy_loss: -0.0016935255844146013
      total_loss: 1057.5936279296

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.0/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 802 s, 53 iter, 265000 ts, 118 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-21-23
  done: false
  episode_len_mean: 97.97
  episode_reward_max: 303.18921086396784
  episode_reward_mean: 82.67370054109688
  episode_reward_min: -164.24227236173263
  episodes_this_iter: 50
  episodes_total: 2115
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2697.515
    load_time_ms: 1.485
    num_steps_sampled: 270000
    num_steps_trained: 270000
    rl_0:
      cur_kl_coeff: 8.88178432935015e-17
      cur_lr: 4.999999873689376e-05
      entropy: 1.0880571603775024
      kl: 0.005720173008739948
      policy_loss: -0.0022681679110974073
      total_loss: 1268.169799804

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.0/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 890 s, 59 iter, 295000 ts, 135 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-22-52
  done: false
  episode_len_mean: 100.52
  episode_reward_max: 306.57238833330354
  episode_reward_mean: 125.41956396046153
  episode_reward_min: -164.32342776976222
  episodes_this_iter: 51
  episodes_total: 2409
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2764.297
    load_time_ms: 1.566
    num_steps_sampled: 300000
    num_steps_trained: 300000
    rl_0:
      cur_kl_coeff: 2.775557602921922e-18
      cur_lr: 4.999999873689376e-05
      entropy: 1.0968317985534668
      kl: 0.0025609321892261505
      policy_loss: -4.875139347859658e-05
      total_loss: 1131.92260

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.1/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 981 s, 65 iter, 325000 ts, 117 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-24-24
  done: false
  episode_len_mean: 97.2
  episode_reward_max: 302.0357784009515
  episode_reward_mean: 101.16649965087063
  episode_reward_min: -161.8224315901213
  episodes_this_iter: 51
  episodes_total: 2720
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2826.401
    load_time_ms: 1.836
    num_steps_sampled: 330000
    num_steps_trained: 330000
    rl_0:
      cur_kl_coeff: 4.336808754565503e-20
      cur_lr: 4.999999873689376e-05
      entropy: 1.082587718963623
      kl: 0.005243158433586359
      policy_loss: -0.0014533903449773788
      total_loss: 1219.61474609375

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.0/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 1069 s, 71 iter, 355000 ts, 108 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-25-55
  done: false
  episode_len_mean: 98.89
  episode_reward_max: 303.3232482266013
  episode_reward_mean: 115.31380429429258
  episode_reward_min: -164.31492301867723
  episodes_this_iter: 50
  episodes_total: 3030
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2878.34
    load_time_ms: 1.746
    num_steps_sampled: 360000
    num_steps_trained: 360000
    rl_0:
      cur_kl_coeff: 6.776263679008599e-22
      cur_lr: 4.999999873689376e-05
      entropy: 1.0799401998519897
      kl: 0.0030241182539612055
      policy_loss: -0.001200423575937748
      total_loss: 905.7265625
 

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.1/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 1163 s, 77 iter, 385000 ts, 138 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-27-27
  done: false
  episode_len_mean: 97.47
  episode_reward_max: 304.02701093942375
  episode_reward_mean: 105.22666531560844
  episode_reward_min: -158.69215435770806
  episodes_this_iter: 52
  episodes_total: 3336
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 2889.701
    load_time_ms: 1.568
    num_steps_sampled: 390000
    num_steps_trained: 390000
    rl_0:
      cur_kl_coeff: 1.0587911998450935e-23
      cur_lr: 4.999999873689376e-05
      entropy: 1.0511034727096558
      kl: 0.004600574728101492
      policy_loss: -0.00129034579731524
      total_loss: 1103.6715087

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.2/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 1277 s, 83 iter, 415000 ts, 131 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-29-58
  done: false
  episode_len_mean: 102.6
  episode_reward_max: 305.56625206949485
  episode_reward_mean: 151.3590980500517
  episode_reward_min: -163.60637205697668
  episodes_this_iter: 49
  episodes_total: 3639
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 4556.731
    load_time_ms: 2.404
    num_steps_sampled: 420000
    num_steps_trained: 420000
    rl_0:
      cur_kl_coeff: 1.6543612497579586e-25
      cur_lr: 4.999999873689376e-05
      entropy: 0.9658523797988892
      kl: 0.0036912269424647093
      policy_loss: -0.0006641390500590205
      total_loss: 631.590637

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 3/3 CPUs, 0/1 GPUs
Memory usage on this node: 11.2/16.4 GB
Result logdir: /home/thorsten/ray_results/TenaciousD
RUNNING trials:
 - PPO_MultiTenaciousDEnv-v0_0:	RUNNING [pid=21459], 1657 s, 89 iter, 445000 ts, 124 rew

Result for PPO_MultiTenaciousDEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-30_21-36-38
  done: false
  episode_len_mean: 98.08
  episode_reward_max: 307.05109722831213
  episode_reward_mean: 143.51425488383606
  episode_reward_min: -161.47708715994654
  episodes_this_iter: 50
  episodes_total: 3950
  experiment_id: 29ef68f31c7149f1bf937c50e572ef8b
  hostname: Gandalf
  info:
    grad_time_ms: 11806.603
    load_time_ms: 4.241
    num_steps_sampled: 450000
    num_steps_trained: 450000
    rl_0:
      cur_kl_coeff: 2.5849394527468104e-27
      cur_lr: 4.999999873689376e-05
      entropy: 0.9472668170928955
      kl: 0.002647615037858486
      policy_loss: -0.0013408288359642029
      total_loss: 747.09381