In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    def __init__(self):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=2, stride=2, padding=0),  
            nn.ELU(),
            nn.Conv2d(32, 32, kernel_size=2, stride=2, padding=0), 
            nn.ELU(),
            nn.Conv2d(32, 64, kernel_size=2, stride=2, padding=0), 
            nn.ELU(),
            nn.Conv2d(64, 128, kernel_size=2, stride=2, padding=0),
            nn.ELU(), 
            nn.Conv2d(128, 256, kernel_size=2, stride=2, padding=0),
            nn.ELU(),
            nn.Conv2d(256, 512, kernel_size=2, stride=2, padding=0),
            nn.ELU(),
            nn.Flatten(),
        )

    def forward(self, x):
        return self.cnn(x)

In [3]:
class PovTargetFusion(nn.Module):
    def __init__(self):
        super(PovTargetFusion, self).__init__()
        
        #self.visual_encoder = VisualEncoder()
        self.visual_dim = 512
        
        self.conv_1 = nn.Conv3d(7, 7, 1)
        self.linear_1 = nn.Linear(self.visual_dim, 7)
        #self.act_1 = nn.ELU()
        
        self.conv_2 = nn.Conv3d(7, 16, 3)
        self.linear_2 = nn.Linear(self.visual_dim, 16)
        self.act_2 = nn.ELU()

        self.conv_3 = nn.Conv3d(16, 32, 3)
        self.linear_3 = nn.Linear(self.visual_dim, 32)
        self.act_3 = nn.ELU()
        
        self.conv_4 = nn.Conv3d(32, 64, 3)
        self.linear_4 = nn.Linear(self.visual_dim, 64)
        self.act_4 = nn.ELU()
        
        self.conv_5 = nn.Conv3d(64, 128, 3)
        self.act_5 = nn.ELU()
        
        self.linear_6 = nn.Linear(128 * 9, 512)
        self.act_6 = nn.ELU()
        
    def forward(self, target, vis_features):
        Q = self.linear_1(vis_features).reshape(-1, 1, 7).permute(0, 2, 1)
        V = self.conv_1(target).reshape(-1, 9 * 11 * 11, 7)
        result = torch.sigmoid(torch.bmm(V, Q)) * V
        result = result.reshape(-1, 9, 11, 11, 7).permute(0, 4, 1, 2, 3)
        
        Q = self.linear_2(vis_features).reshape(-1, 1, 16).permute(0, 2, 1)
        V = self.conv_2(result).reshape(-1, 7 * 9 * 9, 16)
        result = torch.sigmoid(torch.bmm(V, Q)) * self.act_2(V)
        result = result.reshape(-1, 7, 9, 9, 16).permute(0, 4, 1, 2, 3)
        
        Q = self.linear_3(vis_features).reshape(-1, 1, 32).permute(0, 2, 1)
        V = self.conv_3(result).reshape(-1, 5 * 7 * 7, 32)
        result = torch.sigmoid(torch.bmm(V, Q)) * self.act_3(V)
        result = result.reshape(-1, 5, 7, 7, 32).permute(0, 4, 1, 2, 3)
        
        Q = self.linear_4(vis_features).reshape(-1, 1, 64).permute(0, 2, 1)
        V = self.conv_4(result).reshape(-1, 3 * 5 * 5, 64)
        result = torch.sigmoid(torch.bmm(V, Q)) * self.act_4(V)
        result = result.reshape(-1, 3, 5, 5, 64).permute(0, 4, 1, 2, 3)
        
        result = self.act_5(self.conv_5(result))
        result = torch.flatten(result, start_dim=1)
        result = self.act_6(self.linear_6(result))
        
        return result

In [4]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        target_features_dim = 9 * 11 * 11 
        self.visual_encoder = VisualEncoder()
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.target_encoder = PovTargetFusion()
        policy_hidden_dim = 256 
        self.policy_network = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        self.last_value = None
        
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()
        
    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        obs = input_dict['obs']
        pov = obs['pov'].permute(0, 3, 1, 2).float() / 255.0
        target = one_hot(obs['target_grid'].long(), num_classes=7).permute(0, 4, 1, 2, 3).float()
        if self.use_cuda:
            pov.cuda()
            target.cuda()
            
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)
            
        target_features = self.target_encoder(target, visual_features)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state
    
    @override(TorchModelV2)
    def value_function(self):
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [5]:
visual_features_dim = 512
target_features_dim = 9 * 11 * 11
policy_hidden_dim = 256 

policy_network = nn.Sequential(
    nn.Linear(visual_features_dim + target_features_dim, 1024),
    nn.ELU(),
    nn.Linear(1024, 512),
    nn.ELU(),
    nn.Linear(512, policy_hidden_dim),
    nn.ELU(),
    nn.Linear(policy_hidden_dim, policy_hidden_dim),
    nn.ELU(),
    #nn.Linear(policy_hidden_dim, policy_hidden_dim),
    #nn.ELU(),
)

sum(p.numel() for p in policy_network.parameters())

2362368

In [6]:
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [7]:
class VisualObservationWrapper(ObsWrapper):
    def __init__(self, env, include_target=False):
        super().__init__(env)
        self.observation_space = {   
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid
        return {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
            'target_grid': target_grid
        }

In [8]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

tasks = []
for i in range(1,156):
    if ('C'+str(i)) == 'C38': continue
    tasks.append('C'+str(i))
    
class RewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def reward(self, rew):
        if rew == 0:
            rew = -0.01
        if abs(rew) == 1:
            rew /= 10
            
        return rew
    
def env_creator(env_config):
    env = gym.make('IGLUSilentBuilder-v0', max_steps=250)
    env.update_taskset(TaskSet(preset=tasks))
    #env = PovOnlyWrapper(env)
    env = VisualObservationWrapper(env, include_target=True)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    env = RewardWrapper(env)
    return env

from ray.tune.registry import register_env
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

analysis = tune.run(PPOTrainer, 
         config={
             "env": "my_env", 
             "framework": "torch",
             "num_gpus": 1,
             "num_workers": 3,
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 5_000,
             "lr": 1e-4,
             #"gamma": 0.99,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO All Tasks pretrained (AngelaCNN+fusion) (3 noops after placement) r: -0.01 div10"
                  }
              }

        },
        loggers=[WandbLogger],
        local_dir="/IGLU-Minecraft/checkpoints/all_tasks",
        keep_checkpoints_num=50,
        checkpoint_freq=5,
        checkpoint_at_end=True)



Trial name,status,loc
PPO_my_env_ef690_00000,PENDING,


2021-11-07 15:39:46,527	INFO wandb.py:170 -- Already logged into W&B.
2021-11-07 15:39:46,549	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=9113)[0m 2021-11-07 15:39:51,117	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=9113)[0m 2021-11-07 15:39:51,117	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=9113)[0m 2021-11-07 15:40:08,938	INFO trainable.py:109 -- Trainable.setup took 20.983 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-07_15-44-45
  done: false
  episode_len_mean: 103.87368421052632
  episode_media: {}
  episode_reward_max: 4.690000000000007
  episode_reward_mean: -0.7170526315789476
  episode_reward_min: -1.6000000000000008
  episodes_this_iter: 95
  episodes_total: 95
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.881463285185333
          entropy_coeff: 0.01
          kl: 0.005726082684665919
          policy_loss: -0.014017798388615632
          total_loss: 0.024265516103587598
          vf_explained_var: -0.32391759753227234
          vf_loss: 0.06595273009425777
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,1,276.673,9996,-0.717053,4.69,-1.6,103.874


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-07_15-48-14
  done: false
  episode_len_mean: 102.79
  episode_media: {}
  episode_reward_max: 4.860000000000011
  episode_reward_mean: -0.29689999999999983
  episode_reward_min: -1.540000000000001
  episodes_this_iter: 97
  episodes_total: 192
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8669099536716427
          entropy_coeff: 0.01
          kl: 0.008004153515828798
          policy_loss: -0.021347131272857516
          total_loss: 0.04276177956698797
          vf_explained_var: 0.13229481875896454
          vf_loss: 0.09117717912727091
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  iterations_since_restore: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,2,485.465,19992,-0.2969,4.86,-1.54,102.79


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-07_15-51-41
  done: false
  episode_len_mean: 104.21
  episode_media: {}
  episode_reward_max: 8.680000000000005
  episode_reward_mean: 0.23140000000000044
  episode_reward_min: -2.189999999999997
  episodes_this_iter: 96
  episodes_total: 288
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8375419239712576
          entropy_coeff: 0.01
          kl: 0.01280893030899631
          policy_loss: -0.025665039607347586
          total_loss: 0.15897202860468473
          vf_explained_var: 0.4611608386039734
          vf_loss: 0.21045070096659355
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  iterations_since_restore: 3
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,3,692.303,29988,0.2314,8.68,-2.19,104.21




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-07_15-55-40
  done: false
  episode_len_mean: 100.23
  episode_media: {}
  episode_reward_max: 6.690000000000009
  episode_reward_mean: 0.2714000000000005
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 99
  episodes_total: 387
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.814265903040894
          entropy_coeff: 0.01
          kl: 0.01251414466637949
          policy_loss: -0.024552390955261186
          total_loss: 0.15232630074581402
          vf_explained_var: 0.45500150322914124
          vf_loss: 0.20251852030603168
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  iterations_since_restore: 4
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,4,931.342,39984,0.2714,6.69,-1.91,100.23


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-07_15-59-21
  done: false
  episode_len_mean: 104.47
  episode_media: {}
  episode_reward_max: 4.920000000000011
  episode_reward_mean: 0.6159000000000014
  episode_reward_min: -1.870000000000001
  episodes_this_iter: 96
  episodes_total: 483
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7958645411026786
          entropy_coeff: 0.01
          kl: 0.012164358488454263
          policy_loss: -0.025776770726865173
          total_loss: 0.1933178161055996
          vf_explained_var: 0.5301831960678101
          vf_loss: 0.2446203603512711
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations_since_restore: 5
  no

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,5,1152.58,49980,0.6159,4.92,-1.87,104.47


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-07_16-02-58
  done: false
  episode_len_mean: 103.08
  episode_media: {}
  episode_reward_max: 6.700000000000003
  episode_reward_mean: 0.7659000000000021
  episode_reward_min: -1.920000000000001
  episodes_this_iter: 97
  episodes_total: 580
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7729798665413488
          entropy_coeff: 0.01
          kl: 0.013606591013315418
          policy_loss: -0.026233421842384544
          total_loss: 0.19056971003898443
          vf_explained_var: 0.5563409924507141
          vf_loss: 0.24181161136645027
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  iterations_since_restore: 6
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,6,1369.62,59976,0.7659,6.7,-1.92,103.08


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-07_16-06-40
  done: false
  episode_len_mean: 102.43
  episode_media: {}
  episode_reward_max: 6.760000000000012
  episode_reward_mean: 0.7351000000000014
  episode_reward_min: -2.179999999999997
  episodes_this_iter: 97
  episodes_total: 677
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7532448784917847
          entropy_coeff: 0.01
          kl: 0.014182066141346438
          policy_loss: -0.027021054394988933
          total_loss: 0.23320526017799464
          vf_explained_var: 0.5770830512046814
          vf_loss: 0.28492235109910496
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_since_restore: 7
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,7,1590.77,69972,0.7351,6.76,-2.18,102.43




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-07_16-10-44
  done: false
  episode_len_mean: 100.08910891089108
  episode_media: {}
  episode_reward_max: 8.620000000000015
  episode_reward_mean: 1.062574257425745
  episode_reward_min: -1.940000000000001
  episodes_this_iter: 101
  episodes_total: 778
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.735616473458771
          entropy_coeff: 0.01
          kl: 0.015148684692153212
          policy_loss: -0.02821433074836038
          total_loss: 0.22985127607249042
          vf_explained_var: 0.5617288947105408
          vf_loss: 0.2823920345586589
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,8,1835.05,79968,1.06257,8.62,-1.94,100.089


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-11-07_16-14-25
  done: false
  episode_len_mean: 102.84
  episode_media: {}
  episode_reward_max: 5.0000000000000115
  episode_reward_mean: 0.7126000000000018
  episode_reward_min: -2.12
  episodes_this_iter: 96
  episodes_total: 874
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.725383094029549
          entropy_coeff: 0.01
          kl: 0.014758684166334132
          policy_loss: -0.030482410080730916
          total_loss: 0.23640633389895033
          vf_explained_var: 0.526335597038269
          vf_loss: 0.29119083863706924
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  iterations_since_restore: 9
  node_ip: 10.55

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,9,2055.86,89964,0.7126,5,-2.12,102.84


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-11-07_16-18-10
  done: false
  episode_len_mean: 99.76237623762377
  episode_media: {}
  episode_reward_max: 10.310000000000013
  episode_reward_mean: 1.1178217821782208
  episode_reward_min: -2.0300000000000002
  episodes_this_iter: 101
  episodes_total: 975
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7161481140006303
          entropy_coeff: 0.01
          kl: 0.015336657994412804
          policy_loss: -0.02860308372312122
          total_loss: 0.2253956856094619
          vf_explained_var: 0.5550938844680786
          vf_loss: 0.2780929193537459
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,10,2281.3,99960,1.11782,10.31,-2.03,99.7624




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-11-07_16-22-10
  done: false
  episode_len_mean: 101.96
  episode_media: {}
  episode_reward_max: 6.7900000000000125
  episode_reward_mean: 0.9363000000000028
  episode_reward_min: -2.0599999999999996
  episodes_this_iter: 98
  episodes_total: 1073
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7050572075395505
          entropy_coeff: 0.01
          kl: 0.01627367809150793
          policy_loss: -0.03365043133783799
          total_loss: 0.22872364655152982
          vf_explained_var: 0.566852867603302
          vf_loss: 0.28616991638182065
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,11,2521.36,109956,0.9363,6.79,-2.06,101.96


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-11-07_16-26-00
  done: false
  episode_len_mean: 101.78
  episode_media: {}
  episode_reward_max: 8.610000000000014
  episode_reward_mean: 1.2251000000000032
  episode_reward_min: -2.179999999999997
  episodes_this_iter: 99
  episodes_total: 1172
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.685304493170518
          entropy_coeff: 0.01
          kl: 0.01812668949846224
          policy_loss: -0.031576821488192956
          total_loss: 0.2543524573278478
          vf_explained_var: 0.5433704257011414
          vf_loss: 0.3091569850937678
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterations_since_restore: 12

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,12,2750.52,119952,1.2251,8.61,-2.18,101.78


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-11-07_16-29-41
  done: false
  episode_len_mean: 102.51
  episode_media: {}
  episode_reward_max: 8.620000000000012
  episode_reward_mean: 1.1395000000000028
  episode_reward_min: -2.22
  episodes_this_iter: 97
  episodes_total: 1269
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.675288236854423
          entropy_coeff: 0.01
          kl: 0.019158277029229495
          policy_loss: -0.03342935424775649
          total_loss: 0.24424498225602076
          vf_explained_var: 0.5621176362037659
          vf_loss: 0.30059556403221227
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948
  iterations_since_restore: 13
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,13,2972.16,129948,1.1395,8.62,-2.22,102.51


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-11-07_16-33-27
  done: false
  episode_len_mean: 101.26
  episode_media: {}
  episode_reward_max: 8.360000000000012
  episode_reward_mean: 1.2552000000000034
  episode_reward_min: -2.2099999999999986
  episodes_this_iter: 99
  episodes_total: 1368
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.6580861760000896
          entropy_coeff: 0.01
          kl: 0.02210920791495533
          policy_loss: -0.029866728506600247
          total_loss: 0.2913779709042392
          vf_explained_var: 0.5548118352890015
          vf_loss: 0.34340371924460444
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,14,3197.78,139944,1.2552,8.36,-2.21,101.26




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-11-07_16-37-39
  done: false
  episode_len_mean: 99.61
  episode_media: {}
  episode_reward_max: 6.520000000000014
  episode_reward_mean: 1.4560000000000044
  episode_reward_min: -2.0199999999999996
  episodes_this_iter: 100
  episodes_total: 1468
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6538626375361387
          entropy_coeff: 0.01
          kl: 0.018803956699661593
          policy_loss: -0.036514532742783046
          total_loss: 0.22197361307807711
          vf_explained_var: 0.6335610151290894
          vf_loss: 0.2793855861784556
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
  iterations_since_restore: 15
  node_ip: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,15,3450.03,149940,1.456,6.52,-2.02,99.61


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-11-07_16-41-30
  done: false
  episode_len_mean: 102.2
  episode_media: {}
  episode_reward_max: 10.490000000000014
  episode_reward_mean: 1.6574000000000049
  episode_reward_min: -2.279999999999999
  episodes_this_iter: 98
  episodes_total: 1566
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6275053209728663
          entropy_coeff: 0.01
          kl: 0.021212963988494506
          policy_loss: -0.03393929168327242
          total_loss: 0.27409146750648306
          vf_explained_var: 0.6249567270278931
          vf_loss: 0.3279419239005472
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  iterations_since_restore: 16
  node_ip: 10

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,16,3680.77,159936,1.6574,10.49,-2.28,102.2


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-11-07_16-45-21
  done: false
  episode_len_mean: 103.3
  episode_media: {}
  episode_reward_max: 10.130000000000017
  episode_reward_mean: 1.7559000000000051
  episode_reward_min: -2.1400000000000006
  episodes_this_iter: 96
  episodes_total: 1662
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.623352229900849
          entropy_coeff: 0.01
          kl: 0.019678593991341163
          policy_loss: -0.035520751268053666
          total_loss: 0.24357310466659374
          vf_explained_var: 0.586357831954956
          vf_loss: 0.29647201208604707
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,17,3911.26,169932,1.7559,10.13,-2.14,103.3


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-11-07_16-49-05
  done: false
  episode_len_mean: 101.27
  episode_media: {}
  episode_reward_max: 6.500000000000011
  episode_reward_mean: 1.3445000000000054
  episode_reward_min: -2.04
  episodes_this_iter: 99
  episodes_total: 1761
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.6283575642822137
          entropy_coeff: 0.01
          kl: 0.020804582247598857
          policy_loss: -0.03765451446430296
          total_loss: 0.21547712894976456
          vf_explained_var: 0.6893129944801331
          vf_loss: 0.27005315610868297
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928
  iterations_since_restore: 18
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,18,4135.76,179928,1.3445,6.5,-2.04,101.27




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-11-07_16-53-10
  done: false
  episode_len_mean: 99.17
  episode_media: {}
  episode_reward_max: 6.410000000000014
  episode_reward_mean: 1.3250000000000042
  episode_reward_min: -1.9100000000000008
  episodes_this_iter: 100
  episodes_total: 1861
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.6159864511245337
          entropy_coeff: 0.01
          kl: 0.01864839982678148
          policy_loss: -0.03840073677432588
          total_loss: 0.2902961689564917
          vf_explained_var: 0.6033046841621399
          vf_loss: 0.3422691006499987
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924
  iterations_since_restore: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,19,4380.77,189924,1.325,6.41,-1.91,99.17


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-11-07_16-56-53
  done: false
  episode_len_mean: 102.51
  episode_media: {}
  episode_reward_max: 8.010000000000014
  episode_reward_mean: 1.8736000000000055
  episode_reward_min: -2.1799999999999975
  episodes_this_iter: 98
  episodes_total: 1959
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.599187805509975
          entropy_coeff: 0.01
          kl: 0.01914685640908187
          policy_loss: -0.03634006816766456
          total_loss: 0.28411460114467857
          vf_explained_var: 0.6516804099082947
          vf_loss: 0.3335224202077868
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920
  iterations_since_restore: 20

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,20,4603.57,199920,1.8736,8.01,-2.18,102.51


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-11-07_17-00-41
  done: false
  episode_len_mean: 101.88
  episode_media: {}
  episode_reward_max: 8.040000000000015
  episode_reward_mean: 1.5591000000000046
  episode_reward_min: -2.1100000000000008
  episodes_this_iter: 99
  episodes_total: 2058
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.5891009259427715
          entropy_coeff: 0.01
          kl: 0.018690139271363874
          policy_loss: -0.04415796919383554
          total_loss: 0.16790224770800424
          vf_explained_var: 0.7462381720542908
          vf_loss: 0.22533538005290887
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,21,4831.21,209916,1.5591,8.04,-2.11,101.88




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 219912
  custom_metrics: {}
  date: 2021-11-07_17-04-46
  done: false
  episode_len_mean: 98.85
  episode_media: {}
  episode_reward_max: 8.340000000000016
  episode_reward_mean: 1.5392000000000043
  episode_reward_min: -2.1
  episodes_this_iter: 100
  episodes_total: 2158
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.601542722873199
          entropy_coeff: 0.01
          kl: 0.019367127363236075
          policy_loss: -0.04049163330664747
          total_loss: 0.22793413317308595
          vf_explained_var: 0.6623491644859314
          vf_loss: 0.2813683819375996
    num_agent_steps_sampled: 219912
    num_agent_steps_trained: 219912
    num_steps_sampled: 219912
    num_steps_trained: 219912
  iterations_since_restore: 22
  node_ip: 10

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,22,5076.06,219912,1.5392,8.34,-2.1,98.85


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 229908
  custom_metrics: {}
  date: 2021-11-07_17-08-31
  done: false
  episode_len_mean: 100.99
  episode_media: {}
  episode_reward_max: 10.580000000000014
  episode_reward_mean: 1.9033000000000058
  episode_reward_min: -1.750000000000001
  episodes_this_iter: 100
  episodes_total: 2258
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.5885871310519355
          entropy_coeff: 0.01
          kl: 0.021262941129300942
          policy_loss: -0.03870657231722377
          total_loss: 0.2352805390030655
          vf_explained_var: 0.7006276845932007
          vf_loss: 0.28552049693898257
    num_agent_steps_sampled: 229908
    num_agent_steps_trained: 229908
    num_steps_sampled: 229908
    num_steps_trained: 229908
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,23,5301.55,229908,1.9033,10.58,-1.75,100.99


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 239904
  custom_metrics: {}
  date: 2021-11-07_17-12-15
  done: false
  episode_len_mean: 102.36
  episode_media: {}
  episode_reward_max: 10.03000000000002
  episode_reward_mean: 1.6201000000000059
  episode_reward_min: -2.1599999999999957
  episodes_this_iter: 98
  episodes_total: 2356
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5825857755465385
          entropy_coeff: 0.01
          kl: 0.018385470489326015
          policy_loss: -0.04014591830822392
          total_loss: 0.1955351237924053
          vf_explained_var: 0.7560774087905884
          vf_loss: 0.24289160979768404
    num_agent_steps_sampled: 239904
    num_agent_steps_trained: 239904
    num_steps_sampled: 239904
    num_steps_trained: 239904
  iterations_since_restore: 24
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,24,5524.94,239904,1.6201,10.03,-2.16,102.36


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 249900
  custom_metrics: {}
  date: 2021-11-07_17-15-52
  done: false
  episode_len_mean: 103.93
  episode_media: {}
  episode_reward_max: 7.980000000000015
  episode_reward_mean: 1.6977000000000055
  episode_reward_min: -2.000000000000001
  episodes_this_iter: 96
  episodes_total: 2452
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5827013498697524
          entropy_coeff: 0.01
          kl: 0.016713561168718756
          policy_loss: -0.04357304242240567
          total_loss: 0.19037485939936125
          vf_explained_var: 0.7218834757804871
          vf_loss: 0.24285243227759487
    num_agent_steps_sampled: 249900
    num_agent_steps_trained: 249900
    num_steps_sampled: 249900
    num_steps_trained: 249900
  iterations_since_restore: 25
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,25,5742.41,249900,1.6977,7.98,-2,103.93




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 259896
  custom_metrics: {}
  date: 2021-11-07_17-19-58
  done: false
  episode_len_mean: 101.8
  episode_media: {}
  episode_reward_max: 8.650000000000016
  episode_reward_mean: 1.716600000000005
  episode_reward_min: -1.800000000000001
  episodes_this_iter: 98
  episodes_total: 2550
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5849983394655407
          entropy_coeff: 0.01
          kl: 0.0182502747045965
          policy_loss: -0.03970588273408576
          total_loss: 0.1886894057608313
          vf_explained_var: 0.7178239822387695
          vf_loss: 0.23576686814363695
    num_agent_steps_sampled: 259896
    num_agent_steps_trained: 259896
    num_steps_sampled: 259896
    num_steps_trained: 259896
  iterations_since_restore: 26
  node_ip: 10.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,26,5987.54,259896,1.7166,8.65,-1.8,101.8


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 269892
  custom_metrics: {}
  date: 2021-11-07_17-23-51
  done: false
  episode_len_mean: 100.67
  episode_media: {}
  episode_reward_max: 12.15000000000002
  episode_reward_mean: 1.666000000000006
  episode_reward_min: -2.2699999999999974
  episodes_this_iter: 99
  episodes_total: 2649
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5730708503315594
          entropy_coeff: 0.01
          kl: 0.01772320087068398
          policy_loss: -0.04475393624514596
          total_loss: 0.17393132797482178
          vf_explained_var: 0.7575363516807556
          vf_loss: 0.22647123055325613
    num_agent_steps_sampled: 269892
    num_agent_steps_trained: 269892
    num_steps_sampled: 269892
    num_steps_trained: 269892
  iterations_since_restore: 27
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,27,6220.88,269892,1.666,12.15,-2.27,100.67


Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 279888
  custom_metrics: {}
  date: 2021-11-07_17-27-43
  done: false
  episode_len_mean: 101.74
  episode_media: {}
  episode_reward_max: 12.120000000000015
  episode_reward_mean: 1.9893000000000063
  episode_reward_min: -2.0200000000000005
  episodes_this_iter: 98
  episodes_total: 2747
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5676373936172223
          entropy_coeff: 0.01
          kl: 0.01945985881261614
          policy_loss: -0.0434726883442356
          total_loss: 0.20954847002449709
          vf_explained_var: 0.7567579746246338
          vf_loss: 0.25899442442270937
    num_agent_steps_sampled: 279888
    num_agent_steps_trained: 279888
    num_steps_sampled: 279888
    num_steps_trained: 279888
  iterations_since_restore: 28
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,28,6453.43,279888,1.9893,12.12,-2.02,101.74




Result for PPO_my_env_ef690_00000:
  agent_timesteps_total: 289884
  custom_metrics: {}
  date: 2021-11-07_17-31-49
  done: false
  episode_len_mean: 99.94
  episode_media: {}
  episode_reward_max: 8.360000000000017
  episode_reward_mean: 1.7643000000000049
  episode_reward_min: -2.0699999999999994
  episodes_this_iter: 100
  episodes_total: 2847
  experiment_id: 5986ad5d0d3246ff80964fd8aadc2f01
  hostname: cds2
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.574681407162267
          entropy_coeff: 0.01
          kl: 0.0186158744570433
          policy_loss: -0.04669317620464115
          total_loss: 0.16800117368738238
          vf_explained_var: 0.780113697052002
          vf_loss: 0.22159259046435867
    num_agent_steps_sampled: 289884
    num_agent_steps_trained: 289884
    num_steps_sampled: 289884
    num_steps_trained: 289884
  iterations_since_restore: 29
  node_ip: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef690_00000,RUNNING,10.55.229.87:9113,29,6699.27,289884,1.7643,8.36,-2.07,99.94


