In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder producing one flat feature vector per sample.

    The network is six ``Conv2d`` layers (kernel 2, stride 2, no padding),
    each followed by ``ELU``, with channel widths
    3 -> 32 -> 32 -> 64 -> 128 -> 256 -> 512, and a final ``Flatten``.
    Every convolution halves the spatial size, so a 64x64 input ends up as
    512 features.
    """

    def __init__(self):
        super().__init__()

        # Channel count at each stage boundary; consecutive pairs give the
        # (in_channels, out_channels) of one convolution.
        widths = (3, 32, 32, 64, 128, 256, 512)

        stages = []
        for in_channels, out_channels in zip(widths, widths[1:]):
            stages.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2, padding=0)
            )
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # Convolutions sit at the even indices 0, 2, ..., 10 of the Sequential,
        # which fixes the state-dict keys as ``cnn.<index>.weight`` / ``.bias``.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` (channels-first image batch) through the stack and return the flattened features."""
        features = self.cnn(x)
        return features

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model with a pretrained image encoder and a target-grid encoder.

    The ``pov`` image goes through ``VisualEncoder`` (run under ``no_grad``,
    so it receives no gradients). The ``target_grid`` is one-hot encoded into
    7 classes and reduced to a single channel by a 1x1x1 ``Conv3d``. Both
    feature vectors are concatenated and fed to an MLP trunk that is shared by
    the action-logit head and the value head.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the sub-networks and load the pretrained encoder weights.

        Args:
            obs_space, action_space, num_outputs, model_config, name:
                forwarded unchanged to ``TorchModelV2.__init__``.
                ``action_space.n`` sets the width of the action head.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        # One value per cell of the 9 x 11 x 11 grid after the Conv3d collapses
        # the 7 one-hot channels into 1.
        target_features_dim = 9 * 11 * 11
        self.visual_encoder = VisualEncoder()
        # Weights are always deserialised onto the CPU first; the move to the
        # GPU (if any) happens below.
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate cached by forward() for value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            for module in (
                self.visual_encoder,
                self.target_encoder,
                self.policy_network,
                self.action_head,
                self.value_head,
            ):
                module.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Args:
            input_dict: must contain ``'obs'`` with entries ``'pov'``
                (channels-last image batch; permuted to channels-first here)
                and ``'target_grid'`` (integer-valued block ids in ``0..6``).
            state: passed through untouched.
            seq_lens: unused.

        Returns:
            ``(logits, state)`` where ``logits`` comes from ``action_head``.
        """
        obs = input_dict['obs']
        # Follow the model's own weights instead of a global CUDA flag so the
        # inputs always land on the same device as the parameters.
        device = self.action_head.weight.device
        # Tensor.to()/.cuda() return the moved tensor; they do not move it in
        # place, so the result has to be re-bound.
        pov = (obs['pov'].permute(0, 3, 1, 2).float() / 255.0).to(device)
        target = one_hot(obs['target_grid'].long(), num_classes=7)
        # Move the one-hot class axis in front of the three grid axes for Conv3d.
        target = target.permute(0, 4, 1, 2, 3).float().to(device)

        # The pretrained encoder is used as a fixed feature extractor.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the most recent ``forward()`` call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Register the model under the name that the trainer config refers to via
# its "custom_model" entry.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
class VisualObservationWrapper(ObsWrapper):
    """Expose ``pov``, ``inventory`` and ``compass`` (and optionally ``target_grid``) as a Dict observation.

    The keys returned by ``observation()`` always match the keys declared in
    ``observation_space``: ``'target_grid'`` is present only when the wrapper
    was created with ``include_target=True``.
    """

    def __init__(self, env, include_target=False):
        """
        Args:
            env: environment to wrap.
            include_target: when true, add a ``'target_grid'`` entry of shape
                ``(9, 11, 11)`` to the observation space and to every
                observation.
        """
        super().__init__(env)
        # Remembered so observation() emits exactly the keys declared below.
        self.include_target = include_target
        self.observation_space = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict declared by ``observation_space``.

        The target grid is taken from ``info['target_grid']`` (and removed from
        ``info``) when available; otherwise it is read from the current task of
        the unwrapped environment. If ``info`` is given but lacks the key, the
        situation is logged and, when the unwrapped env offers
        ``should_reset``, a reset is requested.

        Args:
            obs: raw observation with ``'pov'``, ``'inventory'`` and
                ``'compass'`` (a mapping with an ``'angle'`` entry).
            reward, done: accepted but unused.
            info: optional step info dict.

        Returns:
            Dict with ``'pov'`` (float32), ``'inventory'``, ``'compass'``
            (1-element array), plus ``'target_grid'`` when ``include_target``
            was set.
        """
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid

        observation = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Only emit the key when the observation space declares it; otherwise
        # the returned dict would not be a member of observation_space.
        if self.include_target:
            observation['target_grid'] = target_grid
        return observation

In [6]:
import os
# Enumerate CUDA devices in PCI bus order so the index used below refers to a
# fixed physical GPU.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Expose only GPU 0 to this process.
# NOTE(review): these variables are read when CUDA is initialised; torch is
# already imported in the first cell, so confirm nothing touches CUDA before
# this point.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

class RewardWrapper(gym.RewardWrapper):
    """Reward shaping: small penalty for zero reward, unit rewards scaled down by 10."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, rew):
        """Return the shaped reward.

        * ``0``      -> ``-0.01``
        * ``+1``/``-1`` -> ``+0.1``/``-0.1``
        * anything else is returned unchanged.
        """
        # A zero reward becomes a fixed penalty; it can never hit the
        # unit-magnitude case below, so return straight away.
        if rew == 0:
            return -0.01
        if abs(rew) != 1:
            return rew
        rew /= 10
        return rew
    
def env_creator(env_config):
    """Build the wrapped IGLU environment that is registered with RLlib.

    The environment is wrapped, innermost first, by
    ``VisualObservationWrapper`` (with the target grid included),
    ``SelectAndPlace``, ``Discretization`` over the ``'human-level'`` flat
    action space, and ``RewardWrapper``.

    Args:
        env_config: mapping supplied by RLlib (the trainer's ``"env_config"``).
            Recognised optional keys, each defaulting to the value that used
            to be hard-coded:

            * ``'max_steps'`` -- step limit passed to ``gym.make``
              (default ``250``).
            * ``'tasks'`` -- task preset names for ``TaskSet``
              (default ``['C3', 'C17', 'C32', 'C8']``).

            ``None`` or an empty mapping selects all defaults.

    Returns:
        The fully wrapped environment.
    """
    env_config = env_config or {}
    max_steps = env_config.get('max_steps', 250)
    task_presets = list(env_config.get('tasks', ['C3', 'C17', 'C32', 'C8']))

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=task_presets))
    env = VisualObservationWrapper(env, include_target=True)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    env = RewardWrapper(env)
    return env

from ray.tune.registry import register_env
# Make env_creator available under the id that the trainer config selects
# with "env": "my_env".
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Continue PPO training on "my_env" with the custom model, starting from the
# checkpoint given in `restore`. Results and new checkpoints go under
# `local_dir`.
# NOTE(review): no `stop` criterion is passed, so this presumably runs until
# it is interrupted -- confirm that is intended.
analysis = tune.run(PPOTrainer, 
         config={
             # Environment id registered with register_env.
             "env": "my_env", 
             "framework": "torch",
             "num_gpus": 1,
             "num_workers": 3,
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 1000,
             #"gamma": 0.99,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             # Settings for the W&B logger passed via `loggers` below:
             # target project and the display name of this run.
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO MultiTask (C3, C17, C32, C8) pretrained (AngelaCNN) (3 noops after placement) r: -0.01 div10"
                  }
              }

        },
        loggers=[WandbLogger],
        local_dir="/IGLU-Minecraft/checkpoints/4_tasks",
        # Checkpoint every 5 training iterations and once more at the end,
        # keeping at most 50 checkpoints.
        keep_checkpoints_num=50,
        checkpoint_freq=5,
        checkpoint_at_end=True,
        # Checkpoint of an earlier run (iteration 50) to resume from.
        restore="/IGLU-Minecraft/checkpoints/4_tasks/PPO_2021-11-08_20-28-45/PPO_my_env_78cf0_00000_0_2021-11-08_20-28-45/checkpoint_000050/checkpoint-50")

2021-11-08 22:06:08,725	INFO trainable.py:76 -- Checkpoint size is 31502439 bytes
2021-11-08 22:06:08,755	INFO wandb.py:170 -- Already logged into W&B.
2021-11-08 22:06:08,790	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_1379e_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=62698)[0m 2021-11-08 22:06:13,782	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=62698)[0m 2021-11-08 22:06:13,782	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=62698)[0m 2021-11-08 22:06:22,360	INFO trainable.py:109 -- Trainable.setup took 12.264 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Trial name,status,loc
PPO_my_env_1379e_00000,RUNNING,


[2m[36m(pid=62698)[0m 2021-11-08 22:06:22,467	INFO trainable.py:383 -- Restored on 192.168.1.96 from checkpoint: /IGLU-Minecraft/checkpoints/4_tasks/PPO_2021-11-08_22-06-08/PPO_my_env_1379e_00000_0_2021-11-08_22-06-08/tmpnemumt2yrestore_from_object/checkpoint-50
[2m[36m(pid=62698)[0m 2021-11-08 22:06:22,467	INFO trainable.py:390 -- Current state after restoring: {'_iteration': 50, '_timesteps_total': None, '_time_total': 5104.906400442123, '_episodes_total': 637}


Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 51998
  custom_metrics: {}
  date: 2021-11-08_22-10-28
  done: false
  episode_len_mean: 65.62068965517241
  episode_media: {}
  episode_reward_max: 9.93
  episode_reward_mean: 5.784482758620698
  episode_reward_min: 1.9200000000000113
  episodes_this_iter: 29
  episodes_total: 666
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.0565471910295035
          entropy_coeff: 0.009999999999999998
          kl: 0.014241318434432656
          policy_loss: -0.012536534879888808
          total_loss: 0.6745009437203408
          vf_explained_var: 0.8480382561683655
          vf_loss: 0.7047546832334428
    num_agent_steps_sampled: 51998
    num_agent_steps_trained: 51998
    num_steps_sampled: 51998
    num_steps_trained: 51998
  iterations_since_restore: 1
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,51,5350.78,51998,5.78448,9.93,1.92,65.6207




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 53996
  custom_metrics: {}
  date: 2021-11-08_22-13-53
  done: false
  episode_len_mean: 67.55172413793103
  episode_media: {}
  episode_reward_max: 10.370000000000013
  episode_reward_mean: 5.695344827586217
  episode_reward_min: -4.85722573273506e-16
  episodes_this_iter: 29
  episodes_total: 695
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.054632428714207
          entropy_coeff: 0.009999999999999998
          kl: 0.01787651900069836
          policy_loss: -0.05959009737485931
          total_loss: 0.5696755209494204
          vf_explained_var: 0.8116239905357361
          vf_loss: 0.6462366400730042
    num_agent_steps_sampled: 53996
    num_agent_steps_trained: 53996
    num_steps_sampled: 53996
    num_steps_trained: 53996
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,52,5556.28,53996,5.69534,10.37,-4.85723e-16,67.5517




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 55994
  custom_metrics: {}
  date: 2021-11-08_22-18-43
  done: false
  episode_len_mean: 60.618556701030926
  episode_media: {}
  episode_reward_max: 10.370000000000013
  episode_reward_mean: 5.9547422680412465
  episode_reward_min: -4.85722573273506e-16
  episodes_this_iter: 39
  episodes_total: 734
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.0246361442974634
          entropy_coeff: 0.009999999999999998
          kl: 0.016678847650928304
          policy_loss: -0.010106236221534864
          total_loss: 0.561548975890591
          vf_explained_var: 0.8909662961959839
          vf_loss: 0.5885658003035046
    num_agent_steps_sampled: 55994
    num_agent_steps_trained: 55994
    num_steps_sampled: 55994
    num_steps_trained: 55994
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,53,5845.86,55994,5.95474,10.37,-4.85723e-16,60.6186




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 57992
  custom_metrics: {}
  date: 2021-11-08_22-23-26
  done: false
  episode_len_mean: 57.03
  episode_media: {}
  episode_reward_max: 10.430000000000016
  episode_reward_mean: 6.061600000000007
  episode_reward_min: -4.85722573273506e-16
  episodes_this_iter: 37
  episodes_total: 771
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.0439789698237463
          entropy_coeff: 0.009999999999999998
          kl: 0.013555709987890128
          policy_loss: -0.01903611549309322
          total_loss: 0.5942185406173979
          vf_explained_var: 0.8720524311065674
          vf_loss: 0.6309833000813212
    num_agent_steps_sampled: 57992
    num_agent_steps_trained: 57992
    num_steps_sampled: 57992
    num_steps_trained: 57992
  iterations_since_restore: 4

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,54,6128.8,57992,6.0616,10.43,-4.85723e-16,57.03




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 59990
  custom_metrics: {}
  date: 2021-11-08_22-27-48
  done: false
  episode_len_mean: 53.93
  episode_media: {}
  episode_reward_max: 10.430000000000016
  episode_reward_mean: 6.204500000000007
  episode_reward_min: 0.259999999999999
  episodes_this_iter: 36
  episodes_total: 807
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.045295785722278
          entropy_coeff: 0.009999999999999998
          kl: 0.011321829703586458
          policy_loss: 0.02418609272156443
          total_loss: 0.547794891645511
          vf_explained_var: 0.8846254348754883
          vf_loss: 0.5417973879547346
    num_agent_steps_sampled: 59990
    num_agent_steps_trained: 59990
    num_steps_sampled: 59990
    num_steps_trained: 59990
  iterations_since_restore: 5
  node

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,55,6390.77,59990,6.2045,10.43,0.26,53.93




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 61988
  custom_metrics: {}
  date: 2021-11-08_22-31-35
  done: false
  episode_len_mean: 56.33
  episode_media: {}
  episode_reward_max: 10.430000000000016
  episode_reward_mean: 6.124800000000007
  episode_reward_min: 0.2900000000000009
  episodes_this_iter: 34
  episodes_total: 841
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.051380259082431
          entropy_coeff: 0.009999999999999998
          kl: 0.014229876312188486
          policy_loss: -0.058776423494730674
          total_loss: 0.45497791082376526
          vf_explained_var: 0.8710749745368958
          vf_loss: 0.5314221621978851
    num_agent_steps_sampled: 61988
    num_agent_steps_trained: 61988
    num_steps_sampled: 61988
    num_steps_trained: 61988
  iterations_since_restore: 6
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,56,6617.33,61988,6.1248,10.43,0.29,56.33




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 63986
  custom_metrics: {}
  date: 2021-11-08_22-39-14
  done: false
  episode_len_mean: 43.32
  episode_media: {}
  episode_reward_max: 10.720000000000011
  episode_reward_mean: 6.475100000000005
  episode_reward_min: 0.9899999999999995
  episodes_this_iter: 60
  episodes_total: 901
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.694506301198687
          entropy_coeff: 0.009999999999999998
          kl: 0.013006005140032969
          policy_loss: -0.026389063238388015
          total_loss: 0.6102047068732125
          vf_explained_var: 0.8971973061561584
          vf_loss: 0.6509376311586017
    num_agent_steps_sampled: 63986
    num_agent_steps_trained: 63986
    num_steps_sampled: 63986
    num_steps_trained: 63986
  iterations_since_restore: 7
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,57,7077.06,63986,6.4751,10.72,0.99,43.32




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 65984
  custom_metrics: {}
  date: 2021-11-08_22-42-26
  done: false
  episode_len_mean: 43.63
  episode_media: {}
  episode_reward_max: 12.070000000000016
  episode_reward_mean: 6.520400000000003
  episode_reward_min: -2.0999999999999996
  episodes_this_iter: 32
  episodes_total: 933
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.9657836465608507
          entropy_coeff: 0.009999999999999998
          kl: 0.019692241517673157
          policy_loss: 0.020175657137518836
          total_loss: 0.9074083562408175
          vf_explained_var: 0.8329024314880371
          vf_loss: 0.9029520955823717
    num_agent_steps_sampled: 65984
    num_agent_steps_trained: 65984
    num_steps_sampled: 65984
    num_steps_trained: 65984
  iterations_since_restore: 8
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,58,7268.37,65984,6.5204,12.07,-2.1,43.63




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 67982
  custom_metrics: {}
  date: 2021-11-08_22-47-44
  done: false
  episode_len_mean: 47.91
  episode_media: {}
  episode_reward_max: 14.660000000000014
  episode_reward_mean: 6.430700000000004
  episode_reward_min: -2.0999999999999996
  episodes_this_iter: 38
  episodes_total: 971
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.9742300726118542
          entropy_coeff: 0.009999999999999998
          kl: 0.019366018994173506
          policy_loss: 0.017919089285922903
          total_loss: 0.5774223250647386
          vf_explained_var: 0.8858834505081177
          vf_loss: 0.5753723369467826
    num_agent_steps_sampled: 67982
    num_agent_steps_trained: 67982
    num_steps_sampled: 67982
    num_steps_trained: 67982
  iterations_since_restore: 9
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,59,7586.59,67982,6.4307,14.66,-2.1,47.91




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 69980
  custom_metrics: {}
  date: 2021-11-08_22-54-34
  done: false
  episode_len_mean: 50.32
  episode_media: {}
  episode_reward_max: 14.660000000000014
  episode_reward_mean: 6.630000000000007
  episode_reward_min: -2.0999999999999996
  episodes_this_iter: 45
  episodes_total: 1016
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.7242689575467791
          entropy_coeff: 0.009999999999999998
          kl: 0.013401096964467453
          policy_loss: -0.023200519515999725
          total_loss: 0.7075032752184641
          vf_explained_var: 0.8726544976234436
          vf_loss: 0.7452662675153642
    num_agent_steps_sampled: 69980
    num_agent_steps_trained: 69980
    num_steps_sampled: 69980
    num_steps_trained: 69980
  iterations_since_restore: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,60,7996.66,69980,6.63,14.66,-2.1,50.32




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 71978
  custom_metrics: {}
  date: 2021-11-08_22-58-42
  done: false
  episode_len_mean: 47.63
  episode_media: {}
  episode_reward_max: 16.529999999999966
  episode_reward_mean: 7.454400000000005
  episode_reward_min: 0.44999999999999885
  episodes_this_iter: 41
  episodes_total: 1057
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.7103699905531746
          entropy_coeff: 0.009999999999999998
          kl: 0.014930061264465565
          policy_loss: -0.08898957311397507
          total_loss: 0.8049651164384115
          vf_explained_var: 0.8457402586936951
          vf_loss: 0.9080723778122948
    num_agent_steps_sampled: 71978
    num_agent_steps_trained: 71978
    num_steps_sampled: 71978
    num_steps_trained: 71978
  iterations_since_restore: 11

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,61,8244.79,71978,7.4544,16.53,0.45,47.63




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 73976
  custom_metrics: {}
  date: 2021-11-08_23-04-18
  done: false
  episode_len_mean: 47.85
  episode_media: {}
  episode_reward_max: 16.689999999999973
  episode_reward_mean: 7.556900000000004
  episode_reward_min: 0.44999999999999885
  episodes_this_iter: 40
  episodes_total: 1097
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 1.7255965743746076
          entropy_coeff: 0.009999999999999998
          kl: 0.028060248551755262
          policy_loss: 0.0022073060451518922
          total_loss: 1.1145887005896795
          vf_explained_var: 0.8709076046943665
          vf_loss: 1.1240253061056138
    num_agent_steps_sampled: 73976
    num_agent_steps_trained: 73976
    num_steps_sampled: 73976
    num_steps_trained: 73976
  iterations_since_restore: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,62,8580,73976,7.5569,16.69,0.45,47.85




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 75974
  custom_metrics: {}
  date: 2021-11-08_23-11-01
  done: false
  episode_len_mean: 45.77
  episode_media: {}
  episode_reward_max: 16.689999999999973
  episode_reward_mean: 7.544900000000004
  episode_reward_min: 0.32999999999999896
  episodes_this_iter: 51
  episodes_total: 1148
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.5828127270653134
          entropy_coeff: 0.009999999999999998
          kl: 0.012638548451890738
          policy_loss: -0.013457118826253073
          total_loss: 0.9778148995978492
          vf_explained_var: 0.9065148234367371
          vf_loss: 1.003308578474181
    num_agent_steps_sampled: 75974
    num_agent_steps_trained: 75974
    num_steps_sampled: 75974
    num_steps_trained: 75974
  iterations_since_restore: 13

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,63,8983.37,75974,7.5449,16.69,0.33,45.77




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 77972
  custom_metrics: {}
  date: 2021-11-08_23-15-25
  done: false
  episode_len_mean: 42.3
  episode_media: {}
  episode_reward_max: 16.45999999999994
  episode_reward_mean: 7.924300000000003
  episode_reward_min: 0.32999999999999896
  episodes_this_iter: 43
  episodes_total: 1191
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.5897380516642616
          entropy_coeff: 0.009999999999999998
          kl: 0.011531099724723761
          policy_loss: -0.04896400904371625
          total_loss: 0.867828046636922
          vf_explained_var: 0.901735782623291
          vf_loss: 0.9292301036062695
    num_agent_steps_sampled: 77972
    num_agent_steps_trained: 77972
    num_steps_sampled: 77972
    num_steps_trained: 77972
  iterations_since_restore: 14
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,64,9247.19,77972,7.9243,16.46,0.33,42.3




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 79970
  custom_metrics: {}
  date: 2021-11-08_23-20-43
  done: false
  episode_len_mean: 46.58
  episode_media: {}
  episode_reward_max: 18.48999999999996
  episode_reward_mean: 7.916000000000002
  episode_reward_min: 0.3299999999999996
  episodes_this_iter: 44
  episodes_total: 1235
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.6630878255480812
          entropy_coeff: 0.009999999999999998
          kl: 0.017094586689314437
          policy_loss: 0.011365789068596704
          total_loss: 1.0502808430365154
          vf_explained_var: 0.8861271739006042
          vf_loss: 1.050417555655752
    num_agent_steps_sampled: 79970
    num_agent_steps_trained: 79970
    num_steps_sampled: 79970
    num_steps_trained: 79970
  iterations_since_restore: 15
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,65,9564.99,79970,7.916,18.49,0.33,46.58




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 81968
  custom_metrics: {}
  date: 2021-11-08_23-24-38
  done: false
  episode_len_mean: 47.53
  episode_media: {}
  episode_reward_max: 18.48999999999996
  episode_reward_mean: 7.695200000000003
  episode_reward_min: 0.3299999999999996
  episodes_this_iter: 37
  episodes_total: 1272
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.6312147350538344
          entropy_coeff: 0.009999999999999998
          kl: 0.012180574847359956
          policy_loss: -0.004455973305517719
          total_loss: 0.6670942361156146
          vf_explained_var: 0.9349812865257263
          vf_loss: 0.6842081883123943
    num_agent_steps_sampled: 81968
    num_agent_steps_trained: 81968
    num_steps_sampled: 81968
    num_steps_trained: 81968
  iterations_since_restore: 16


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,66,9800.08,81968,7.6952,18.49,0.33,47.53




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 83966
  custom_metrics: {}
  date: 2021-11-08_23-30-40
  done: false
  episode_len_mean: 44.29
  episode_media: {}
  episode_reward_max: 14.640000000000013
  episode_reward_mean: 7.693300000000006
  episode_reward_min: 0.389999999999999
  episodes_this_iter: 50
  episodes_total: 1322
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.5412240056764512
          entropy_coeff: 0.009999999999999998
          kl: 0.008054698144283706
          policy_loss: 0.0008719766689907936
          total_loss: 0.7544911801815033
          vf_explained_var: 0.9178794026374817
          vf_loss: 0.7666150382586888
    num_agent_steps_sampled: 83966
    num_agent_steps_trained: 83966
    num_steps_sampled: 83966
    num_steps_trained: 83966
  iterations_since_restore: 17


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,67,10161.6,83966,7.6933,14.64,0.39,44.29




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 85964
  custom_metrics: {}
  date: 2021-11-08_23-39-01
  done: false
  episode_len_mean: 35.97
  episode_media: {}
  episode_reward_max: 18.469999999999956
  episode_reward_mean: 7.857100000000002
  episode_reward_min: 1.9700000000000062
  episodes_this_iter: 61
  episodes_total: 1383
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.413124692440033
          entropy_coeff: 0.009999999999999998
          kl: 0.00946987597686142
          policy_loss: 0.008810640676390556
          total_loss: 0.8289991412843977
          vf_explained_var: 0.9176111221313477
          vf_loss: 0.8314787854750951
    num_agent_steps_sampled: 85964
    num_agent_steps_trained: 85964
    num_steps_sampled: 85964
    num_steps_trained: 85964
  iterations_since_restore: 18
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,68,10663,85964,7.8571,18.47,1.97,35.97




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 87962
  custom_metrics: {}
  date: 2021-11-08_23-46-03
  done: false
  episode_len_mean: 35.88
  episode_media: {}
  episode_reward_max: 18.659999999999958
  episode_reward_mean: 8.1109
  episode_reward_min: 0.2599999999999997
  episodes_this_iter: 51
  episodes_total: 1434
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.448130982830411
          entropy_coeff: 0.009999999999999998
          kl: 0.00971935887726083
          policy_loss: -0.057150852857601075
          total_loss: 0.7476289803073519
          vf_explained_var: 0.928865909576416
          vf_loss: 0.8163453346207028
    num_agent_steps_sampled: 87962
    num_agent_steps_trained: 87962
    num_steps_sampled: 87962
    num_steps_trained: 87962
  iterations_since_restore: 19
  node_ip: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,69,11085.2,87962,8.1109,18.66,0.26,35.88




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 89960
  custom_metrics: {}
  date: 2021-11-08_23-51-34
  done: false
  episode_len_mean: 40.85
  episode_media: {}
  episode_reward_max: 18.659999999999958
  episode_reward_mean: 8.757100000000001
  episode_reward_min: 0.2599999999999997
  episodes_this_iter: 42
  episodes_total: 1476
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.375038186141423
          entropy_coeff: 0.009999999999999998
          kl: 0.012222022308105875
          policy_loss: 0.00685416032515821
          total_loss: 0.9537604689598084
          vf_explained_var: 0.939556896686554
          vf_loss: 0.9569900788012005
    num_agent_steps_sampled: 89960
    num_agent_steps_trained: 89960
    num_steps_sampled: 89960
    num_steps_trained: 89960
  iterations_since_restore: 20
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,70,11415.5,89960,8.7571,18.66,0.26,40.85




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 91958
  custom_metrics: {}
  date: 2021-11-08_23-59-50
  done: false
  episode_len_mean: 39.26
  episode_media: {}
  episode_reward_max: 18.249999999999954
  episode_reward_mean: 8.678799999999999
  episode_reward_min: 1.8300000000000056
  episodes_this_iter: 59
  episodes_total: 1535
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.3124724899019513
          entropy_coeff: 0.009999999999999998
          kl: 0.01377370496879774
          policy_loss: -0.04128725032011668
          total_loss: 0.4171514405381112
          vf_explained_var: 0.9520395994186401
          vf_loss: 0.46743130428450447
    num_agent_steps_sampled: 91958
    num_agent_steps_trained: 91958
    num_steps_sampled: 91958
    num_steps_trained: 91958
  iterations_since_restore: 21


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,71,11911.7,91958,8.6788,18.25,1.83,39.26




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 93956
  custom_metrics: {}
  date: 2021-11-09_00-08-19
  done: false
  episode_len_mean: 32.55
  episode_media: {}
  episode_reward_max: 22.949999999999996
  episode_reward_mean: 8.546199999999999
  episode_reward_min: 2.4600000000000137
  episodes_this_iter: 64
  episodes_total: 1599
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2163931508858998
          entropy_coeff: 0.009999999999999998
          kl: 0.013859086698255384
          policy_loss: -0.023442515686509156
          total_loss: 0.7636838817880267
          vf_explained_var: 0.9433857798576355
          vf_loss: 0.7951325994162333
    num_agent_steps_sampled: 93956
    num_agent_steps_trained: 93956
    num_steps_sampled: 93956
    num_steps_trained: 93956
  iterations_since_restore: 22

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,72,12421.3,93956,8.5462,22.95,2.46,32.55




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 95954
  custom_metrics: {}
  date: 2021-11-09_00-14-36
  done: false
  episode_len_mean: 36.46
  episode_media: {}
  episode_reward_max: 22.949999999999996
  episode_reward_mean: 8.965099999999998
  episode_reward_min: 2.350000000000012
  episodes_this_iter: 45
  episodes_total: 1644
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.328194889000484
          entropy_coeff: 0.009999999999999998
          kl: 0.007879520792622973
          policy_loss: -0.11331479682454042
          total_loss: 0.5338726339240869
          vf_explained_var: 0.9501395225524902
          vf_loss: 0.6581055204783167
    num_agent_steps_sampled: 95954
    num_agent_steps_trained: 95954
    num_steps_sampled: 95954
    num_steps_trained: 95954
  iterations_since_restore: 23
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,73,12798.3,95954,8.9651,22.95,2.35,36.46




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 97952
  custom_metrics: {}
  date: 2021-11-09_00-21-27
  done: false
  episode_len_mean: 42.48
  episode_media: {}
  episode_reward_max: 22.949999999999996
  episode_reward_mean: 9.588299999999995
  episode_reward_min: 2.350000000000012
  episodes_this_iter: 47
  episodes_total: 1691
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2810787609645298
          entropy_coeff: 0.009999999999999998
          kl: 0.009203716537508733
          policy_loss: 0.039179489850288345
          total_loss: 0.6113199625696455
          vf_explained_var: 0.9559618234634399
          vf_loss: 0.582190142713842
    num_agent_steps_sampled: 97952
    num_agent_steps_trained: 97952
    num_steps_sampled: 97952
    num_steps_trained: 97952
  iterations_since_restore: 24
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,74,13208.8,97952,9.5883,22.95,2.35,42.48




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 99950
  custom_metrics: {}
  date: 2021-11-09_00-30-19
  done: false
  episode_len_mean: 36.97
  episode_media: {}
  episode_reward_max: 23.130000000000006
  episode_reward_mean: 9.269199999999994
  episode_reward_min: 4.23000000000001
  episodes_this_iter: 56
  episodes_total: 1747
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2443254592872801
          entropy_coeff: 0.009999999999999998
          kl: 0.015068073893164459
          policy_loss: -0.02474795560396853
          total_loss: 0.994323534766833
          vf_explained_var: 0.931327760219574
          vf_loss: 1.0269943150736036
    num_agent_steps_sampled: 99950
    num_agent_steps_trained: 99950
    num_steps_sampled: 99950
    num_steps_trained: 99950
  iterations_since_restore: 25
  no

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,75,13741,99950,9.2692,23.13,4.23,36.97




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 101948
  custom_metrics: {}
  date: 2021-11-09_00-39-42
  done: false
  episode_len_mean: 35.04
  episode_media: {}
  episode_reward_max: 23.130000000000006
  episode_reward_mean: 9.298199999999996
  episode_reward_min: 1.7000000000000113
  episodes_this_iter: 58
  episodes_total: 1805
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2243566907587506
          entropy_coeff: 0.009999999999999998
          kl: 0.010061088463769154
          policy_loss: -0.011651653193292163
          total_loss: 0.9070868819952012
          vf_explained_var: 0.9533015489578247
          vf_loss: 0.9279637739771889
    num_agent_steps_sampled: 101948
    num_agent_steps_trained: 101948
    num_steps_sampled: 101948
    num_steps_trained: 101948
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,76,14304,101948,9.2982,23.13,1.7,35.04




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 103946
  custom_metrics: {}
  date: 2021-11-09_00-44-37
  done: false
  episode_len_mean: 40.57
  episode_media: {}
  episode_reward_max: 22.960000000000004
  episode_reward_mean: 9.241699999999998
  episode_reward_min: 1.7000000000000113
  episodes_this_iter: 42
  episodes_total: 1847
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.4001074194908143
          entropy_coeff: 0.009999999999999998
          kl: 0.009186759318344284
          policy_loss: 0.024083018515791213
          total_loss: 0.5895612811758405
          vf_explained_var: 0.9662694334983826
          vf_loss: 0.5767233049585706
    num_agent_steps_sampled: 103946
    num_agent_steps_trained: 103946
    num_steps_sampled: 103946
    num_steps_trained: 103946
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,77,14598.3,103946,9.2417,22.96,1.7,40.57




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 105944
  custom_metrics: {}
  date: 2021-11-09_00-52-28
  done: false
  episode_len_mean: 40.08
  episode_media: {}
  episode_reward_max: 20.569999999999915
  episode_reward_mean: 9.110499999999995
  episode_reward_min: 1.9200000000000137
  episodes_this_iter: 54
  episodes_total: 1901
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2548319175129845
          entropy_coeff: 0.009999999999999998
          kl: 0.013463069676473299
          policy_loss: -0.02620292493984813
          total_loss: 0.5829726975233782
          vf_explained_var: 0.9630655646324158
          vf_loss: 0.6176850128741491
    num_agent_steps_sampled: 105944
    num_agent_steps_trained: 105944
    num_steps_sampled: 105944
    num_steps_trained: 105944
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,78,15069.8,105944,9.1105,20.57,1.92,40.08




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 107942
  custom_metrics: {}
  date: 2021-11-09_01-00-35
  done: false
  episode_len_mean: 36.32
  episode_media: {}
  episode_reward_max: 20.569999999999915
  episode_reward_mean: 9.732799999999989
  episode_reward_min: 1.9700000000000168
  episodes_this_iter: 57
  episodes_total: 1958
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.1940159093765985
          entropy_coeff: 0.009999999999999998
          kl: 0.014934635615275998
          policy_loss: -0.01153114079719498
          total_loss: 0.5860929142861139
          vf_explained_var: 0.9594265818595886
          vf_loss: 0.6050838270357677
    num_agent_steps_sampled: 107942
    num_agent_steps_trained: 107942
    num_steps_sampled: 107942
    num_steps_trained: 107942
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,79,15556.6,107942,9.7328,20.57,1.97,36.32




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 109940
  custom_metrics: {}
  date: 2021-11-09_01-06-45
  done: false
  episode_len_mean: 36.95
  episode_media: {}
  episode_reward_max: 23.130000000000006
  episode_reward_mean: 10.005399999999987
  episode_reward_min: 5.080000000000004
  episodes_this_iter: 50
  episodes_total: 2008
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.157391905784607
          entropy_coeff: 0.009999999999999998
          kl: 0.016166385761270764
          policy_loss: -0.02781083480942817
          total_loss: 1.0233000482831682
          vf_explained_var: 0.9432592391967773
          vf_loss: 1.0578348802668707
    num_agent_steps_sampled: 109940
    num_agent_steps_trained: 109940
    num_steps_sampled: 109940
    num_steps_trained: 109940
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,80,15926.5,109940,10.0054,23.13,5.08,36.95




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 111938
  custom_metrics: {}
  date: 2021-11-09_01-14-54
  done: false
  episode_len_mean: 36.6
  episode_media: {}
  episode_reward_max: 23.130000000000006
  episode_reward_mean: 10.001599999999991
  episode_reward_min: 5.010000000000006
  episodes_this_iter: 55
  episodes_total: 2063
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.0338018198808034
          entropy_coeff: 0.009999999999999998
          kl: 0.014228750047386579
          policy_loss: -0.05012822746343556
          total_loss: 1.233716427286466
          vf_explained_var: 0.9288076758384705
          vf_loss: 1.2899140474342166
    num_agent_steps_sampled: 111938
    num_agent_steps_trained: 111938
    num_steps_sampled: 111938
    num_steps_trained: 111938
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,81,16415.5,111938,10.0016,23.13,5.01,36.6




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 113936
  custom_metrics: {}
  date: 2021-11-09_01-22-17
  done: false
  episode_len_mean: 38.27
  episode_media: {}
  episode_reward_max: 23.170000000000005
  episode_reward_mean: 10.12009999999999
  episode_reward_min: 1.560000000000012
  episodes_this_iter: 47
  episodes_total: 2110
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.2856246448698498
          entropy_coeff: 0.009999999999999998
          kl: 0.013040037246031049
          policy_loss: -0.017098074690217062
          total_loss: 0.6909475913005215
          vf_explained_var: 0.9615551829338074
          vf_loss: 0.7169899029391152
    num_agent_steps_sampled: 113936
    num_agent_steps_trained: 113936
    num_steps_sampled: 113936
    num_steps_trained: 113936
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,82,16858.7,113936,10.1201,23.17,1.56,38.27




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 115934
  custom_metrics: {}
  date: 2021-11-09_01-32-22
  done: false
  episode_len_mean: 37.15
  episode_media: {}
  episode_reward_max: 23.170000000000005
  episode_reward_mean: 10.413499999999992
  episode_reward_min: 1.560000000000012
  episodes_this_iter: 66
  episodes_total: 2176
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.0115785556180137
          entropy_coeff: 0.009999999999999998
          kl: 0.013025757743058043
          policy_loss: -0.011179761375699725
          total_loss: 0.6994928460745584
          vf_explained_var: 0.9573458433151245
          vf_loss: 0.7168806663581303
    num_agent_steps_sampled: 115934
    num_agent_steps_trained: 115934
    num_steps_sampled: 115934
    num_steps_trained: 115934
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,83,17463.5,115934,10.4135,23.17,1.56,37.15




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 117932
  custom_metrics: {}
  date: 2021-11-09_01-41-38
  done: false
  episode_len_mean: 31.66
  episode_media: {}
  episode_reward_max: 23.049999999999997
  episode_reward_mean: 9.506299999999992
  episode_reward_min: 1.8600000000000145
  episodes_this_iter: 63
  episodes_total: 2239
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.137314323300407
          entropy_coeff: 0.009999999999999998
          kl: 0.0135492561666033
          policy_loss: 0.031007483175822668
          total_loss: 0.9205818786507561
          vf_explained_var: 0.9476696848869324
          vf_loss: 0.8968827524355479
    num_agent_steps_sampled: 117932
    num_agent_steps_trained: 117932
    num_steps_sampled: 117932
    num_steps_trained: 117932
  iterations_since_restore: 3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,84,18019.4,117932,9.5063,23.05,1.86,31.66




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 119930
  custom_metrics: {}
  date: 2021-11-09_01-49-52
  done: false
  episode_len_mean: 33.84
  episode_media: {}
  episode_reward_max: 23.120000000000005
  episode_reward_mean: 9.834999999999992
  episode_reward_min: 1.9800000000000173
  episodes_this_iter: 60
  episodes_total: 2299
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.0446897730940865
          entropy_coeff: 0.009999999999999998
          kl: 0.01276408700384787
          policy_loss: -0.05879712195268699
          total_loss: 0.8872587360086895
          vf_explained_var: 0.9506505131721497
          vf_loss: 0.9526735247600646
    num_agent_steps_sampled: 119930
    num_agent_steps_trained: 119930
    num_steps_sampled: 119930
    num_steps_trained: 119930
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,85,18513.3,119930,9.835,23.12,1.98,33.84




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 121928
  custom_metrics: {}
  date: 2021-11-09_01-58-37
  done: false
  episode_len_mean: 32.37
  episode_media: {}
  episode_reward_max: 23.080000000000005
  episode_reward_mean: 9.559699999999994
  episode_reward_min: 2.280000000000016
  episodes_this_iter: 60
  episodes_total: 2359
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.0652773962134408
          entropy_coeff: 0.009999999999999998
          kl: 0.011054242350101592
          policy_loss: 0.019146063090080306
          total_loss: 0.8333444062443006
          vf_explained_var: 0.9654128551483154
          vf_loss: 0.8215348462263743
    num_agent_steps_sampled: 121928
    num_agent_steps_trained: 121928
    num_steps_sampled: 121928
    num_steps_trained: 121928
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,86,19037.8,121928,9.5597,23.08,2.28,32.37




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 123926
  custom_metrics: {}
  date: 2021-11-09_02-06-27
  done: false
  episode_len_mean: 32.0
  episode_media: {}
  episode_reward_max: 23.200000000000003
  episode_reward_mean: 9.598199999999993
  episode_reward_min: 2.1900000000000155
  episodes_this_iter: 64
  episodes_total: 2423
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.034100372734524
          entropy_coeff: 0.009999999999999998
          kl: 0.01182809703799834
          policy_loss: 0.020458337025982994
          total_loss: 1.0958794362488247
          vf_explained_var: 0.9523130655288696
          vf_loss: 1.082213664622534
    num_agent_steps_sampled: 123926
    num_agent_steps_trained: 123926
    num_steps_sampled: 123926
    num_steps_trained: 123926
  iterations_since_restore: 37

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,87,19508.2,123926,9.5982,23.2,2.19,32




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 125924
  custom_metrics: {}
  date: 2021-11-09_02-16-17
  done: false
  episode_len_mean: 29.07
  episode_media: {}
  episode_reward_max: 23.200000000000003
  episode_reward_mean: 10.214999999999998
  episode_reward_min: 2.1900000000000155
  episodes_this_iter: 64
  episodes_total: 2487
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 0.933320411897841
          entropy_coeff: 0.009999999999999998
          kl: 0.01015516697762203
          policy_loss: 0.014184952207974026
          total_loss: 1.0353746337550027
          vf_explained_var: 0.9618438482284546
          vf_loss: 1.0274763284694581
    num_agent_steps_sampled: 125924
    num_agent_steps_trained: 125924
    num_steps_sampled: 125924
    num_steps_trained: 125924
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,88,20098,125924,10.215,23.2,2.19,29.07




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 127922
  custom_metrics: {}
  date: 2021-11-09_02-23-21
  done: false
  episode_len_mean: 32.54
  episode_media: {}
  episode_reward_max: 23.200000000000003
  episode_reward_mean: 10.859299999999994
  episode_reward_min: 2.5300000000000082
  episodes_this_iter: 57
  episodes_total: 2544
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 0.8995607478278024
          entropy_coeff: 0.009999999999999998
          kl: 0.008147990961513812
          policy_loss: -0.0649423785507679
          total_loss: 0.7330140660206477
          vf_explained_var: 0.9657805562019348
          vf_loss: 0.8045076540538243
    num_agent_steps_sampled: 127922
    num_agent_steps_trained: 127922
    num_steps_sampled: 127922
    num_steps_trained: 127922
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,89,20522.2,127922,10.8593,23.2,2.53,32.54




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 129920
  custom_metrics: {}
  date: 2021-11-09_02-32-07
  done: false
  episode_len_mean: 33.74
  episode_media: {}
  episode_reward_max: 23.19
  episode_reward_mean: 11.006099999999995
  episode_reward_min: 2.410000000000013
  episodes_this_iter: 63
  episodes_total: 2607
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 0.8860342942533039
          entropy_coeff: 0.009999999999999998
          kl: 0.012825825856345823
          policy_loss: -0.028723895194984618
          total_loss: 0.7359317973433506
          vf_explained_var: 0.9681501984596252
          vf_loss: 0.7696682932830993
    num_agent_steps_sampled: 129920
    num_agent_steps_trained: 129920
    num_steps_sampled: 129920
    num_steps_trained: 129920
  iterations_since_restore: 40
  node_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,90,21048,129920,11.0061,23.19,2.41,33.74




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 131918
  custom_metrics: {}
  date: 2021-11-09_02-38-52
  done: false
  episode_len_mean: 35.91
  episode_media: {}
  episode_reward_max: 23.19
  episode_reward_mean: 10.721899999999996
  episode_reward_min: 0.509999999999999
  episodes_this_iter: 55
  episodes_total: 2662
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.011393259820484
          entropy_coeff: 0.009999999999999998
          kl: 0.012000094253843492
          policy_loss: -0.026058791311723846
          total_loss: 0.932048050349667
          vf_explained_var: 0.9598470330238342
          vf_loss: 0.9646207404988153
    num_agent_steps_sampled: 131918
    num_agent_steps_trained: 131918
    num_steps_sampled: 131918
    num_steps_trained: 131918
  iterations_since_restore: 41
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,91,21453,131918,10.7219,23.19,0.51,35.91




Result for PPO_my_env_1379e_00000:
  agent_timesteps_total: 133916
  custom_metrics: {}
  date: 2021-11-09_02-48-30
  done: false
  episode_len_mean: 31.87
  episode_media: {}
  episode_reward_max: 23.210000000000004
  episode_reward_mean: 10.969699999999996
  episode_reward_min: 2.500000000000011
  episodes_this_iter: 68
  episodes_total: 2730
  experiment_id: 73503dd6be0c4d36b5d2eea88c6f3415
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 0.6993827950386774
          entropy_coeff: 0.009999999999999998
          kl: 0.01341681058710361
          policy_loss: 0.016916793017160325
          total_loss: 1.260288166999817
          vf_explained_var: 0.9535560011863708
          vf_loss: 1.2463401665290197
    num_agent_steps_sampled: 133916
    num_agent_steps_trained: 133916
    num_steps_sampled: 133916
    num_steps_trained: 133916
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_1379e_00000,RUNNING,192.168.1.96:62698,92,22030.4,133916,10.9697,23.21,2.5,31.87


