In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder: six (Conv2d + ELU) stages followed by a flatten.

    Every convolution uses a 2x2 kernel with stride 2 and no padding, so each
    stage halves the spatial resolution. The channel widths grow
    3 -> 32 -> 32 -> 64 -> 128 -> 256 -> 512; a 64x64 input is therefore
    reduced to a 1x1 map and flattened into 512 features per sample.
    """

    def __init__(self):
        super().__init__()

        # Channel counts at the boundaries of consecutive convolution stages.
        widths = (3, 32, 32, 64, 128, 256, 512)

        stages = []
        for in_channels, out_channels in zip(widths[:-1], widths[1:]):
            stages.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2, padding=0)
            )
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # The layers keep positions 0..12 inside `cnn`, so parameter names
        # ("cnn.0.weight", "cnn.2.weight", ...) match previously saved weights.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Encode a batch of channel-first images into flat feature vectors."""
        return self.cnn(x)

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib actor-critic model over a first-person image and a target block grid.

    The model has two input branches:

    * ``obs['pov']`` goes through a ``VisualEncoder`` whose weights are loaded
      from disk. It is evaluated under ``torch.no_grad()``, so this forward
      pass sends no gradient into the encoder.
    * ``obs['target_grid']`` is one-hot encoded into 7 classes and mixed by a
      1x1x1 ``Conv3d`` into a single channel, then flattened.

    Both feature vectors are concatenated and passed through an MLP trunk that
    feeds a policy-logit head and a scalar value head.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build all sub-networks and load the pretrained visual encoder.

        Args:
            obs_space: Observation space, forwarded to ``TorchModelV2``.
            action_space: Action space; ``action_space.n`` sets the number of
                policy logits.
            num_outputs: Forwarded to ``TorchModelV2``.
            model_config: Forwarded to ``TorchModelV2``.
            name: Forwarded to ``TorchModelV2``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        # Width of the flattened VisualEncoder output.
        visual_features_dim = 512
        # The target encoder emits one channel, so its flattened size equals
        # the number of cells in the 9 x 11 x 11 grid.
        target_features_dim = 9 * 11 * 11
        self.visual_encoder = VisualEncoder()
        # Load onto CPU first; the module is moved to the GPU below if needed.
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        # 1x1x1 convolution: a per-cell learned mix of the 7 one-hot classes.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimates from the most recent forward(); read by value_function().
        self.last_value = None

        # Kept as a public attribute for compatibility. forward() does not
        # rely on it; it reads the device from the weights instead.
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute policy logits and cache the value estimates.

        Args:
            input_dict: Mapping whose ``'obs'`` entry holds ``'pov'`` (permuted
                here from channel-last to channel-first and divided by 255)
                and ``'target_grid'`` (cast to integer class ids and one-hot
                encoded with 7 classes).
            state: Passed through unchanged.
            seq_lens: Unused.

        Returns:
            Tuple ``(logits, state)``.
        """
        obs = input_dict['obs']
        # Tensor.cuda()/Tensor.to() return a new tensor rather than moving the
        # tensor in place, so the result must be assigned. Use the device the
        # weights actually live on, so inputs and parameters always agree even
        # if the model was moved after construction.
        device = self.action_head.weight.device
        pov = obs['pov'].to(device).permute(0, 3, 1, 2).float() / 255.0
        target = one_hot(
            obs['target_grid'].to(device).long(), num_classes=7
        ).permute(0, 4, 1, 2, 3).float()

        # The pretrained encoder is used as a fixed feature extractor.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        # Drop the trailing singleton dimension: one value per batch element.
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimates cached by the most recent ``forward()``."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Register the model class under the name that the trainer config's
# "custom_model" entry refers to.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing the POV image, inventory and compass angle.

    When ``include_target`` is true, the current task's target grid is added
    under ``'target_grid'``, both in ``observation_space`` and in every
    returned observation.
    """

    def __init__(self, env, include_target=False):
        """
        Args:
            env: Environment to wrap.
            include_target: Whether to declare and emit the ``'target_grid'``
                entry.
        """
        super().__init__(env)
        # Remembered so observation() returns exactly the keys declared below.
        self.include_target = include_target
        spaces = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            spaces['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(spaces)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict described by ``observation_space``.

        Args:
            obs: Raw observation providing ``'pov'``, ``'inventory'`` and
                ``obs['compass']['angle']``.
            reward: Unused.
            done: Unused.
            info: Optional step info dict. If it carries ``'target_grid'``,
                that entry is consumed (removed from ``info``).

        Returns:
            Dict with ``'pov'`` (cast to float32), ``'inventory'``,
            ``'compass'`` (one-element array) and, only when the wrapper was
            built with ``include_target=True``, ``'target_grid'``.
        """
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                # The step info lacks the grid: log it, ask the underlying
                # env to reset if it supports that, and fall back to the
                # current task's grid.
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid
        result = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Emit the grid only when it is part of the declared space; otherwise
        # the observation would contain a key the Dict space does not have.
        if self.include_target:
            result['target_grid'] = target_grid
        return result

In [6]:
import os

# Pin GPU enumeration to PCI bus order and make only device "0" visible
# (see issue #152).
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

class RewardWrapper(gym.RewardWrapper):
    """Reward shaping: penalise zero-reward steps and shrink unit rewards.

    * a reward of exactly 0 becomes -0.01;
    * a reward of magnitude exactly 1 is divided by 10;
    * every other reward passes through unchanged.
    """

    def __init__(self, env):
        super().__init__(env)

    def reward(self, rew):
        """Return the shaped version of ``rew``."""
        if rew == 0:
            # The replacement value has magnitude 0.01, so the unit-reward
            # rule below could never apply to it.
            return -0.01
        if abs(rew) == 1:
            rew /= 10
        return rew
    
def env_creator(env_config):
    """Build the fully wrapped IGLU environment used for training.

    Args:
        env_config: Optional mapping of overrides (``None`` or an empty
            mapping keeps the defaults). Recognised keys:

            * ``"max_steps"``: episode step limit passed to ``gym.make``
              (default ``1000``).
            * ``"tasks"``: iterable of task ids for the ``TaskSet`` preset
              (default ``['C3', 'C17', 'C32']``).

    Returns:
        The environment wrapped, innermost first, by
        ``VisualObservationWrapper`` (with the target grid),
        ``SelectAndPlace``, ``Discretization`` and ``RewardWrapper``.
    """
    # Previously this argument was ignored; the defaults reproduce the former
    # hard-coded setup exactly.
    env_config = env_config or {}
    max_steps = env_config.get('max_steps', 1000)
    tasks = list(env_config.get('tasks', ['C3', 'C17', 'C32']))

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=tasks))
    env = VisualObservationWrapper(env, include_target=True)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    env = RewardWrapper(env)
    return env

from ray.tune.registry import register_env
# Register the creator under the name that the trainer config's "env" entry
# refers to.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Weights & Biases run metadata, handed to the logger through "logger_config".
wandb_settings = {
    "project": "IGLU-Minecraft",
    "name": "PPO MultiTask (C3, C17, C32) pretrained (AngelaCNN) (3 noops after placement) r: -0.01 div10",
}

# PPO trainer configuration; "env" and "custom_model" refer to the names
# registered with register_env / ModelCatalog.
ppo_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 1,
    "sgd_minibatch_size": 256,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 1000,
    #"gamma": 0.99,
    "model": {
        # Use the custom model registered under this name.
        "custom_model": "my_torch_model",
        # Extra keyword arguments for the model's constructor (none here).
        "custom_model_config": {},
    },
    "logger_config": {
        "wandb": wandb_settings,
    },
}

# Launch training: checkpoint every 5 iterations and at the end, keeping at
# most 50 checkpoints under local_dir.
analysis = tune.run(
    PPOTrainer,
    config=ppo_config,
    loggers=[WandbLogger],
    local_dir="/IGLU-Minecraft/checkpoints/",
    keep_checkpoints_num=50,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)



Trial name,status,loc
PPO_my_env_c2555_00000,PENDING,


2021-11-05 20:27:13,090	INFO wandb.py:170 -- Already logged into W&B.
2021-11-05 20:27:13,129	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=382971)[0m 2021-11-05 20:27:17,995	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=382971)[0m 2021-11-05 20:27:17,995	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=382971)[0m 2021-11-05 20:27:25,806	INFO trainable.py:109 -- Trainable.setup took 11.331 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 1000
  custom_metrics: {}
  date: 2021-11-05_20-28-55
  done: false
  episode_len_mean: 411.0
  episode_media: {}
  episode_reward_max: -4.139999999999956
  episode_reward_mean: -4.154999999999957
  episode_reward_min: -4.169999999999957
  episodes_this_iter: 2
  episodes_total: 2
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.878750189145406
          entropy_coeff: 0.009999999999999998
          kl: 0.009509325984693628
          policy_loss: 0.046141261772976984
          total_loss: 0.020941973477602006
          vf_explained_var: 0.08785516023635864
          vf_loss: 0.0016863471168714265
    num_agent_steps_sampled: 1000
    num_agent_steps_trained: 1000
    num_steps_sampled: 1000
    num_steps_trained: 1000
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,1,89.402,1000,-4.155,-4.14,-4.17,411


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 2000
  custom_metrics: {}
  date: 2021-11-05_20-29-31
  done: false
  episode_len_mean: 399.8
  episode_media: {}
  episode_reward_max: -1.2100000000000022
  episode_reward_mean: -3.5679999999999694
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 5
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.869719534450107
          entropy_coeff: 0.009999999999999998
          kl: 0.010928736950149285
          policy_loss: -0.08635586665736304
          total_loss: 0.09094753488898277
          vf_explained_var: -0.1483485996723175
          vf_loss: 0.2038148485744993
    num_agent_steps_sampled: 2000
    num_agent_steps_trained: 2000
    num_steps_sampled: 2000
    num_steps_trained: 2000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,2,125.339,2000,-3.568,-1.21,-4.98,399.8




Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 3000
  custom_metrics: {}
  date: 2021-11-05_20-30-46
  done: false
  episode_len_mean: 394.85714285714283
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.6871428571428324
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 7
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.8639131599002416
          entropy_coeff: 0.009999999999999998
          kl: 0.00800819651705233
          policy_loss: -0.12533319724930658
          total_loss: -0.0022743696139918435
          vf_explained_var: 0.1485067754983902
          vf_loss: 0.15009631680117713
    num_agent_steps_sampled: 3000
    num_agent_steps_trained: 3000
    num_steps_sampled: 3000
    num_steps_trained: 3000
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,3,200.702,3000,-2.68714,2.06,-4.98,394.857


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-11-05_20-31-22
  done: false
  episode_len_mean: 399.7
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.9199999999999746
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 10
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.8478457000520496
          entropy_coeff: 0.009999999999999998
          kl: 0.008916679019131004
          policy_loss: -0.04861029527253575
          total_loss: 0.021084851523240408
          vf_explained_var: 0.11147531867027283
          vf_loss: 0.09639026907065676
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,4,236.47,4000,-2.92,2.06,-4.98,399.7


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 5000
  custom_metrics: {}
  date: 2021-11-05_20-31-56
  done: false
  episode_len_mean: 401.4166666666667
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.4199999999999773
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 12
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.835487882296244
          entropy_coeff: 0.009999999999999998
          kl: 0.008582832141086044
          policy_loss: 0.07007198515865538
          total_loss: 0.27668222954703703
          vf_explained_var: 0.2568071484565735
          vf_loss: 0.23324856099983057
    num_agent_steps_sampled: 5000
    num_agent_steps_trained: 5000
    num_steps_sampled: 5000
    num_steps_trained: 5000
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,5,270.28,5000,-2.42,2.06,-4.98,401.417


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 6000
  custom_metrics: {}
  date: 2021-11-05_20-32-30
  done: false
  episode_len_mean: 404.35714285714283
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.2499999999999756
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 14
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.818758993678623
          entropy_coeff: 0.009999999999999998
          kl: 0.009373819079898817
          policy_loss: 0.06912527639004919
          total_loss: 0.15560172374049822
          vf_explained_var: 0.366378515958786
          vf_loss: 0.11278927152355513
    num_agent_steps_sampled: 6000
    num_agent_steps_trained: 6000
    num_steps_sampled: 6000
    num_steps_trained: 6000
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,6,304.478,6000,-2.25,2.06,-4.98,404.357


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 7000
  custom_metrics: {}
  date: 2021-11-05_20-33-06
  done: false
  episode_len_mean: 401.2352941176471
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.2264705882352716
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 17
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.798145898183187
          entropy_coeff: 0.009999999999999998
          kl: 0.010994973299824542
          policy_loss: -0.07510843964086639
          total_loss: -0.023549755497111215
          vf_explained_var: 0.3355380892753601
          vf_loss: 0.07734114856769642
    num_agent_steps_sampled: 7000
    num_agent_steps_trained: 7000
    num_steps_sampled: 7000
    num_steps_trained: 7000
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,7,340.502,7000,-2.22647,2.06,-4.98,401.235


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-11-05_20-33-41
  done: false
  episode_len_mean: 400.8421052631579
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.0594736842105057
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 19
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7989277601242066
          entropy_coeff: 0.009999999999999998
          kl: 0.008321602485222412
          policy_loss: -0.08513370272186067
          total_loss: 0.10303599459843503
          vf_explained_var: 0.23065054416656494
          vf_loss: 0.21449464874135124
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,8,375.788,8000,-2.05947,2.06,-4.98,400.842


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 9000
  custom_metrics: {}
  date: 2021-11-05_20-34-16
  done: false
  episode_len_mean: 400.09090909090907
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.0963636363636176
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 22
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.778216176562839
          entropy_coeff: 0.009999999999999998
          kl: 0.011919210666138862
          policy_loss: -0.005463524659474691
          total_loss: 0.11884123952024513
          vf_explained_var: -0.03294960409402847
          vf_loss: 0.14970308239054347
    num_agent_steps_sampled: 9000
    num_agent_steps_trained: 9000
    num_steps_sampled: 9000
    num_steps_trained: 9000
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,9,410.577,9000,-2.09636,2.06,-4.98,400.091


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 10000
  custom_metrics: {}
  date: 2021-11-05_20-34-50
  done: false
  episode_len_mean: 401.5416666666667
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.1504166666666475
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 24
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.750260093477037
          entropy_coeff: 0.009999999999999998
          kl: 0.011243702691025328
          policy_loss: -0.04557857612768809
          total_loss: 0.0479824673384428
          vf_explained_var: 0.5034226179122925
          vf_loss: 0.11881490502920415
    num_agent_steps_sampled: 10000
    num_agent_steps_trained: 10000
    num_steps_sampled: 10000
    num_steps_trained: 10000
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,10,444.009,10000,-2.15042,2.06,-4.98,401.542


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 11000
  custom_metrics: {}
  date: 2021-11-05_20-35-26
  done: false
  episode_len_mean: 401.8888888888889
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.212222222222204
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 27
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.766965937614441
          entropy_coeff: 0.009999999999999998
          kl: 0.008916482369115325
          policy_loss: 0.018910453882482317
          total_loss: 0.1863621167010731
          vf_explained_var: 0.30861523747444153
          vf_loss: 0.1933380252785153
    num_agent_steps_sampled: 11000
    num_agent_steps_trained: 11000
    num_steps_sampled: 11000
    num_steps_trained: 11000
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,11,479.832,11000,-2.21222,2.06,-4.98,401.889


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-11-05_20-36-00
  done: false
  episode_len_mean: 401.7586206896552
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.1358620689654995
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 29
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7743186288409762
          entropy_coeff: 0.009999999999999998
          kl: 0.010717347678773682
          policy_loss: -0.09303775102727943
          total_loss: 0.06297180698149735
          vf_explained_var: 0.2833046615123749
          vf_loss: 0.18160927432278792
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12000
    num_steps_trained: 12000
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,12,514.23,12000,-2.13586,2.06,-4.98,401.759


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 13000
  custom_metrics: {}
  date: 2021-11-05_20-36-36
  done: false
  episode_len_mean: 402.46875
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.0365624999999805
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 32
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.744903972413805
          entropy_coeff: 0.009999999999999998
          kl: 0.010858998144247718
          policy_loss: -0.06344245200355848
          total_loss: 0.09839994926005602
          vf_explained_var: 0.6244847774505615
          vf_loss: 0.1871196443008052
    num_agent_steps_sampled: 13000
    num_agent_steps_trained: 13000
    num_steps_sampled: 13000
    num_steps_trained: 13000
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,13,549.672,13000,-2.03656,2.06,-4.98,402.469


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 14000
  custom_metrics: {}
  date: 2021-11-05_20-37-13
  done: false
  episode_len_mean: 401.61764705882354
  episode_media: {}
  episode_reward_max: 2.0600000000000063
  episode_reward_mean: -2.1114705882352744
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 34
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7772578345404733
          entropy_coeff: 0.009999999999999998
          kl: 0.011247855246502562
          policy_loss: -0.025533474940392705
          total_loss: 0.06885179264677896
          vf_explained_var: 0.5300604701042175
          vf_loss: 0.11990827259918053
    num_agent_steps_sampled: 14000
    num_agent_steps_trained: 14000
    num_steps_sampled: 14000
    num_steps_trained: 14000


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,14,586.969,14000,-2.11147,2.06,-4.98,401.618




Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 15000
  custom_metrics: {}
  date: 2021-11-05_20-38-26
  done: false
  episode_len_mean: 391.60526315789474
  episode_media: {}
  episode_reward_max: 5.33
  episode_reward_mean: -1.9265789473684014
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 4
  episodes_total: 38
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7573852247662014
          entropy_coeff: 0.009999999999999998
          kl: 0.008852936254648194
          policy_loss: 0.025173151327504053
          total_loss: 0.13699780946804418
          vf_explained_var: 0.45000430941581726
          vf_loss: 0.13762792299191157
    num_agent_steps_sampled: 15000
    num_agent_steps_trained: 15000
    num_steps_sampled: 15000
    num_steps_trained: 15000
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,15,660.138,15000,-1.92658,5.33,-4.98,391.605




Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-11-05_20-40-29
  done: false
  episode_len_mean: 371.2093023255814
  episode_media: {}
  episode_reward_max: 5.33
  episode_reward_mean: -1.4644186046511445
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 5
  episodes_total: 43
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7185783359739517
          entropy_coeff: 0.009999999999999998
          kl: 0.01313044895265002
          policy_loss: -0.1678030978060431
          total_loss: 0.16328392285439702
          vf_explained_var: 0.11948759108781815
          vf_loss: 0.35564670852488944
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,16,783.332,16000,-1.46442,5.33,-4.98,371.209


Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 17000
  custom_metrics: {}
  date: 2021-11-05_20-41-06
  done: false
  episode_len_mean: 372.97777777777776
  episode_media: {}
  episode_reward_max: 5.33
  episode_reward_mean: -1.4337777777777596
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 2
  episodes_total: 45
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.716971156332228
          entropy_coeff: 0.009999999999999998
          kl: 0.010306969684661462
          policy_loss: -0.00444153282377455
          total_loss: 0.058341307938098906
          vf_explained_var: 0.3316081762313843
          vf_loss: 0.08789115924802092
    num_agent_steps_sampled: 17000
    num_agent_steps_trained: 17000
    num_steps_sampled: 17000
    num_steps_trained: 17000
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,17,819.505,17000,-1.43378,5.33,-4.98,372.978




Result for PPO_my_env_c2555_00000:
  agent_timesteps_total: 18000
  custom_metrics: {}
  date: 2021-11-05_20-42-16
  done: false
  episode_len_mean: 368.375
  episode_media: {}
  episode_reward_max: 5.33
  episode_reward_mean: -1.2806249999999808
  episode_reward_min: -4.979999999999956
  episodes_this_iter: 3
  episodes_total: 48
  experiment_id: ff870b931ac0466ba6c548d9d0717a19
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6692831092410616
          entropy_coeff: 0.009999999999999998
          kl: 0.010933409736867574
          policy_loss: -0.10132481687598759
          total_loss: 0.07588327959593799
          vf_explained_var: 0.17513208091259003
          vf_loss: 0.20171424320174589
    num_agent_steps_sampled: 18000
    num_agent_steps_trained: 18000
    num_steps_sampled: 18000
    num_steps_trained: 18000
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c2555_00000,RUNNING,192.168.1.96:382971,18,889.934,18000,-1.28062,5.33,-4.98,368.375
