In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional encoder that turns an image batch into flat feature vectors.

    Six ``Conv2d`` layers (kernel 2, stride 2, no padding), each followed by
    ``ELU``, halve the spatial size at every stage; a trailing ``Flatten``
    collapses whatever is left.  A ``(N, 3, 64, 64)`` input therefore yields a
    ``(N, 512)`` output.
    """

    # (in_channels, out_channels) of each successive convolution.
    _CHANNEL_PLAN = ((3, 32), (32, 32), (32, 64), (64, 128), (128, 256), (256, 512))

    def __init__(self):
        super().__init__()

        stages = []
        for in_ch, out_ch in self._CHANNEL_PLAN:
            stages.append(nn.Conv2d(in_ch, out_ch, kernel_size=2, stride=2, padding=0))
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # The layer order fixes the ``cnn.<index>`` state-dict keys, so it must
        # not change if previously saved weights are to keep loading.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the convolutional stack to ``x`` and return the flattened features."""
        return self.cnn(x)

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model fusing a pretrained image encoder with a target-grid encoder.

    The ``pov`` image goes through a ``VisualEncoder`` whose weights are loaded
    from disk and which is evaluated under ``torch.no_grad()`` (so it receives
    no gradients).  The ``target_grid`` is one-hot encoded into 7 classes and
    reduced to a single channel by a 1x1x1 ``Conv3d``.  Both feature vectors
    are concatenated and fed to an MLP trunk with separate action-logit and
    value heads.

    Args:
        obs_space: Observation space, forwarded to ``TorchModelV2``.
        action_space: Action space; ``action_space.n`` sets the number of logits.
        num_outputs: Forwarded to ``TorchModelV2``.
        model_config: Forwarded to ``TorchModelV2``.
        name: Forwarded to ``TorchModelV2``.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # Width of the VisualEncoder output and of the flattened target-grid
        # features (one channel over a 9 x 11 x 11 grid).
        visual_features_dim = 512
        target_features_dim = 9 * 11 * 11

        self.visual_encoder = VisualEncoder()
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )

        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate of the most recent forward() call; read by value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            # Moves every registered sub-module (both encoders, trunk, both heads).
            self.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Args:
            input_dict: Must hold ``input_dict['obs']`` with tensor entries
                ``'pov'`` (channels-last image batch) and ``'target_grid'``
                (integer class ids in ``[0, 7)``).
            state: Passed through unchanged.
            seq_lens: Unused.

        Returns:
            ``(logits, state)`` where ``logits`` has ``action_space.n`` columns.
        """
        obs = input_dict['obs']

        # Tensor.cuda() returns a copy instead of moving in place, so the
        # result has to be kept.  Target the device the weights actually live
        # on rather than "cuda if available": the module may have been moved
        # again after construction.
        device = next(self.parameters()).device

        # (B, H, W, C) -> (B, C, H, W), scaled to [0, 1].
        pov = obs['pov'].to(device).permute(0, 3, 1, 2).float() / 255.0
        # (B, D, H, W) ids -> (B, D, H, W, 7) one-hot -> (B, 7, D, H, W).
        target = one_hot(obs['target_grid'].to(device).long(), num_classes=7).permute(0, 4, 1, 2, 3).float()

        # The pretrained encoder is kept out of the autograd graph.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the latest ``forward()`` call.

        Raises:
            AssertionError: If ``forward()`` has not been called yet.
        """
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Stand-alone MLP with the same layer sizes as MyModelClass.policy_network,
# built here only to count its parameters.
visual_features_dim, target_features_dim = 512, 9 * 11 * 11
policy_hidden_dim = 256

# Input width followed by the output width of each Linear layer.
_trunk_widths = [
    visual_features_dim + target_features_dim,
    1024,
    512,
    policy_hidden_dim,
    policy_hidden_dim,
]

policy_network = nn.Sequential(*[
    layer
    for fan_in, fan_out in zip(_trunk_widths, _trunk_widths[1:])
    for layer in (nn.Linear(fan_in, fan_out), nn.ELU())
])

# Total number of scalar parameters (weights and biases) in the MLP.
sum(tensor.numel() for tensor in policy_network.parameters())

2362368

In [5]:
# Register MyModelClass with RLlib's ModelCatalog under the name "my_torch_model";
# the trainer config refers to the model by this name.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing ``pov``, ``inventory``, ``compass`` and,
    optionally, the task's ``target_grid``.

    Args:
        env: Environment to wrap.
        include_target: When true, a ``'target_grid'`` entry of shape
            ``(9, 11, 11)`` is declared in ``observation_space`` and emitted by
            ``observation()``.  When false, neither happens, so the returned
            dict always matches the declared space.
    """

    def __init__(self, env, include_target=False):
        super().__init__(env)
        # Remembered so observation() emits exactly the keys declared below.
        self.include_target = include_target
        self.observation_space = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict described by ``observation_space``.

        Args:
            obs: Raw observation; ``obs['pov']``, ``obs['inventory']`` and
                ``obs['compass']['angle']`` are read.
            reward: Unused.
            done: Unused.
            info: Optional step info.  If it carries ``'target_grid'`` that
                entry is consumed (removed from ``info``); if it is given but
                lacks the key, the situation is logged as an error, a reset is
                requested when the unwrapped env supports ``should_reset``, and
                the grid is read from the current task instead.

        Returns:
            Dict with ``'pov'`` (float32), ``'inventory'``, ``'compass'``
            (length-1 array) and, only if ``include_target`` was set,
            ``'target_grid'``.
        """
        if info is not None and 'target_grid' in info:
            target_grid = info['target_grid']
            del info['target_grid']
        else:
            if info is not None:
                # info was supplied but does not carry the grid: report it and
                # ask the underlying env for a reset if it knows how.
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
            target_grid = self.env.unwrapped.tasks.current.target_grid

        converted = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Only emit the key when the observation space declares it; otherwise
        # the observation would not match observation_space.
        if self.include_target:
            converted['target_grid'] = target_grid
        return converted

In [7]:
import os
# Order CUDA devices by PCI bus id and make only device 0 visible.
os.environ.update({"CUDA_DEVICE_ORDER": "PCI_BUS_ID", "CUDA_VISIBLE_DEVICES": "0"})   # see issue #152

# Task ids C1 .. C155, leaving out C38.
tasks = [f'C{task_no}' for task_no in range(1, 156) if task_no != 38]
    
class RewardWrapper(gym.RewardWrapper):
    """Reward shaping: a zero reward becomes -0.01 and a reward of exactly
    +1 or -1 is divided by 10.  Every other value passes through unchanged.
    """

    def reward(self, rew):
        """Return the shaped version of ``rew``."""
        if rew == 0:
            # Small penalty in place of a zero reward.
            return -0.01
        return rew / 10 if abs(rew) == 1 else rew
    
def env_creator(env_config):
    """Build the wrapped IGLU environment used for training.

    Args:
        env_config: Accepted for the env-creator call signature; not read.

    Returns:
        The 'IGLUSilentBuilder-v0' env (max 250 steps) restricted to ``tasks``
        and wrapped, innermost first, by VisualObservationWrapper (with target
        grid), SelectAndPlace, Discretization ('human-level' flat action
        space) and RewardWrapper.
    """
    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=250)
    base_env.update_taskset(TaskSet(preset=tasks))

    with_observations = VisualObservationWrapper(base_env, include_target=True)
    with_discrete_actions = Discretization(
        SelectAndPlace(with_observations),
        flat_action_space('human-level'),
    )
    return RewardWrapper(with_discrete_actions)

from ray.tune.registry import register_env
# Register env_creator with Ray Tune under the name "my_env" so the trainer
# config can refer to the environment by that string.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Weights & Biases project / run name, passed to the logger via "logger_config".
wandb_settings = {
    "project": "IGLU-Minecraft",
    "name": "PPO All Tasks pretrained (AngelaCNN) (3 noops after placement) r: -0.01 div10",
}

# Trainer configuration: registered env and custom model plus PPO hyperparameters.
ppo_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 3,
    "sgd_minibatch_size": 256,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 5000,
    "lr": 1e-4,
    "model": {
        # The custom model registered above; no extra constructor kwargs.
        "custom_model": "my_torch_model",
        "custom_model_config": {},
    },
    "logger_config": {"wandb": wandb_settings},
}

# Launch training; a checkpoint is written every 5 iterations and at the end,
# keeping at most 50 of them.
analysis = tune.run(
    PPOTrainer,
    config=ppo_config,
    loggers=[WandbLogger],
    local_dir="/IGLU-Minecraft/checkpoints/all_tasks",
    keep_checkpoints_num=50,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)



Trial name,status,loc
PPO_my_env_2d626_00000,PENDING,


2021-11-07 11:02:20,138	INFO wandb.py:170 -- Already logged into W&B.
2021-11-07 11:02:20,186	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=480810)[0m 2021-11-07 11:02:25,484	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=480810)[0m 2021-11-07 11:02:25,484	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=480810)[0m 2021-11-07 11:02:36,843	INFO trainable.py:109 -- Trainable.setup took 15.320 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-07_11-06-04
  done: false
  episode_len_mean: 100.33673469387755
  episode_media: {}
  episode_reward_max: 2.7100000000000017
  episode_reward_mean: -0.8501020408163271
  episode_reward_min: -1.5200000000000007
  episodes_this_iter: 98
  episodes_total: 98
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8824299178571784
          entropy_coeff: 0.01
          kl: 0.006863669508837896
          policy_loss: -0.015661495382714476
          total_loss: -0.013310577764979795
          vf_explained_var: -0.3116755485534668
          vf_loss: 0.02980248174081859
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,1,207.762,9996,-0.850102,2.71,-1.52,100.337


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-07_11-08-26
  done: false
  episode_len_mean: 98.63725490196079
  episode_media: {}
  episode_reward_max: 4.980000000000009
  episode_reward_mean: -0.6757843137254905
  episode_reward_min: -1.5299999999999994
  episodes_this_iter: 102
  episodes_total: 200
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8692716274506007
          entropy_coeff: 0.01
          kl: 0.008616683900278067
          policy_loss: -0.022889049405343514
          total_loss: 0.0355423043983487
          vf_explained_var: 0.005149574019014835
          vf_loss: 0.08540073371607747
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,2,349.598,19992,-0.675784,4.98,-1.53,98.6373


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-07_11-10-46
  done: false
  episode_len_mean: 96.68932038834951
  episode_media: {}
  episode_reward_max: 4.760000000000003
  episode_reward_mean: 0.14650485436893218
  episode_reward_min: -2.1599999999999997
  episodes_this_iter: 103
  episodes_total: 303
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8413309853301087
          entropy_coeff: 0.01
          kl: 0.012549451784464913
          policy_loss: -0.02715505621372125
          total_loss: 0.1572444661297541
          vf_explained_var: 0.2533511221408844
          vf_loss: 0.21030294048225778
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,3,489.234,29988,0.146505,4.76,-2.16,96.6893




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-07_11-13-47
  done: false
  episode_len_mean: 94.99056603773585
  episode_media: {}
  episode_reward_max: 4.950000000000011
  episode_reward_mean: -0.020377358490565843
  episode_reward_min: -1.8600000000000012
  episodes_this_iter: 106
  episodes_total: 409
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8288764187413404
          entropy_coeff: 0.01
          kl: 0.01456120752207934
          policy_loss: -0.03160574484266277
          total_loss: 0.1447193642385686
          vf_explained_var: 0.32688796520233154
          vf_loss: 0.2017016316461576
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,4,670.65,39984,-0.0203774,4.95,-1.86,94.9906


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-07_11-16-13
  done: false
  episode_len_mean: 99.20792079207921
  episode_media: {}
  episode_reward_max: 6.740000000000011
  episode_reward_mean: 0.3672277227722781
  episode_reward_min: -1.800000000000001
  episodes_this_iter: 101
  episodes_total: 510
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7900016894707313
          entropy_coeff: 0.01
          kl: 0.017356929272214057
          policy_loss: -0.034028121275015366
          total_loss: 0.21365480529001127
          vf_explained_var: 0.49137574434280396
          vf_loss: 0.27211155756416483
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,5,816.643,49980,0.367228,6.74,-1.8,99.2079


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-07_11-18-42
  done: false
  episode_len_mean: 101.12
  episode_media: {}
  episode_reward_max: 4.700000000000011
  episode_reward_mean: 0.7994000000000016
  episode_reward_min: -1.890000000000001
  episodes_this_iter: 97
  episodes_total: 607
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7651500689677704
          entropy_coeff: 0.01
          kl: 0.0169888024268638
          policy_loss: -0.02865576571983914
          total_loss: 0.2669947608947181
          vf_explained_var: 0.526110053062439
          vf_loss: 0.3199042649032214
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  iterations_since_restore: 6
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,6,964.911,59976,0.7994,4.7,-1.89,101.12




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-07_11-21-28
  done: false
  episode_len_mean: 99.87128712871286
  episode_media: {}
  episode_reward_max: 8.68000000000001
  episode_reward_mean: 1.159405940594062
  episode_reward_min: -1.980000000000001
  episodes_this_iter: 101
  episodes_total: 708
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7489571300327267
          entropy_coeff: 0.01
          kl: 0.01725398939928771
          policy_loss: -0.030773013384423703
          total_loss: 0.2902104803273438
          vf_explained_var: 0.4236692786216736
          vf_loss: 0.3450222675807965
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,7,1131.36,69972,1.15941,8.68,-1.98,99.8713




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-07_11-24-33
  done: false
  episode_len_mean: 100.62
  episode_media: {}
  episode_reward_max: 8.430000000000009
  episode_reward_mean: 1.1632000000000022
  episode_reward_min: -2.3299999999999983
  episodes_this_iter: 99
  episodes_total: 807
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7295347162800976
          entropy_coeff: 0.01
          kl: 0.017289181047384604
          policy_loss: -0.029841385960069477
          total_loss: 0.32073425146329226
          vf_explained_var: 0.3982108235359192
          vf_loss: 0.3744131489728506
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,8,1316.57,79968,1.1632,8.43,-2.33,100.62


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-11-07_11-27-19
  done: false
  episode_len_mean: 99.79
  episode_media: {}
  episode_reward_max: 8.900000000000011
  episode_reward_mean: 1.2344000000000026
  episode_reward_min: -1.960000000000001
  episodes_this_iter: 100
  episodes_total: 907
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.713480949605632
          entropy_coeff: 0.01
          kl: 0.0183344661856083
          policy_loss: -0.03112746148298566
          total_loss: 0.32692285677752436
          vf_explained_var: 0.4642910361289978
          vf_loss: 0.3815182348792879
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  iterations_since_restore: 9
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,9,1482.46,89964,1.2344,8.9,-1.96,99.79


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-11-07_11-30-11
  done: false
  episode_len_mean: 100.39
  episode_media: {}
  episode_reward_max: 6.690000000000014
  episode_reward_mean: 1.3341000000000034
  episode_reward_min: -2.3399999999999985
  episodes_this_iter: 99
  episodes_total: 1006
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.6809067689455475
          entropy_coeff: 0.01
          kl: 0.01793057939114074
          policy_loss: -0.03519265549655399
          total_loss: 0.2704306389658879
          vf_explained_var: 0.592963457107544
          vf_loss: 0.3288462467873708
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iterations_since_restore: 10

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,10,1654.54,99960,1.3341,6.69,-2.34,100.39




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-11-07_11-33-12
  done: false
  episode_len_mean: 100.87
  episode_media: {}
  episode_reward_max: 12.240000000000016
  episode_reward_mean: 1.614600000000004
  episode_reward_min: -2.159999999999998
  episodes_this_iter: 100
  episodes_total: 1106
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.662437510286641
          entropy_coeff: 0.01
          kl: 0.021484141610822162
          policy_loss: -0.03407769135844249
          total_loss: 0.30724473188225276
          vf_explained_var: 0.674784243106842
          vf_loss: 0.36364996986001985
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,11,1835.08,109956,1.6146,12.24,-2.16,100.87


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-11-07_11-35-48
  done: false
  episode_len_mean: 100.29
  episode_media: {}
  episode_reward_max: 6.240000000000016
  episode_reward_mean: 1.202200000000004
  episode_reward_min: -1.6700000000000008
  episodes_this_iter: 99
  episodes_total: 1205
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6180043240897675
          entropy_coeff: 0.01
          kl: 0.019916820623234393
          policy_loss: -0.03613648228984103
          total_loss: 0.3193798809001843
          vf_explained_var: 0.5948522686958313
          vf_loss: 0.3757213587562243
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterations_since_restore: 12
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,12,1990.77,119952,1.2022,6.24,-1.67,100.29




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-11-07_11-38-42
  done: false
  episode_len_mean: 102.01
  episode_media: {}
  episode_reward_max: 8.010000000000018
  episode_reward_mean: 1.5771000000000046
  episode_reward_min: -1.9700000000000006
  episodes_this_iter: 99
  episodes_total: 1304
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.614659170207814
          entropy_coeff: 0.01
          kl: 0.021549868906797946
          policy_loss: -0.034046648546225494
          total_loss: 0.3504248603605307
          vf_explained_var: 0.605085551738739
          vf_loss: 0.40415314065340235
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948
  iterations_since_restore: 13
  node_i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,13,2164.82,129948,1.5771,8.01,-1.97,102.01


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-11-07_11-41-18
  done: false
  episode_len_mean: 102.45
  episode_media: {}
  episode_reward_max: 8.35000000000001
  episode_reward_mean: 1.7069000000000047
  episode_reward_min: -2.219999999999996
  episodes_this_iter: 97
  episodes_total: 1401
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.5943965365744046
          entropy_coeff: 0.01
          kl: 0.01987462641473992
          policy_loss: -0.03398648996988678
          total_loss: 0.3431404800209989
          vf_explained_var: 0.6039495468139648
          vf_loss: 0.3941273544206578
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,14,2321.39,139944,1.7069,8.35,-2.22,102.45




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-11-07_11-44-33
  done: false
  episode_len_mean: 101.33
  episode_media: {}
  episode_reward_max: 10.870000000000019
  episode_reward_mean: 2.2744000000000058
  episode_reward_min: -2.3000000000000003
  episodes_this_iter: 99
  episodes_total: 1500
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.5963516989324846
          entropy_coeff: 0.01
          kl: 0.021240241890703844
          policy_loss: -0.03041607791510148
          total_loss: 0.37552797342133193
          vf_explained_var: 0.6254737973213196
          vf_loss: 0.42234946002817564
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,15,2516.32,149940,2.2744,10.87,-2.3,101.33


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-11-07_11-47-13
  done: false
  episode_len_mean: 102.04
  episode_media: {}
  episode_reward_max: 8.360000000000015
  episode_reward_mean: 1.8732000000000053
  episode_reward_min: -2.05
  episodes_this_iter: 98
  episodes_total: 1598
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.59061807139307
          entropy_coeff: 0.01
          kl: 0.01842378289118406
          policy_loss: -0.03514813760566151
          total_loss: 0.31956948150331393
          vf_explained_var: 0.6440045833587646
          vf_loss: 0.3681877443805719
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  iterations_since_restore: 16
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,16,2676.03,159936,1.8732,8.36,-2.05,102.04


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-11-07_11-49-53
  done: false
  episode_len_mean: 102.84
  episode_media: {}
  episode_reward_max: 14.260000000000007
  episode_reward_mean: 2.000500000000006
  episode_reward_min: -2.279999999999997
  episodes_this_iter: 97
  episodes_total: 1695
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.5483795161940095
          entropy_coeff: 0.01
          kl: 0.022659175432540844
          policy_loss: -0.031090029104588888
          total_loss: 0.36387887579572
          vf_explained_var: 0.6696630120277405
          vf_loss: 0.4051577560762819
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,17,2835.85,169932,2.0005,14.26,-2.28,102.84




Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-11-07_11-52-51
  done: false
  episode_len_mean: 97.72549019607843
  episode_media: {}
  episode_reward_max: 10.610000000000015
  episode_reward_mean: 1.9702941176470643
  episode_reward_min: -1.8300000000000007
  episodes_this_iter: 102
  episodes_total: 1797
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5413418700552395
          entropy_coeff: 0.01
          kl: 0.017422394847111736
          policy_loss: -0.03603555550559973
          total_loss: 0.35187148137225044
          vf_explained_var: 0.6744968891143799
          vf_loss: 0.39568027912551523
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,18,3013.98,179928,1.97029,10.61,-1.83,97.7255


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-11-07_11-55-32
  done: false
  episode_len_mean: 99.74
  episode_media: {}
  episode_reward_max: 12.290000000000017
  episode_reward_mean: 2.283800000000006
  episode_reward_min: -1.890000000000001
  episodes_this_iter: 100
  episodes_total: 1897
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.5340015947309316
          entropy_coeff: 0.01
          kl: 0.01759126624453554
          policy_loss: -0.03222607903373547
          total_loss: 0.33446433681867316
          vf_explained_var: 0.6947193741798401
          vf_loss: 0.37421927476922673
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924
  iterations_since_restore: 19
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,19,3174.5,189924,2.2838,12.29,-1.89,99.74


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-11-07_11-58-11
  done: false
  episode_len_mean: 99.94059405940594
  episode_media: {}
  episode_reward_max: 8.75000000000001
  episode_reward_mean: 2.449504950495056
  episode_reward_min: -2.08
  episodes_this_iter: 101
  episodes_total: 1998
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.500853073087513
          entropy_coeff: 0.01
          kl: 0.018609703954739022
          policy_loss: -0.02951853795088509
          total_loss: 0.3500171518558238
          vf_explained_var: 0.7009139060974121
          vf_loss: 0.3857018941838262
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920
  iterations_since_restore: 20
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,20,3333.36,199920,2.4495,8.75,-2.08,99.9406


Result for PPO_my_env_2d626_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-11-07_12-00-51
  done: false
  episode_len_mean: 98.61386138613861
  episode_media: {}
  episode_reward_max: 8.690000000000012
  episode_reward_mean: 2.1023762376237687
  episode_reward_min: -2.359999999999997
  episodes_this_iter: 101
  episodes_total: 2099
  experiment_id: df9a4fe7112046deb1a947ba7e3d3727
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.521676038676857
          entropy_coeff: 0.01
          kl: 0.015389998334360765
          policy_loss: -0.03269229088233322
          total_loss: 0.3262905108400135
          vf_explained_var: 0.6767266988754272
          vf_loss: 0.3686171882555016
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2d626_00000,RUNNING,192.168.1.96:480810,21,3493.29,209916,2.10238,8.69,-2.36,98.6139


