In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder: five Conv2d+ReLU stages, then a flatten.

    All layers are held in ``self.cnn`` (an ``nn.Sequential``), so the
    parameters are named ``cnn.0.*``, ``cnn.2.*``, ..., ``cnn.8.*``. Saved
    state dicts rely on that layout, so the layer order must not change.
    """

    def __init__(self):
        super().__init__()

        # One row per conv stage: (in_channels, out_channels, kernel_size, stride).
        conv_specs = (
            (3, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
            (64, 64, 3, 1),
            (64, 512, 2, 1),
        )

        stages = []
        for c_in, c_out, kernel, step in conv_specs:
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=step, padding=0))
            stages.append(nn.ReLU())
        # Collapse every non-batch dimension into one feature vector per sample.
        stages.append(nn.Flatten())

        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the conv stack to ``x`` and return the flattened activations."""
        features = self.cnn(x)
        return features

In [3]:
class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model: a pretrained ``VisualEncoder`` with linear policy and value heads.

    ``forward`` returns action logits and caches the value estimate, which
    ``value_function`` then returns.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoder, load its pretrained weights and create both heads.

        Args:
            obs_space, action_space, num_outputs, model_config, name: forwarded
                unchanged to ``TorchModelV2.__init__``. ``action_space.n`` sets
                the width of the action head, so ``action_space`` must expose
                ``.n``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        # Input width of both heads; must equal the flattened encoder output size.
        features_dim = 512
        self.encoder = VisualEncoder()
        # Load on CPU first so the checkpoint opens even on machines without a GPU.
        self.encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AnnaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.action_head = nn.Linear(features_dim, action_space.n)
        self.value_head = nn.Linear(features_dim, 1)
        # Value estimate from the most recent forward(); None until forward() runs.
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.encoder.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits for a batch of observations.

        Args:
            input_dict: mapping with an ``'obs'`` tensor. The permute below
                requires it to be 4-D with channels last; the division by 255
                presumably expects 0-255 pixel values (confirm against the env).
            state: passed through unchanged.
            seq_lens: unused.

        Returns:
            ``(logits, state)``. The value head output for the same batch is
            stored in ``self.last_value`` as a side effect.
        """
        # Channels-last -> channels-first for Conv2d, then scale to floats.
        obs = input_dict['obs'].permute(0, 3, 1, 2).float() / 255.0
        # Tensor.cuda()/Tensor.to() return a new tensor rather than moving the
        # receiver in place, so the result must be assigned. Target the device
        # the encoder weights actually live on, which stays correct whichever
        # device the model ended up on.
        obs = obs.to(next(self.encoder.parameters()).device)

        features = self.encoder(obs)
        action = self.action_head(features)
        # (batch, 1) -> (batch,)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the latest ``forward`` call.

        Raises:
            AssertionError: if ``forward`` has not been called yet.
        """
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Register MyModelClass with RLlib's ModelCatalog under the name "my_torch_model";
# the training config selects it by that name via the "custom_model" key.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
import os

# Pin GPU selection through the CUDA environment variables: order devices by
# PCI bus id and expose only device "0".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
})

def env_creator(env_config):
    """Build the wrapped 'IGLUSilentBuilder-v0' environment.

    Args:
        env_config: mapping of environment options; may be empty or ``None``.
            Recognised optional keys:
              - ``"max_steps"``: forwarded to ``gym.make`` (default ``1000``).
              - ``"task_preset"``: forwarded to ``TaskSet(preset=...)``
                (default ``['C8']``).
            With neither key present the result matches the previous
            hard-coded behaviour.

    Returns:
        The environment wrapped in ``PovOnlyWrapper`` and then
        ``IgluActionWrapper``.
    """
    # Tolerate a missing config so the defaults apply.
    settings = env_config or {}
    max_steps = settings.get("max_steps", 1000)
    task_preset = settings.get("task_preset", ['C8'])

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=task_preset))
    env = PovOnlyWrapper(env)
    env = IgluActionWrapper(env)
    return env

# NOTE(review): notebook convention is to keep all imports in the first cell;
# these are left in place here.
from ray.tune.registry import register_env
# Register env_creator under the name "my_env"; the training config refers to
# the environment by this name via its "env" key.
register_env("my_env", env_creator)

# Tune entry point and the PPO trainer class used by the training cell below.
from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# PPO trainer configuration, built up front so the tune.run call stays short.
ppo_config = {
    # Environment and framework.
    "env": "my_env",
    "framework": "torch",
    # Resources: one GPU for the trainer, one rollout worker.
    "num_gpus": 1,
    "num_workers": 1,
    # Batch sizes.
    "train_batch_size": 1000,
    "sgd_minibatch_size": 256,
    # PPO loss hyperparameters.
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "model": {
        # Specify our custom model from above.
        "custom_model": "my_torch_model",
        # Extra kwargs to be passed to your model's c'tor.
        "custom_model_config": {},
    },
    # Settings read by the W&B logger.
    "logger_config": {
        "wandb": {
            "project": "IGLU-Minecraft",
            "name": "PPO C8 pretrained (AnnaCNN)",
        },
    },
}

# No stop criterion is given, so the trial runs until it is interrupted.
analysis = tune.run(PPOTrainer, config=ppo_config, loggers=[WandbLogger])



Trial name,status,loc
PPO_my_env_ba7ae_00000,PENDING,


2021-10-02 07:52:45,934	INFO wandb.py:170 -- Already logged into W&B.
2021-10-02 07:52:46,167	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[2m[36m(pid=50048)[0m 2021-10-02 07:52:52,665	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=50048)[0m 2021-10-02 07:52:52,665	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


[2m[36m(pid=50048)[0m 2021-10-02 07:55:59,270	INFO trainable.py:109 -- Trainable.setup took 191.149 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 1000
  custom_metrics: {}
  date: 2021-10-02_08-00-09
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 1.7033940553665161
          entropy_coeff: 0.009999999999999998
          kl: 0.0057295713520358026
          policy_loss: -0.08215594333079126
          total_loss: -0.0870968044632011
          vf_explained_var: -0.010583942756056786
          vf_loss: 0.010947168166361128
    num_agent_steps_sampled: 1000
    num_agent_steps_trained: 1000
    num_steps_sampled: 1000
    num_steps_trained: 1000
  iterations_since_restore: 1
  node_ip: 192.168.1.96
  num_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,1,250.376,1000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 2000
  custom_metrics: {}
  date: 2021-10-02_08-00-24
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 2
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 1.6928278128306071
          entropy_coeff: 0.009999999999999998
          kl: 0.00904520250513509
          policy_loss: -0.09347861359516779
          total_loss: -0.10446699799762832
          vf_explained_var: 0.20475783944129944
          vf_loss: 0.0041308524901978675
    num_agent_steps_sampled: 2000
    num_agent_steps_trained: 2000
    num_steps_sampled: 2000
    num_steps_trained: 2000
  iterations_since_restore: 2
  node_ip: 192.168.1.96
  num_he

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,2,265.592,2000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 3000
  custom_metrics: {}
  date: 2021-10-02_08-00-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 3
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 1.6625326010915968
          entropy_coeff: 0.009999999999999998
          kl: 0.015246788401227462
          policy_loss: -0.13549536143740018
          total_loss: -0.14701449589596854
          vf_explained_var: 0.182473286986351
          vf_loss: 0.0020568322244798763
    num_agent_steps_sampled: 3000
    num_agent_steps_trained: 3000
    num_steps_sampled: 3000
    num_steps_trained: 3000
  iterations_since_restore: 3
  node_ip: 192.168.1.96
  num_hea

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,3,280.302,3000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-10-02_08-00-54
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 4
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 1.711222055223253
          entropy_coeff: 0.009999999999999998
          kl: 0.01555327559668432
          policy_loss: -0.14178306518329514
          total_loss: -0.15419599894020292
          vf_explained_var: 0.1656806766986847
          vf_loss: 0.0015886313563290362
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 4
  node_ip: 192.168.1.96
  num_heal

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,4,294.946,4000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 5000
  custom_metrics: {}
  date: 2021-10-02_08-01-11
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 5
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.414307369126214
          entropy_coeff: 0.009999999999999998
          kl: 0.011918093467656253
          policy_loss: -0.22832251009013918
          total_loss: -0.24024085861941177
          vf_explained_var: 0.20854517817497253
          vf_loss: 0.009841103280066616
    num_agent_steps_sampled: 5000
    num_agent_steps_trained: 5000
    num_steps_sampled: 5000
    num_steps_trained: 5000
  iterations_since_restore: 5
  node_ip: 192.168.1.96
  num_hea

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,5,312.016,5000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 6000
  custom_metrics: {}
  date: 2021-10-02_08-01-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 6
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.350110077857971
          entropy_coeff: 0.009999999999999998
          kl: 0.012982086003341959
          policy_loss: -0.06267721951007843
          total_loss: -0.057134075545602375
          vf_explained_var: 0.3407261371612549
          vf_loss: 0.026447828233035073
    num_agent_steps_sampled: 6000
    num_agent_steps_trained: 6000
    num_steps_sampled: 6000
    num_steps_trained: 6000
  iterations_since_restore: 6
  node_ip: 192.168.1.96
  num_hea

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,6,328.396,6000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 7000
  custom_metrics: {}
  date: 2021-10-02_08-01-45
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 7
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.3456860780715942
          entropy_coeff: 0.009999999999999998
          kl: 0.011152195521835691
          policy_loss: -0.10105257810403903
          total_loss: -0.10958368144929409
          vf_explained_var: 0.49182116985321045
          vf_loss: 0.012695318750209279
    num_agent_steps_sampled: 7000
    num_agent_steps_trained: 7000
    num_steps_sampled: 7000
    num_steps_trained: 7000
  iterations_since_restore: 7
  node_ip: 192.168.1.96
  num_he

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,7,345.597,7000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-10-02_08-02-02
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 8
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 1.9639955202738444
          entropy_coeff: 0.009999999999999998
          kl: 0.026395138365796277
          policy_loss: -0.008113656839769747
          total_loss: -0.0015758781849096219
          vf_explained_var: 0.19546976685523987
          vf_loss: 0.020898711740867132
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 8
  node_ip: 192.168.1.96
  num

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,8,363.091,8000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 9000
  custom_metrics: {}
  date: 2021-10-02_08-02-17
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 9
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.9588770164383782
          entropy_coeff: 0.009999999999999998
          kl: 0.007259550089954203
          policy_loss: -0.1982526576353444
          total_loss: -0.21398147319753966
          vf_explained_var: 0.23233485221862793
          vf_loss: 0.0016820880480938488
    num_agent_steps_sampled: 9000
    num_agent_steps_trained: 9000
    num_steps_sampled: 9000
    num_steps_trained: 9000
  iterations_since_restore: 9
  node_ip: 192.168.1.96
  num_healthy_workers: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,9,377.597,9000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 10000
  custom_metrics: {}
  date: 2021-10-02_08-02-33
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 10
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.296233571900262
          entropy_coeff: 0.009999999999999998
          kl: 0.010914303352281262
          policy_loss: -0.14011111992100875
          total_loss: -0.15559267128507295
          vf_explained_var: 0.19625301659107208
          vf_loss: 0.004206494155288156
    num_agent_steps_sampled: 10000
    num_agent_steps_trained: 10000
    num_steps_sampled: 10000
    num_steps_trained: 10000
  iterations_since_restore: 10
  node_ip: 192.168.1.96
  num_healthy_work

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,10,393.402,10000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 11000
  custom_metrics: {}
  date: 2021-10-02_08-02-49
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 11
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.386650392744276
          entropy_coeff: 0.009999999999999998
          kl: 0.009933997771297559
          policy_loss: -0.09689303172959221
          total_loss: -0.11367242965433333
          vf_explained_var: 0.22492121160030365
          vf_loss: 0.004106906537587444
    num_agent_steps_sampled: 11000
    num_agent_steps_trained: 11000
    num_steps_sampled: 11000
    num_steps_trained: 11000
  iterations_since_restore: 11
  node_ip: 192.168.1.96
  num_healthy_work

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,11,410.033,11000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-10-02_08-03-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 12
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.8740591565767923
          entropy_coeff: 0.009999999999999998
          kl: 0.005029126855457831
          policy_loss: -0.201232731466492
          total_loss: -0.21762040766576926
          vf_explained_var: 0.23509421944618225
          vf_loss: 0.0008441742221798955
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 12
  node_ip: 192.168.1.96
  num_healthy_work

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,12,425.16,12000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 13000
  custom_metrics: {}
  date: 2021-10-02_08-03-22
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 13
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.380464103486803
          entropy_coeff: 0.009999999999999998
          kl: 0.00878413366902536
          policy_loss: -0.16162001540263493
          total_loss: -0.1798016674609648
          vf_explained_var: -0.2627602219581604
          vf_loss: 0.0029877474738491906
    num_agent_steps_sampled: 13000
    num_agent_steps_trained: 13000
    num_steps_sampled: 13000
    num_steps_trained: 13000
  iterations_since_restore: 13
  node_ip: 192.168.1.96
  num_healthy_worke

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,13,442.933,13000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 14000
  custom_metrics: {}
  date: 2021-10-02_08-03-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 14
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.350667651494344
          entropy_coeff: 0.009999999999999998
          kl: 0.01220372760733852
          policy_loss: -0.09368460807535384
          total_loss: -0.11101704790360398
          vf_explained_var: -0.06730862706899643
          vf_loss: 0.002513117877404309
    num_agent_steps_sampled: 14000
    num_agent_steps_trained: 14000
    num_steps_sampled: 14000
    num_steps_trained: 14000
  iterations_since_restore: 14
  node_ip: 192.168.1.96
  num_healthy_work

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,14,459.435,14000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 15000
  custom_metrics: {}
  date: 2021-10-02_08-03-55
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 15
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.3587653080622357
          entropy_coeff: 0.009999999999999998
          kl: 0.010714324040161502
          policy_loss: -0.10776974144909117
          total_loss: -0.12615874136487643
          vf_explained_var: -0.5301680564880371
          vf_loss: 0.001984355960222375
    num_agent_steps_sampled: 15000
    num_agent_steps_trained: 15000
    num_steps_sampled: 15000
    num_steps_trained: 15000
  iterations_since_restore: 15
  node_ip: 192.168.1.96
  num_healthy_wor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,15,475.606,15000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-10-02_08-04-12
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 16
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.2662361515892875
          entropy_coeff: 0.009999999999999998
          kl: 0.010066390428302666
          policy_loss: -0.09792334909240405
          total_loss: -0.11547860784663094
          vf_explained_var: -0.9567546248435974
          vf_loss: 0.002087186795607623
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_since_restore: 16
  node_ip: 192.168.1.96
  num_healthy_wor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,16,492.136,16000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 17000
  custom_metrics: {}
  date: 2021-10-02_08-04-29
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 17
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.295942846934001
          entropy_coeff: 0.009999999999999998
          kl: 0.009359804366588955
          policy_loss: -0.09024037793278694
          total_loss: -0.1084925083650483
          vf_explained_var: -0.9236060976982117
          vf_loss: 0.0018993563342115118
    num_agent_steps_sampled: 17000
    num_agent_steps_trained: 17000
    num_steps_sampled: 17000
    num_steps_trained: 17000
  iterations_since_restore: 17
  node_ip: 192.168.1.96
  num_healthy_work

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,17,508.96,17000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 18000
  custom_metrics: {}
  date: 2021-10-02_08-04-45
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 18
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.308965635299683
          entropy_coeff: 0.009999999999999998
          kl: 0.010724465343371833
          policy_loss: -0.1465073295144571
          total_loss: -0.1646737790770001
          vf_explained_var: -0.9383972883224487
          vf_loss: 0.001705866341944784
    num_agent_steps_sampled: 18000
    num_agent_steps_trained: 18000
    num_steps_sampled: 18000
    num_steps_trained: 18000
  iterations_since_restore: 18
  node_ip: 192.168.1.96
  num_healthy_worker

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,18,525.109,18000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 19000
  custom_metrics: {}
  date: 2021-10-02_08-05-00
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 19
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.215860652923584
          entropy_coeff: 0.009999999999999998
          kl: 0.009488277918171306
          policy_loss: -0.2925342020061281
          total_loss: -0.31007658309406705
          vf_explained_var: -0.35835760831832886
          vf_loss: 0.0017697355173166014
    num_agent_steps_sampled: 19000
    num_agent_steps_trained: 19000
    num_steps_sampled: 19000
    num_steps_trained: 19000
  iterations_since_restore: 19
  node_ip: 192.168.1.96
  num_healthy_wor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,19,540.789,19000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2021-10-02_08-05-17
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 20
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.24553022119734
          entropy_coeff: 0.009999999999999998
          kl: 0.009003435538894104
          policy_loss: -0.05972740418381161
          total_loss: -0.0773909698964821
          vf_explained_var: -0.9506805539131165
          vf_loss: 0.002090705519852539
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 20
  node_ip: 192.168.1.96
  num_healthy_worker

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,20,557.243,20000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 21000
  custom_metrics: {}
  date: 2021-10-02_08-05-34
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 21
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.189465970463223
          entropy_coeff: 0.009999999999999998
          kl: 0.009367137870199235
          policy_loss: -0.03264522407617834
          total_loss: -0.049852876075439985
          vf_explained_var: -1.0
          vf_loss: 0.001876864143155722
    num_agent_steps_sampled: 21000
    num_agent_steps_trained: 21000
    num_steps_sampled: 21000
    num_steps_trained: 21000
  iterations_since_restore: 21
  node_ip: 192.168.1.96
  num_healthy_workers: 1
  off_p

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,21,574.193,21000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 22000
  custom_metrics: {}
  date: 2021-10-02_08-05-51
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 22
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.247083483801948
          entropy_coeff: 0.009999999999999998
          kl: 0.009809034730543557
          policy_loss: -0.12085900281866392
          total_loss: -0.13882948611345555
          vf_explained_var: -1.0
          vf_loss: 0.0015576437076864143
    num_agent_steps_sampled: 22000
    num_agent_steps_trained: 22000
    num_steps_sampled: 22000
    num_steps_trained: 22000
  iterations_since_restore: 22
  node_ip: 192.168.1.96
  num_healthy_workers: 1
  off_p

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,22,591.06,22000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 23000
  custom_metrics: {}
  date: 2021-10-02_08-06-06
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 23
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.29186880853441
          entropy_coeff: 0.009999999999999998
          kl: 0.012569230207493062
          policy_loss: -0.060284621434079276
          total_loss: -0.07803077167934841
          vf_explained_var: -0.48386430740356445
          vf_loss: 0.0014017721528135653
    num_agent_steps_sampled: 23000
    num_agent_steps_trained: 23000
    num_steps_sampled: 23000
    num_steps_trained: 23000
  iterations_since_restore: 23
  node_ip: 192.168.1.96
  num_healthy_wo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,23,606.647,23000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2021-10-02_08-06-20
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 24
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.7910295168558756
          entropy_coeff: 0.009999999999999998
          kl: 0.007871447651408816
          policy_loss: -0.16238004378974438
          total_loss: -0.17768344949516984
          vf_explained_var: 0.29554057121276855
          vf_loss: 0.00024546026638240113
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_steps_sampled: 24000
    num_steps_trained: 24000
  iterations_since_restore: 24
  node_ip: 192.168.1.96
  num_healthy_w

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,24,620.455,24000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 25000
  custom_metrics: {}
  date: 2021-10-02_08-06-35
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 25
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 1.7241903556717766
          entropy_coeff: 0.009999999999999998
          kl: 0.004136660007062155
          policy_loss: -0.15480173548890486
          total_loss: -0.1703178840999802
          vf_explained_var: 0.04485338553786278
          vf_loss: 0.00048475229840631883
    num_agent_steps_sampled: 25000
    num_agent_steps_trained: 25000
    num_steps_sampled: 25000
    num_steps_trained: 25000
  iterations_since_restore: 25
  node_ip: 192.168.1.96
  num_healthy_wo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,25,634.876,25000,0,0,0,1000


Result for PPO_my_env_ba7ae_00000:
  agent_timesteps_total: 26000
  custom_metrics: {}
  date: 2021-10-02_08-06-49
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 26
  experiment_id: e6b91d6d56dc46e0b618e1e217a489ac
  hostname: cdsserver
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.15
          cur_lr: 5.000000000000001e-05
          entropy: 1.700627491209242
          entropy_coeff: 0.009999999999999998
          kl: 0.008560338233513123
          policy_loss: -0.13626314542359777
          total_loss: -0.1515316622124778
          vf_explained_var: 0.07711713761091232
          vf_loss: 0.00045370558225638687
    num_agent_steps_sampled: 26000
    num_agent_steps_trained: 26000
    num_steps_sampled: 26000
    num_steps_trained: 26000
  iterations_since_restore: 26
  node_ip: 192.168.1.96
  num_healthy_wo

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ba7ae_00000,RUNNING,192.168.1.96:50048,26,648.701,26000,0,0,0,1000
