In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional encoder that turns an image batch into a flat feature vector.

    The network is five unpadded ``Conv2d`` layers with a ``ReLU`` between
    consecutive convolutions (none after the last one), followed by
    ``Flatten``. For a 3x64x64 input the spatial size shrinks to 1x1, giving
    64 features per sample; larger inputs give proportionally more features.

    The layers live in ``self.cnn`` in a fixed order, so ``state_dict`` keys
    are ``cnn.0.*``, ``cnn.2.*``, ``cnn.4.*``, ``cnn.6.*`` and ``cnn.8.*``.
    """

    def __init__(self):
        super().__init__()

        # One row per convolution: (in_channels, out_channels, kernel_size, stride).
        conv_specs = (
            (3, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
            (64, 64, 3, 1),
            (64, 64, 2, 1),
        )

        stack = []
        for position, (c_in, c_out, kernel, step) in enumerate(conv_specs):
            if position > 0:
                # Non-linearity sits between convolutions, never after the last one.
                stack.append(nn.ReLU())
            stack.append(
                nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=step, padding=0)
            )
        stack.append(nn.Flatten())

        self.cnn = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` (channels-first image batch) through the conv stack and flatten it."""
        flat_features = self.cnn(x)
        return flat_features

In [3]:
class MyModelClass(TorchModelV2, nn.Module):
    """RLlib custom Torch model: pretrained ``VisualEncoder`` plus linear policy/value heads.

    The encoder weights are loaded from a checkpoint on disk at construction
    time. ``forward`` produces one logit per discrete action and caches the
    value estimate so that ``value_function`` can return it afterwards.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoder and heads.

        Args:
            obs_space: Observation space, forwarded to ``TorchModelV2``.
            action_space: Action space; ``action_space.n`` sets the width of
                the policy head, so it must expose ``n``.
            num_outputs: Forwarded to ``TorchModelV2``.
                NOTE(review): the policy head is sized from ``action_space.n``,
                not from this value — confirm they always agree.
            model_config: Model config dict, forwarded to ``TorchModelV2``.
            name: Model name, forwarded to ``TorchModelV2``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # Width of the flattened encoder output that both heads consume.
        # NOTE(review): VisualEncoder yields 64 features only when the spatial
        # size collapses to 1x1 (e.g. 64x64 inputs) — confirm the observation size.
        features_dim = 64

        self.encoder = VisualEncoder()
        # Checkpoint is deserialised onto the CPU first; device placement happens below.
        # NOTE(review): "weigths" looks like a typo but presumably matches the
        # file name on disk, so the path is left untouched — confirm.
        self.encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AlinaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.action_head = nn.Linear(features_dim, action_space.n)
        self.value_head = nn.Linear(features_dim, 1)
        # Value estimate from the most recent forward(); None until forward() runs.
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.encoder.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Args:
            input_dict: Must contain ``'obs'``, a 4-D tensor that is permuted
                with ``(0, 3, 1, 2)``, cast to float and divided by 255.
                Presumably a batch of channels-last uint8 images — TODO confirm.
            state: Passed through unchanged.
            seq_lens: Unused.

        Returns:
            Tuple ``(logits, state)`` where ``logits`` is the policy head output.
        """
        obs = input_dict['obs'].permute(0, 3, 1, 2).float() / 255.0

        # Tensor.cuda() returns a new tensor instead of moving in place, so the
        # previous bare `obs.cuda()` call had no effect. Move the observation to
        # whatever device the encoder's parameters are on; this is also correct
        # if the model was moved to a different device after construction.
        model_device = next(self.encoder.parameters()).device
        obs = obs.to(model_device)

        features = self.encoder(obs)
        action = self.action_head(features)
        # value_head emits shape (batch, 1); drop the trailing dim for value_function().
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the latest ``forward`` call.

        Raises:
            AssertionError: If ``forward`` has not been called yet.
        """
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Register MyModelClass under the key "my_torch_model" so a trainer config can
# select it by name through the "custom_model" option.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
import os
# Restrict CUDA to device index 0, with devices enumerated in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

def env_creator(env_config):
    """Environment factory handed to ``register_env``.

    Creates the 'IGLUSilentBuilder-v0' gym environment capped at 1000 steps,
    restricts its task set to the 'C17' preset, then wraps it first in
    ``PovOnlyWrapper`` and then in ``IgluActionWrapper``.

    Args:
        env_config: Accepted for the factory signature; not read here.

    Returns:
        The fully wrapped environment.
    """
    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=1000)
    # The task set is configured on the raw environment, before any wrapper is applied.
    base_env.update_taskset(TaskSet(preset=['C17']))
    return IgluActionWrapper(PovOnlyWrapper(base_env))

from ray.tune.registry import register_env
# Expose env_creator under the string id "my_env" so configs can name the environment.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Launch PPO training through Ray Tune, with WandbLogger attached as a logger.
# NOTE(review): no `stop` criterion is passed, so presumably this cell runs
# until it is interrupted — confirm that is intended.
tune.run(PPOTrainer, 
         config={
             # String id passed to register_env earlier in this notebook.
             "env": "my_env", 
             "framework": "torch",
             # Resource settings.
             "num_gpus": 1,
             "num_workers": 1,
             # PPO hyperparameters.
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 1000,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             # Settings for the W&B logger: target project and run name.
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO C17 pretrained"
                  }
              }

        },
        loggers=[WandbLogger])



Trial name,status,loc
PPO_my_env_421d3_00000,PENDING,


2021-09-18 11:04:56,824	INFO wandb.py:170 -- Already logged into W&B.
2021-09-18 11:04:56,836	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)


[2m[36m(pid=58357)[0m 2021-09-18 11:05:00,414	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=58357)[0m 2021-09-18 11:05:00,414	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 1000
  custom_metrics: {}
  date: 2021-09-18_11-06-54
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 0.2018264439370897
          entropy_coeff: 0.009999999999999998
          kl: 0.0011697852865390403
          policy_loss: -0.19659400859640705
          total_loss: -0.19176375745899146
          vf_explained_var: 0.24490948021411896
          vf_loss: 0.00661455605748213
    num_agent_steps_sampled: 1000
    num_agent_steps_trained: 1000
    num_steps_sampled: 1000
    num_steps_trained: 1000
  iterations_since_restore: 1
  node_ip: 192.168.3.5

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,1,107.968,1000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 2000
  custom_metrics: {}
  date: 2021-09-18_11-07-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 2
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999998
          cur_lr: 5.000000000000001e-05
          entropy: 0.17195624974038864
          entropy_coeff: 0.009999999999999998
          kl: 0.0013576971563139355
          policy_loss: -0.19032121176520983
          total_loss: -0.1885722594956557
          vf_explained_var: 0.26884379982948303
          vf_loss: 0.0033327439237230768
    num_agent_steps_sampled: 2000
    num_agent_steps_trained: 2000
    num_steps_sampled: 2000
    num_steps_trained: 2000
  iterations_since_restore: 2
  node_ip: 192.168.3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,2,117.666,2000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 3000
  custom_metrics: {}
  date: 2021-09-18_11-07-13
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 3
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.04999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 0.15569361878765953
          entropy_coeff: 0.009999999999999998
          kl: 0.0016002887828744348
          policy_loss: -0.23744630076818996
          total_loss: -0.23661195950375663
          vf_explained_var: 0.2895536720752716
          vf_loss: 0.0023112639066918445
    num_agent_steps_sampled: 3000
    num_agent_steps_trained: 3000
    num_steps_sampled: 3000
    num_steps_trained: 3000
  iterations_since_restore: 3
  node_ip: 192.168.3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,3,127.393,3000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-09-18_11-07-23
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 4
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.024999999999999994
          cur_lr: 5.000000000000001e-05
          entropy: 0.12306880975763003
          entropy_coeff: 0.009999999999999998
          kl: 0.000839934298924473
          policy_loss: -0.22062717196014192
          total_loss: -0.22025686071978676
          vf_explained_var: 0.5524377822875977
          vf_loss: 0.0015800038456088967
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 4
  node_ip: 192.168.3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,4,136.854,4000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 5000
  custom_metrics: {}
  date: 2021-09-18_11-07-32
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 5
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.012499999999999997
          cur_lr: 5.000000000000001e-05
          entropy: 0.13940821637709935
          entropy_coeff: 0.009999999999999998
          kl: 0.00042468792188975847
          policy_loss: -0.17516648636923896
          total_loss: -0.17484141050113572
          vf_explained_var: 0.42267173528671265
          vf_loss: 0.001713849793628065
    num_agent_steps_sampled: 5000
    num_agent_steps_trained: 5000
    num_steps_sampled: 5000
    num_steps_trained: 5000
  iterations_since_restore: 5
  node_ip: 192.168

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,5,146.084,5000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 6000
  custom_metrics: {}
  date: 2021-09-18_11-07-41
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 6
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.006249999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 0.12240182004041142
          entropy_coeff: 0.009999999999999998
          kl: 0.0012270989028972206
          policy_loss: -0.24358405735757616
          total_loss: -0.24378442449702156
          vf_explained_var: 0.6282027959823608
          vf_loss: 0.0010159841415265367
    num_agent_steps_sampled: 6000
    num_agent_steps_trained: 6000
    num_steps_sampled: 6000
    num_steps_trained: 6000
  iterations_since_restore: 6
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,6,155.298,6000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 7000
  custom_metrics: {}
  date: 2021-09-18_11-07-51
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 7
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.0031249999999999993
          cur_lr: 5.000000000000001e-05
          entropy: 0.13986577557192909
          entropy_coeff: 0.009999999999999998
          kl: 0.0003794179507494672
          policy_loss: -0.18513635471463202
          total_loss: -0.18561745170089933
          vf_explained_var: 0.7448583841323853
          vf_loss: 0.0009163742960986888
    num_agent_steps_sampled: 7000
    num_agent_steps_trained: 7000
    num_steps_sampled: 7000
    num_steps_trained: 7000
  iterations_since_restore: 7
  node_ip: 192.168

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,7,164.488,7000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-09-18_11-08-00
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 8
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.0015624999999999997
          cur_lr: 5.000000000000001e-05
          entropy: 0.14316424594985114
          entropy_coeff: 0.009999999999999998
          kl: 0.0004500684930109047
          policy_loss: -0.25178282550639575
          total_loss: -0.25204691597157053
          vf_explained_var: 0.47237062454223633
          vf_loss: 0.0011668493491015397
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 8
  node_ip: 192.16

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,8,173.637,8000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 9000
  custom_metrics: {}
  date: 2021-09-18_11-08-09
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 9
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.0007812499999999998
          cur_lr: 5.000000000000001e-05
          entropy: 0.16709488299157885
          entropy_coeff: 0.009999999999999998
          kl: 0.0004365125089325122
          policy_loss: -0.26218835032648513
          total_loss: -0.2633674282166693
          vf_explained_var: 0.6881450414657593
          vf_loss: 0.0004915261861848801
    num_agent_steps_sampled: 9000
    num_agent_steps_trained: 9000
    num_steps_sampled: 9000
    num_steps_trained: 9000
  iterations_since_restore: 9
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,9,182.697,9000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 10000
  custom_metrics: {}
  date: 2021-09-18_11-08-18
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 10
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.0003906249999999999
          cur_lr: 5.000000000000001e-05
          entropy: 0.20750636491510605
          entropy_coeff: 0.009999999999999998
          kl: 0.0013582561992599353
          policy_loss: -0.30657355181045004
          total_loss: -0.30610203229718735
          vf_explained_var: 0.8595510721206665
          vf_loss: 0.002546053606945659
    num_agent_steps_sampled: 10000
    num_agent_steps_trained: 10000
    num_steps_sampled: 10000
    num_steps_trained: 10000
  iterations_since_restore: 10
  node_ip: 1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,10,191.855,10000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 11000
  custom_metrics: {}
  date: 2021-09-18_11-08-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 11
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00019531249999999996
          cur_lr: 5.000000000000001e-05
          entropy: 0.2162388735347324
          entropy_coeff: 0.009999999999999998
          kl: 0.0007738437878677118
          policy_loss: -0.29779169923729365
          total_loss: -0.29901888229780726
          vf_explained_var: 0.8082315325737
          vf_loss: 0.0009350520164136671
    num_agent_steps_sampled: 11000
    num_agent_steps_trained: 11000
    num_steps_sampled: 11000
    num_steps_trained: 11000
  iterations_since_restore: 11
  node_ip: 192

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,11,200.959,11000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-09-18_11-08-36
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 12
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 9.765624999999998e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.3019250732329157
          entropy_coeff: 0.009999999999999998
          kl: 0.017863470547736505
          policy_loss: -0.29298865778578653
          total_loss: -0.29388851159148743
          vf_explained_var: 0.8795185685157776
          vf_loss: 0.002117653380668748
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 12
  node_ip: 192

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,12,210.114,12000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 13000
  custom_metrics: {}
  date: 2021-09-18_11-08-45
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 13
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 9.765624999999998e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.41232521103488073
          entropy_coeff: 0.009999999999999998
          kl: 0.0028415517179936956
          policy_loss: -0.29263091534376146
          total_loss: -0.29511622670623994
          vf_explained_var: 0.36268362402915955
          vf_loss: 0.0016376667436917261
    num_agent_steps_sampled: 13000
    num_agent_steps_trained: 13000
    num_steps_sampled: 13000
    num_steps_trained: 13000
  iterations_since_restore: 13
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,13,219.264,13000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 14000
  custom_metrics: {}
  date: 2021-09-18_11-08-55
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 14
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 4.882812499999999e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.3610242345266872
          entropy_coeff: 0.009999999999999998
          kl: 0.0030075919693449767
          policy_loss: -0.3418657117419773
          total_loss: -0.34502124674618245
          vf_explained_var: 0.8625590801239014
          vf_loss: 0.0004545595512253284
    num_agent_steps_sampled: 14000
    num_agent_steps_trained: 14000
    num_steps_sampled: 14000
    num_steps_trained: 14000
  iterations_since_restore: 14
  node_ip: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,14,228.38,14000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 15000
  custom_metrics: {}
  date: 2021-09-18_11-09-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 15
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.4414062499999995e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.6607209573189418
          entropy_coeff: 0.009999999999999998
          kl: 0.06460219104603085
          policy_loss: -0.01888044277826945
          total_loss: -0.02389566467867957
          vf_explained_var: 0.663105845451355
          vf_loss: 0.001590412508812733
    num_agent_steps_sampled: 15000
    num_agent_steps_trained: 15000
    num_steps_sampled: 15000
    num_steps_trained: 15000
  iterations_since_restore: 15
  node_ip: 192.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,15,237.583,15000,0,0,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-09-18_11-09-14
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: 0.1875
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 16
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 3.662109375e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.8807664725515577
          entropy_coeff: 0.009999999999999998
          kl: 0.023539998221427678
          policy_loss: 0.15372695823510488
          total_loss: 0.18073118097252316
          vf_explained_var: 0.7232553362846375
          vf_loss: 0.03581102263156532
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_since_restore: 16
  node_ip: 192.168.3

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,16,247.398,16000,0.1875,3,0,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 17000
  custom_metrics: {}
  date: 2021-09-18_11-09-24
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.11764705882352941
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 17
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 5.493164062500002e-05
          cur_lr: 5.000000000000001e-05
          entropy: 0.7813320272498661
          entropy_coeff: 0.009999999999999998
          kl: 0.02645949265188765
          policy_loss: 0.06311811101105479
          total_loss: 0.1279387176864677
          vf_explained_var: 0.5167554020881653
          vf_loss: 0.072632474033162
    num_agent_steps_sampled: 17000
    num_agent_steps_trained: 17000
    num_steps_sampled: 17000
    num_steps_trained: 17000
  iterations_since_restore: 17
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,17,257.214,17000,-0.117647,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 18000
  custom_metrics: {}
  date: 2021-09-18_11-09-34
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.1111111111111111
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 18
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 8.239746093749998e-05
          cur_lr: 5.000000000000001e-05
          entropy: 1.4423211455345153
          entropy_coeff: 0.009999999999999998
          kl: 0.03675220778792191
          policy_loss: 0.16689947607616584
          total_loss: 0.17278852835297584
          vf_explained_var: 0.679054319858551
          vf_loss: 0.02030923782537381
    num_agent_steps_sampled: 18000
    num_agent_steps_trained: 18000
    num_steps_sampled: 18000
    num_steps_trained: 18000
  iterations_since_restore: 18
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,18,268.012,18000,-0.111111,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 19000
  custom_metrics: {}
  date: 2021-09-18_11-09-44
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.10526315789473684
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 19
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.0058646029896208
          entropy_coeff: 0.009999999999999998
          kl: 0.01809309908322166
          policy_loss: 0.007175225350591872
          total_loss: -0.0009995124406284757
          vf_explained_var: 0.7102770805358887
          vf_loss: 0.011881674614010585
    num_agent_steps_sampled: 19000
    num_agent_steps_trained: 19000
    num_steps_sampled: 19000
    num_steps_trained: 19000
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,19,277.614,19000,-0.105263,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2021-09-18_11-09-53
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.1
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 20
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.9537920488251581
          entropy_coeff: 0.009999999999999998
          kl: 0.008438425565529744
          policy_loss: -0.1206215523597267
          total_loss: -0.13344770107004378
          vf_explained_var: 0.12720167636871338
          vf_loss: 0.0067107313894666735
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 20
  node_ip: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,20,286.29,20000,-0.1,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 21000
  custom_metrics: {}
  date: 2021-09-18_11-10-01
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.09523809523809523
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 21
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.7380413797166612
          entropy_coeff: 0.009999999999999998
          kl: 0.01368374277540755
          policy_loss: 0.0805840445889367
          total_loss: 0.06372020807531145
          vf_explained_var: 0.12812069058418274
          vf_loss: 0.0005148870280309994
    num_agent_steps_sampled: 21000
    num_agent_steps_trained: 21000
    num_steps_sampled: 21000
    num_steps_trained: 21000
  iterations_since_restore: 21


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,21,294.162,21000,-0.0952381,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 22000
  custom_metrics: {}
  date: 2021-09-18_11-10-10
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.09090909090909091
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 22
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 0.8593016352918413
          entropy_coeff: 0.009999999999999998
          kl: 0.006778134844624143
          policy_loss: -0.10300603128141828
          total_loss: -0.10640543558531337
          vf_explained_var: 0.1125631332397461
          vf_loss: 0.00519277036914395
    num_agent_steps_sampled: 22000
    num_agent_steps_trained: 22000
    num_steps_sampled: 22000
    num_steps_trained: 22000
  iterations_since_restore: 22

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,22,303.837,22000,-0.0909091,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 23000
  custom_metrics: {}
  date: 2021-09-18_11-10-19
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.08695652173913043
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 23
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 0.8704562856091393
          entropy_coeff: 0.009999999999999998
          kl: 0.006414486770255302
          policy_loss: -0.1381760804189576
          total_loss: -0.14335700041717953
          vf_explained_var: 0.21256276965141296
          vf_loss: 0.0035228481833150405
    num_agent_steps_sampled: 23000
    num_agent_steps_trained: 23000
    num_steps_sampled: 23000
    num_steps_trained: 23000
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,23,312.473,23000,-0.0869565,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2021-09-18_11-10-29
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.08333333333333333
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 24
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.6874538130230374
          entropy_coeff: 0.009999999999999998
          kl: 0.012950988931919996
          policy_loss: 0.0292768367462688
          total_loss: 0.020437346026301383
          vf_explained_var: 0.024658210575580597
          vf_loss: 0.008033448380107682
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_steps_sampled: 24000
    num_steps_trained: 24000
  iterations_since_restore: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,24,322.599,24000,-0.0833333,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 25000
  custom_metrics: {}
  date: 2021-09-18_11-10-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.08
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 25
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.7816539406776428
          entropy_coeff: 0.009999999999999998
          kl: 0.011588512964183116
          policy_loss: -0.0294151504834493
          total_loss: -0.041229958997832404
          vf_explained_var: -0.11195705085992813
          vf_loss: 0.006000296042152008
    num_agent_steps_sampled: 25000
    num_agent_steps_trained: 25000
    num_steps_sampled: 25000
    num_steps_trained: 25000
  iterations_since_restore: 25
  node_ip: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,25,332.92,25000,-0.08,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 26000
  custom_metrics: {}
  date: 2021-09-18_11-10-50
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.07692307692307693
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 26
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.770128779941135
          entropy_coeff: 0.009999999999999998
          kl: 0.01599521642388134
          policy_loss: -0.039872885371247925
          total_loss: -0.052074534859922195
          vf_explained_var: -0.4746800661087036
          vf_loss: 0.005497662994700173
    num_agent_steps_sampled: 26000
    num_agent_steps_trained: 26000
    num_steps_sampled: 26000
    num_steps_trained: 26000
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,26,343.265,26000,-0.0769231,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 27000
  custom_metrics: {}
  date: 2021-09-18_11-11-00
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.07407407407407407
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 27
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.9447512798839146
          entropy_coeff: 0.009999999999999998
          kl: 0.016517577385555822
          policy_loss: -0.09943221130718788
          total_loss: -0.11274852990690205
          vf_explained_var: 0.2861216366291046
          vf_loss: 0.00612915197852999
    num_agent_steps_sampled: 27000
    num_agent_steps_trained: 27000
    num_steps_sampled: 27000
    num_steps_trained: 27000
  iterations_since_restore: 27

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,27,353.127,27000,-0.0740741,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2021-09-18_11-11-10
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.07142857142857142
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 28
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.7941524108250937
          entropy_coeff: 0.009999999999999998
          kl: 0.01644895938988322
          policy_loss: -0.06326146556271448
          total_loss: -0.07805497679445479
          vf_explained_var: 0.6235350966453552
          vf_loss: 0.0031459786023737657
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_steps_sampled: 28000
    num_steps_trained: 28000
  iterations_since_restore: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,28,363.638,28000,-0.0714286,3,-5,1000


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 29000
  custom_metrics: {}
  date: 2021-09-18_11-11-20
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.06896551724137931
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 29
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.109642067220476
          entropy_coeff: 0.009999999999999998
          kl: 0.012402049227611714
          policy_loss: -0.0008963064601023991
          total_loss: -0.0175599814289146
          vf_explained_var: -0.25409334897994995
          vf_loss: 0.004431212176051405
    num_agent_steps_sampled: 29000
    num_agent_steps_trained: 29000
    num_steps_sampled: 29000
    num_steps_trained: 29000
  iterations_since_restore:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,29,373.38,29000,-0.0689655,3,-5,1000




Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 30000
  custom_metrics: {}
  date: 2021-09-18_11-11-46
  done: false
  episode_len_mean: 996.0666666666667
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.06666666666666667
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 30
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.095770502090454
          entropy_coeff: 0.009999999999999998
          kl: 0.010864856369254354
          policy_loss: 0.02779309472276105
          total_loss: 0.009935800400045183
          vf_explained_var: 0.2577665150165558
          vf_loss: 0.0030990660798528957
    num_agent_steps_sampled: 30000
    num_agent_steps_trained: 30000
    num_steps_sampled: 30000
    num_steps_trained: 30000
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,30,399.911,30000,-0.0666667,3,-5,996.067


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 31000
  custom_metrics: {}
  date: 2021-09-18_11-11-57
  done: false
  episode_len_mean: 996.1935483870968
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.06451612903225806
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 31
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.0456457773844403
          entropy_coeff: 0.009999999999999998
          kl: 0.009132240737117206
          policy_loss: 0.03887356294112073
          total_loss: 0.021112927173574766
          vf_explained_var: -0.3266350328922272
          vf_loss: 0.00269469449413009
    num_agent_steps_sampled: 31000
    num_agent_steps_trained: 31000
    num_steps_sampled: 31000
    num_steps_trained: 31000
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,31,410.523,31000,-0.0645161,3,-5,996.194


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2021-09-18_11-12-07
  done: false
  episode_len_mean: 996.3125
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.0625
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 32
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.172387300597297
          entropy_coeff: 0.009999999999999998
          kl: 0.012328516236619224
          policy_loss: -0.08037130898899503
          total_loss: -0.10112953268819386
          vf_explained_var: -0.4840739071369171
          vf_loss: 0.0009641265842623802
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_steps_sampled: 32000
    num_steps_trained: 32000
  iterations_since_restore: 32
  node_i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,32,420.813,32000,-0.0625,3,-5,996.312


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 33000
  custom_metrics: {}
  date: 2021-09-18_11-12-18
  done: false
  episode_len_mean: 996.4242424242424
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.06060606060606061
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 33
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.1235780742433334
          entropy_coeff: 0.009999999999999998
          kl: 0.011599546108501179
          policy_loss: -0.03696580570605066
          total_loss: -0.05730803575780657
          vf_explained_var: -0.3118656575679779
          vf_loss: 0.0008921169142316406
    num_agent_steps_sampled: 33000
    num_agent_steps_trained: 33000
    num_steps_sampled: 33000
    num_steps_trained: 33000
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,33,431.354,33000,-0.0606061,3,-5,996.424


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 34000
  custom_metrics: {}
  date: 2021-09-18_11-12-29
  done: false
  episode_len_mean: 996.5294117647059
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.058823529411764705
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 34
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 1.9142776052157084
          entropy_coeff: 0.009999999999999998
          kl: 0.009818458341828871
          policy_loss: 0.07874921965930197
          total_loss: 0.06015040386054251
          vf_explained_var: -0.7098816633224487
          vf_loss: 0.0005427467583407027
    num_agent_steps_sampled: 34000
    num_agent_steps_trained: 34000
    num_steps_sampled: 34000
    num_steps_trained: 34000
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,34,442.208,34000,-0.0588235,3,-5,996.529


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 35000
  custom_metrics: {}
  date: 2021-09-18_11-12-39
  done: false
  episode_len_mean: 996.6285714285714
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.05714285714285714
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 35
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.1527813381618923
          entropy_coeff: 0.009999999999999998
          kl: 0.01133659979486487
          policy_loss: 0.021502523703707588
          total_loss: 0.0025051930712329017
          vf_explained_var: -0.4942820966243744
          vf_loss: 0.0025290776348103664
    num_agent_steps_sampled: 35000
    num_agent_steps_trained: 35000
    num_steps_sampled: 35000
    num_steps_trained: 35000
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,35,452.456,35000,-0.0571429,3,-5,996.629


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2021-09-18_11-12-49
  done: false
  episode_len_mean: 996.7222222222222
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.05555555555555555
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 36
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.0370110630989076
          entropy_coeff: 0.009999999999999998
          kl: 0.014701381772580992
          policy_loss: 0.044881261533333196
          total_loss: 0.02819081179590689
          vf_explained_var: -0.2142939269542694
          vf_loss: 0.0036778447864991093
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_steps_sampled: 36000
    num_steps_trained: 36000
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,36,462.367,36000,-0.0555556,3,-5,996.722


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 37000
  custom_metrics: {}
  date: 2021-09-18_11-12-59
  done: false
  episode_len_mean: 996.8108108108108
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.05405405405405406
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 37
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.1688566896650525
          entropy_coeff: 0.009999999999999998
          kl: 0.0102637997664965
          policy_loss: -0.07569030183884833
          total_loss: -0.09046404154764282
          vf_explained_var: 0.23527081310749054
          vf_loss: 0.006913555351396402
    num_agent_steps_sampled: 37000
    num_agent_steps_trained: 37000
    num_steps_sampled: 37000
    num_steps_trained: 37000
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,37,472.354,37000,-0.0540541,3,-5,996.811


Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 38000
  custom_metrics: {}
  date: 2021-09-18_11-13-24
  done: false
  episode_len_mean: 996.8947368421053
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.05263157894736842
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 38
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.2625883473290336
          entropy_coeff: 0.009999999999999998
          kl: 0.01029669067981271
          policy_loss: -0.11208353485498164
          total_loss: -0.1282469150920709
          vf_explained_var: 0.20722801983356476
          vf_loss: 0.0064612270746794015
    num_agent_steps_sampled: 38000
    num_agent_steps_trained: 38000
    num_steps_sampled: 38000
    num_steps_trained: 38000
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,38,496.868,38000,-0.0526316,3,-5,996.895




Result for PPO_my_env_421d3_00000:
  agent_timesteps_total: 39000
  custom_metrics: {}
  date: 2021-09-18_11-13-37
  done: false
  episode_len_mean: 996.974358974359
  episode_media: {}
  episode_reward_max: 3.0
  episode_reward_mean: -0.05128205128205128
  episode_reward_min: -5.0
  episodes_this_iter: 1
  episodes_total: 39
  experiment_id: e1746907352f4fe585fcb0799b3f64f3
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.00012359619140625
          cur_lr: 5.000000000000001e-05
          entropy: 2.2796993997361925
          entropy_coeff: 0.009999999999999998
          kl: 0.01315724665048656
          policy_loss: 0.05450283686319987
          total_loss: 0.03510063687960307
          vf_explained_var: 0.12637978792190552
          vf_loss: 0.0033931700862012805
    num_agent_steps_sampled: 39000
    num_agent_steps_trained: 39000
    num_steps_sampled: 39000
    num_steps_trained: 39000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_421d3_00000,RUNNING,192.168.3.5:58357,39,509.403,39000,-0.0512821,3,-5,996.974
