In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional encoder that flattens an image batch into feature vectors.

    The stack is six 2x2 convolutions with stride 2 and no padding, each
    followed by an ELU, and a final flatten. Channel counts go
    3 -> 32 -> 32 -> 64 -> 128 -> 256 -> 512.
    """

    # Channel count entering the first convolution, then leaving each of the six.
    _CHANNELS = (3, 32, 32, 64, 128, 256, 512)

    def __init__(self):
        super().__init__()

        stages = []
        for n_in, n_out in zip(self._CHANNELS, self._CHANNELS[1:]):
            stages.append(nn.Conv2d(n_in, n_out, kernel_size=2, stride=2, padding=0))
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # Module order is conv, ELU, conv, ELU, ..., flatten, so the parameter
        # names in the state dict are cnn.0.*, cnn.2.*, ..., cnn.10.*.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the convolutional stack and return the flattened output."""
        return self.cnn(x)

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model combining a pretrained image encoder with a target-grid encoder.

    The `pov` image goes through `VisualEncoder`, whose weights are loaded from
    disk and which is evaluated under `torch.no_grad()`, so forward() sends no
    gradients into it. The `target_grid` is one-hot encoded into 7 channels and
    reduced to a single channel by a 1x1x1 3-D convolution. Both feature vectors
    are concatenated and passed through an 8-layer MLP that feeds a separate
    action head and value head.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoders, the shared MLP and both heads.

        All arguments are forwarded unchanged to `TorchModelV2.__init__`; the only
        one read directly here is `action_space.n`, which sizes the action head.

        Raises:
            Whatever `torch.load` / `load_state_dict` raise when the encoder
            checkpoint is missing or does not match `VisualEncoder`.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        # Flattened size of the one-channel target encoder output;
        # assumes the target grid is 9 x 11 x 11 -- TODO confirm against the env.
        target_features_dim = 9 * 11 * 11
        self.visual_encoder = VisualEncoder()
        # Load on CPU first; the module is moved to the GPU below when one is available.
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        # Kernel size 1 mixes the 7 one-hot channels per voxel without touching neighbours.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 2048),
            nn.ELU(),
            nn.Linear(2048, 2048),
            nn.ELU(),
            nn.Linear(2048, 1024),
            nn.ELU(),
            nn.Linear(1024, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, 512),
            nn.ELU(),
            nn.Linear(512, 256),
            nn.ELU(),
            nn.Linear(256, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate from the most recent forward(); read back by value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute the action-head output and cache the value estimate.

        Args:
            input_dict: mapping whose 'obs' entry holds 'pov' (channels-last image
                batch) and 'target_grid' (integer-valued class ids in 0..6).
            state: returned unchanged.
            seq_lens: not used.

        Returns:
            Tuple of (action-head output with `action_space.n` columns, `state`).
        """
        obs = input_dict['obs']
        # Channels-last -> channels-first; assumes pixel values in [0, 255] (scaled to [0, 1]).
        pov = obs['pov'].permute(0, 3, 1, 2).float() / 255.0
        # one_hot appends the class axis last; move it to the channel position for Conv3d.
        target = one_hot(obs['target_grid'].long(), num_classes=7).permute(0, 4, 1, 2, 3).float()

        # Tensor.cuda()/.to() return a new tensor instead of moving in place, so the
        # result has to be rebound. Follow the device the parameters are actually on
        # (rather than `use_cuda`) so this stays correct if the model is moved later.
        device = next(self.parameters()).device
        pov = pov.to(device)
        target = target.to(device)

        # The pretrained encoder is used as a fixed feature extractor here.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        # (batch, 1) -> (batch,)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the latest forward() call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Stand-alone copy of the policy MLP, built only to count its parameters.
visual_features_dim = 512
target_features_dim = 9 * 11 * 11
policy_hidden_dim = 256

# Input width followed by the output width of each of the eight linear layers.
_mlp_widths = [
    visual_features_dim + target_features_dim,
    2048, 2048, 1024, 1024, 512, 512, 256, policy_hidden_dim,
]

# Every linear layer is followed by an ELU, including the last one.
policy_network = nn.Sequential(*(
    module
    for fan_in, fan_out in zip(_mlp_widths, _mlp_widths[1:])
    for module in (nn.Linear(fan_in, fan_out), nn.ELU())
))

sum(param.numel() for param in policy_network.parameters())

11609600

In [5]:
# Register MyModelClass with RLlib under the name that the trainer config
# refers to through its "custom_model" entry.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing `pov`, `inventory`, `compass` and, optionally, `target_grid`.

    `include_target` controls both the declared observation space and the
    returned observation, so the two always contain the same keys.
    """

    def __init__(self, env, include_target=False):
        """Wrap `env` and declare the observation space.

        Args:
            env: environment to wrap.
            include_target: when true, add a `target_grid` entry of shape
                (9, 11, 11) to the space and to every observation.
        """
        super().__init__(env)
        # Remembered so observation() emits exactly the keys declared below.
        self._include_target = include_target
        spaces = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            spaces['target_grid'] = gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(spaces)

    def _resolve_target_grid(self, info):
        """Return the target grid from `info`, falling back to the current task."""
        if info is not None:
            if 'target_grid' in info:
                # Take the grid and remove it from `info` in one step.
                return info.pop('target_grid')
            # `info` was supplied but carries no grid: log it and, when the
            # underlying env supports it, ask for a reset.
            logger.error(f'info: {info}')
            if hasattr(self.unwrapped, 'should_reset'):
                self.unwrapped.should_reset(True)
        return self.env.unwrapped.tasks.current.target_grid

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict described by `observation_space`.

        Args:
            obs: raw observation providing 'pov', 'inventory' and obs['compass']['angle'].
            reward: not used.
            done: not used.
            info: optional step info; a 'target_grid' entry, if present, is
                consumed (removed from `info`).

        Returns:
            Dict with 'pov' (cast to float32), 'inventory', 'compass' (one-element
            array) and, only when `include_target` was set, 'target_grid'.
        """
        # Resolved unconditionally so the effects on `info` and the env do not
        # depend on `include_target`.
        target_grid = self._resolve_target_grid(info)
        observation = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        if self._include_target:
            observation['target_grid'] = target_grid
        return observation

In [7]:
import os
# Order CUDA devices by PCI bus id (see issue #152) and expose only device "0"
# to this process.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="0",
)

class RewardWrapper(gym.RewardWrapper):
    """Reward wrapper that replaces a zero reward with a small negative one."""

    def __init__(self, env):
        """Wrap `env`; no extra state is kept."""
        super().__init__(env)

    def reward(self, rew):
        """Return -0.01 when `rew` equals 0, otherwise `rew` unchanged."""
        return -0.01 if rew == 0 else rew
    
def env_creator(env_config):
    """Build the fully wrapped IGLU environment.

    `env_config` is not read; every setting is hard-coded below. The wrappers
    are applied in this order: VisualObservationWrapper (with the target grid),
    SelectAndPlace, Discretization over the 'human-level' flat action space,
    then RewardWrapper.
    """
    task_ids = ['C3', 'C17', 'C20', 'C22', 'C32', 'C40', 'C85', 'C87', 'C93']

    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=1000)
    base_env.update_taskset(TaskSet(preset=task_ids))

    with_observations = VisualObservationWrapper(base_env, include_target=True)
    with_placement = SelectAndPlace(with_observations)
    discretized = Discretization(with_placement, flat_action_space('human-level'))
    return RewardWrapper(discretized)

from ray.tune.registry import register_env
# Expose env_creator under the name "my_env", which the trainer config uses
# as its "env" entry.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Model section of the trainer config.
model_settings = {
    # Specify our custom model from above.
    "custom_model": "my_torch_model",
    # Extra kwargs to be passed to your model's c'tor.
    "custom_model_config": {},
}

# Weights & Biases project and run name picked up by WandbLogger.
wandb_settings = {
    "project": "IGLU-Minecraft",
    "name": "PPO MultiTask <=10 pretrained (AngelaCNN + MLP 8) (3 noops after placement) r: -0.01",
}

# PPO settings; "gamma" is deliberately not overridden here
# (a `"gamma": 0.99` entry was left commented out).
ppo_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 3,
    "sgd_minibatch_size": 256,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 5000,
    "lr": 1e-4,
    "model": model_settings,
    "logger_config": {"wandb": wandb_settings},
}

analysis = tune.run(PPOTrainer, config=ppo_config, loggers=[WandbLogger])

2021-10-29 21:04:12,888	INFO wandb.py:170 -- Already logged into W&B.
2021-10-29 21:04:12,902	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_c4887_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=625567)[0m 2021-10-29 21:04:16,352	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=625567)[0m 2021-10-29 21:04:16,352	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-10-29_21-08-02
  done: false
  episode_len_mean: 404.0
  episode_media: {}
  episode_reward_max: -3.789999999999963
  episode_reward_mean: -9.165833333333303
  episode_reward_min: -19.480000000000178
  episodes_this_iter: 24
  episodes_total: 24
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8832698318693373
          entropy_coeff: 0.01
          kl: 0.005976090252176761
          policy_loss: -0.008823641306824155
          total_loss: 0.36435518509939185
          vf_explained_var: -0.370869904756546
          vf_loss: 0.40081630689854403
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,1,220.175,9996,-9.16583,-3.79,-19.48,404


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-10-29_21-10-04
  done: false
  episode_len_mean: 404.3333333333333
  episode_media: {}
  episode_reward_max: -3.789999999999963
  episode_reward_mean: -7.538541666666632
  episode_reward_min: -19.480000000000178
  episodes_this_iter: 24
  episodes_total: 48
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8783106999519545
          entropy_coeff: 0.01
          kl: 0.005715139597830428
          policy_loss: -0.006147194440420876
          total_loss: 0.16474733425375934
          vf_explained_var: 0.02605714462697506
          vf_loss: 0.19853460734643433
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,2,341.956,19992,-7.53854,-3.79,-19.48,404.333


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-10-29_21-12-04
  done: false
  episode_len_mean: 402.18055555555554
  episode_media: {}
  episode_reward_max: -3.619999999999967
  episode_reward_mean: -6.589583333333296
  episode_reward_min: -19.480000000000178
  episodes_this_iter: 24
  episodes_total: 72
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.87183359321366
          entropy_coeff: 0.01
          kl: 0.005993062470161649
          policy_loss: -0.00039545998701618777
          total_loss: 0.08547777000050515
          vf_explained_var: -0.030956359580159187
          vf_loss: 0.11339295119732846
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,3,462.297,29988,-6.58958,-3.62,-19.48,402.181




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-10-29_21-14-28
  done: false
  episode_len_mean: 400.7878787878788
  episode_media: {}
  episode_reward_max: -2.4699999999999718
  episode_reward_mean: -6.128989898989861
  episode_reward_min: -19.480000000000178
  episodes_this_iter: 27
  episodes_total: 99
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8719291316138373
          entropy_coeff: 0.01
          kl: 0.005865483286475509
          policy_loss: -0.005399203996579998
          total_loss: 0.06425475380706808
          vf_explained_var: -0.12049980461597443
          vf_loss: 0.09720015220489617
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,4,605.858,39984,-6.12899,-2.47,-19.48,400.788


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-10-29_21-16-37
  done: false
  episode_len_mean: 398.94
  episode_media: {}
  episode_reward_max: -0.7700000000000019
  episode_reward_mean: -4.96849999999996
  episode_reward_min: -14.849999999999962
  episodes_this_iter: 24
  episodes_total: 123
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.859299398895003
          entropy_coeff: 0.01
          kl: 0.006903323522985729
          policy_loss: -0.007857704982131671
          total_loss: 0.0786986000640875
          vf_explained_var: -0.19872279465198517
          vf_loss: 0.11376863251679245
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,5,734.599,49980,-4.9685,-0.77,-14.85,398.94


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-10-29_21-18-47
  done: false
  episode_len_mean: 396.35
  episode_media: {}
  episode_reward_max: -0.7700000000000019
  episode_reward_mean: -4.635599999999959
  episode_reward_min: -14.849999999999962
  episodes_this_iter: 26
  episodes_total: 149
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8567220208991286
          entropy_coeff: 0.01
          kl: 0.007402953224623023
          policy_loss: -0.008427571315859627
          total_loss: 0.014109141534815232
          vf_explained_var: -0.12967818975448608
          vf_loss: 0.049623341272736536
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,6,864.707,59976,-4.6356,-0.77,-14.85,396.35


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-10-29_21-20-50
  done: false
  episode_len_mean: 394.93
  episode_media: {}
  episode_reward_max: -0.7700000000000019
  episode_reward_mean: -4.46909999999996
  episode_reward_min: -14.849999999999962
  episodes_this_iter: 25
  episodes_total: 174
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.843265289119166
          entropy_coeff: 0.01
          kl: 0.00916006100470442
          policy_loss: -0.012616257203949823
          total_loss: -0.018872724291987907
          vf_explained_var: 0.24371007084846497
          vf_loss: 0.020344172762724287
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,7,987.982,69972,-4.4691,-0.77,-14.85,394.93




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-10-29_21-23-09
  done: false
  episode_len_mean: 393.19
  episode_media: {}
  episode_reward_max: -3.1999999999999758
  episode_reward_mean: -4.2322999999999595
  episode_reward_min: -13.509999999999955
  episodes_this_iter: 26
  episodes_total: 200
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8299141918492112
          entropy_coeff: 0.01
          kl: 0.009429405921508866
          policy_loss: -0.011609189207546222
          total_loss: -0.028535528980896004
          vf_explained_var: 0.31521379947662354
          vf_loss: 0.009486920256655202
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,8,1127.21,79968,-4.2323,-3.2,-13.51,393.19


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-10-29_21-25-09
  done: false
  episode_len_mean: 391.53
  episode_media: {}
  episode_reward_max: -3.1999999999999758
  episode_reward_mean: -4.057299999999961
  episode_reward_min: -11.36999999999996
  episodes_this_iter: 26
  episodes_total: 226
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.823943393658369
          entropy_coeff: 0.01
          kl: 0.00990417727956674
          policy_loss: -0.014482465195350158
          total_loss: -0.033047017614301455
          vf_explained_var: 0.48553383350372314
          vf_loss: 0.007694045563465239
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,9,1246.8,89964,-4.0573,-3.2,-11.37,391.53


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-10-29_21-27-09
  done: false
  episode_len_mean: 386.67
  episode_media: {}
  episode_reward_max: -3.1999999999999758
  episode_reward_mean: -3.885499999999961
  episode_reward_min: -5.819999999999958
  episodes_this_iter: 27
  episodes_total: 253
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8010744956823497
          entropy_coeff: 0.01
          kl: 0.011831489042012096
          policy_loss: -0.016278078254216757
          total_loss: -0.03441328588459227
          vf_explained_var: 0.6044422388076782
          vf_loss: 0.007509238083407076
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,10,1366.26,99960,-3.8855,-3.2,-5.82,386.67




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-10-29_21-29-26
  done: false
  episode_len_mean: 379.98
  episode_media: {}
  episode_reward_max: -3.189999999999976
  episode_reward_mean: -3.7997999999999625
  episode_reward_min: -4.209999999999955
  episodes_this_iter: 27
  episodes_total: 280
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7708889447725737
          entropy_coeff: 0.01
          kl: 0.014445696190251737
          policy_loss: -0.014678755771910024
          total_loss: -0.0317896332559932
          vf_explained_var: 0.5103092193603516
          vf_loss: 0.007708872475812578
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,11,1504.12,109956,-3.7998,-3.19,-4.21,379.98


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-10-29_21-31-27
  done: false
  episode_len_mean: 374.98
  episode_media: {}
  episode_reward_max: -3.189999999999976
  episode_reward_mean: -3.7497999999999636
  episode_reward_min: -4.089999999999957
  episodes_this_iter: 26
  episodes_total: 306
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.748139734757252
          entropy_coeff: 0.01
          kl: 0.013918578106001488
          policy_loss: -0.013981476343340344
          total_loss: -0.031169562809105614
          vf_explained_var: 0.5569139719009399
          vf_loss: 0.007509594696061985
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,12,1624.89,119952,-3.7498,-3.19,-4.09,374.98


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-10-29_21-33-26
  done: false
  episode_len_mean: 369.94
  episode_media: {}
  episode_reward_max: -3.189999999999976
  episode_reward_mean: -3.6993999999999647
  episode_reward_min: -4.089999999999957
  episodes_this_iter: 27
  episodes_total: 333
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7288184948456595
          entropy_coeff: 0.01
          kl: 0.016153578655671876
          policy_loss: -0.015599416610267427
          total_loss: -0.031735727802301064
          vf_explained_var: 0.47395259141921997
          vf_loss: 0.007921157692584934
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,13,1743.25,129948,-3.6994,-3.19,-4.09,369.94




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-10-29_21-35-42
  done: false
  episode_len_mean: 366.09
  episode_media: {}
  episode_reward_max: -3.139999999999977
  episode_reward_mean: -3.660899999999966
  episode_reward_min: -4.089999999999957
  episodes_this_iter: 29
  episodes_total: 362
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.697187589580177
          entropy_coeff: 0.01
          kl: 0.016535257152983613
          policy_loss: -0.018392473618444215
          total_loss: -0.033025577078517686
          vf_explained_var: 0.4784652292728424
          vf_loss: 0.009031720794229888
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,14,1879.24,139944,-3.6609,-3.14,-4.09,366.09


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-10-29_21-37-42
  done: false
  episode_len_mean: 363.71
  episode_media: {}
  episode_reward_max: -3.139999999999977
  episode_reward_mean: -3.6370999999999656
  episode_reward_min: -4.059999999999958
  episodes_this_iter: 27
  episodes_total: 389
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.6840950142624034
          entropy_coeff: 0.01
          kl: 0.014594825186529912
          policy_loss: -0.018851986059393637
          total_loss: -0.03432929736808834
          vf_explained_var: 0.5043028593063354
          vf_loss: 0.00844467373089228
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,15,1999.42,149940,-3.6371,-3.14,-4.06,363.71


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-10-29_21-39-40
  done: false
  episode_len_mean: 359.59
  episode_media: {}
  episode_reward_max: -3.139999999999977
  episode_reward_mean: -3.5958999999999666
  episode_reward_min: -3.989999999999959
  episodes_this_iter: 28
  episodes_total: 417
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.673925769634736
          entropy_coeff: 0.01
          kl: 0.018525136458072508
          policy_loss: -0.020327709812639105
          total_loss: -0.03250559601518843
          vf_explained_var: 0.36277905106544495
          vf_loss: 0.010856343619326431
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,16,2117.49,159936,-3.5959,-3.14,-3.99,359.59


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-10-29_21-41-39
  done: false
  episode_len_mean: 356.52
  episode_media: {}
  episode_reward_max: -3.139999999999977
  episode_reward_mean: -3.5651999999999675
  episode_reward_min: -3.989999999999959
  episodes_this_iter: 29
  episodes_total: 446
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.650360490521814
          entropy_coeff: 0.01
          kl: 0.0159370044839628
          policy_loss: -0.020528173917888574
          total_loss: -0.03395871111215689
          vf_explained_var: 0.4250655770301819
          vf_loss: 0.009885665684521525
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,17,2236.57,169932,-3.5652,-3.14,-3.99,356.52




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-10-29_21-43-57
  done: false
  episode_len_mean: 351.03
  episode_media: {}
  episode_reward_max: -2.869999999999983
  episode_reward_mean: -3.5102999999999684
  episode_reward_min: -3.849999999999962
  episodes_this_iter: 30
  episodes_total: 476
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.647235590779883
          entropy_coeff: 0.01
          kl: 0.0336255600042519
          policy_loss: 0.0020575098088409147
          total_loss: -0.005411271585358514
          vf_explained_var: 0.3502059578895569
          vf_loss: 0.012278460971309612
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,18,2374.59,179928,-3.5103,-2.87,-3.85,351.03


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-10-29_21-45-58
  done: false
  episode_len_mean: 346.52
  episode_media: {}
  episode_reward_max: -2.869999999999983
  episode_reward_mean: -3.4651999999999696
  episode_reward_min: -3.849999999999962
  episodes_this_iter: 28
  episodes_total: 504
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6203505950096324
          entropy_coeff: 0.01
          kl: 0.015328205244416018
          policy_loss: -0.019133502178085154
          total_loss: -0.031046132240285223
          vf_explained_var: 0.42118608951568604
          vf_loss: 0.009692413643499763
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,19,2495.37,189924,-3.4652,-2.87,-3.85,346.52


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-10-29_21-47-58
  done: false
  episode_len_mean: 343.45
  episode_media: {}
  episode_reward_max: -2.869999999999983
  episode_reward_mean: -3.43449999999997
  episode_reward_min: -3.9999999999999587
  episodes_this_iter: 29
  episodes_total: 533
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6142915246833085
          entropy_coeff: 0.01
          kl: 0.016774899511858316
          policy_loss: -0.019028127702892337
          total_loss: -0.02764210909222945
          vf_explained_var: 0.14600670337677002
          vf_loss: 0.01249646338740907
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,20,2615.83,199920,-3.4345,-2.87,-4,343.45




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-10-29_21-50-18
  done: false
  episode_len_mean: 339.31
  episode_media: {}
  episode_reward_max: -2.9299999999999815
  episode_reward_mean: -3.393099999999971
  episode_reward_min: -3.9999999999999587
  episodes_this_iter: 30
  episodes_total: 563
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6072287276259853
          entropy_coeff: 0.01
          kl: 0.013898418321317264
          policy_loss: -0.018560377530689932
          total_loss: -0.02918115337817078
          vf_explained_var: 0.24773888289928436
          vf_loss: 0.011281984345514813
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,21,2755.84,209916,-3.3931,-2.93,-4,339.31


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 219912
  custom_metrics: {}
  date: 2021-10-29_21-52-20
  done: false
  episode_len_mean: 336.76
  episode_media: {}
  episode_reward_max: -2.9699999999999807
  episode_reward_mean: -3.367599999999972
  episode_reward_min: -3.9999999999999587
  episodes_this_iter: 30
  episodes_total: 593
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.605514992811741
          entropy_coeff: 0.01
          kl: 0.016487242803742856
          policy_loss: -0.01916974249940652
          total_loss: -0.02913717405918317
          vf_explained_var: 0.35976043343544006
          vf_loss: 0.01114154502429301
    num_agent_steps_sampled: 219912
    num_agent_steps_trained: 219912
    num_steps_sampled: 219912
    num_steps_trained: 219912
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,22,2877.04,219912,-3.3676,-2.97,-4,336.76


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 229908
  custom_metrics: {}
  date: 2021-10-29_21-54-21
  done: false
  episode_len_mean: 335.86
  episode_media: {}
  episode_reward_max: -2.9699999999999807
  episode_reward_mean: -3.3585999999999725
  episode_reward_min: -3.799999999999963
  episodes_this_iter: 30
  episodes_total: 623
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.590216198334327
          entropy_coeff: 0.01
          kl: 0.015808230927384606
          policy_loss: -0.02228547969243975
          total_loss: -0.033759151271775235
          vf_explained_var: 0.47682833671569824
          vf_loss: 0.009686020790048553
    num_agent_steps_sampled: 229908
    num_agent_steps_trained: 229908
    num_steps_sampled: 229908
    num_steps_trained: 229908
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,23,2998.88,229908,-3.3586,-2.97,-3.8,335.86




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 239904
  custom_metrics: {}
  date: 2021-10-29_21-56-42
  done: false
  episode_len_mean: 335.39
  episode_media: {}
  episode_reward_max: -2.9199999999999817
  episode_reward_mean: -3.353899999999973
  episode_reward_min: -3.759999999999964
  episodes_this_iter: 30
  episodes_total: 653
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.5340582069168742
          entropy_coeff: 0.01
          kl: 0.016402513513692492
          policy_loss: -0.015112484507581108
          total_loss: -0.024588220302238423
          vf_explained_var: 0.36747103929519653
          vf_loss: 0.010944092055047866
    num_agent_steps_sampled: 239904
    num_agent_steps_trained: 239904
    num_steps_sampled: 239904
    num_steps_trained: 239904
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,24,3139.64,239904,-3.3539,-2.92,-3.76,335.39


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 249900
  custom_metrics: {}
  date: 2021-10-29_21-58-46
  done: false
  episode_len_mean: 334.26
  episode_media: {}
  episode_reward_max: -2.9199999999999817
  episode_reward_mean: -3.342599999999972
  episode_reward_min: -3.759999999999964
  episodes_this_iter: 30
  episodes_total: 683
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.5581381331142197
          entropy_coeff: 0.01
          kl: 0.016281500909050877
          policy_loss: -0.0224372102950628
          total_loss: -0.032254301450955564
          vf_explained_var: 0.38476794958114624
          vf_loss: 0.01087983891144783
    num_agent_steps_sampled: 249900
    num_agent_steps_trained: 249900
    num_steps_sampled: 249900
    num_steps_trained: 249900
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,25,3263.72,249900,-3.3426,-2.92,-3.76,334.26


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 259896
  custom_metrics: {}
  date: 2021-10-29_22-00-51
  done: false
  episode_len_mean: 330.36
  episode_media: {}
  episode_reward_max: -2.9199999999999817
  episode_reward_mean: -3.3035999999999732
  episode_reward_min: -3.839999999999962
  episodes_this_iter: 31
  episodes_total: 714
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.514530855977637
          entropy_coeff: 0.01
          kl: 0.01693860484762962
          policy_loss: -0.02450768023920365
          total_loss: -0.033990137291769695
          vf_explained_var: 0.46373456716537476
          vf_loss: 0.01058126967790851
    num_agent_steps_sampled: 259896
    num_agent_steps_trained: 259896
    num_steps_sampled: 259896
    num_steps_trained: 259896
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,26,3388.42,259896,-3.3036,-2.92,-3.84,330.36




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 269892
  custom_metrics: {}
  date: 2021-10-29_22-03-12
  done: false
  episode_len_mean: 325.64
  episode_media: {}
  episode_reward_max: -2.719999999999986
  episode_reward_mean: -3.2563999999999744
  episode_reward_min: -3.839999999999962
  episodes_this_iter: 31
  episodes_total: 745
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.5747376853584223
          entropy_coeff: 0.01
          kl: 0.01836315547892131
          policy_loss: -0.02160740277578688
          total_loss: -0.03002023630672031
          vf_explained_var: 0.3663577437400818
          vf_loss: 0.011825596110180863
    num_agent_steps_sampled: 269892
    num_agent_steps_trained: 269892
    num_steps_sampled: 269892
    num_steps_trained: 269892
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,27,3529.78,269892,-3.2564,-2.72,-3.84,325.64


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 279888
  custom_metrics: {}
  date: 2021-10-29_22-05-15
  done: false
  episode_len_mean: 326.44
  episode_media: {}
  episode_reward_max: -2.719999999999986
  episode_reward_mean: -3.264399999999974
  episode_reward_min: -3.839999999999962
  episodes_this_iter: 30
  episodes_total: 775
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.543680451058934
          entropy_coeff: 0.01
          kl: 0.01792138740122211
          policy_loss: -0.020016354685410475
          total_loss: -0.028538150359422733
          vf_explained_var: 0.3204682767391205
          vf_loss: 0.011538591295277748
    num_agent_steps_sampled: 279888
    num_agent_steps_trained: 279888
    num_steps_sampled: 279888
    num_steps_trained: 279888
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,28,3652.74,279888,-3.2644,-2.72,-3.84,326.44




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 289884
  custom_metrics: {}
  date: 2021-10-29_22-07-37
  done: false
  episode_len_mean: 321.48
  episode_media: {}
  episode_reward_max: -2.719999999999986
  episode_reward_mean: -3.214799999999975
  episode_reward_min: -3.839999999999962
  episodes_this_iter: 33
  episodes_total: 808
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.480481813707922
          entropy_coeff: 0.01
          kl: 0.0168978026015408
          policy_loss: -0.02002618005578844
          total_loss: -0.028826697801168148
          vf_explained_var: 0.4878488779067993
          vf_loss: 0.010934959393445876
    num_agent_steps_sampled: 289884
    num_agent_steps_trained: 289884
    num_steps_sampled: 289884
    num_steps_trained: 289884
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,29,3794.05,289884,-3.2148,-2.72,-3.84,321.48




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 299880
  custom_metrics: {}
  date: 2021-10-29_22-09-57
  done: false
  episode_len_mean: 320.51
  episode_media: {}
  episode_reward_max: -2.7299999999999858
  episode_reward_mean: -3.205099999999975
  episode_reward_min: -3.839999999999962
  episodes_this_iter: 30
  episodes_total: 838
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.462966432530656
          entropy_coeff: 0.01
          kl: 0.01843382190936982
          policy_loss: -0.019856300478817052
          total_loss: -0.027921302498787895
          vf_explained_var: 0.41814863681793213
          vf_loss: 0.01103451513154452
    num_agent_steps_sampled: 299880
    num_agent_steps_trained: 299880
    num_steps_sampled: 299880
    num_steps_trained: 299880
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,30,3934.17,299880,-3.2051,-2.73,-3.84,320.51


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 309876
  custom_metrics: {}
  date: 2021-10-29_22-12-04
  done: false
  episode_len_mean: 315.25
  episode_media: {}
  episode_reward_max: -2.7299999999999858
  episode_reward_mean: -3.1524999999999763
  episode_reward_min: -3.6399999999999664
  episodes_this_iter: 33
  episodes_total: 871
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.486479832779648
          entropy_coeff: 0.01
          kl: 0.04357411827604883
          policy_loss: -0.0015788276735534015
          total_loss: 0.0003608049872594002
          vf_explained_var: 0.284025639295578
          vf_loss: 0.013732194087147697
    num_agent_steps_sampled: 309876
    num_agent_steps_trained: 309876
    num_steps_sampled: 309876
    num_steps_trained: 309876
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,31,4060.88,309876,-3.1525,-2.73,-3.64,315.25




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 319872
  custom_metrics: {}
  date: 2021-10-29_22-14-27
  done: false
  episode_len_mean: 312.98
  episode_media: {}
  episode_reward_max: -2.7499999999999853
  episode_reward_mean: -3.1297999999999773
  episode_reward_min: -3.6399999999999664
  episodes_this_iter: 33
  episodes_total: 904
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.4833546893209473
          entropy_coeff: 0.01
          kl: 0.017415653507915407
          policy_loss: -0.012616865066254241
          total_loss: -0.016658770598662206
          vf_explained_var: 0.3220987915992737
          vf_loss: 0.012954597231768058
    num_agent_steps_sampled: 319872
    num_agent_steps_trained: 319872
    num_steps_sampled: 319872
    num_steps_trained: 319872
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,32,4204.48,319872,-3.1298,-2.75,-3.64,312.98


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 329868
  custom_metrics: {}
  date: 2021-10-29_22-16-36
  done: false
  episode_len_mean: 312.83
  episode_media: {}
  episode_reward_max: -2.7899999999999845
  episode_reward_mean: -3.1282999999999777
  episode_reward_min: -3.5999999999999672
  episodes_this_iter: 31
  episodes_total: 935
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.4905378690132727
          entropy_coeff: 0.01
          kl: 0.014784408288709201
          policy_loss: -0.01881795882159828
          total_loss: -0.025536895466920655
          vf_explained_var: 0.3332158029079437
          vf_loss: 0.011533457410654141
    num_agent_steps_sampled: 329868
    num_agent_steps_trained: 329868
    num_steps_sampled: 329868
    num_steps_trained: 329868
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,33,4332.91,329868,-3.1283,-2.79,-3.6,312.83


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 339864
  custom_metrics: {}
  date: 2021-10-29_22-18-43
  done: false
  episode_len_mean: 311.87
  episode_media: {}
  episode_reward_max: -2.7399999999999856
  episode_reward_mean: -3.1186999999999774
  episode_reward_min: -3.5999999999999672
  episodes_this_iter: 32
  episodes_total: 967
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.455656281291929
          entropy_coeff: 0.01
          kl: 0.015142638221916953
          policy_loss: -0.02143822324772676
          total_loss: -0.02736675529780551
          vf_explained_var: 0.3762071430683136
          vf_loss: 0.011813842216454363
    num_agent_steps_sampled: 339864
    num_agent_steps_trained: 339864
    num_steps_sampled: 339864
    num_steps_trained: 339864
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,34,4460.25,339864,-3.1187,-2.74,-3.6,311.87




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 349860
  custom_metrics: {}
  date: 2021-10-29_22-21-07
  done: false
  episode_len_mean: 312.39
  episode_media: {}
  episode_reward_max: -2.679999999999987
  episode_reward_mean: -3.1238999999999777
  episode_reward_min: -3.649999999999966
  episodes_this_iter: 32
  episodes_total: 999
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.4785326240409136
          entropy_coeff: 0.01
          kl: 0.01757062327916766
          policy_loss: -0.02220239574010046
          total_loss: -0.027840495367462817
          vf_explained_var: 0.4516887366771698
          vf_loss: 0.011240446177643573
    num_agent_steps_sampled: 349860
    num_agent_steps_trained: 349860
    num_steps_sampled: 349860
    num_steps_trained: 349860
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,35,4603.98,349860,-3.1239,-2.68,-3.65,312.39


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 359856
  custom_metrics: {}
  date: 2021-10-29_22-23-14
  done: false
  episode_len_mean: 312.1
  episode_media: {}
  episode_reward_max: -2.679999999999987
  episode_reward_mean: -3.120999999999977
  episode_reward_min: -3.649999999999966
  episodes_this_iter: 32
  episodes_total: 1031
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.465301500630175
          entropy_coeff: 0.01
          kl: 0.014987771715023178
          policy_loss: -0.020971454151420512
          total_loss: -0.028570239981397603
          vf_explained_var: 0.43308722972869873
          vf_loss: 0.010309731472446782
    num_agent_steps_sampled: 359856
    num_agent_steps_trained: 359856
    num_steps_sampled: 359856
    num_steps_trained: 359856
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,36,4730.8,359856,-3.121,-2.68,-3.65,312.1


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 369852
  custom_metrics: {}
  date: 2021-10-29_22-25-22
  done: false
  episode_len_mean: 312.87
  episode_media: {}
  episode_reward_max: -2.679999999999987
  episode_reward_mean: -3.128699999999978
  episode_reward_min: -3.649999999999966
  episodes_this_iter: 33
  episodes_total: 1064
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.4439613786518066
          entropy_coeff: 0.01
          kl: 0.015856081691068277
          policy_loss: -0.020754738303267548
          total_loss: -0.027710292099887488
          vf_explained_var: 0.4735862910747528
          vf_loss: 0.010348822307796815
    num_agent_steps_sampled: 369852
    num_agent_steps_trained: 369852
    num_steps_sampled: 369852
    num_steps_trained: 369852
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,37,4859.53,369852,-3.1287,-2.68,-3.65,312.87




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 379848
  custom_metrics: {}
  date: 2021-10-29_22-28-02
  done: false
  episode_len_mean: 309.76
  episode_media: {}
  episode_reward_max: -2.4099999999999926
  episode_reward_mean: -3.0975999999999773
  episode_reward_min: -3.649999999999966
  episodes_this_iter: 33
  episodes_total: 1097
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.374099533170716
          entropy_coeff: 0.01
          kl: 0.014943379940512772
          policy_loss: -0.018988210274877713
          total_loss: -0.026447981076999607
          vf_explained_var: 0.5625393390655518
          vf_loss: 0.009556703569599364
    num_agent_steps_sampled: 379848
    num_agent_steps_trained: 379848
    num_steps_sampled: 379848
    num_steps_trained: 379848
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,38,5018.98,379848,-3.0976,-2.41,-3.65,309.76


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 389844
  custom_metrics: {}
  date: 2021-10-29_22-30-14
  done: false
  episode_len_mean: 304.9
  episode_media: {}
  episode_reward_max: -2.4099999999999926
  episode_reward_mean: -3.0489999999999795
  episode_reward_min: -3.6399999999999664
  episodes_this_iter: 32
  episodes_total: 1129
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3354024942104634
          entropy_coeff: 0.01
          kl: 0.015965880162440018
          policy_loss: -0.024223454298180902
          total_loss: -0.031201171717391563
          vf_explained_var: 0.4857080578804016
          vf_loss: 0.00919166130959514
    num_agent_steps_sampled: 389844
    num_agent_steps_trained: 389844
    num_steps_sampled: 389844
    num_steps_trained: 389844
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,39,5150.7,389844,-3.049,-2.41,-3.64,304.9


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 399840
  custom_metrics: {}
  date: 2021-10-29_22-32-21
  done: false
  episode_len_mean: 304.84
  episode_media: {}
  episode_reward_max: -2.4099999999999926
  episode_reward_mean: -3.048399999999979
  episode_reward_min: -3.5999999999999672
  episodes_this_iter: 33
  episodes_total: 1162
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3881343299507076
          entropy_coeff: 0.01
          kl: 0.016355600907076107
          policy_loss: -0.017645637458588322
          total_loss: -0.023507251065128888
          vf_explained_var: 0.4070455729961395
          vf_loss: 0.01065970875743705
    num_agent_steps_sampled: 399840
    num_agent_steps_trained: 399840
    num_steps_sampled: 399840
    num_steps_trained: 399840
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,40,5278.51,399840,-3.0484,-2.41,-3.6,304.84




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 409836
  custom_metrics: {}
  date: 2021-10-29_22-34-43
  done: false
  episode_len_mean: 307.03
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -3.0702999999999783
  episode_reward_min: -3.699999999999965
  episodes_this_iter: 32
  episodes_total: 1194
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3994258157208432
          entropy_coeff: 0.01
          kl: 0.015232055339597816
          policy_loss: -0.01932046360726285
          total_loss: -0.026096213363811502
          vf_explained_var: 0.3386276066303253
          vf_loss: 0.01036408297803639
    num_agent_steps_sampled: 409836
    num_agent_steps_trained: 409836
    num_steps_sampled: 409836
    num_steps_trained: 409836
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,41,5420.21,409836,-3.0703,-2.49,-3.7,307.03


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 419832
  custom_metrics: {}
  date: 2021-10-29_22-36-49
  done: false
  episode_len_mean: 311.49
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -3.1148999999999774
  episode_reward_min: -3.789999999999963
  episodes_this_iter: 32
  episodes_total: 1226
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3764708171542894
          entropy_coeff: 0.01
          kl: 0.014758756555181315
          policy_loss: -0.021561736691520256
          total_loss: -0.028459307317359326
          vf_explained_var: 0.3527114689350128
          vf_loss: 0.01022569637393139
    num_agent_steps_sampled: 419832
    num_agent_steps_trained: 419832
    num_steps_sampled: 419832
    num_steps_trained: 419832
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,42,5545.87,419832,-3.1149,-2.49,-3.79,311.49




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 429828
  custom_metrics: {}
  date: 2021-10-29_22-39-12
  done: false
  episode_len_mean: 309.93
  episode_media: {}
  episode_reward_max: -2.399999999999993
  episode_reward_mean: -3.0992999999999773
  episode_reward_min: -3.789999999999963
  episodes_this_iter: 33
  episodes_total: 1259
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.338917745076693
          entropy_coeff: 0.01
          kl: 0.015512373400396457
          policy_loss: -0.016073763950003518
          total_loss: -0.022079420538666922
          vf_explained_var: 0.3974491059780121
          vf_loss: 0.010402953043482047
    num_agent_steps_sampled: 429828
    num_agent_steps_trained: 429828
    num_steps_sampled: 429828
    num_steps_trained: 429828
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,43,5689.43,429828,-3.0993,-2.4,-3.79,309.93




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 439824
  custom_metrics: {}
  date: 2021-10-29_22-41-34
  done: false
  episode_len_mean: 305.99
  episode_media: {}
  episode_reward_max: -2.399999999999993
  episode_reward_mean: -3.0598999999999785
  episode_reward_min: -3.789999999999963
  episodes_this_iter: 33
  episodes_total: 1292
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.304757446814806
          entropy_coeff: 0.01
          kl: 0.014093102431892898
          policy_loss: -0.018636864076694872
          total_loss: -0.02538163314263026
          vf_explained_var: 0.43813446164131165
          vf_loss: 0.009960909416743864
    num_agent_steps_sampled: 439824
    num_agent_steps_trained: 439824
    num_steps_sampled: 439824
    num_steps_trained: 439824
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,44,5831.27,439824,-3.0599,-2.4,-3.79,305.99


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 449820
  custom_metrics: {}
  date: 2021-10-29_22-43-43
  done: false
  episode_len_mean: 303.14
  episode_media: {}
  episode_reward_max: -2.399999999999993
  episode_reward_mean: -3.031399999999979
  episode_reward_min: -3.749999999999964
  episodes_this_iter: 33
  episodes_total: 1325
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3633854591948356
          entropy_coeff: 0.01
          kl: 0.018280340920817456
          policy_loss: -0.01769227715384247
          total_loss: -0.022249851245273893
          vf_explained_var: 0.3553963005542755
          vf_loss: 0.010850126716123225
    num_agent_steps_sampled: 449820
    num_agent_steps_trained: 449820
    num_steps_sampled: 449820
    num_steps_trained: 449820
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,45,5959.86,449820,-3.0314,-2.4,-3.75,303.14




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 459816
  custom_metrics: {}
  date: 2021-10-29_22-46-11
  done: false
  episode_len_mean: 300.22
  episode_media: {}
  episode_reward_max: -2.4199999999999924
  episode_reward_mean: -3.0021999999999798
  episode_reward_min: -3.7299999999999645
  episodes_this_iter: 34
  episodes_total: 1359
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.289929468611367
          entropy_coeff: 0.01
          kl: 0.0162376785452844
          policy_loss: -0.019333535767136475
          total_loss: -0.024323479009744447
          vf_explained_var: 0.43398892879486084
          vf_loss: 0.010602397071525979
    num_agent_steps_sampled: 459816
    num_agent_steps_trained: 459816
    num_steps_sampled: 459816
    num_steps_trained: 459816
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,46,6107.61,459816,-3.0022,-2.42,-3.73,300.22


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 469812
  custom_metrics: {}
  date: 2021-10-29_22-48-20
  done: false
  episode_len_mean: 300.58
  episode_media: {}
  episode_reward_max: -2.4199999999999924
  episode_reward_mean: -3.005799999999979
  episode_reward_min: -3.4899999999999696
  episodes_this_iter: 33
  episodes_total: 1392
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.2836106721152607
          entropy_coeff: 0.01
          kl: 0.015097520499631933
          policy_loss: -0.01865168148572119
          total_loss: -0.024397287288537392
          vf_explained_var: 0.46282070875167847
          vf_loss: 0.010296616560886972
    num_agent_steps_sampled: 469812
    num_agent_steps_trained: 469812
    num_steps_sampled: 469812
    num_steps_trained: 469812
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,47,6237.19,469812,-3.0058,-2.42,-3.49,300.58


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 479808
  custom_metrics: {}
  date: 2021-10-29_22-50-30
  done: false
  episode_len_mean: 300.25
  episode_media: {}
  episode_reward_max: -2.4199999999999924
  episode_reward_mean: -3.00249999999998
  episode_reward_min: -3.4899999999999696
  episodes_this_iter: 33
  episodes_total: 1425
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.31308002461735
          entropy_coeff: 0.01
          kl: 0.01651572449124078
          policy_loss: -0.018782907869252893
          total_loss: -0.02424049158540801
          vf_explained_var: 0.4735865890979767
          vf_loss: 0.010241139664053002
    num_agent_steps_sampled: 479808
    num_agent_steps_trained: 479808
    num_steps_sampled: 479808
    num_steps_trained: 479808
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,48,6366.75,479808,-3.0025,-2.42,-3.49,300.25




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 489804
  custom_metrics: {}
  date: 2021-10-29_22-52-57
  done: false
  episode_len_mean: 299.35
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.9934999999999805
  episode_reward_min: -3.4899999999999696
  episodes_this_iter: 33
  episodes_total: 1458
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.3147392332044423
          entropy_coeff: 0.01
          kl: 0.015496314287551114
          policy_loss: -0.018402710299079236
          total_loss: -0.02376255417226726
          vf_explained_var: 0.4498850107192993
          vf_loss: 0.010814205956178654
    num_agent_steps_sampled: 489804
    num_agent_steps_trained: 489804
    num_steps_sampled: 489804
    num_steps_trained: 489804
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,49,6514.1,489804,-2.9935,-2.49,-3.49,299.35


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 499800
  custom_metrics: {}
  date: 2021-10-29_22-55-11
  done: false
  episode_len_mean: 295.56
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.95559999999998
  episode_reward_min: -3.46999999999997
  episodes_this_iter: 34
  episodes_total: 1492
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.1793257379124307
          entropy_coeff: 0.01
          kl: 0.016372420892667997
          policy_loss: -0.019051698620757486
          total_loss: -0.02399324507922189
          vf_explained_var: 0.619396984577179
          vf_loss: 0.0094841219370132
    num_agent_steps_sampled: 499800
    num_agent_steps_trained: 499800
    num_steps_sampled: 499800
    num_steps_trained: 499800
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,50,6647.96,499800,-2.9556,-2.49,-3.47,295.56




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 509796
  custom_metrics: {}
  date: 2021-10-29_22-57-40
  done: false
  episode_len_mean: 290.3
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.902999999999982
  episode_reward_min: -3.46999999999997
  episodes_this_iter: 36
  episodes_total: 1528
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.2030139334181436
          entropy_coeff: 0.01
          kl: 0.013451662042164204
          policy_loss: -0.015658262507337282
          total_loss: -0.021766033434332945
          vf_explained_var: 0.5676963329315186
          vf_loss: 0.009869120108972614
    num_agent_steps_sampled: 509796
    num_agent_steps_trained: 509796
    num_steps_sampled: 509796
    num_steps_trained: 509796
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,51,6796.44,509796,-2.903,-2.49,-3.47,290.3




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 519792
  custom_metrics: {}
  date: 2021-10-29_23-00-07
  done: false
  episode_len_mean: 289.94
  episode_media: {}
  episode_reward_max: -2.53999999999999
  episode_reward_mean: -2.899399999999982
  episode_reward_min: -3.46999999999997
  episodes_this_iter: 35
  episodes_total: 1563
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.222625909707485
          entropy_coeff: 0.01
          kl: 0.01802687505469138
          policy_loss: -0.020581856132763574
          total_loss: -0.02484679869097522
          vf_explained_var: 0.5447456240653992
          vf_loss: 0.009849222769744845
    num_agent_steps_sampled: 519792
    num_agent_steps_trained: 519792
    num_steps_sampled: 519792
    num_steps_trained: 519792
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,52,6943.32,519792,-2.8994,-2.54,-3.47,289.94


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 529788
  custom_metrics: {}
  date: 2021-10-29_23-02-20
  done: false
  episode_len_mean: 288.69
  episode_media: {}
  episode_reward_max: -2.53999999999999
  episode_reward_mean: -2.8868999999999825
  episode_reward_min: -3.279999999999974
  episodes_this_iter: 34
  episodes_total: 1597
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.1713716874774707
          entropy_coeff: 0.01
          kl: 0.01634742466014644
          policy_loss: -0.02297049639189345
          total_loss: -0.028771786796104194
          vf_explained_var: 0.5805609822273254
          vf_loss: 0.00855608444280811
    num_agent_steps_sampled: 529788
    num_agent_steps_trained: 529788
    num_steps_sampled: 529788
    num_steps_trained: 529788
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,53,7076.76,529788,-2.8869,-2.54,-3.28,288.69




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 539784
  custom_metrics: {}
  date: 2021-10-29_23-04-47
  done: false
  episode_len_mean: 291.24
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.9123999999999812
  episode_reward_min: -3.429999999999971
  episodes_this_iter: 33
  episodes_total: 1630
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.1629161366030702
          entropy_coeff: 0.01
          kl: 0.018812124458975198
          policy_loss: -0.024726992937871534
          total_loss: -0.029555408128051675
          vf_explained_var: 0.6291961669921875
          vf_loss: 0.008335290178626728
    num_agent_steps_sampled: 539784
    num_agent_steps_trained: 539784
    num_steps_sampled: 539784
    num_steps_trained: 539784
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,54,7223.87,539784,-2.9124,-2.49,-3.43,291.24


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 549780
  custom_metrics: {}
  date: 2021-10-29_23-06-59
  done: false
  episode_len_mean: 293.63
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.936299999999981
  episode_reward_min: -3.45999999999997
  episodes_this_iter: 34
  episodes_total: 1664
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.190102571911282
          entropy_coeff: 0.01
          kl: 0.017827942630340057
          policy_loss: -0.022195738372512354
          total_loss: -0.02813126426190138
          vf_explained_var: 0.612973690032959
          vf_loss: 0.007942924573923
    num_agent_steps_sampled: 549780
    num_agent_steps_trained: 549780
    num_steps_sampled: 549780
    num_steps_trained: 549780
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,55,7355.82,549780,-2.9363,-2.49,-3.46,293.63


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 559776
  custom_metrics: {}
  date: 2021-10-29_23-09-12
  done: false
  episode_len_mean: 294.69
  episode_media: {}
  episode_reward_max: -2.489999999999991
  episode_reward_mean: -2.946899999999981
  episode_reward_min: -3.45999999999997
  episodes_this_iter: 35
  episodes_total: 1699
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.0848699658344954
          entropy_coeff: 0.01
          kl: 0.01594373286038608
          policy_loss: -0.022048574284865306
          total_loss: -0.02707297321822908
          vf_explained_var: 0.5545592904090881
          vf_loss: 0.008649621297491906
    num_agent_steps_sampled: 559776
    num_agent_steps_trained: 559776
    num_steps_sampled: 559776
    num_steps_trained: 559776
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,56,7488.35,559776,-2.9469,-2.49,-3.46,294.69




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 569772
  custom_metrics: {}
  date: 2021-10-29_23-11-40
  done: false
  episode_len_mean: 291.33
  episode_media: {}
  episode_reward_max: -2.429999999999992
  episode_reward_mean: -2.913299999999982
  episode_reward_min: -3.379999999999972
  episodes_this_iter: 34
  episodes_total: 1733
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.1093609109902993
          entropy_coeff: 0.01
          kl: 0.015910196046946223
          policy_loss: -0.021765779404558688
          total_loss: -0.02769345939796195
          vf_explained_var: 0.5633935332298279
          vf_loss: 0.008006340928244381
    num_agent_steps_sampled: 569772
    num_agent_steps_trained: 569772
    num_steps_sampled: 569772
    num_steps_trained: 569772
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,57,7636.57,569772,-2.9133,-2.43,-3.38,291.33


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 579768
  custom_metrics: {}
  date: 2021-10-29_23-13-54
  done: false
  episode_len_mean: 286.89
  episode_media: {}
  episode_reward_max: -2.429999999999992
  episode_reward_mean: -2.8688999999999827
  episode_reward_min: -3.289999999999974
  episodes_this_iter: 36
  episodes_total: 1769
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.07872747366245
          entropy_coeff: 0.01
          kl: 0.0175620039529285
          policy_loss: -0.017759458028162138
          total_loss: -0.02163725811828915
          vf_explained_var: 0.5480151772499084
          vf_loss: 0.009006572793174185
    num_agent_steps_sampled: 579768
    num_agent_steps_trained: 579768
    num_steps_sampled: 579768
    num_steps_trained: 579768
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,58,7770.84,579768,-2.8689,-2.43,-3.29,286.89




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 589764
  custom_metrics: {}
  date: 2021-10-29_23-16-27
  done: false
  episode_len_mean: 282.67
  episode_media: {}
  episode_reward_max: -2.339999999999994
  episode_reward_mean: -2.826699999999984
  episode_reward_min: -3.289999999999974
  episodes_this_iter: 36
  episodes_total: 1805
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.0517369370175222
          entropy_coeff: 0.01
          kl: 0.01644579320157949
          policy_loss: -0.018114786362673482
          total_loss: -0.02239180891177593
          vf_explained_var: 0.5638426542282104
          vf_loss: 0.008839739460456304
    num_agent_steps_sampled: 589764
    num_agent_steps_trained: 589764
    num_steps_sampled: 589764
    num_steps_trained: 589764
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,59,7923.63,589764,-2.8267,-2.34,-3.29,282.67


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 599760
  custom_metrics: {}
  date: 2021-10-29_23-18-45
  done: false
  episode_len_mean: 283.79
  episode_media: {}
  episode_reward_max: -2.339999999999994
  episode_reward_mean: -2.837899999999983
  episode_reward_min: -3.419999999999971
  episodes_this_iter: 34
  episodes_total: 1839
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.065214273868463
          entropy_coeff: 0.01
          kl: 0.014123745496059291
          policy_loss: -0.021670338177146056
          total_loss: -0.027071743842182507
          vf_explained_var: 0.5465595722198486
          vf_loss: 0.00889505126989152
    num_agent_steps_sampled: 599760
    num_agent_steps_trained: 599760
    num_steps_sampled: 599760
    num_steps_trained: 599760
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,60,8061.41,599760,-2.8379,-2.34,-3.42,283.79


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 609756
  custom_metrics: {}
  date: 2021-10-29_23-20-58
  done: false
  episode_len_mean: 285.84
  episode_media: {}
  episode_reward_max: -2.339999999999994
  episode_reward_mean: -2.858399999999983
  episode_reward_min: -3.5499999999999683
  episodes_this_iter: 35
  episodes_total: 1874
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.0670559655906806
          entropy_coeff: 0.01
          kl: 0.04254650097345052
          policy_loss: -0.0050093069768104796
          total_loss: 0.007067105479729482
          vf_explained_var: 0.4369203746318817
          vf_loss: 0.013601046668029287
    num_agent_steps_sampled: 609756
    num_agent_steps_trained: 609756
    num_steps_sampled: 609756
    num_steps_trained: 609756
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,61,8194.17,609756,-2.8584,-2.34,-3.55,285.84




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 619752
  custom_metrics: {}
  date: 2021-10-29_23-23-40
  done: false
  episode_len_mean: 288.6
  episode_media: {}
  episode_reward_max: -2.299999999999995
  episode_reward_mean: -2.8859999999999815
  episode_reward_min: -3.5499999999999683
  episodes_this_iter: 34
  episodes_total: 1908
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.0855036985160957
          entropy_coeff: 0.01
          kl: 0.012300640591275499
          policy_loss: -0.021298991414344208
          total_loss: -0.02258242990375839
          vf_explained_var: 0.41605859994888306
          vf_loss: 0.011268665338866413
    num_agent_steps_sampled: 619752
    num_agent_steps_trained: 619752
    num_steps_sampled: 619752
    num_steps_trained: 619752
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,62,8356.64,619752,-2.886,-2.3,-3.55,288.6


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 629748
  custom_metrics: {}
  date: 2021-10-29_23-25-54
  done: false
  episode_len_mean: 288.19
  episode_media: {}
  episode_reward_max: -2.299999999999995
  episode_reward_mean: -2.881899999999982
  episode_reward_min: -3.5499999999999683
  episodes_this_iter: 36
  episodes_total: 1944
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.0460161824511665
          entropy_coeff: 0.01
          kl: 0.010634216407628147
          policy_loss: -0.01714293073830951
          total_loss: -0.020729632476647185
          vf_explained_var: 0.47515279054641724
          vf_loss: 0.009695363039614505
    num_agent_steps_sampled: 629748
    num_agent_steps_trained: 629748
    num_steps_sampled: 629748
    num_steps_trained: 629748
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,63,8490.91,629748,-2.8819,-2.3,-3.55,288.19




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 639744
  custom_metrics: {}
  date: 2021-10-29_23-28-24
  done: false
  episode_len_mean: 287.41
  episode_media: {}
  episode_reward_max: -2.299999999999995
  episode_reward_mean: -2.8740999999999826
  episode_reward_min: -3.369999999999972
  episodes_this_iter: 34
  episodes_total: 1978
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.045762222139244
          entropy_coeff: 0.01
          kl: 0.011985253630256007
          policy_loss: -0.01930820350973015
          total_loss: -0.022359022666883263
          vf_explained_var: 0.5474748015403748
          vf_loss: 0.009316756012332108
    num_agent_steps_sampled: 639744
    num_agent_steps_trained: 639744
    num_steps_sampled: 639744
    num_steps_trained: 639744
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,64,8640.02,639744,-2.8741,-2.3,-3.37,287.41




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 649740
  custom_metrics: {}
  date: 2021-10-29_23-30-52
  done: false
  episode_len_mean: 285.73
  episode_media: {}
  episode_reward_max: -2.3699999999999934
  episode_reward_mean: -2.8572999999999826
  episode_reward_min: -3.289999999999974
  episodes_this_iter: 35
  episodes_total: 2013
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.028498539659712
          entropy_coeff: 0.01
          kl: 0.012719947084878499
          policy_loss: -0.019898597004576624
          total_loss: -0.02132507791249161
          vf_explained_var: 0.5349735617637634
          vf_loss: 0.010272539911132991
    num_agent_steps_sampled: 649740
    num_agent_steps_trained: 649740
    num_steps_sampled: 649740
    num_steps_trained: 649740
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,65,8788.17,649740,-2.8573,-2.37,-3.29,285.73


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 659736
  custom_metrics: {}
  date: 2021-10-29_23-33-10
  done: false
  episode_len_mean: 283.07
  episode_media: {}
  episode_reward_max: -2.3699999999999934
  episode_reward_mean: -2.8306999999999833
  episode_reward_min: -3.289999999999974
  episodes_this_iter: 36
  episodes_total: 2049
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 1.9929540474190672
          entropy_coeff: 0.01
          kl: 0.011212341012001959
          policy_loss: -0.014168124184267133
          total_loss: -0.01581960196296374
          vf_explained_var: 0.5183500051498413
          vf_loss: 0.010709732177848411
    num_agent_steps_sampled: 659736
    num_agent_steps_trained: 659736
    num_steps_sampled: 659736
    num_steps_trained: 659736
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,66,8926.49,659736,-2.8307,-2.37,-3.29,283.07




Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 669732
  custom_metrics: {}
  date: 2021-10-29_23-35-52
  done: false
  episode_len_mean: 283.38
  episode_media: {}
  episode_reward_max: -2.3699999999999934
  episode_reward_mean: -2.8337999999999837
  episode_reward_min: -3.46999999999997
  episodes_this_iter: 35
  episodes_total: 2084
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 1.9871881185433804
          entropy_coeff: 0.01
          kl: 0.01419904472908332
          policy_loss: -0.017674300724115126
          total_loss: -0.01631366869386954
          vf_explained_var: 0.392121285200119
          vf_loss: 0.011648157352092079
    num_agent_steps_sampled: 669732
    num_agent_steps_trained: 669732
    num_steps_sampled: 669732
    num_steps_trained: 669732
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,67,9088.34,669732,-2.8338,-2.37,-3.47,283.38


Result for PPO_my_env_c4887_00000:
  agent_timesteps_total: 679728
  custom_metrics: {}
  date: 2021-10-29_23-38-07
  done: false
  episode_len_mean: 283.2
  episode_media: {}
  episode_reward_max: -2.4599999999999915
  episode_reward_mean: -2.8319999999999834
  episode_reward_min: -3.46999999999997
  episodes_this_iter: 35
  episodes_total: 2119
  experiment_id: 39255c2c32a148c1abc3e53c967b4ad5
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 1.9528093075140929
          entropy_coeff: 0.01
          kl: 0.01206058641523219
          policy_loss: -0.013660110616021686
          total_loss: -0.014016258070229465
          vf_explained_var: 0.3820677697658539
          vf_loss: 0.011031048572408313
    num_agent_steps_sampled: 679728
    num_agent_steps_trained: 679728
    num_steps_sampled: 679728
    num_steps_trained: 679728
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_c4887_00000,RUNNING,192.168.3.5:625567,68,9223.73,679728,-2.832,-2.46,-3.47,283.2
