In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder producing a flat feature vector.

    The network is six ``Conv2d`` layers (kernel 2, stride 2, no padding), each
    followed by ``ELU``, and a final ``Flatten``. Every convolution halves the
    spatial size, so a 3x64x64 input is reduced to 512x1x1 and flattened to 512
    values per sample.
    """

    def __init__(self):
        super().__init__()

        # Input channels followed by the output width of each successive conv.
        channels = (3, 32, 32, 64, 128, 256, 512)

        stages = []
        for n_in, n_out in zip(channels[:-1], channels[1:]):
            stages.append(nn.Conv2d(n_in, n_out, kernel_size=2, stride=2, padding=0))
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # Kept as a single Sequential named `cnn` so parameter names stay
        # `cnn.<index>.weight` / `cnn.<index>.bias`.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Encode a batch of channel-first images into flat feature vectors."""
        features = self.cnn(x)
        return features

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model combining a pretrained image encoder with a target-grid encoder.

    The observation is expected to be a dict with:
      * ``'pov'``         - batch of channel-last images (permuted to channel-first here),
      * ``'target_grid'`` - batch of integer-valued 3D grids with class ids in ``[0, 7)``.

    The image is encoded by ``VisualEncoder`` (weights loaded from disk and run
    under ``torch.no_grad()``, so no gradients flow into it). The grid is
    one-hot encoded into 7 channels and mixed to a single channel by a 1x1x1
    ``Conv3d``. Both feature vectors are concatenated and passed through an MLP
    that feeds a policy head (``action_space.n`` logits) and a scalar value head.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        # Flattened size of the single-channel target encoding for a 9x11x11 grid.
        target_features_dim = 9 * 11 * 11
        self.visual_encoder = VisualEncoder()
        # NOTE(review): the file name is spelled "weigths" - presumably this
        # matches the file on disk; verify before "correcting" it.
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        # 1x1x1 convolution: a learned per-voxel mix of the 7 one-hot class channels.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate cached by forward() for value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            # Module.cuda() moves parameters in place, unlike Tensor.cuda().
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Args:
            input_dict: mapping whose ``'obs'`` entry holds the ``'pov'`` and
                ``'target_grid'`` tensors.
            state: recurrent state; passed through unchanged.
            seq_lens: unused.

        Returns:
            ``(logits, state)`` where ``logits`` has one column per action.
        """
        obs = input_dict['obs']

        # Tensor.cuda() / Tensor.to() return a new tensor instead of moving the
        # original in place, so the result has to be kept. Use the device the
        # weights actually live on so inputs and parameters always agree.
        device = self.action_head.weight.device

        # Channel-last -> channel-first, scaled from [0, 255] to [0, 1].
        pov = obs['pov'].to(device).permute(0, 3, 1, 2).float() / 255.0
        # One-hot adds a trailing class axis; move it to the channel position.
        target_ids = obs['target_grid'].to(device).long()
        target = one_hot(target_ids, num_classes=7).permute(0, 4, 1, 2, 3).float()

        # The pretrained encoder is used as a fixed feature extractor.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the most recent ``forward()`` call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Stand-alone copy of the policy MLP, built only to count its parameters.
visual_features_dim = 512
target_features_dim = 9 * 11 * 11
policy_hidden_dim = 256

# Each (fan_in, fan_out) pair contributes a Linear layer followed by an ELU.
policy_network = nn.Sequential(*(
    module
    for fan_in, fan_out in (
        (visual_features_dim + target_features_dim, 1024),
        (1024, 512),
        (512, policy_hidden_dim),
        (policy_hidden_dim, policy_hidden_dim),
    )
    for module in (nn.Linear(fan_in, fan_out), nn.ELU())
))

# Total number of scalar parameters (weights and biases); shown as the cell result.
sum(tensor.numel() for tensor in policy_network.parameters())

2362368

In [5]:
# Register MyModelClass under the name that the trainer config's
# "custom_model" entry refers to ("my_torch_model").
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing the image, inventory, compass and (optionally) target grid.

    Args:
        env: environment to wrap.
        include_target: when true, ``'target_grid'`` is declared in the
            observation space and included in every observation. When false it
            is neither declared nor returned, so the returned dict always has
            exactly the keys of ``observation_space``.
    """

    def __init__(self, env, include_target=False):
        super().__init__(env)
        # Remembered so observation() emits exactly the keys declared below.
        self.include_target = include_target
        self.observation_space = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict declared by ``observation_space``.

        The target grid is taken from ``info['target_grid']`` when present (and
        removed from ``info``); otherwise it is read from the current task of
        the unwrapped environment. If ``info`` is given but lacks the key, the
        situation is logged as an error and, when the unwrapped environment
        provides ``should_reset``, a reset is requested.
        """
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid

        observation = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            # Scalar angle wrapped into a 1-element array to match the (1,) Box.
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Only emit the key when it was declared; an undeclared extra key makes
        # the observation fall outside the Dict observation space.
        if self.include_target:
            observation['target_grid'] = target_grid
        return observation

In [7]:
import os
# Select the GPU for this process through CUDA's environment variables:
# enumerate devices in PCI bus order and expose only device "0".
# NOTE(review): torch is imported in an earlier cell; presumably these take
# effect because CUDA has not been initialised yet - confirm.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

class RewardWrapper(gym.RewardWrapper):
    """Reshape environment rewards.

    * A reward of exactly 0 becomes a small penalty of -0.01.
    * A reward of magnitude exactly 1 is divided by 10.
    * Any other reward is passed through unchanged.
    """

    def reward(self, rew):
        # Zero reward: replace with the penalty and stop (|-0.01| is never 1).
        if rew == 0:
            return -0.01
        # Unit-magnitude reward: scale down, keeping the sign.
        if abs(rew) == 1:
            rew /= 10
        return rew
    
def env_creator(env_config):
    """Build the wrapped IGLU environment used for training.

    ``env_config`` is accepted to match the creator signature but is not read.

    The base ``IGLUSilentBuilder-v0`` environment (``max_steps=500``) is limited
    to a fixed preset of tasks and then wrapped, innermost first, with
    ``VisualObservationWrapper`` (target grid included), ``SelectAndPlace``,
    ``Discretization`` over the flat ``'human-level'`` action space, and
    ``RewardWrapper``.
    """
    task_ids = ['C3', 'C17', 'C20', 'C22', 'C32', 'C40', 'C85', 'C87', 'C93']

    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=500)
    base_env.update_taskset(TaskSet(preset=task_ids))

    with_target_obs = VisualObservationWrapper(base_env, include_target=True)
    with_select_place = SelectAndPlace(with_target_obs)
    discretized = Discretization(with_select_place, flat_action_space('human-level'))
    return RewardWrapper(discretized)

from ray.tune.registry import register_env
# Register env_creator under the name "my_env", which the trainer config
# uses as its "env" entry.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Launch PPO training through Ray Tune on the registered "my_env" environment,
# using the custom "my_torch_model" network, and keep the returned analysis
# object in `analysis`. Metrics are also sent to Weights & Biases via WandbLogger.
analysis = tune.run(PPOTrainer, 
         config={
             # Name registered with register_env.
             "env": "my_env", 
             "framework": "torch",
             # Resources requested for the trainer and its rollout workers.
             "num_gpus": 1,
             "num_workers": 3,
             # PPO optimisation hyperparameters.
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 5_000,
             "lr": 1e-4,
             #"gamma": 0.99,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             # Settings read by the W&B logger: target project and run name.
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO MultiTask <=10 pretrained (AngelaCNN) (3 noops after placement) r: -0.01 div10"
                  }
              }

        },
        loggers=[WandbLogger],
        # Output directory for trial results and checkpoints.
        local_dir="/IGLU-Minecraft/checkpoints/10_blocks_max",
        # Checkpoint policy as given by the argument names: frequency 5, keep
        # at most 50, and write one more when the run ends.
        keep_checkpoints_num=50,
        checkpoint_freq=5,
        checkpoint_at_end=True)

2021-11-06 18:47:04,029	INFO wandb.py:170 -- Already logged into W&B.
2021-11-06 18:47:04,045	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_ef0ef_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=480903)[0m 2021-11-06 18:47:07,458	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=480903)[0m 2021-11-06 18:47:07,458	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-06_18-51-05
  done: false
  episode_len_mean: 200.39583333333334
  episode_media: {}
  episode_reward_max: 1.8800000000000023
  episode_reward_mean: -1.2727083333333329
  episode_reward_min: -3.019999999999994
  episodes_this_iter: 48
  episodes_total: 48
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8797739416106136
          entropy_coeff: 0.01
          kl: 0.008081097647910228
          policy_loss: -0.02008541603373666
          total_loss: 0.06370990906770413
          vf_explained_var: -0.22477690875530243
          vf_loss: 0.11097684448913762
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,1,232.092,9996,-1.27271,1.88,-3.02,200.396


Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-06_18-52-53
  done: false
  episode_len_mean: 202.84536082474227
  episode_media: {}
  episode_reward_max: 1.8800000000000023
  episode_reward_mean: -1.085876288659793
  episode_reward_min: -3.019999999999994
  episodes_this_iter: 49
  episodes_total: 97
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8607627041319494
          entropy_coeff: 0.01
          kl: 0.011250534336129502
          policy_loss: -0.02113158804261022
          total_loss: 0.10540727802958244
          vf_explained_var: 0.07498275488615036
          vf_loss: 0.15289638614297932
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,2,340.407,19992,-1.08588,1.88,-3.02,202.845




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-06_18-55-01
  done: false
  episode_len_mean: 204.53
  episode_media: {}
  episode_reward_max: 5.110000000000029
  episode_reward_mean: -0.7794999999999983
  episode_reward_min: -2.849999999999993
  episodes_this_iter: 48
  episodes_total: 145
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8464959556220943
          entropy_coeff: 0.01
          kl: 0.01322233132716843
          policy_loss: -0.02388313355990964
          total_loss: 0.17145574445493966
          vf_explained_var: 0.163921520113945
          vf_loss: 0.22115937053966217
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,3,468.285,29988,-0.7795,5.11,-2.85,204.53




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-06_18-57-27
  done: false
  episode_len_mean: 203.03
  episode_media: {}
  episode_reward_max: 5.110000000000029
  episode_reward_mean: -0.21619999999999634
  episode_reward_min: -3.2299999999999813
  episodes_this_iter: 51
  episodes_total: 196
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.825464665176522
          entropy_coeff: 0.01
          kl: 0.01352720382198351
          policy_loss: -0.025367344358665313
          total_loss: 0.18918303084583618
          vf_explained_var: 0.310432493686676
          vf_loss: 0.24009958113519808
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,4,613.798,39984,-0.2162,5.11,-3.23,203.03


Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-06_18-59-23
  done: false
  episode_len_mean: 205.01
  episode_media: {}
  episode_reward_max: 3.740000000000019
  episode_reward_mean: -0.036899999999995076
  episode_reward_min: -3.2299999999999813
  episodes_this_iter: 47
  episodes_total: 243
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.805906440457727
          entropy_coeff: 0.01
          kl: 0.014333567865945288
          policy_loss: -0.02555209455979813
          total_loss: 0.2307128119521225
          vf_explained_var: 0.2717740833759308
          vf_loss: 0.2814572568377878
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,5,730.129,49980,-0.0369,3.74,-3.23,205.01




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-06_19-01-44
  done: false
  episode_len_mean: 207.5
  episode_media: {}
  episode_reward_max: 5.150000000000001
  episode_reward_mean: 0.14130000000000598
  episode_reward_min: -3.389999999999989
  episodes_this_iter: 49
  episodes_total: 292
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7795258112442798
          entropy_coeff: 0.01
          kl: 0.016481228214389285
          policy_loss: -0.025772445094891083
          total_loss: 0.24201763234873358
          vf_explained_var: 0.44720637798309326
          vf_loss: 0.29228908826206995
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,6,870.434,59976,0.1413,5.15,-3.39,207.5


Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-06_19-03-37
  done: false
  episode_len_mean: 210.17
  episode_media: {}
  episode_reward_max: 5.150000000000001
  episode_reward_mean: 0.4144000000000077
  episode_reward_min: -3.389999999999989
  episodes_this_iter: 47
  episodes_total: 339
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.758192437326806
          entropy_coeff: 0.01
          kl: 0.01535587565698042
          policy_loss: -0.029103549730637644
          total_loss: 0.1653280968626595
          vf_explained_var: 0.6362412571907043
          vf_loss: 0.21894239662294715
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,7,983.809,69972,0.4144,5.15,-3.39,210.17




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-06_19-06-39
  done: false
  episode_len_mean: 206.51
  episode_media: {}
  episode_reward_max: 5.510000000000003
  episode_reward_mean: 0.5555000000000077
  episode_reward_min: -3.2299999999999858
  episodes_this_iter: 50
  episodes_total: 389
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.737486555026128
          entropy_coeff: 0.01
          kl: 0.017114823568895734
          policy_loss: -0.030752749828637665
          total_loss: 0.25078697337920214
          vf_explained_var: 0.5811748504638672
          vf_loss: 0.3054916237361538
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,8,1166.05,79968,0.5555,5.51,-3.23,206.51




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-11-06_19-08-46
  done: false
  episode_len_mean: 204.68
  episode_media: {}
  episode_reward_max: 5.750000000000001
  episode_reward_mean: 0.5717000000000075
  episode_reward_min: -3.2299999999999858
  episodes_this_iter: 48
  episodes_total: 437
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.729132144471519
          entropy_coeff: 0.01
          kl: 0.017688247730380788
          policy_loss: -0.03238563204072734
          total_loss: 0.18445001798084912
          vf_explained_var: 0.5972334742546082
          vf_loss: 0.24058932141265554
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,9,1293.1,89964,0.5717,5.75,-3.23,204.68




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-11-06_19-11-22
  done: false
  episode_len_mean: 203.32
  episode_media: {}
  episode_reward_max: 5.750000000000001
  episode_reward_mean: 0.7403000000000078
  episode_reward_min: -2.66999999999999
  episodes_this_iter: 51
  episodes_total: 488
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7009323205703346
          entropy_coeff: 0.01
          kl: 0.01920845411184645
          policy_loss: -0.03443845350963947
          total_loss: 0.2022124265296719
          vf_explained_var: 0.6428674459457397
          vf_loss: 0.25981851301641545
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,10,1448.71,99960,0.7403,5.75,-2.67,203.32




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-11-06_19-14-02
  done: false
  episode_len_mean: 202.2
  episode_media: {}
  episode_reward_max: 9.16000000000001
  episode_reward_mean: 0.9805000000000085
  episode_reward_min: -2.099999999999999
  episodes_this_iter: 48
  episodes_total: 536
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.682330435769171
          entropy_coeff: 0.01
          kl: 0.019681308826636493
          policy_loss: -0.0347476195766885
          total_loss: 0.1821190401298814
          vf_explained_var: 0.5556125044822693
          vf_loss: 0.23975370180848826
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,11,1608.44,109956,0.9805,9.16,-2.1,202.2




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-11-06_19-16-48
  done: false
  episode_len_mean: 201.92
  episode_media: {}
  episode_reward_max: 9.16000000000001
  episode_reward_mean: 1.0015000000000098
  episode_reward_min: -2.859999999999985
  episodes_this_iter: 50
  episodes_total: 586
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.6726238466735577
          entropy_coeff: 0.01
          kl: 0.020982473071253113
          policy_loss: -0.03403046653629878
          total_loss: 0.1767531195989786
          vf_explained_var: 0.6581957936286926
          vf_loss: 0.23331332987604234
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,12,1774.54,119952,1.0015,9.16,-2.86,201.92




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-11-06_19-19-48
  done: false
  episode_len_mean: 197.73
  episode_media: {}
  episode_reward_max: 8.200000000000028
  episode_reward_mean: 0.97300000000001
  episode_reward_min: -2.859999999999985
  episodes_this_iter: 50
  episodes_total: 636
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.6776886135085016
          entropy_coeff: 0.01
          kl: 0.019911472287438968
          policy_loss: -0.03617760359317574
          total_loss: 0.19491454140943848
          vf_explained_var: 0.6117089986801147
          vf_loss: 0.25189558885927893
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948
  iterations_since_restore: 13


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,13,1955.09,129948,0.973,8.2,-2.86,197.73




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-11-06_19-22-29
  done: false
  episode_len_mean: 201.01
  episode_media: {}
  episode_reward_max: 7.310000000000036
  episode_reward_mean: 1.0113000000000105
  episode_reward_min: -2.579999999999983
  episodes_this_iter: 49
  episodes_total: 685
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.66624738130814
          entropy_coeff: 0.01
          kl: 0.01743054454060612
          policy_loss: -0.040523816008343656
          total_loss: 0.13664065229778105
          vf_explained_var: 0.6571878790855408
          vf_loss: 0.19859777813793247
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944
  iterations_since_restore: 14


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,14,2115.36,139944,1.0113,7.31,-2.58,201.01




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-11-06_19-26-13
  done: false
  episode_len_mean: 201.55
  episode_media: {}
  episode_reward_max: 7.310000000000036
  episode_reward_mean: 1.4724000000000117
  episode_reward_min: -1.4000000000000017
  episodes_this_iter: 51
  episodes_total: 736
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.648270601288885
          entropy_coeff: 0.01
          kl: 0.020819217417366143
          policy_loss: -0.03853266974990694
          total_loss: 0.21214430841943646
          vf_explained_var: 0.6640358567237854
          vf_loss: 0.2709139182845242
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
  iterations_since_restore: 15

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,15,2339.35,149940,1.4724,7.31,-1.4,201.55




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-11-06_19-29-56
  done: false
  episode_len_mean: 197.24
  episode_media: {}
  episode_reward_max: 5.8100000000000005
  episode_reward_mean: 1.6902000000000124
  episode_reward_min: -1.4000000000000017
  episodes_this_iter: 50
  episodes_total: 786
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.6299683621805956
          entropy_coeff: 0.01
          kl: 0.017379509153834263
          policy_loss: -0.038695568177435136
          total_loss: 0.19416306102170777
          vf_explained_var: 0.5841421484947205
          vf_loss: 0.2513375339536076
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,16,2562.6,159936,1.6902,5.81,-1.4,197.24




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-11-06_19-34-48
  done: false
  episode_len_mean: 189.79
  episode_media: {}
  episode_reward_max: 9.800000000000002
  episode_reward_mean: 1.779900000000012
  episode_reward_min: -2.6599999999999904
  episodes_this_iter: 55
  episodes_total: 841
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.6029515926654523
          entropy_coeff: 0.01
          kl: 0.01943848362939168
          policy_loss: -0.03827319459305105
          total_loss: 0.24910938075035174
          vf_explained_var: 0.721899688243866
          vf_loss: 0.30466477267571496
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,17,2854.98,169932,1.7799,9.8,-2.66,189.79




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-11-06_19-39-26
  done: false
  episode_len_mean: 174.59
  episode_media: {}
  episode_reward_max: 9.800000000000002
  episode_reward_mean: 2.554600000000013
  episode_reward_min: -2.6599999999999904
  episodes_this_iter: 58
  episodes_total: 899
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.60669840013879
          entropy_coeff: 0.01
          kl: 0.019316274494037944
          policy_loss: -0.042949062480758395
          total_loss: 0.2120262338198785
          vf_explained_var: 0.6999718546867371
          vf_loss: 0.2723499565839003
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,18,3132.61,179928,2.5546,9.8,-2.66,174.59




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-11-06_19-44-16
  done: false
  episode_len_mean: 175.21
  episode_media: {}
  episode_reward_max: 6.260000000000035
  episode_reward_mean: 2.571500000000014
  episode_reward_min: -1.2400000000000013
  episodes_this_iter: 54
  episodes_total: 953
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.5879794558908187
          entropy_coeff: 0.01
          kl: 0.018638033193466103
          policy_loss: -0.044602923478899346
          total_loss: 0.14832181873627834
          vf_explained_var: 0.7777464985847473
          vf_loss: 0.21041742228608357
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,19,3422.77,189924,2.5715,6.26,-1.24,175.21




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-11-06_19-48-58
  done: false
  episode_len_mean: 175.35
  episode_media: {}
  episode_reward_max: 9.800000000000002
  episode_reward_mean: 2.955000000000016
  episode_reward_min: -1.1200000000000017
  episodes_this_iter: 58
  episodes_total: 1011
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.5643301436024855
          entropy_coeff: 0.01
          kl: 0.02108570113036361
          policy_loss: -0.03949755506319368
          total_loss: 0.21519595820806983
          vf_explained_var: 0.7304996848106384
          vf_loss: 0.2708482492301199
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,20,3704.47,199920,2.955,9.8,-1.12,175.35




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-11-06_19-54-46
  done: false
  episode_len_mean: 158.21
  episode_media: {}
  episode_reward_max: 9.82
  episode_reward_mean: 2.7658000000000103
  episode_reward_min: -2.259999999999996
  episodes_this_iter: 66
  episodes_total: 1077
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.55096442780943
          entropy_coeff: 0.01
          kl: 0.017055343247626777
          policy_loss: -0.04538892504050691
          total_loss: 0.1635411264311172
          vf_explained_var: 0.7916896343231201
          vf_loss: 0.22292733777823867
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  iterations_since_restore: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,21,4052.55,209916,2.7658,9.82,-2.26,158.21




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 219912
  custom_metrics: {}
  date: 2021-11-06_19-59-41
  done: false
  episode_len_mean: 160.73
  episode_media: {}
  episode_reward_max: 9.82
  episode_reward_mean: 2.649900000000011
  episode_reward_min: -2.259999999999996
  episodes_this_iter: 58
  episodes_total: 1135
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.5369529226906278
          entropy_coeff: 0.01
          kl: 0.01733013650121731
          policy_loss: -0.048565820596602735
          total_loss: 0.1362491008419639
          vf_explained_var: 0.789801836013794
          vf_loss: 0.19848660783380526
    num_agent_steps_sampled: 219912
    num_agent_steps_trained: 219912
    num_steps_sampled: 219912
    num_steps_trained: 219912
  iterations_since_restore: 2

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,22,4347.43,219912,2.6499,9.82,-2.26,160.73




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 229908
  custom_metrics: {}
  date: 2021-11-06_20-04-01
  done: false
  episode_len_mean: 176.5
  episode_media: {}
  episode_reward_max: 9.670000000000002
  episode_reward_mean: 2.677900000000015
  episode_reward_min: -2.7199999999999855
  episodes_this_iter: 53
  episodes_total: 1188
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.556369695296654
          entropy_coeff: 0.01
          kl: 0.0180008313053207
          policy_loss: -0.04409573866197696
          total_loss: 0.16099591307007732
          vf_explained_var: 0.7370864152908325
          vf_loss: 0.2185047865487062
    num_agent_steps_sampled: 229908
    num_agent_steps_trained: 229908
    num_steps_sampled: 229908
    num_steps_trained: 229908
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,23,4607.77,229908,2.6779,9.67,-2.72,176.5




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 239904
  custom_metrics: {}
  date: 2021-11-06_20-08-21
  done: false
  episode_len_mean: 179.93
  episode_media: {}
  episode_reward_max: 9.670000000000002
  episode_reward_mean: 2.696100000000016
  episode_reward_min: -3.509999999999989
  episodes_this_iter: 58
  episodes_total: 1246
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.521818524751908
          entropy_coeff: 0.01
          kl: 0.020420232074984928
          policy_loss: -0.042488070578975044
          total_loss: 0.19295878641577996
          vf_explained_var: 0.6995707154273987
          vf_loss: 0.24688138448529773
    num_agent_steps_sampled: 239904
    num_agent_steps_trained: 239904
    num_steps_sampled: 239904
    num_steps_trained: 239904
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,24,4867.6,239904,2.6961,9.67,-3.51,179.93




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 249900
  custom_metrics: {}
  date: 2021-11-06_20-12-14
  done: false
  episode_len_mean: 176.03
  episode_media: {}
  episode_reward_max: 9.89
  episode_reward_mean: 2.692000000000014
  episode_reward_min: -3.509999999999989
  episodes_this_iter: 55
  episodes_total: 1301
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.540516776712532
          entropy_coeff: 0.01
          kl: 0.015767358597539527
          policy_loss: -0.04937759722288475
          total_loss: 0.1938657844104828
          vf_explained_var: 0.7294347882270813
          vf_loss: 0.25268409821467525
    num_agent_steps_sampled: 249900
    num_agent_steps_trained: 249900
    num_steps_sampled: 249900
    num_steps_trained: 249900
  iterations_since_restore: 25
  node_ip:

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,25,5100.54,249900,2.692,9.89,-3.51,176.03




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 259896
  custom_metrics: {}
  date: 2021-11-06_20-17-32
  done: false
  episode_len_mean: 173.48
  episode_media: {}
  episode_reward_max: 9.88
  episode_reward_mean: 3.199800000000016
  episode_reward_min: -1.3399999999999992
  episodes_this_iter: 60
  episodes_total: 1361
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.4983164151509603
          entropy_coeff: 0.01
          kl: 0.016843786240070154
          policy_loss: -0.04616760919109369
          total_loss: 0.18509108203455296
          vf_explained_var: 0.8200007677078247
          vf_loss: 0.23918752091116885
    num_agent_steps_sampled: 259896
    num_agent_steps_trained: 259896
    num_steps_sampled: 259896
    num_steps_trained: 259896
  iterations_since_restore: 26
  node_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,26,5418.17,259896,3.1998,9.88,-1.34,173.48




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 269892
  custom_metrics: {}
  date: 2021-11-06_20-22-17
  done: false
  episode_len_mean: 174.11
  episode_media: {}
  episode_reward_max: 9.920000000000002
  episode_reward_mean: 3.238700000000017
  episode_reward_min: -1.3399999999999992
  episodes_this_iter: 57
  episodes_total: 1418
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.498418702834692
          entropy_coeff: 0.01
          kl: 0.015112260057591826
          policy_loss: -0.051145826303997104
          total_loss: 0.16731830333224218
          vf_explained_var: 0.809043824672699
          vf_loss: 0.22814715183698214
    num_agent_steps_sampled: 269892
    num_agent_steps_trained: 269892
    num_steps_sampled: 269892
    num_steps_trained: 269892
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,27,5702.98,269892,3.2387,9.92,-1.34,174.11




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 279888
  custom_metrics: {}
  date: 2021-11-06_20-29-01
  done: false
  episode_len_mean: 156.71
  episode_media: {}
  episode_reward_max: 9.9
  episode_reward_mean: 3.194700000000013
  episode_reward_min: -1.1500000000000015
  episodes_this_iter: 66
  episodes_total: 1484
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.4896753816523103
          entropy_coeff: 0.01
          kl: 0.0169716268013502
          policy_loss: -0.04931929438239616
          total_loss: 0.20018481615946715
          vf_explained_var: 0.7724143266677856
          vf_loss: 0.2572170891281631
    num_agent_steps_sampled: 279888
    num_agent_steps_trained: 279888
    num_steps_sampled: 279888
    num_steps_trained: 279888
  iterations_since_restore: 28
  node_ip: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,28,6106.84,279888,3.1947,9.9,-1.15,156.71




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 289884
  custom_metrics: {}
  date: 2021-11-06_20-34-32
  done: false
  episode_len_mean: 152.58
  episode_media: {}
  episode_reward_max: 9.880000000000003
  episode_reward_mean: 3.416800000000013
  episode_reward_min: -2.2800000000000016
  episodes_this_iter: 61
  episodes_total: 1545
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.482929065492418
          entropy_coeff: 0.01
          kl: 0.015419580001220906
          policy_loss: -0.054298016634316014
          total_loss: 0.15187304815452576
          vf_explained_var: 0.7938852310180664
          vf_loss: 0.21538802972143023
    num_agent_steps_sampled: 289884
    num_agent_steps_trained: 289884
    num_steps_sampled: 289884
    num_steps_trained: 289884
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,29,6438.57,289884,3.4168,9.88,-2.28,152.58




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 299880
  custom_metrics: {}
  date: 2021-11-06_20-40-27
  done: false
  episode_len_mean: 144.52
  episode_media: {}
  episode_reward_max: 9.88
  episode_reward_mean: 3.882400000000013
  episode_reward_min: -2.2800000000000016
  episodes_this_iter: 70
  episodes_total: 1615
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.47923766079112
          entropy_coeff: 0.01
          kl: 0.01734041388953302
          policy_loss: -0.047824206801019925
          total_loss: 0.2063232985110237
          vf_explained_var: 0.8125782608985901
          vf_loss: 0.2613827116023272
    num_agent_steps_sampled: 299880
    num_agent_steps_trained: 299880
    num_steps_sampled: 299880
    num_steps_trained: 299880
  iterations_since_restore: 30
  node_ip: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,30,6793,299880,3.8824,9.88,-2.28,144.52




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 309876
  custom_metrics: {}
  date: 2021-11-06_20-45-58
  done: false
  episode_len_mean: 152.85
  episode_media: {}
  episode_reward_max: 9.89
  episode_reward_mean: 3.6148000000000144
  episode_reward_min: -0.8099999999999868
  episodes_this_iter: 63
  episodes_total: 1678
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.492918090535025
          entropy_coeff: 0.01
          kl: 0.014085976693629224
          policy_loss: -0.05295753919352324
          total_loss: 0.14076466212351607
          vf_explained_var: 0.7937976717948914
          vf_loss: 0.20438932918935504
    num_agent_steps_sampled: 309876
    num_agent_steps_trained: 309876
    num_steps_sampled: 309876
    num_steps_trained: 309876
  iterations_since_restore: 31
  node_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,31,7124.3,309876,3.6148,9.89,-0.81,152.85




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 319872
  custom_metrics: {}
  date: 2021-11-06_20-51-24
  done: false
  episode_len_mean: 160.13
  episode_media: {}
  episode_reward_max: 9.960000000000031
  episode_reward_mean: 3.5064000000000153
  episode_reward_min: -1.3000000000000018
  episodes_this_iter: 65
  episodes_total: 1743
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.478540201472421
          entropy_coeff: 0.01
          kl: 0.01506501062794037
          policy_loss: -0.05790178179104104
          total_loss: 0.09355465964788301
          vf_explained_var: 0.8439602851867676
          vf_loss: 0.16098851958592222
    num_agent_steps_sampled: 319872
    num_agent_steps_trained: 319872
    num_steps_sampled: 319872
    num_steps_trained: 319872
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,32,7449.74,319872,3.5064,9.96,-1.3,160.13




Result for PPO_my_env_ef0ef_00000:
  agent_timesteps_total: 329868
  custom_metrics: {}
  date: 2021-11-06_20-56-25
  done: false
  episode_len_mean: 155.57
  episode_media: {}
  episode_reward_max: 9.960000000000031
  episode_reward_mean: 3.8893000000000155
  episode_reward_min: -1.3599999999999883
  episodes_this_iter: 64
  episodes_total: 1807
  experiment_id: d164439ba9304db690d8b387f5275ed7
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.461630885417645
          entropy_coeff: 0.01
          kl: 0.01599660218432368
          policy_loss: -0.05600094480646981
          total_loss: 0.13853544527266778
          vf_explained_var: 0.8291873335838318
          vf_loss: 0.20295613838566673
    num_agent_steps_sampled: 329868
    num_agent_steps_trained: 329868
    num_steps_sampled: 329868
    num_steps_trained: 329868
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_ef0ef_00000,RUNNING,192.168.3.5:480903,33,7750.86,329868,3.8893,9.96,-1.36,155.57


