In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder that returns a flattened feature vector.

    Six ``Conv2d(kernel_size=2, stride=2, padding=0)`` layers, each followed
    by ``ELU``, take the channels 3 -> 32 -> 32 -> 64 -> 128 -> 256 -> 512
    while halving each spatial side.  A final ``Flatten`` collapses every
    axis except the batch axis, so a 64x64 input yields ``(batch, 512)``.
    """

    def __init__(self):
        super().__init__()

        channel_plan = (3, 32, 32, 64, 128, 256, 512)
        stages = []
        for in_ch, out_ch in zip(channel_plan[:-1], channel_plan[1:]):
            stages.append(nn.Conv2d(in_ch, out_ch, kernel_size=2, stride=2, padding=0))
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # The container stays under the attribute name ``cnn`` with the same
        # layer order, so state_dict keys (cnn.0.weight, cnn.2.weight, ...)
        # are unchanged.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the conv stack to ``x`` (batch, 3, H, W) and return the flattened result."""
        return self.cnn(x)

In [3]:
class TransformerEncoderLayer(nn.Module):
    """Multi-head attention followed by a two-layer feed-forward, with residual sums.

    No normalization or dropout is applied.  ``query``, ``key`` and ``value``
    are passed separately, so the layer can be used for cross-attention.

    Args:
        d_model: Embedding width of the tokens.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden width of the feed-forward part; ``4 * d_model``
            when ``None``.
        activation: Zero-argument factory for the feed-forward non-linearity.
    """

    def __init__(self, d_model, num_heads, dim_feedforward=None, activation=nn.ELU):
        super().__init__()
        hidden_dim = 4 * d_model if dim_feedforward is None else dim_feedforward
        # batch_first=True: tensors are laid out as (batch, tokens, d_model).
        self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=0.0, batch_first=True)
        self.linear1 = nn.Linear(d_model, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, d_model)
        self.activation = activation()

    def forward(self, query, key, value):
        """Return ``query + attn + feed_forward(attn)``, shaped like ``query``."""
        attended, _ = self.self_attn(query=query, key=key, value=value)
        # NOTE(review): the feed-forward consumes the raw attention output, not
        # the residual sum (query + attended) -- confirm this is intentional.
        ff_out = self.linear2(self.activation(self.linear1(attended)))
        return query + attended + ff_out

In [4]:
class FusionNet(nn.Module):
    """Fuse a 3-D target-grid feature volume with an image feature vector.

    The image vector (width 512) is expanded to 2048 values by ``img_preproc``.
    Four stages follow.  In each stage the target volume is flattened to one
    token per voxel, the image vector is re-chunked into tokens of the same
    channel width, the two token sets cross-attend to each other, and the
    target volume is then passed through an unpadded ``Conv3d(kernel_size=3)``
    + ``ELU`` that doubles the channels and shrinks each spatial side by 2.
    A final ``MaxPool3d((1, 3, 3))`` and flatten produce the output.

    With the defaults and a ``(batch, 8, 9, 11, 11)`` target the grid shrinks
    9x11x11 -> 7x9x9 -> 5x7x7 -> 3x5x5 -> 1x3x3, the pool reduces it to 1x1x1,
    and the output is ``(batch, 16 * d_model) = (batch, 128)``.

    Args:
        d_model: Channel count of the incoming target volume.  ``8 * d_model``
            must divide 2048 so the image vector can be split into tokens at
            every stage.
        num_heads: Attention heads used by every cross-attention layer.

    Raises:
        ValueError: if ``8 * d_model`` does not divide 2048.
    """

    def __init__(self, d_model=8, num_heads=1):
        super().__init__()
        img_width = 2048
        if d_model <= 0 or img_width % (8 * d_model) != 0:
            raise ValueError(
                f"8 * d_model must divide {img_width}, got d_model={d_model}"
            )
        self.d_model = d_model

        self.img_preproc = nn.Sequential(
            nn.Linear(512, img_width),
            nn.ELU(),
        )

        self.target_cross_attn_1 = TransformerEncoderLayer(d_model=d_model, num_heads=num_heads)
        self.img_cross_attn_1 = TransformerEncoderLayer(d_model=d_model, num_heads=num_heads)
        self.conv_1 = nn.Conv3d(d_model, 2 * d_model, kernel_size=3, stride=1)
        self.act_1 = nn.ELU()

        self.target_cross_attn_2 = TransformerEncoderLayer(d_model=2 * d_model, num_heads=num_heads)
        self.img_cross_attn_2 = TransformerEncoderLayer(d_model=2 * d_model, num_heads=num_heads)
        self.conv_2 = nn.Conv3d(2 * d_model, 4 * d_model, kernel_size=3, stride=1)
        self.act_2 = nn.ELU()

        self.target_cross_attn_3 = TransformerEncoderLayer(d_model=4 * d_model, num_heads=num_heads)
        self.img_cross_attn_3 = TransformerEncoderLayer(d_model=4 * d_model, num_heads=num_heads)
        self.conv_3 = nn.Conv3d(4 * d_model, 8 * d_model, kernel_size=3, stride=1)
        self.act_3 = nn.ELU()

        self.target_cross_attn_4 = TransformerEncoderLayer(d_model=8 * d_model, num_heads=num_heads)
        # img_cross_attn_4 is not used by forward() (its output never reached
        # the returned features); it stays registered so that existing
        # checkpoints keep loading with identical state_dict keys.
        self.img_cross_attn_4 = TransformerEncoderLayer(d_model=8 * d_model, num_heads=num_heads)
        self.conv_4 = nn.Conv3d(8 * d_model, 16 * d_model, kernel_size=3, stride=1)
        self.act_4 = nn.ELU()

        self.max_pool = nn.MaxPool3d(kernel_size=(1, 3, 3))

    def forward(self, target, img_features):
        """Run the four fusion stages and return flattened target features.

        Args:
            target: ``(batch, d_model, D, H, W)`` feature volume; each spatial
                side must be large enough to survive four unpadded 3x3x3 convs.
            img_features: ``(batch, 512)`` image feature vector.

        Returns:
            ``(batch, F)`` tensor holding the pooled, flattened target volume.
        """
        batch_size = target.shape[0]
        img = self.img_preproc(img_features)  # (batch, 2048)

        stages = (
            (self.target_cross_attn_1, self.img_cross_attn_1, self.conv_1, self.act_1),
            (self.target_cross_attn_2, self.img_cross_attn_2, self.conv_2, self.act_2),
            (self.target_cross_attn_3, self.img_cross_attn_3, self.conv_3, self.act_3),
            (self.target_cross_attn_4, self.img_cross_attn_4, self.conv_4, self.act_4),
        )
        last_stage = len(stages) - 1

        for stage_idx, (target_attn, img_attn, conv, act) in enumerate(stages):
            # Shapes are read from the tensor instead of being hard-coded, so
            # any valid d_model / grid size works.
            channels = target.shape[1]
            spatial = target.shape[2:]

            # Channels-last, then one token per voxel: (batch, D*H*W, channels).
            target_tokens = target.permute(0, 2, 3, 4, 1).reshape(batch_size, -1, channels)
            # Re-chunk the image vector into tokens of matching width.
            img_tokens = img.reshape(batch_size, -1, channels)

            fused = target_attn(query=target_tokens, key=img_tokens, value=img_tokens)
            if stage_idx < last_stage:
                # Both directions attend to the *pre-attention* tokens of the
                # other modality.  The image branch is skipped in the last
                # stage because nothing downstream reads it.
                img = img_attn(
                    query=img_tokens, key=target_tokens, value=target_tokens
                ).reshape(batch_size, -1)

            # Back to (batch, channels, D, H, W) for the 3-D convolution.
            fused = fused.reshape(batch_size, *spatial, channels).permute(0, 4, 1, 2, 3)
            target = act(conv(fused))

        target = self.max_pool(target)
        return target.reshape(batch_size, -1)

In [5]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model combining a pretrained image encoder with a target-grid encoder.

    ``forward`` reads ``obs['pov']`` and ``obs['target_grid']``.  The image goes
    through ``VisualEncoder`` (evaluated under ``torch.no_grad()``), the grid is
    one-hot encoded into 7 classes and passed through a 1x1x1 ``Conv3d``, and
    ``FusionNet`` merges the two into a 128-wide feature vector that feeds the
    action and value heads.

    Args:
        obs_space, action_space, num_outputs, model_config, name: forwarded
            unchanged to ``TorchModelV2.__init__``.  ``action_space.n`` sets the
            width of the action head.
        encoder_weights_path: File loaded with ``torch.load`` to initialise
            ``VisualEncoder``.  Defaults to the original hard-coded location and
            can be overridden through ``custom_model_config``.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name,
                 encoder_weights_path="/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth"):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        self.visual_encoder = VisualEncoder()
        # Load on CPU first; the module is moved to the GPU below if available.
        self.visual_encoder.load_state_dict(
            torch.load(encoder_weights_path, map_location=torch.device('cpu'))
        )
        # One-hot grid (7 classes) -> 8 channels, matching FusionNet's default d_model.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 8, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        # Width of FusionNet()'s output for a 9x11x11 grid (16 * d_model = 128).
        policy_hidden_dim = 128
        self.policy_network = FusionNet()

        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate cached by forward() for value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Returns:
            ``(logits, state)`` where ``logits`` is ``(batch, action_space.n)``
            and ``state`` is passed through untouched.
        """
        obs = input_dict['obs']

        # Tensor.cuda() returns a copy rather than moving in place, so the
        # previous bare ``pov.cuda()`` / ``target.cuda()`` calls had no effect.
        # Move inputs to wherever the model's parameters currently live, which
        # is also correct if the model has been moved since construction.
        device = next(self.parameters()).device

        # pov is indexed as (batch, H, W, C) here; convert to channels-first in [0, 1].
        pov = obs['pov'].to(device).permute(0, 3, 1, 2).float() / 255.0
        # (batch, D, H, W) class ids -> (batch, 7, D, H, W) one-hot volume.
        target = one_hot(obs['target_grid'].to(device).long(), num_classes=7)
        target = target.permute(0, 4, 1, 2, 3).float()

        # No gradients are propagated into the image encoder.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)

        features = self.policy_network(target_features, visual_features)

        action = self.action_head(features)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the most recent ``forward`` call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [6]:
# Register MyModelClass with RLlib under the name "my_torch_model"; the
# training config selects it through config["model"]["custom_model"].
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [7]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing ``pov``, ``inventory``, ``compass`` and, optionally, ``target_grid``.

    Args:
        env: Environment to wrap.
        include_target: When true, ``target_grid`` (shape ``(9, 11, 11)``,
            values 0..6) is declared in ``observation_space`` and included in
            every observation.  When false it is left out of both, so the
            returned dict always matches the declared space.
    """

    def __init__(self, env, include_target=False):
        super().__init__(env)
        self.include_target = include_target
        spaces = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            spaces['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(spaces)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict described by ``observation_space``.

        The target grid is taken from ``info['target_grid']`` when present (the
        key is removed from ``info``).  If ``info`` is given without that key,
        an error is logged, a reset is requested when the unwrapped env offers
        ``should_reset``, and the grid is read from the current task.  If
        ``info`` is ``None`` the grid is read from the current task directly.

        ``reward`` and ``done`` are accepted but not used.
        """
        if info is not None and 'target_grid' in info:
            target_grid = info.pop('target_grid')
        else:
            if info is not None:
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
            target_grid = self.env.unwrapped.tasks.current.target_grid

        observation = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Previously 'target_grid' was always returned, even when the space
        # did not declare it; emit it only when it is part of the space.
        if self.include_target:
            observation['target_grid'] = target_grid
        return observation

In [8]:
import os
# Select the GPU for this process: enumerate devices in PCI bus order and
# expose only device 0.  These variables are read when CUDA is initialised,
# so they must be set before the first CUDA call.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

# Task ids 'C1' .. 'C155', with 'C38' left out.
tasks = [f'C{task_no}' for task_no in range(1, 156) if task_no != 38]
    
class RewardWrapper(gym.RewardWrapper):
    """Rescale rewards: 0 becomes -0.01, +/-1 becomes +/-0.1, anything else is unchanged."""

    def __init__(self, env):
        super().__init__(env)

    def reward(self, rew):
        """Return the shaped value for the raw reward ``rew``."""
        if rew == 0:
            # A zero reward is turned into a small negative one.
            return -0.01
        # Unit-magnitude rewards are divided by ten; others pass through.
        return rew / 10 if abs(rew) == 1 else rew
    
def env_creator(env_config):
    """Build the wrapped 'IGLUSilentBuilder-v0' environment.

    The base env is created with ``max_steps=250`` and given the module-level
    ``tasks`` preset, then wrapped (innermost first) with
    ``VisualObservationWrapper`` (target grid included), ``SelectAndPlace``,
    ``Discretization`` over the 'human-level' flat action space, and
    ``RewardWrapper``.

    ``env_config`` is accepted but not read.
    """
    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=250)
    base_env.update_taskset(TaskSet(preset=tasks))

    with_obs = VisualObservationWrapper(base_env, include_target=True)
    with_actions = Discretization(
        SelectAndPlace(with_obs),
        flat_action_space('human-level'),
    )
    return RewardWrapper(with_actions)

from ray.tune.registry import register_env
# Make env_creator available under the id "my_env", which the training
# config passes as config["env"].
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Weights & Biases project and run name used by WandbLogger.
wandb_settings = {
    "project": "IGLU-Minecraft",
    "name": "PPO All Tasks pretrained (visual pretrained AngelaCNN + CrossAttn) (3 noops after placement) r: -0.01 div10",
}

# PPO trainer configuration.  "lr" and "gamma" are not set here, so the
# trainer's own defaults apply.
ppo_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 3,
    "sgd_minibatch_size": 128,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 5_000,
    "model": {
        # Custom model registered above.
        "custom_model": "my_torch_model",
        # Extra kwargs passed to the model's constructor.
        "custom_model_config": {},
    },
    "logger_config": {
        "wandb": wandb_settings,
    },
}

# Checkpoint every 5 iterations (and at the end), keeping 50 checkpoints.
analysis = tune.run(
    PPOTrainer,
    config=ppo_config,
    loggers=[WandbLogger],
    local_dir="/IGLU-Minecraft/checkpoints/all_tasks_cross_attn",
    keep_checkpoints_num=50,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)

2021-11-14 14:38:42,674	INFO wandb.py:170 -- Already logged into W&B.
2021-11-14 14:38:42,686	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_907c1_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=154354)[0m 2021-11-14 14:38:46,158	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=154354)[0m 2021-11-14 14:38:46,158	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=154354)[0m 2021-11-14 14:38:54,731	INFO trainable.py:109 -- Trainable.setup took 11.081 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-14_14-45-00
  done: false
  episode_len_mean: 99.03030303030303
  episode_media: {}
  episode_reward_max: 4.700000000000003
  episode_reward_mean: -0.5804040404040409
  episode_reward_min: -1.450000000000001
  episodes_this_iter: 99
  episodes_total: 99
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 5.000000000000001e-05
          entropy: 2.8841074453459847
          entropy_coeff: 0.01
          kl: 0.004704763754277773
          policy_loss: -0.011509279726853228
          total_loss: 0.040411061462030835
          vf_explained_var: -0.3121291697025299
          vf_loss: 0.07982046209319503
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,1,366.176,9996,-0.580404,4.7,-1.45,99.0303


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-14_14-49-26
  done: false
  episode_len_mean: 100.59
  episode_media: {}
  episode_reward_max: 4.930000000000001
  episode_reward_mean: -0.7361000000000005
  episode_reward_min: -1.6000000000000008
  episodes_this_iter: 100
  episodes_total: 199
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.872266720400916
          entropy_coeff: 0.01
          kl: 0.007915726438018678
          policy_loss: -0.01592555729767833
          total_loss: 0.020403559366241098
          vf_explained_var: -0.07412627339363098
          vf_loss: 0.06426021011280589
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,2,631.563,19992,-0.7361,4.93,-1.6,100.59


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-14_14-53-56
  done: false
  episode_len_mean: 100.37
  episode_media: {}
  episode_reward_max: 4.730000000000006
  episode_reward_mean: 0.15750000000000042
  episode_reward_min: -1.730000000000001
  episodes_this_iter: 99
  episodes_total: 298
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.839373106528551
          entropy_coeff: 0.01
          kl: 0.010519051259605366
          policy_loss: -0.021232346243137478
          total_loss: 0.18197832048749232
          vf_explained_var: 0.0877896323800087
          vf_loss: 0.23055249153393614
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,3,901.434,29988,0.1575,4.73,-1.73,100.37




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-14_14-58-51
  done: false
  episode_len_mean: 97.14563106796116
  episode_media: {}
  episode_reward_max: 4.710000000000005
  episode_reward_mean: 0.5757281553398065
  episode_reward_min: -1.8300000000000007
  episodes_this_iter: 103
  episodes_total: 401
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.811861156194638
          entropy_coeff: 0.01
          kl: 0.011586275796279816
          policy_loss: -0.02427993158284479
          total_loss: 0.2371059584005489
          vf_explained_var: 0.19990791380405426
          vf_loss: 0.28834587312820886
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,4,1196.43,39984,0.575728,4.71,-1.83,97.1456


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-14_15-03-31
  done: false
  episode_len_mean: 97.65686274509804
  episode_media: {}
  episode_reward_max: 8.530000000000014
  episode_reward_mean: 1.14294117647059
  episode_reward_min: -1.800000000000001
  episodes_this_iter: 102
  episodes_total: 503
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.7914847830421903
          entropy_coeff: 0.01
          kl: 0.01291240033164486
          policy_loss: -0.02431462747721463
          total_loss: 0.38614972693065547
          vf_explained_var: 0.2639394998550415
          vf_loss: 0.437087961875348
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,5,1476.74,49980,1.14294,8.53,-1.8,97.6569


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-14_15-08-10
  done: false
  episode_len_mean: 99.87128712871286
  episode_media: {}
  episode_reward_max: 10.260000000000012
  episode_reward_mean: 1.2247524752475272
  episode_reward_min: -1.9499999999999995
  episodes_this_iter: 101
  episodes_total: 604
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.7712131653076564
          entropy_coeff: 0.01
          kl: 0.012632949608454996
          policy_loss: -0.02826809365159044
          total_loss: 0.3065449736855176
          vf_explained_var: 0.37885403633117676
          vf_loss: 0.36126190269541025
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,6,1755.41,59976,1.22475,10.26,-1.95,99.8713




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-14_15-13-05
  done: false
  episode_len_mean: 100.58
  episode_media: {}
  episode_reward_max: 6.530000000000015
  episode_reward_mean: 1.2506000000000028
  episode_reward_min: -1.9600000000000009
  episodes_this_iter: 99
  episodes_total: 703
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.747085771805201
          entropy_coeff: 0.01
          kl: 0.019294186270850836
          policy_loss: -0.034431641933341056
          total_loss: 0.2905691921695048
          vf_explained_var: 0.4129098951816559
          vf_loss: 0.35054227306117486
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,7,2050.6,69972,1.2506,6.53,-1.96,100.58




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-14_15-18-05
  done: false
  episode_len_mean: 99.54455445544555
  episode_media: {}
  episode_reward_max: 4.820000000000013
  episode_reward_mean: 0.9281188118811902
  episode_reward_min: -1.9200000000000013
  episodes_this_iter: 101
  episodes_total: 804
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.09999999999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.716490948302114
          entropy_coeff: 0.01
          kl: 0.0277014662107412
          policy_loss: -0.038532258618352376
          total_loss: 0.23172735154517313
          vf_explained_var: 0.5254629254341125
          vf_loss: 0.2946543720467255
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,8,2350.99,79968,0.928119,4.82,-1.92,99.5446


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-11-14_15-22-46
  done: false
  episode_len_mean: 101.49
  episode_media: {}
  episode_reward_max: 6.870000000000013
  episode_reward_mean: 0.8922000000000024
  episode_reward_min: -2.05
  episodes_this_iter: 98
  episodes_total: 902
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.15
          cur_lr: 5.000000000000001e-05
          entropy: 2.6958542383634128
          entropy_coeff: 0.01
          kl: 0.029198183727480908
          policy_loss: -0.03811441195261084
          total_loss: 0.21304377028439989
          vf_explained_var: 0.549401044845581
          vf_loss: 0.2737369965594739
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  iterations_since_restore: 9
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,9,2631.39,89964,0.8922,6.87,-2.05,101.49


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-11-14_15-27-31
  done: false
  episode_len_mean: 98.76470588235294
  episode_media: {}
  episode_reward_max: 8.750000000000012
  episode_reward_mean: 1.419803921568631
  episode_reward_min: -2.000000000000001
  episodes_this_iter: 102
  episodes_total: 1004
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.22500000000000006
          cur_lr: 5.000000000000001e-05
          entropy: 2.6617645257558578
          entropy_coeff: 0.01
          kl: 0.03276358538475574
          policy_loss: -0.044040175498678134
          total_loss: 0.23528799822327132
          vf_explained_var: 0.5880103707313538
          vf_loss: 0.2985740117099868
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,10,2916.89,99960,1.4198,8.75,-2,98.7647




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-11-14_15-32-35
  done: false
  episode_len_mean: 95.24038461538461
  episode_media: {}
  episode_reward_max: 8.360000000000014
  episode_reward_mean: 1.7176923076923116
  episode_reward_min: -1.6100000000000005
  episodes_this_iter: 104
  episodes_total: 1108
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.33749999999999997
          cur_lr: 5.000000000000001e-05
          entropy: 2.6492050871889816
          entropy_coeff: 0.01
          kl: 0.029999021829840426
          policy_loss: -0.04716765971070267
          total_loss: 0.20583039400322983
          vf_explained_var: 0.6615533828735352
          vf_loss: 0.2693654337563576
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,11,3220.9,109956,1.71769,8.36,-1.61,95.2404


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-11-14_15-37-24
  done: false
  episode_len_mean: 96.23076923076923
  episode_media: {}
  episode_reward_max: 8.760000000000005
  episode_reward_mean: 1.5182692307692345
  episode_reward_min: -2.0200000000000005
  episodes_this_iter: 104
  episodes_total: 1212
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.50625
          cur_lr: 5.000000000000001e-05
          entropy: 2.6219841663654035
          entropy_coeff: 0.01
          kl: 0.028015319427843768
          policy_loss: -0.046292343868826254
          total_loss: 0.19925196828304703
          vf_explained_var: 0.6058682203292847
          vf_loss: 0.2575813973823992
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,12,3509.05,119952,1.51827,8.76,-2.02,96.2308


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-11-14_15-42-11
  done: false
  episode_len_mean: 96.58653846153847
  episode_media: {}
  episode_reward_max: 10.190000000000015
  episode_reward_mean: 1.6795192307692342
  episode_reward_min: -2.1100000000000003
  episodes_this_iter: 104
  episodes_total: 1316
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.7593750000000002
          cur_lr: 5.000000000000001e-05
          entropy: 2.6332171709109575
          entropy_coeff: 0.01
          kl: 0.024243443993249278
          policy_loss: -0.050442353914627154
          total_loss: 0.24711239260569628
          vf_explained_var: 0.5755695104598999
          vf_loss: 0.3054770540095802
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,13,3796.48,129948,1.67952,10.19,-2.11,96.5865




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-11-14_15-47-16
  done: false
  episode_len_mean: 95.20192307692308
  episode_media: {}
  episode_reward_max: 6.870000000000012
  episode_reward_mean: 1.2180769230769264
  episode_reward_min: -1.8100000000000007
  episodes_this_iter: 104
  episodes_total: 1420
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.1390624999999999
          cur_lr: 5.000000000000001e-05
          entropy: 2.614535803468818
          entropy_coeff: 0.01
          kl: 0.02094004718509212
          policy_loss: -0.049892968075891206
          total_loss: 0.17042035843667566
          vf_explained_var: 0.6678788661956787
          vf_loss: 0.2226066606091415
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,14,4101.07,139944,1.21808,6.87,-1.81,95.2019




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-11-14_15-52-12
  done: false
  episode_len_mean: 95.82692307692308
  episode_media: {}
  episode_reward_max: 8.470000000000017
  episode_reward_mean: 1.651826923076927
  episode_reward_min: -1.7800000000000007
  episodes_this_iter: 104
  episodes_total: 1524
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.7085937500000001
          cur_lr: 5.000000000000001e-05
          entropy: 2.5801324088349302
          entropy_coeff: 0.01
          kl: 0.019684808400661603
          policy_loss: -0.04797709975670227
          total_loss: 0.19475129989958886
          vf_explained_var: 0.7160896062850952
          vf_loss: 0.23489638360647055
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,15,4397.38,149940,1.65183,8.47,-1.78,95.8269


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-11-14_15-56-54
  done: false
  episode_len_mean: 96.11428571428571
  episode_media: {}
  episode_reward_max: 8.81000000000001
  episode_reward_mean: 1.754476190476195
  episode_reward_min: -2.289999999999996
  episodes_this_iter: 105
  episodes_total: 1629
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.7085937500000001
          cur_lr: 5.000000000000001e-05
          entropy: 2.5887092774749823
          entropy_coeff: 0.01
          kl: 0.01763131987004727
          policy_loss: -0.05092837934055899
          total_loss: 0.16808417862058322
          vf_explained_var: 0.7048277258872986
          vf_loss: 0.21477488728088892
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,16,4679.75,159936,1.75448,8.81,-2.29,96.1143


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-11-14_16-01-37
  done: false
  episode_len_mean: 96.94174757281553
  episode_media: {}
  episode_reward_max: 8.64000000000002
  episode_reward_mean: 1.5957281553398093
  episode_reward_min: -1.9100000000000013
  episodes_this_iter: 103
  episodes_total: 1732
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.7085937500000001
          cur_lr: 5.000000000000001e-05
          entropy: 2.600336382429824
          entropy_coeff: 0.01
          kl: 0.015568854749177555
          policy_loss: -0.056665287115491736
          total_loss: 0.12470793101486838
          vf_explained_var: 0.7119825482368469
          vf_loss: 0.18077573356552956
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,17,4962.59,169932,1.59573,8.64,-1.91,96.9417




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-11-14_16-06-39
  done: false
  episode_len_mean: 92.03669724770643
  episode_media: {}
  episode_reward_max: 9.740000000000014
  episode_reward_mean: 1.929816513761472
  episode_reward_min: -1.7500000000000009
  episodes_this_iter: 109
  episodes_total: 1841
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.7085937500000001
          cur_lr: 5.000000000000001e-05
          entropy: 2.5967230091747058
          entropy_coeff: 0.01
          kl: 0.021757631094439362
          policy_loss: -0.045536717239958356
          total_loss: 0.2753134146675022
          vf_explained_var: 0.6728546023368835
          vf_loss: 0.30964240723838793
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,18,5264.19,179928,1.92982,9.74,-1.75,92.0367




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-11-14_16-11-47
  done: false
  episode_len_mean: 95.28571428571429
  episode_media: {}
  episode_reward_max: 8.680000000000007
  episode_reward_mean: 1.5261904761904797
  episode_reward_min: -2.0900000000000007
  episodes_this_iter: 105
  episodes_total: 1946
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5923866663223656
          entropy_coeff: 0.01
          kl: 0.014990808165429826
          policy_loss: -0.05124915889424519
          total_loss: 0.17733568799896882
          vf_explained_var: 0.725115954875946
          vf_loss: 0.21608891286489226
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,19,5572.06,189924,1.52619,8.68,-2.09,95.2857


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-11-14_16-16-30
  done: false
  episode_len_mean: 96.14423076923077
  episode_media: {}
  episode_reward_max: 8.810000000000013
  episode_reward_mean: 1.977019230769235
  episode_reward_min: -1.660000000000001
  episodes_this_iter: 104
  episodes_total: 2050
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5920385914990027
          entropy_coeff: 0.01
          kl: 0.014553636918623747
          policy_loss: -0.050832802840731404
          total_loss: 0.1898929063636714
          vf_explained_var: 0.6800243258476257
          vf_loss: 0.22934671548975266
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,20,5854.99,199920,1.97702,8.81,-1.66,96.1442




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-11-14_16-21-29
  done: false
  episode_len_mean: 95.3076923076923
  episode_media: {}
  episode_reward_max: 12.690000000000014
  episode_reward_mean: 1.4474038461538492
  episode_reward_min: -2.229999999999996
  episodes_this_iter: 104
  episodes_total: 2154
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.573831805612287
          entropy_coeff: 0.01
          kl: 0.01478891218465308
          policy_loss: -0.054327342605106854
          total_loss: 0.14469867380073245
          vf_explained_var: 0.7236766219139099
          vf_loss: 0.18686197163720225
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,21,6154.5,209916,1.4474,12.69,-2.23,95.3077




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 219912
  custom_metrics: {}
  date: 2021-11-14_16-26-28
  done: false
  episode_len_mean: 94.97169811320755
  episode_media: {}
  episode_reward_max: 8.540000000000015
  episode_reward_mean: 1.9624528301886834
  episode_reward_min: -1.9100000000000008
  episodes_this_iter: 106
  episodes_total: 2260
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.577983933738154
          entropy_coeff: 0.01
          kl: 0.016482307783089026
          policy_loss: -0.05310742948920681
          total_loss: 0.20362163194630326
          vf_explained_var: 0.7068442702293396
          vf_loss: 0.24026654987699456
    num_agent_steps_sampled: 219912
    num_agent_steps_trained: 219912
    num_steps_sampled: 219912
    num_steps_trained: 219912
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,22,6453.21,219912,1.96245,8.54,-1.91,94.9717


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 229908
  custom_metrics: {}
  date: 2021-11-14_16-31-12
  done: false
  episode_len_mean: 94.78095238095239
  episode_media: {}
  episode_reward_max: 8.540000000000017
  episode_reward_mean: 1.8037142857142896
  episode_reward_min: -2.0599999999999996
  episodes_this_iter: 105
  episodes_total: 2365
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5670608746699797
          entropy_coeff: 0.01
          kl: 0.01626357898875941
          policy_loss: -0.052882363696972655
          total_loss: 0.20770867521452727
          vf_explained_var: 0.7249283194541931
          vf_loss: 0.24457987422664834
    num_agent_steps_sampled: 229908
    num_agent_steps_trained: 229908
    num_steps_sampled: 229908
    num_steps_trained: 229908
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,23,6737.41,229908,1.80371,8.54,-2.06,94.781


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 239904
  custom_metrics: {}
  date: 2021-11-14_16-35-57
  done: false
  episode_len_mean: 94.12264150943396
  episode_media: {}
  episode_reward_max: 12.420000000000016
  episode_reward_mean: 1.6654716981132118
  episode_reward_min: -2.0200000000000005
  episodes_this_iter: 106
  episodes_total: 2471
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5606892810927495
          entropy_coeff: 0.01
          kl: 0.015767214697755196
          policy_loss: -0.05150007929127568
          total_loss: 0.16309441970163782
          vf_explained_var: 0.7561711072921753
          vf_loss: 0.19979174573722686
    num_agent_steps_sampled: 239904
    num_agent_steps_trained: 239904
    num_steps_sampled: 239904
    num_steps_trained: 239904
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,24,7022.31,239904,1.66547,12.42,-2.02,94.1226




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 249900
  custom_metrics: {}
  date: 2021-11-14_16-41-11
  done: false
  episode_len_mean: 90.7909090909091
  episode_media: {}
  episode_reward_max: 10.810000000000011
  episode_reward_mean: 1.8768181818181864
  episode_reward_min: -2.229999999999999
  episodes_this_iter: 110
  episodes_total: 2581
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.584720910920037
          entropy_coeff: 0.01
          kl: 0.015708214264731103
          policy_loss: -0.05371588069834134
          total_loss: 0.19130616118737426
          vf_explained_var: 0.7147061228752136
          vf_loss: 0.23061081553674023
    num_agent_steps_sampled: 249900
    num_agent_steps_trained: 249900
    num_steps_sampled: 249900
    num_steps_trained: 249900
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,25,7335.72,249900,1.87682,10.81,-2.23,90.7909


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 259896
  custom_metrics: {}
  date: 2021-11-14_16-45-53
  done: false
  episode_len_mean: 95.88571428571429
  episode_media: {}
  episode_reward_max: 10.820000000000014
  episode_reward_mean: 1.9579047619047671
  episode_reward_min: -2.0700000000000007
  episodes_this_iter: 105
  episodes_total: 2686
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5596622325416303
          entropy_coeff: 0.01
          kl: 0.015764957250735318
          policy_loss: -0.05896871415420603
          total_loss: 0.16775916708537783
          vf_explained_var: 0.7666202187538147
          vf_loss: 0.21192064354371312
    num_agent_steps_sampled: 259896
    num_agent_steps_trained: 259896
    num_steps_sampled: 259896
    num_steps_trained: 259896
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,26,7618.06,259896,1.9579,10.82,-2.07,95.8857




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 269892
  custom_metrics: {}
  date: 2021-11-14_16-51-04
  done: false
  episode_len_mean: 93.59433962264151
  episode_media: {}
  episode_reward_max: 6.600000000000014
  episode_reward_mean: 1.6884905660377398
  episode_reward_min: -2.2199999999999993
  episodes_this_iter: 106
  episodes_total: 2792
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.573404266895392
          entropy_coeff: 0.01
          kl: 0.014795967476210993
          policy_loss: -0.060399559496814366
          total_loss: 0.1477896484070752
          vf_explained_var: 0.7464619278907776
          vf_loss: 0.1960028048023645
    num_agent_steps_sampled: 269892
    num_agent_steps_trained: 269892
    num_steps_sampled: 269892
    num_steps_trained: 269892
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,27,7929.08,269892,1.68849,6.6,-2.22,93.5943




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 279888
  custom_metrics: {}
  date: 2021-11-14_16-56-02
  done: false
  episode_len_mean: 93.71296296296296
  episode_media: {}
  episode_reward_max: 11.010000000000014
  episode_reward_mean: 2.0086111111111147
  episode_reward_min: -1.800000000000001
  episodes_this_iter: 108
  episodes_total: 2900
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5691857629352146
          entropy_coeff: 0.01
          kl: 0.016589410967632265
          policy_loss: -0.05413030481172933
          total_loss: 0.18442133229751234
          vf_explained_var: 0.7092527151107788
          vf_loss: 0.22172664952758922
    num_agent_steps_sampled: 279888
    num_agent_steps_trained: 279888
    num_steps_sampled: 279888
    num_steps_trained: 279888
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,28,8226.58,279888,2.00861,11.01,-1.8,93.713




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 289884
  custom_metrics: {}
  date: 2021-11-14_17-01-04
  done: false
  episode_len_mean: 94.14150943396227
  episode_media: {}
  episode_reward_max: 14.630000000000013
  episode_reward_mean: 1.8386792452830225
  episode_reward_min: -2.2199999999999984
  episodes_this_iter: 106
  episodes_total: 3006
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.582319697049948
          entropy_coeff: 0.01
          kl: 0.014526438856344704
          policy_loss: -0.057790955503144836
          total_loss: 0.1470491826777052
          vf_explained_var: 0.7188235521316528
          vf_loss: 0.19343366217759683
    num_agent_steps_sampled: 289884
    num_agent_steps_trained: 289884
    num_steps_sampled: 289884
    num_steps_trained: 289884
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,29,8528.88,289884,1.83868,14.63,-2.22,94.1415


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 299880
  custom_metrics: {}
  date: 2021-11-14_17-05-56
  done: false
  episode_len_mean: 94.60952380952381
  episode_media: {}
  episode_reward_max: 8.810000000000011
  episode_reward_mean: 1.920380952380957
  episode_reward_min: -1.7000000000000006
  episodes_this_iter: 105
  episodes_total: 3111
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5851261117519475
          entropy_coeff: 0.01
          kl: 0.014818931295880404
          policy_loss: -0.05378797799348831
          total_loss: 0.16705890245831165
          vf_explained_var: 0.7302387952804565
          vf_loss: 0.2087188432383168
    num_agent_steps_sampled: 299880
    num_agent_steps_trained: 299880
    num_steps_sampled: 299880
    num_steps_trained: 299880
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,30,8820.88,299880,1.92038,8.81,-1.7,94.6095




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 309876
  custom_metrics: {}
  date: 2021-11-14_17-11-07
  done: false
  episode_len_mean: 93.16822429906541
  episode_media: {}
  episode_reward_max: 14.550000000000015
  episode_reward_mean: 2.1513084112149574
  episode_reward_min: -1.9000000000000012
  episodes_this_iter: 107
  episodes_total: 3218
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5808543984706587
          entropy_coeff: 0.01
          kl: 0.016710962400869445
          policy_loss: -0.05846476658510092
          total_loss: 0.18203106402761787
          vf_explained_var: 0.7445911169052124
          vf_loss: 0.22347600632546166
    num_agent_steps_sampled: 309876
    num_agent_steps_trained: 309876
    num_steps_sampled: 309876
    num_steps_trained: 309876
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,31,9132.03,309876,2.15131,14.55,-1.9,93.1682




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 319872
  custom_metrics: {}
  date: 2021-11-14_17-16-08
  done: false
  episode_len_mean: 94.80952380952381
  episode_media: {}
  episode_reward_max: 6.830000000000014
  episode_reward_mean: 1.5287619047619085
  episode_reward_min: -1.9800000000000009
  episodes_this_iter: 105
  episodes_total: 3323
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5896173026826648
          entropy_coeff: 0.01
          kl: 0.014390020027995568
          policy_loss: -0.06033985315201183
          total_loss: 0.15838939999270008
          vf_explained_var: 0.7431181073188782
          vf_loss: 0.20774537968393575
    num_agent_steps_sampled: 319872
    num_agent_steps_trained: 319872
    num_steps_sampled: 319872
    num_steps_trained: 319872
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,32,9433.01,319872,1.52876,6.83,-1.98,94.8095




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 329868
  custom_metrics: {}
  date: 2021-11-14_17-21-05
  done: false
  episode_len_mean: 95.87619047619047
  episode_media: {}
  episode_reward_max: 8.66000000000001
  episode_reward_mean: 2.235238095238101
  episode_reward_min: -1.9600000000000013
  episodes_this_iter: 105
  episodes_total: 3428
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5857653080907643
          entropy_coeff: 0.01
          kl: 0.01626686169534125
          policy_loss: -0.057534125109768325
          total_loss: 0.16942929432202036
          vf_explained_var: 0.7877746820449829
          vf_loss: 0.21113088652093567
    num_agent_steps_sampled: 329868
    num_agent_steps_trained: 329868
    num_steps_sampled: 329868
    num_steps_trained: 329868
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,33,9729.39,329868,2.23524,8.66,-1.96,95.8762




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 339864
  custom_metrics: {}
  date: 2021-11-14_17-26-12
  done: false
  episode_len_mean: 94.98095238095237
  episode_media: {}
  episode_reward_max: 9.010000000000016
  episode_reward_mean: 2.045428571428577
  episode_reward_min: -1.7100000000000006
  episodes_this_iter: 105
  episodes_total: 3533
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.559666298291622
          entropy_coeff: 0.01
          kl: 0.016356336874932775
          policy_loss: -0.062473083486478044
          total_loss: 0.15810741582073462
          vf_explained_var: 0.7880002856254578
          vf_loss: 0.20425766033924417
    num_agent_steps_sampled: 339864
    num_agent_steps_trained: 339864
    num_steps_sampled: 339864
    num_steps_trained: 339864
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,34,10037,339864,2.04543,9.01,-1.71,94.981




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 349860
  custom_metrics: {}
  date: 2021-11-14_17-31-16
  done: false
  episode_len_mean: 94.5
  episode_media: {}
  episode_reward_max: 10.450000000000017
  episode_reward_mean: 1.8481132075471747
  episode_reward_min: -1.980000000000001
  episodes_this_iter: 106
  episodes_total: 3639
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5613448188855097
          entropy_coeff: 0.01
          kl: 0.015348474480575422
          policy_loss: -0.06234284111569261
          total_loss: 0.12560899546389812
          vf_explained_var: 0.7819280624389648
          vf_loss: 0.1742288253039249
    num_agent_steps_sampled: 349860
    num_agent_steps_trained: 349860
    num_steps_sampled: 349860
    num_steps_trained: 349860
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,35,10340.6,349860,1.84811,10.45,-1.98,94.5




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 359856
  custom_metrics: {}
  date: 2021-11-14_17-36-23
  done: false
  episode_len_mean: 96.14423076923077
  episode_media: {}
  episode_reward_max: 10.280000000000017
  episode_reward_mean: 2.070480769230775
  episode_reward_min: -2.259999999999998
  episodes_this_iter: 104
  episodes_total: 3743
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.560037349941384
          entropy_coeff: 0.01
          kl: 0.01489954991172307
          policy_loss: -0.06522251498849632
          total_loss: 0.12923692465306091
          vf_explained_var: 0.749926745891571
          vf_loss: 0.1818738979104365
    num_agent_steps_sampled: 359856
    num_agent_steps_trained: 359856
    num_steps_sampled: 359856
    num_steps_trained: 359856
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,36,10647.6,359856,2.07048,10.28,-2.26,96.1442




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 369852
  custom_metrics: {}
  date: 2021-11-14_17-41-26
  done: false
  episode_len_mean: 95.0
  episode_media: {}
  episode_reward_max: 12.750000000000018
  episode_reward_mean: 2.5010476190476245
  episode_reward_min: -1.7300000000000009
  episodes_this_iter: 105
  episodes_total: 3848
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5612338684562945
          entropy_coeff: 0.01
          kl: 0.016568342916479
          policy_loss: -0.06030153909729969
          total_loss: 0.1614219836365336
          vf_explained_var: 0.8153680562973022
          vf_loss: 0.2048730126924367
    num_agent_steps_sampled: 369852
    num_agent_steps_trained: 369852
    num_steps_sampled: 369852
    num_steps_trained: 369852
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,37,10950.9,369852,2.50105,12.75,-1.73,95




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 379848
  custom_metrics: {}
  date: 2021-11-14_17-46-27
  done: false
  episode_len_mean: 96.09615384615384
  episode_media: {}
  episode_reward_max: 10.300000000000017
  episode_reward_mean: 2.296923076923083
  episode_reward_min: -2.0200000000000005
  episodes_this_iter: 104
  episodes_total: 3952
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5830588850200686
          entropy_coeff: 0.01
          kl: 0.015432534270562591
          policy_loss: -0.058273292255675437
          total_loss: 0.15640810851791762
          vf_explained_var: 0.8080220222473145
          vf_loss: 0.20096009283676808
    num_agent_steps_sampled: 379848
    num_agent_steps_trained: 379848
    num_steps_sampled: 379848
    num_steps_trained: 379848
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,38,11251.7,379848,2.29692,10.3,-2.02,96.0962




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 389844
  custom_metrics: {}
  date: 2021-11-14_17-51-29
  done: false
  episode_len_mean: 97.25242718446601
  episode_media: {}
  episode_reward_max: 10.610000000000017
  episode_reward_mean: 1.9056310679611708
  episode_reward_min: -2.149999999999998
  episodes_this_iter: 103
  episodes_total: 4055
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.583685732091594
          entropy_coeff: 0.01
          kl: 0.015040750830955197
          policy_loss: -0.0636506521055459
          total_loss: 0.13192660332076314
          vf_explained_var: 0.7843947410583496
          vf_loss: 0.18286631515520252
    num_agent_steps_sampled: 389844
    num_agent_steps_trained: 389844
    num_steps_sampled: 389844
    num_steps_trained: 389844
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,39,11553.5,389844,1.90563,10.61,-2.15,97.2524




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 399840
  custom_metrics: {}
  date: 2021-11-14_17-56-31
  done: false
  episode_len_mean: 96.0673076923077
  episode_media: {}
  episode_reward_max: 10.610000000000014
  episode_reward_mean: 1.513173076923081
  episode_reward_min: -2.079999999999999
  episodes_this_iter: 104
  episodes_total: 4159
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.588281627903637
          entropy_coeff: 0.01
          kl: 0.013877036709522603
          policy_loss: -0.06546299894873658
          total_loss: 0.11415193341990822
          vf_explained_var: 0.7516317367553711
          vf_loss: 0.16993242299550365
    num_agent_steps_sampled: 399840
    num_agent_steps_trained: 399840
    num_steps_sampled: 399840
    num_steps_trained: 399840
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,40,11855.1,399840,1.51317,10.61,-2.08,96.0673




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 409836
  custom_metrics: {}
  date: 2021-11-14_18-01-29
  done: false
  episode_len_mean: 95.66666666666667
  episode_media: {}
  episode_reward_max: 11.10000000000001
  episode_reward_mean: 2.126857142857147
  episode_reward_min: -2.1199999999999983
  episodes_this_iter: 105
  episodes_total: 4264
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5762263618982755
          entropy_coeff: 0.01
          kl: 0.015821370600555343
          policy_loss: -0.0641740671528153
          total_loss: 0.12568211444597852
          vf_explained_var: 0.8009827136993408
          vf_loss: 0.17507000355503688
    num_agent_steps_sampled: 409836
    num_agent_steps_trained: 409836
    num_steps_sampled: 409836
    num_steps_trained: 409836
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,41,12153.2,409836,2.12686,11.1,-2.12,95.6667


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 419832
  custom_metrics: {}
  date: 2021-11-14_18-06-20
  done: false
  episode_len_mean: 98.89108910891089
  episode_media: {}
  episode_reward_max: 8.680000000000014
  episode_reward_mean: 1.9891089108910942
  episode_reward_min: -2.0700000000000007
  episodes_this_iter: 101
  episodes_total: 4365
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5894092527210204
          entropy_coeff: 0.01
          kl: 0.015754456086417484
          policy_loss: -0.06015002717518717
          total_loss: 0.13434963416483284
          vf_explained_var: 0.7770410776138306
          vf_loss: 0.18001680749221743
    num_agent_steps_sampled: 419832
    num_agent_steps_trained: 419832
    num_steps_sampled: 419832
    num_steps_trained: 419832
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,42,12444.8,419832,1.98911,8.68,-2.07,98.8911


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 429828
  custom_metrics: {}
  date: 2021-11-14_18-11-11
  done: false
  episode_len_mean: 97.96078431372548
  episode_media: {}
  episode_reward_max: 10.490000000000016
  episode_reward_mean: 1.9486274509803974
  episode_reward_min: -1.980000000000001
  episodes_this_iter: 102
  episodes_total: 4467
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5749557081450765
          entropy_coeff: 0.01
          kl: 0.014576571356247278
          policy_loss: -0.06364019793243363
          total_loss: 0.11612465370756885
          vf_explained_var: 0.8113839626312256
          vf_loss: 0.16815625105658147
    num_agent_steps_sampled: 429828
    num_agent_steps_trained: 429828
    num_steps_sampled: 429828
    num_steps_trained: 429828
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,43,12735.4,429828,1.94863,10.49,-1.98,97.9608




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 439824
  custom_metrics: {}
  date: 2021-11-14_18-16-49
  done: false
  episode_len_mean: 94.95192307692308
  episode_media: {}
  episode_reward_max: 12.660000000000014
  episode_reward_mean: 2.071250000000005
  episode_reward_min: -2.219999999999998
  episodes_this_iter: 104
  episodes_total: 4571
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5755681451569257
          entropy_coeff: 0.01
          kl: 0.015923553186511327
          policy_loss: -0.06246066974798361
          total_loss: 0.1371981978870164
          vf_explained_var: 0.7613310813903809
          vf_loss: 0.18460422446712468
    num_agent_steps_sampled: 439824
    num_agent_steps_trained: 439824
    num_steps_sampled: 439824
    num_steps_trained: 439824
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,44,13073,439824,2.07125,12.66,-2.22,94.9519


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 449820
  custom_metrics: {}
  date: 2021-11-14_18-21-50
  done: false
  episode_len_mean: 98.46601941747574
  episode_media: {}
  episode_reward_max: 10.680000000000016
  episode_reward_mean: 2.037281553398064
  episode_reward_min: -1.7400000000000009
  episodes_this_iter: 103
  episodes_total: 4674
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5734053449753005
          entropy_coeff: 0.01
          kl: 0.014926903928372641
          policy_loss: -0.06521144844949818
          total_loss: 0.11814858285256494
          vf_explained_var: 0.7864267230033875
          vf_loss: 0.17083806361223006
    num_agent_steps_sampled: 449820
    num_agent_steps_trained: 449820
    num_steps_sampled: 449820
    num_steps_trained: 449820
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,45,13374.9,449820,2.03728,10.68,-1.74,98.466




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 459816
  custom_metrics: {}
  date: 2021-11-14_18-27-04
  done: false
  episode_len_mean: 96.67961165048544
  episode_media: {}
  episode_reward_max: 11.010000000000012
  episode_reward_mean: 2.4835922330097144
  episode_reward_min: -1.880000000000001
  episodes_this_iter: 103
  episodes_total: 4777
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5662724315610705
          entropy_coeff: 0.01
          kl: 0.016925706331568768
          policy_loss: -0.06522116734622381
          total_loss: 0.1492521928722819
          vf_explained_var: 0.7851283550262451
          vf_loss: 0.1967573508524742
    num_agent_steps_sampled: 459816
    num_agent_steps_trained: 459816
    num_steps_sampled: 459816
    num_steps_trained: 459816
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,46,13688.8,459816,2.48359,11.01,-1.88,96.6796




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 469812
  custom_metrics: {}
  date: 2021-11-14_18-32-27
  done: false
  episode_len_mean: 96.80582524271844
  episode_media: {}
  episode_reward_max: 16.419999999999963
  episode_reward_mean: 1.8071844660194212
  episode_reward_min: -1.730000000000001
  episodes_this_iter: 103
  episodes_total: 4880
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.56159122693233
          entropy_coeff: 0.01
          kl: 0.016070698658833087
          policy_loss: -0.06289070767119655
          total_loss: 0.1547669149748185
          vf_explained_var: 0.781723141670227
          vf_loss: 0.20208609324609303
    num_agent_steps_sampled: 469812
    num_agent_steps_trained: 469812
    num_steps_sampled: 469812
    num_steps_trained: 469812
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,47,14010.9,469812,1.80718,16.42,-1.73,96.8058




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 479808
  custom_metrics: {}
  date: 2021-11-14_18-37-56
  done: false
  episode_len_mean: 96.96116504854369
  episode_media: {}
  episode_reward_max: 10.470000000000013
  episode_reward_mean: 2.2660194174757335
  episode_reward_min: -1.7400000000000009
  episodes_this_iter: 103
  episodes_total: 4983
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.568333717696687
          entropy_coeff: 0.01
          kl: 0.016492835037834486
          policy_loss: -0.0642034399163965
          total_loss: 0.14482020382554486
          vf_explained_var: 0.7662680149078369
          vf_loss: 0.19243764904622213
    num_agent_steps_sampled: 479808
    num_agent_steps_trained: 479808
    num_steps_sampled: 479808
    num_steps_trained: 479808
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,48,14339.9,479808,2.26602,10.47,-1.74,96.9612


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 489804
  custom_metrics: {}
  date: 2021-11-14_18-42-56
  done: false
  episode_len_mean: 96.64423076923077
  episode_media: {}
  episode_reward_max: 12.420000000000012
  episode_reward_mean: 2.8210576923076993
  episode_reward_min: -2.0099999999999993
  episodes_this_iter: 104
  episodes_total: 5087
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.555722782652602
          entropy_coeff: 0.01
          kl: 0.015873147133034112
          policy_loss: -0.06911472527222692
          total_loss: 0.11072130407421635
          vf_explained_var: 0.8455419540405273
          vf_loss: 0.16471211756300977
    num_agent_steps_sampled: 489804
    num_agent_steps_trained: 489804
    num_steps_sampled: 489804
    num_steps_trained: 489804
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,49,14639.9,489804,2.82106,12.42,-2.01,96.6442


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 499800
  custom_metrics: {}
  date: 2021-11-14_18-47-52
  done: false
  episode_len_mean: 99.77
  episode_media: {}
  episode_reward_max: 8.490000000000016
  episode_reward_mean: 2.142900000000006
  episode_reward_min: -1.830000000000001
  episodes_this_iter: 100
  episodes_total: 5187
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.56054032765902
          entropy_coeff: 0.01
          kl: 0.01662307405462549
          policy_loss: -0.06482438477767138
          total_loss: 0.14611082546389065
          vf_explained_var: 0.8170629739761353
          vf_loss: 0.1939374939005217
    num_agent_steps_sampled: 499800
    num_agent_steps_trained: 499800
    num_steps_sampled: 499800
    num_steps_trained: 499800
  iterations_since_restore

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,50,14936,499800,2.1429,8.49,-1.83,99.77




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 509796
  custom_metrics: {}
  date: 2021-11-14_18-53-15
  done: false
  episode_len_mean: 94.64761904761905
  episode_media: {}
  episode_reward_max: 6.9000000000000155
  episode_reward_mean: 1.6574285714285755
  episode_reward_min: -1.9800000000000009
  episodes_this_iter: 105
  episodes_total: 5292
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5728517648501272
          entropy_coeff: 0.01
          kl: 0.015414993540687138
          policy_loss: -0.06842817571252967
          total_loss: 0.12108448853210793
          vf_explained_var: 0.7387208938598633
          vf_loss: 0.17573424040212526
    num_agent_steps_sampled: 509796
    num_agent_steps_trained: 509796
    num_steps_sampled: 509796
    num_steps_trained: 509796
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,51,15259.1,509796,1.65743,6.9,-1.98,94.6476




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 519792
  custom_metrics: {}
  date: 2021-11-14_18-58-46
  done: false
  episode_len_mean: 97.41176470588235
  episode_media: {}
  episode_reward_max: 11.060000000000013
  episode_reward_mean: 2.326764705882358
  episode_reward_min: -1.790000000000001
  episodes_this_iter: 102
  episodes_total: 5394
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5635588821182904
          entropy_coeff: 0.01
          kl: 0.01670597467390989
          policy_loss: -0.06633417455750143
          total_loss: 0.12356031411924423
          vf_explained_var: 0.7999658584594727
          vf_loss: 0.1727144927618245
    num_agent_steps_sampled: 519792
    num_agent_steps_trained: 519792
    num_steps_sampled: 519792
    num_steps_trained: 519792
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,52,15590.2,519792,2.32676,11.06,-1.79,97.4118


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 529788
  custom_metrics: {}
  date: 2021-11-14_19-03-44
  done: false
  episode_len_mean: 97.47572815533981
  episode_media: {}
  episode_reward_max: 9.100000000000012
  episode_reward_mean: 1.8826213592233056
  episode_reward_min: -1.9400000000000008
  episodes_this_iter: 103
  episodes_total: 5497
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5643992714392834
          entropy_coeff: 0.01
          kl: 0.01694179143524277
          policy_loss: -0.0630826420421338
          total_loss: 0.1367810504273193
          vf_explained_var: 0.8140629529953003
          vf_loss: 0.18208772794216171
    num_agent_steps_sampled: 529788
    num_agent_steps_trained: 529788
    num_steps_sampled: 529788
    num_steps_trained: 529788
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,53,15888,529788,1.88262,9.1,-1.94,97.4757


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 539784
  custom_metrics: {}
  date: 2021-11-14_19-08-40
  done: false
  episode_len_mean: 99.01960784313725
  episode_media: {}
  episode_reward_max: 10.250000000000016
  episode_reward_mean: 2.2787254901960847
  episode_reward_min: -1.7200000000000009
  episodes_this_iter: 102
  episodes_total: 5599
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5650242653667417
          entropy_coeff: 0.01
          kl: 0.015093052584488913
          policy_loss: -0.07204266106908838
          total_loss: 0.09134344308175402
          vf_explained_var: 0.8394483327865601
          vf_loss: 0.15035450451800392
    num_agent_steps_sampled: 539784
    num_agent_steps_trained: 539784
    num_steps_sampled: 539784
    num_steps_trained: 539784
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,54,16184.1,539784,2.27873,10.25,-1.72,99.0196




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 549780
  custom_metrics: {}
  date: 2021-11-14_19-13-50
  done: false
  episode_len_mean: 95.25961538461539
  episode_media: {}
  episode_reward_max: 7.240000000000014
  episode_reward_mean: 1.8837500000000058
  episode_reward_min: -2.11
  episodes_this_iter: 104
  episodes_total: 5703
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5664462346297046
          entropy_coeff: 0.01
          kl: 0.016794230592454523
          policy_loss: -0.07028174814171134
          total_loss: 0.11398230192219663
          vf_explained_var: 0.8116009831428528
          vf_loss: 0.16688673812617413
    num_agent_steps_sampled: 549780
    num_agent_steps_trained: 549780
    num_steps_sampled: 549780
    num_steps_trained: 549780
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,55,16493.7,549780,1.88375,7.24,-2.11,95.2596




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 559776
  custom_metrics: {}
  date: 2021-11-14_19-19-03
  done: false
  episode_len_mean: 95.75961538461539
  episode_media: {}
  episode_reward_max: 10.310000000000016
  episode_reward_mean: 2.2206730769230827
  episode_reward_min: -1.9400000000000013
  episodes_this_iter: 104
  episodes_total: 5807
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.570980368822049
          entropy_coeff: 0.01
          kl: 0.01622199195330858
          policy_loss: -0.06567862729987718
          total_loss: 0.11127333860629453
          vf_explained_var: 0.8532184958457947
          vf_loss: 0.16108658016125998
    num_agent_steps_sampled: 559776
    num_agent_steps_trained: 559776
    num_steps_sampled: 559776
    num_steps_trained: 559776
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,56,16807.2,559776,2.22067,10.31,-1.94,95.7596




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 569772
  custom_metrics: {}
  date: 2021-11-14_19-24-16
  done: false
  episode_len_mean: 98.33980582524272
  episode_media: {}
  episode_reward_max: 12.800000000000015
  episode_reward_mean: 2.0951456310679664
  episode_reward_min: -1.8800000000000012
  episodes_this_iter: 103
  episodes_total: 5910
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5658853833491984
          entropy_coeff: 0.01
          kl: 0.01619378611772238
          policy_loss: -0.06474879480516299
          total_loss: 0.1200120730795221
          vf_explained_var: 0.7873438000679016
          vf_loss: 0.16891682003107336
    num_agent_steps_sampled: 569772
    num_agent_steps_trained: 569772
    num_steps_sampled: 569772
    num_steps_trained: 569772
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,57,17119.8,569772,2.09515,12.8,-1.88,98.3398




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 579768
  custom_metrics: {}
  date: 2021-11-14_19-29-31
  done: false
  episode_len_mean: 95.54368932038835
  episode_media: {}
  episode_reward_max: 8.620000000000017
  episode_reward_mean: 2.3000970873786466
  episode_reward_min: -1.6600000000000008
  episodes_this_iter: 103
  episodes_total: 6013
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5499601354965797
          entropy_coeff: 0.01
          kl: 0.015548596245024046
          policy_loss: -0.06934854329452236
          total_loss: 0.09234743708720765
          vf_explained_var: 0.860692024230957
          vf_loss: 0.14734623023054094
    num_agent_steps_sampled: 579768
    num_agent_steps_trained: 579768
    num_steps_sampled: 579768
    num_steps_trained: 579768
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,58,17435.2,579768,2.3001,8.62,-1.66,95.5437




Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 589764
  custom_metrics: {}
  date: 2021-11-14_19-35-11
  done: false
  episode_len_mean: 97.36893203883496
  episode_media: {}
  episode_reward_max: 9.760000000000002
  episode_reward_mean: 2.061650485436899
  episode_reward_min: -1.7300000000000004
  episodes_this_iter: 103
  episodes_total: 6116
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.5746788846121893
          entropy_coeff: 0.01
          kl: 0.016106837603738735
          policy_loss: -0.0678455212006234
          total_loss: 0.12461793024140673
          vf_explained_var: 0.7892800569534302
          vf_loss: 0.17693017796796356
    num_agent_steps_sampled: 589764
    num_agent_steps_trained: 589764
    num_steps_sampled: 589764
    num_steps_trained: 589764
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,59,17775.1,589764,2.06165,9.76,-1.73,97.3689


Result for PPO_my_env_907c1_00000:
  agent_timesteps_total: 599760
  custom_metrics: {}
  date: 2021-11-14_19-40-04
  done: false
  episode_len_mean: 99.27450980392157
  episode_media: {}
  episode_reward_max: 10.34000000000002
  episode_reward_mean: 2.465294117647065
  episode_reward_min: -2.0599999999999987
  episodes_this_iter: 102
  episodes_total: 6218
  experiment_id: 214763e727544e648f411667af87eede
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.562890625
          cur_lr: 5.000000000000001e-05
          entropy: 2.577608354275043
          entropy_coeff: 0.01
          kl: 0.015623231323259202
          policy_loss: -0.06602522920833057
          total_loss: 0.08886749180328324
          vf_explained_var: 0.8619842529296875
          vf_loss: 0.14062817245522816
    num_agent_steps_sampled: 599760
    num_agent_steps_trained: 599760
    num_steps_sampled: 599760
    num_steps_trained: 599760
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_907c1_00000,RUNNING,192.168.3.5:154354,60,18068.2,599760,2.46529,10.34,-2.06,99.2745
