In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Six-stage convolutional encoder whose final feature map is flattened.

    Each stage is a ``Conv2d`` with a 2x2 kernel, stride 2 and no padding,
    followed by an ``ELU``; a stage therefore halves (rounding down) both
    spatial dimensions. Channel widths run 3 -> 32 -> 32 -> 64 -> 128 -> 256 -> 512.
    """

    # Channels entering the first stage, then leaving each successive stage.
    _CHANNELS = (3, 32, 32, 64, 128, 256, 512)

    def __init__(self):
        super().__init__()

        stages = []
        for n_in, n_out in zip(self._CHANNELS[:-1], self._CHANNELS[1:]):
            stages.append(nn.Conv2d(n_in, n_out, kernel_size=2, stride=2, padding=0))
            stages.append(nn.ELU())
        # Modules are registered in the same order under the same `cnn`
        # attribute, so state_dict keys (cnn.0.weight, cnn.2.weight, ...)
        # stay compatible with previously saved weights.
        self.cnn = nn.Sequential(*stages, nn.Flatten())

    def forward(self, x):
        """Run ``x`` through the convolution stack and return the flattened output."""
        return self.cnn(x)

In [3]:
from torch.nn.functional import one_hot
from ray.rllib.policy.rnn_sequencing import add_time_dimension


class MyModelClass(TorchModelV2, nn.Module):
    """Recurrent actor-critic model for RLlib: CNN + target-grid encoder + MLP + GRU.

    The point-of-view image is embedded by a ``VisualEncoder`` whose weights
    are loaded from a fixed checkpoint path; the one-hot target grid is
    reduced by a 1x1x1 ``Conv3d``. Both feature vectors are concatenated,
    passed through an MLP and a single-layer GRU, and finally mapped to
    action logits and a scalar state value.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build all sub-networks and load the pretrained visual encoder.

        Args:
            obs_space: observation space, forwarded to ``TorchModelV2``.
            action_space: action space; ``action_space.n`` sizes the action head.
            num_outputs: forwarded to ``TorchModelV2``.
            model_config: RLlib model config; ``"_time_major"`` is read from it.
            name: model name, forwarded to ``TorchModelV2``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        # One output channel over a 9 x 11 x 11 target grid, flattened.
        target_features_dim = 9 * 11 * 11
        self.visual_encoder = VisualEncoder()
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        # Collapses the 7 one-hot block-type channels into a single channel.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        self.policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, self.policy_hidden_dim),
            nn.ELU(),
            nn.Linear(self.policy_hidden_dim, self.policy_hidden_dim),
            nn.ELU(),
        )

        self.time_major = self.model_config.get("_time_major", False)
        self.gru = nn.GRU(self.policy_hidden_dim, self.policy_hidden_dim, batch_first=not self.time_major)

        self.action_head = nn.Linear(self.policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(self.policy_hidden_dim, 1)
        # Value estimates from the most recent forward(); read by value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.gru.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache value estimates for a batch.

        Args:
            input_dict: dict whose ``'obs'`` entry holds ``'pov'`` (channel-last
                image batch) and ``'target_grid'`` (integer block ids in 0..6).
            state: list whose first element is the GRU hidden state with the
                batch dimension first.
            seq_lens: per-sequence lengths (tensor or numpy array); only its
                first dimension is used, to derive the padded sequence length.

        Returns:
            Tuple ``(action_logits, [new_hidden_state])``; the new hidden state
            has the batch dimension first again.
        """
        obs = input_dict['obs']
        # Tensor.cuda()/Tensor.to() return a new tensor instead of moving the
        # original in place, so the result has to be kept. Follow the device
        # the parameters actually live on rather than assuming CUDA, because
        # the whole module may have been moved after construction.
        device = next(self.parameters()).device
        # Channel-last image -> channel-first float in [0, 1].
        pov = obs['pov'].to(device).permute(0, 3, 1, 2).float() / 255.0
        # One-hot over 7 block types, class axis moved to the channel position.
        target = one_hot(obs['target_grid'].to(device).long(), num_classes=7).permute(0, 4, 1, 2, 3).float()

        # The visual encoder output is computed without gradient tracking, so
        # this forward pass never back-propagates into the encoder.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)

        if isinstance(seq_lens, np.ndarray):
            seq_lens = torch.Tensor(seq_lens).int()
        # The flat batch holds (num sequences) x (padded sequence length) rows.
        max_seq_len = features.shape[0] // seq_lens.shape[0]
        inputs = add_time_dimension(
            features,
            max_seq_len=max_seq_len,
            framework="torch",
            time_major=self.time_major,
        )

        # Swap the first two axes: state arrives batch-first, nn.GRU wants
        # (num_layers, batch, hidden).
        h = state[0].to(device).permute(1, 0, 2)
        output, new_h = self.gru(inputs, h)
        new_state = [new_h.permute(1, 0, 2)]

        gru_output = output.reshape(-1, self.policy_hidden_dim)

        action = self.action_head(gru_output)
        self.last_value = self.value_head(gru_output).squeeze(1)
        return action, new_state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimates cached by the latest ``forward()`` call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

    @override(TorchModelV2)
    def get_initial_state(self):
        """Return the initial GRU state: a single all-zero ``(1, hidden)`` tensor."""
        return [torch.zeros(1, self.policy_hidden_dim)]

In [4]:
# Stand-alone copy of the policy MLP (same layer sizes as in MyModelClass),
# built only to count its trainable parameters.
visual_features_dim = 512
target_features_dim = 9 * 11 * 11
policy_hidden_dim = 256

# Width of the input followed by the width of every Linear layer's output.
_layer_widths = [
    visual_features_dim + target_features_dim,
    1024,
    512,
    policy_hidden_dim,
    policy_hidden_dim,
]

policy_network = nn.Sequential(*[
    module
    for fan_in, fan_out in zip(_layer_widths[:-1], _layer_widths[1:])
    for module in (nn.Linear(fan_in, fan_out), nn.ELU())
])

sum(param.numel() for param in policy_network.parameters())

2362368

In [5]:
# Register MyModelClass under the name "my_torch_model"; the training config
# below refers to it through "custom_model": "my_torch_model".
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing pov, inventory, compass and, optionally, the target grid.

    The declared ``observation_space`` contains ``'target_grid'`` only when
    ``include_target`` is true, and ``observation()`` emits that key under the
    same condition so that returned observations always match the space.
    """

    def __init__(self, env, include_target=False):
        """Wrap ``env`` and declare the Dict observation space.

        Args:
            env: environment to wrap.
            include_target: when true, add a ``'target_grid'`` entry of shape
                (9, 11, 11) to both the space and the observations.
        """
        super().__init__(env)
        # Remembered so observation() only returns keys the space declares.
        self.include_target = include_target
        self.observation_space = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict described by ``observation_space``.

        Args:
            obs: raw observation with ``'pov'``, ``'inventory'`` and
                ``'compass'`` (itself holding an ``'angle'``) entries.
            reward: unused.
            done: unused.
            info: optional step info; if it carries ``'target_grid'`` that
                entry is consumed (removed from ``info``).

        Returns:
            Dict with ``'pov'`` (float32), ``'inventory'``, ``'compass'`` and,
            when ``include_target`` was set, ``'target_grid'``.
        """
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                # info without a target grid is unexpected: log it, ask the
                # underlying env to reset if it supports that, and fall back
                # to the current task's grid.
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid

        result = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Only emit the key when the space declares it; an undeclared extra
        # key makes the observation inconsistent with observation_space.
        if self.include_target:
            result['target_grid'] = target_grid
        return result

In [7]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Task ids C1 .. C155, with C38 left out.
tasks = [f'C{task_no}' for task_no in range(1, 156) if task_no != 38]
    
class RewardWrapper(gym.RewardWrapper):
    """Reward shaping: small penalty for zero reward, unit rewards scaled down by 10."""

    def reward(self, rew):
        """Return the shaped reward.

        A reward of exactly 0 becomes -0.01, a reward of magnitude exactly 1
        is divided by 10, and every other value passes through untouched.
        """
        if rew == 0:
            return -0.01
        if abs(rew) == 1:
            return rew / 10
        return rew
    
def env_creator(env_config):
    """Build the wrapped IGLU environment used for training.

    ``env_config`` is accepted for the env-creator signature but not read.
    Wrapping order, innermost first: VisualObservationWrapper (with target
    grid), SelectAndPlace, Discretization, RewardWrapper.
    """
    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=250)
    base_env.update_taskset(TaskSet(preset=tasks))
    with_observations = VisualObservationWrapper(base_env, include_target=True)
    with_select_place = SelectAndPlace(with_observations)
    discretized = Discretization(with_select_place, flat_action_space('human-level'))
    return RewardWrapper(discretized)

from ray.tune.registry import register_env
# Expose env_creator under the name "my_env" so the trainer config can
# select it with "env": "my_env".
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [8]:
from ray.tune.integration.wandb import WandbLogger

# Model section: use the custom model registered above, with no extra
# constructor kwargs.
model_config = {
    "custom_model": "my_torch_model",
    "custom_model_config": {},
}

# Weights & Biases project / run name picked up by WandbLogger.
wandb_config = {
    "project": "IGLU-Minecraft",
    "name": "PPO All Tasks pretrained (AngelaCNN+GRU) (3 noops after placement) r: -0.01 div10",
}

# PPO trainer configuration ("gamma" is intentionally left at its default).
ppo_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 3,
    "sgd_minibatch_size": 256,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 5_000,
    "lr": 1e-4,
    "model": model_config,
    "logger_config": {"wandb": wandb_config},
}

# Launch training: checkpoint every 5 iterations and once more at the end,
# keeping at most 50 checkpoints under local_dir.
analysis = tune.run(
    PPOTrainer,
    config=ppo_config,
    loggers=[WandbLogger],
    local_dir="/IGLU-Minecraft/checkpoints/all_tasks",
    keep_checkpoints_num=50,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)

2021-11-07 12:37:45,307	INFO wandb.py:170 -- Already logged into W&B.
2021-11-07 12:37:45,327	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_81db5_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=552473)[0m 2021-11-07 12:37:48,754	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=552473)[0m 2021-11-07 12:37:48,754	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=552473)[0m 2021-11-07 12:37:56,909	INFO trainable.py:109 -- Trainable.setup took 10.587 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-07_12-41-32
  done: false
  episode_len_mean: 100.77319587628865
  episode_media: {}
  episode_reward_max: 2.580000000000002
  episode_reward_mean: -0.8639175257731962
  episode_reward_min: -1.8000000000000012
  episodes_this_iter: 97
  episodes_total: 97
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.884062057478815
          entropy_coeff: 0.01
          kl: 0.005448134565703074
          policy_loss: -0.012407635053237661
          total_loss: -0.008587556214541452
          vf_explained_var: 0.203287735581398
          vf_loss: 0.03157107095218176
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  iterati

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,1,215.455,9996,-0.863918,2.58,-1.8,100.773


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-07_12-43-30
  done: false
  episode_len_mean: 98.43137254901961
  episode_media: {}
  episode_reward_max: 4.460000000000004
  episode_reward_mean: -0.5802941176470592
  episode_reward_min: -1.600000000000001
  episodes_this_iter: 102
  episodes_total: 199
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.869527016541897
          entropy_coeff: 0.01
          kl: 0.00825794731229035
          policy_loss: -0.015938285919768005
          total_loss: 0.08055777117579133
          vf_explained_var: -0.18294155597686768
          vf_loss: 0.12353973829497894
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,2,333.903,19992,-0.580294,4.46,-1.6,98.4314


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-07_12-45-31
  done: false
  episode_len_mean: 96.24038461538461
  episode_media: {}
  episode_reward_max: 2.9000000000000035
  episode_reward_mean: -0.39307692307692343
  episode_reward_min: -1.810000000000001
  episodes_this_iter: 104
  episodes_total: 303
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8557294466556646
          entropy_coeff: 0.01
          kl: 0.010938297962752872
          policy_loss: -0.02790377158962034
          total_loss: 0.07786307880877812
          vf_explained_var: 0.15343286097049713
          vf_loss: 0.13213648553539672
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,3,454.48,29988,-0.393077,2.9,-1.81,96.2404




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-07_12-47-58
  done: false
  episode_len_mean: 93.66037735849056
  episode_media: {}
  episode_reward_max: 8.710000000000006
  episode_reward_mean: 0.1292452830188682
  episode_reward_min: -1.6500000000000008
  episodes_this_iter: 106
  episodes_total: 409
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.822630866164835
          entropy_coeff: 0.01
          kl: 0.01252816994067562
          policy_loss: -0.03149683301243595
          total_loss: 0.14709871391600104
          vf_explained_var: 0.3094733655452728
          vf_loss: 0.20431622123807414
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,4,601.953,39984,0.129245,8.71,-1.65,93.6604


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-07_12-50-06
  done: false
  episode_len_mean: 96.4
  episode_media: {}
  episode_reward_max: 5.0000000000000036
  episode_reward_mean: 0.6787619047619061
  episode_reward_min: -2.0300000000000002
  episodes_this_iter: 105
  episodes_total: 514
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8020932246477175
          entropy_coeff: 0.01
          kl: 0.016654536320332553
          policy_loss: -0.03329561155351102
          total_loss: 0.31227522916379424
          vf_explained_var: 0.3225064277648926
          vf_loss: 0.3702608666804611
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,5,729.964,49980,0.678762,5,-2.03,96.4


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-07_12-52-16
  done: false
  episode_len_mean: 97.06862745098039
  episode_media: {}
  episode_reward_max: 4.750000000000002
  episode_reward_mean: 0.6676470588235304
  episode_reward_min: -1.6700000000000008
  episodes_this_iter: 102
  episodes_total: 616
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.7822406593550983
          entropy_coeff: 0.01
          kl: 0.024434898960707628
          policy_loss: -0.03113511052129575
          total_loss: 0.2316218328905246
          vf_explained_var: 0.4430922865867615
          vf_loss: 0.2856923694093513
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,6,859.728,59976,0.667647,4.75,-1.67,97.0686




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-07_12-54-43
  done: false
  episode_len_mean: 95.24528301886792
  episode_media: {}
  episode_reward_max: 10.920000000000009
  episode_reward_mean: 1.1926415094339642
  episode_reward_min: -1.790000000000001
  episodes_this_iter: 106
  episodes_total: 722
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 0.00010000000000000002
          entropy: 2.7595538860712296
          entropy_coeff: 0.01
          kl: 0.021193696005868148
          policy_loss: -0.03799824479517863
          total_loss: 0.2749890881298512
          vf_explained_var: 0.5662446618080139
          vf_loss: 0.33422476245552046
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,7,1006.4,69972,1.19264,10.92,-1.79,95.2453


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-07_12-56-56
  done: false
  episode_len_mean: 96.20192307692308
  episode_media: {}
  episode_reward_max: 4.87000000000001
  episode_reward_mean: 0.737019230769232
  episode_reward_min: -1.6500000000000008
  episodes_this_iter: 104
  episodes_total: 826
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.7454164447947447
          entropy_coeff: 0.01
          kl: 0.017158678038308908
          policy_loss: -0.050093303976827264
          total_loss: 0.17008151742371627
          vf_explained_var: 0.5696343779563904
          vf_loss: 0.23990758145148428
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,8,1139.24,79968,0.737019,4.87,-1.65,96.2019


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 89964
  custom_metrics: {}
  date: 2021-11-07_12-59-06
  done: false
  episode_len_mean: 96.36893203883496
  episode_media: {}
  episode_reward_max: 8.680000000000012
  episode_reward_mean: 1.035242718446604
  episode_reward_min: -2.0999999999999988
  episodes_this_iter: 103
  episodes_total: 929
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.4500000000000001
          cur_lr: 0.00010000000000000002
          entropy: 2.7407099888874935
          entropy_coeff: 0.01
          kl: 0.021436836153406247
          policy_loss: -0.048860654188083726
          total_loss: 0.16117291721574062
          vf_explained_var: 0.577031672000885
          vf_loss: 0.22779409277897614
    num_agent_steps_sampled: 89964
    num_agent_steps_trained: 89964
    num_steps_sampled: 89964
    num_steps_trained: 89964
  itera

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,9,1269.73,89964,1.03524,8.68,-2.1,96.3689


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 99960
  custom_metrics: {}
  date: 2021-11-07_13-01-18
  done: false
  episode_len_mean: 95.29523809523809
  episode_media: {}
  episode_reward_max: 4.84000000000001
  episode_reward_mean: 0.9066666666666683
  episode_reward_min: -1.7300000000000006
  episodes_this_iter: 105
  episodes_total: 1034
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.6749999999999999
          cur_lr: 0.00010000000000000002
          entropy: 2.7319690694156873
          entropy_coeff: 0.01
          kl: 0.02141083725505932
          policy_loss: -0.05107570455767041
          total_loss: 0.1447404513026102
          vf_explained_var: 0.66274493932724
          vf_loss: 0.2086835321643923
    num_agent_steps_sampled: 99960
    num_agent_steps_trained: 99960
    num_steps_sampled: 99960
    num_steps_trained: 99960
  iteration

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,10,1400.84,99960,0.906667,4.84,-1.73,95.2952




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 109956
  custom_metrics: {}
  date: 2021-11-07_13-03-47
  done: false
  episode_len_mean: 94.85849056603773
  episode_media: {}
  episode_reward_max: 5.040000000000006
  episode_reward_mean: 0.9728301886792473
  episode_reward_min: -2.0599999999999996
  episodes_this_iter: 106
  episodes_total: 1140
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.731305471240965
          entropy_coeff: 0.01
          kl: 0.019618779847453045
          policy_loss: -0.052315612175967294
          total_loss: 0.13186002302495464
          vf_explained_var: 0.6459531188011169
          vf_loss: 0.19162467405613925
    num_agent_steps_sampled: 109956
    num_agent_steps_trained: 109956
    num_steps_sampled: 109956
    num_steps_trained: 109956
  iterations

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,11,1549.93,109956,0.97283,5.04,-2.06,94.8585


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 119952
  custom_metrics: {}
  date: 2021-11-07_13-06-03
  done: false
  episode_len_mean: 97.37623762376238
  episode_media: {}
  episode_reward_max: 6.27000000000001
  episode_reward_mean: 0.8184158415841599
  episode_reward_min: -1.760000000000001
  episodes_this_iter: 101
  episodes_total: 1241
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.0125
          cur_lr: 0.00010000000000000002
          entropy: 2.7289472606447007
          entropy_coeff: 0.01
          kl: 0.020563775863684797
          policy_loss: -0.05436774108427802
          total_loss: 0.09429352682466424
          vf_explained_var: 0.6483254432678223
          vf_loss: 0.15512991710287383
    num_agent_steps_sampled: 119952
    num_agent_steps_trained: 119952
    num_steps_sampled: 119952
    num_steps_trained: 119952
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,12,1685.72,119952,0.818416,6.27,-1.76,97.3762


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 129948
  custom_metrics: {}
  date: 2021-11-07_13-08-17
  done: false
  episode_len_mean: 97.7378640776699
  episode_media: {}
  episode_reward_max: 7.110000000000012
  episode_reward_mean: 1.0252427184466038
  episode_reward_min: -1.7400000000000009
  episodes_this_iter: 103
  episodes_total: 1344
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.5187500000000005
          cur_lr: 0.00010000000000000002
          entropy: 2.7243337504884115
          entropy_coeff: 0.01
          kl: 0.017992767751852
          policy_loss: -0.060302324225967124
          total_loss: 0.0958034462014633
          vf_explained_var: 0.5844916701316833
          vf_loss: 0.156022593495237
    num_agent_steps_sampled: 129948
    num_agent_steps_trained: 129948
    num_steps_sampled: 129948
    num_steps_trained: 129948
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,13,1820.27,129948,1.02524,7.11,-1.74,97.7379




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 139944
  custom_metrics: {}
  date: 2021-11-07_13-10-49
  done: false
  episode_len_mean: 96.57692307692308
  episode_media: {}
  episode_reward_max: 6.9700000000000095
  episode_reward_mean: 0.8920192307692321
  episode_reward_min: -2.0100000000000007
  episodes_this_iter: 104
  episodes_total: 1448
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.5187500000000005
          cur_lr: 0.00010000000000000002
          entropy: 2.7175154745069325
          entropy_coeff: 0.01
          kl: 0.01688571123425493
          policy_loss: -0.057515333496799305
          total_loss: 0.10269569298299726
          vf_explained_var: 0.7024216651916504
          vf_loss: 0.16174100869390953
    num_agent_steps_sampled: 139944
    num_agent_steps_trained: 139944
    num_steps_sampled: 139944
    num_steps_trained: 139944

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,14,1972.3,139944,0.892019,6.97,-2.01,96.5769


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 149940
  custom_metrics: {}
  date: 2021-11-07_13-12-58
  done: false
  episode_len_mean: 96.92233009708738
  episode_media: {}
  episode_reward_max: 5.030000000000013
  episode_reward_mean: 0.9675728155339824
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 103
  episodes_total: 1551
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.5187500000000005
          cur_lr: 0.00010000000000000002
          entropy: 2.7206488605238435
          entropy_coeff: 0.01
          kl: 0.018552981338901573
          policy_loss: -0.06165777401567741
          total_loss: 0.10138632404295304
          vf_explained_var: 0.7040499448776245
          vf_loss: 0.16207324609033064
    num_agent_steps_sampled: 149940
    num_agent_steps_trained: 149940
    num_steps_sampled: 149940
    num_steps_trained: 149940
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,15,2101.26,149940,0.967573,5.03,-1.91,96.9223


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 159936
  custom_metrics: {}
  date: 2021-11-07_13-15-08
  done: false
  episode_len_mean: 97.91176470588235
  episode_media: {}
  episode_reward_max: 4.94000000000001
  episode_reward_mean: 0.7946078431372565
  episode_reward_min: -1.820000000000001
  episodes_this_iter: 102
  episodes_total: 1653
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.5187500000000005
          cur_lr: 0.00010000000000000002
          entropy: 2.7162166035073434
          entropy_coeff: 0.01
          kl: 0.01788260955443526
          policy_loss: -0.059286390386350675
          total_loss: 0.08364793523109039
          vf_explained_var: 0.7393493056297302
          vf_loss: 0.14293727800409253
    num_agent_steps_sampled: 159936
    num_agent_steps_trained: 159936
    num_steps_sampled: 159936
    num_steps_trained: 159936
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,16,2230.59,159936,0.794608,4.94,-1.82,97.9118


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 169932
  custom_metrics: {}
  date: 2021-11-07_13-17-20
  done: false
  episode_len_mean: 95.46666666666667
  episode_media: {}
  episode_reward_max: 6.6400000000000095
  episode_reward_mean: 0.9340952380952401
  episode_reward_min: -1.860000000000001
  episodes_this_iter: 105
  episodes_total: 1758
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 1.5187500000000005
          cur_lr: 0.00010000000000000002
          entropy: 2.7122392255016883
          entropy_coeff: 0.01
          kl: 0.02113112106516222
          policy_loss: -0.05204400480539863
          total_loss: 0.12250012070227626
          vf_explained_var: 0.6810734868049622
          vf_loss: 0.16957362833727374
    num_agent_steps_sampled: 169932
    num_agent_steps_trained: 169932
    num_steps_sampled: 169932
    num_steps_trained: 169932
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,17,2362.83,169932,0.934095,6.64,-1.86,95.4667




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 179928
  custom_metrics: {}
  date: 2021-11-07_13-19-46
  done: false
  episode_len_mean: 93.34579439252336
  episode_media: {}
  episode_reward_max: 5.060000000000012
  episode_reward_mean: 0.9314953271028059
  episode_reward_min: -1.7200000000000009
  episodes_this_iter: 107
  episodes_total: 1865
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7130565820596155
          entropy_coeff: 0.01
          kl: 0.015504323764823977
          policy_loss: -0.05911546085503959
          total_loss: 0.10715785208645746
          vf_explained_var: 0.7006322741508484
          vf_loss: 0.15808309013039892
    num_agent_steps_sampled: 179928
    num_agent_steps_trained: 179928
    num_steps_sampled: 179928
    num_steps_trained: 179928


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,18,2509.5,179928,0.931495,5.06,-1.72,93.3458


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 189924
  custom_metrics: {}
  date: 2021-11-07_13-21-59
  done: false
  episode_len_mean: 95.07619047619048
  episode_media: {}
  episode_reward_max: 6.6400000000000095
  episode_reward_mean: 1.1096190476190495
  episode_reward_min: -1.9900000000000009
  episodes_this_iter: 105
  episodes_total: 1970
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7099881056027533
          entropy_coeff: 0.01
          kl: 0.016072543428741575
          policy_loss: -0.052196202120022565
          total_loss: 0.1169102566274685
          vf_explained_var: 0.7402122616767883
          vf_loss: 0.1595910735667134
    num_agent_steps_sampled: 189924
    num_agent_steps_trained: 189924
    num_steps_sampled: 189924
    num_steps_trained: 189924


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,19,2642.02,189924,1.10962,6.64,-1.99,95.0762


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 199920
  custom_metrics: {}
  date: 2021-11-07_13-24-11
  done: false
  episode_len_mean: 95.44761904761904
  episode_media: {}
  episode_reward_max: 4.990000000000008
  episode_reward_mean: 1.0897142857142892
  episode_reward_min: -1.8000000000000007
  episodes_this_iter: 105
  episodes_total: 2075
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.700362457169427
          entropy_coeff: 0.01
          kl: 0.016706487531398827
          policy_loss: -0.056920925291093674
          total_loss: 0.11833684539649253
          vf_explained_var: 0.7622712254524231
          vf_loss: 0.16420192731878697
    num_agent_steps_sampled: 199920
    num_agent_steps_trained: 199920
    num_steps_sampled: 199920
    num_steps_trained: 199920


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,20,2773.96,199920,1.08971,4.99,-1.8,95.4476




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 209916
  custom_metrics: {}
  date: 2021-11-07_13-26-49
  done: false
  episode_len_mean: 94.8952380952381
  episode_media: {}
  episode_reward_max: 4.93000000000001
  episode_reward_mean: 0.8868571428571451
  episode_reward_min: -1.800000000000001
  episodes_this_iter: 105
  episodes_total: 2180
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7029424543054694
          entropy_coeff: 0.01
          kl: 0.015577950840711561
          policy_loss: -0.060171757663628006
          total_loss: 0.09002083634672105
          vf_explained_var: 0.7640243172645569
          vf_loss: 0.14173349899550278
    num_agent_steps_sampled: 209916
    num_agent_steps_trained: 209916
    num_steps_sampled: 209916
    num_steps_trained: 209916
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,21,2932.34,209916,0.886857,4.93,-1.8,94.8952




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 219912
  custom_metrics: {}
  date: 2021-11-07_13-29-10
  done: false
  episode_len_mean: 97.49019607843137
  episode_media: {}
  episode_reward_max: 8.540000000000013
  episode_reward_mean: 1.360588235294121
  episode_reward_min: -1.820000000000001
  episodes_this_iter: 102
  episodes_total: 2282
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.700569127359961
          entropy_coeff: 0.01
          kl: 0.01948220990775801
          policy_loss: -0.052184801796800025
          total_loss: 0.14508064652312522
          vf_explained_var: 0.7345572113990784
          vf_loss: 0.17988822873777305
    num_agent_steps_sampled: 219912
    num_agent_steps_trained: 219912
    num_steps_sampled: 219912
    num_steps_trained: 219912
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,22,3072.4,219912,1.36059,8.54,-1.82,97.4902


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 229908
  custom_metrics: {}
  date: 2021-11-07_13-31-18
  done: false
  episode_len_mean: 97.4423076923077
  episode_media: {}
  episode_reward_max: 7.070000000000011
  episode_reward_mean: 0.9260576923076942
  episode_reward_min: -1.9000000000000008
  episodes_this_iter: 104
  episodes_total: 2386
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.705834284806863
          entropy_coeff: 0.01
          kl: 0.015849795060138186
          policy_loss: -0.05672982235733083
          total_loss: 0.10344135139401679
          vf_explained_var: 0.772047758102417
          vf_loss: 0.1511217026453879
    num_agent_steps_sampled: 229908
    num_agent_steps_trained: 229908
    num_steps_sampled: 229908
    num_steps_trained: 229908
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,23,3201.2,229908,0.926058,7.07,-1.9,97.4423




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 239904
  custom_metrics: {}
  date: 2021-11-07_13-33-43
  done: false
  episode_len_mean: 95.50961538461539
  episode_media: {}
  episode_reward_max: 6.800000000000006
  episode_reward_mean: 0.6714423076923093
  episode_reward_min: -1.9300000000000013
  episodes_this_iter: 104
  episodes_total: 2490
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.705974098759839
          entropy_coeff: 0.01
          kl: 0.013654281641177511
          policy_loss: -0.06573103746533858
          total_loss: 0.04941376058866606
          vf_explained_var: 0.8229418396949768
          vf_loss: 0.11109837763823378
    num_agent_steps_sampled: 239904
    num_agent_steps_trained: 239904
    num_steps_sampled: 239904
    num_steps_trained: 239904
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,24,3345.48,239904,0.671442,6.8,-1.93,95.5096




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 249900
  custom_metrics: {}
  date: 2021-11-07_13-36-12
  done: false
  episode_len_mean: 95.5047619047619
  episode_media: {}
  episode_reward_max: 4.910000000000005
  episode_reward_mean: 0.8048571428571444
  episode_reward_min: -1.7200000000000006
  episodes_this_iter: 105
  episodes_total: 2595
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.707100103655432
          entropy_coeff: 0.01
          kl: 0.01598095140212039
          policy_loss: -0.06017398072163993
          total_loss: 0.07524562221976658
          vf_explained_var: 0.7774372696876526
          vf_loss: 0.12608399865273226
    num_agent_steps_sampled: 249900
    num_agent_steps_trained: 249900
    num_steps_sampled: 249900
    num_steps_trained: 249900
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,25,3494.95,249900,0.804857,4.91,-1.72,95.5048


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 259896
  custom_metrics: {}
  date: 2021-11-07_13-38-23
  done: false
  episode_len_mean: 96.46601941747574
  episode_media: {}
  episode_reward_max: 6.590000000000014
  episode_reward_mean: 1.0924271844660218
  episode_reward_min: -2.0399999999999996
  episodes_this_iter: 103
  episodes_total: 2698
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7021835814174424
          entropy_coeff: 0.01
          kl: 0.016764205971913936
          policy_loss: -0.05916271383976603
          total_loss: 0.0849987753404381
          vf_explained_var: 0.7875007390975952
          vf_loss: 0.1329923674559746
    num_agent_steps_sampled: 259896
    num_agent_steps_trained: 259896
    num_steps_sampled: 259896
    num_steps_trained: 259896
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,26,3625.94,259896,1.09243,6.59,-2.04,96.466


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 269892
  custom_metrics: {}
  date: 2021-11-07_13-40-35
  done: false
  episode_len_mean: 95.10377358490567
  episode_media: {}
  episode_reward_max: 6.4800000000000075
  episode_reward_mean: 1.113490566037738
  episode_reward_min: -1.9900000000000009
  episodes_this_iter: 106
  episodes_total: 2804
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.702842872163169
          entropy_coeff: 0.01
          kl: 0.017012666659806296
          policy_loss: -0.05642311986218382
          total_loss: 0.09115412494990553
          vf_explained_var: 0.7988400459289551
          vf_loss: 0.13584869124043064
    num_agent_steps_sampled: 269892
    num_agent_steps_trained: 269892
    num_steps_sampled: 269892
    num_steps_trained: 269892
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,27,3758.04,269892,1.11349,6.48,-1.99,95.1038




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 279888
  custom_metrics: {}
  date: 2021-11-07_13-43-02
  done: false
  episode_len_mean: 95.67961165048544
  episode_media: {}
  episode_reward_max: 6.910000000000019
  episode_reward_mean: 1.0958252427184494
  episode_reward_min: -2.0300000000000007
  episodes_this_iter: 103
  episodes_total: 2907
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.708665060997009
          entropy_coeff: 0.01
          kl: 0.016861953852492388
          policy_loss: -0.05944416807376397
          total_loss: 0.09076555278132327
          vf_explained_var: 0.7898940443992615
          vf_loss: 0.13888273092193737
    num_agent_steps_sampled: 279888
    num_agent_steps_trained: 279888
    num_steps_sampled: 279888
    num_steps_trained: 279888
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,28,3904.66,279888,1.09583,6.91,-2.03,95.6796




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 289884
  custom_metrics: {}
  date: 2021-11-07_13-45-37
  done: false
  episode_len_mean: 96.59615384615384
  episode_media: {}
  episode_reward_max: 8.830000000000013
  episode_reward_mean: 1.2093269230769261
  episode_reward_min: -2.0100000000000007
  episodes_this_iter: 104
  episodes_total: 3011
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.713868202918615
          entropy_coeff: 0.01
          kl: 0.01622758503200376
          policy_loss: -0.056005145744005794
          total_loss: 0.09591588506402456
          vf_explained_var: 0.7645260691642761
          vf_loss: 0.14209124455148847
    num_agent_steps_sampled: 289884
    num_agent_steps_trained: 289884
    num_steps_sampled: 289884
    num_steps_trained: 289884
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,29,4060.16,289884,1.20933,8.83,-2.01,96.5962


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 299880
  custom_metrics: {}
  date: 2021-11-07_13-47-48
  done: false
  episode_len_mean: 96.93203883495146
  episode_media: {}
  episode_reward_max: 6.480000000000012
  episode_reward_mean: 0.8754368932038863
  episode_reward_min: -1.94
  episodes_this_iter: 103
  episodes_total: 3114
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7122195904071513
          entropy_coeff: 0.01
          kl: 0.017025060210008686
          policy_loss: -0.06020125627198864
          total_loss: 0.09091485077028315
          vf_explained_var: 0.7639102339744568
          vf_loss: 0.13945308590514793
    num_agent_steps_sampled: 299880
    num_agent_steps_trained: 299880
    num_steps_sampled: 299880
    num_steps_trained: 299880
  iterations_s

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,30,4190.22,299880,0.875437,6.48,-1.94,96.932




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 309876
  custom_metrics: {}
  date: 2021-11-07_13-50-13
  done: false
  episode_len_mean: 94.15094339622641
  episode_media: {}
  episode_reward_max: 8.610000000000014
  episode_reward_mean: 1.276792452830191
  episode_reward_min: -1.840000000000001
  episodes_this_iter: 106
  episodes_total: 3220
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.709471003214518
          entropy_coeff: 0.01
          kl: 0.01819409205455538
          policy_loss: -0.054841221122142784
          total_loss: 0.11570147908298681
          vf_explained_var: 0.7465397119522095
          vf_loss: 0.156188993054068
    num_agent_steps_sampled: 309876
    num_agent_steps_trained: 309876
    num_steps_sampled: 309876
    num_steps_trained: 309876
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,31,4335.82,309876,1.27679,8.61,-1.84,94.1509




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 319872
  custom_metrics: {}
  date: 2021-11-07_13-52-38
  done: false
  episode_len_mean: 97.84313725490196
  episode_media: {}
  episode_reward_max: 6.590000000000013
  episode_reward_mean: 0.9889215686274534
  episode_reward_min: -1.990000000000001
  episodes_this_iter: 102
  episodes_total: 3322
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.712596161141355
          entropy_coeff: 0.01
          kl: 0.01599820323550902
          policy_loss: -0.05543722986078495
          total_loss: 0.08367633392693076
          vf_explained_var: 0.7451394200325012
          vf_loss: 0.12979361726623825
    num_agent_steps_sampled: 319872
    num_agent_steps_trained: 319872
    num_steps_sampled: 319872
    num_steps_trained: 319872
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,32,4480.68,319872,0.988922,6.59,-1.99,97.8431


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 329868
  custom_metrics: {}
  date: 2021-11-07_13-54-51
  done: false
  episode_len_mean: 99.46078431372548
  episode_media: {}
  episode_reward_max: 4.460000000000013
  episode_reward_mean: 0.8931372549019624
  episode_reward_min: -1.950000000000001
  episodes_this_iter: 102
  episodes_total: 3424
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.711072022283179
          entropy_coeff: 0.01
          kl: 0.01409713317381184
          policy_loss: -0.06184566554724852
          total_loss: 0.05098609736966095
          vf_explained_var: 0.7589762806892395
          vf_loss: 0.1078274502347295
    num_agent_steps_sampled: 329868
    num_agent_steps_trained: 329868
    num_steps_sampled: 329868
    num_steps_trained: 329868
  it

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,33,4613.52,329868,0.893137,4.46,-1.95,99.4608




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 339864
  custom_metrics: {}
  date: 2021-11-07_13-57-20
  done: false
  episode_len_mean: 97.09708737864078
  episode_media: {}
  episode_reward_max: 8.80000000000001
  episode_reward_mean: 1.0263106796116532
  episode_reward_min: -1.810000000000001
  episodes_this_iter: 103
  episodes_total: 3527
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7088166094233848
          entropy_coeff: 0.01
          kl: 0.01676516191609502
          policy_loss: -0.05642231985152143
          total_loss: 0.08512800530616083
          vf_explained_var: 0.8004439473152161
          vf_loss: 0.13044535623640457
    num_agent_steps_sampled: 339864
    num_agent_steps_trained: 339864
    num_steps_sampled: 339864
    num_steps_trained: 339864
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,34,4762.29,339864,1.02631,8.8,-1.81,97.0971




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 349860
  custom_metrics: {}
  date: 2021-11-07_13-59-45
  done: false
  episode_len_mean: 96.33009708737865
  episode_media: {}
  episode_reward_max: 8.590000000000014
  episode_reward_mean: 1.2452427184466042
  episode_reward_min: -2.05
  episodes_this_iter: 103
  episodes_total: 3630
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7061106787787543
          entropy_coeff: 0.01
          kl: 0.018915783407043214
          policy_loss: -0.054425117682315345
          total_loss: 0.11094976774769841
          vf_explained_var: 0.7687221169471741
          vf_loss: 0.14934347298426123
    num_agent_steps_sampled: 349860
    num_agent_steps_trained: 349860
    num_steps_sampled: 349860
    num_steps_trained: 349860
  iterations_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,35,4907.51,349860,1.24524,8.59,-2.05,96.3301


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 359856
  custom_metrics: {}
  date: 2021-11-07_14-01-54
  done: false
  episode_len_mean: 97.32038834951456
  episode_media: {}
  episode_reward_max: 6.480000000000011
  episode_reward_mean: 1.0744660194174787
  episode_reward_min: -1.9700000000000006
  episodes_this_iter: 103
  episodes_total: 3733
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.708618607887855
          entropy_coeff: 0.01
          kl: 0.016554946444030175
          policy_loss: -0.05340847213425834
          total_loss: 0.09604238723839398
          vf_explained_var: 0.8036810755729675
          vf_loss: 0.138822806513526
    num_agent_steps_sampled: 359856
    num_agent_steps_trained: 359856
    num_steps_sampled: 359856
    num_steps_trained: 359856
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,36,5036.66,359856,1.07447,6.48,-1.97,97.3204




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 369852
  custom_metrics: {}
  date: 2021-11-07_14-04-21
  done: false
  episode_len_mean: 95.88461538461539
  episode_media: {}
  episode_reward_max: 6.9700000000000095
  episode_reward_mean: 1.337019230769234
  episode_reward_min: -2.1899999999999986
  episodes_this_iter: 104
  episodes_total: 3837
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.698765781394437
          entropy_coeff: 0.01
          kl: 0.018428331711593263
          policy_loss: -0.04798429309330536
          total_loss: 0.1424753571843826
          vf_explained_var: 0.7764484882354736
          vf_loss: 0.1754652638720651
    num_agent_steps_sampled: 369852
    num_agent_steps_trained: 369852
    num_steps_sampled: 369852
    num_steps_trained: 369852
  i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,37,5183.14,369852,1.33702,6.97,-2.19,95.8846




Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 379848
  custom_metrics: {}
  date: 2021-11-07_14-06-45
  done: false
  episode_len_mean: 94.84761904761905
  episode_media: {}
  episode_reward_max: 6.9500000000000135
  episode_reward_mean: 1.451809523809527
  episode_reward_min: -1.8400000000000007
  episodes_this_iter: 105
  episodes_total: 3942
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7010525489464783
          entropy_coeff: 0.01
          kl: 0.015702452981060527
          policy_loss: -0.05412472604818745
          total_loss: 0.11406689591304191
          vf_explained_var: 0.7943245768547058
          vf_loss: 0.1594299938163569
    num_agent_steps_sampled: 379848
    num_agent_steps_trained: 379848
    num_steps_sampled: 379848
    num_steps_trained: 379848
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,38,5327.21,379848,1.45181,6.95,-1.84,94.8476


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 389844
  custom_metrics: {}
  date: 2021-11-07_14-08-57
  done: false
  episode_len_mean: 96.34615384615384
  episode_media: {}
  episode_reward_max: 8.510000000000012
  episode_reward_mean: 1.1890384615384642
  episode_reward_min: -2.17
  episodes_this_iter: 104
  episodes_total: 4046
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.698417137830685
          entropy_coeff: 0.01
          kl: 0.017353736332599973
          policy_loss: -0.05533135909447596
          total_loss: 0.10735864956205528
          vf_explained_var: 0.7998394966125488
          vf_loss: 0.1501401976101164
    num_agent_steps_sampled: 389844
    num_agent_steps_trained: 389844
    num_steps_sampled: 389844
    num_steps_trained: 389844
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,39,5459.36,389844,1.18904,8.51,-2.17,96.3462


Result for PPO_my_env_81db5_00000:
  agent_timesteps_total: 399840
  custom_metrics: {}
  date: 2021-11-07_14-11-08
  done: false
  episode_len_mean: 97.2621359223301
  episode_media: {}
  episode_reward_max: 6.610000000000017
  episode_reward_mean: 1.1024271844660216
  episode_reward_min: -1.7900000000000007
  episodes_this_iter: 103
  episodes_total: 4149
  experiment_id: fa07df177b244fd6a682580aebaab5aa
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 2.2781249999999997
          cur_lr: 0.00010000000000000002
          entropy: 2.7054897664958597
          entropy_coeff: 0.01
          kl: 0.014789822021000239
          policy_loss: -0.05670293285394521
          total_loss: 0.08354698522369632
          vf_explained_var: 0.7896628379821777
          vf_loss: 0.13361175131434813
    num_agent_steps_sampled: 399840
    num_agent_steps_trained: 399840
    num_steps_sampled: 399840
    num_steps_trained: 399840
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_81db5_00000,RUNNING,192.168.3.5:552473,40,5590.25,399840,1.10243,6.61,-1.79,97.2621


Process _WandbLoggingProcess-1:
Traceback (most recent call last):
  File "/root/miniconda/envs/py37/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/integration/wandb.py", line 200, in run
    result = self.queue.get()
  File "/root/miniconda/envs/py37/lib/python3.7/multiprocessing/queues.py", line 94, in get
    res = self._recv_bytes()
  File "/root/miniconda/envs/py37/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/root/miniconda/envs/py37/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/root/miniconda/envs/py37/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/root/miniconda/en

Traceback (most recent call last):
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ipython-7.25.0-py3.7.egg/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_552359/2342650933.py", line 34, in <module>
    checkpoint_at_end=True)
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/tune.py", line 532, in run
    runner.step()
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 554, in step
    self._process_events(timeout=timeout)  # blocking
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 675, in _process_events
    timeout=timeout)  # blocking
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 718, in get_next_available_trial
    ready, _ = ray.wait(shuffled_results, timeout=timeout)
  File "/root/miniconda/envs/py37/lib/python3.7/sit

TypeError: object of type 'NoneType' has no len()