In [1]:
import numpy as np

import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *



In [2]:
from ray.rllib.models.torch.recurrent_net import DeveloperAPI, RecurrentNetwork

In [3]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder producing one flat feature vector per image.

    The network is five Conv2d+ReLU stages followed by a Flatten, stored as
    ``self.cnn`` (an ``nn.Sequential``). Inputs must have 3 channels in
    channels-first layout. For 64x64 inputs the spatial size collapses to 1x1,
    so each image yields 512 features.
    """

    def __init__(self):
        super().__init__()

        # One row per convolution: (in_channels, out_channels, kernel_size, stride).
        conv_specs = (
            (3, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
            (64, 64, 3, 1),
            (64, 512, 2, 1),
        )

        stages = []
        for in_ch, out_ch, kernel, stride in conv_specs:
            stages.append(
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride, padding=0)
            )
            stages.append(nn.ReLU())
        stages.append(nn.Flatten())

        # Layer positions inside the Sequential define the state_dict keys
        # (cnn.0, cnn.2, ...), so the order above must not change.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run the convolutional stack on ``x`` and return the flattened result."""
        features = self.cnn(x)
        return features

In [4]:
from ray.rllib.policy.rnn_sequencing import add_time_dimension

class MyModelClass(RecurrentNetwork, nn.Module):
    """Recurrent actor-critic model: pretrained CNN encoder -> GRU -> linear heads.

    Each observation is encoded by ``VisualEncoder`` (weights loaded from disk),
    the per-step features are regrouped into sequences and passed through a
    single-layer GRU, and two linear heads produce the action logits and the
    state-value estimate.

    ``forward`` is overridden directly (rather than ``forward_rnn``) because the
    encoder has to run on the flat batch before the time dimension is added.

    Device placement is intentionally not done here. The original code moved
    only the encoder and the heads to CUDA (leaving the GRU behind) and called
    ``obs.cuda()`` without using the result, so it only worked when the caller
    relocated the whole module afterwards. The module is now left where it is
    constructed, and ``forward`` aligns its inputs with the parameters' device.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoder, GRU and heads.

        Args:
            obs_space: Observation space, forwarded to ``TorchModelV2``.
            action_space: Action space; ``action_space.n`` sizes the action head,
                so it must be discrete.
            num_outputs: Forwarded to ``TorchModelV2``.
            model_config: Model config dict; ``"_time_major"`` selects the GRU
                input layout (default ``False``, i.e. batch-first).
            name: Model name, forwarded to ``TorchModelV2``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        self.time_major = self.model_config.get("_time_major", False)

        # Pretrained visual encoder; its flattened output feeds the GRU.
        self.features_dim = 512
        self.encoder = VisualEncoder()
        self.encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AnnaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )

        self.gru_hidden_dim = 64
        self.gru = nn.GRU(self.features_dim, self.gru_hidden_dim, batch_first=not self.time_major)

        self.action_head = nn.Linear(self.gru_hidden_dim, action_space.n)
        self.value_head = nn.Linear(self.gru_hidden_dim, 1)
        # Value estimates of the most recent forward() call, read by value_function().
        self.last_value = None

        # Kept for backward compatibility with code that reads this flag. It no
        # longer triggers a partial `.cuda()` move of selected sub-modules.
        self.use_cuda = torch.cuda.is_available()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and the next recurrent state.

        Args:
            input_dict: Must contain ``'obs'``, a 4-D tensor whose last axis is
                the channel axis (it is permuted to channels-first here).
                Presumably 0-255 pixel values, since they are divided by 255.
            state: List whose first element is the GRU hidden state laid out as
                (batch, 1, hidden).
            seq_lens: 1-D array/tensor with one entry per sequence; only its
                length is used, to recover the padded sequence length.

        Returns:
            Tuple ``(logits, [new_hidden_state])`` where logits have one row per
            input observation and the state is laid out as (batch, 1, hidden).
        """
        # Tensor.cuda()/.to() are not in-place, so the result must be kept.
        device = next(self.parameters()).device
        obs = input_dict['obs'].to(device).permute(0, 3, 1, 2).float() / 255.0

        features = self.encoder(obs)

        # The flat batch is num_sequences * max_seq_len rows long.
        max_seq_len = features.shape[0] // len(seq_lens)
        inputs = add_time_dimension(
            features,
            max_seq_len=max_seq_len,
            framework="torch",
            time_major=self.time_major,
        )

        # (batch, 1, hidden) -> (1, batch, hidden), the layout nn.GRU expects for
        # h_0. permute() returns a strided view, so make it contiguous.
        h = state[0].to(device).permute(1, 0, 2).contiguous()
        output, new_h = self.gru(inputs, h)
        new_state = [new_h.permute(1, 0, 2)]

        # Collapse the batch and time axes again so the heads see one row per step.
        output = output.reshape(-1, self.gru_hidden_dim)

        action = self.action_head(output)
        self.last_value = self.value_head(output).squeeze(1)
        return action, new_state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimates computed by the most recent forward() call."""
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

    @override(TorchModelV2)
    def get_initial_state(self):
        """Return the zero hidden state for a single sample, shaped (1, hidden).

        forward() expects the batched version of this, (batch, 1, hidden), and
        permutes it into the (1, batch, hidden) layout used by the GRU.
        """
        return [torch.zeros(1, self.gru_hidden_dim)]

In [5]:
# Register MyModelClass under the name that the trainer config passes as
# "custom_model" ("my_torch_model").
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
import os
# Ask CUDA to enumerate devices in PCI bus order and expose only device 0.
# NOTE(review): these only take effect if set before CUDA is initialised in
# this process — confirm nothing earlier in the notebook touches the GPU.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

def env_creator(env_config):
    """Build the wrapped IGLU environment.

    Creates ``IGLUSilentBuilder-v0``, restricts it to the requested task
    preset, then applies ``PovOnlyWrapper`` and ``IgluActionWrapper`` in that
    order.

    Args:
        env_config: Optional dict-like of overrides. Recognised keys:
            ``"max_steps"`` (default ``1000``) and ``"task_preset"``
            (default ``['C17']``). ``None`` or an empty mapping reproduces the
            previous hard-coded behaviour.

    Returns:
        The fully wrapped environment instance.
    """
    overrides = env_config or {}
    max_steps = overrides.get("max_steps", 1000)
    task_preset = list(overrides.get("task_preset", ['C17']))

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=task_preset))
    env = PovOnlyWrapper(env)
    env = IgluActionWrapper(env)
    return env

from ray.tune.registry import register_env
# Make env_creator available under the name "my_env", which the trainer
# config uses as its "env" entry.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Launch PPO training through Tune and keep the returned object in `analysis`.
# No `stop` argument is given, so nothing in this call bounds the run length
# (presumably it runs until interrupted — confirm that is intended).
analysis = tune.run(PPOTrainer, 
         config={
             # Environment name registered via register_env.
             "env": "my_env", 
             "framework": "torch",
             "num_gpus": 1,
             "num_workers": 1,
             # PPO optimisation hyper-parameters.
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 1000,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             # Settings for the Weights & Biases run (used with the WandbLogger passed below).
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO C17 pretrained (AnnaCNN) + GRU",
                      "notes": "camera noop removed from actions"
                  }
              }

        },
        loggers=[WandbLogger])



Trial name,status,loc
PPO_my_env_2f3c2_00000,PENDING,


2021-10-01 10:20:22,097	INFO wandb.py:170 -- Already logged into W&B.
2021-10-01 10:20:22,110	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.3 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=9363)[0m 2021-10-01 10:20:26,161	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=9363)[0m 2021-10-01 10:20:26,161	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 1000
  custom_metrics: {}
  date: 2021-10-01_10-21-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.703709946738349
          entropy_coeff: 0.009999999999999998
          kl: 0.00965274962776467
          policy_loss: -0.0923098218627274
          total_loss: -0.11293265991824025
          vf_explained_var: 0.03169650956988335
          vf_loss: 0.0044837128991882
    num_agent_steps_sampled: 1000
    num_agent_steps_trained: 1000
    num_steps_sampled: 1000
    num_steps_trained: 1000
  iterations_since_restore: 1
  node_ip: 192.168.1.100
  num_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,1,66.3465,1000,0,0,0,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 2000
  custom_metrics: {}
  date: 2021-10-01_10-21-51
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 1
  episodes_total: 2
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.739139864179823
          entropy_coeff: 0.009999999999999998
          kl: 0.009254969133539218
          policy_loss: 0.18727723095152113
          total_loss: 0.16425911539958582
          vf_explained_var: 0.4264393150806427
          vf_loss: 0.002522287904866971
    num_agent_steps_sampled: 2000
    num_agent_steps_trained: 2000
    num_steps_sampled: 2000
    num_steps_trained: 2000
  iterations_since_restore: 2
  node_ip: 192.168.1.100
  num

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,2,78.684,2000,0,0,0,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 3000
  custom_metrics: {}
  date: 2021-10-01_10-22-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -1.0
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 3
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.73066815800137
          entropy_coeff: 0.009999999999999998
          kl: 0.006969921565658385
          policy_loss: 0.12619963967137868
          total_loss: 0.13434874812761943
          vf_explained_var: 0.4080866575241089
          vf_loss: 0.034061807580292223
    num_agent_steps_sampled: 3000
    num_agent_steps_trained: 3000
    num_steps_sampled: 3000
    num_steps_trained: 3000
  iterations_since_restore: 3
  node_ip: 192.168.1.100
  nu

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,3,91.7229,3000,-1,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 4000
  custom_metrics: {}
  date: 2021-10-01_10-22-16
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.75
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 4
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7435591565238107
          entropy_coeff: 0.009999999999999998
          kl: 0.008900370724115822
          policy_loss: 0.19691290573941336
          total_loss: 0.17288700342178345
          vf_explained_var: 0.3730916678905487
          vf_loss: 0.0016296139403291616
    num_agent_steps_sampled: 4000
    num_agent_steps_trained: 4000
    num_steps_sampled: 4000
    num_steps_trained: 4000
  iterations_since_restore: 4
  node_ip: 192.168.1.100


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,4,103.083,4000,-0.75,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 5000
  custom_metrics: {}
  date: 2021-10-01_10-22-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.6
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 5
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7383231427934436
          entropy_coeff: 0.009999999999999998
          kl: 0.008033587776009616
          policy_loss: 0.03687968982590569
          total_loss: 0.013218507046500842
          vf_explained_var: -0.7381928563117981
          vf_loss: 0.0021153326482615535
    num_agent_steps_sampled: 5000
    num_agent_steps_trained: 5000
    num_steps_sampled: 5000
    num_steps_trained: 5000
  iterations_since_restore: 5
  node_ip: 192.168.1.100

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,5,114.906,5000,-0.6,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 6000
  custom_metrics: {}
  date: 2021-10-01_10-22-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.5
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 6
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.744913864135742
          entropy_coeff: 0.009999999999999998
          kl: 0.0076107483911666465
          policy_loss: 0.19132733868641985
          total_loss: 0.16749370172862352
          vf_explained_var: -0.16877679526805878
          vf_loss: 0.0020933512641931884
    num_agent_steps_sampled: 6000
    num_agent_steps_trained: 6000
    num_steps_sampled: 6000
    num_steps_trained: 6000
  iterations_since_restore: 6
  node_ip: 192.168.1.100

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,6,126.427,6000,-0.5,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 7000
  custom_metrics: {}
  date: 2021-10-01_10-22-52
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.42857142857142855
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 7
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6988585048251683
          entropy_coeff: 0.009999999999999998
          kl: 0.007876575393220444
          policy_loss: 0.07629010923103326
          total_loss: 0.052838297767771616
          vf_explained_var: -0.9252479672431946
          vf_loss: 0.001961457618098292
    num_agent_steps_sampled: 7000
    num_agent_steps_trained: 7000
    num_steps_sampled: 7000
    num_steps_trained: 7000
  iterations_since_restore: 7
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,7,139.11,7000,-0.428571,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 8000
  custom_metrics: {}
  date: 2021-10-01_10-23-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.375
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 8
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7230493836932714
          entropy_coeff: 0.009999999999999998
          kl: 0.011894338149331891
          policy_loss: 0.05437346007092856
          total_loss: 0.03240594930248335
          vf_explained_var: 0.1587696671485901
          vf_loss: 0.002884113597166207
    num_agent_steps_sampled: 8000
    num_agent_steps_trained: 8000
    num_steps_sampled: 8000
    num_steps_trained: 8000
  iterations_since_restore: 8
  node_ip: 192.168.1.100


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,8,151.111,8000,-0.375,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 9000
  custom_metrics: {}
  date: 2021-10-01_10-23-16
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.3333333333333333
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 9
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7217303117116294
          entropy_coeff: 0.009999999999999998
          kl: 0.010318207712395031
          policy_loss: -0.07122858320362865
          total_loss: -0.09359369141360124
          vf_explained_var: -0.23562747240066528
          vf_loss: 0.002788553085540318
    num_agent_steps_sampled: 9000
    num_agent_steps_trained: 9000
    num_steps_sampled: 9000
    num_steps_trained: 9000
  iterations_since_restore: 9
  node_i

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,9,162.902,9000,-0.333333,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 10000
  custom_metrics: {}
  date: 2021-10-01_10-23-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.3
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 10
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.712425473001268
          entropy_coeff: 0.009999999999999998
          kl: 0.008882743794480829
          policy_loss: 0.05272905917631255
          total_loss: 0.029905850662746362
          vf_explained_var: -0.6862989068031311
          vf_loss: 0.002524496318720695
    num_agent_steps_sampled: 10000
    num_agent_steps_trained: 10000
    num_steps_sampled: 10000
    num_steps_trained: 10000
  iterations_since_restore: 10
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,10,174.612,10000,-0.3,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 11000
  custom_metrics: {}
  date: 2021-10-01_10-23-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.2727272727272727
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 11
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6983704725901285
          entropy_coeff: 0.009999999999999998
          kl: 0.00930188855091623
          policy_loss: -0.041410812021543585
          total_loss: -0.06410642845763101
          vf_explained_var: -0.8949530720710754
          vf_loss: 0.0024277069667328357
    num_agent_steps_sampled: 11000
    num_agent_steps_trained: 11000
    num_steps_sampled: 11000
    num_steps_trained: 11000
  iterations_since_restore: 11
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,11,186.36,11000,-0.272727,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 12000
  custom_metrics: {}
  date: 2021-10-01_10-23-51
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.25
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 12
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7235150416692098
          entropy_coeff: 0.009999999999999998
          kl: 0.010566616058651477
          policy_loss: -0.035482479590508674
          total_loss: -0.056249481646551025
          vf_explained_var: -0.4826214611530304
          vf_loss: 0.004354823634235395
    num_agent_steps_sampled: 12000
    num_agent_steps_trained: 12000
    num_steps_sampled: 12000
    num_steps_trained: 12000
  iterations_since_restore: 12
  node_ip: 192

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,12,198.602,12000,-0.25,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 13000
  custom_metrics: {}
  date: 2021-10-01_10-24-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.23076923076923078
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 13
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.71121363374922
          entropy_coeff: 0.009999999999999998
          kl: 0.011089640761362244
          policy_loss: -0.006153399538662698
          total_loss: -0.02784951444508326
          vf_explained_var: -0.6530073881149292
          vf_loss: 0.0031980905862939025
    num_agent_steps_sampled: 13000
    num_agent_steps_trained: 13000
    num_steps_sampled: 13000
    num_steps_trained: 13000
  iterations_since_restore: 13
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,13,210.862,13000,-0.230769,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 14000
  custom_metrics: {}
  date: 2021-10-01_10-24-16
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.21428571428571427
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 14
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.723418500688341
          entropy_coeff: 0.009999999999999998
          kl: 0.005954186678036984
          policy_loss: -0.05089437961578369
          total_loss: -0.07478558679835665
          vf_explained_var: -0.6406571865081787
          vf_loss: 0.0021521408519119076
    num_agent_steps_sampled: 14000
    num_agent_steps_trained: 14000
    num_steps_sampled: 14000
    num_steps_trained: 14000
  iterations_since_restore: 14
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,14,222.855,14000,-0.214286,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 15000
  custom_metrics: {}
  date: 2021-10-01_10-24-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.2
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 15
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7303375800450644
          entropy_coeff: 0.009999999999999998
          kl: 0.009546727111153747
          policy_loss: 0.004513467084487072
          total_loss: -0.018852394757171473
          vf_explained_var: -0.4909204840660095
          vf_loss: 0.002028166020560699
    num_agent_steps_sampled: 15000
    num_agent_steps_trained: 15000
    num_steps_sampled: 15000
    num_steps_trained: 15000
  iterations_since_restore: 15
  node_ip: 192.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,15,234.056,15000,-0.2,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 16000
  custom_metrics: {}
  date: 2021-10-01_10-24-39
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.1875
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 16
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.682371507750617
          entropy_coeff: 0.009999999999999998
          kl: 0.012422928624433647
          policy_loss: -0.05420239522225327
          total_loss: -0.07525481371105545
          vf_explained_var: -0.21129010617733002
          vf_loss: 0.0032867091248691494
    num_agent_steps_sampled: 16000
    num_agent_steps_trained: 16000
    num_steps_sampled: 16000
    num_steps_trained: 16000
  iterations_since_restore: 16
  node_ip: 19

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,16,245.951,16000,-0.1875,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 17000
  custom_metrics: {}
  date: 2021-10-01_10-24-51
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.17647058823529413
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 17
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.632796944512261
          entropy_coeff: 0.009999999999999998
          kl: 0.008543929694583594
          policy_loss: -0.11504525161451763
          total_loss: -0.13681527227163315
          vf_explained_var: -0.4962610900402069
          vf_loss: 0.0028491648017532297
    num_agent_steps_sampled: 17000
    num_agent_steps_trained: 17000
    num_steps_sampled: 17000
    num_steps_trained: 17000
  iterations_since_restore: 17
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,17,258.58,17000,-0.176471,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 18000
  custom_metrics: {}
  date: 2021-10-01_10-25-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.16666666666666666
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 18
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.667787048551771
          entropy_coeff: 0.009999999999999998
          kl: 0.009053410413795709
          policy_loss: 0.15276608020067214
          total_loss: 0.13231454350882108
          vf_explained_var: -0.4116354286670685
          vf_loss: 0.00441565242751191
    num_agent_steps_sampled: 18000
    num_agent_steps_trained: 18000
    num_steps_sampled: 18000
    num_steps_trained: 18000
  iterations_since_restore: 18
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,18,271.299,18000,-0.166667,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 19000
  custom_metrics: {}
  date: 2021-10-01_10-25-15
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.15789473684210525
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 19
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7340723461574976
          entropy_coeff: 0.009999999999999998
          kl: 0.008657786306896299
          policy_loss: -0.039854896831093355
          total_loss: -0.0632725209929049
          vf_explained_var: -0.5479024648666382
          vf_loss: 0.0021915383426757114
    num_agent_steps_sampled: 19000
    num_agent_steps_trained: 19000
    num_steps_sampled: 19000
    num_steps_trained: 19000
  iterations_since_restore: 19


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,19,282.522,19000,-0.157895,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 20000
  custom_metrics: {}
  date: 2021-10-01_10-25-28
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.15
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 20
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6943329572677612
          entropy_coeff: 0.009999999999999998
          kl: 0.01019458976984706
          policy_loss: -0.008706297725439072
          total_loss: -0.031437198321024575
          vf_explained_var: -0.581558346748352
          vf_loss: 0.0021735133027606128
    num_agent_steps_sampled: 20000
    num_agent_steps_trained: 20000
    num_steps_sampled: 20000
    num_steps_trained: 20000
  iterations_since_restore: 20
  node_ip: 192.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,20,294.794,20000,-0.15,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 21000
  custom_metrics: {}
  date: 2021-10-01_10-25-40
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.14285714285714285
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 21
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7036946720547146
          entropy_coeff: 0.009999999999999998
          kl: 0.00753261841578989
          policy_loss: -0.1357502227322483
          total_loss: -0.15676699148542766
          vf_explained_var: -0.4938938021659851
          vf_loss: 0.004513651254819706
    num_agent_steps_sampled: 21000
    num_agent_steps_trained: 21000
    num_steps_sampled: 21000
    num_steps_trained: 21000
  iterations_since_restore: 21
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,21,307.41,21000,-0.142857,0,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 22000
  custom_metrics: {}
  date: 2021-10-01_10-25-52
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.045454545454545456
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 22
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.690987375047472
          entropy_coeff: 0.009999999999999998
          kl: 0.008689853599971513
          policy_loss: -0.12039416581392288
          total_loss: -0.10324848973088795
          vf_explained_var: 0.3901374936103821
          vf_loss: 0.04231757907998852
    num_agent_steps_sampled: 22000
    num_agent_steps_trained: 22000
    num_steps_sampled: 22000
    num_steps_trained: 22000
  iterations_since_restore: 22
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,22,319.089,22000,-0.0454545,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 23000
  custom_metrics: {}
  date: 2021-10-01_10-26-04
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.043478260869565216
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 23
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7255021810531614
          entropy_coeff: 0.009999999999999998
          kl: 0.010211754657593896
          policy_loss: -0.1937797842754258
          total_loss: -0.216247995197773
          vf_explained_var: -0.28861263394355774
          vf_loss: 0.0027444567932333385
    num_agent_steps_sampled: 23000
    num_agent_steps_trained: 23000
    num_steps_sampled: 23000
    num_steps_trained: 23000
  iterations_since_restore: 23
 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,23,330.989,23000,-0.0434783,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 24000
  custom_metrics: {}
  date: 2021-10-01_10-26-15
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.041666666666666664
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 24
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7357361131244238
          entropy_coeff: 0.009999999999999998
          kl: 0.006138144202830099
          policy_loss: -0.15744254146185185
          total_loss: -0.18149308090408642
          vf_explained_var: 0.10192135721445084
          vf_loss: 0.002079194350193979
    num_agent_steps_sampled: 24000
    num_agent_steps_trained: 24000
    num_steps_sampled: 24000
    num_steps_trained: 24000
  iterations_since_restore: 24


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,24,342.254,24000,-0.0416667,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 25000
  custom_metrics: {}
  date: 2021-10-01_10-26-27
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.04
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 25
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.734258540471395
          entropy_coeff: 0.009999999999999998
          kl: 0.009176388143457625
          policy_loss: -0.12917184498575
          total_loss: -0.15330548588600423
          vf_explained_var: -0.7588332295417786
          vf_loss: 0.0013736655708473538
    num_agent_steps_sampled: 25000
    num_agent_steps_trained: 25000
    num_steps_sampled: 25000
    num_steps_trained: 25000
  iterations_since_restore: 25
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,25,353.445,25000,-0.04,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 26000
  custom_metrics: {}
  date: 2021-10-01_10-26-38
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.038461538461538464
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 26
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7031096537907917
          entropy_coeff: 0.009999999999999998
          kl: 0.011837801763319182
          policy_loss: -0.0032949855939174693
          total_loss: -0.025255027216755682
          vf_explained_var: -0.8992470502853394
          vf_loss: 0.002703494913940732
    num_agent_steps_sampled: 26000
    num_agent_steps_trained: 26000
    num_steps_sampled: 26000
    num_steps_trained: 26000
  iterations_since_restore: 

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,26,364.502,26000,-0.0384615,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 27000
  custom_metrics: {}
  date: 2021-10-01_10-26-49
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.1111111111111111
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 27
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7134987115859985
          entropy_coeff: 0.009999999999999998
          kl: 0.010321515849314915
          policy_loss: 0.0418708071940475
          total_loss: 0.10698903934357482
          vf_explained_var: -0.06141440197825432
          vf_loss: 0.09018891727642363
    num_agent_steps_sampled: 27000
    num_agent_steps_trained: 27000
    num_steps_sampled: 27000
    num_steps_trained: 27000
  iterations_since_restore: 27
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,27,376.152,27000,-0.111111,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 28000
  custom_metrics: {}
  date: 2021-10-01_10-27-00
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.10714285714285714
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 28
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.722605013847351
          entropy_coeff: 0.009999999999999998
          kl: 0.007869530889055218
          policy_loss: -0.05946897849337095
          total_loss: -0.08335346584661035
          vf_explained_var: -1.0
          vf_loss: 0.0017676554154604674
    num_agent_steps_sampled: 28000
    num_agent_steps_trained: 28000
    num_steps_sampled: 28000
    num_steps_trained: 28000
  iterations_since_restore: 28
  node_ip: 192.1

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,28,387.282,28000,-0.107143,2,-3,1000


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 29000
  custom_metrics: {}
  date: 2021-10-01_10-27-12
  done: false
  episode_len_mean: 1000.0
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.10344827586206896
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 29
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7282517035802205
          entropy_coeff: 0.009999999999999998
          kl: 0.006468207584613881
          policy_loss: -0.018372396232249837
          total_loss: -0.042629944789870124
          vf_explained_var: -0.844727635383606
          vf_loss: 0.001731329189432371
    num_agent_steps_sampled: 29000
    num_agent_steps_trained: 29000
    num_steps_sampled: 29000
    num_steps_trained: 29000
  iterations_since_restore: 29


Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,29,398.669,29000,-0.103448,2,-3,1000




Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 30000
  custom_metrics: {}
  date: 2021-10-01_10-27-42
  done: false
  episode_len_mean: 996.0333333333333
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.1
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 30
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7072768105400935
          entropy_coeff: 0.009999999999999998
          kl: 0.00629696250512123
          policy_loss: -0.08779191594156954
          total_loss: -0.1101781387709909
          vf_explained_var: -1.0
          vf_loss: 0.003427151375217363
    num_agent_steps_sampled: 30000
    num_agent_steps_trained: 30000
    num_steps_sampled: 30000
    num_steps_trained: 30000
  iterations_since_restore: 30
  node_ip: 192.168.1.10

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,30,428.24,30000,-0.1,2,-3,996.033


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 31000
  custom_metrics: {}
  date: 2021-10-01_10-27-56
  done: false
  episode_len_mean: 996.1612903225806
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.0967741935483871
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 31
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.724403484662374
          entropy_coeff: 0.009999999999999998
          kl: 0.005917082343840439
          policy_loss: -0.002041113707754347
          total_loss: -0.027149839285347196
          vf_explained_var: -0.9990139603614807
          vf_loss: 0.0009518911441167196
    num_agent_steps_sampled: 31000
    num_agent_steps_trained: 31000
    num_steps_sampled: 31000
    num_steps_trained: 31000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,31,442.423,31000,-0.0967742,2,-3,996.161


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 32000
  custom_metrics: {}
  date: 2021-10-01_10-28-09
  done: false
  episode_len_mean: 996.28125
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.09375
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 32
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6959205812878078
          entropy_coeff: 0.009999999999999998
          kl: 0.008702416074404577
          policy_loss: -0.03678029063675139
          total_loss: -0.029355961084365844
          vf_explained_var: -0.1736154854297638
          vf_loss: 0.032643050897038645
    num_agent_steps_sampled: 32000
    num_agent_steps_trained: 32000
    num_steps_sampled: 32000
    num_steps_trained: 32000
  iterations_since_restore: 32
  node_ip

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,32,455.538,32000,-0.09375,2,-3,996.281


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 33000
  custom_metrics: {}
  date: 2021-10-01_10-28-22
  done: false
  episode_len_mean: 996.3939393939394
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.09090909090909091
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 33
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.695132573445638
          entropy_coeff: 0.009999999999999998
          kl: 0.009163244326590811
          policy_loss: -0.06481612101197243
          total_loss: -0.02368549071252346
          vf_explained_var: -0.5608433485031128
          vf_loss: 0.06624930936830221
    num_agent_steps_sampled: 33000
    num_agent_steps_trained: 33000
    num_steps_sampled: 33000
    num_steps_trained: 33000
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,33,468.724,33000,-0.0909091,2,-3,996.394


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 34000
  custom_metrics: {}
  date: 2021-10-01_10-28-35
  done: false
  episode_len_mean: 996.5
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.08823529411764706
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 34
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.689849093225267
          entropy_coeff: 0.009999999999999998
          kl: 0.009978526611128196
          policy_loss: -0.06044511381122801
          total_loss: -0.08141781878140238
          vf_explained_var: -0.20683643221855164
          vf_loss: 0.003930079559278157
    num_agent_steps_sampled: 34000
    num_agent_steps_trained: 34000
    num_steps_sampled: 34000
    num_steps_trained: 34000
  iterations_since_restore: 34
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,34,481.231,34000,-0.0882353,2,-3,996.5


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 35000
  custom_metrics: {}
  date: 2021-10-01_10-28-48
  done: false
  episode_len_mean: 996.6
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.08571428571428572
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 35
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7104939248826767
          entropy_coeff: 0.009999999999999998
          kl: 0.00803047145813107
          policy_loss: 0.024992129392921926
          total_loss: 0.000952526581628869
          vf_explained_var: -0.1752758026123047
          vf_loss: 0.0014592402763406022
    num_agent_steps_sampled: 35000
    num_agent_steps_trained: 35000
    num_steps_sampled: 35000
    num_steps_trained: 35000
  iterations_since_restore: 35
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,35,494.116,35000,-0.0857143,2,-3,996.6


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 36000
  custom_metrics: {}
  date: 2021-10-01_10-29-00
  done: false
  episode_len_mean: 996.6944444444445
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.08333333333333333
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 36
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7201423433091905
          entropy_coeff: 0.009999999999999998
          kl: 0.006079422675809076
          policy_loss: -0.00865871202485222
          total_loss: -0.033352758538805774
          vf_explained_var: -0.8657673597335815
          vf_loss: 0.001291493815369904
    num_agent_steps_sampled: 36000
    num_agent_steps_trained: 36000
    num_steps_sampled: 36000
    num_steps_trained: 36000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,36,506.896,36000,-0.0833333,2,-3,996.694


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 37000
  custom_metrics: {}
  date: 2021-10-01_10-29-13
  done: false
  episode_len_mean: 996.7837837837837
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.08108108108108109
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 37
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.692693249384562
          entropy_coeff: 0.009999999999999998
          kl: 0.008193924307821001
          policy_loss: -0.06418568429847558
          total_loss: -0.08710422503451506
          vf_explained_var: -1.0
          vf_loss: 0.0023696053385113677
    num_agent_steps_sampled: 37000
    num_agent_steps_trained: 37000
    num_steps_sampled: 37000
    num_steps_trained: 37000
  iterations_since_restore: 37
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,37,519.267,37000,-0.0810811,2,-3,996.784


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 38000
  custom_metrics: {}
  date: 2021-10-01_10-29-27
  done: false
  episode_len_mean: 996.8684210526316
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.07894736842105263
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 38
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.704553238550822
          entropy_coeff: 0.009999999999999998
          kl: 0.0088088931476096
          policy_loss: -0.021105708016289606
          total_loss: 0.06386137321694857
          vf_explained_var: 0.12875548005104065
          vf_loss: 0.11025083062316601
    num_agent_steps_sampled: 38000
    num_agent_steps_trained: 38000
    num_steps_sampled: 38000
    num_steps_trained: 38000
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,38,532.978,38000,-0.0789474,2,-3,996.868


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 39000
  custom_metrics: {}
  date: 2021-10-01_10-29-39
  done: false
  episode_len_mean: 996.9487179487179
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.07692307692307693
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 39
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.705911625756158
          entropy_coeff: 0.009999999999999998
          kl: 0.00957933652233461
          policy_loss: -0.029268877477281623
          total_loss: -0.050722723236928384
          vf_explained_var: 0.04848836734890938
          vf_loss: 0.0036894004145223234
    num_agent_steps_sampled: 39000
    num_agent_steps_trained: 39000
    num_steps_sampled: 39000
    num_steps_trained: 39000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,39,545.373,39000,-0.0769231,2,-3,996.949


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 40000
  custom_metrics: {}
  date: 2021-10-01_10-29-52
  done: false
  episode_len_mean: 997.025
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.1
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 40
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.710772024260627
          entropy_coeff: 0.009999999999999998
          kl: 0.0084305051749667
          policy_loss: -0.01610534629888005
          total_loss: -0.010575517184204526
          vf_explained_var: 0.1123543307185173
          vf_loss: 0.030951444758102298
    num_agent_steps_sampled: 40000
    num_agent_steps_trained: 40000
    num_steps_sampled: 40000
    num_steps_trained: 40000
  iterations_since_restore: 40
  node_ip: 192.168.

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,40,558.075,40000,-0.1,2,-3,997.025


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 41000
  custom_metrics: {}
  date: 2021-10-01_10-30-03
  done: false
  episode_len_mean: 997.0975609756098
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.0975609756097561
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 41
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7209715366363527
          entropy_coeff: 0.009999999999999998
          kl: 0.005702868451177167
          policy_loss: -0.03095887965626187
          total_loss: -0.05586780409018199
          vf_explained_var: -0.3401646018028259
          vf_loss: 0.0011602171832540383
    num_agent_steps_sampled: 41000
    num_agent_steps_trained: 41000
    num_steps_sampled: 41000
    num_steps_trained: 41000
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,41,569.877,41000,-0.097561,2,-3,997.098


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 42000
  custom_metrics: {}
  date: 2021-10-01_10-30-15
  done: false
  episode_len_mean: 997.1666666666666
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.09523809523809523
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 42
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.7336245351367525
          entropy_coeff: 0.009999999999999998
          kl: 0.0065880413232153245
          policy_loss: -0.12184563784135713
          total_loss: -0.14717358267969555
          vf_explained_var: -0.8765683770179749
          vf_loss: 0.000690691123357586
    num_agent_steps_sampled: 42000
    num_agent_steps_trained: 42000
    num_steps_sampled: 42000
    num_steps_trained: 42000
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,42,581.686,42000,-0.0952381,2,-3,997.167


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 43000
  custom_metrics: {}
  date: 2021-10-01_10-30-28
  done: false
  episode_len_mean: 997.2325581395348
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.06976744186046512
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 43
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.676725482940674
          entropy_coeff: 0.009999999999999998
          kl: 0.006663335742310809
          policy_loss: -0.06016012549904796
          total_loss: -0.00682949208550983
          vf_explained_var: -0.2617712914943695
          vf_loss: 0.07876522023838738
    num_agent_steps_sampled: 43000
    num_agent_steps_trained: 43000
    num_steps_sampled: 43000
    num_steps_trained: 43000
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,43,594.466,43000,-0.0697674,2,-3,997.233


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 44000
  custom_metrics: {}
  date: 2021-10-01_10-30-41
  done: false
  episode_len_mean: 997.2954545454545
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.13636363636363635
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 44
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6847104999754166
          entropy_coeff: 0.009999999999999998
          kl: 0.009909645461298498
          policy_loss: 0.035555445425496955
          total_loss: 0.060869502360259904
          vf_explained_var: 0.3641158938407898
          vf_loss: 0.05017923130136397
    num_agent_steps_sampled: 44000
    num_agent_steps_trained: 44000
    num_steps_sampled: 44000
    num_steps_trained: 44000
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,44,607.338,44000,-0.136364,2,-3,997.295


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 45000
  custom_metrics: {}
  date: 2021-10-01_10-30-52
  done: false
  episode_len_mean: 997.3555555555556
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.13333333333333333
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 45
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.672575275103251
          entropy_coeff: 0.009999999999999998
          kl: 0.006145960435465778
          policy_loss: -0.06824239546226131
          total_loss: -0.09308898529658714
          vf_explained_var: -0.8498677015304565
          vf_loss: 0.0006499725672054208
    num_agent_steps_sampled: 45000
    num_agent_steps_trained: 45000
    num_steps_sampled: 45000
    num_steps_trained: 45000
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,45,618.637,45000,-0.133333,2,-3,997.356


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 46000
  custom_metrics: {}
  date: 2021-10-01_10-31-05
  done: false
  episode_len_mean: 997.4130434782609
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.15217391304347827
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 46
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.676489814122518
          entropy_coeff: 0.009999999999999998
          kl: 0.008103108883968218
          policy_loss: -0.008482361543509695
          total_loss: 0.026142248345745935
          vf_explained_var: 0.47413334250450134
          vf_loss: 0.059768886774286836
    num_agent_steps_sampled: 46000
    num_agent_steps_trained: 46000
    num_steps_sampled: 46000
    num_steps_trained: 46000
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,46,631.139,46000,-0.152174,2,-3,997.413


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 47000
  custom_metrics: {}
  date: 2021-10-01_10-31-18
  done: false
  episode_len_mean: 997.468085106383
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.19148936170212766
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 47
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.6724461767408583
          entropy_coeff: 0.009999999999999998
          kl: 0.009982950807639071
          policy_loss: -0.01870488367146916
          total_loss: 0.02716317938433753
          vf_explained_var: 0.21686363220214844
          vf_loss: 0.07059593113760153
    num_agent_steps_sampled: 47000
    num_agent_steps_trained: 47000
    num_steps_sampled: 47000
    num_steps_trained: 47000
  iterations_since_resto

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,47,644.34,47000,-0.191489,2,-3,997.468


Result for PPO_my_env_2f3c2_00000:
  agent_timesteps_total: 48000
  custom_metrics: {}
  date: 2021-10-01_10-31-30
  done: false
  episode_len_mean: 997.5208333333334
  episode_media: {}
  episode_reward_max: 2.0
  episode_reward_mean: -0.1875
  episode_reward_min: -3.0
  episodes_this_iter: 1
  episodes_total: 48
  experiment_id: ba18be64619d4801b3ca85757cadc21e
  hostname: linar-B360M-D2V
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999996
          cur_lr: 5.000000000000001e-05
          entropy: 2.655480718612671
          entropy_coeff: 0.009999999999999998
          kl: 0.008668941282091749
          policy_loss: 0.03999067787081003
          total_loss: 0.01598631539899442
          vf_explained_var: -0.8452224135398865
          vf_loss: 0.0008166570783942007
    num_agent_steps_sampled: 48000
    num_agent_steps_trained: 48000
    num_steps_sampled: 48000
    num_steps_trained: 48000
  iterations_since_restore: 48
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_2f3c2_00000,RUNNING,192.168.1.100:9363,48,655.991,48000,-0.1875,2,-3,997.521
