In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional image encoder that flattens its final feature map.

    The network is six ``Conv2d`` layers (kernel 2, stride 2, no padding),
    each followed by ``ELU``, and a closing ``Flatten``. Every convolution
    halves the spatial resolution, so a 3x64x64 input ends as a 512x1x1 map
    and the flattened output has 512 features per sample.
    """

    def __init__(self):
        super().__init__()

        # Channel count before the first conv and after each of the six convs.
        channel_plan = (3, 32, 32, 64, 128, 256, 512)

        stages = []
        for in_channels, out_channels in zip(channel_plan[:-1], channel_plan[1:]):
            stages.append(
                nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2, padding=0)
            )
            stages.append(nn.ELU())
        stages.append(nn.Flatten())

        # The attribute name and module order determine the state_dict keys
        # ("cnn.0.weight", "cnn.2.weight", ...), so both are kept as-is.
        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Encode a batch of channel-first images into flat feature vectors."""
        features = self.cnn(x)
        return features

In [3]:
from torch.nn.functional import one_hot

class MyModelClass(TorchModelV2, nn.Module):
    """RLlib custom Torch model combining a pretrained image encoder with a
    target-grid encoder.

    Two feature streams are concatenated and fed through a shared MLP trunk:

    * ``obs['pov']`` goes through ``VisualEncoder`` (weights loaded from disk
      and evaluated under ``torch.no_grad()``, so no gradient flows into it).
    * ``obs['target_grid']`` is one-hot encoded into 7 classes and reduced to
      a single channel by a 1x1x1 ``Conv3d``.

    The trunk output feeds two linear heads: action logits
    (``action_space.n`` outputs) and a scalar state value.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoders, the shared trunk and the two heads.

        Args:
            obs_space: Observation space, forwarded to ``TorchModelV2``.
            action_space: Action space; its ``n`` sizes the action head.
            num_outputs: Forwarded to ``TorchModelV2`` (the action head is
                sized from ``action_space.n``, not from this value).
            model_config: Model config dict, forwarded to ``TorchModelV2``.
            name: Model name, forwarded to ``TorchModelV2``.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        # Flattened VisualEncoder output size (512 channels at 1x1 for a
        # 64x64 input).
        visual_features_dim = 512
        # The target encoder emits one channel over a 9 x 11 x 11 grid.
        target_features_dim = 9 * 11 * 11

        self.visual_encoder = VisualEncoder()
        # Load pretrained weights on CPU first; the module is moved to the
        # GPU below when one is available.
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        # 1x1x1 convolution: mixes the 7 one-hot class channels per voxel
        # into a single learned value.
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )

        policy_hidden_dim = 256
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, policy_hidden_dim),
            nn.ELU(),
            nn.Linear(policy_hidden_dim, policy_hidden_dim),
            nn.ELU(),
        )
        self.action_head = nn.Linear(policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(policy_hidden_dim, 1)
        # Value estimate cached by forward() for value_function().
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action logits and cache the value estimate.

        Args:
            input_dict: Must contain ``'obs'`` with tensors ``'pov'`` and
                ``'target_grid'``. The permutes below imply ``pov`` is
                (batch, H, W, C) and ``target_grid`` is a rank-4 tensor of
                class ids in [0, 7) — presumably (batch, 9, 11, 11); confirm
                against the environment wrapper.
            state: Recurrent state; passed through unchanged.
            seq_lens: Unused.

        Returns:
            Tuple ``(action_logits, state)``.
        """
        obs = input_dict['obs']

        # Tensor.cuda() / Tensor.to() return a NEW tensor instead of moving
        # the tensor in place, so the result must be kept. Use the device the
        # weights actually live on rather than use_cuda, because the
        # framework may relocate the model after construction.
        device = self.action_head.weight.device
        pov = obs['pov'].to(device)
        target_grid = obs['target_grid'].to(device)

        # Channel-last image -> channel-first floats scaled by 1/255.
        pov = pov.permute(0, 3, 1, 2).float() / 255.0
        # Class ids -> one-hot on a trailing axis -> class axis moved to dim 1
        # so Conv3d sees it as the channel dimension.
        target = one_hot(target_grid.long(), num_classes=7).permute(0, 4, 1, 2, 3).float()

        # The pretrained encoder is used as a fixed feature extractor.
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)

        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        action = self.action_head(features)
        # (batch, 1) -> (batch,)
        self.last_value = self.value_head(features).squeeze(1)
        return action, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the most recent forward().

        Raises:
            AssertionError: If forward() has not been called yet.
        """
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Register the model class under the name that the trainer config's
# "custom_model" entry refers to.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
class VisualObservationWrapper(ObsWrapper):
    """Observation wrapper exposing the image, inventory and compass, and
    optionally the task's target grid.

    Args:
        env: Environment to wrap.
        include_target: When True, ``'target_grid'`` is declared in the
            observation space and included in every returned observation.
            When False it is left out of both, so observations always match
            the declared space.
    """

    def __init__(self, env, include_target=False):
        super().__init__(env)
        # Remembered so observation() emits exactly the keys declared below.
        self.include_target = include_target
        spaces = {
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            spaces['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(spaces)

    def observation(self, obs, reward=None, done=None, info=None):
        """Convert a raw observation into the dict declared by the space.

        Args:
            obs: Raw observation providing ``'pov'``, ``'inventory'`` and
                ``obs['compass']['angle']``.
            reward: Unused.
            done: Unused.
            info: Optional step info dict. If it carries ``'target_grid'``
                that entry is consumed (removed from ``info``).

        Returns:
            Dict with ``'pov'``, ``'inventory'``, ``'compass'`` and, when
            ``include_target`` is set, ``'target_grid'``.
        """
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                # Step info arrived without the grid: log it, request a reset
                # if the base env supports that, and fall back to the grid of
                # the current task.
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            # No info dict supplied: read the grid straight from the current
            # task.
            target_grid = self.env.unwrapped.tasks.current.target_grid

        observation = {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
        }
        # Only emit the grid when it is part of the declared space; an extra
        # key would make the observation fall outside observation_space.
        if self.include_target:
            observation['target_grid'] = target_grid
        return observation

In [6]:
import os
# Number CUDA devices by PCI bus id so the index used below maps to a fixed
# physical GPU.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
# Restrict CUDA to device 0. This only takes effect for code that initializes
# CUDA after this point (presumably including processes started later that
# inherit this environment — verify for the Ray workers).
os.environ["CUDA_VISIBLE_DEVICES"]="0"

class RewardWrapper(gym.RewardWrapper):
    """Reward shaping wrapper.

    * A reward of exactly 0 is replaced by ``step_penalty``.
    * A reward of exactly +1 or -1 is divided by ``unit_reward_divisor``.
    * Any other reward is passed through unchanged.

    Args:
        env: Environment to wrap.
        step_penalty: Value substituted for a zero reward (default -0.01).
        unit_reward_divisor: Divisor applied to rewards whose absolute value
            is 1 (default 10).
    """

    def __init__(self, env, step_penalty=-0.01, unit_reward_divisor=10):
        super().__init__(env)
        self.step_penalty = step_penalty
        self.unit_reward_divisor = unit_reward_divisor

    def reward(self, rew):
        """Return the shaped version of ``rew``."""
        if rew == 0:
            # The penalty is returned directly so it is never rescaled by the
            # unit-reward rule below.
            return self.step_penalty
        if abs(rew) == 1:
            return rew / self.unit_reward_divisor
        return rew
    
def env_creator(env_config):
    """Build the wrapped IGLU environment used for training.

    Args:
        env_config: Mapping of environment options (may be empty or None).
            Optional keys:
              * ``"max_steps"``: passed to ``gym.make`` (default 250).
              * ``"tasks"``: task preset passed to ``TaskSet``
                (default ``["C139"]``).

    Returns:
        The environment wrapped, in order, by ``VisualObservationWrapper``
        (with the target grid included), ``SelectAndPlace``,
        ``Discretization`` and ``RewardWrapper``.
    """
    options = env_config or {}
    max_steps = options.get("max_steps", 250)
    task_preset = options.get("tasks", ["C139"])

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=task_preset))
    env = VisualObservationWrapper(env, include_target=True)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    env = RewardWrapper(env)
    return env

from ray.tune.registry import register_env
# Register the factory under the name that the trainer config's "env" entry
# refers to.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Model section: selects the custom model registered as "my_torch_model".
ppo_model_config = {
    # Specify our custom model from above.
    "custom_model": "my_torch_model",
    # Extra kwargs to be passed to your model's c'tor.
    "custom_model_config": {},
}

# Weights & Biases run metadata consumed by WandbLogger.
wandb_logger_config = {
    "wandb": {
        "project": "IGLU-Minecraft",
        "name": "PPO MultiTask (C139) pretrained (AngelaCNN) (3 noops after placement) r: -0.01 div10",
    },
}

# PPO trainer settings ("gamma" is not set here, so the trainer default applies).
ppo_trainer_config = {
    "env": "my_env",
    "framework": "torch",
    "num_gpus": 1,
    "num_workers": 3,
    "sgd_minibatch_size": 256,
    "clip_param": 0.2,
    "entropy_coeff": 0.01,
    "lambda": 0.95,
    "train_batch_size": 1000,
    "model": ppo_model_config,
    "logger_config": wandb_logger_config,
}

# Launch training: checkpoint every 5 iterations and once more at the end,
# keeping at most 50 checkpoints under local_dir.
analysis = tune.run(
    PPOTrainer,
    config=ppo_trainer_config,
    loggers=[WandbLogger],
    local_dir="/IGLU-Minecraft/checkpoints/C139",
    keep_checkpoints_num=50,
    checkpoint_freq=5,
    checkpoint_at_end=True,
)

2021-11-10 10:35:37,367	INFO wandb.py:170 -- Already logged into W&B.
2021-11-10 10:35:37,382	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_f147d_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=128236)[0m 2021-11-10 10:35:40,945	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=128236)[0m 2021-11-10 10:35:40,945	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 1998
  custom_metrics: {}
  date: 2021-11-10_10-37-13
  done: false
  episode_len_mean: 99.83333333333333
  episode_media: {}
  episode_reward_max: 0.7499999999999996
  episode_reward_mean: -0.8227777777777784
  episode_reward_min: -1.0900000000000007
  episodes_this_iter: 18
  episodes_total: 18
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.8846336500985283
          entropy_coeff: 0.009999999999999998
          kl: 0.0059371544273840345
          policy_loss: 0.01761572343252954
          total_loss: 0.0038777662529831843
          vf_explained_var: -0.03886004909873009
          vf_loss: 0.013920948527465086
    num_agent_steps_sampled: 1998
    num_agent_steps_trained: 1998
    num_steps_sampled: 1998
    num_steps_trained: 1998
  iter

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,1,86.5928,1998,-0.822778,0.75,-1.09,99.8333


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 3996
  custom_metrics: {}
  date: 2021-11-10_10-37-49
  done: false
  episode_len_mean: 99.6923076923077
  episode_media: {}
  episode_reward_max: 0.7499999999999996
  episode_reward_mean: -0.8494871794871801
  episode_reward_min: -1.3400000000000007
  episodes_this_iter: 21
  episodes_total: 39
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.8730496065957207
          entropy_coeff: 0.009999999999999998
          kl: 0.008612706028534482
          policy_loss: -0.03290103610072817
          total_loss: -0.025930298687446684
          vf_explained_var: 0.3067989945411682
          vf_loss: 0.03397869317060603
    num_agent_steps_sampled: 3996
    num_agent_steps_trained: 3996
    num_steps_sampled: 3996
    num_steps_trained: 3996
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,2,122.686,3996,-0.849487,0.75,-1.34,99.6923


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 5994
  custom_metrics: {}
  date: 2021-11-10_10-38-22
  done: false
  episode_len_mean: 98.91525423728814
  episode_media: {}
  episode_reward_max: 0.7899999999999994
  episode_reward_mean: -0.8749152542372889
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 20
  episodes_total: 59
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.8548530101776124
          entropy_coeff: 0.009999999999999998
          kl: 0.008013804037931435
          policy_loss: -0.038929452108485356
          total_loss: -0.041990378747383754
          vf_explained_var: 0.19605733454227448
          vf_loss: 0.02388484131855269
    num_agent_steps_sampled: 5994
    num_agent_steps_trained: 5994
    num_steps_sampled: 5994
    num_steps_trained: 5994
  iterat

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,3,155.404,5994,-0.874915,0.79,-1.91,98.9153


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 7992
  custom_metrics: {}
  date: 2021-11-10_10-38-57
  done: false
  episode_len_mean: 97.90123456790124
  episode_media: {}
  episode_reward_max: 2.3500000000000036
  episode_reward_mean: -0.7911111111111117
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 22
  episodes_total: 81
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.827227659452529
          entropy_coeff: 0.009999999999999998
          kl: 0.008567488513662734
          policy_loss: -0.06361453935858749
          total_loss: 0.09278932550833338
          vf_explained_var: 0.11655293405056
          vf_loss: 0.18296264303582055
    num_agent_steps_sampled: 7992
    num_agent_steps_trained: 7992
    num_steps_sampled: 7992
    num_steps_trained: 7992
  iterations_si

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,4,190.413,7992,-0.791111,2.35,-1.91,97.9012


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 9990
  custom_metrics: {}
  date: 2021-11-10_10-39-35
  done: false
  episode_len_mean: 96.19
  episode_media: {}
  episode_reward_max: 2.3500000000000036
  episode_reward_mean: -0.7246000000000005
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 21
  episodes_total: 102
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.8010340690612794
          entropy_coeff: 0.009999999999999998
          kl: 0.009017899413901484
          policy_loss: -0.04689217706521352
          total_loss: 0.0741770662367344
          vf_explained_var: 0.3636839985847473
          vf_loss: 0.1472760030955431
    num_agent_steps_sampled: 9990
    num_agent_steps_trained: 9990
    num_steps_sampled: 9990
    num_steps_trained: 9990
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,5,228.801,9990,-0.7246,2.35,-1.91,96.19


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 11988
  custom_metrics: {}
  date: 2021-11-10_10-40-02
  done: false
  episode_len_mean: 95.11
  episode_media: {}
  episode_reward_max: 2.950000000000001
  episode_reward_mean: -0.5433000000000004
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 21
  episodes_total: 123
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.7760098877407255
          entropy_coeff: 0.009999999999999998
          kl: 0.010169733748337316
          policy_loss: -0.06027013790749368
          total_loss: 0.12585413730925038
          vf_explained_var: 0.26121073961257935
          vf_loss: 0.21185042609771093
    num_agent_steps_sampled: 11988
    num_agent_steps_trained: 11988
    num_steps_sampled: 11988
    num_steps_trained: 11988
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,6,254.632,11988,-0.5433,2.95,-1.91,95.11


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 13986
  custom_metrics: {}
  date: 2021-11-10_10-40-28
  done: false
  episode_len_mean: 94.16
  episode_media: {}
  episode_reward_max: 4.610000000000003
  episode_reward_mean: -0.20210000000000009
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 22
  episodes_total: 145
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.7535278763089863
          entropy_coeff: 0.009999999999999998
          kl: 0.010370750505143347
          policy_loss: -0.015740461718468438
          total_loss: 0.32030851720344455
          vf_explained_var: 0.37943047285079956
          vf_loss: 0.36151010933376493
    num_agent_steps_sampled: 13986
    num_agent_steps_trained: 13986
    num_steps_sampled: 13986
    num_steps_trained: 13986
  iterations_sin

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,7,280.455,13986,-0.2021,4.61,-1.91,94.16


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 15984
  custom_metrics: {}
  date: 2021-11-10_10-40-54
  done: false
  episode_len_mean: 93.03
  episode_media: {}
  episode_reward_max: 4.610000000000003
  episode_reward_mean: 0.21490000000000056
  episode_reward_min: -1.760000000000001
  episodes_this_iter: 22
  episodes_total: 167
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.7281327100027175
          entropy_coeff: 0.009999999999999998
          kl: 0.012083349209859456
          policy_loss: -0.07540986440366222
          total_loss: 0.2409613195629347
          vf_explained_var: 0.42334964871406555
          vf_loss: 0.34123583993031864
    num_agent_steps_sampled: 15984
    num_agent_steps_trained: 15984
    num_steps_sampled: 15984
    num_steps_trained: 15984
  iterations_since_

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,8,306.89,15984,0.2149,4.61,-1.76,93.03


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 17982
  custom_metrics: {}
  date: 2021-11-10_10-41-20
  done: false
  episode_len_mean: 92.94
  episode_media: {}
  episode_reward_max: 4.730000000000013
  episode_reward_mean: 0.6155000000000009
  episode_reward_min: -1.760000000000001
  episodes_this_iter: 21
  episodes_total: 188
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.6759758608681814
          entropy_coeff: 0.009999999999999998
          kl: 0.012136201706455636
          policy_loss: -0.047241078512299625
          total_loss: 0.28339605209018504
          vf_explained_var: 0.44020822644233704
          vf_loss: 0.35496964752674104
    num_agent_steps_sampled: 17982
    num_agent_steps_trained: 17982
    num_steps_sampled: 17982
    num_steps_trained: 17982
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,9,332.735,17982,0.6155,4.73,-1.76,92.94


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 19980
  custom_metrics: {}
  date: 2021-11-10_10-41-46
  done: false
  episode_len_mean: 93.42
  episode_media: {}
  episode_reward_max: 6.570000000000007
  episode_reward_mean: 1.1367000000000018
  episode_reward_min: -1.610000000000001
  episodes_this_iter: 21
  episodes_total: 209
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.6339369194848197
          entropy_coeff: 0.009999999999999998
          kl: 0.012751676368233688
          policy_loss: 0.061154713783235774
          total_loss: 0.4211538431545099
          vf_explained_var: 0.49939966201782227
          vf_loss: 0.38378816426155116
    num_agent_steps_sampled: 19980
    num_agent_steps_trained: 19980
    num_steps_sampled: 19980
    num_steps_trained: 19980
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,10,358.575,19980,1.1367,6.57,-1.61,93.42


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 21978
  custom_metrics: {}
  date: 2021-11-10_10-42-12
  done: false
  episode_len_mean: 93.42
  episode_media: {}
  episode_reward_max: 6.770000000000017
  episode_reward_mean: 1.6699000000000033
  episode_reward_min: -1.5600000000000007
  episodes_this_iter: 21
  episodes_total: 230
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.6059079578944613
          entropy_coeff: 0.009999999999999998
          kl: 0.014474807855054663
          policy_loss: -0.04128521340233939
          total_loss: 0.4358946313460668
          vf_explained_var: 0.5607660412788391
          vf_loss: 0.5003439644262904
    num_agent_steps_sampled: 21978
    num_agent_steps_trained: 21978
    num_steps_sampled: 21978
    num_steps_trained: 21978
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,11,384.206,21978,1.6699,6.77,-1.56,93.42


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 23976
  custom_metrics: {}
  date: 2021-11-10_10-42-37
  done: false
  episode_len_mean: 94.23
  episode_media: {}
  episode_reward_max: 6.770000000000017
  episode_reward_mean: 1.938300000000004
  episode_reward_min: -1.4300000000000006
  episodes_this_iter: 21
  episodes_total: 251
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.5799201897212436
          entropy_coeff: 0.009999999999999998
          kl: 0.013117755277246387
          policy_loss: -0.021668550230207896
          total_loss: 0.3918440913160642
          vf_explained_var: 0.554253876209259
          vf_loss: 0.43668829245226726
    num_agent_steps_sampled: 23976
    num_agent_steps_trained: 23976
    num_steps_sampled: 23976
    num_steps_trained: 23976
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,12,409.099,23976,1.9383,6.77,-1.43,94.23


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 25974
  custom_metrics: {}
  date: 2021-11-10_10-43-02
  done: false
  episode_len_mean: 95.34
  episode_media: {}
  episode_reward_max: 6.770000000000017
  episode_reward_mean: 2.381300000000005
  episode_reward_min: -1.8200000000000007
  episodes_this_iter: 20
  episodes_total: 271
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.566561840829395
          entropy_coeff: 0.009999999999999998
          kl: 0.017127647586102173
          policy_loss: -0.014942598076803343
          total_loss: 0.6442029872997885
          vf_explained_var: 0.46783003211021423
          vf_loss: 0.6813856733696801
    num_agent_steps_sampled: 25974
    num_agent_steps_trained: 25974
    num_steps_sampled: 25974
    num_steps_trained: 25974
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,13,434.238,25974,2.3813,6.77,-1.82,95.34


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 27972
  custom_metrics: {}
  date: 2021-11-10_10-43-28
  done: false
  episode_len_mean: 95.43
  episode_media: {}
  episode_reward_max: 10.310000000000013
  episode_reward_mean: 3.007800000000006
  episode_reward_min: -1.8200000000000007
  episodes_this_iter: 21
  episodes_total: 292
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.5277411824180964
          entropy_coeff: 0.009999999999999998
          kl: 0.015635706187381585
          policy_loss: -0.040556291952019645
          total_loss: 0.5555570735817864
          vf_explained_var: 0.6243453025817871
          vf_loss: 0.6182636367423194
    num_agent_steps_sampled: 27972
    num_agent_steps_trained: 27972
    num_steps_sampled: 27972
    num_steps_trained: 27972
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,14,459.98,27972,3.0078,10.31,-1.82,95.43


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 29970
  custom_metrics: {}
  date: 2021-11-10_10-43-54
  done: false
  episode_len_mean: 95.37
  episode_media: {}
  episode_reward_max: 10.310000000000013
  episode_reward_mean: 3.5009000000000072
  episode_reward_min: -1.8200000000000007
  episodes_this_iter: 22
  episodes_total: 314
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.480413414183117
          entropy_coeff: 0.009999999999999998
          kl: 0.014872395669080406
          policy_loss: -0.050481128160442625
          total_loss: 0.6902895957231522
          vf_explained_var: 0.5811328887939453
          vf_loss: 0.7626003779116131
    num_agent_steps_sampled: 29970
    num_agent_steps_trained: 29970
    num_steps_sampled: 29970
    num_steps_trained: 29970
  iterations_since_r

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,15,486.14,29970,3.5009,10.31,-1.82,95.37


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 31968
  custom_metrics: {}
  date: 2021-11-10_10-44-20
  done: false
  episode_len_mean: 94.6
  episode_media: {}
  episode_reward_max: 10.540000000000012
  episode_reward_mean: 3.9978000000000082
  episode_reward_min: -1.8200000000000007
  episodes_this_iter: 22
  episodes_total: 336
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.2
          cur_lr: 5.000000000000001e-05
          entropy: 2.411397689864749
          entropy_coeff: 0.009999999999999998
          kl: 0.028979933521667552
          policy_loss: -0.004457156768157369
          total_loss: 0.8487850526968638
          vf_explained_var: 0.6977705359458923
          vf_loss: 0.8715602057320732
    num_agent_steps_sampled: 31968
    num_agent_steps_trained: 31968
    num_steps_sampled: 31968
    num_steps_trained: 31968
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,16,512.239,31968,3.9978,10.54,-1.82,94.6




Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 33966
  custom_metrics: {}
  date: 2021-11-10_10-45-06
  done: false
  episode_len_mean: 90.85
  episode_media: {}
  episode_reward_max: 10.540000000000012
  episode_reward_mean: 4.33200000000001
  episode_reward_min: -1.8200000000000007
  episodes_this_iter: 24
  episodes_total: 360
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.3758330220267885
          entropy_coeff: 0.009999999999999998
          kl: 0.011814456925538685
          policy_loss: 0.01262821587068694
          total_loss: 0.7465044772341138
          vf_explained_var: 0.744027853012085
          vf_loss: 0.7540902574857076
    num_agent_steps_sampled: 33966
    num_agent_steps_trained: 33966
    num_steps_sampled: 33966
    num_steps_trained: 33966
  iterations_since_resto

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,17,558.04,33966,4.332,10.54,-1.82,90.85


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 35964
  custom_metrics: {}
  date: 2021-11-10_10-45-37
  done: false
  episode_len_mean: 90.67
  episode_media: {}
  episode_reward_max: 10.540000000000012
  episode_reward_mean: 4.75030000000001
  episode_reward_min: -0.19
  episodes_this_iter: 21
  episodes_total: 381
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.3775629088992165
          entropy_coeff: 0.009999999999999998
          kl: 0.011298506239298838
          policy_loss: -0.06982921574796949
          total_loss: 0.6733536081654685
          vf_explained_var: 0.7481786608695984
          vf_loss: 0.7635689011641911
    num_agent_steps_sampled: 35964
    num_agent_steps_trained: 35964
    num_steps_sampled: 35964
    num_steps_trained: 35964
  iterations_since_restore: 18
  nod

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,18,588.973,35964,4.7503,10.54,-0.19,90.67


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 37962
  custom_metrics: {}
  date: 2021-11-10_10-46-04
  done: false
  episode_len_mean: 91.22
  episode_media: {}
  episode_reward_max: 10.540000000000012
  episode_reward_mean: 5.0664000000000105
  episode_reward_min: -0.19
  episodes_this_iter: 21
  episodes_total: 402
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.3426107429322744
          entropy_coeff: 0.009999999999999998
          kl: 0.012119988154709814
          policy_loss: 0.036767999934298655
          total_loss: 0.6634153509423846
          vf_explained_var: 0.7865819931030273
          vf_loss: 0.6464374584811075
    num_agent_steps_sampled: 37962
    num_agent_steps_trained: 37962
    num_steps_sampled: 37962
    num_steps_trained: 37962
  iterations_since_restore: 19
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,19,615.548,37962,5.0664,10.54,-0.19,91.22


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 39960
  custom_metrics: {}
  date: 2021-11-10_10-46-31
  done: false
  episode_len_mean: 92.2
  episode_media: {}
  episode_reward_max: 10.540000000000012
  episode_reward_mean: 5.210700000000012
  episode_reward_min: -0.19
  episodes_this_iter: 21
  episodes_total: 423
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.3928366592952184
          entropy_coeff: 0.009999999999999998
          kl: 0.010503608902106823
          policy_loss: -0.029348456079051607
          total_loss: 0.49177761620708876
          vf_explained_var: 0.7984403967857361
          vf_loss: 0.5419033552919116
    num_agent_steps_sampled: 39960
    num_agent_steps_trained: 39960
    num_steps_sampled: 39960
    num_steps_trained: 39960
  iterations_since_restore: 20
  n

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,20,642.566,39960,5.2107,10.54,-0.19,92.2


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 41958
  custom_metrics: {}
  date: 2021-11-10_10-46-58
  done: false
  episode_len_mean: 92.56
  episode_media: {}
  episode_reward_max: 10.400000000000015
  episode_reward_mean: 5.336300000000011
  episode_reward_min: -1.4700000000000009
  episodes_this_iter: 20
  episodes_total: 443
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.265721947806222
          entropy_coeff: 0.009999999999999998
          kl: 0.011252681971815396
          policy_loss: 0.006879181201968875
          total_loss: 1.016281931882813
          vf_explained_var: 0.74711012840271
          vf_loss: 1.0286841684863681
    num_agent_steps_sampled: 41958
    num_agent_steps_trained: 41958
    num_steps_sampled: 41958
    num_steps_trained: 41958
  iterations_since_restor

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,21,669.905,41958,5.3363,10.4,-1.47,92.56


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 43956
  custom_metrics: {}
  date: 2021-11-10_10-47-24
  done: false
  episode_len_mean: 96.65
  episode_media: {}
  episode_reward_max: 10.400000000000015
  episode_reward_mean: 5.507400000000013
  episode_reward_min: -1.4700000000000009
  episodes_this_iter: 20
  episodes_total: 463
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.307974105789548
          entropy_coeff: 0.009999999999999998
          kl: 0.008839242778811436
          policy_loss: 0.023779102769635973
          total_loss: 0.576098541773501
          vf_explained_var: 0.8117967247962952
          vf_loss: 0.5727474097694669
    num_agent_steps_sampled: 43956
    num_agent_steps_trained: 43956
    num_steps_sampled: 43956
    num_steps_trained: 43956
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,22,695.567,43956,5.5074,10.4,-1.47,96.65


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 45954
  custom_metrics: {}
  date: 2021-11-10_10-47-50
  done: false
  episode_len_mean: 97.75
  episode_media: {}
  episode_reward_max: 10.400000000000015
  episode_reward_mean: 5.652000000000015
  episode_reward_min: -1.4700000000000009
  episodes_this_iter: 21
  episodes_total: 484
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.22153890473502
          entropy_coeff: 0.009999999999999998
          kl: 0.011417649633313568
          policy_loss: -0.03119412681886128
          total_loss: 0.6075078728653136
          vf_explained_var: 0.8003586530685425
          vf_loss: 0.6574920979284105
    num_agent_steps_sampled: 45954
    num_agent_steps_trained: 45954
    num_steps_sampled: 45954
    num_steps_trained: 45954
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,23,721.661,45954,5.652,10.4,-1.47,97.75


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 47952
  custom_metrics: {}
  date: 2021-11-10_10-48-17
  done: false
  episode_len_mean: 97.66
  episode_media: {}
  episode_reward_max: 10.270000000000014
  episode_reward_mean: 5.745800000000014
  episode_reward_min: -1.4700000000000009
  episodes_this_iter: 20
  episodes_total: 504
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.227656358764285
          entropy_coeff: 0.009999999999999998
          kl: 0.012494995685089539
          policy_loss: -0.04271360827122061
          total_loss: 0.7109435819444202
          vf_explained_var: 0.7887855172157288
          vf_loss: 0.7721852514005842
    num_agent_steps_sampled: 47952
    num_agent_steps_trained: 47952
    num_steps_sampled: 47952
    num_steps_trained: 47952
  iterations_since_res

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,24,748.38,47952,5.7458,10.27,-1.47,97.66


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 49950
  custom_metrics: {}
  date: 2021-11-10_10-48-43
  done: false
  episode_len_mean: 97.17
  episode_media: {}
  episode_reward_max: 10.270000000000014
  episode_reward_mean: 6.002800000000013
  episode_reward_min: -1.4700000000000009
  episodes_this_iter: 21
  episodes_total: 525
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.1957642714182537
          entropy_coeff: 0.009999999999999998
          kl: 0.01286217273354184
          policy_loss: -0.012418724170752933
          total_loss: 0.9355167519478571
          vf_explained_var: 0.7663384675979614
          vf_loss: 0.9660344694341932
    num_agent_steps_sampled: 49950
    num_agent_steps_trained: 49950
    num_steps_sampled: 49950
    num_steps_trained: 49950
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,25,774.869,49950,6.0028,10.27,-1.47,97.17


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 51948
  custom_metrics: {}
  date: 2021-11-10_10-49-10
  done: false
  episode_len_mean: 97.79
  episode_media: {}
  episode_reward_max: 10.270000000000014
  episode_reward_mean: 6.208900000000015
  episode_reward_min: 1.9900000000000126
  episodes_this_iter: 21
  episodes_total: 546
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.1293957971391224
          entropy_coeff: 0.009999999999999998
          kl: 0.01281589746078513
          policy_loss: -0.0315396452943484
          total_loss: 0.7530185425565357
          vf_explained_var: 0.7764570116996765
          vf_loss: 0.8020073797021593
    num_agent_steps_sampled: 51948
    num_agent_steps_trained: 51948
    num_steps_sampled: 51948
    num_steps_trained: 51948
  iterations_since_resto

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,26,801.438,51948,6.2089,10.27,1.99,97.79


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 53946
  custom_metrics: {}
  date: 2021-11-10_10-49-37
  done: false
  episode_len_mean: 97.35
  episode_media: {}
  episode_reward_max: 10.270000000000014
  episode_reward_mean: 6.478700000000015
  episode_reward_min: 1.9900000000000126
  episodes_this_iter: 20
  episodes_total: 566
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.1299749045144942
          entropy_coeff: 0.009999999999999998
          kl: 0.012301247637542718
          policy_loss: -0.0253513226551669
          total_loss: 0.7402340168044681
          vf_explained_var: 0.8115695118904114
          vf_loss: 0.7831947122301374
    num_agent_steps_sampled: 53946
    num_agent_steps_trained: 53946
    num_steps_sampled: 53946
    num_steps_trained: 53946
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,27,828.302,53946,6.4787,10.27,1.99,97.35


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 55944
  custom_metrics: {}
  date: 2021-11-10_10-50-03
  done: false
  episode_len_mean: 97.57
  episode_media: {}
  episode_reward_max: 10.270000000000014
  episode_reward_mean: 6.364400000000015
  episode_reward_min: 2.3800000000000012
  episodes_this_iter: 20
  episodes_total: 586
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.154387102808271
          entropy_coeff: 0.009999999999999998
          kl: 0.012668717275979067
          policy_loss: -0.02155579665587062
          total_loss: 0.7123023685245287
          vf_explained_var: 0.8117629289627075
          vf_loss: 0.7516014269420079
    num_agent_steps_sampled: 55944
    num_agent_steps_trained: 55944
    num_steps_sampled: 55944
    num_steps_trained: 55944
  iterations_since_rest

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,28,855.035,55944,6.3644,10.27,2.38,97.57


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 57942
  custom_metrics: {}
  date: 2021-11-10_10-50-30
  done: false
  episode_len_mean: 97.96
  episode_media: {}
  episode_reward_max: 10.170000000000016
  episode_reward_mean: 6.398000000000016
  episode_reward_min: 2.3800000000000012
  episodes_this_iter: 20
  episodes_total: 606
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.1237727278754823
          entropy_coeff: 0.009999999999999998
          kl: 0.011549540168803234
          policy_loss: -0.044458790079113984
          total_loss: 0.6101092695480301
          vf_explained_var: 0.8668228983879089
          vf_loss: 0.6723409252507346
    num_agent_steps_sampled: 57942
    num_agent_steps_trained: 57942
    num_steps_sampled: 57942
    num_steps_trained: 57942
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,29,881.796,57942,6.398,10.17,2.38,97.96


Result for PPO_my_env_f147d_00000:
  agent_timesteps_total: 59940
  custom_metrics: {}
  date: 2021-11-10_10-50-58
  done: false
  episode_len_mean: 98.42
  episode_media: {}
  episode_reward_max: 10.090000000000018
  episode_reward_mean: 6.107100000000015
  episode_reward_min: 2.0600000000000054
  episodes_this_iter: 21
  episodes_total: 627
  experiment_id: fc595125a55c42209f18ff09cd4b9a94
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.3
          cur_lr: 5.000000000000001e-05
          entropy: 2.1734012478873845
          entropy_coeff: 0.009999999999999998
          kl: 0.013057920636007438
          policy_loss: -0.025017070663826805
          total_loss: 0.8296772919950031
          vf_explained_var: 0.7531847953796387
          vf_loss: 0.8725110002926417
    num_agent_steps_sampled: 59940
    num_agent_steps_trained: 59940
    num_steps_sampled: 59940
    num_steps_trained: 59940
  iterations_since_re

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_f147d_00000,RUNNING,192.168.3.5:128236,30,909.237,59940,6.1071,10.09,2.06,98.42
