In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import ppo
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    def __init__(self):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=2, stride=2, padding=0),  
            nn.ELU(),
            nn.Conv2d(32, 32, kernel_size=2, stride=2, padding=0), 
            nn.ELU(),
            nn.Conv2d(32, 64, kernel_size=2, stride=2, padding=0), 
            nn.ELU(),
            nn.Conv2d(64, 128, kernel_size=2, stride=2, padding=0),
            nn.ELU(), 
            nn.Conv2d(128, 256, kernel_size=2, stride=2, padding=0),
            nn.ELU(),
            nn.Conv2d(256, 512, kernel_size=2, stride=2, padding=0),
            nn.ELU(),
            nn.Flatten(),
        )

    def forward(self, x):
        return self.cnn(x)

In [3]:
from torch.nn.functional import one_hot
from ray.rllib.policy.rnn_sequencing import add_time_dimension


class MyModelClass(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        visual_features_dim = 512
        target_features_dim = 9 * 11 * 11 
        self.visual_encoder = VisualEncoder()
        self.visual_encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AngelaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.target_encoder = nn.Sequential(
            nn.Conv3d(7, 1, kernel_size=1, stride=1, padding=0),
            nn.ELU(),
        )
        self.policy_hidden_dim = 256 
        self.policy_network = nn.Sequential(
            nn.Linear(visual_features_dim + target_features_dim, 1024),
            nn.ELU(),
            nn.Linear(1024, 512),
            nn.ELU(),
            nn.Linear(512, self.policy_hidden_dim),
            nn.ELU(),
            nn.Linear(self.policy_hidden_dim, self.policy_hidden_dim),
            nn.ELU(),
            #nn.Linear(policy_hidden_dim, policy_hidden_dim),
            #nn.ELU(),
        )
        
        self.time_major = self.model_config.get("_time_major", False)
        self.gru = nn.GRU(self.policy_hidden_dim, self.policy_hidden_dim, batch_first=not self.time_major)
        
        self.action_head = nn.Linear(self.policy_hidden_dim, action_space.n)
        self.value_head = nn.Linear(self.policy_hidden_dim, 1)
        self.last_value = None
        
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.visual_encoder.cuda()
            self.target_encoder.cuda()
            self.policy_network.cuda()
            self.gru.cuda()
            self.action_head.cuda()
            self.value_head.cuda()
        
    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        obs = input_dict['obs']
        pov = obs['pov'].permute(0, 3, 1, 2).float() / 255.0
        target = one_hot(obs['target_grid'].long(), num_classes=7).permute(0, 4, 1, 2, 3).float()
        if self.use_cuda:
            pov.cuda()
            target.cuda()
            
        with torch.no_grad():
            visual_features = self.visual_encoder(pov)
            
        target_features = self.target_encoder(target)
        target_features = target_features.reshape(target_features.shape[0], -1)
        features = torch.cat([visual_features, target_features], dim=1)
        features = self.policy_network(features)
        
        if isinstance(seq_lens, np.ndarray):
            seq_lens = torch.Tensor(seq_lens).int()
        max_seq_len = features.shape[0] // seq_lens.shape[0]    
        inputs = add_time_dimension(
            features,
            max_seq_len=max_seq_len,
            framework="torch",
            time_major=self.time_major,
        )
        
        h = state[0].permute(1, 0, 2)
        output, new_h = self.gru(inputs, h)
        new_state = [new_h.permute(1, 0, 2)]
        
        gru_output = output.reshape(-1, self.policy_hidden_dim)
        
        action = self.action_head(gru_output)
        self.last_value = self.value_head(gru_output).squeeze(1)
        return action, state
    
    @override(TorchModelV2)
    def value_function(self):
        assert self.last_value is not None, "must call forward() first"
        return self.last_value
    
    @override(TorchModelV2)
    def get_initial_state(self):
        return [torch.zeros(1, self.policy_hidden_dim)]

In [4]:
visual_features_dim = 512
target_features_dim = 9 * 11 * 11
policy_hidden_dim = 256 

policy_network = nn.Sequential(
    nn.Linear(visual_features_dim + target_features_dim, 1024),
    nn.ELU(),
    nn.Linear(1024, 512),
    nn.ELU(),
    nn.Linear(512, policy_hidden_dim),
    nn.ELU(),
    nn.Linear(policy_hidden_dim, policy_hidden_dim),
    nn.ELU(),
    #nn.Linear(policy_hidden_dim, policy_hidden_dim),
    #nn.ELU(),
)

sum(p.numel() for p in policy_network.parameters())

2362368

In [5]:
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [6]:
class VisualObservationWrapper(ObsWrapper):
    def __init__(self, env, include_target=False):
        super().__init__(env)
        self.observation_space = {   
            'pov': gym.spaces.Box(low=0, high=255, shape=(64, 64, 3)),
            'inventory': gym.spaces.Box(low=0.0, high=20.0, shape=(6,)),
            'compass': gym.spaces.Box(low=-180.0, high=180.0, shape=(1,))
        }
        if include_target:
            self.observation_space['target_grid'] = \
                gym.spaces.Box(low=0, high=6, shape=(9, 11, 11))
        self.observation_space = gym.spaces.Dict(self.observation_space)

    def observation(self, obs, reward=None, done=None, info=None):
        if info is not None:
            if 'target_grid' in info:
                target_grid = info['target_grid']
                del info['target_grid']
            else:
                logger.error(f'info: {info}')
                if hasattr(self.unwrapped, 'should_reset'):
                    self.unwrapped.should_reset(True)
                target_grid = self.env.unwrapped.tasks.current.target_grid
        else:
            target_grid = self.env.unwrapped.tasks.current.target_grid
        return {
            'pov': obs['pov'].astype(np.float32),
            'inventory': obs['inventory'],
            'compass': np.array([obs['compass']['angle'].item()]),
            'target_grid': target_grid
        }

In [7]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

tasks = []
for i in range(1,156):
    if ('C'+str(i)) == 'C38': continue
    tasks.append('C'+str(i))
    
class RewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
    
    def reward(self, rew):
        if rew == 0:
            rew = -0.01
        if abs(rew) == 1:
            rew /= 10
            
        return rew
    
def env_creator(env_config):
    env = gym.make('IGLUSilentBuilder-v0', max_steps=250)
    env.update_taskset(TaskSet(preset=tasks))
    #env = PovOnlyWrapper(env)
    env = VisualObservationWrapper(env, include_target=True)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    env = RewardWrapper(env)
    return env

from ray.tune.registry import register_env
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.ppo import PPOTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

analysis = tune.run(PPOTrainer, 
         config={
             "env": "my_env", 
             "framework": "torch",
             "num_gpus": 1,
             "num_workers": 3,
             "sgd_minibatch_size": 256,
             "clip_param": 0.2,
             "entropy_coeff": 0.01,
             "lambda": 0.95,
             "train_batch_size": 5_000,
             "lr": 1e-4,
             #"gamma": 0.99,
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "PPO All Tasks pretrained (AngelaCNN+GRU) (3 noops after placement) r: -0.01 div10"
                  }
              }

        },
        loggers=[WandbLogger],
        local_dir="/IGLU-Minecraft/checkpoints/all_tasks",
        keep_checkpoints_num=50,
        checkpoint_freq=5,
        checkpoint_at_end=True)

2021-11-07 12:16:50,713	INFO wandb.py:170 -- Already logged into W&B.
2021-11-07 12:16:50,727	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
PPO_my_env_960ce_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=550421)[0m 2021-11-07 12:16:54,107	INFO ppo.py:159 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=550421)[0m 2021-11-07 12:16:54,107	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=550421)[0m 2021-11-07 12:17:02,169	INFO trainable.py:109 -- Trainable.setup took 10.476 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 9996
  custom_metrics: {}
  date: 2021-11-07_12-20-00
  done: false
  episode_len_mean: 101.70103092783505
  episode_media: {}
  episode_reward_max: 2.590000000000001
  episode_reward_mean: -0.7984536082474234
  episode_reward_min: -1.5900000000000005
  episodes_this_iter: 97
  episodes_total: 97
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8843685247959234
          entropy_coeff: 0.01
          kl: 0.005787131851031565
          policy_loss: -0.015160348295019223
          total_loss: 0.0048022730944630426
          vf_explained_var: -0.13828356564044952
          vf_loss: 0.04764888057369305
    num_agent_steps_sampled: 9996
    num_agent_steps_trained: 9996
    num_steps_sampled: 9996
    num_steps_trained: 9996
  ite

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,1,178.235,9996,-0.798454,2.59,-1.59,101.701


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 19992
  custom_metrics: {}
  date: 2021-11-07_12-21-55
  done: false
  episode_len_mean: 98.39603960396039
  episode_media: {}
  episode_reward_max: 2.5200000000000022
  episode_reward_mean: -0.7946534653465352
  episode_reward_min: -1.5400000000000005
  episodes_this_iter: 101
  episodes_total: 198
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8792491413589216
          entropy_coeff: 0.01
          kl: 0.007026159120763405
          policy_loss: -0.019986279375112864
          total_loss: 0.006758365585999419
          vf_explained_var: -0.4184056520462036
          vf_loss: 0.054131903732892
    num_agent_steps_sampled: 19992
    num_agent_steps_trained: 19992
    num_steps_sampled: 19992
    num_steps_trained: 19992
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,2,293.638,19992,-0.794653,2.52,-1.54,98.396


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 29988
  custom_metrics: {}
  date: 2021-11-07_12-23-51
  done: false
  episode_len_mean: 98.57843137254902
  episode_media: {}
  episode_reward_max: 3.11
  episode_reward_mean: -0.699411764705883
  episode_reward_min: -1.9100000000000008
  episodes_this_iter: 102
  episodes_total: 300
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.873959662567856
          entropy_coeff: 0.01
          kl: 0.008215863863404236
          policy_loss: -0.02320211596914336
          total_loss: 0.012691705124791012
          vf_explained_var: -0.1844165325164795
          vf_loss: 0.06299024441828713
    num_agent_steps_sampled: 29988
    num_agent_steps_trained: 29988
    num_steps_sampled: 29988
    num_steps_trained: 29988
  iterations_sinc

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,3,409.767,29988,-0.699412,3.11,-1.91,98.5784




Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 39984
  custom_metrics: {}
  date: 2021-11-07_12-26-12
  done: false
  episode_len_mean: 96.57281553398059
  episode_media: {}
  episode_reward_max: 4.770000000000001
  episode_reward_mean: -0.5916504854368936
  episode_reward_min: -1.4700000000000006
  episodes_this_iter: 103
  episodes_total: 403
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.871249017959986
          entropy_coeff: 0.01
          kl: 0.008111144584388687
          policy_loss: -0.01956814361306337
          total_loss: 0.038457083821686736
          vf_explained_var: -0.09535246342420578
          vf_loss: 0.08511548809484284
    num_agent_steps_sampled: 39984
    num_agent_steps_trained: 39984
    num_steps_sampled: 39984
    num_steps_trained: 39984
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,4,550.442,39984,-0.59165,4.77,-1.47,96.5728


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 49980
  custom_metrics: {}
  date: 2021-11-07_12-28-13
  done: false
  episode_len_mean: 100.15
  episode_media: {}
  episode_reward_max: 4.710000000000004
  episode_reward_mean: -0.8549000000000007
  episode_reward_min: -1.8600000000000008
  episodes_this_iter: 100
  episodes_total: 503
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8667098827851123
          entropy_coeff: 0.01
          kl: 0.008871607625514715
          policy_loss: -0.022918244961322817
          total_loss: -7.148530547164826e-05
          vf_explained_var: 0.17319510877132416
          vf_loss: 0.04973953606154865
    num_agent_steps_sampled: 49980
    num_agent_steps_trained: 49980
    num_steps_sampled: 49980
    num_steps_trained: 49980
  iteratio

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,5,670.782,49980,-0.8549,4.71,-1.86,100.15


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 59976
  custom_metrics: {}
  date: 2021-11-07_12-30-14
  done: false
  episode_len_mean: 98.29411764705883
  episode_media: {}
  episode_reward_max: 2.9000000000000035
  episode_reward_mean: -0.6413725490196083
  episode_reward_min: -1.910000000000001
  episodes_this_iter: 102
  episodes_total: 605
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.858221541306911
          entropy_coeff: 0.01
          kl: 0.010438789652350048
          policy_loss: -0.030405953164614388
          total_loss: 0.018758949520798703
          vf_explained_var: 0.14639359712600708
          vf_loss: 0.07565935982040997
    num_agent_steps_sampled: 59976
    num_agent_steps_trained: 59976
    num_steps_sampled: 59976
    num_steps_trained: 59976
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,6,792.376,59976,-0.641373,2.9,-1.91,98.2941


Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 69972
  custom_metrics: {}
  date: 2021-11-07_12-32-16
  done: false
  episode_len_mean: 99.83168316831683
  episode_media: {}
  episode_reward_max: 4.6900000000000075
  episode_reward_mean: -0.3659405940594059
  episode_reward_min: -1.880000000000001
  episodes_this_iter: 101
  episodes_total: 706
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.8370240290959674
          entropy_coeff: 0.01
          kl: 0.014191464562780963
          policy_loss: -0.037628927366817014
          total_loss: 0.041279713383728524
          vf_explained_var: 0.3577720522880554
          vf_loss: 0.10444058780837008
    num_agent_steps_sampled: 69972
    num_agent_steps_trained: 69972
    num_steps_sampled: 69972
    num_steps_trained: 69972
  

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,7,914.208,69972,-0.365941,4.69,-1.88,99.8317




Result for PPO_my_env_960ce_00000:
  agent_timesteps_total: 79968
  custom_metrics: {}
  date: 2021-11-07_12-34-39
  done: false
  episode_len_mean: 99.76
  episode_media: {}
  episode_reward_max: 4.920000000000009
  episode_reward_mean: -0.001399999999999455
  episode_reward_min: -2.0100000000000007
  episodes_this_iter: 99
  episodes_total: 805
  experiment_id: d03564bf33354e528816561f1ba7167b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy:
        learner_stats:
          cur_kl_coeff: 0.19999999999999998
          cur_lr: 0.00010000000000000002
          entropy: 2.803601525991391
          entropy_coeff: 0.01
          kl: 0.0197072519793406
          policy_loss: -0.03646253860746132
          total_loss: 0.12053933267991067
          vf_explained_var: 0.2784615755081177
          vf_loss: 0.18109643551258323
    num_agent_steps_sampled: 79968
    num_agent_steps_trained: 79968
    num_steps_sampled: 79968
    num_steps_trained: 79968
  iterations_since

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_my_env_960ce_00000,RUNNING,192.168.3.5:550421,8,1057.12,79968,-0.0014,4.92,-2.01,99.76
