In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import impala
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional feature extractor for 3-channel image batches.

    The network is five ``Conv2d`` + ``ReLU`` stages (no padding) followed by
    ``Flatten``. For a 64x64 input the spatial extent shrinks
    64 -> 15 -> 6 -> 4 -> 2 -> 1, so the flattened output has 512 features
    per sample; other input sizes yield a different flattened width.

    The layers live in ``self.cnn`` at the same ``nn.Sequential`` positions
    (convolutions at indices 0, 2, 4, 6, 8), so ``state_dict`` keys stay
    ``cnn.<index>.weight`` / ``cnn.<index>.bias``.
    """

    # (in_channels, out_channels, kernel_size, stride) for each conv stage.
    _CONV_SPECS = (
        (3, 32, 8, 4),
        (32, 64, 4, 2),
        (64, 64, 3, 1),
        (64, 64, 3, 1),
        (64, 512, 2, 1),
    )

    def __init__(self):
        super().__init__()

        stages = []
        for in_ch, out_ch, kernel, stride in self._CONV_SPECS:
            stages.append(
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride, padding=0)
            )
            stages.append(nn.ReLU())
        stages.append(nn.Flatten())

        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` (channels on axis 1) through the conv stack and flatten it."""
        return self.cnn(x)

In [3]:
class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model: a pretrained ``VisualEncoder`` with linear policy and value heads.

    ``forward`` produces one score per discrete action and caches the value
    estimate for the same batch; ``value_function`` returns that cached value.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name,
                 encoder_weights_path="/IGLU-Minecraft/models/AnnaCNN/encoder_weigths.pth"):
        """Build the encoder, load its pretrained weights and create both heads.

        Args:
            obs_space, action_space, num_outputs, model_config, name:
                forwarded unchanged to ``TorchModelV2.__init__``.
                ``action_space`` must expose ``.n`` (the action head width).
            encoder_weights_path: checkpoint passed to ``torch.load`` and then to
                ``VisualEncoder.load_state_dict``. The default keeps the original
                hard-coded location (spelling kept verbatim; presumably it
                matches the file name on disk).
                NOTE(review): RLlib is expected to forward ``custom_model_config``
                entries as keyword arguments, which would allow overriding this
                from the trainer config — confirm for the installed Ray version.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        # Width of the encoder output consumed by both heads.
        features_dim = 512
        self.encoder = VisualEncoder()
        # Weights are deserialised onto the CPU first; device placement happens below.
        self.encoder.load_state_dict(
            torch.load(encoder_weights_path, map_location=torch.device('cpu'))
        )
        self.action_head = nn.Linear(features_dim, action_space.n)
        self.value_head = nn.Linear(features_dim, 1)
        # Value estimate of the most recent forward() call; None until then.
        self.last_value = None

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.encoder.cuda()
            self.action_head.cuda()
            self.value_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute action scores for ``input_dict['obs']`` and cache the value estimate.

        The observation is permuted with ``(0, 3, 1, 2)``, i.e. it is treated as
        channels-last and converted to channels-first for the conv encoder, then
        cast to float and divided by 255 (presumably 0-255 pixel values — confirm
        against the environment wrappers).

        Returns:
            ``(scores, state)``: the action head output and ``state`` unchanged.
        """
        # Tensor.cuda()/Tensor.to() return a new tensor rather than moving in
        # place, so the result must be rebound. Follow the encoder's actual
        # device instead of a flag so this stays correct wherever the model is.
        device = next(self.encoder.parameters()).device
        obs = input_dict['obs'].to(device)
        obs = obs.permute(0, 3, 1, 2).float() / 255.0

        features = self.encoder(obs)
        scores = self.action_head(features)
        # (batch, 1) -> (batch,)
        self.last_value = self.value_head(features).squeeze(1)
        return scores, state

    @override(TorchModelV2)
    def value_function(self):
        """Return the value estimate cached by the latest ``forward`` call.

        Raises:
            AssertionError: if ``forward`` has not been called yet.
        """
        assert self.last_value is not None, "must call forward() first"
        return self.last_value

In [4]:
# Make MyModelClass selectable from a trainer config via "custom_model": "my_torch_model".
ModelCatalog.register_custom_model(
    model_name="my_torch_model",
    model_class=MyModelClass,
)

In [5]:
import os
# Restrict CUDA to device 0, with devices enumerated in PCI bus order.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
})

def env_creator(env_config):
    """Create the wrapped ``IGLUSilentBuilder-v0`` environment.

    The base environment is restricted to a task preset and then wrapped, in
    order, by ``PovOnlyWrapper``, ``SelectAndPlace`` and ``Discretization``
    (with ``flat_action_space('human-level')``).

    Args:
        env_config: optional mapping of overrides; ``None`` or an empty mapping
            reproduces the original hard-coded setup. Recognised keys:
            ``"max_steps"`` (default ``1000``) — passed to ``gym.make``;
            ``"task_preset"`` (default ``['C32']``) — passed to ``TaskSet(preset=...)``.

    Returns:
        The fully wrapped environment.
    """
    # Previously env_config was ignored; fall back to the old constants when
    # a key (or the whole mapping) is absent so existing callers see no change.
    overrides = env_config or {}
    max_steps = overrides.get("max_steps", 1000)
    task_preset = overrides.get("task_preset", ['C32'])

    env = gym.make('IGLUSilentBuilder-v0', max_steps=max_steps)
    env.update_taskset(TaskSet(preset=task_preset))
    env = PovOnlyWrapper(env)
    env = SelectAndPlace(env)
    env = Discretization(env, flat_action_space('human-level'))
    return env

from ray.tune.registry import register_env
# Register the factory so trainer configs can refer to it as "env": "my_env".
register_env(name="my_env", env_creator=env_creator)

from ray import tune
from ray.rllib.agents.impala import ImpalaTrainer

In [6]:
from ray.tune.integration.wandb import WandbLogger

# IMPALA trainer configuration, kept in one named dict so it can be inspected
# before launching.
# Options tried earlier and currently disabled:
#   "sgd_minibatch_size": 256, "clip_param": 0.2, "lambda": 0.95,
#   "minibatch_buffer_size": 10
impala_config = {
    # --- environment / framework ---
    "env": "my_env",
    "framework": "torch",
    # --- resources ---
    "num_gpus": 1,
    "num_workers": 3,
    # --- loss / optimisation ---
    "entropy_coeff": 0.01,
    "train_batch_size": 5000,
    "grad_clip": 100.0,
    "rollout_fragment_length": 100,
    # Number of passes to make over each train batch.
    "num_sgd_iter": 5,
    "num_multi_gpu_tower_stacks": 1,
    # --- replay / learner queue ---
    "replay_proportion": 1.0,
    "replay_buffer_num_slots": 100,
    "learner_queue_size": 32,
    # --- network ---
    "model": {
        # Specify our custom model from above.
        "custom_model": "my_torch_model",
        # Extra kwargs to be passed to your model's c'tor.
        "custom_model_config": {},
    },
    # --- experiment tracking (read by WandbLogger) ---
    "logger_config": {
        "wandb": {
            "project": "IGLU-Minecraft",
            "name": "IMPALA C32 pretrained (AnnaCNN) (3 noops after placement)",
        },
    },
}

# NOTE(review): no `stop` criterion is passed, and the output recorded below
# this cell shows the trial ending with RayTaskError(timeout) after an
# environment step timed out — consider whether failure handling is needed.
analysis = tune.run(
    ImpalaTrainer,
    config=impala_config,
    loggers=[WandbLogger],
)

2021-10-07 22:37:07,796	INFO wandb.py:170 -- Already logged into W&B.
2021-10-07 22:37:07,809	ERROR syncer.py:72 -- Log sync requires rsync to be installed.


Trial name,status,loc
IMPALA_my_env_1a598_00000,RUNNING,


[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=11153)[0m 2021-10-07 22:37:11,275	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Result for IMPALA_my_env_1a598_00000:
  agent_timesteps_total: 2500
  custom_metrics: {}
  date: 2021-10-07_22-42-44
  done: false
  episode_len_mean: 441.8
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 5
  episodes_total: 5
  experiment_id: a24d0318e6224d569c66a7bb0a3b8f0b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy: {}
    learner_queue:
      size_count: 1
      size_mean: 0.0
      size_quantiles:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      size_std: 0.0
    num_agent_steps_sampled: 2500
    num_steps_sampled: 2500
    num_steps_trained: 1
    timing_breakdown:
      learner_dequeue_time_ms: 145350.868
      learner_grad_time_ms: 93.857
      learner_load_time_ms: 22.654
      learner_load_wait_time_ms: 145372.356
  iterations_since_restore: 1
  node_ip: 192.168.3.5
  num_healthy_workers: 3
  off_policy_estimator: {}
  perf:
    cpu_util_percent: 68.8404301

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
IMPALA_my_env_1a598_00000,RUNNING,192.168.3.5:11153,1,325.479,2500,0,0,0,441.8


[2m[36m(pid=11150)[0m Failed to take a step (error timed out). Terminating episode and sending random observation, be aware. To account for this failure case in your code check to see if `'error' in info` where info is the info dictionary returned by the step function.


Result for IMPALA_my_env_1a598_00000:
  agent_timesteps_total: 5000
  custom_metrics: {}
  date: 2021-10-07_22-46-15
  done: false
  episode_len_mean: 439.1818181818182
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: 0.0
  episode_reward_min: 0.0
  episodes_this_iter: 6
  episodes_total: 11
  experiment_id: a24d0318e6224d569c66a7bb0a3b8f0b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy: {}
    learner_queue:
      size_count: 3
      size_mean: 0.0
      size_quantiles:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      size_std: 0.0
    num_agent_steps_sampled: 5000
    num_steps_sampled: 5000
    num_steps_trained: 2
    num_weight_broadcasts: 1
    timing_breakdown:
      learner_dequeue_time_ms: 145350.868
      learner_grad_time_ms: 96.527
      learner_load_time_ms: 22.654
      learner_load_wait_time_ms: 118753.925
  iterations_since_restore: 2
  node_ip: 192.168.3.5
  num_healthy_workers: 3
  off_policy_estimator: {

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
IMPALA_my_env_1a598_00000,RUNNING,192.168.3.5:11153,2,536.443,5000,0,0,0,439.182


Result for IMPALA_my_env_1a598_00000:
  agent_timesteps_total: 5400
  custom_metrics: {}
  date: 2021-10-07_22-47-03
  done: false
  episode_len_mean: 432.6923076923077
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.7692307692307693
  episode_reward_min: -10.0
  episodes_this_iter: 2
  episodes_total: 13
  experiment_id: a24d0318e6224d569c66a7bb0a3b8f0b
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy: {}
    learner_queue:
      size_count: 3
      size_mean: 0.0
      size_quantiles:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      size_std: 0.0
    num_agent_steps_sampled: 5400
    num_steps_sampled: 5400
    num_steps_trained: 3
    num_weight_broadcasts: 1
    timing_breakdown:
      learner_dequeue_time_ms: 145350.868
      learner_grad_time_ms: 96.527
      learner_load_time_ms: 22.654
      learner_load_wait_time_ms: 118753.925
  iterations_since_restore: 3
  node_ip: 192.168.3.5
  num_healthy_workers: 3
  off_p

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
IMPALA_my_env_1a598_00000,RUNNING,192.168.3.5:11153,3,584.881,5400,-0.769231,0,-10,432.692


2021-10-07 22:47:03,621	ERROR trial_runner.py:773 -- Trial IMPALA_my_env_1a598_00000: Error processing event.
Traceback (most recent call last):
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 739, in _process_trial
    results = self.trial_executor.fetch_result(trial)
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 746, in fetch_result
    result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/_private/client_mode_hook.py", line 82, in wrapper
    return func(*args, **kwargs)
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/worker.py", line 1621, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(timeout): [36mray::IMPALA.train_buffered()[39m (pid=11153, ip=192.168.3.5, repr=IMPALA)
  File "/root/miniconda/envs/py37/lib/python3.7/site-packages/ray/tune/trainable.py", line 178

Result for IMPALA_my_env_1a598_00000:
  agent_timesteps_total: 5400
  custom_metrics: {}
  date: 2021-10-07_22-47-03
  done: false
  episode_len_mean: 432.6923076923077
  episode_media: {}
  episode_reward_max: 0.0
  episode_reward_mean: -0.7692307692307693
  episode_reward_min: -10.0
  episodes_this_iter: 2
  episodes_total: 13
  experiment_id: a24d0318e6224d569c66a7bb0a3b8f0b
  experiment_tag: '0'
  hostname: linar-Z390-GAMING-X
  info:
    learner:
      default_policy: {}
    learner_queue:
      size_count: 3
      size_mean: 0.0
      size_quantiles:
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      - 0.0
      size_std: 0.0
    num_agent_steps_sampled: 5400
    num_steps_sampled: 5400
    num_steps_trained: 3
    num_weight_broadcasts: 1
    timing_breakdown:
      learner_dequeue_time_ms: 145350.868
      learner_grad_time_ms: 96.527
      learner_load_time_ms: 22.654
      learner_load_wait_time_ms: 118753.925
  iterations_since_restore: 3
  node_ip: 192.168.3.5
  num_heal

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
agent_timesteps_total,5400.0
episode_len_mean,432.69231
episode_reward_max,0.0
episode_reward_mean,-0.76923
episode_reward_min,-10.0
episodes_this_iter,2.0
episodes_total,13.0
info/learner_queue/size_count,3.0
info/learner_queue/size_mean,0.0
info/learner_queue/size_std,0.0


0,1
agent_timesteps_total,▁▇█
episode_len_mean,█▆▁
episode_reward_max,▁▁▁
episode_reward_mean,██▁
episode_reward_min,██▁
episodes_this_iter,▆█▁
episodes_total,▁▆█
info/learner_queue/size_count,▁██
info/learner_queue/size_mean,▁▁▁
info/learner_queue/size_std,▁▁▁




Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
IMPALA_my_env_1a598_00000,ERROR,,3,584.881,5400,-0.769231,0,-10,432.692

Trial name,# failures,error file
IMPALA_my_env_1a598_00000,1,/root/ray_results/IMPALA_2021-10-07_22-37-07/IMPALA_my_env_1a598_00000_0_2021-10-07_22-37-07/error.txt


TuneError: ('Trials did not complete', [IMPALA_my_env_1a598_00000])