In [1]:
import torch 
from torch import nn

import ray
from ray.rllib.agents import dqn
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override

#from models import VisualEncoder
from train import *
from wrappers_2 import *



In [2]:
class VisualEncoder(nn.Module):
    """Convolutional feature extractor for 3-channel image batches.

    The network is five ``Conv2d`` + ``ReLU`` stages followed by ``Flatten``.
    All layers live in ``self.cnn`` (an ``nn.Sequential``), so parameter names
    are ``cnn.0.*``, ``cnn.2.*``, ... ``cnn.8.*``.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size, stride) per conv stage, in order.
        conv_specs = [
            (3, 32, 8, 4),
            (32, 64, 4, 2),
            (64, 64, 3, 1),
            (64, 64, 3, 1),
            (64, 512, 2, 1),
        ]

        stages = []
        for in_ch, out_ch, kernel, step in conv_specs:
            stages.append(
                nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=step, padding=0)
            )
            stages.append(nn.ReLU())
        # Collapse every non-batch dimension into a single feature axis.
        stages.append(nn.Flatten())

        self.cnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the conv stack and return the flattened activations."""
        return self.cnn(x)

In [3]:
class MyModelClass(TorchModelV2, nn.Module):
    """RLlib Torch model: a pretrained ``VisualEncoder`` followed by a linear head.

    The encoder weights are loaded from a checkpoint on disk at construction
    time; the linear head maps the 512 encoder features to ``num_outputs``.
    """

    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        """Build the encoder, load its pretrained weights and create the head.

        Args:
            obs_space, action_space, num_outputs, model_config, name:
                forwarded unchanged to ``TorchModelV2.__init__``;
                ``num_outputs`` is also the output width of the linear head.
        """
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)
        features_dim = 512
        self.encoder = VisualEncoder()
        # Load onto CPU first so the checkpoint can be read on machines without a GPU.
        self.encoder.load_state_dict(
            torch.load("/IGLU-Minecraft/models/AnnaCNN/encoder_weigths.pth", map_location=torch.device('cpu'))
        )
        self.qvalue_head = nn.Linear(features_dim, num_outputs)

        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.encoder.cuda()
            self.qvalue_head.cuda()

    @override(TorchModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Compute the head outputs for a batch of observations.

        Args:
            input_dict: mapping whose ``'obs'`` entry is a 4-D tensor; it is
                permuted with ``(0, 3, 1, 2)`` and divided by 255
                (assumes a channels-last image batch with values in 0..255 —
                TODO confirm against the env wrappers).
            state: passed through unchanged.
            seq_lens: unused.

        Returns:
            ``(outputs, state)`` where ``outputs`` is the linear head applied
            to the encoder features.
        """
        obs = input_dict['obs'].permute(0, 3, 1, 2).float() / 255.0
        # Tensor.cuda()/Tensor.to() return a new tensor rather than moving the
        # tensor in place, so the result must be assigned. Follow the device the
        # encoder weights actually live on instead of re-checking CUDA here.
        device = next(self.encoder.parameters()).device
        obs = obs.to(device)

        features = self.encoder(obs)
        qvalues = self.qvalue_head(features)
        return qvalues, state

In [4]:
# Register MyModelClass with RLlib under the name "my_torch_model"; the trainer
# config refers to it by this string via the "custom_model" key.
ModelCatalog.register_custom_model("my_torch_model", MyModelClass)

In [5]:
import os
# Enumerate CUDA devices in PCI bus order and expose only device "0" to this process.
# NOTE(review): these variables presumably have to be set before CUDA is first
# initialised in this process to take effect — confirm the cell order guarantees that.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

class RewardWrapper(gym.RewardWrapper):
    """Reward shaping wrapper that replaces zero rewards with a small penalty.

    Args:
        env: the environment to wrap.
        step_penalty: value substituted for any reward equal to 0
            (defaults to -0.02, the value previously hard-coded).
    """

    def __init__(self, env, step_penalty=-0.02):
        super().__init__(env)
        self.step_penalty = step_penalty

    def reward(self, rew):
        """Return ``step_penalty`` when ``rew`` is 0, otherwise ``rew`` unchanged."""
        # The per-step debug print was removed: it wrote one line to stdout for
        # every environment step.
        if rew == 0:
            return self.step_penalty
        return rew
    
def env_creator(env_config):
    """Build the wrapped 'IGLUSilentBuilder-v0' environment.

    The base env is created with ``max_steps=500`` and the task set preset
    ``['C32']``, then wrapped (innermost first) in ``PovOnlyWrapper``,
    ``SelectAndPlace`` and ``Discretization`` over the 'human-level' flat
    action space. ``env_config`` is accepted for the creator signature but is
    not read. ``RewardWrapper`` is intentionally not applied here.
    """
    base_env = gym.make('IGLUSilentBuilder-v0', max_steps=500)
    base_env.update_taskset(TaskSet(preset=['C32']))

    wrapped = SelectAndPlace(PovOnlyWrapper(base_env))
    return Discretization(wrapped, flat_action_space('human-level'))

from ray.tune.registry import register_env
# Register env_creator under the name "my_env"; the trainer config selects the
# environment by this string via its "env" key.
register_env("my_env", env_creator)

from ray import tune
from ray.rllib.agents.dqn import ApexTrainer

In [None]:
from ray.tune.integration.wandb import WandbLogger

# Launch a Tune experiment with RLlib's ApexTrainer on the registered "my_env"
# environment, using the registered "my_torch_model" custom model and logging
# through WandbLogger.
# NOTE(review): no `stop` criterion is passed to tune.run, so the trial
# presumably runs until it is interrupted manually — confirm this is intended.
analysis = tune.run(ApexTrainer, 
         config={
             # Name given to register_env for env_creator.
             "env": "my_env", 
             "framework": "torch",
             "gamma": 0.95,
             "num_gpus": 1,
             "num_workers": 6,
             "buffer_size": 5000000,
             "learning_starts": 5000,
             "train_batch_size": 5000,
             "target_network_update_freq": 5000,
             "prioritized_replay_alpha": 0.5,
             "final_prioritized_replay_beta": 1.0,
             "min_iter_time_s": 10,
             "rollout_fragment_length": 8,
             "collect_metrics_timeout": 1800,
             # Epsilon schedule parameters.
             # NOTE(review): how these combine with the trainer's default
             # exploration settings is not visible here — verify against the
             # per-worker cur_epsilon values reported in the run output.
             "exploration_config": {
                  "initial_epsilon": 1,
                  "epsilon_timesteps": 500000,
                  "final_epsilon": 0.0,
              },
             "model": {
                    # Specify our custom model from above.
                    "custom_model": "my_torch_model",
                    # Extra kwargs to be passed to your model's c'tor.
                    "custom_model_config": {},
              },
             # Settings under "wandb" are presumably read by WandbLogger
             # (W&B project and run name) — confirm against the logger docs.
             "logger_config": {
                  "wandb": {
                      "project": "IGLU-Minecraft",
                      "name": "APEX C32 pretrained (AnnaCNN)"
                  }
              }

        },
        loggers=[WandbLogger])



Trial name,status,loc
APEX_my_env_a0624_00000,PENDING,


2021-10-11 11:44:43,840	INFO wandb.py:170 -- Already logged into W&B.
2021-10-11 11:44:43,900	ERROR syncer.py:72 -- Log sync requires rsync to be installed.
[34m[1mwandb[0m: Currently logged in as: [33mlinar[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[2m[36m(pid=23009)[0m 2021-10-11 11:44:50,893	INFO dqn.py:188 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
[2m[36m(pid=23009)[0m 2021-10-11 11:44:50,893	INFO trainer.py:728 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=23009)[0m 2021-10-11 11:45:05,794	INFO trainable.py:109 -- Trainable.setup took 20.231 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.




Result for APEX_my_env_a0624_00000:
  agent_timesteps_total: 25016
  custom_metrics: {}
  date: 2021-10-11_11-59-58
  done: false
  episode_len_mean: 153.60416666666666
  episode_media: {}
  episode_reward_max: 10.0
  episode_reward_mean: 3.5625
  episode_reward_min: -4.0
  episodes_this_iter: 48
  episodes_total: 48
  experiment_id: 243561e26b0c41869a55ea50c7801cd6
  hostname: cdsserver
  info:
    exploration_infos:
    - cur_epsilon: 0.0
      last_timestep: 0
    - cur_epsilon: 0.4
      last_timestep: 24623
    - cur_epsilon: 0.1109031749048234
      last_timestep: 23111
    - cur_epsilon: 0.030748785509924638
      last_timestep: 24623
    - cur_epsilon: 0.008525344843795182
      last_timestep: 23551
    - cur_epsilon: 0.0023637195258383783
      last_timestep: 24023
    - cur_epsilon: 0.0006553600000000003
      last_timestep: 16143
    last_target_update_ts: 3540000
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_lr: 0.0005
        grad_gnorm: 0.8

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
APEX_my_env_a0624_00000,RUNNING,192.168.1.96:23009,1,892.815,25016,3.5625,10,-4,153.604




Result for APEX_my_env_a0624_00000:
  agent_timesteps_total: 50032
  custom_metrics: {}
  date: 2021-10-11_12-14-39
  done: false
  episode_len_mean: 150.81
  episode_media: {}
  episode_reward_max: 10.0
  episode_reward_mean: 6.27
  episode_reward_min: -4.0
  episodes_this_iter: 64
  episodes_total: 112
  experiment_id: 243561e26b0c41869a55ea50c7801cd6
  hostname: cdsserver
  info:
    exploration_infos:
    - cur_epsilon: 0.0
      last_timestep: 0
    - cur_epsilon: 0.4
      last_timestep: 48719
    - cur_epsilon: 0.1109031749048234
      last_timestep: 48495
    - cur_epsilon: 0.030748785509924638
      last_timestep: 49759
    - cur_epsilon: 0.008525344843795182
      last_timestep: 39847
    - cur_epsilon: 0.0023637195258383783
      last_timestep: 48399
    - cur_epsilon: 0.0006553600000000003
      last_timestep: 47735
    last_target_update_ts: 8990000
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_lr: 0.0005
        grad_gnorm: 0.26182749867439

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
APEX_my_env_a0624_00000,RUNNING,192.168.1.96:23009,2,1773.2,50032,6.27,10,-4,150.81




Result for APEX_my_env_a0624_00000:
  agent_timesteps_total: 75040
  custom_metrics: {}
  date: 2021-10-11_12-29-16
  done: false
  episode_len_mean: 130.23
  episode_media: {}
  episode_reward_max: 10.0
  episode_reward_mean: 7.32
  episode_reward_min: 0.0
  episodes_this_iter: 67
  episodes_total: 179
  experiment_id: 243561e26b0c41869a55ea50c7801cd6
  hostname: cdsserver
  info:
    exploration_infos:
    - cur_epsilon: 0.0
      last_timestep: 0
    - cur_epsilon: 0.4
      last_timestep: 75007
    - cur_epsilon: 0.1109031749048234
      last_timestep: 74975
    - cur_epsilon: 0.030748785509924638
      last_timestep: 74935
    - cur_epsilon: 0.008525344843795182
      last_timestep: 71015
    - cur_epsilon: 0.0023637195258383783
      last_timestep: 71831
    - cur_epsilon: 0.0006553600000000003
      last_timestep: 74391
    last_target_update_ts: 14100000
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_lr: 0.0005
        grad_gnorm: 0.15581476688385

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
APEX_my_env_a0624_00000,RUNNING,192.168.1.96:23009,3,2650.53,75040,7.32,10,0,130.23




Result for APEX_my_env_a0624_00000:
  agent_timesteps_total: 100072
  custom_metrics: {}
  date: 2021-10-11_12-42-00
  done: false
  episode_len_mean: 132.01
  episode_media: {}
  episode_reward_max: 10.0
  episode_reward_mean: 7.43
  episode_reward_min: 0.0
  episodes_this_iter: 55
  episodes_total: 234
  experiment_id: 243561e26b0c41869a55ea50c7801cd6
  hostname: cdsserver
  info:
    exploration_infos:
    - cur_epsilon: 0.0
      last_timestep: 0
    - cur_epsilon: 0.4
      last_timestep: 99711
    - cur_epsilon: 0.1109031749048234
      last_timestep: 99031
    - cur_epsilon: 0.030748785509924638
      last_timestep: 98271
    - cur_epsilon: 0.008525344843795182
      last_timestep: 98271
    - cur_epsilon: 0.0023637195258383783
      last_timestep: 99983
    - cur_epsilon: 0.0006553600000000003
      last_timestep: 98311
    last_target_update_ts: 18190000
    learner:
      default_policy:
        allreduce_latency: 0.0
        cur_lr: 0.0005
        grad_gnorm: 0.0877395048737

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
APEX_my_env_a0624_00000,RUNNING,192.168.1.96:23009,4,3414.33,100072,7.43,10,0,132.01


