In [19]:
import copy
import tempfile

import torch

# from matplotlib import pyplot as plt
from tensordict import TensorDictBase

from tensordict.nn import TensorDictModule, TensorDictSequential
from torch import multiprocessing

from torchrl.collectors import SyncDataCollector
from torchrl.data import LazyMemmapStorage, RandomSampler, ReplayBuffer

from torchrl.envs import (
    check_env_specs,
    ExplorationType,
    PettingZooEnv,
    RewardSum,
    set_exploration_type,
    TransformedEnv,
    VmasEnv,
)

from torchrl.modules import (
    AdditiveGaussianModule,
    MultiAgentMLP,
    ProbabilisticActor,
    TanhDelta,
)

from torchrl.objectives import DDPGLoss, SoftUpdate, ValueEstimators

from torchrl.record import CSVLogger, PixelRenderTransform, VideoRecorder

from tqdm import tqdm

In [20]:
# Seed
seed = 0
torch.manual_seed(seed)

# Devices
is_fork = multiprocessing.get_start_method() == "fork"
device = (
    torch.device(0)
    if torch.cuda.is_available() and not is_fork
    else torch.device("cpu")
)
print(f"Using device: {device}")
# Sampling
frames_per_batch = 1_000  # Number of team frames collected per sampling iteration
n_iters = 10  # Number of sampling and training iterations
total_frames = frames_per_batch * n_iters

# We will stop training the evaders after this many iterations,
# should be 0 <= iteration_when_stop_training_evaders <= n_iters
iteration_when_stop_training_evaders = n_iters // 2

# Replay buffer
memory_size = 1_000_000  # The replay buffer of each group can store this many frames

# Training
n_optimiser_steps = 100  # Number of optimisation steps per training iteration
train_batch_size = 128  # Number of frames trained in each optimiser step
lr = 3e-4  # Learning rate
max_grad_norm = 1.0  # Maximum norm for the gradients

# DDPG
gamma = 0.99  # Discount factor
polyak_tau = 0.005  # Tau for the soft-update of the target network

Using device: cuda:0


In [21]:
from torchrl.envs import PettingZooEnv, MarlGroupMapType

scenario_name = "hanabi_v5"
num_players = 2
num_ranks = 5
num_colors = 2
hand_size = 2
render_mode = "human"

base_env = PettingZooEnv(
    task=scenario_name,
    parallel=False,
    use_mask=True,
    players=num_players,
    ranks=num_ranks,
    colors=num_colors,
    hand_size=hand_size,
    max_life_tokens=1,
    max_information_tokens=3,
    categorical_actions=True,
    seed=0,
    done_on_any=True,
    # group_map=MarlGroupMapType.ALL_IN_ONE_GROUP,
    device=device,
)

In [22]:
base_env.reset_keys

['_reset', ('player_0', '_reset'), ('player_1', '_reset')]

In [23]:
from torchrl.envs.transforms import DoubleToFloat, StepCounter

env = TransformedEnv(
    base_env,
    # DoubleToFloat(),
    # StepCounter(),
)

In [24]:
env.reset()

TensorDict(
    fields={
        done: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        player_0: TensorDict(
            fields={
                action_mask: Tensor(shape=torch.Size([1, 11]), device=cuda:0, dtype=torch.bool, is_shared=True),
                done: Tensor(shape=torch.Size([1, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                mask: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([1, 171]), device=cuda:0, dtype=torch.float64, is_shared=True)},
                    batch_size=torch.Size([1]),
                    device=cuda:0,
                    is_shared=True),
                terminated: Tensor(shape=torch.Size([1, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                truncated: Tensor(shape=torch.Size([1, 1]), device=cuda:0, dtype=to

In [25]:
import torch.nn as nn
from numpy import random
class RandomPolicy(nn.Module):
    def __init__(self, action_space):
        super().__init__()
        self.action_space = action_space

    def forward(self, obs, action_mask):
        # choose one of the legal actions uniformly at random from the action mask ([true, false, true] -> [0, 2])
        action_mask = action_mask.squeeze()
        action_mask = action_mask.bool()
        action_mask = action_mask.cpu().numpy()
        action = random.choice(len(action_mask), p=action_mask/sum(action_mask))
        
        return action

In [26]:

from tensordict import TensorDict    
state = env.reset()
ragent = RandomPolicy(env.action_spec)
for agent in env.agent_iter():
    print(agent)
    print(state["done"])
    obs = state[agent]['observation']
    action_mask = state[agent]['action_mask']

    if state["done"]:
        break

    action = ragent(obs, action_mask)
    print(action)
    state = env.step(TensorDict({agent: {"action": [action]}}))
    print(state)
    state = state['next']

player_0
tensor([False], device='cuda:0')
9
TensorDict(
    fields={
        next: TensorDict(
            fields={
                done: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                player_0: TensorDict(
                    fields={
                        action_mask: Tensor(shape=torch.Size([1, 11]), device=cuda:0, dtype=torch.bool, is_shared=True),
                        done: Tensor(shape=torch.Size([1, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                        mask: Tensor(shape=torch.Size([1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                        observation: TensorDict(
                            fields={
                                observation: Tensor(shape=torch.Size([1, 171]), device=cuda:0, dtype=torch.float64, is_shared=True)},
                            batch_size=torch.Size([1]),
                            device=cuda:0,
                            is_shared=True),
       

In [27]:
policy = RandomPolicy(env.action_space)
collector = SyncDataCollector(
    env,
    policy=policy,
    device=device,
    frames_per_batch=frames_per_batch,
    total_frames=total_frames,
)

TypeError: Arguments to policy.forward are incompatible with entries in
    env.observation_spec (got incongruent signatures: fun signature is {'action_mask', 'obs'} vs specs {'player_1', 'player_0'}).
    If you want TorchRL to automatically wrap your policy with a TensorDictModule
    then the arguments to policy.forward must correspond one-to-one with entries
    in env.observation_spec.
    For more complex behaviour and more control you can consider writing your
    own TensorDictModule.
    Check the collector documentation to know more about accepted policies.
    

In [None]:
for iteration, batch in enumerate(collector):
    print(iteration, batch["observation"].shape, batch["reward"].shape)

KeyError: 0

In [None]:
from torchrl.envs.transforms import  DTypeCastTransform
env = TransformedEnv(
    base_env,
    RewardSum(in_keys=base_env.reward_keys, out_keys=[("agents", "episode_reward")]),
    DTypeCastTransform(torch.float64, torch.double),
)

ValueError: RewardSum expects the same number of input and output keys

In [None]:
env.reset()

TensorDict(
    fields={
        agents: TensorDict(
            fields={
                action_mask: Tensor(shape=torch.Size([2, 11]), device=cuda:0, dtype=torch.bool, is_shared=True),
                done: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                episode_reward: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.float32, is_shared=True),
                mask: Tensor(shape=torch.Size([2]), device=cuda:0, dtype=torch.bool, is_shared=True),
                observation: TensorDict(
                    fields={
                        observation: Tensor(shape=torch.Size([2, 173]), device=cuda:0, dtype=torch.float64, is_shared=True)},
                    batch_size=torch.Size([2]),
                    device=cuda:0,
                    is_shared=True),
                terminated: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
                truncated: Tensor(shape=torch.Size([2, 1]), d

In [None]:
policy_net = torch.nn.Sequential(MultiAgentMLP(
        n_agent_inputs=173,  # n_obs_per_agent
        n_agent_outputs=11,  # 2 * n_actions_per_agents
        n_agents=2,
        centralised=False,  # the policies are decentralised (ie each agent will act from its observation)
        share_params=True,
        depth=2,
        num_cells=256,
        activation_class=torch.nn.Tanh,
        device=device,
    ),
)


TypeError: __init__() got an unexpected keyword argument 'bias'

In [None]:
policy_module = TensorDictModule(
    policy_net,
    in_keys=[("agents", "observation", "observation")],
    out_keys=[("agents", "actions")],
)

In [None]:
policy = ProbabilisticActor(
    module=policy_module,
    spec=env.action_spec,
    in_keys=[("agents", "actions")],
    out_keys=[env.action_key],
    return_log_prob=True,
)  # we'll need the log-prob for the PPO loss

In [None]:
env.reset()['agents']

TensorDict(
    fields={
        action_mask: Tensor(shape=torch.Size([2, 11]), device=cuda:0, dtype=torch.bool, is_shared=True),
        done: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        episode_reward: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.float32, is_shared=True),
        mask: Tensor(shape=torch.Size([2]), device=cuda:0, dtype=torch.bool, is_shared=True),
        observation: TensorDict(
            fields={
                observation: Tensor(shape=torch.Size([2, 173]), device=cuda:0, dtype=torch.float64, is_shared=True)},
            batch_size=torch.Size([2]),
            device=cuda:0,
            is_shared=True),
        terminated: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.bool, is_shared=True),
        truncated: Tensor(shape=torch.Size([2, 1]), device=cuda:0, dtype=torch.bool, is_shared=True)},
    batch_size=torch.Size([2]),
    device=cuda:0,
    is_shared=True)

In [None]:
env

TransformedEnv(
    env=PettingZooEnv(),
    transform=RewardSum(keys=[('agents', 'reward')]))

In [None]:
policy(env.reset())

RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float