In [9]:
import torch
import torch.nn as nn
from torch.distributions import Categorical
import vmas
import random
from collections import defaultdict

# Definición de la política PPO
class PPOPolicy(nn.Module):
    def __init__(self, input_dim, action_dim):
        super(PPOPolicy, self).__init__()
        self.shared = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
        )
        self.actor = nn.Linear(128, action_dim)
        self.critic = nn.Linear(128, 1)

    def forward(self, x):
        x = self.shared(x)
        return self.actor(x), self.critic(x)

    def act(self, x):
        logits, _ = self.forward(x)
        dist = Categorical(logits=logits)
        action = dist.sample()
        return action, dist.log_prob(action)

    def evaluate(self, x, actions):
        logits, values = self.forward(x)
        dist = Categorical(logits=logits)
        action_logprobs = dist.log_prob(actions)
        dist_entropy = dist.entropy()
        return action_logprobs, values, dist_entropy

class PPOAgent:
    def __init__(self, input_dim, action_dim, lr=3e-4, gamma=0.99, clip_eps=0.2, k_epochs=4):
        self.policy = PPOPolicy(input_dim, action_dim)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.clip_eps = clip_eps
        self.k_epochs = k_epochs

    def compute_returns(self, rewards, masks, next_value):
        returns = []
        R = next_value
        for step in reversed(range(len(rewards))):
            R = rewards[step] + self.gamma * R * masks[step]
            returns.insert(0, R)
        return torch.cat(returns)

    def update(self, memory):
        states = torch.stack(memory['states']).squeeze(1)
        actions = torch.cat(memory['actions'])
        log_probs_old = torch.cat(memory['log_probs'])
        rewards = torch.cat(memory['rewards'])
        masks = torch.cat(memory['masks'])
        values = self.policy.critic(states)
        returns = self.compute_returns(rewards, masks, values[-1].detach())

        advantages = returns - values.detach()
        for _ in range(self.k_epochs):
            log_probs, values, dist_entropy = self.policy.evaluate(states, actions)
            ratio = torch.exp(log_probs - log_probs_old.detach())

            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = nn.MSELoss()(returns, values)

            self.optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss - 0.01 * dist_entropy.mean()).backward()
            self.optimizer.step()


# Inicializar el entorno `navigation`
env = vmas.make_env(
    scenario="navigation",
    num_envs=1,
    device="cpu",  # Cambia a "cuda" si tienes GPU disponible
    continuous_actions=False,
    max_steps=200,
)

# Dimensiones de observaciones y acciones
input_dim = env.observation_space[0].shape[0]
action_dim = env.action_space[0].n

# Crear el agente PPO
agent = PPOAgent(input_dim, action_dim)

# Entrenamiento PPO
max_episodes = 500
memory = defaultdict(list)

for episode in range(max_episodes):
    state = env.reset()
    episode_rewards = 0
    for _ in range(200):  # Máximo de pasos por episodio
        state_tensors = [torch.tensor(s, dtype=torch.float32) for s in state]

        actions, log_probs = [], []
        for state_tensor in state_tensors:
            action, log_prob = agent.policy.act(state_tensor)
            actions.append(action.numpy())  # Acción individual para cada agente
            log_probs.append(log_prob)

        next_state, reward, done, _ = env.step(actions)

        for i in range(len(state)):
            memory['states'].append(state_tensors[i])
            memory['actions'].append(torch.tensor(actions[i]))
            memory['log_probs'].append(log_probs[i])
            memory['rewards'].append(torch.tensor([reward[i]], dtype=torch.float32))  # Aseguramos 1D
            # Convertir `done` en tensor de al menos una dimensión
            done_value = done[i] if isinstance(done, list) else done
            memory['masks'].append(torch.tensor([1.0 if not done_value else 0.0], dtype=torch.float32))

        state = next_state
        episode_rewards += sum(reward)  # Sumar recompensas de todos los agentes
        if all(done):  # Terminar el episodio si todos los agentes están listos
            break

    print(f"Episode {episode} Reward: {episode_rewards}")

    # Actualizar la política al final del episodio
    agent.update(memory)
    memory.clear()



# Renderizar el entorno con la política aprendida
# for _ in range(10):  # Visualizar 10 episodios
#     state = env.reset()
#     total_rewards = 0
#     for _ in range(200):  # Máximo de pasos por episodio
#         state_tensor = torch.tensor(state[0], dtype=torch.float32)
#         action, _ = agent.policy.act(state_tensor)  # Usar la política entrenada
#         action_list = [action.numpy()]  # Convertir la acción a lista para VMAS
#         next_state, reward, done, _ = env.step(action_list)

#         total_rewards += reward[0]
#         env.render(mode="human")  # Renderizar el episodio

#         if done[0]:
#             break

#         state = next_state

#     print(f"Total Reward for Episode: {total_rewards}")


  state_tensors = [torch.tensor(s, dtype=torch.float32) for s in state]
  memory['actions'].append(torch.tensor(actions[i]))


Episode 0 Reward: tensor([-6.8778])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (800x18 and 128x1)