In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

import matplotlib.pyplot as plt
import numpy as np

from collections import defaultdict

from NaturalEnv import natural_env_v0
from pettingzoo.mpe import simple_tag_v3

is_ipython = 'inline' in plt.get_backend()
if is_ipython:
    from IPython import display

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

args = {
    'render_mode': None,
    'max_cycles': 256,
    'continuous_actions': False,
    'num_predators': 0,
    'num_prey': 1,
    'num_obstacles': 0,
    'num_food': 1,
    'num_water': 1,
    'num_forests': 0
}

env = simple_tag_v3.parallel_env(max_cycles=128, continuous_actions=False)
#env = natural_env_v0.parallel_env(**args)
obs, _ = env.reset()

# Get observation and action spaces
obs_spaces = {agent: env.observation_space(agent).shape[0] for agent in env.agents}
action_spaces = {agent: env.action_space(agent).n for agent in env.agents}
agents = env.agents  # List of agents

print(f"Device used: {device}")
print(f"Observation spaces: {obs_spaces}")
print(f"Action spaces: {action_spaces}")

Device used: cpu
Observation spaces: {'adversary_0': 16, 'adversary_1': 16, 'adversary_2': 16, 'agent_0': 14}
Action spaces: {'adversary_0': np.int64(5), 'adversary_1': np.int64(5), 'adversary_2': np.int64(5), 'agent_0': np.int64(5)}


  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Neural Network for Policy and Value Function
class ActorCritic(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden_dim=64):
        super(ActorCritic, self).__init__()
        # Shared layers between policy and value
        self.shared = nn.Sequential(
            nn.Linear(obs_dim, hidden_dim),
            nn.ReLU(),
        )

        # Policy function for PPO
        self.policy = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, act_dim),
            nn.Softmax(dim=-1),
        )

        # Value function for PPO
        self.value = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )
    
    def forward(self, obs):
        x = self.shared(obs)
        return self.policy(x), self.value(x)

# PPO Algorithm
class PPO:
    def __init__(self, obs_dim, act_dim, lr=3e-4, gamma=0.99, clip_epsilon=0.2, update_steps=4, batch_size=64):
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.update_steps = update_steps
        self.batch_size = batch_size

        self.actor_critic = ActorCritic(obs_dim, act_dim)
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=lr)
    
    def compute_returns(self, rewards, dones, last_value):
        """
        Calculates the expected cumulative rewards for each time step

        Args:
            rewards (list): List of rewards for each time step
            dones (list): List of done flags for each time step
            last_value (float): Value estimate for the last time step

        Returns:
            returns (list): Expected cumulative rewards for each time step
        """
        returns = []
        R = last_value
        for r, d in zip(reversed(rewards), reversed(dones)):
            R = r + self.gamma * R * (1 - d)
            returns.insert(0, R)
        return returns
    
    def update(self, obs, actions, old_log_probs, returns, advantages):
        """
        Updates the policy and value function using PPO

        Args:
            obs (list): List of observations for each time step
            actions (list): List of actions for each time step
            old_log_probs (list): List of old log probabilities for each time step
            returns (list): List of expected cumulative rewards for each time step
            advantages (list): List of advantages for each time step
        """
        # Convert to tensors, move to device. Everything that was not already on the device
        actions = torch.tensor(actions, dtype=torch.int64, device=device)
        old_log_probs = torch.tensor(old_log_probs, dtype=torch.float32, device=device)
        returns = torch.tensor(returns, dtype=torch.float32, device=device)
        advantages = torch.tensor(advantages, dtype=torch.float32, device=device)

        for _ in range(self.update_steps):
            for start in range(0, len(obs), self.batch_size):
                end = start + self.batch_size
                batch_obs = obs[start:end]
                batch_actions = actions[start:end]
                batch_old_log_probs = old_log_probs[start:end]
                batch_returns = returns[start:end]
                batch_advantages = advantages[start:end]

                # Forward pass
                policy, values = self.actor_critic(batch_obs)
                dist = Categorical(policy)
                new_log_probs = dist.log_prob(batch_actions)
                entropy = dist.entropy().mean()

                # PPO loss
                ratios = (new_log_probs - batch_old_log_probs).exp()
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1.0 - self.clip_epsilon, 1.0 + self.clip_epsilon) * batch_advantages
                policy_loss = -torch.min(surr1, surr2).mean()
                value_loss = ((batch_returns - values.squeeze()) ** 2).mean()

                loss = policy_loss + 0.5 * value_loss - 0.01 * entropy

                # Backward pass
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

In [3]:
def update_plot_multi_agent(episode, max_episodes, reward_history, show_result=False):
    plt.figure(1)

    if show_result:
        plt.title(f'Final Result:')
    else:
        plt.clf()
        plt.title(f"Episode {episode} of {max_episodes}")
    plt.xlabel('Episode')
    plt.ylabel('Total reward')

    for agent_name in agents:
        rewards_t = torch.tensor(reward_history[agent_name], dtype=torch.float)
        plt.plot(rewards_t.numpy(), label=agent_name)

        """
        # Plot moving average of last 10 rewards
        if len(rewards_t) >= 10:
            means = rewards_t.unfold(0, 10, 1).mean(1).view(-1)
            means = torch.cat((torch.zeros(9), means))
            plt.plot(means.numpy())
        """

    plt.legend()
    plt.pause(0.001)
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())

In [4]:
# Initialize PPO agents
actors = {agent: PPO(obs_spaces[agent], action_spaces[agent]).to(device) for agent in agents}

# Epsilon Greedy Exploration
epsilon = 1.0
epsilon_decay = 0.999
epsilon_min = 0.01

# Main training loop
episodes = 100

plt.ion()
reward_history = {agent_name: [] for agent_name in agents}

for episode in range(episodes):
    obs, _ = env.reset()

    # Get initial observations
    done = defaultdict(bool, {agent: False for agent in agents})
    episode_reward = {agent: 0 for agent in agents}
    
    while not all(done.values()):
        # Get actions for each agent
        actions, log_probs, values = {}, {}, {}

        # Convert observations to tensors
        states = {agent: torch.tensor(obs[agent], dtype=torch.float32, device=device).unsqueeze(0) for agent in agents}

        for i, agent in enumerate(agents):
            # Get action probabilities from actor
            policy, value = actors[agent].actor_critic(states[agent])
            dist = Categorical(policy)
            action = dist.sample().item()
            actions[agent] = action

            # Store log probabilities and values
            log_probs[agent] = dist.log_prob(torch.tensor([action], dtype=torch.float32)).item()
            values[agent] = value.item()

        # Step the environment
        next_obs, rewards, terminated, truncated,  _ = env.step(actions)

        # Update done flag
        done = defaultdict(bool, {agent: terminated[agent] or truncated[agent] for agent in agents})

        for agent in agents:
            # Compute reward for each agent
            episode_reward[agent] += rewards[agent]

            # Update PPO for each agent
            if not done[agent]:
                last_value = values[agent]
                returns = actors[agent].compute_returns([rewards[agent]], [done[agent]], last_value)
                advantages = np.array(returns) - values[agent]

                actors[agent].update(states[agent], [actions[agent]], [log_probs[agent]], returns, advantages)

        obs = next_obs

    # Update epsilon
    epsilon = max(epsilon_min, epsilon_decay * epsilon)

    # Update reward history
    reward_history = {agent_name: reward_history[agent_name] + [episode_reward[agent_name]] for agent_name in agents}
    update_plot_multi_agent(episode, episodes, reward_history)

update_plot_multi_agent(episode + 1, episodes, reward_history, show_result=True)
plt.ioff()
env.close()

AttributeError: 'PPO' object has no attribute 'to'

In [None]:
# Test the trained model
args['render_mode'] = 'human'
#human_env = natural_env_v0.parallel_env(**args)
human_env = simple_tag_v3.parallel_env(render_mode="human", continuous_actions=False)

obs, _ = human_env.reset()

done = defaultdict(bool, {agent: False for agent in agents})

while not all(done.values()):
    actions = {}
    for agent in agents:
        state = torch.tensor(obs[agent], dtype=torch.float32).unsqueeze(0)
        policy, _ = actors[agent].actor_critic(state)
        dist = Categorical(policy)
        action = dist.sample().item()
        actions[agent] = action

    obs, _, terminated, truncated, _ = human_env.step(actions)

    done = defaultdict(bool, {agent: terminated[agent] or truncated[agent] for agent in agents})

human_env.close()