In [1]:
import torch
import torch.nn as nn
from graph_patrol_env import GraphPatrolEnv
from patrol_graph import PatrolGraph

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


class PPOAgent:
    """Proximal Policy Optimization (PPO) agent with one policy shared by all agents.

    Observations, actions and next observations are exchanged as dicts keyed by
    agent id. Rewards and done flags may be either such a dict or a single value
    shared by every agent.

    The policy network must map a float batch of observations ``(batch, ...)`` to
    action probabilities ``(batch, num_actions)``; the value network must map the
    same batch to one scalar per row.
    """

    # Lower bound applied to probabilities before taking a logarithm.
    _EPS = 1e-8

    def __init__(
        self,
        policy_network,
        policy_lr,
        value_lr,
        gamma,
        gae_lambda,
        entropy_coef,
        value_clip,
        num_epochs,
        num_mini_batches,
        ppo_ratio_clip,
        max_grad_norm,
        device=torch.device("cpu"),
        value_network=None,
    ):
        """Create the agent and its two Adam optimizers.

        Args:
            policy_network: ``nn.Module`` returning action probabilities.
            policy_lr: Learning rate of the policy optimizer.
            value_lr: Learning rate of the value optimizer.
            gamma: Discount factor.
            gae_lambda: Lambda of generalized advantage estimation.
            entropy_coef: Weight of the entropy bonus added to the policy objective.
            value_clip: Value-clipping setting. ``True`` reuses ``ppo_ratio_clip``
                as the clipping range, ``False``/``None`` disables value clipping,
                and a number is used as the clipping range itself.
            num_epochs: Number of passes over the collected data per ``train()``.
            num_mini_batches: Number of mini-batches each pass is split into.
            ppo_ratio_clip: Clipping range of the probability ratio.
            max_grad_norm: Gradient-norm clipping threshold for both networks.
            device: Device on which both networks and all training tensors live.
            value_network: Optional critic. When omitted, a small MLP is built
                whose input size is taken from the first ``nn.Linear`` layer of
                ``policy_network``.

        Raises:
            ValueError: If ``value_network`` is omitted and ``policy_network``
                contains no ``nn.Linear`` layer to infer the input size from.
        """
        self.device = device
        self.policy_network = policy_network.to(device)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=policy_lr)

        # The critic must be its own module: reusing the policy network would make
        # every "value" an action distribution and let two optimizers update the
        # same weights.
        if value_network is None:
            value_network = self._build_default_value_network(policy_network)
        self.value_network = value_network.to(device)
        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=value_lr)

        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.entropy_coef = entropy_coef
        self.value_clip = value_clip
        self.num_epochs = num_epochs
        self.num_mini_batches = num_mini_batches
        self.ppo_ratio_clip = ppo_ratio_clip
        self.max_grad_norm = max_grad_norm
        self.episode_buffer = []
        self.total_steps = 0

    @staticmethod
    def _build_default_value_network(policy_network, hidden_dim=64):
        """Build a one-hidden-layer critic sized after the policy's first Linear layer."""
        first_linear = next(
            (module for module in policy_network.modules() if isinstance(module, nn.Linear)),
            None,
        )
        if first_linear is None:
            raise ValueError(
                "Cannot infer the observation size from policy_network; "
                "pass value_network explicitly."
            )
        return nn.Sequential(
            nn.Linear(first_linear.in_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

    def _to_obs_tensor(self, agent_obs):
        """Convert a single observation to a float32 tensor on the agent's device."""
        return torch.as_tensor(agent_obs, dtype=torch.float32, device=self.device)

    def _predict_values(self, obs_batch):
        """Run the critic and return exactly one value per row of ``obs_batch``."""
        # reshape() raises if the critic does not emit one scalar per observation.
        return self.value_network(obs_batch).reshape(obs_batch.shape[0])

    def _value_clip_range(self):
        """Translate ``self.value_clip`` into a clipping range, or None when disabled."""
        if self.value_clip is None:
            return None
        if isinstance(self.value_clip, bool):
            return float(self.ppo_ratio_clip) if self.value_clip else None
        return float(self.value_clip)

    def act(self, obs):
        """Sample one action per agent from the current policy.

        Args:
            obs: Dict mapping agent id to that agent's observation.

        Returns:
            Dict mapping agent id to the sampled action index (Python ``int``).
        """
        actions = {}
        with torch.no_grad():
            for agent, agent_obs in obs.items():
                obs_tensor = self._to_obs_tensor(agent_obs).unsqueeze(0)
                # Drop the batch dimension so the sampler sees a 1-D distribution.
                action_probs = self.policy_network(obs_tensor).reshape(-1)
                actions[agent] = int(torch.multinomial(action_probs, num_samples=1).item())
        return actions

    def collect(self, obs, actions, rewards, next_obs, dones):
        """Append one environment step to the rollout buffer."""
        self.episode_buffer.append((obs, actions, rewards, next_obs, dones))

    @staticmethod
    def _split_by_agent(buffer):
        """Regroup buffered steps into one time-ordered transition list per agent."""

        def pick(value, agent):
            # Rewards / done flags may be per-agent dicts or a single shared value.
            return value[agent] if isinstance(value, dict) else value

        trajectories = {}
        for obs, actions, rewards, next_obs, dones in buffer:
            for agent, agent_obs in obs.items():
                trajectories.setdefault(agent, []).append(
                    (
                        agent_obs,
                        actions[agent],
                        pick(rewards, agent),
                        next_obs[agent],
                        pick(dones, agent),
                    )
                )
        return trajectories

    def _prepare_trajectory(self, transitions):
        """Turn one agent's transitions into fixed (gradient-free) training tensors.

        Returns:
            Tuple ``(obs, actions, old_log_probs, advantages, returns, old_values)``.
        """
        obs, actions, rewards, next_obs, dones = zip(*transitions)
        obs_t = torch.stack([self._to_obs_tensor(o) for o in obs])
        next_obs_t = torch.stack([self._to_obs_tensor(o) for o in next_obs])
        actions_t = torch.as_tensor([int(a) for a in actions], dtype=torch.long, device=self.device)
        rewards_t = torch.as_tensor([float(r) for r in rewards], dtype=torch.float32, device=self.device)
        dones_t = torch.as_tensor([float(d) for d in dones], dtype=torch.float32, device=self.device)

        # Everything computed here describes the data-collecting policy and must
        # stay constant during optimisation, hence no autograd graph.
        with torch.no_grad():
            values = self._predict_values(obs_t)
            next_values = self._predict_values(next_obs_t)
            taken_probs = self.policy_network(obs_t).gather(1, actions_t.unsqueeze(1)).squeeze(1)
            old_log_probs = torch.log(taken_probs.clamp_min(self._EPS))

        advantages = self.calculate_advantages(rewards_t, dones_t, values, next_values)
        returns = advantages + values
        return obs_t, actions_t, old_log_probs, advantages, returns, values

    def train(self):
        """Run the PPO update on everything collected so far and clear the buffer.

        Does nothing when the buffer is empty. ``total_steps`` grows by the number
        of buffered environment steps.
        """
        if not self.episode_buffer:
            return

        num_env_steps = len(self.episode_buffer)
        trajectories = self._split_by_agent(self.episode_buffer)
        self.episode_buffer = []
        self.total_steps += num_env_steps
        if not trajectories:
            return

        # GAE is computed per agent so that it follows each agent's own timeline;
        # afterwards all agents' samples are pooled into one batch.
        per_agent = [self._prepare_trajectory(t) for t in trajectories.values()]
        obs_t, actions_t, old_log_probs, advantages, returns, old_values = (
            torch.cat(parts) for parts in zip(*per_agent)
        )

        num_samples = obs_t.shape[0]
        num_batches = max(1, int(self.num_mini_batches))
        # Ceil division: ``num_mini_batches`` is a count, not a batch size.
        batch_size = max(1, (num_samples + num_batches - 1) // num_batches)
        clip_range = self._value_clip_range()

        for _ in range(self.num_epochs):
            permutation = torch.randperm(num_samples, device=self.device)
            for start in range(0, num_samples, batch_size):
                batch = permutation[start : start + batch_size]
                adv_batch = advantages[batch]
                ret_batch = returns[batch]

                # ---- policy update: clipped surrogate objective + entropy bonus ----
                action_probs = self.policy_network(obs_t[batch])
                taken_probs = action_probs.gather(1, actions_t[batch].unsqueeze(1)).squeeze(1)
                log_probs = torch.log(taken_probs.clamp_min(self._EPS))
                ratio = torch.exp(log_probs - old_log_probs[batch])
                surr1 = ratio * adv_batch
                surr2 = torch.clamp(ratio, 1.0 - self.ppo_ratio_clip, 1.0 + self.ppo_ratio_clip) * adv_batch
                entropy = -(action_probs * torch.log(action_probs.clamp_min(self._EPS))).sum(dim=-1).mean()
                policy_loss = -torch.min(surr1, surr2).mean() - self.entropy_coef * entropy

                self.policy_optimizer.zero_grad()
                policy_loss.backward()
                nn.utils.clip_grad_norm_(self.policy_network.parameters(), self.max_grad_norm)
                self.policy_optimizer.step()

                # ---- value update: regression on returns, optionally clipped ----
                values_batch = self._predict_values(obs_t[batch])
                squared_error = (values_batch - ret_batch) ** 2
                if clip_range is not None:
                    # Clip the move away from the rollout-time prediction.
                    old_batch = old_values[batch]
                    clipped_values = old_batch + torch.clamp(values_batch - old_batch, -clip_range, clip_range)
                    squared_error = torch.max(squared_error, (clipped_values - ret_batch) ** 2)
                value_loss = 0.5 * squared_error.mean()

                self.value_optimizer.zero_grad()
                value_loss.backward()
                nn.utils.clip_grad_norm_(self.value_network.parameters(), self.max_grad_norm)
                self.value_optimizer.step()

    def calculate_advantages(self, rewards, dones, values, next_values):
        """Compute generalized advantage estimates for one time-ordered trajectory.

        Args:
            rewards: Reward received at each step.
            dones: 1 where the step ended the episode, else 0.
            values: Value estimate of each step's observation.
            next_values: Value estimate of each step's next observation.

        Returns:
            Float32 tensor of advantages on ``self.device``, detached from autograd.
        """

        def as_float(data):
            return torch.as_tensor(data, dtype=torch.float32, device=self.device).detach()

        rewards, dones = as_float(rewards), as_float(dones)
        values, next_values = as_float(values), as_float(next_values)

        advantages = torch.zeros_like(rewards)
        running = torch.zeros((), dtype=torch.float32, device=self.device)
        for t in reversed(range(len(rewards))):
            # The mask of step t itself decides whether anything after t counts:
            # it gates both the bootstrap value and the accumulated advantage.
            non_terminal = 1.0 - dones[t]
            delta = rewards[t] + self.gamma * non_terminal * next_values[t] - values[t]
            running = delta + self.gamma * self.gae_lambda * non_terminal * running
            advantages[t] = running
        return advantages


In [3]:
# Step 1: Define the policy network
class PolicyNetwork(nn.Module):
    """Feed-forward policy: one 64-unit ReLU hidden layer, softmax over the outputs."""

    def __init__(self, input_dim, output_dim):
        """Create the hidden layer (``fc``) and the output layer (``fc2``)."""
        super().__init__()
        self.fc = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return probabilities that sum to one along the last dimension."""
        hidden = nn.functional.relu(self.fc(x))
        logits = self.fc2(hidden)
        return nn.functional.softmax(logits, dim=-1)

In [4]:
# Step 2: Instantiate the PPO agent
# The second constructor argument is presumably the number of patrolling agents
# -- TODO confirm against GraphPatrolEnv.
env = GraphPatrolEnv(PatrolGraph("cumberland.graph"), 3)

# Both the network input and output sizes are taken from len(env.pg.graph).
input_dim, output_dim = len(env.pg.graph), len(env.pg.graph)
policy_net = PolicyNetwork(input_dim, output_dim)

# Hyper-parameters gathered in one place so they are easy to tweak.
ppo_hyperparams = {
    "policy_lr": 0.001,
    "value_lr": 0.001,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "entropy_coef": 0.01,
    "value_clip": True,
    "num_epochs": 4,
    "num_mini_batches": 32,
    "ppo_ratio_clip": 0.2,
    "max_grad_norm": 0.5,
}

# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
agent = PPOAgent(policy_net, device=device, **ppo_hyperparams)

In [5]:
# Step 3: Train the agent
num_episodes = 1000
for episode in range(num_episodes):
    # Begin a new episode with a zeroed reward counter.
    obs, done, episode_reward = env.reset(), False, 0

    # Step the environment until it reports that the episode is over.
    while True:
        actions = agent.act(obs)
        next_obs, rewards, done, _ = env.step(actions)
        agent.collect(obs, actions, rewards, next_obs, done)
        obs = next_obs
        episode_reward = episode_reward + sum(rewards.values())
        if done:
            break

    # One update per finished episode, using the transitions collected above.
    agent.train()

    # Print episode information
    print(f"Episode {episode + 1}/{num_episodes} - Reward: {episode_reward}")


RuntimeError: both arguments to matmul need to be at least 1D, but they are 0D and 2D

In [None]:
# Step 4: Use the trained agent
# Roll out a single episode with the trained agent, drawing the world after
# every environment step.
obs, done = env.reset(), False
while True:
    actions = agent.act(obs)
    obs, rewards, done, _ = env.step(actions)
    env.plot_world()
    if done:
        break