# **Report Group 12** 


#### **Loading packages**


In [1]:
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import wandb
from gymPacMan import gymPacMan_parallel_env
import torch.nn.functional as F
from collections import deque
import random
import os
from dataclasses import dataclass
import pandas as pd

import cProfile
import pstats

torch.autograd.set_detect_anomaly(True)


if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")

wandb.login(key = '', relogin = True)
team_name = 'RL_PacMan'

from util_functions import *


Running on the GPU


wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\josse\_netrc


Running on the GPU


# **Section 1: All functions**

## **Networks**


### Agent networks


#### CNN

In [2]:
class AgentQNetwork(nn.Module):
    def __init__(self,obs_shape, action_dim,hidden_dim=64):
        super(AgentQNetwork, self).__init__()
        self.c1 = nn.Conv2d(obs_shape[0], 32, kernel_size=3, stride=1, padding=1)
        self.c2 = nn.Conv2d(32, hidden_dim, kernel_size=3, stride=1, padding=1)
        self.c3 = nn.Conv2d(hidden_dim, 4*hidden_dim, kernel_size=3, stride=1, padding=1)
        self.global_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Linear(4*hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)


    def forward(self, obs):
        x = torch.relu(self.c1(obs))
        x = torch.relu(self.c2(x))
        x = torch.relu(self.c3(x))
        x = self.global_pool(x)
        x = torch.flatten(x, start_dim=1)
        x = torch.relu(self.fc1(x))
        q_values = self.fc2(x)
        return q_values

#### CNN with attention

#### LSTM

In [3]:
# Define the CNN-LSTM-Q Network


class CNNLSTMQNetwork_hidden(nn.Module):
    def __init__(self, embed_dim, obs_shape, action_space):
        super(CNNLSTMQNetwork_hidden, self).__init__()
        self.embed_dim = embed_dim

        # Shared encoder for the agent's state
        self.shared_encoder = nn.Sequential(
            nn.Conv2d(obs_shape[0], 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))  # Adaptive pooling to handle varying input sizes
        )

        # Fully connected layer to project encoded state to embed_dim
        self.fc = nn.Linear(128 * 1 * 1, embed_dim)

        # LSTM layer to model temporal dependencies
        self.lstm = nn.LSTM(embed_dim, embed_dim, num_layers=1, batch_first=True)

        # Final layer to compute Q-values
        self.final_layer = nn.Sequential(
            nn.Linear(embed_dim, action_space),
            nn.SiLU()  # Swish activation
        )

    def forward(self, state, hidden):
        # Process single state using the shared encoder
        cnn_features = []
        for t in range(5):
            
            encoded_state = self.shared_encoder(state[:, t, :, :, :])  # Encode the state
            encoded_state = encoded_state.view(encoded_state.size(0), -1)  # Flatten to [batch_size, 128 * 1 * 1]
            projected_state = self.fc(encoded_state).unsqueeze(1)  # Project to embed_dim and add sequence dimension
            cnn_features.append(projected_state)
            
        cnn_features = torch.cat(cnn_features, dim=1)  # Shape: (batch_size, seq_len, cnn_output_size)
        # Apply LSTM to model temporal dependencies
        hx, cx = hidden[0, :, :, :], hidden[1, :, :, :]
        lstm_out, (hx, cx) = self.lstm(cnn_features, (hx, cx))
        hidden_new = torch.stack([hx.detach(), cx.detach()], dim=0)
        # Use the output of the LSTM for the final layer
        q_values = self.final_layer(lstm_out[:, -1, :])  # Shape: [batch_size, action_space]

        return q_values, hidden_new

    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.embed_dim),
                torch.zeros(1, batch_size, self.embed_dim))


### Mixing Networks

#### Simple

In [128]:
class SimpleQMixer(nn.Module):
    def __init__(self, n_agents, state_shape):
        """
        Simple QMIX Mixing Network with Convolutional layers.
        Args:
            n_agents (int): Number of agents.
            state_shape (tuple): Shape of the agent-specific input (n_agents, channels, H, W).
        """
        super(SimpleQMixer, self).__init__()
        
        self.n_agents = n_agents
        self.state_shape = state_shape

        self.conv = nn.Sequential(
            nn.Conv2d(state_shape[1], 16, kernel_size=3, stride=1, padding=1),  # state_shape[1] = number of channels
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )
        
        dummy_input = torch.zeros(1, state_shape[1], state_shape[2], state_shape[3])  # [batch_size=1, channels, H, W]
        conv_output_size = self.conv(dummy_input).view(1, -1).size(1)
        
        # Hypernetwork layers
        self.hyper_w = nn.Sequential(
            nn.Linear(conv_output_size * n_agents, 64),
            nn.ReLU(),
            nn.Linear(64, n_agents)
        )
        
        self.hyper_b = nn.Sequential(
            nn.Linear(conv_output_size * n_agents, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, agent_qs, agent_states):
        """
        Forward pass for the mixing network with Convolutional layers.
        Args:
            agent_qs (torch.Tensor): Tensor of shape [batch_size, n_agents] containing Q-values for each agent.
            agent_states (torch.Tensor): Tensor of shape [batch_size, n_agents, channels, H, W].
        Returns:
            torch.Tensor: Global Q-value of shape [batch_size, 1].
        """
        batch_size, num_agents, channels, height, width = agent_states.shape
        
        agent_states_r = agent_states.view(batch_size * num_agents, channels, height, width)  # [batch_size * n_agents, channels, H, W]
        
        conv_features = self.conv(agent_states_r)  # [batch_size * n_agents, conv_channels, H, W]
        flattened_features = conv_features.view(batch_size, num_agents, -1)  # [batch_size, n_agents, conv_output_size]
        
        combined_features = flattened_features.view(batch_size, -1)  # [batch_size, n_agents * conv_output_size]
        
        w = torch.abs(self.hyper_w(combined_features)).view(batch_size, self.n_agents, 1)
        b = self.hyper_b(combined_features).view(batch_size, 1, 1)
        
        mixed_qs = torch.bmm(agent_qs.unsqueeze(1), w).squeeze(1) + b
        
        return mixed_qs




#### Attention

In [129]:

class AttentionMixer(nn.Module):
    def __init__(self, n_agents, state_shape=(2, 8, 10, 20), embed_dim=32, n_heads=4):
        super(AttentionMixer, self).__init__()

        self.n_agents = n_agents
        self.state_shape = state_shape[1:]  # Remove agent dimension
        self.embed_dim = embed_dim
        self.n_heads = n_heads

        # Shared Conv2D encoder for each agent
        self.shared_encoder = nn.Sequential(
            nn.Conv2d(self.state_shape[0], 16, kernel_size=3, stride=1, padding='same'),
            nn.SiLU(),  # Swish activation
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding='same'),
            nn.SiLU(),  # Swish activation
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding='same'),
            nn.SiLU(),  # Swish activation
            nn.AdaptiveAvgPool2d(1)
        )

        # Fully connected layer to project to embed_dim
        self.fc = nn.Sequential(
            nn.Linear(64, embed_dim),
            nn.SiLU()  # Swish activation
        )

        for m in self.shared_encoder:
            if isinstance(m, nn.Conv2d):
                nn.init.uniform_(m.weight, 0.9, 1.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.1)
        for m in self.fc:
            if isinstance(m, nn.Linear):
                nn.init.uniform_(m.weight, 0.9, 1.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.1)

        # Multi-head attention for inter-agent interaction
        self.attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=n_heads, batch_first=True)

        # Hypernetwork for mixing weights_enemy and biases
        self.hyper_w = nn.Sequential(
            nn.Linear(embed_dim * n_agents, embed_dim * embed_dim),
            nn.SiLU(),  # Swish activation
            nn.Linear(embed_dim * embed_dim, n_agents * embed_dim)
        )
        self.hyper_b = nn.Sequential(
            nn.Linear(embed_dim * n_agents, embed_dim),
            nn.SiLU(),  # Swish activation
            nn.Linear(embed_dim, embed_dim)
        )

        # Final layer to compute Q_tot
        self.final_layer = nn.Sequential(
            nn.Linear(embed_dim, 1),
            nn.SiLU()  # Swish activation
        )

        for m in self.hyper_w:
            if isinstance(m, nn.Linear):
                nn.init.uniform_(m.weight, 0.9, 1.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.1)

        for m in self.hyper_b:
            if isinstance(m, nn.Linear):
                nn.init.uniform_(m.weight, 0.9, 1.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.1)

        for m in self.final_layer:
            if isinstance(m, nn.Linear):
                nn.init.uniform_(m.weight, 0.9, 1.1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.1)


    def forward(self, agent_qs, states):
        batch_size = states.size(0)

        # Process each agent's state individually using the shared encoder
        agent_features = []
        for i in range(self.n_agents):
            agent_state = states[:, i, :, :, :]  # Shape: [batch_size, 8, 10, 20]
            encoded_state = self.shared_encoder(agent_state)  # Encode individual agent
            encoded_state = encoded_state.view(batch_size, -1)  # Flatten to [batch_size, 128]
            projected_state = self.fc(encoded_state)  # Project to embed_dim
            agent_features.append(projected_state)

        # Stack agent features: Shape [batch_size, n_agents, embed_dim]
        agent_features = torch.stack(agent_features, dim=1)

        # Apply attention to model inter-agent interaction
        attention_out, _ = self.attention(agent_features, agent_features, agent_features)

        # Flatten attention output for hypernetwork
        attention_out_flat = attention_out.reshape(batch_size, -1)  # Shape: [batch_size, n_agents * embed_dim]

        # Compute hypernetwork weights and biases
        hyper_w = F.softplus(self.hyper_w(attention_out_flat).view(batch_size, self.n_agents, self.embed_dim))
        hyper_b = self.hyper_b(attention_out_flat).view(batch_size, self.embed_dim)

        # Combine agent Q-values with weights and biases
        agent_qs = agent_qs.view(batch_size, self.n_agents, 1)  # Shape: [batch_size, n_agents, 1]
        weighted_qs = torch.bmm(agent_qs.transpose(1, 2), hyper_w).squeeze(1) + hyper_b

        # Final projection to Q_tot
        q_tot = self.final_layer(weighted_qs).unsqueeze(1)
        return q_tot


## **Loss calculations**



### IQL

In [130]:

def compute_td_loss_iql(agent_q_networks, target_q_networks, batch, weights=None, gamma=0.99, lambda_=0.1):
    """
    Computes the TD loss for QMix training using the Huber loss.

    Args:
        agent_q_networks (list): List of Q-networks for each agent.
        mixing_network (SimpleQMixer): The mixing network to compute global Q-values.
        target_q_networks (list): List of target Q-networks for each agent.
        batch (tuple): A batch of experiences (states, actions, rewards, next_states, dones).
        weights (torch.Tensor): Importance sampling weights (optional).
        gamma (float): Discount factor for future rewards.
        lambda_ (float): Regularization factor for stability.

    Returns:
        torch.Tensor: Total loss for training.
    """
    states, actions, rewards, next_states, dones = batch

    # Convert to tensors and move to device
    states = torch.tensor(states, dtype=torch.float32).to(device)
    actions = torch.tensor(actions, dtype=torch.long).to(device)
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
    dones = torch.tensor(dones, dtype=torch.float32).to(device)

    # Current Q-values for each agent
    agent_q_values = []
    for agent_index, q_net in enumerate(agent_q_networks):
        q_vals = q_net(states[:, agent_index, :, :, :])  # Get Q-values for each agent
        agent_q_values.append(
            q_vals.gather(dim=1, index=actions[:, agent_index].unsqueeze(1)))  # Select Q-value for taken action
    agent_q_values = torch.cat(agent_q_values, dim=1)  # Shape: (batch_size, n_agents)

    # Target Q-values using Double DQN
    with torch.no_grad():
        # Get actions from current Q-networks
        next_agent_q_values = []
        for agent_index, (q_net, target_net) in enumerate(zip(agent_q_networks, target_q_networks)):
            next_q_vals = q_net(next_states[:, agent_index, :, :, :])  # Get Q-values from current network
            max_next_actions = next_q_vals.argmax(dim=1, keepdim=True)  # Greedy actions
            target_q_vals = target_net(next_states[:, agent_index, :, :, :])  # Get Q-values from target network\
            max_next_q_vals = target_q_vals.gather(1, max_next_actions)
            done_mask = dones[:, 0, 0].unsqueeze(1)
            filtered_target_q_vals = max_next_q_vals * (1 - done_mask)

            next_agent_q_values.append(filtered_target_q_vals)  # Use target Q-values for selected actions
        next_agent_q_values = torch.cat(next_agent_q_values, dim=1)  # Shape: (batch_size, n_agents)

    # Independent Q-learning target for each agent (all members of the blue team receive the same reward)
    target_q = rewards[:, 0, 0].unsqueeze(1) + gamma * next_agent_q_values

    # Compute Huber loss, try also with MSE loss
    loss_fn = torch.nn.HuberLoss()

    loss_agent1 = loss_fn(agent_q_values[:, 0], target_q[:, 0])
    loss_agent2 = loss_fn(agent_q_values[:, 1], target_q[:, 1])

    return loss_agent1, loss_agent2

### Mixed Q-Learning


In [261]:
def compute_td_loss_qmix(agent_q_networks, target_q_networks, mixing_network, batch, weights = None, gamma=0.99, lambda_=0.1):
    states, actions, rewards, next_states, dones = batch

    # Convert to tensors and move to device
    states = torch.tensor(states, dtype=torch.float32).to(device)
    actions = torch.tensor(actions, dtype=torch.long).to(device)
    rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
    next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
    dones = torch.tensor(dones, dtype=torch.float32).to(device)

     # Get current Q-values for each agent
    agent_qs = []
    for i, agent_q_network in enumerate(agent_q_networks):
        agent_q = agent_q_network(states[:, i, :, :, :])  # Shape: (batch_size, num_actions)
        agent_q = agent_q.gather(dim=1, index=actions[:,i].unsqueeze(1))  # Select action Q-values
        agent_qs.append(agent_q)
    agent_qs = torch.cat(agent_qs, dim=1)  # Shape: (batch_size, n_agents)
    # Get target Q-values for each agent using Double DQN
    with torch.no_grad():
        target_agent_qs = []
        for i, (agent_q_network, target_q_network) in enumerate(
            zip(agent_q_networks, target_q_networks)
        ):
            # Double DQN: Get greedy actions from the current network
            next_q_values = agent_q_network(next_states[:, i, :, :, :])  # Shape: (batch_size, num_actions)
            greedy_actions = torch.argmax(next_q_values, dim=1, keepdim=True)
            # Get target Q-values using the target network
            target_q_values = target_q_network(next_states[:, i, :, :, :])  # Shape: (batch_size, num_actions)
            target_q = target_q_values.gather(1, greedy_actions)
            done_mask = dones[:, 0, 0].unsqueeze(1)
            filtered_target_q = target_q * (1 - done_mask)
            target_agent_qs.append(filtered_target_q)
        target_agent_qs = torch.cat(target_agent_qs, dim=1)  # Shape: (batch_size, n_agents)

        # Compute the total target Q-value using the mixing network
        target_q_tot = mixing_network(target_agent_qs, next_states).squeeze(1)   # Shape: (batch_size, 1)
    target_q_tot = rewards.sum(dim=2) + gamma * target_q_tot

    
    # Compute the global Q-value using the mixing network
    q_tot = mixing_network(agent_qs, states).squeeze(1)  # Shape: (batch_size, 1)
    # Compute Huber loss
    td_loss = F.huber_loss(target_q_tot, q_tot,  reduction="none")

    # Apply weights if prioritized experience replay is used
    if weights is not None:
        weights = torch.tensor(weights, dtype=torch.float32).to(device)
        td_loss *= weights
        


    # Return the mean loss
    total_loss = td_loss.mean()
    return total_loss

### Mixed Q-Learning for LSTM

## **Replay buffers**

### Normal buffer

In [262]:

class ReplayBuffer:
    def __init__(self, buffer_size=10000):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        experiences = random.sample(self.buffer, batch_size)

        # Restructure the batch into separate arrays for states, actions, rewards, next_states, and dones
        states = np.array([exp[0].cpu().numpy() for exp in experiences], dtype=np.float32)
        actions = np.array([exp[1] for exp in experiences], dtype=np.int64)
        rewards = np.array([exp[2] for exp in experiences])
        next_states = np.array([exp[3].cpu().numpy() for exp in experiences])
        dones = np.array([exp[4] for exp in experiences])

        return states, actions, rewards, next_states, dones

    def size(self):
        return len(self.buffer)


### Prioritized buffer

In [263]:
import numpy as np

class SumTree:
    """
    A Binary Sum Tree data structure for efficient sampling and priority updates.
    Each leaf node holds a priority, and internal nodes store the sum of leaves.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        # Tree structure:
        # The tree is represented by an array. For a binary tree:
        # - The leaf nodes start at index `size` (which is `capacity`).
        # - For node i, children are at indices 2*i and 2*i+1.
        self.tree = np.zeros(2 * capacity, dtype=np.float32)  # sum tree
        self.data_pointer = 0
    
    def add(self, p):
        # Add a new priority at the leaf node
        idx = self.data_pointer + self.capacity
        self.update(idx, p)
        self.data_pointer = (self.data_pointer + 1) % self.capacity
    
    def update(self, idx, p):
        # Update the priority at the leaf node `idx`
        change = p - self.tree[idx]
        self.tree[idx] = p
        
        # Propagate the change up the tree
        idx //= 2
        while idx >= 1:
            self.tree[idx] += change
            idx //= 2

    def total_priority(self):
        # The root node stores the total sum of all priorities
        return self.tree[1]

    def get_leaf(self, s):
        # Get leaf index, priority given a cumulative sum `s`
        idx = 1
        while idx < self.capacity:  # while not a leaf node
            left = 2 * idx
            right = left + 1
            if s <= self.tree[left]:
                idx = left
            else:
                s -= self.tree[left]
                idx = right
        # Return leaf index, and priority
        return idx, self.tree[idx]

    def __len__(self):
        return self.capacity


In [264]:
class MultiAgentPrioritizedReplayBufferTree:
    def __init__(self, buffer_size=10000, state_shape=(2,8,20,20), num_agents=2, alpha=0.6, beta_start=0.4, beta_frames=100000, device='cuda'):
        self.buffer_size = buffer_size
        self.alpha = alpha
        self.num_agents = num_agents
        self.device = device

        # Beta schedule parameters
        self.beta_start = beta_start
        self.beta_frames = beta_frames
        self.frame = 1

        # Preallocate memory
        # Assuming states: shape (num_agents, height, width)
        self.states = np.zeros((buffer_size, *state_shape), dtype=np.float32)
        self.next_states = np.zeros((buffer_size, *state_shape), dtype=np.float32)
        self.actions = np.zeros((buffer_size, num_agents), dtype=np.int64)
        self.rewards = np.zeros((buffer_size, 1, 2), dtype=np.float32)
        self.dones = np.zeros((buffer_size, 1, 2), dtype=np.float32)

        self.count = 0
        self.pos = 0

        # Priority tree
        self.tree = SumTree(buffer_size)
        self.max_priority = 1.0  # Max priority for newly inserted transitions

    def _beta_by_frame(self):
        # Linearly increase beta from beta_start to 1.0 over beta_frames
        return min(1.0, self.beta_start + (1.0 - self.beta_start) * self.frame / self.beta_frames)

    def size(self):
        return self.count

    def add(self, experience):
        state, action, reward, next_state, done = experience
        if torch.is_tensor(state):
            state = state.cpu().numpy()
        if torch.is_tensor(next_state):
            next_state = next_state.cpu().numpy()

        self.states[self.pos] = state
        self.next_states[self.pos] = next_state
        self.actions[self.pos] = action
        self.rewards[self.pos] = reward
        self.dones[self.pos] = done

        # Add with max priority (ensures new experiences are likely to be sampled)
        self.tree.add(self.max_priority)

        self.pos = (self.pos + 1) % self.buffer_size
        self.count = min(self.count + 1, self.buffer_size)

    def sample(self, batch_size, q_networks, target_networks):
        beta = self._beta_by_frame()
        self.frame += 1

        # Sample based on priority
        indices = np.zeros(batch_size, dtype=np.int32)
        priorities = np.zeros(batch_size, dtype=np.float32)
        segment = self.tree.total_priority() / batch_size

        for i in range(batch_size):
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, p = self.tree.get_leaf(s)
            # Leaf index to data index:
            data_index = idx - self.buffer_size
            indices[i] = data_index
            priorities[i] = p

        # Compute weights
        sampling_probabilities = priorities / self.tree.total_priority()
        weights = (self.count * sampling_probabilities) ** (-beta)
        weights /= weights.max()  # Normalize

        # Gather samples
        states_batch = self.states[indices]
        actions_batch = self.actions[indices]
        rewards_batch = self.rewards[indices]
        next_states_batch = self.next_states[indices]
        dones_batch = self.dones[indices]

        # Convert to torch tensors
        states_batch_t = torch.tensor(states_batch, dtype=torch.float32, device=self.device)
        actions_batch_t = torch.tensor(actions_batch, dtype=torch.long, device=self.device)
        rewards_batch_t = torch.tensor(rewards_batch, dtype=torch.float32, device=self.device)
        next_states_batch_t = torch.tensor(next_states_batch, dtype=torch.float32, device=self.device)
        dones_batch_t = torch.tensor(dones_batch, dtype=torch.float32, device=self.device)
        weights_t = torch.tensor(weights, dtype=torch.float32, device=self.device).unsqueeze(1)

        # Compute TD errors for sampled transitions
        td_errors = self.compute_td_errors(
            states_batch_t, actions_batch_t, rewards_batch_t,
            next_states_batch_t, dones_batch_t, q_networks, target_networks
        ).detach().cpu().numpy()

        # Update priorities incrementally
        new_priorities = np.max(td_errors, axis=1) + 1e-6
        self.update_priorities(indices, new_priorities)

        return states_batch_t, actions_batch_t, rewards_batch_t, next_states_batch_t, dones_batch_t, weights_t, indices

    def update_priorities(self, indices, priorities):
        # Update priorities in the sum tree
        for idx, p in zip(indices, priorities):
            # Clip the priority to prevent numerical issues
            p = np.max(p, axis=0) + 1e-6
            self.tree.update(idx + self.buffer_size, p)
            self.max_priority = max(self.max_priority, p)

    def compute_td_errors(self, states, actions, rewards, next_states, dones, q_networks, target_networks):
        # Calculate TD errors for each sampled transition
        with torch.no_grad():
            agent_q_values = []
            target_q_values = []

            for agent_idx in range(self.num_agents):
                # Current Q-values for chosen actions
                q_vals = q_networks[agent_idx](states[:, agent_idx:agent_idx+1, :, :].squeeze(1)) 
                q_taken = q_vals.gather(dim=1, index=actions[:, agent_idx].unsqueeze(1))

                # Target Q-values for next states (Double Q-Learning)
                next_q_vals_target = target_networks[agent_idx](next_states[:, agent_idx:agent_idx+1, :, :].squeeze(1))
                next_q_vals = q_networks[agent_idx](next_states[:, agent_idx:agent_idx+1, :, :].squeeze(1))

                # Choose greedy action from next_q_vals
                greedy_actions = next_q_vals.argmax(dim=1, keepdim=True)
                q_target = next_q_vals_target.gather(dim=1, index=greedy_actions)

                agent_q_values.append(q_taken)
                target_q_values.append(q_target)

            agent_q_values = torch.cat(agent_q_values, dim=1)  # (batch, num_agents)
            target_q_values = torch.cat(target_q_values, dim=1) # (batch, num_agents)

            # TD error: abs(R + gamma * Q_target - Q_current)
            # Assuming a single-step return for simplicity, modify if needed
            gamma = 0.99
            td_error = torch.abs(rewards + (1 - dones) * gamma * target_q_values - agent_q_values)
            return td_error


In [265]:
class MultiAgentPrioritizedReplayBuffer:
    def __init__(self, buffer_size=10000, alpha=0.6, num_agents=2):
        self.buffer = deque(maxlen=buffer_size)
        self.alpha = alpha
        self.num_agents = num_agents

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size, beta=0.4, q_networks=None, target_networks=None):

        states = np.array([exp[0].cpu().numpy() for exp in self.buffer], dtype=np.float32)
        actions = np.array([exp[1] for exp in self.buffer], dtype=np.int64)
        rewards = np.array([exp[2] for exp in self.buffer])
        next_states = np.array([exp[3].cpu().numpy() for exp in self.buffer])
        dones = np.array([exp[4] for exp in self.buffer])
        td_errors = self.compute_td_errors(states,actions,rewards,next_states,q_networks, target_networks)

        priorities = np.max(td_errors, axis=1) + 1e-6
        print(priorities.shape)
        probabilities = priorities ** self.alpha
        probabilities /= probabilities.sum()
        print(probabilities.shape)
        print(len(self.buffer))

        indices = np.random.choice(len(self.buffer), batch_size, p=probabilities)
        experiences = [self.buffer[idx] for idx in indices]

        weights = ((len(self.buffer) * probabilities[indices]) ** -beta).reshape(32, 1)

        states_batch, actions_batch, rewards_batch, next_states_batch, terminations_batch = zip(*experiences)
        states_batch = np.array([x.cpu().numpy() for x in states_batch], dtype=np.float32)
        actions_batch = np.array([x for x in actions_batch], dtype=np.int64)
        rewards_batch = np.array([x for x in rewards_batch], dtype=np.float32)
        next_states_batch = np.array([x.cpu().numpy() for x in next_states_batch], dtype=np.float32)
        terminations_batch = np.array([x for x in terminations_batch], dtype=np.float32)

        return states_batch, actions_batch, rewards_batch, next_states_batch, terminations_batch, weights

    def compute_td_errors(self, state, action, reward, next_state, q_networks, target_networks):
        states = torch.tensor(state, dtype=torch.float32).to(device)
        actions = torch.tensor(action, dtype=torch.long).to(device)
        rewards = torch.tensor(reward, dtype=torch.float32).to(device)
        next_states = torch.tensor(next_state, dtype=torch.float32).to(device)
        agent_q_values=[]
        target_q_values = []
        for agent_idx in range(len(q_networks)):

            q_vals = q_networks[agent_idx](states[:, agent_idx, :, :, :])
            agent_q_values.append(q_vals.gather(dim=1, index=actions[:, agent_idx].unsqueeze(1)))  # Select Q-value for taken action
            
            next_q_values = target_networks[agent_idx](states[:, agent_idx, :, :, :])
            greedy_actions = next_q_values.argmax( dim=1, keepdim= True)
            target_q_values.append(q_vals.gather(dim=1, index=greedy_actions))
        
        target_q_values = torch.cat(target_q_values, dim=1)
        agent_q_values = torch.cat(agent_q_values, dim=1)  # Shape: (batch_size, n_agents)
        # TD-error: δ = Q(s, a) - target

        td_error = torch.abs(rewards.squeeze(1) + target_q_values - agent_q_values)
        td_error = np.array(td_error.cpu().detach().numpy())
        return td_error
    
    def size(self):
        return len(self.buffer)


### Buffer for sequences

In [266]:
class ReplayBuffer_sequence:
    def __init__(self, buffer_size=10000, sequence_length=5):
        self.buffer = deque(maxlen=buffer_size)
        self.sequence_length = sequence_length

    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        sequences = []
        for _ in range(batch_size):
            start_index = random.randint(0, len(self.buffer) - self.sequence_length)
            sequence = [self.buffer[i] for i in range(start_index, start_index + self.sequence_length)]
            sequences.append(sequence)
        
        # Transpose the sequences to get batches of each component
        
        states = torch.stack([torch.stack([torch.stack(seq[0]).to(device) for seq in sequence]) for sequence in sequences], dim=0)
        actions = torch.stack([torch.stack([torch.tensor(seq[1]).to(device) for seq in sequence]) for sequence in sequences], dim=0)
        rewards = torch.stack([torch.stack([torch.tensor(seq[2]).to(device) for seq in sequence]) for sequence in sequences], dim=0)
        next_states = torch.stack([torch.stack([torch.stack(seq[3]).to(device) for seq in sequence]) for sequence in sequences], dim=0)
        terminations = torch.stack([torch.stack([torch.tensor(seq[4], dtype=torch.int).to(device) for seq in sequence]) for sequence in sequences], dim=2)
        hidden_states = torch.cat([sequence[-1][5] for sequence in sequences], dim = -2).to(device)
        next_hidden_states = torch.cat([sequence[-1][6] for sequence in sequences], dim = -2).to(device)
        
        return (states, actions, rewards, next_states, terminations, hidden_states, next_hidden_states)

    def size(self):
        return len(self.buffer)

# **Utility functions**

## epsilon greedy, update and weights

In [267]:
# Adjusted epsilon-greedy action function
def epsilon_greedy_action(agent_q_network, state, epsilon, legal_actions, exploring = True):
    
    if random.random() < epsilon and exploring:
        # Explore: take a random action
        action = random.choice(legal_actions)
    else:
        state = state.unsqueeze(0).to(device)  # Add batch dimension
        q_values = agent_q_network(state)
        q_values = q_values.cpu().detach().numpy()
        action = np.random.choice(np.flatnonzero(q_values == q_values.max()))

    return action


def update_target_network(agent_q_networks, target_q_networks):
    for target, source in zip(target_q_networks, agent_q_networks):
        target.load_state_dict(source.state_dict())



def interpolate_weights(start_weights, end_weights, factor):
    return [(1 - factor) * start + factor * end for start, end in zip(start_weights, end_weights)]

def get_dynamic_weights(episode, n_episodes):
    # Define the initial weights for the beginning, middle, and end
    weights_beginning = [1, 0, 0, 0, 0, 0]
    weights_middle = [0, 0.33, 0.34, 0.33, 0, 0]
    weights_end = [0, 0, 0, 0, 0.5, 0.5]

    # Calculate the transition points
    transition1 = n_episodes // 3
    transition2 = 2 * n_episodes // 3

    # Adjust weights based on the current episode
    if episode < transition1:
        factor = episode / transition1
        return interpolate_weights(weights_beginning, weights_middle, factor)
    elif episode < transition2:
        factor = (episode - transition1) / (transition2 - transition1)
        return interpolate_weights(weights_middle, weights_end, factor)
    else:
        factor = (episode - transition2) / (n_episodes - transition2)
        return interpolate_weights(weights_middle, weights_end, factor)
    

import os

def create_folder_if_not_exists(folder_path):
    os.makedirs(folder_path, exist_ok=True)

## individual rewards

In [268]:
## individual reward
from capture import AgentRules
def calculate_individual_reward(env,agentIndex,action, reward, distancer, counter):

    next_state = env.game.state.generateSuccessor(agentIndex, env.action_mapping[action])
    # --- Reward for capturing food ---
    if env.game.state.getAgentState(agentIndex).numCarrying < next_state.getAgentState(agentIndex).numCarrying:
        reward += 1

    # --- Reward for delivering food ---
    if env.game.state.getAgentState(agentIndex).numCarrying > 0 and next_state.getAgentState(agentIndex).numCarrying == 0 and env.game.state.getAgentState(agentIndex).start.pos != next_state.getAgentPosition(agentIndex):
        print('gave food to side')
        reward += (env.game.state.getAgentState(agentIndex).numCarrying)*2

    # --- Maze Distance penalty ---
    start_position = env.game.state.getAgentState(agentIndex).start.pos
    next_position = next_state.getAgentPosition(agentIndex)
    maze_distance = distancer.getDistance(next_position, start_position)

    radius = 10
    if maze_distance <= radius and counter<=0:
        reward -= (radius - maze_distance)*0.001

    #--- legal actions reward ---
    legal_actions = AgentRules.getLegalActions(env.game.state, agentIndex)
    if env.action_mapping[action] in legal_actions:
        reward += 0.05  
    else:
        reward -= 0.05

           
    for red_team_index in [0, 2]:
        if (env.game.state.data.agentStates[red_team_index].isPacman and
                not next_state.data.agentStates[red_team_index].isPacman):
            if next_state.data.agentStates[red_team_index].configuration.pos == next_state.data.agentStates[red_team_index].start.pos:
                print('defended')
                reward += 0.75
    return reward

# **Section 2: Training loops** 

## IQL

In [269]:
import layout
from distanceCalculator import Distancer

def train_iql(env, name_experiment, agent_q_networks, target_q_networks, replay_buffer, train_p, learn_p, random_enemy=False, schedule=False, later_exploration = False):
    
    optimizer = optim.Adam([param for net in agent_q_networks for param in net.parameters()], lr=learn_p.lr_agent)
    #mixing_optimizer = optim.Adam(mixing_network.parameters(), lr=learn_p.lr_mixing)
    if schedule:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learn_p.gamma)
        #scheduler_mix = torch.optim.lr.scheduler.ExponentialLR(mixing_optimizer, gamma=learn_p.learn_gamma)
   
    n_episodes = train_p.n_episodes
    batch_size = train_p.batch_size
    gamma = train_p.gamma
    epsilon = train_p.epsilon  # Initial exploration probability
    epsilon_min = train_p.epsilon_min
    epsilon_decay = train_p.epsilon_decay
    target_update_frequency = train_p.target_update_frequency

    l = layout.getLayout('bloxCapture.lay')
    print(l.walls)
    distancer = Distancer(l)
    distancer.getMazeDistances()
    
    legal_actions = [0, 1, 2, 3, 4]
    agent_indexes = [1, 3]
    episode_rewards = []
    win_rates = []
    enemieName = env.enemieName
    if random_enemy:
        enemylist = ['randomTeam.py', 'baselineTeam.py', 'heuristicTeam.py', 'approxQTeam.py', 'MCTSTeam.py', 'AstarTeam.py']
        weights_enemy = get_dynamic_weights(0, n_episodes)
        enemieName = np.random.choice(enemylist, p=weights_enemy)

    create_folder_if_not_exists(name_experiment)

    for episode in range(n_episodes):
        steps_counter = 0
        print(f"enemy: {enemieName}")
        run_counter =[11,11]

        done = {agent_id: False for agent_id in agent_indexes}
        if random_enemy:
            weights_enemy = get_dynamic_weights(episode, n_episodes)
            enemieName = np.random.choice(enemylist, p=weights_enemy)
        env.reset(enemieName=enemieName)
        blue_player1_reward = 0
        blue_player2_reward = 0
        score = 0
        if later_exploration:
            exploring = False
        else:
            exploring = True
        while not all(done.values()):
            if later_exploration and steps_counter > 10:
                exploring = True
            actions = [-1 for _, _ in enumerate(env.agents)]
            states = []

            reward_dist = [0,0]
            for i, agent_index in enumerate(agent_indexes):
                obs_agent = env.get_Observation(agent_index)
                state = torch.tensor(obs_agent, dtype=torch.float32).to(device)
                states.append(state)
                action = epsilon_greedy_action(agent_q_networks[i], state, epsilon, legal_actions, exploring)
                actions[agent_index] = action
                reward_dist[i] = individual_reward(env,agent_index,action,reward_dist[i],distancer,run_counter[i])



            next_states, rewards, terminations, info = env.step(actions)
            for i, index in enumerate(agent_indexes):
                if env.game.state.getAgentState(index).start.pos == env.game.state.getAgentPosition(index):
                    run_counter[i] = 11
                else:
                    run_counter[i] -=1
            
            score -= info["score_change"]
            done = {key: value for key, value in terminations.items() if key in agent_indexes}
            blue_player1_reward += (rewards[1] - reward_dist[0])
            blue_player2_reward += (rewards[3] - reward_dist[1])

            next_states_converted = []
            rewards_converted = []
            terminations_converted = []
            actions_converted = []

            for index in agent_indexes:
                next_states_converted.append(list(next_states.values())[index])
                rewards_converted.append(rewards[index])
                terminations_converted.append(terminations[index])
                actions_converted.append(actions[index])

            next_states_converted = torch.stack(next_states_converted)
            states_converted = torch.stack(states)
            rewards_converted = [rewards_converted]
            terminations_converted = [terminations_converted]
            replay_buffer.add(
                (states_converted, actions_converted, rewards_converted, next_states_converted, terminations_converted))

            if replay_buffer.size() >= batch_size:
                batch = replay_buffer.sample(batch_size)
                loss1, loss2 = compute_td_loss_iql(agent_q_networks, target_q_networks, batch,
                                               gamma=gamma)
                wandb.log({"loss1": loss1})
                wandb.log({"loss2": loss2})

                # Zero gradients for all optimizers
                optimizer.zero_grad()

                # Backpropagate once for all losses
                loss1.backward(retain_graph=True)
                loss2.backward()

                # Update weights
                optimizer.step()
                if schedule:
                    scheduler.step()
                    wandb.log({"lr": optimizer.param_groups[0]['lr']})

            steps_counter += 1
        wandb.log({"epsilon": epsilon})
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        wandb.log({"blue_player1_reward": blue_player1_reward})
        wandb.log({"blue_player2_reward": blue_player2_reward})
        wandb.log({"episode": episode})
        wandb.log({'score': score})




        if (episode + 1) % target_update_frequency == 0:
            update_target_network(agent_q_networks, target_q_networks)
        
        if (episode) % target_update_frequency == 0:
            torch.save(agent_q_networks[0].state_dict(),f'{name_experiment}/agent1_{episode+1}.pth')
            torch.save(agent_q_networks[1].state_dict(),f'{name_experiment}/agent3_{episode+1}.pth')
         # Log rewards and win rates
        episode_rewards.append(blue_player1_reward+blue_player2_reward)
        print(blue_player1_reward)
        print(blue_player2_reward)
        print(f"Episode reward: {blue_player1_reward+blue_player2_reward}")

        if np.sum(env.game.state.getBlueFood().data) == 0:
            win_rates.append(1)
        else:
            win_rates.append(0)

        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            avg_win_rate = np.mean(win_rates[-10:])
            print(f"Episode {episode + 1}/{n_episodes}, Average Reward: {avg_reward:.2f}, Win Rate: {avg_win_rate:.2f}")

    # Plot rewards and win rates
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(episode_rewards, label="Average Reward")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title("Average Reward per Episode")
    plt.legend()

    plt.subplot(1, 2, 2)
    smoothed_win_rates = np.convolve(win_rates, np.ones(10)/10, mode='valid') if len(win_rates) > 10 else win_rates
    plt.plot(smoothed_win_rates, label="Win Rate (Smoothed)")
    plt.xlabel("Episode")
    plt.ylabel("Win Rate")
    plt.title("Win Rate Over Episodes")
    plt.legend()

    plt.tight_layout()
    plt.show()

    print("Training completed.")


## QMIX with everything

In [280]:
import layout
from distanceCalculator import Distancer

def train_qmix(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p, learn_p, random_enemy=False, schedule=False, later_exploration = False, individual_reward = False):
    
    optimizer = optim.Adam([param for net in agent_q_networks for param in net.parameters()], lr=learn_p.lr_agent)
    mixing_optimizer = optim.Adam(mixing_network.parameters(), lr=learn_p.lr_mix)
    if schedule:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learn_p.gamma)
        scheduler_mix = torch.optim.lr_scheduler.ExponentialLR(mixing_optimizer, gamma=learn_p.gamma)
   
    n_episodes = train_p.n_episodes
    batch_size = train_p.batch_size
    gamma = train_p.gamma
    epsilon = train_p.epsilon  # Initial exploration probability
    epsilon_min = train_p.epsilon_min
    epsilon_decay = train_p.epsilon_decay
    target_update_frequency = train_p.target_update_frequency

    l = layout.getLayout('bloxCapture.lay') 
    print(l.walls)
    distancer = Distancer(l)
    distancer.getMazeDistances()
    print(f'hallo: {distancer._distances}')
    
    legal_actions = [0, 1, 2, 3, 4]
    agent_indexes = [1, 3]
    episode_rewards = []
    win_rates = []
    enemieName = env.enemieName

    weights = None
    
    if random_enemy:
        enemylist = ['randomTeam.py', 'baselineTeam.py', 'heuristicTeam.py', 'approxQTeam.py', 'MCTSTeam.py', 'AstarTeam.py']
        weights_enemy = get_dynamic_weights(0, n_episodes)
        enemieName = np.random.choice(enemylist, p=weights_enemy)

    create_folder_if_not_exists(name_experiment)

    for episode in range(n_episodes):
        steps_counter = 0
        run_counter = [11,11]
        print(f"enemy: {enemieName}")

        done = {agent_id: False for agent_id in agent_indexes}
        if random_enemy:
            weights_enemy = get_dynamic_weights(episode, n_episodes)
            enemieName = np.random.choice(enemylist, p=weights_enemy)
        env.reset(enemieName=enemieName)
        blue_player1_reward = 0
        blue_player2_reward = 0
        score = 0
        if later_exploration:
            exploring = False
            start = episode/n_episodes * env.length/2 #slowly go to later exploring after half of the episode at the last episode
        else:
            exploring = True
    
        while not all(done.values()):
            print(f"episode: step:     {episode} : {steps_counter}")
            if later_exploration:
                if steps_counter > start:
                    exploring = True
            actions = [-1 for _, _ in enumerate(env.agents)]
            states = []
            if  individual_reward:
                indv_rewards = [0,0]
            for i, agent_index in enumerate(agent_indexes):
                obs_agent = env.get_Observation(agent_index)
                state = torch.tensor(obs_agent, dtype=torch.float32).to(device)
                states.append(state)
                action = epsilon_greedy_action(agent_q_networks[i], state, epsilon, legal_actions, exploring)
                actions[agent_index] = action
                if individual_reward:
                    indv_rewards[i] = calculate_individual_reward(env,agent_index,action,indv_rewards[i],distancer,run_counter[i])

            next_states, rewards, terminations, info = env.step(actions)

            if individual_reward:
                for i, index in enumerate(agent_indexes):
                    if env.game.state.getAgentState(index).start.pos == env.game.state.getAgentPosition(index):
                        run_counter[i] = 11
                    else:
                        run_counter[i] -=1
                    
            score -= info["score_change"]
            done = {key: value for key, value in terminations.items() if key in agent_indexes}
            blue_player1_reward += (rewards[1]+ 0 if not individual_reward else indv_rewards[0] )
            blue_player2_reward += (rewards[3]+ 0 if not individual_reward else indv_rewards[1])

            next_states_converted = []
            rewards_converted = []
            terminations_converted = []
            actions_converted = []

            for index in agent_indexes:
                next_states_converted.append(list(next_states.values())[index])
                rewards_converted.append(rewards[index])
                terminations_converted.append(terminations[index])
                actions_converted.append(actions[index])

            next_states_converted = torch.stack(next_states_converted)
            states_converted = torch.stack(states)
            rewards_converted = [rewards_converted]
            terminations_converted = [terminations_converted]
            replay_buffer.add(
                (states_converted, actions_converted, rewards_converted, next_states_converted, terminations_converted))

            if replay_buffer.size() >= batch_size:
                
                if type(replay_buffer) == MultiAgentPrioritizedReplayBuffer:
                    batch = replay_buffer.sample(batch_size, 0.4, agent_q_networks, target_q_networks)
                    weights = batch[-1]
                    batch = batch[:-1]
                elif type(replay_buffer) == MultiAgentPrioritizedReplayBufferTree:
                    batch = replay_buffer.sample(batch_size, agent_q_networks, target_q_networks)
                    weights = batch[-2]
                    batch = batch[:-2]
                    
                elif type(replay_buffer) == ReplayBuffer:
                    batch = replay_buffer.sample(batch_size)

                loss = compute_td_loss_qmix(agent_q_networks, target_q_networks,mixing_network, batch,
                                               gamma=gamma, weights=weights)
                
                wandb.log({"loss": loss})
                

                # Zero gradients for all optimizers
                optimizer.zero_grad()
                mixing_optimizer.zero_grad()
                # Backpropagate once for all losses
                loss.backward()

                # Update weights
                optimizer.step()
                mixing_optimizer.step()
                

            steps_counter += 1
        if schedule:
            scheduler.step()
            scheduler_mix.step()
            wandb.log({"lr": optimizer.param_groups[0]['lr']})
        wandb.log({"epsilon": epsilon})
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        wandb.log({"blue_player1_reward": blue_player1_reward})
        wandb.log({"blue_player2_reward": blue_player2_reward})
        wandb.log({"episode": episode})
        wandb.log({'score': score})




        if (episode + 1) % target_update_frequency == 0:
            update_target_network(agent_q_networks, target_q_networks)
        
        if (episode + 1) % target_update_frequency == 0:
            torch.save(agent_q_networks[0].state_dict(),f'{name_experiment}/agent1_{episode+1}.pth')
            torch.save(agent_q_networks[1].state_dict(),f'{name_experiment}/agent3_{episode+1}.pth')
         # Log rewards and win rates
        episode_rewards.append(blue_player1_reward+blue_player2_reward)
        print(blue_player1_reward)
        print(blue_player2_reward)
        print(f"Episode reward: {blue_player1_reward+blue_player2_reward}")

        if np.sum(env.game.state.getBlueFood().data) == 0:
            win_rates.append(1)
        else:
            win_rates.append(0)

        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            avg_win_rate = np.mean(win_rates[-10:])
            print(f"Episode {episode + 1}/{n_episodes}, Average Reward: {avg_reward:.2f}, Win Rate: {avg_win_rate:.2f}")

    # Plot rewards and win rates
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(episode_rewards, label="Average Reward")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title("Average Reward per Episode")
    plt.legend()

    plt.subplot(1, 2, 2)
    smoothed_win_rates = np.convolve(win_rates, np.ones(10)/10, mode='valid') if len(win_rates) > 10 else win_rates
    plt.plot(smoothed_win_rates, label="Win Rate (Smoothed)")
    plt.xlabel("Episode")
    plt.ylabel("Win Rate")
    plt.title("Win Rate Over Episodes")
    plt.legend()

    plt.tight_layout()
    plt.show()

    print("Training completed.")
    return



## QMIX with prioritized buffer

In [271]:
def train_qmix_replay_buffer(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p, learn_p, random_enemy=False, schedule=False, later_exploration = False):
    
    optimizer = optim.Adam([param for net in agent_q_networks for param in net.parameters()], lr=learn_p.lr_agent)
    mixing_optimizer = optim.Adam(mixing_network.parameters(), lr=learn_p.lr_mix)
    if schedule:
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=learn_p.gamma)
        scheduler_mix = torch.optim.lr_scheduler.ExponentialLR(mixing_optimizer, gamma=learn_p.gamma)
   
    n_episodes = train_p.n_episodes
    batch_size = train_p.batch_size
    gamma = train_p.gamma
    epsilon = train_p.epsilon  # Initial exploration probability
    epsilon_min = train_p.epsilon_min
    epsilon_decay = train_p.epsilon_decay
    target_update_frequency = train_p.target_update_frequency
    
    legal_actions = [0, 1, 2, 3, 4]
    agent_indexes = [1, 3]
    episode_rewards = []
    win_rates = []
    enemieName = env.enemieName

    weights = None
    if random_enemy:
        enemylist = ['randomTeam.py', 'baselineTeam.py', 'heuristicTeam.py', 'approxQTeam.py', 'MCTSTeam.py', 'AstarTeam.py']
        weights_enemy = get_dynamic_weights(0, n_episodes)
        enemieName = np.random.choice(enemylist, p=weights_enemy)

    create_folder_if_not_exists(name_experiment)

    for episode in range(n_episodes):
        steps_counter = 0
        print(f"enemy: {enemieName}")

        done = {agent_id: False for agent_id in agent_indexes}
        if random_enemy:
            weights_enemy = get_dynamic_weights(episode, n_episodes)
            enemieName = np.random.choice(enemylist, p=weights_enemy)
        env.reset(enemieName=enemieName)
        blue_player1_reward = 0
        blue_player2_reward = 0
        score = 0
        if later_exploration:
            exploring = False
        else:
            exploring = True
        while not all(done.values()):
            if later_exploration and steps_counter > 10:
                exploring = True
            actions = [-1 for _, _ in enumerate(env.agents)]
            states = []
            for i, agent_index in enumerate(agent_indexes):
                obs_agent = env.get_Observation(agent_index)
                state = torch.tensor(obs_agent, dtype=torch.float32).to(device)
                states.append(state)
                action = epsilon_greedy_action(agent_q_networks[i], state, epsilon, legal_actions, exploring)
                actions[agent_index] = action


            next_states, rewards, terminations, info = env.step(actions)
          
            score -= info["score_change"]
            done = {key: value for key, value in terminations.items() if key in agent_indexes}
            blue_player1_reward += rewards[1]
            blue_player2_reward += rewards[3]

            next_states_converted = []
            rewards_converted = []
            terminations_converted = []
            actions_converted = []

            for index in agent_indexes:
                next_states_converted.append(list(next_states.values())[index])
                rewards_converted.append(rewards[index])
                terminations_converted.append(terminations[index])
                actions_converted.append(actions[index])

            next_states_converted = torch.stack(next_states_converted)
            states_converted = torch.stack(states)
            rewards_converted = [rewards_converted]
            terminations_converted = [terminations_converted]
            replay_buffer.add(
                (states_converted, actions_converted, rewards_converted, next_states_converted, terminations_converted))

            if replay_buffer.size() >= batch_size:
                batch = replay_buffer.sample(batch_size, 0.4, agent_q_networks, target_q_networks)
                weights = batch[-1]
                batch = batch[:-1]
                loss = compute_td_loss_qmix(agent_q_networks, target_q_networks,mixing_network, batch,
                                               gamma=gamma, weights=weights)
                wandb.log({"loss": loss})
                

                # Zero gradients for all optimizers
                optimizer.zero_grad()
                mixing_optimizer.zero_grad()
                # Backpropagate once for all losses
                loss.backward()

                # Update weights
                optimizer.step()
                mixing_optimizer.step()
                if schedule:
                    scheduler.step()
                    scheduler_mix.step()
                    wandb.log({"lr": optimizer.param_groups[0]['lr']})

            steps_counter += 1
        wandb.log({"epsilon": epsilon})
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
        wandb.log({"blue_player1_reward": blue_player1_reward})
        wandb.log({"blue_player2_reward": blue_player2_reward})
        wandb.log({"episode": episode})
        wandb.log({'score': score})




        if (episode + 1) % target_update_frequency == 0:
            update_target_network(agent_q_networks, target_q_networks)
        
        if (episode) % target_update_frequency == 0:
            torch.save(agent_q_networks[0].state_dict(),f'{name_experiment}/agent1_{episode+1}.pth')
            torch.save(agent_q_networks[1].state_dict(),f'{name_experiment}/agent3_{episode+1}.pth')
         # Log rewards and win rates
        episode_rewards.append(blue_player1_reward+blue_player2_reward)
        print(blue_player1_reward)
        print(blue_player2_reward)
        print(f"Episode reward: {blue_player1_reward+blue_player2_reward}")

        if np.sum(env.game.state.getBlueFood().data) == 0:
            win_rates.append(1)
        else:
            win_rates.append(0)

        if (episode + 1) % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            avg_win_rate = np.mean(win_rates[-10:])
            print(f"Episode {episode + 1}/{n_episodes}, Average Reward: {avg_reward:.2f}, Win Rate: {avg_win_rate:.2f}")

    # Plot rewards and win rates
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(episode_rewards, label="Average Reward")
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.title("Average Reward per Episode")
    plt.legend()

    plt.subplot(1, 2, 2)
    smoothed_win_rates = np.convolve(win_rates, np.ones(10)/10, mode='valid') if len(win_rates) > 10 else win_rates
    plt.plot(smoothed_win_rates, label="Win Rate (Smoothed)")
    plt.xlabel("Episode")
    plt.ylabel("Win Rate")
    plt.title("Win Rate Over Episodes")
    plt.legend()

    plt.tight_layout()
    plt.show()

    print("Training completed.")



In [272]:
def train_qmix_LSTM():
    pass

In [273]:
def train_PPO():
    pass

# **Section 3: Testing and conclusions**

## Testing IQL


First 300 steps with random enemy on win.lay with fixed lr


In [144]:
@dataclass
class train_param:
    n_episodes: int = 300
    batch_size: int = 32
    gamma: float = 0.995
    epsilon: float = 0.75
    epsilon_min: float = 0.1
    epsilon_decay: float = 0.995
    target_update_frequency: int = 20

@dataclass
class learn_param:
    lr_mix: float = 0.0001
    lr_agent: float = 0.0001
    gamma: float = 0.99995

#TODO: specific net_param for each network architecture
@dataclass
class net_param_CNN:
    hidden_dim: int = 64
    

train_p_qmix = train_param()
learn_p_qmix = learn_param()
net_p_qmix = net_param_CNN()

In [145]:
env = gymPacMan_parallel_env(display=True, layout_file="layouts/bloxCapture.lay", length=300, enemieName='randomTeam.py', defenceReward=True)

Loaded layout from file
Loading Team: c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX//agents/randomTeam.py
Arguments: {}
Blue team starts


In [146]:
n_agents = 2
action_dim = 5

obs_individual_agent = env.get_Observation(0)
obs_shape = obs_individual_agent.shape
global_state = [n_agents]+list(obs_shape)

agent_q_networks = [AgentQNetwork(obs_shape, action_dim, hidden_dim=net_p_qmix.hidden_dim).to(device) for _ in range(n_agents)]
target_q_networks = [AgentQNetwork(obs_shape, action_dim, hidden_dim=net_p_qmix.hidden_dim).to(device) for _ in range(n_agents)]
mixing_network = SimpleQMixer(n_agents,global_state).to(device)
update_target_network(agent_q_networks, target_q_networks)

replay_buffer = MultiAgentPrioritizedReplayBuffer(buffer_size=1500)

In [147]:
name_experiment ='SimpleQMIX_CNN_random_blox_basic'
team_name = 'RL_PacMan'
wandb.init(project="", id = f"{name_experiment}__{datetime.now().strftime('%Y%m%d_%H%M%S')}")
train_qmix_replay_buffer(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p_qmix, learn_p_qmix)
wandb.finish()


enemy: randomTeam.py
Loading Team: c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX//agents/randomTeam.py
Arguments: {}
Red team starts


  state = torch.tensor(obs_agent, dtype=torch.float32).to(device)


(32,)
(32,)
32
(33,)
(33,)
33
(34,)
(34,)
34
(35,)
(35,)
35
(36,)
(36,)
36
(37,)
(37,)
37
(38,)
(38,)
38
(39,)
(39,)
39
(40,)
(40,)
40
(41,)
(41,)
41
(42,)
(42,)
42
(43,)
(43,)
43
(44,)
(44,)
44
(45,)
(45,)
45
(46,)
(46,)
46
(47,)
(47,)
47
(48,)
(48,)
48
(49,)
(49,)
49
(50,)
(50,)
50
(51,)
(51,)
51
(52,)
(52,)
52
(53,)
(53,)
53
(54,)
(54,)
54
(55,)
(55,)
55
(56,)
(56,)
56
(57,)
(57,)
57
(58,)
(58,)
58
(59,)
(59,)
59
(60,)
(60,)
60
(61,)
(61,)
61
(62,)
(62,)
62
(63,)
(63,)
63
(64,)
(64,)
64
(65,)
(65,)
65
(66,)
(66,)
66
(67,)
(67,)
67
(68,)
(68,)
68
(69,)
(69,)
69
(70,)
(70,)
70
(71,)
(71,)
71
(72,)
(72,)
72
(73,)
(73,)
73
(74,)
(74,)
74
(75,)
(75,)
75
(76,)
(76,)
76
(77,)
(77,)
77
(78,)
(78,)
78
(79,)
(79,)
79
(80,)
(80,)
80


KeyboardInterrupt: 

In [None]:
@dataclass
class train_param:
    n_episodes: int = 100
    batch_size: int = 32
    gamma: float = 0.995
    epsilon: float = 0.3
    epsilon_min: float = 0.1
    epsilon_decay: float = 0.995
    target_update_frequency: int = 20

@dataclass
class learn_param:
    lr_mix: float = 0.0001
    lr_agent: float = 0.0001
    gamma: float = 0.99995

#TODO: specific net_param for each network architecture
@dataclass
class net_param_CNN:
    hidden_dim: int = 64
    

train_p_qmix = train_param()
learn_p_qmix = learn_param()
net_p_qmix = net_param_CNN()

In [None]:
env = gymPacMan_parallel_env(display=True, layout_file="layouts/bloxCapture.lay", length=300, enemieName='baselineTeam.py', defenceReward=True)
name_experiment ='SimpleQMIX_CNN_random_blox_basic'
team_name = 'RL_PacMan'
wandb.init(entity = team_name, id = f"{name_experiment}__{datetime.now().strftime('%Y%m%d_%H%M%S')}")
train_qmix_indv_reward(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p_qmix, learn_p_qmix)
wandb.finish()

IQL on tiny with random enemy with scheduled learning_rate

In [None]:
replay_buffer = ReplayBuffer()
name_experiment ='IQL_CNN_tiny'
team_name = 'RL_PacMan'

env = gymPacMan_parallel_env(display=True, layout_file="layouts/tinyCapture.lay", length=300, enemieName='randomTeam.py', defenceReward=True)
wandb.init(entity = team_name, project="PacManCapture", id = f"{name_experiment}__{datetime.now().strftime('%Y%m%d_%H%M%S')}")

train_iql(env, name_experiment, agent_q_networks, target_q_networks, replay_buffer, train_p_iql, learn_p_iql)
wandb.finish()


Loaded layout from file
Loading Team: /home/seppe/RL/RL_MARL_QMIX//agents/randomTeam.py
Arguments: {}
Red team starts


AttributeError: 'NoneType' object has no attribute 'walls'

In [None]:
wandb.finish()


In [None]:
import pandas as pd

# Define the list of enemies
enemies = ['randomTeam.py', 'baselineTeam.py', 'heuristicTeam.py', 'approxQTeam.py', 'MCTSTeam.py', 'AstarTeam.py']

# Initialize results dictionary
results = {enemy: {'wins': 0, 'losses': 0, 'draws': 0} for enemy in enemies}

# Number of games to play against each enemy
num_games = 10

# Function to evaluate the agent against an enemy
def evaluate_agent(env, enemy_name, num_games):
    wins, losses, draws = 0, 0, 0
    for _ in range(num_games):
        env.reset(enemieName=enemy_name)
        done = False
        while not done:
            actions = [-1 for _, _ in enumerate(env.agents)]
            states = []
            for i, agent_index in enumerate([1,3]):
                obs_agent = env.get_Observation(agent_index)
                state = torch.tensor(obs_agent, dtype=torch.float32).to(device)
                states.append(state)
                state = state.unsqueeze(0).to(device)  # Add batch dimension
                q_values = agent_q_networks[i](state)
                q_values = q_values.cpu().detach().numpy()
                action = np.random.choice(np.flatnonzero(q_values == q_values.max()))
                actions[agent_index] = action
            next_state, reward, done, info = env.step(actions)
            if done:
                if info['result'] == 'win':
                    wins += 1
                elif info['result'] == 'loss':
                    losses += 1
                else:
                    draws += 1
    return wins, losses, draws

# Evaluate the agent against each enemy
for enemy in enemies:
    env = gymPacMan_parallel_env(display=False, layout_file="layouts/smallCapture.lay", length=200, enemieName=enemy, defenceReward=True)
    wins, losses, draws = evaluate_agent(env, enemy, num_games)
    results[enemy]['wins'] = wins
    results[enemy]['losses'] = losses
    results[enemy]['draws'] = draws

# Convert results to a DataFrame for better display
results_df = pd.DataFrame(results).T
print(results_df)

**Testing idea's**

- experience buffer
- only exploration after first 10 steps
- 

# Testing everything together

In [274]:
@dataclass
class train_param:
    n_episodes: int = 300
    batch_size: int = 32
    gamma: float = 0.995
    epsilon: float = 0.75
    epsilon_min: float = 0.2
    epsilon_decay: float = 0.999
    target_update_frequency: int = 20

@dataclass
class learn_param:
    lr_mix: float = 0.0001
    lr_agent: float = 0.0001
    gamma: float = 0.995

#TODO: specific net_param for each network architecture
@dataclass
class net_param_CNN:
    hidden_dim: int = 64
    

train_p_qmix = train_param()
learn_p_qmix = learn_param()
net_p_qmix = net_param_CNN()

In [275]:
env = gymPacMan_parallel_env(display=False, layout_file="layouts/bloxCapture.lay", length=300, enemieName='randomTeam.py', defenceReward=True)

Loaded layout from file
Loading Team: c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX//agents/randomTeam.py
Arguments: {}
Blue team starts


In [276]:
gammas = [0.95, 0.99, 0.995]
lrs = [0.0001, 0.0005, 0.001]
hidden_dims = [64, 128]
target_update_frequencies = [10, 20, 30]

In [277]:
n_agents = 2
action_dim = 5

obs_individual_agent = env.get_Observation(0)
obs_shape = obs_individual_agent.shape
global_state = [n_agents]+list(obs_shape)


In [278]:
wandb.finish()

0,1
loss,█▅▇▇▄▃▂▂▃▃▂▂▁▁▂▁▁▁▁▁▁▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▂▂▁▁

0,1
loss,9e-05


In [279]:
import cProfile
import pstats
from io import StringIO

for gamma in gammas:
    for lr in lrs:
        for hidden_dim in hidden_dims:
            for target_update_frequency in target_update_frequencies:
                train_p_qmix = train_param()
                learn_p_qmix = learn_param()
                net_p_qmix = net_param_CNN()
                train_p_qmix.gamma = gamma
                learn_p_qmix.lr_mix = lr
                learn_p_qmix.lr_agent = lr
                net_p_qmix.hidden_dim = hidden_dim
                train_p_qmix.target_update_frequency = target_update_frequency

                agent_q_networks = [AgentQNetwork(obs_shape, action_dim, hidden_dim=net_p_qmix.hidden_dim).to(device) for _ in range(n_agents)]
                target_q_networks = [AgentQNetwork(obs_shape, action_dim, hidden_dim=net_p_qmix.hidden_dim).to(device) for _ in range(n_agents)]
                mixing_network = SimpleQMixer(n_agents,global_state).to(device)
                update_target_network(agent_q_networks, target_q_networks)

                replay_buffer = MultiAgentPrioritizedReplayBufferTree(buffer_size=1500)

                name_experiment = f'SimpleQMIX_everything_random_blox_gamma{gamma}_lr{lr}_hidden{hidden_dim}_target{target_update_frequency}'
                team_name = 'RL_PacMan'
                wandb.init(entity = team_name, project="PacManCapture", id = f"{name_experiment}__{datetime.now().strftime('%Y%m%d_%H%M%S')}")
                
                #pr = cProfile.Profile()
                #pr.enable()
                train_qmix(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p_qmix, learn_p_qmix, later_exploration=True, individual_reward=True)
                #pr.disable()
                wandb.finish()
                #s = StringIO()
                #sortby = 'tottime'
                #ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                #ps.print_stats()
                #print(s.getvalue())


                name_experiment = f'SimpleQMIX_everything_varying_blox_gamma{gamma}_lr{lr}_hidden{hidden_dim}_target{target_update_frequency}'
                train_p_qmix.epsilon = 0.5
                wandb.init(entity = team_name, project="PacManCapture", id = f"{name_experiment}__{datetime.now().strftime('%Y%m%d_%H%M%S')}")
                train_qmix(env, name_experiment, agent_q_networks, target_q_networks, mixing_network, replay_buffer, train_p_qmix, learn_p_qmix, random_enemy=True, schedule=True, later_exploration=True, individual_reward=True)
                wandb.finish()

TTTTTTTTTTTTTTTTTTTT
TFTTTTTTTTTTTTTTTTFT
TFTFFFFFTFFTFFFFFTFT
TFTFFFFFFFFFFFFFFTFT
TFTFTFFTTTTTTFFTFTFT
TFTFTFFFFFFFFFFTFTFT
TFTFTFFTTTTTTFFTFTFT
TFTFTFFFFFFFFFFTFTFT
TFFFFFTFTTTTFTFFFFFT
TTTTFFTFTTTTFTFFTTTT
TTTTFFTFTTTTFTFFTTTT
TFFFFFTFTTTTFTFFFFFT
TFTFTFFFFFFFFFFTFTFT
TFTFTFTTTTTTTTFTFTFT
TFTFTFTFTFFTFTFTFTFT
TFTFFFFFFFFFFFFFFTFT
TFTFTTFTTFFTTFTTFTFT
TFTFTTFTTFFTTFTTFTFT
TFTFFFFFFFFFFFFFFTFT
TTTTTTTTTTTTTTTTTTTT
hallo: {((1, 1), (1, 1)): 0, ((1, 2), (1, 1)): 1, ((1, 3), (1, 1)): 2, ((1, 4), (1, 1)): 3, ((1, 5), (1, 1)): 4, ((1, 6), (1, 1)): 5, ((1, 7), (1, 1)): 6, ((1, 8), (1, 1)): 7, ((1, 11), (1, 1)): 16, ((1, 12), (1, 1)): 17, ((1, 13), (1, 1)): 18, ((1, 14), (1, 1)): 19, ((1, 15), (1, 1)): 20, ((1, 16), (1, 1)): 21, ((1, 17), (1, 1)): 22, ((1, 18), (1, 1)): 23, ((2, 8), (1, 1)): 8, ((2, 11), (1, 1)): 15, ((3, 1), (1, 1)): 16, ((3, 2), (1, 1)): 15, ((3, 3), (1, 1)): 14, ((3, 4), (1, 1)): 13, ((3, 5), (1, 1)): 12, ((3, 6), (1, 1)): 11, ((3, 7), (1, 1)): 10, ((3, 8), (1, 1)): 9, (

  state = torch.tensor(obs_agent, dtype=torch.float32).to(device)


steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
steps: 12
steps: 13
steps: 14
steps: 15
steps: 16
steps: 17
steps: 18
steps: 19
steps: 20
steps: 21
steps: 22
steps: 23
steps: 24
steps: 25
steps: 26
steps: 27
steps: 28
steps: 29
steps: 30
steps: 31


  states = torch.tensor(states, dtype=torch.float32).to(device)
  actions = torch.tensor(actions, dtype=torch.long).to(device)
  rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
  next_states = torch.tensor(next_states, dtype=torch.float32).to(device)
  dones = torch.tensor(dones, dtype=torch.float32).to(device)
  weights = torch.tensor(weights, dtype=torch.float32).to(device)


weights:   torch.Size([32, 1])
steps: 32
weights:   torch.Size([32, 1])
steps: 33
weights:   torch.Size([32, 1])
steps: 34
weights:   torch.Size([32, 1])
steps: 35
weights:   torch.Size([32, 1])
steps: 36
weights:   torch.Size([32, 1])
steps: 37
weights:   torch.Size([32, 1])
steps: 38
weights:   torch.Size([32, 1])
steps: 39
weights:   torch.Size([32, 1])
steps: 40
weights:   torch.Size([32, 1])
steps: 41
weights:   torch.Size([32, 1])
steps: 42
weights:   torch.Size([32, 1])
steps: 43
weights:   torch.Size([32, 1])
steps: 44
weights:   torch.Size([32, 1])
steps: 45
weights:   torch.Size([32, 1])
steps: 46
weights:   torch.Size([32, 1])
steps: 47
weights:   torch.Size([32, 1])
steps: 48
weights:   torch.Size([32, 1])
steps: 49
weights:   torch.Size([32, 1])
steps: 50
weights:   torch.Size([32, 1])
steps: 51
weights:   torch.Size([32, 1])
steps: 52
weights:   torch.Size([32, 1])
steps: 53
weights:   torch.Size([32, 1])
steps: 54
weights:   torch.Size([32, 1])
steps: 55
weights:   torch

KeyboardInterrupt: 

In [None]:
s = StringIO()
sortby = 'cumtime'
ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
ps.print_stats()
print(s.getvalue())


         19785700 function calls (19699203 primitive calls) in 138.883 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  161/136    0.002    0.000   61.557    0.453 c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX\venv\Lib\site-packages\wandb\sdk\lib\mailbox.py:121(_wait)
  154/146   52.722    0.342   52.838    0.362 {method 'recv' of '_socket.socket' objects}
      146    0.003    0.000   51.953    0.356 c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX\venv\Lib\site-packages\wandb\sdk\lib\sock_client.py:282(read_server_response)
      146    0.003    0.000   51.939    0.356 c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX\venv\Lib\site-packages\wandb\sdk\lib\sock_client.py:239(_read_packet_bytes)
   162/23    0.004    0.000   26.700    1.161 c:\Users\josse\OneDrive\Documenten\creditcontract\RL\RL_MARL_QMIX\venv\Lib\site-packages\wandb\sdk\lib\mailbox.py:124(_get_and_clea