## Assignment 3 - MARL

**Team: Wintergreen Systems**

- Parisa
- Sudha
- Saniya
- Elizabeth

In [1]:
import torch
import copy
import numpy as np
import itertools
import random
from matplotlib import pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

In [2]:
import matplotlib
from tqdm import tqdm
matplotlib.use("TkAgg")
BOLD = "\033[1m"
RESET = "\033[0m"

### Multi-Agent Reinforcement Learning Environment for Package Delivery

This class defines an environment where multiple agents work together to pick up, handover and deliver packages.

**Attributes:**
- reward_pickup (int): Reward for picking up a package.
- reward_deliver (int): Reward for successfully delivering a package.
- reward_handover (int): Reward for a successful handover between agents.
- reward_move (int): Reward for a regular movement action.
- reward_stuck (int): Penalty for an agent getting stuck.
- Initial location coordinated of the agents are defined in: agentsA_coord and agentsB_coord
- package collected or not for botht types of agents is defines in:  agentsA_collect and agentsB_collect.
- agentsA_step (list of int): Number of steps taken by agents in group A.
- agentsB_step (list of int): Number of steps taken by agents in group B.
- test_length (int): Counter to keep track of agent steps which is required for reward calculation later.
- packages_delivered (int): Number of packages successfully delivered.

**Methods:**
- reset_env(): Reset the environment to its initial state.
- get_state(): Get the current state of the environment as a NumPy array.
- move_agent(actionA, actionB): Move agents according to the specified actions and update rewards.
- get_grid(): Get the current grid representation.
- setup_grid(): Set up the initial grid with agents and locations.
- plot_grid(snapshot, ax=None): Visualize the current state of the environment.

**Purpose of this class:**
The purpose of the `Environment` class is to define and manage the simulation environment for a multi-agent reinforcement learning scenario where agents are tasked with picking up, handing over and delivering packages in a grid-based world. This class serves as the foundation for creating and controlling the environment in which agents interact and learn.

1. **Grid-Based Simulation:** The class creates a grid-based environment with specified dimensions where the agents operate. It sets up the grid and initial positions of key elements within the environment.

2. **Agent Interaction:** The class facilitates interactions between multiple agents (groups A and B) by allowing them to take actions and move within the grid. Agents can perform actions like picking up, delivering, or moving.

3. **Reward Management:** The class handles reward calculation for agents based on their actions. Agents receive rewards for successful package pickup, delivery, and handovers, while also incurring penalties for movement and getting stuck.

It is the foundation of creating a controlled environment in which agents can learn and make decisions while engaging in a package delivery task. It provides a structured framework for the multi-agent reinforcement learning algorithm.

In [3]:
class Environment():
    
    #fields:
    #a grid of size defined by parameter
    #b, a tuple repsenting the delivery location
    #a, a tuple representing the package location
    #size, an int representing the horizontal and vertical dimensions of the grid
    def __init__(self, size:int, verbose=0):
        self.size = size
        self.actions = ['north', 'south', 'west', 'east']
        self.reward_pickup = 500
        self.reward_deliver = 500
        self.reward_handover = 1000
        self.reward_move = -5
        self.reward_stuck = -100
        self.reset_env()
        self.verbose = verbose
    
    def reset_env(self):
        self.a=(0,0)
        self.b=(self.size-1,self.size-1)
        
        combinations = list(itertools.product(list(range(self.size)), repeat=2))
        # Remove (0, 0) and (size-1, size-1)
        combinations = [(x, y) for x, y in combinations if (x, y) != (0, 0) and (x, y) != (self.size-1, self.size-1)]
        # rd.shuffle(combinations)
        agentA1, agentA2, agentB1, agentB2 = random.sample(combinations, 4)
        self.agentsA_coord = [agentA1, agentA2]
        self.agentsB_coord = [agentB1, agentB2]

        self.agentsA_collect = [0,0]
        self.agentsB_collect = [0,0]

        self.terminated = False

        self.agentsA_step = [0,0]
        self.agentsB_step = [0,0]
        self.test_length = 0

        self.packages_delivered = 0

        self.setup_grid()

    # def coordinates_to_noisy_ohv(self, coordinate:int):
    #     ohv = np.zeros(4)
    #     noise = np.random.normal(-1e-5, 1e-5, ohv.shape)
    #     ohv[coordinate] = 1
    #     ohv = ohv + noise
    #     return ohv
    #     this is a legacy method from a one-hot representation of it
    
    # def get_optimal_distance(self):
    #     return abs(self.agent_coords[0]-self.a[0]) + abs(self.agent_coords[0]-self.a[1]) + abs(self.a[0]-self.b[0]) + abs(self.a[1]-self.b[1])


    #method to set up the original grid including a location
    # def setup_grid(self):
    #     grid = np.zeros((self.size, self.size))
    #     self.grid = grid
    #     grid[self.a] = 1
    #     grid[self.b] = 2
    #     for agentA in self.agentsA_coord:
    #         grid[agentA] = 3
    #     for agentB in self.agentsB_coord:
    #         grid[agentB] = 4

    def get_state(self):
        #This method previously unpacked the coordinates into a tuple which can be used to index the qmatrix or for easy incorporation to the Deep Q Network
        # coords = [self.agent_coords[0], self.agent_coords[1], self.a[0], self.a[1], self.b[0], self.b[1]]
        # coords = [self.coordinates_to_noisy_ohv(c) for c in coords]
        # coords.append(np.array(self.collected))
        # return np.hstack(coords)
        # return np.ndarray.flatten(np.array(coords))
        return np.array([self.agentsA_coord[0][0],self.agentsA_coord[0][1],self.agentsA_coord[1][0],self.agentsA_coord[1][1],
                         self.agentsB_coord[0][0],self.agentsB_coord[0][1],self.agentsB_coord[1][0],self.agentsB_coord[1][1],
                         self.agentsA_collect[0], self.agentsA_collect[1], self.agentsB_collect[0], self.agentsB_collect[1]])

    
    #Method which updates the location of the agent on the grid. Currently just zeroes whatever it landed on - can include other logic instead
    def move_agent(self, actionA, actionB):
        x, y = self.a
        self.grid[x, y] = 1  # Place "a" on the grid
        x, y = self.b
        self.grid[x, y] = 2  # Place "b" on the grid

        agentsA_reward = [0,0]
        agentsB_reward = [0,0]
        agentsA_deliver = [0,0]
        agentsB_deliver = [0,0]

        for i in range(len(self.agentsA_step)):
            self.agentsA_step[i] += 1
        for i in range(len(self.agentsB_step)):
            self.agentsB_step[i] += 1
        
        self.test_length += 1

        agentsA_coord_old = self.agentsA_coord
        agentsB_coord_old = self.agentsB_coord

        #Assign new coordinates for agent A to exist at
        for i in range(len(self.agentsA_coord)):
            new_x, new_y = self.agentsA_coord[i]
            if actionA[i] == 'north' and self.agentsA_coord[i][1] > 0:
                new_y -= 1
            elif actionA[i] == 'south' and self.agentsA_coord[i][1] < self.size - 1:
                new_y += 1
            elif actionA[i] == 'west' and self.agentsA_coord[i][0] > 0:
                new_x -= 1
            elif actionA[i] == 'east' and self.agentsA_coord[i][0] < self.size - 1:
                new_x += 1

            #Update the grid based on new agent coordinates
            self.grid[self.agentsA_coord[i]] = 0
            self.agentsA_coord[i] = (new_x, new_y)
            self.grid[self.agentsA_coord[i]] = 3

        #Assign new coordinates for agent B to exist at
        for i in range(len(self.agentsB_coord)):
            new_x, new_y = self.agentsB_coord[i]
            if actionB[i] == 'north' and self.agentsB_coord[i][1] > 0:
                new_y -= 1
            elif actionB[i] == 'south' and self.agentsB_coord[i][1] < self.size - 1:
                new_y += 1
            elif actionB[i] == 'west' and self.agentsB_coord[i][0] > 0:
                new_x -= 1
            elif actionB[i] == 'east' and self.agentsB_coord[i][0] < self.size - 1:
                new_x += 1

            #Update the grid based on new agent coordinates
            self.grid[self.agentsB_coord[i]] = 0
            self.agentsB_coord[i] = (new_x, new_y)
            self.grid[self.agentsB_coord[i]] = 4
        
        # Agent A:
        for i in range(len(self.agentsA_coord)):
            if self.agentsA_collect[i] == 0:
                if self.agentsA_coord[i] == self.a:
                    if self.verbose>2:
                        print("A{} pickup".format(i))
                    # self.agentsA_step[i] = 0
                    self.agentsA_collect[i]=1
                    agentsA_reward[i] = self.reward_pickup
                else:
                    agentsA_reward[i] = self.reward_move #-(self.agentsA_step[i]**2)
            else:
                for j in range(len(self.agentsB_coord)):
                    if self.agentsA_coord[i] == agentsB_coord_old[j]:
                        if self.agentsB_coord[j] == agentsA_coord_old[i]:
                            if self.agentsB_collect[j] == 0:
                                if self.agentsA_collect[i] == 1:
                                    self.agentsA_collect[i]=0
                                    agentsA_deliver[i]=1
                                    self.agentsB_collect[j]=1
                                    agentsA_reward[i] = self.reward_handover - ((np.abs(self.agentsA_coord[i][0] - self.a[0]) + np.abs(self.agentsA_coord[i][1] - self.a[1])) ** 2)
                                    agentsB_reward[j] = self.reward_handover - ((np.abs(self.agentsB_coord[j][0] - self.b[0]) + np.abs(self.agentsB_coord[j][1] - self.b[1])) ** 2)
                                    if self.verbose>2:
                                        print("A{} B{} handover".format(i,j))
                                        print("\tagentsA_reward",agentsA_reward[i], self.agentsA_step[i])
                                        print("\tagentsB_reward",agentsB_reward[j], self.agentsB_step[j])
                                    self.agentsA_step[i]=0 ; self.agentsB_step[j]=0

                if agentsA_deliver[i]==0:
                    agentsA_reward[i] = self.reward_move #-(self.agentsA_step[i]**2)

            # if self.agentsA_coord[i] == agentsA_coord_old[i]:
            #     agentsB_reward[i] = self.reward_stuck

        # Agent B:
        for i in range(len(self.agentsB_coord)):
            if self.agentsB_collect[i] == 1:
                if self.agentsB_coord[i] == self.b:
                    if self.verbose>2:
                        print("B{} deliver".format(i))
                    # self.agentsB_step[i]=0
                    agentsB_deliver[i]=1
                    self.agentsB_collect[i]=0
                    agentsB_reward[i] = self.reward_deliver
                    self.packages_delivered += 1
                else:
                    agentsB_reward[i] = self.reward_move #-(self.agentsB_step[i]**2)
            else:
                agentsB_reward[i] = self.reward_move #-(self.agentsB_step[i]**2)

            # if self.agentsB_coord[i] == agentsB_coord_old[i]:
            #     agentsB_reward[i] = self.reward_stuck

        if self.test_length > 200:
            if self.verbose>2:
                print("terminated")
            self.terminated = True

        if self.verbose>3:
            print("agentsA_reward",agentsA_reward)
            print("agentsB_reward",agentsB_reward)

        return agentsA_reward, agentsB_reward, agentsA_deliver, agentsB_deliver, self.packages_delivered, self.terminated

    def get_grid(self):
        return self.grid.tolist()

    #method to set up the original grid including a location
    def setup_grid(self):
        self.grid = np.zeros((self.size, self.size))
            
        x, y = self.a
        self.grid[x, y] = 1  # Place "a" on the grid

        x, y = self.b
        self.grid[x, y] = 2  # Place "b" on the grid

        # Place the "home" location (0) on the grid
        for i in range(len(self.agentsA_coord)):
            x, y = self.agentsA_coord[i]
            self.grid[x, y] = 3

        for i in range(len(self.agentsB_coord)):
            x, y = self.agentsB_coord[i]
            self.grid[x, y] = 4
    
    def plot_grid(self, snapshot, ax=None):
        if ax is None:
            fig, ax = plt.subplots()
            ax.set_facecolor('white')
        else:
            ax.clear()

        # Plot the grid
        ax.imshow(np.array([[0]]), cmap="bone", extent=[0, self.size, 0, self.size])

        for i in range(self.size):
            for j in range(self.size):
                cell_value = snapshot[i][j]
                images_to_display = []

                if cell_value == 3:
                    images_to_display.append(OffsetImage(agent_img, zoom=0.08, alpha=0.5))
                elif cell_value == 4:
                    images_to_display.append(OffsetImage(agent_img2, zoom=0.05, alpha=0.5))
                elif cell_value == 1:
                    images_to_display.append(OffsetImage(package_img, zoom=0.03, alpha=0.75))
                elif cell_value == 2:
                    images_to_display.append(OffsetImage(destinationB_img, zoom=0.05, alpha=0.75))

                for image in images_to_display:
                    ab = AnnotationBbox(image, (j + 0.5, self.size - i - 0.5), frameon=False)
                    ax.add_artist(ab)

        # Set axis properties
        ax.set_xlim(0, self.size)
        ax.set_ylim(0, self.size)
        ax.set_xticks(np.arange(self.size) + 1)
        ax.set_yticks(np.arange(self.size) + 1)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.grid(True, linewidth=2, color='white')

        # Set title
        ax.set_title("Package Delivery Agent")

        return ax
agent_img = plt.imread('agent.jpg')
agent_img2 = plt.imread("agent2.png")
package_img = plt.imread('package.jpg')
destinationB_img = plt.imread('destinationB.jpg')

### Deep Q-Network (DQN) for RL

This class defines a DQN model for reinforcement learning tasks. It consists of a neural network that approximates Q-values for state-action pairs and supports training and target network updates. This the the DQN which will be used to train both type of Agents (A and B).

**Attributes:**
- model (torch.nn.Sequential): The main neural network model for Q-value approximation.
- model2 (torch.nn.Sequential): A target network with the same architecture as the main model.
- loss_fn (torch.nn.MSELoss): Mean Squared Error loss function for training.

**Methods:**
- update_target(): Copy the state of the prediction network to the target network.
- get_qvals(next_state): Get Q-values for the next state based on the prediction network.
- get_maxQ(state): Get the maximum Q-value for a state based on the target network.
- train_one_step(states, actions, targets): Perform a single training step with provided minibatches.

**Purpose of this class:**
The purpose of the `DQN` class is to provide a framework for creating and training Deep Q-Networks (DQNs) for reinforcement learning tasks for both type of Agents.

1. **Neural Network Model:** The class defines a neural network model that approximates Q-values for state-action pairs. This model is the main part of the DQN and is used for making action predictions.

2. **Target Network:** The class includes a target network, which is a copy of the main model. This target network is periodically updated with the state of the prediction network to stabilize training.

3. **Loss Function:** It provides a Mean Squared Error (MSE) loss function, used to calculate the loss during training. The loss quantifies the error between predicted Q-values and target Q-values.

4. **Training:** The class offers a method (`train_one_step`) for performing a single training step. It updates the model's weights based on a minibatch of states, actions, and corresponding TD (Temporal Difference) targets. This training process enables the model to learn optimal Q-values.

5. **Q-Value Computation:** It provides methods for calculating Q-values. The `get_qvals` method computes Q-values for a given state-action pair based on the prediction network, while the `get_maxQ` method finds the maximum Q-value for a state based on the target network.

6. **Target Network Update:** The class includes a method (`update_target`) for synchronizing the target network with the prediction network. This process is crucial for stabilizing training in reinforcement learning.

In [4]:
class DQN:
    def __init__(self, learning_rate, state_size=100, action_size = 4):
        l1 = state_size
        l2 = 24
        l3 = 24
        l4 = action_size
        self.model = torch.nn.Sequential(
        torch.nn.Linear(l1, l2),
        torch.nn.ReLU(),
        torch.nn.Linear(l2, l3),
        torch.nn.ReLU(),
        torch.nn.Linear(l3,l4))

        self.model2 = copy.deepcopy(self.model)
        self.model2.load_state_dict(self.model.state_dict())
        self.loss_fn = torch.nn.MSELoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

# The function "update_target" copies the state of the prediction network to the target network. You need to use this in regular intervals.
    def update_target(self):
        self.model2.load_state_dict(self.model.state_dict())

# The function "get_qvals" returns a numpy list of qvals for the next_state given by the argument based on the prediction network.
    def get_qvals(self, next_state):
        state1 = torch.from_numpy(next_state).float()
        qvals_torch = self.model(state1)
        qvals = qvals_torch.data.numpy()
        return qvals

# The function "get_maxQ" returns the maximum q-value for the state given by the argument based on the target network.
    def get_maxQ(self,state):
        return torch.max(self.model2(torch.from_numpy(state).float())).float()

# The function "train_one_step_new" performs a single training step.
# It returns the current loss (only needed for debugging purposes).
# Its parameters are three parallel lists: a minibatch of states, a minibatch of actions,
# a minibatch of the corresponding TD targets and the discount factor.
    def train_one_step(self, states, actions, targets):
        targets_reply = []
        state1_batch = torch.from_numpy(states).float()
        # state1_batch = torch.cat([torch.from_numpy(s).float() for s in states])
        action_batch = torch.Tensor(actions)
        Q1 = self.model(state1_batch)
        X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
        Y = torch.tensor(targets).float()
        loss = self.loss_fn(X, Y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

#### AGENT A CLASS

The AgentA class is a subclass of the DQN class for type A agents. It represents an agent that can interact with an environment, learn from experiences, and make decisions using Q-learning.

**Attribites**:
- statespace_size (int): The size of the state space.
- gamma (float): The discount factor for future rewards.
- learning_rate (float): The learning rate used for updating the Q-network.
- start_epsilon (float): The initial exploration rate (epsilon-greedy strategy).
- epsilon_decay_factor (float): The factor by which epsilon is decayed over time.
- min_epsilon (float): The minimum value for epsilon.
- replay_buffer_size (int): The size of the replay buffer for experience replay.
- batch_size (int): The size of mini-batches for training the network.
- network_copy_frequency (int): The frequency at which the target network is updated.

**Method**:
- remember(self, state, actionA, agentsA_reward, agentsA_deliver, next_state): Stores a transition (experience) in the agent's memory.
- update_target(self): Updates the target Q-network based on the defined network_copy_frequency. It calls the update_target method from the parent class DQN.
- act(self, state): Decides the action to take in the current state using an epsilon-greedy strategy. It explores with probability epsilon and exploits with probability 1 - epsilon.
- process_minibatch(self, minibatch): Prepares the data for updating the Q-network based on a mini-batch of experiences.
- save_model_parameters(self, destination): Saves the model parameters of the agent's Q-network to a specified file.
- load_model_parameters(self, path): Loads model parameters for the agent's Q-network from a specified file.

**Purpose of this class**:

1. DQN Implementation: It is an implementation of the Deep Q-Network (`DQN`) algorithm, which is a popular method for solving reinforcement learning problems by approximating the optimal action-value function.

2. Experience Storage: The `remember` method is responsible for storing experiences (state, action, reward, delivery flag, next state) in the agent's memory. Experience replay is a key feature of DQN algorithms, which helps the agent learn from past experiences.

3. Epsilon-Greedy Policy: The `act` method implements an epsilon-greedy policy for action selection. It balances exploration (taking random actions) and exploitation (choosing the action with the highest estimated Q-value).

4. Network Update: The `update_target` method is responsible for updating the target Q-network. In DQN, there are typically two Q-networks: one for learning (the primary network) and one for target values. The target network is periodically updated to stabilize training.

5. Mini-Batch Processing: The `process_minibatch` method prepares the data for training the Q-network using mini-batches of experiences. It computes target Q-values for each experience, which are used to update the network.


In [15]:
class AgentA(DQN):
    def __init__(self, statespace_size=12, gamma=0.99, learning_rate=0.997, start_epsilon=1.0,
                 epsilon_decay_factor=0.999, min_epsilon=0.05, replay_buffer_size=10000,
                 batch_size=200, network_copy_frequency=10):
        super().__init__(learning_rate=learning_rate, state_size=statespace_size, action_size=4)  # Call the parent constructor
        self.statespace_size = statespace_size
        self.gamma = gamma
        # self.learning_rate = learning_rate
        self.memory = []  # Using a list instead of a deque
        self.epsilon = start_epsilon
        self.epsilon_min = min_epsilon
        self.epsilon_decay_factor = epsilon_decay_factor
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.network_copy_frequency = network_copy_frequency
        self.steps_since_copy = 0  # Counter for network copy
    
    def remember(self, state, actionA, agentsA_reward, agentsA_deliver, next_state):
        self.memory.append((state, actionA, agentsA_reward, agentsA_deliver, next_state))
        while len(self.memory) > self.replay_buffer_size:
            self.memory.pop(0)  # Remove the oldest experience

    def update_target(self):
        if self.steps_since_copy % self.network_copy_frequency == 0:
            super().update_target()

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(range(4))  # Random action
        q_values = self.get_qvals(state)
        return np.argmax(q_values)  # Greedy action

    def process_minibatch(self, minibatch):
        states = []
        actions = []
        targets = []
        for state, actionA, agentsA_reward, agentsA_deliver, next_state in minibatch:
            q_values = self.get_qvals(state)
                
            if agentsA_deliver==1:
                q_values[actionA] = agentsA_reward
            else:
                q_values[actionA] = agentsA_reward + self.gamma * self.get_maxQ(next_state)
            
            actions.append(actionA)
            targets.append(q_values[actionA])
        
            states.append(state)
            
        return np.array(states), np.array(actions), np.array(targets)

    def save_model_parameters(self, destination):
        torch.save(self.model2.state_dict(), destination)

    def load_model_parameters(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model2.load_state_dict(torch.load(path))

#### AGENT B CLASS

The AgentB class is a subclass of the DQN class for type B agents. It represents an agent that can interact with an environment, learn from experiences, and make decisions using Q-learning.

**Attribites**:
- statespace_size (int): The size of the state space.
- gamma (float): The discount factor for future rewards.
- learning_rate (float): The learning rate used for updating the Q-network.
- start_epsilon (float): The initial exploration rate (epsilon-greedy strategy).
- epsilon_decay_factor (float): The factor by which epsilon is decayed over time.
- min_epsilon (float): The minimum value for epsilon.
- replay_buffer_size (int): The size of the replay buffer for experience replay.
- batch_size (int): The size of mini-batches for training the network.
- network_copy_frequency (int): The frequency at which the target network is updated.

**Method**:
- remember(self, state, actionB, agentsB_reward, agentsB_deliver, next_state): Stores a transition (experience) in the agent's memory.
- update_target(self): Updates the target Q-network based on the defined network_copy_frequency. It calls the update_target method from the parent class DQN.
- act(self, state): Decides the action to take in the current state using an epsilon-greedy strategy. It explores with probability epsilon and exploits with probability 1 - epsilon.
- process_minibatch(self, minibatch): Prepares the data for updating the Q-network based on a mini-batch of experiences.
- save_model_parameters(self, destination): Saves the model parameters of the agent's Q-network to a specified file.
- load_model_parameters(self, path): Loads model parameters for the agent's Q-network from a specified file.

**Purpose of this class**:

1. DQN Implementation: It is an implementation of the Deep Q-Network (`DQN`) algorithm, which is a popular method for solving reinforcement learning problems by approximating the optimal action-value function.

2. Experience Storage: The `remember` method is responsible for storing experiences (state, action, reward, delivery flag, next state) in the agent's memory. Experience replay is a key feature of DQN algorithms, which helps the agent learn from past experiences.

3. Epsilon-Greedy Policy: The `act` method implements an epsilon-greedy policy for action selection. It balances exploration (taking random actions) and exploitation (choosing the action with the highest estimated Q-value).

4. Network Update: The `update_target` method is responsible for updating the target Q-network. In DQN, there are typically two Q-networks: one for learning (the primary network) and one for target values. The target network is periodically updated to stabilize training.

5. Mini-Batch Processing: The `process_minibatch` method prepares the data for training the Q-network using mini-batches of experiences. It computes target Q-values for each experience, which are used to update the network.


In [16]:
class AgentB(DQN):
    def __init__(self, statespace_size=12, gamma=0.99, learning_rate=0.997, start_epsilon=1.0,
                 epsilon_decay_factor=0.999, min_epsilon=0.05, replay_buffer_size=10000,
                 batch_size=200, network_copy_frequency=10):
        super().__init__(learning_rate=learning_rate, state_size=statespace_size, action_size=4)  # Call the parent constructor
        self.statespace_size = statespace_size
        self.gamma = gamma
        self.memory = []  # Using a list instead of a deque
        self.epsilon = start_epsilon
        self.epsilon_min = min_epsilon
        self.epsilon_decay_factor = epsilon_decay_factor
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.network_copy_frequency = network_copy_frequency
        self.steps_since_copy = 0  # Counter for network copy

    def remember(self, state, actionB, agentsB_reward, agentsB_deliver, next_state):
        self.memory.append((state, actionB, agentsB_reward, agentsB_deliver, next_state))
        while len(self.memory) > self.replay_buffer_size:
            self.memory.pop(0)  # Remove the oldest experience

    def update_target(self):
        if self.steps_since_copy % self.network_copy_frequency == 0:
            super().update_target()

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(range(4))  # Random action
        q_values = self.get_qvals(state)
        return np.argmax(q_values)  # Greedy action

    def process_minibatch(self, minibatch):
        states = []
        actions = []
        targets = []
        for state, actionB, agentsB_reward, agentsB_deliver, next_state in minibatch:
            q_values = self.get_qvals(state)

            if agentsB_deliver==1:
                q_values[actionB] = agentsB_reward
            else:
                q_values[actionB] = agentsB_reward + self.gamma * self.get_maxQ(next_state)
            
            actions.append(actionB)
            targets.append(q_values[actionB])
        
            states.append(state)
            
        return np.array(states), np.array(actions), np.array(targets)

    def save_model_parameters(self, destination):
        torch.save(self.model2.state_dict(), destination)

    def load_model_parameters(self, path):
        self.model.load_state_dict(torch.load(path))
        self.model2.load_state_dict(torch.load(path))

In [7]:
def create_one_hot_vector(my_x, my_y, friend_x, friend_y, other1_x, other1_y, other2_x, other2_y, my_flag, friend_flag, other1_flag, other2_flag, grid_size=5):
    
    # Initialize a grid to represent the positions and flags of agents
    grid = np.zeros((grid_size, grid_size, 2, 4))

    # Set the positions of agents from group A
    if my_flag == 0:
        grid[my_x, my_y, 0, 0] = 1
    else:
        grid[my_x, my_y, 1, 0] = 1

    if friend_flag == 0:
        grid[friend_x, friend_y, 0, 1] = 1
    else:
        grid[friend_x, friend_y, 1, 1] = 1

    # Set the positions of agents from group B
    if other1_flag == 0:
        grid[other1_x, other1_y, 0, 2] = 1
    else:
        grid[other1_x, other1_y, 1, 2] = 1

    if other2_flag == 0:
        grid[other2_x, other2_y, 0, 3] = 1
    else:
        grid[other2_x, other2_y, 1, 3] = 1


    # Add a small noise to all the zeros
    noise_level = 1e-6
    grid += np.random.uniform(0, noise_level, size=grid.shape)

    # Flatten the grid into a one-hot vector
    one_hot_vector = grid.reshape(-1)

    return one_hot_vector

### Run visualization:
Runs a visualization of the trained agent's movement in the environment.

The `run_visualisation` function runs a visualization of package delivery episodes using a trained transportation agent. It simulates the agent's actions in the environment and displays the evolving grid at each step of the episode.

In [8]:
def run_visualisation(agentA, agentB, max_steps=50, size=5):
    #Assumes a fully trained transport agent. 
    #Target network should either have been trained in the preceeding step, or loaded from a pickled pytorch weights file
    env = Environment(size=size,verbose=3)
    # Create a figure and axis outside the loop
    fig, ax = plt.subplots()
    ax.set_facecolor('white')

    # Initialize the plot once with the initial grid
    m = ax.imshow(env.grid, cmap="bone", extent=[0, env.size, 0, env.size])
    snapshots = []
    for step in range(max_steps):
        state = env.get_state()
        stateA1 = np.array([state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7], state[8], state[9], state[10], state[11]])
        stateA2 = np.array([state[2], state[3], state[0], state[1], state[4], state[5], state[6], state[7], state[9], state[8], state[10], state[11]])
        stateB1 = np.array([state[4],state[5], state[6],state[7], state[0], state[1],state[2],state[3],state[10], state[11], state[8],state[9]])
        stateB2 = np.array([state[6],state[7], state[4],state[5], state[0], state[1],state[2],state[3], state[11], state[10], state[8],state[9]])

        actionA1= np.argmax(agentA.get_qvals(stateA1))
        actionA2 = np.argmax(agentA.get_qvals(stateA2))
        actionB1= np.argmax(agentB.get_qvals(stateB1))
        actionB2 = np.argmax(agentB.get_qvals(stateB2))

        actionA = [env.actions[actionA1], env.actions[actionA2]]
        actionB = [env.actions[actionB1], env.actions[actionB2]]

        _, _, _, _, packages_delivered, terminated  = env.move_agent(actionA, actionB)
        snapshots.append(env.get_grid())

        if terminated or packages_delivered>5:
            break
    print(BOLD, "total packages_delivered", packages_delivered, RESET)
    for s in snapshots:
        env.plot_grid(s, ax)
        # Redraw the plot
        fig.canvas.draw()
        plt.pause(0.2)  # Add a short pause for visualization
    # plt.pause(1.5)
    plt.close()
    return None

### Train Function

The purpose of the `train_function` is to begin the training process for two types of agents (A and B) in a package delivery task within a multi-agent reinforcement learning environment. The task involves agent A picking up packages and handing them over to agent B, who is responsible for delivering the packages to their destination. Here's how the function serves this purpose:

1. **Training Environment (env):** The function takes an environment (`env`) as one of its inputs. This environment represents the simulated world in which agents A and B operate. The environment defines the states, action, rewards of the package delivery task.

2. **Agent A and Agent B (agentA, agentB):** The function receives two agent types (`agentA` and `agentB`) as inputs. Agent type A handles package pickup, and agent type B manages package delivery.

3. **Number of Training Episodes (episodes):** The function allows specifying the number of training episodes. An episode represents one complete run of the package delivery task, starting from the initial state and ending when specific termination conditions are met.

4. **Training Loop:** The core purpose of the function is to run a training loop for a specified number of episodes. In each episode, the following happens:

   a. **Environment Initialization:** The environment is reset, bringing it back to its initial state, and the agents are placed in the environment.

   b. **Agent Interaction:** All agents (2 of Type A and 2 of Type B) interact with the environment by selecting actions based on their current observations and learned policies.

   c. **Experience Collection:** As the agents interact with the environment, they collect experiences, including the current state, the action taken, the received rewards, and the resulting state.

   d. **Replay Memory:** The experiences are stored in the agents' replay memory, which is used for training the DQN models. This replay memory helps stabilize the learning process by randomly sampling past experiences.

   e. **Training:** Both agents may perform training steps using their collected experiences. The DQN models are updated to improve the agents' policies. The training aims to make the agents better at making decisions during the package delivery task.

   f. **Training Metrics:** The function also tracks training metrics such as loss values and empirical rewards (cumulative rewards obtained in each episode). These metrics can be used to monitor the training progress and evaluate the performance of the agents.

5. **Termination and Exploration:** The function ensures that training episodes are terminated based on certain conditions. It also handles exploration strategy - epsilon-greedy exploration (by adjusting the agents' exploration rate during training using epslion decay).

The `train_function` is responsible for coordinating the training process of two types of agents (A and B) in a multi-agent reinforcement learning environment. It allows the agents to learn optimal policies for the package delivery task, and it tracks training metrics to evaluate their performance and progress.

In [9]:
def train_function(env, agentA, agentB, episodes=100):
    loss_history = []
    empirical_reward_history = []
    # distance_history =[]

    for episode in (range(episodes)): #tqdm
        print(BOLD, "episode", episode, RESET)
        env.reset_env()
        state = env.get_state()  # Reset the environment and get the initial state
        
        # steps_to_opt = env.get_optimal_distance()
        steps_in_episode = 0
        # done = False
        terminated = False
        total_reward = 0
        packages_delivered = 0
        episode_losses = []

        if (episode>0 and episode%25 == 0) or (episode == episodes-1):
            agentA.save_model_parameters("logs/target_params_A_{}_onehot.pt".format(episode))
            agentB.save_model_parameters("logs/target_params_B_{}_onehot.pt".format(episode))

        # print("epsilon", agentA.epsilon)
        
        while (packages_delivered<5 and not terminated):

            stateA1 = np.array([state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7], state[8], state[9], state[10], state[11]])
            stateA2 = np.array([state[2], state[3], state[0], state[1], state[4], state[5], state[6], state[7], state[9], state[8], state[10], state[11]])
            stateB1 = np.array([state[4],state[5], state[6],state[7], state[0], state[1],state[2],state[3],state[10], state[11], state[8],state[9]])
            stateB2 = np.array([state[6],state[7], state[4],state[5], state[0], state[1],state[2],state[3], state[11], state[10], state[8],state[9]])

            # print("state", state)
            actionA1 = agentA.act(stateA1)
            actionA2 = agentA.act(stateA2)

            actionB1 = agentB.act(stateB1)
            actionB2 = agentB.act(stateB2)

            actionA = [env.actions[actionA1], env.actions[actionA2]]
            actionB = [env.actions[actionB1], env.actions[actionB2]]

            agentsA_reward, agentsB_reward, agentsA_deliver, agentsB_deliver, packages_delivered, terminated = env.move_agent(actionA, actionB)
            
            next_state = env.get_state()
            next_stateA1 = np.array([next_state[0], next_state[1], next_state[2], next_state[3], next_state[4], next_state[5], next_state[6], next_state[7], next_state[8], next_state[9], next_state[10], next_state[11]])
            next_stateA2 = np.array([next_state[2], next_state[3], next_state[0], next_state[1], next_state[4], next_state[5], next_state[6], next_state[7], next_state[9], next_state[8], next_state[10], next_state[11]])
            next_stateB1 = np.array([next_state[4],next_state[5], next_state[6],next_state[7], next_state[0], next_state[1],next_state[2],next_state[3],next_state[10], next_state[11], next_state[8],next_state[9]])
            next_stateB2 = np.array([next_state[6],next_state[7], next_state[4],next_state[5], next_state[0], next_state[1],next_state[2],next_state[3], next_state[11], next_state[10], next_state[8],next_state[9]])

            # next_state, reward, done = env.step(env.actions[action])
            # total_reward += reward
            agentA.remember(stateA1, actionA1, agentsA_reward[0], agentsA_deliver[0], next_stateA1)
            agentA.remember(stateA2, actionA2, agentsA_reward[1], agentsA_deliver[1], next_stateA2)
            agentB.remember(stateB1, actionB1, agentsB_reward[0], agentsB_deliver[0], next_stateB1)
            agentB.remember(stateB2, actionB2, agentsB_reward[1], agentsB_deliver[1], next_stateB2)

            state = next_state
            steps_in_episode += 1 

            # Training A
            if len(agentA.memory) > agentA.batch_size:
                minibatch_indices = np.random.choice(len(agentA.memory), agentA.batch_size, replace=False)
                minibatch = [agentA.memory[i] for i in minibatch_indices]
                states_batch, actions_batch, targets_batch = agentA.process_minibatch(minibatch)
                loss = agentA.train_one_step(states_batch, actions_batch, targets_batch)
                episode_losses.append(loss)


            # Training B
            if len(agentB.memory) > agentB.batch_size:
                minibatch_indices = np.random.choice(len(agentB.memory), agentB.batch_size, replace=False)
                minibatch = [agentB.memory[i] for i in minibatch_indices]
                states_batch, actions_batch, targets_batch = agentB.process_minibatch(minibatch)
                loss = agentB.train_one_step(states_batch, actions_batch, targets_batch)
                episode_losses.append(loss)
        
        #Cleanup step

        agentA.epsilon = max(agentA.epsilon * agentA.epsilon_decay_factor, agentA.epsilon_min)
        agentB.epsilon = max(agentB.epsilon * agentB.epsilon_decay_factor, agentB.epsilon_min)

        if len(episode_losses) > 0:
            mean_loss = sum(episode_losses)/len(episode_losses)
        else:
            mean_loss = "n/a"

        agentA.steps_since_copy += 1
        agentA.update_target()
        agentB.steps_since_copy += 1
        agentB.update_target()

        # print("state", state)
        # print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon} | Loss: {mean_loss}")
        # print(f"Collected: {env.collected} | Terminated: {terminated} | Done: {done}")
        
        # distance_history.append(steps_in_episode - steps_to_opt + 1)
        loss_history.append(mean_loss)
        empirical_reward_history.append(total_reward)
    return loss_history, empirical_reward_history #, distance_history

In [17]:
# if __name__ == "__main__":
env = Environment(5, verbose=3) # Create the environment

gamma = 0.8; learning_rate = 0.01; epsilon_decay_factor = 0.97; min_epsilon=0.05; replay_buffer_size=10000; batch_size=300; network_copy_frequency=15
episodes = 400

agentA = AgentA(gamma=gamma, learning_rate=learning_rate, epsilon_decay_factor=epsilon_decay_factor,
                 min_epsilon=min_epsilon, replay_buffer_size=replay_buffer_size,
                 batch_size=batch_size, network_copy_frequency=network_copy_frequency)
agentB = AgentB(gamma=gamma, learning_rate=learning_rate, epsilon_decay_factor=epsilon_decay_factor,
                 min_epsilon=min_epsilon, replay_buffer_size=replay_buffer_size,
                 batch_size=batch_size, network_copy_frequency=network_copy_frequency)


#### Run the below cell for training

In [None]:
losses, rewards = train_function(env, agentA, agentB, episodes=episodes) #, distance_history
np.save("losses.npy", losses, allow_pickle=False)
np.save("rewards.npy", rewards, allow_pickle=False)

#### Run the below cell for visualisation

In [18]:
# check visualization
agentA.load_model_parameters(path = 'logs/target_params_A_375_onehot.pt')
agentB.load_model_parameters(path = 'logs/target_params_B_375_onehot.pt')
run_visualisation(agentA, agentB)

A1 pickup
A0 pickup
A1 B0 handover
	agentsA_reward 991 7
	agentsB_reward 975 7
A1 pickup
A0 B1 handover
	agentsA_reward 984 11
	agentsB_reward 984 11
B0 deliver
A0 pickup
A1 B0 handover
	agentsA_reward 975 8
	agentsB_reward 991 8
B1 deliver
B0 deliver
A1 pickup
A0 B0 handover
	agentsA_reward 984 13
	agentsB_reward 984 9
A1 B1 handover
	agentsA_reward 984 9
	agentsB_reward 984 13
A0 pickup
A1 pickup
B0 deliver
B1 deliver
A0 B0 handover
	agentsA_reward 975 9
	agentsB_reward 991 9
A1 B1 handover
	agentsA_reward 975 9
	agentsB_reward 991 9
B0 deliver
B1 deliver
[1m total packages_delivered 7 [0m


In [None]:
round_losses = [round(loss, 2) for loss in losses if loss != "n/a"]
plt.plot(round_losses, label='Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.show()

In [None]:
round_rewards = [round(r, 2) for r in rewards if r != "n/a"]
plt.plot(round_rewards, label='Rewards')
plt.xlabel('Epochs')
plt.ylabel('Reward')
plt.title('Empirical Reward')
plt.legend()
plt.show()