## Assignment 2 - Deep Q

**Team: Wintergreen Systems**

- Parisa
- Sudha
- Saniya
- Elizabeth

In [11]:
import torch
import copy
import numpy as np
import itertools
import random as rd
from matplotlib import pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

In [12]:
import matplotlib

# Select the "TkAgg" backend for all figures created in this notebook.
# NOTE(review): presumably chosen so the animation opens in its own window;
# confirm that a Tk-capable display is available where this runs.
matplotlib.use("TkAgg")
# ANSI escape sequences: switch the console to bold text / reset formatting.
BOLD = "\033[1m"
RESET = "\033[0m"

In [13]:
# !pip install torch

# Purpose

The purpose of this branch is to test the convergence properties of the network when one-hot vectors (OHVs) are used for the state space instead of the raw coordinates 0-3, as the OHV format may be easier for the network to operate on. Because the network would then have to operate on vectors that are mostly zeros, we will:

- add a small noise function gamma to each OHV
- use He initialisation for weight initialisation

 ### Environment Class

The `Environment` class has been used to initialize the grid-based environment in which the package delivery agent operates.

**Methods:**

 - `reset_env`: Resets the environment to an initial state and generates random locations for the package, destination and agent.
 - `coordinates_to_noisy_ohv(coordinate:int)`: Converts a coordinate to a noisy one-hot vector representation.
 - `setup_grid()`: Initializes the grid with package, destination and agent locations.
 - `get_state()`: Retrieves the current state of the environment.
 - `move_agent(action)`: Moves the agent in the specified direction and updates the environment.
 - `get_grid()`: Retrieves the current grid representation of the environment. The function returns a list of lists representing the grid.
 - `plot_grid(snapshot, ax=None)`: This is the implementation of the visualization for the grid state throughout the journey of the agent from its start location to end destination. Visualization is implemented using Matplotlib. It displays images representing the agent, package, and destination locations.


In [14]:
class Environment():
    """Square grid world for the package-delivery task.

    Fields:
        size          horizontal and vertical dimension of the grid
        a             tuple (x, y), the package location
        b             tuple (x, y), the delivery location
        agent_coords  tuple (x, y), the agent's current location
        collected     1 once the package has been picked up, else 0
        done          True once the package has been delivered
        terminated    True once the episode has exceeded ``max_steps`` moves
        grid          numpy array; 1 = package, 2 = destination, -1 = agent

    Coordinates index the grid as ``grid[x, y]``.
    NOTE: ``plot_grid`` draws the first index as the row and the second as the
    column, so the actions named 'north'/'south' (which change the second
    index) appear as horizontal movement in the plot.
    """

    def __init__(self, size:int, max_steps:int=70):
        """Create the grid and place package, destination and agent at random.

        Args:
            size: side length of the square grid.
            max_steps: an episode is flagged as terminated once more than
                this many moves have been made.

        Raises:
            ValueError: if the grid has fewer than three cells, because the
                package, destination and agent need distinct cells.
        """
        if size * size < 3:
            raise ValueError("grid must contain at least 3 cells for the package, destination and agent")
        self.size = size
        self.max_steps = max_steps
        self.actions = ['north', 'south', 'west', 'east']
        self.reward_pickup = 50
        self.reward_deliver = 200
        self.reward_move = -1.5
        self.reset_env()

    def reset_env(self):
        """Start a new episode with three distinct random cells for a, b and the agent."""
        cells = list(itertools.product(range(self.size), repeat=2))
        rd.shuffle(cells)
        self.a, self.b, self.agent_coords = cells[:3]
        self.collected = 0
        self.done = False
        self.terminated = False
        self.test_length = 0  # number of moves made in the current episode
        self.setup_grid()

    # Experimental one-hot encoding (currently disabled):
    # def coordinates_to_noisy_ohv(self, coordinate:int):
    #     ohv = np.zeros(4)
    #     noise = np.random.normal(-1e-5, 1e-5, ohv.shape)
    #     ohv[coordinate] = 1
    #     ohv = ohv + noise
    #     return ohv

    def get_optimal_distance(self):
        """Return the Manhattan distance agent -> package -> destination."""
        to_package = abs(self.agent_coords[0] - self.a[0]) + abs(self.agent_coords[1] - self.a[1])
        to_destination = abs(self.a[0] - self.b[0]) + abs(self.a[1] - self.b[1])
        return to_package + to_destination

    def _random_cell(self, exclude=()):
        """Draw a uniformly random cell that is not listed in ``exclude``."""
        while True:
            cell = (rd.randint(0, self.size - 1), rd.randint(0, self.size - 1))
            if cell not in exclude:
                return cell

    def setup_grid(self):
        """Build a fresh grid holding the package (1), destination (2) and agent (-1).

        If ``a`` or ``b`` is None a random location is generated for it first.
        """
        self.grid = np.zeros((self.size, self.size))
        if self.a is None:
            self.a = self._random_cell(exclude=(self.b,))
        if self.b is None:
            self.b = self._random_cell(exclude=(self.a,))
        self.grid[self.a] = 1
        self.grid[self.b] = 2
        self.grid[self.agent_coords] = -1

    def get_state(self):
        """Return the state as a flat array usable as network input.

        Layout: [agent_x, agent_y, collected, a_x, a_y, b_x, b_y].
        """
        return np.array([self.agent_coords[0], self.agent_coords[1], self.collected,
                         self.a[0], self.a[1], self.b[0], self.b[1]])

    def move_agent(self, action):
        """Move the agent one cell and return ``(reward, done, terminated)``.

        Args:
            action: one of the strings in ``self.actions``. A move that would
                leave the grid (or an unrecognised action) leaves the agent in
                place but still counts as a step.

        Returns:
            reward: ``reward_pickup`` when the package is picked up,
                ``reward_deliver`` when it is delivered, else ``reward_move``.
            done: True once the package has been delivered.
            terminated: True once more than ``max_steps`` moves were made.
        """
        self.test_length += 1

        # Work out the cell the agent ends up in; walls block the move.
        new_x, new_y = self.agent_coords
        if action == 'north' and new_y > 0:
            new_y -= 1
        elif action == 'south' and new_y < self.size - 1:
            new_y += 1
        elif action == 'west' and new_x > 0:
            new_x -= 1
        elif action == 'east' and new_x < self.size - 1:
            new_x += 1

        # Update the grid. The destination marker is restored when the agent
        # walks off it so it stays visible if crossed before delivery; a
        # package cell is always picked up on entry, so it correctly clears.
        self.grid[self.agent_coords] = 2 if self.agent_coords == self.b else 0
        self.agent_coords = (new_x, new_y)
        self.grid[self.agent_coords] = -1

        # Reward logic based on the new agent location.
        reward = self.reward_move
        if self.collected == 0:
            if self.agent_coords == self.a:
                self.collected = 1
                reward = self.reward_pickup
        elif self.agent_coords == self.b:
            self.done = True
            reward = self.reward_deliver

        # Step cap so that a wandering agent cannot run forever.
        if self.test_length > self.max_steps:
            self.terminated = True
        return reward, self.done, self.terminated

    def get_grid(self):
        """Return a copy of the current grid as a list of lists."""
        return self.grid.tolist()

    def plot_grid(self, snapshot, ax=None):
        """Draw ``snapshot`` (a grid as returned by ``get_grid``) and return the axes.

        Args:
            snapshot: indexable grid of cell values (-1 agent, 1 package,
                2 destination, anything else is printed as text).
            ax: axes to draw on; it is cleared first. A new figure is created
                when omitted.
        """
        if ax is None:
            fig, ax = plt.subplots()
            ax.set_facecolor('white')
        else:
            ax.clear()

        # Plot the background
        ax.imshow(np.array([[0]]), cmap="bone", extent=[0, self.size, 0, self.size])

        # Cell value -> (image, zoom). The images are module-level globals,
        # looked up here at call time.
        sprites = {
            -1: (agent_img, 0.08),
            1: (package_img, 0.03),
            2: (destinationB_img, 0.05),
        }

        for i in range(self.size):
            for j in range(self.size):
                cell_value = snapshot[i][j]
                centre = (j + 0.5, self.size - i - 0.5)
                sprite = sprites.get(cell_value)
                if sprite is not None:
                    image, zoom = sprite
                    ax.add_artist(AnnotationBbox(OffsetImage(image, zoom=zoom), centre, frameon=False))
                else:
                    # Label with the snapshot's own value, not the live grid,
                    # so replayed snapshots are rendered faithfully.
                    ax.text(centre[0], centre[1], cell_value, ha='center', va='center', fontsize=20, color='black')

        # Set axis properties
        ax.set_xlim(0, self.size)
        ax.set_ylim(0, self.size)
        ax.set_xticks(np.arange(self.size) + 1)
        ax.set_yticks(np.arange(self.size) + 1)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.grid(True, linewidth=2, color='white')

        # Set title
        ax.set_title("Package Delivery Agent")

        # The caller decides when to show/redraw the figure.
        return ax
# Sprites used by Environment.plot_grid for the agent, package and destination
# cells. The files are read from the current working directory.
agent_img, package_img, destinationB_img = (
    plt.imread(image_path)
    for image_path in ('agent.jpg', 'package.jpg', 'destinationB.jpg')
)

 ### TransportAgent Class

 The `TransportAgent` class represents an agent for a transportation task using reinforcement learning with neural networks.

 **Methods**
- `prepare_torch(self)`: This method initializes the agent's neural network architecture. It defines a multi-layered neural network with ReLU activation functions and prepares a target neural network for stability in training. The network architecture consists of an input layer with `statespace_size` neurons, followed by two hidden layers with 150 and 100 neurons, respectively, and an output layer with 4 neurons representing Q-values for each action.

- `update_target(self)`: This method updates the target neural network with the main neural network's weights. It helps stabilize the training process by reducing the target network's "chasing" of the main network.

- `remember(self, state, action, reward, next_state, done)`: Stores the agent's experiences in the memory buffer. Experiences are represented as tuples of `(state, action, reward, next_state, done)` and are added to the memory. If the memory buffer exceeds the specified `replay_buffer_size`, the oldest experience is removed.

- `get_qvals(self, state)`: Computes Q-values for a given state. It takes a state as input, converts it to a PyTorch tensor, and computes the Q-values using the agent's main neural network.

- `get_maxQ(self, s)`: Returns the maximum Q-value for a given state. It calculates the maximum Q-value using the target neural network.

- `act(self, state)`: Selects an action based on epsilon-greedy policy. If a random number is less than the current epsilon value, a random action is chosen; otherwise, the action with the highest Q-value is selected.
    
- `process_minibatch(self, minibatch)`: Prepares the minibatch of experiences for training. It extracts states, actions, and target Q-values from the provided minibatch of experiences.

- `train_one_step(self, states, actions, targets)`: Performs one step of training using a minibatch of experiences. It computes the loss between predicted and target Q-values and updates the agent's neural network's weights using backpropagation. Gradient clipping is applied to prevent exploding gradients.


In [20]:
# Define the TransportAgent agent
# The default state vector has 7 entries (statespace_size=7). A one-hot
# encoding of the state would need a larger statespace_size.
class TransportAgent:
    """Deep Q-learning agent with an online network and a target network.

    ``self.model`` is the online network that is trained and used to choose
    actions; ``self.model2`` is the target network used to compute bootstrap
    targets and is refreshed from the online network by ``update_target``.
    """

    # Size of the network's output layer / number of discrete actions.
    n_actions = 4

    def __init__(self, statespace_size=7, gamma=0.99, learning_rate=0.01, start_epsilon=1.0,
                 epsilon_decay_factor=0.997, min_epsilon=0.1, replay_buffer_size=1000,
                 batch_size=200, network_copy_frequency=500):
        """Store the hyperparameters and build both networks.

        Args:
            statespace_size: length of the state vector fed to the network.
            gamma: discount factor applied to the bootstrapped next-state value.
            learning_rate: learning rate of the Adam optimizer.
            start_epsilon: initial exploration probability.
            epsilon_decay_factor: stored for the training loop, which
                multiplies epsilon by it.
            min_epsilon: stored as ``epsilon_min``, the floor for epsilon.
            replay_buffer_size: maximum number of stored transitions.
            batch_size: minibatch size read by the training loop.
            network_copy_frequency: value ``steps_since_copy`` must reach
                before ``update_target`` copies the weights.
        """
        self.statespace_size = statespace_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.model2 = self.prepare_torch()
        self.memory = []  # Using a list instead of a deque
        self.epsilon = start_epsilon
        self.epsilon_min = min_epsilon
        self.epsilon_decay_factor = epsilon_decay_factor
        self.replay_buffer_size = replay_buffer_size
        self.batch_size = batch_size
        self.network_copy_frequency = network_copy_frequency
        self.steps_since_copy = 0  # Counter for network copy

    def prepare_torch(self):
        """Build the online network, loss and optimizer; return the target network.

        Side effects: sets ``self.model``, ``self.loss_fn`` and
        ``self.optimizer``. The returned network is a deep copy of the online
        network, so it starts with identical weights.
        """
        l1 = self.statespace_size
        l2 = 150
        l3 = 100
        l4 = self.n_actions
        self.model = torch.nn.Sequential(
            torch.nn.Linear(l1, l2),
            torch.nn.ReLU(),
            torch.nn.Linear(l2, l3),
            torch.nn.ReLU(),
            torch.nn.Linear(l3, l4)
        )

        model2 = copy.deepcopy(self.model)
        # self.loss_fn = torch.nn.HuberLoss()
        self.loss_fn = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        return model2

    def update_target(self):
        """Copy the online weights into the target network once the counter is due.

        The caller is responsible for incrementing ``steps_since_copy``.
        """
        if self.steps_since_copy >= self.network_copy_frequency:
            self.model2.load_state_dict(self.model.state_dict())
            self.steps_since_copy = 0
            # print(BOLD + "Target model updated" + RESET)

    def remember(self, state, action, reward, next_state, done):
        """Append a transition and drop the oldest ones beyond ``replay_buffer_size``."""
        self.memory.append((state, action, reward, next_state, done))
        overflow = len(self.memory) - self.replay_buffer_size
        if overflow > 0:
            del self.memory[:overflow]  # Remove the oldest experiences

    def get_qvals(self, state):
        """Return the online network's Q-values for ``state`` as a numpy array."""
        with torch.no_grad():  # inference only, no autograd graph needed
            return self.model(torch.from_numpy(state).float()).numpy()

    def get_maxQ(self, s):
        """Return the target network's maximum Q-value for state ``s`` (a tensor)."""
        with torch.no_grad():  # targets must not carry gradients
            return torch.max(self.model2(torch.from_numpy(s).float())).float()

    def act(self, state):
        """Pick an action index with an epsilon-greedy policy."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(range(self.n_actions))  # Random action
        q_values = self.get_qvals(state)
        return np.argmax(q_values)  # Greedy action

    def process_minibatch(self, minibatch):
        """Turn transitions into ``(states, actions, targets)`` numpy arrays.

        Args:
            minibatch: sequence of ``(state, action, reward, next_state, done)``.

        Returns:
            states, actions: the stacked states and action indices.
            targets: float32 array with ``reward`` for finished transitions and
                ``reward + gamma * max_a Q_target(next_state, a)`` otherwise.
        """
        if len(minibatch) == 0:
            return np.array([]), np.array([]), np.array([])

        states, actions, rewards, next_states, dones = zip(*minibatch)
        reward_batch = torch.as_tensor(np.array(rewards), dtype=torch.float32)
        done_batch = torch.as_tensor(np.array(dones), dtype=torch.bool)
        next_state_batch = torch.as_tensor(np.array(next_states), dtype=torch.float32)

        # One batched forward pass through the target network instead of one
        # pass per transition.
        with torch.no_grad():
            next_max_q = self.model2(next_state_batch).max(dim=1).values

        # Finished transitions do not bootstrap from the next state.
        bootstrap = torch.where(done_batch, torch.zeros_like(next_max_q), next_max_q)
        targets = reward_batch + self.gamma * bootstrap
        return np.array(states), np.array(actions), targets.numpy()

    def train_one_step(self, states, actions, targets):
        """Run one optimizer step on a minibatch and return the loss as a float.

        Args:
            states: array of shape (batch, statespace_size).
            actions: array of the action index taken in each state.
            targets: array of target Q-values for those actions.
        """
        state_batch = torch.as_tensor(states, dtype=torch.float32)
        action_index = torch.as_tensor(actions, dtype=torch.long).unsqueeze(dim=1)
        # Squeeze only the gathered column so a batch of one keeps shape (1,).
        predicted = self.model(state_batch).gather(dim=1, index=action_index).squeeze(dim=1)
        # Force float32 so float64 targets do not clash with the network dtype.
        expected = torch.as_tensor(targets, dtype=torch.float32)
        # loss = torch.nn.HuberLoss()(predicted, expected)
        loss = self.loss_fn(predicted, expected)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def save_model_parameters(self, destination):
        """Save the target network's weights to ``destination``.

        When ``destination`` is a path, its parent directory is created if
        it does not exist yet.
        """
        import os

        if isinstance(destination, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(destination))
            if parent:
                os.makedirs(parent, exist_ok=True)
        torch.save(self.model2.state_dict(), destination)

    def load_model_parameters(self, path):
        """Load saved weights into both the online and the target network.

        Loading into both keeps the target network in sync with the restored
        online network instead of leaving it at its random initial weights.
        """
        state_dict = torch.load(path)
        self.model.load_state_dict(state_dict)
        self.model2.load_state_dict(state_dict)


### Run visualization:
Runs a visualization of the trained agent's movement in the environment.

The `run_visualisation` function runs a visualization of package delivery episodes using a trained transportation agent. It simulates the agent's actions in the environment and displays the evolving grid at each step of the episode.

In [16]:
def run_visualisation(agent, max_steps=20, size=4):
    """Animate one greedy episode of ``agent`` in a fresh environment.

    Assumes a fully trained transport agent: its network should either have
    been trained in the preceding step or loaded from a saved PyTorch weights
    file. Actions are chosen greedily from ``agent.get_qvals``.

    Args:
        agent: object exposing ``get_qvals(state)``.
        max_steps: maximum number of moves to simulate.
        size: side length of the grid.

    Returns:
        None. The figure is closed once the animation has been shown.
    """
    env = Environment(size)
    # Create a single figure/axes that every frame is drawn onto.
    fig, ax = plt.subplots()
    ax.set_facecolor('white')

    # Record the starting grid too, so the animation shows where the episode began.
    snapshots = [env.get_grid()]
    for _ in range(max_steps):
        q_values = agent.get_qvals(env.get_state())
        action = np.argmax(q_values)
        _, done, _ = env.move_agent(env.actions[action])
        snapshots.append(env.get_grid())
        if done:
            break

    for snapshot in snapshots:
        env.plot_grid(snapshot, ax)
        # Redraw the plot
        fig.canvas.draw()
        plt.pause(0.2)  # Add a short pause for visualization
    # Close the figure created above rather than whichever one is current.
    plt.close(fig)
    return None

### Train function

The `train_function` function trains the agent in the environment, using the neural network that we have defined, for a set number of episodes. During this process the target network is updated. This function encapsulates the core training logic for our agent.

**Parameters:**

- `env` (Environment): The environment in which the agent operates in.
- `agent` (Agent): The learning agent that is being trained.
- `episodes` (int, optional): The number of episodes to train the agent (default is 100).

**Returns:**

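A tuple containing three lists:
- `loss_history` (list): A list of mean loss values for each episode during training.
- `empirical_reward_history` (list): A list of total rewards obtained for each episode during training.
- `distance_history` (list): For each episode, the number of steps taken minus the value of `env.get_optimal_distance()` at the start of the episode, plus one.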

**Function Explanation:**

- The training loop to iterate over a fixed number of episodes. For each episode:
    - The environment is reset to the initial state.
    - the agent takes actions within the environment until one of the termination conditions (`done` or `terminated`) is met. The key steps include:
      - Selecting an action using the agent's policy (`agent.act(state)`).
      - Executing the selected action in the environment and observing the resulting reward, done/termination status.
      - Storing the transition (state, action, reward, next_state, done) in the agent's memory (`agent.remember()`).
      - Accumulating the reward (`total_reward`) for the episode.

- After each action in the episode, the agent performs training using experience stored in its memory:
    - A check is made to ensure that there are enough experiences in the memory for training (`len(agent.memory) > agent.batch_size`).
    - If sufficient experiences are available, a minibatch is randomly sampled from the memory.
    - The minibatch is used to compute the loss and update the agent's neural network using the `agent.train_one_step()` method.

- After the episode completes, a cleanup step decays the agent's exploration rate (`epsilon`), calculates the mean loss for that episode, and updates the target neural network with the main neural network's weights once enough episodes have passed since the last copy.

Finally, the function returns `loss_history`, `empirical_reward_history` and `distance_history` (the mean loss, the total reward and the excess step count for each episode), providing insights into the training progress and performance of the agent.

In [21]:
def train_function(env, agent, episodes=100):
    """Train ``agent`` in ``env`` for a number of episodes.

    Each episode resets the environment and runs until it reports ``done`` or
    ``terminated``. After every move the transition is stored and, once the
    replay memory holds more than ``agent.batch_size`` transitions, one
    training step is run on a random minibatch. After every episode epsilon is
    decayed, ``agent.steps_since_copy`` is incremented and
    ``agent.update_target()`` is called.

    Args:
        env: environment exposing ``reset_env``, ``get_state``,
            ``get_optimal_distance``, ``move_agent`` and ``actions``.
        agent: learning agent exposing ``act``, ``remember``,
            ``process_minibatch``, ``train_one_step`` and ``update_target``.
        episodes: number of episodes to run.

    Returns:
        A tuple of three lists with one entry per episode:
        ``loss_history`` (mean training loss, ``float('nan')`` when no
        training step ran in that episode), ``empirical_reward_history``
        (total reward) and ``distance_history`` (steps taken minus the
        optimal distance at the start of the episode, plus one).
    """
    loss_history = []
    empirical_reward_history = []
    distance_history = []
    for episode in range(episodes):
        env.reset_env()
        state = env.get_state()  # Reset the environment and get the initial state
        steps_to_opt = env.get_optimal_distance()
        steps_in_episode = 0
        done = False
        terminated = False
        total_reward = 0
        episode_losses = []
        while (not done and not terminated):
            action = agent.act(state)
            reward, done, terminated = env.move_agent(env.actions[action])
            next_state = env.get_state()
            total_reward += reward
            # Only `done` is stored: an episode cut off by the step limit is
            # not recorded as finished.
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            steps_in_episode += 1

            # Training
            if len(agent.memory) > agent.batch_size:
                minibatch_indices = np.random.choice(len(agent.memory), agent.batch_size, replace=False)
                minibatch = [agent.memory[i] for i in minibatch_indices]
                states_batch, actions_batch, targets_batch = agent.process_minibatch(minibatch)
                loss = agent.train_one_step(states_batch, actions_batch, targets_batch)
                episode_losses.append(loss)

        # Cleanup step
        agent.epsilon = max(agent.epsilon * agent.epsilon_decay_factor, agent.epsilon_min)
        if episode_losses:
            mean_loss = sum(episode_losses) / len(episode_losses)
        else:
            # NaN keeps the history purely numeric, so it can be saved with
            # np.save and plotted without special-casing.
            mean_loss = float('nan')
        agent.steps_since_copy += 1
        agent.update_target()
        # print(f"Episode: {episode + 1}, Total Reward: {total_reward}, Epsilon: {agent.epsilon} | Loss: {mean_loss}")
        # print(f"Collected: {env.collected} | Terminated: {terminated} | Done: {done}")

        # NOTE(review): the "+ 1" offset means an optimal episode records 1,
        # not 0; confirm this is intended.
        distance_history.append(steps_in_episode - steps_to_opt + 1)
        loss_history.append(mean_loss)
        empirical_reward_history.append(total_reward)
    return loss_history, empirical_reward_history, distance_history

In [18]:
def plot_history(history, ylabel):
    """Plot the numeric entries of ``history`` as a line chart.

    Entries that are not ``int`` or ``float`` instances are skipped.
    ``ylabel`` labels the y-axis and, with ' Curve' appended, is the title.
    The figure is shown, kept up for a short pause and then closed.
    """
    plt.figure()

    # Keep only the numeric entries.
    series = []
    for entry in history:
        if isinstance(entry, (int, float)):
            series.append(float(entry))
    plt.plot(series)

    # Title and axis labels
    plt.title(ylabel + ' Curve')
    plt.ylabel(ylabel)
    plt.xlabel('Epochs')

    # Display the plot, leave it visible briefly, then release the figure
    plt.show()
    plt.pause(1.5)
    plt.close()

Compile the entire program to initialize an environment of a given size and an agent with the hyperparameters required for training.

Then carry out the training of the agent in our grid environment.

In [22]:
# if __name__ == "__main__":
# Build a 4x4 environment and an agent with the chosen hyperparameters.
env = Environment(size=4)
agent = TransportAgent(
    gamma=0.65,
    learning_rate=0.01,
    start_epsilon=1.0,
    epsilon_decay_factor=0.99,
    min_epsilon=0.1,
    replay_buffer_size=5000,
    batch_size=200,
    network_copy_frequency=5,
)

# Train, then persist the network weights and the training curves.
losses, rewards, distance_history = train_function(env=env, agent=agent, episodes=250)
agent.save_model_parameters(destination="logs/target_params.pt")
np.save(file="losses.npy", arr=losses, allow_pickle=False)
np.save(file="rewards.npy", arr=rewards, allow_pickle=False)

Episode: 1, Total Reward: 217.0, Epsilon: 0.99 | Loss: n/a
Collected: 1 | Terminated: False | Done: True
Episode: 2, Total Reward: -55.0, Epsilon: 0.9801 | Loss: n/a
Collected: 1 | Terminated: True | Done: False
Episode: 3, Total Reward: -106.5, Epsilon: 0.9702989999999999 | Loss: n/a
Collected: 0 | Terminated: True | Done: False
Episode: 4, Total Reward: -55.0, Epsilon: 0.96059601 | Loss: 165.16706910004487
Collected: 1 | Terminated: True | Done: False
[1mTarget model updated[0m
Episode: 5, Total Reward: 191.5, Epsilon: 0.9509900498999999 | Loss: 56.87302826672065
Collected: 1 | Terminated: False | Done: True
Episode: 6, Total Reward: 188.5, Epsilon: 0.9414801494009999 | Loss: 142.88808911345726
Collected: 1 | Terminated: False | Done: True
Episode: 7, Total Reward: -55.0, Epsilon: 0.9320653479069899 | Loss: 92.14011344103746
Collected: 1 | Terminated: True | Done: False
Episode: 8, Total Reward: -55.0, Epsilon: 0.92274469442792 | Loss: 26.133459091186523
Collected: 1 | Terminated: 

In [23]:
# Replay one greedy episode of the trained agent as an animation.
run_visualisation(agent=agent)

In [24]:
# Plot the per-episode loss, reward and distance curves, one figure after another.
plot_history(history=losses, ylabel="Loss")
plt.close()
plot_history(history=rewards, ylabel="Rewards")
plt.close()
plot_history(history=distance_history, ylabel="Distance to Optimal Solution")