# Multi Robot Grid World Assessment part 3
**Written by**  
Egor Danilov (33411115),  
Yash Balchandani(33279950),  
Gautam Ravi Kumar(33197970),  
Jacob Wicklund (31265936)

In [49]:
import numpy as np
import time
import random
import matplotlib.pyplot as plt
from IPython.display import clear_output
import torch
from torch import tensor, optim
import torch.nn as nn
from torch.nn.modules import activation
import torch.nn.functional as F
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Brain

In [50]:
from collections import deque, namedtuple, deque
class ReplayBuffer:
    ############################################################################################################################
    def __init__(self, action_size, buffer_size, batch_size):
        """
        Initialize a ReplayBuffer object.

        Parameters:
        ===========
            action_size (int): Dimensionality of each action.
            buffer_size (int): Maximum number of experiences to store in the buffer.
            batch_size (int): Number of experiences to sample during training.
        """
        self.action_size = action_size # Maximum possible ways each action can be done.

        self.memory = deque(maxlen=buffer_size) #double ended queue for a short term memory of agent.

        self.batch_size = batch_size # Resampling of the past experience what is past experience? Named Tuple from part 1.

        # (Tagged) named tuple to represent a single experience in the replay memory
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])

    ############################################################################################################################
    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory) # That is the dbl qu length.

    ############################################################################################################################
    def add(self, state, action, reward, next_state, done):
        """
        Add a new experience to memory.
        """
        exp = self.experience(state, action, reward, next_state, done) # Experience tuple. Defined above.
        # Pass in the experience to the short term memory queue.
        self.memory.append(exp) # Passes the list of attributes into the memory dbl queue.

    ############################################################################################################################
    def sample(self): #Segway into the deep q network. This is the sampler method.
        """
        Randomly sample a batch of experiences from memory for training.
        """
        experiences = random.sample(self.memory, k=self.batch_size) # Sample all the past experiences to learn the most important qualities so we can delete the experience data.

        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device) #retrieve the states from experience stored in double qu
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).long().to(device) # retrieve the actions
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device) # the rewards
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device) # possible next moves, choosing the probabilistic best reward. retrieved from the DQN calculation
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device) # task completed flag. IE shut down.

        return (states, actions, rewards, next_states, dones) # send this info back to caller


## Agent Class.

In [51]:
class Agent:
    def __init__(self, agent_type, start_position, grid_size):
        self.agent_type = agent_type
        self.start_position = start_position
        self.grid_size = grid_size
        self.buff = ReplayBuffer(action_size=4,buffer_size=10000, batch_size=64)
        self.picked = False
        self.reward = 0
        self.done = False

    def action(self, state):
        possible_actions = [] # Store legal moves

        ### INSERT THE DAIRY QUEEN!


        return np.random.choice([0, 1, 2, 3]) #of the legal moves pic one at random to do.

    def next_position(self, action): # HERE the 0-3 is passed in and used.
            x_disp, y_disp = {
                0: (-1, 0), # up
                1: (1, 0), #down
                2: (0, -1), #left
                3: (0, 1) #right
                }[action]

            return (min(max(self.start_position[0] + x_disp, 0),self.grid_size[0] - 1),
                    min(max(self.start_position[1] + y_disp, 0),self.grid_size[0] - 1))# Here is where the boundary is checked.

    def move(self, action): # This actually executes the move. Action is passed in and executed.
        self.start_position = self.next_position(action)

    def remember(self, state, action, reward, next_state, done):
        self.buff.add(state, action, reward, next_state, done)

    def learn(self):
        self.buff.sample()


## Environment Class

In [67]:
import numpy.random as npr
class MultiAgentGridWorld:

    ############################################################################################################################
    def __init__(self, size=5):
        self.size = size # Grid world size default to 5.
        self.pick_up_position = (0, size - 1) # This puts the box in upper left corner of any size map.
        self.delivery_position = (size - 1, 0) # The delivery is in lower right corner for any size map.
        self.agents = self.initialize_agents() #This list of tuples is carrying all agent objects
        #the world has these attributes.
        self.has_package = False
        self.pick_reward = 5
        self.drop_reward = 5
        self.handover_reward = 10
        self.steps = 0
        self.done = False
    def initialize_agents(self):
        # Ensuring agents don't spawn on pickup or delivery positions.
        possible_positions = [(i, j)
                            for i in range(self.size)
                            for j in range(self.size)
                            if (i, j) not in [self.pick_up_position, self.delivery_position]]
        # Add 2 agents of eachh type
        indexes = npr.choice(len(possible_positions), 2, replace=False)
        start_positions = [possible_positions[_] for _ in indexes]
        return [Agent("pickup", start_positions[0], (self.size, self.size)),
                    Agent("delivery", start_positions[1], (self.size, self.size))]

    ############################################################################################################################
    def step(self, actions): # so this method needs a bunch of actions.
        memories = [a.start_position for a in self.agents]
        self.steps += 1
        
        pick_agents = [agent for agent in self.agents if agent.agent_type == "pickup" and agent.done == False]
        drop_agents = [agent for agent in self.agents if agent.agent_type == "delivery" and agent.done == False]

        for agent in pick_agents: # here i need the entire list of actions
            if agent.start_position == self.pick_up_position and agent.picked == False:
                agent.picked = True
                agent.reward += self.pick_reward - self.steps**2
            for agent_2 in drop_agents:
                if agent.start_position == agent_2.start_position and agent.picked == False and agent_2.picked == True:
                    agent.picked = False
                    agent_2.picked = True
                    agent.reward += self.handover_reward - self.steps**2
                    agent_2.reward += self.handover_reward - self.steps**2
                    agent.done = True
                    
                    
        for agent in drop_agents: # here i need the entire list of actions
            if agent.start_position == self.pick_up_position and agent.picked == True:
                agent.reward += self.drop_reward - self.steps**2
                agent.done = True
        if sum(1 for agent in drop_agents if agent.done) == len(drop_agents) and sum(1 for agent in pick_agents if agent.done) == len(pick_agents):
            self.done = True

        return self.state_of_the_world(), self.done # So each time a step is taken the comeback is these three things.

    ############################################################################################################################
    def state_of_the_world(self): # This environment only has the spots of agents nd the package picked as its data.
        return [a.start_position for a in self.agents], self.has_package

    ############################################################################################################################
    def render(self):
        """
        Renders grid world.
        """
        for i in range(self.size):
            for j in range(self.size):
                if (i, j) == self.pick_up_position:
                    print("P", end=" ")
                elif (i, j) == self.delivery_position:
                    print("D", end=" ")
                elif (i, j) == self.agents[0].start_position:
                    print("1", end=" ") #type 1 agent Pickup
                elif (i, j) == self.agents[1].start_position:
                    print('2', end=' ') #type 2 agent delivery
                else:
                    print(".", end=" ")
            print()

    ############################################################################################################################
    def reset(self):
        self.initialize_agents()
        self.package_picked = False
        return self.state_of_the_world() #This works because all the method does is set all the world attributes to above.

    ############################################################################################################################


## Agent Handler

### Define Deep Q Learning Algorithm

In [71]:
import torch
import copy
import numpy as np
class DQN:

    ############################################################################################################################
    """Deep Q-Network (DQN) agent."""
    def __init__(self, state_space, action_space=4, sampling=100, gamma=0.99, memory=None):
        # implementation from skeleton
        input_layer = state_space
        layer_2 = 24
        layer_3 = 24
        output_layer = action_space
        self.nn_initializer(input_layer, layer_2, layer_3, output_layer)
        # Initialize short term memory
        
        #implementation kept from part 2
        self.start_sampling = sampling # Experience sampling
        self.gamma= gamma
        #Short term memory, comparing the Q network information.
        self.memory = memory if memory else ReplayBuffer(action_space, buffer_size=10000, batch_size=32) # User Defined above ______^
        self.action_space=action_space
############################################################################################################################
    # The number of starting nodes: input, two hidden layers, and output
    def nn_initializer (self, two, three, out, inn=5):
        device = torch.device('cpu')
        self.model_local = torch.nn.Sequential(
            torch.nn.Linear(5, two), torch.nn.ReLU(),
            torch.nn.Linear(two, three), torch.nn.ReLU(),
            torch.nn.Linear(three, out)).to(device)

        self.model_global = copy.deepcopy(self. model_local).to(device)
        self.model_global.load_state_dict(self.model_local.state_dict())
        self.loss_func = torch.nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model_local.parameters(), lr=0.001)

############################################################################################################################
    def get_q_vals(self, state): # The Q value list
        state = torch.from_numpy(state).float().unsqueeze(0)
        return self.model_local(state)

############################################################################################################################
    def update_target(self):
        self.model_global.load_state_dict(self.model_local.state_dict())

############################################################################################################################
    def act(self, state, epsilon=0.):
        """
        Choose an action based on the current state using epsilon-greedy policy.

        Parameters:
        ===========
            state (torch.Tensor): Current state.
            epsilon (float): Epsilon for epsilon-greedy action selection.
        """
        print(state)
        flattend_state_info = [c for sub in state[0] for c in sub]
        add_picked = [int(state[1])]
        flat_everything = np.array(flattend_state_info + add_picked).astype(np.float64)
        print(flat_everything.shape)
        state = torch.from_numpy(flat_everything)
        state = state.float().unsqueeze(0).to(device) # here we take the state tensor and add another dimension at the start of this tensor
        print(state.shape)
        self.model_local.eval() # this sets the nn from torch into evaluation mode.
        with torch.no_grad(): # Disable gradient tracking (local net doesn't learn from the calculation following:::)
            action_values = self.model_local(state) # compute the value of taking each act --- then store it.
        self.model_local.train() # return local q net into train mode (learn from acting in XYZ way)

        # Epsilon-greedy action selection
        if np.random.random() > epsilon: # zero and 1 random selection number comparison with epsilon
            return np.argmax(action_values.cpu().data.numpy())  # should epsilon be smaller, this retrieves the calculated values from processing unit
                                                                # puts it into a numpy array. picks the highest reward
        else:
            return random.choice(np.arange(self.action_space)) # randomly makes a choice for its next step.

############################################################################################################################
    def get_maxQ(self, state):
        q_values = self.model(state)
        return torch.max(q_values).item()

############################################################################################################################
    def fit(self, env, n_episodes=5000, max_t=100,
            rolling_epochs=200, target_score=None,
            eps_start=1.0, eps_end=0.01, eps_decay=0.99):
        """
        Train the agent using deep Q-learning.

        Parameters:
        ===========
            env: Environment to train in.
            n_episodes (int): Maximum number of training episodes.
            max_t (int): Maximum number of time steps per episode.
            rolling_epochs (int): Number of episodes for calculating average score.
            target_score (float): Target score for the environment.
            eps_start (float): Starting value of epsilon.
            eps_end (float): Minimum value of epsilon.
            eps_decay (float): Decay factor for epsilon.
        """
        scores = []  # List to store scores (rewards) from each episode
        scores_window = deque(maxlen=rolling_epochs)  # Double ended que, once rolling_epochs is reached by one of the ends, the other end is bumped.
        begin = eps_start # The local variable is what gives flexibility from the input param.
        for epis in range(1, n_episodes + 1): # Simply for each episode...:
            state = env.state_of_the_world() # Reset environment
            print(type(state), state)
            for _ in range(max_t): # For everything from 0 to the max allowed time
                action = self.act(state, begin) # Calling the action function.
                next_state, done = env.step(action) # Calling the step function from the "grid" getting reward values for taking steps, getting items etc..
                self.step(state, action, next_state, done) # Take actual step based on the above line's values.
                state = next_state # Updating the state variable
                if done: # Done condition is package delivery.
                    break

            scores_window.append(score) # The last step's reward update is put into the double end queue.
            scores.append(score) # Adds the score to the agent's personal record.
            begin = max(eps_end, eps_decay * begin) # Reset the begin variable to the max of either the ending episode or decay times begin.

            # Display training progress
            print(f'\rEpisode {epis}, Average Score: {np.mean(scores_window):.2f}', end="")

            if epis % rolling_epochs == 0:
                print(f'\rEpisode {epis}, Average Score: {np.mean(scores_window):.2f}')

            if target_score and np.mean(scores_window) >= target_score:
                torch.save(self.model_local.state_dict(), 'checkpoint.pth')
                break

                    # Plot the scores
        fig = plt.figure(figsize=(6,3))
        plt.plot(range(len(scores)), scores)
        plt.ylabel('Score')
        plt.xlabel('Episode')
        plt.show()

    ############################################################################################################################
    def train_one_step(self, experiences):

        targets_reply = [] #What the hell is this for? replay buff
        print("expereinces",experiences)
        states, actions, q_rewards, next_states, dones = experiences #Next states and done are not used

        state_batch = torch.cat([s.float() for s in states]) #Why does this have a state size of 224 and not 104
        print(state_batch.size())
        action_batch = torch.tensor(actions)
        Q_local_expected = self.model_local(state_batch)


        X = Q_local_expected.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
        Y = torch.tensor(q_rewards).float()
        loss = self.loss_func(X, Y)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.update_target(self.model_local, self.model_global)

    ############################################################################################################################
    def step(self, state, action, reward, next_state, done):
        """Store experience in replay memory and learn if enough samples are available."""

        # Store experience in replay memory
        self.memory.add(state, action, reward, next_state, done) #Each agent has its own memory attribute, holding the parameters inside. Now i gotta know what replay buffer is doing.

        # Learn from a random subset of experiences if enough samples are available
        if len(self.memory) > self.start_sampling: # if theres open memory randomly fill it with samples
            experiences = self.memory.sample() # sampling the replay buffer obj from torch
            self.train_one_step(experiences) # calls the learn method with above line executed and stored and the set future reward gamma discount rate.


### Agent Instantiation. Policy learning and Visualisation.

In [72]:
import torch as fire
class MultiAgentHandler: # Connection to the Dairy Queen Network
    def __init__(self, num_agents, state_space, action_space=4,**kwargs):
        self.shared_memory = ReplayBuffer(action_space, buffer_size=10000, batch_size=64)
        self.agents = [DQN(state_space, action_space,memory=self.shared_memory, **kwargs) for _ in range(num_agents)]

    def train_agents(self, environment, n_episodes=1000):
        for _ in range(1, n_episodes+1):
            states = environment.reset() #start a fresh instance.
            scores = [0]*len(self.agents) # each agent gets its score.

            while True: #This runs through all the agents at their states and they move.
                actions = [a.act(st) for a, st in zip(self.agents,states)]
                next_state, rewards, done = environment.step(actions)

                for agent, state, action, reward, next_state, done in zip(self.agents,states,actions,rewards,next_state,done):
                    self.shared_memory.add(state,action, reward, next_state, done) #basically store a memory (remember it.)
                    agent.shared_memory.step(state, action, reward, next_state, done) # Here we use the shared mem across erry body.

                scores = [score + reward for score, reward in zip(scores,rewards)]
                states = next_state

                if any(done):
                    break
            # print scores / logging output
            if _ % 100 == 0:
                print(f"Episode {_}, Average Score: {np.mean(scores):.2f}")

    def save(self):
        for i, a in enumerate(self.agents):
            fire.save(a.model_local.state_dict(), f'checkpoint_agent_{i}.pth') #This is where the agents are connected to their brains.

    def load(self):
        for i, a in enumerate(self.agents):
            a.model_local.load_state_dict(fire.load(f'checkpoint_agent_{i}.pth'))

In [73]:
# Initialize a 4x4 (n^2) grid world environment.
environment = MultiAgentGridWorld(size=5)
# Set agent's personal state size to the environment's state size.
# The possible actions in this environment set to 4: up, down, left, right.

policy = DQN(state_space=environment.size, action_space=4) # Use the DQN class to govern agent object.

# Train agent object with GridWorld environment.
# Training process for n_episodes or until desired score is reached.
# over the last 'rolling_epochs' (defaulted to 200 in the DQN class) reaches or exceeds 2.2.
policy.fit(env=environment, n_episodes=2500, target_score=2.2)

<class 'tuple'> ([(2, 1), (2, 0)], False)
([(2, 1), (2, 0)], False)
(5,)
torch.Size([1, 5])


TypeError: step() missing 1 required positional argument: 'done'

### Test the Agent

In [None]:
# load pretrained weights
policy.model_local.load_state_dict(torch.load('checkpoint.pth'))

FileNotFoundError: [Errno 2] No such file or directory: 'checkpoint.pth'

In [None]:
# Visualise 5 episodes using trained agent

for i in range(5):
    state = env.reset()
    for j in range(20):
        action = policy.act(state)
        state, reward, done = env.step(action)
        clear_output(wait=True)
        env.render()
        time.sleep(1)
        if done:
            break

. . . . 
. . . . 
. F . . 
. R . . 
