### ONLY RUN THIS CELL IF THESE DEPENDENCIES ARE NOT YET INSTALLED ON YOUR DEVICE

If you get an error saying that a module or package cannot be found, run this cell to install the missing dependencies.


In [None]:
# Rendering Dependencies
# gym is pinned to version 0.19.0. `> /dev/null 2>&1` discards both stdout and
# stderr of a command so the cell output stays quiet.
!pip install gym==0.19.0 pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# ATARI Dependencies
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# NOTE: this command only merges stderr into stdout (no /dev/null redirect),
# so unlike the others its output is shown in the cell.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
# Disabled: upgrading gym would move it off the 0.19.0 pin installed above.
#!pip install --upgrade gym 2>&1



## 1) Importing everything...

In [None]:
''' First we are going to import all the necessary libraries and directories'''

# Import the main bois: Gym and other standard libraries
import gym
from gym import spaces
import numpy as np
import math
import os
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Import all necessary torch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
import torch.autograd as autograd

# Rendering Dependencies
from collections import namedtuple
from PIL import Image
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import io
import glob
import base64
from IPython.display import HTML # The main dude who's gonna create the visualization

from IPython import display as ipythondisplay
from IPython.display import clear_output

# Processing Dependencies
from collections import deque
import time

# Souvenir Shop
from google.colab import drive
import sys

#Mount your Google drive to the VM
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of experience tuples used for experience replay.

    Every stored item is a ``(state, action, reward, done, next_state)`` tuple.
    The buffer is a ``deque`` with ``maxlen=memory_size``, so once it is full
    each new experience discards the oldest one.
    """

    def __init__(self, memory_size):
        '''Create an empty buffer.

        memory_size: maximum number of experiences kept at any one time.
        '''
        self.memory_size = memory_size
        self.replay_memory_deque = deque(maxlen=self.memory_size)

    def sample_batch(self, batch_size=32):
        '''
        Draw ``batch_size`` distinct experiences uniformly at random and return
        them column-wise ->
        return state, action, reward, done, next_state
            - state: array of the state values from the sampled experiences
            - action: array of the action values from the sampled experiences
            - reward: array of the reward values from the sampled experiences
            - done: array of the done values from the sampled experiences
            - next_state: array of the next_state values from the sampled experiences

        If the buffer holds fewer than ``batch_size`` experiences (or
        ``batch_size`` is not positive), five empty arrays are returned instead
        so callers can detect that there is nothing to train on yet.
        '''
        if batch_size <= 0 or len(self.replay_memory_deque) < batch_size:
            return tuple(np.array([]) for _ in range(5))

        batch = random.sample(self.replay_memory_deque, k=batch_size)
        # zip(*batch) transposes a list of 5-tuples into 5 per-field columns.
        state, action, reward, done, next_state = zip(*batch)
        return (
            np.array(state),
            np.array(action),
            np.array(reward),
            np.array(done),
            np.array(next_state),
        )

    # Push the data into the memory space
    def append(self, state, action, reward, done, next_state):
        '''
        Store one experience; the oldest one is dropped when the buffer is full.
        '''
        self.replay_memory_deque.append((state, action, reward, done, next_state))

    def __len__(self):
        '''
        Return the number of experiences currently stored.
        '''
        return len(self.replay_memory_deque)

In [None]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    The network maps a batch of observations of shape ``(batch, *input_dim)``
    to one Q-value per action. ``input_dim`` is read from
    ``env.observation_space.shape`` and is treated as channels-first
    ``(channels, height, width)``, which is the layout ``nn.Conv2d`` expects.
    """

    def __init__(self, env, device):
        """Build the network.

        env: object exposing ``observation_space.shape`` and ``action_space.n``.
        device: torch device that input tensors are moved to before a forward pass.
        """
        super(DQN, self).__init__()

        # Parameters that will be used later
        self.input_dim = env.observation_space.shape
        self.num_actions = env.action_space.n
        self.device = device

        # Convolutional feature extractor (layer sizes as in
        # https://web.stanford.edu/class/psych209/Readings/MnihEtAlHassibis15NatureControlDeepRL.pdf).
        # The channel count is input_dim[0] so that it agrees with the
        # (1, *input_dim) probe tensor used in CNN_output_dim().
        self.CNN = nn.Sequential(
            nn.Conv2d(in_channels=self.input_dim[0], out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        # Determine the fully connected layer's input size from the CNN itself
        self.fcl_input_size = self.CNN_output_dim(self.input_dim)

        # Fully connected head producing one Q-value per action
        self.fcl = nn.Sequential(
            nn.Linear(self.fcl_input_size, 512),
            nn.ReLU(),
            nn.Linear(512, self.num_actions),
        )

    def CNN_output_dim(self, input_dim):
        '''
        Return the number of features the CNN emits for a single observation of
        shape ``input_dim`` (used as the input size of the fully connected layer).
        '''
        # Only the output shape matters here, so no autograd graph is needed.
        with torch.no_grad():
            return self.CNN(torch.zeros(1, *input_dim)).flatten().shape[0]

    def forward(self, X):
        '''
        Forward pass: state batch in, Q-values out.
            1. Pass the state into the CNN
            2. Flatten the CNN output per sample (start_dim=1 keeps the batch axis)
            3. Pass the flattened tensor into the fully connected layer
            4. Return the result, of shape (batch, num_actions)
        '''
        X = self.CNN(X)
        X = X.flatten(start_dim=1)
        X = self.fcl(X)
        return X

    def get_action(self, state, epsilon):
        '''
        Epsilon-greedy action selection.

        With probability ``epsilon`` a uniformly random action is returned
        (explore); otherwise the action with the highest predicted Q-value for
        ``state`` is returned (exploit). The result is always a Python int.
        '''
        if random.uniform(0, 1) > epsilon:
            return self.epsilon_Greed_Strat(state)
        return random.randrange(self.num_actions)

    def epsilon_Greed_Strat(self, state):
        '''
        Return the index of the action with the highest Q-value for ``state``.
        '''
        qval = self.get_qvals(state)
        # .item() yields a plain int, so the action is independent of the
        # device the network runs on and matches the type of random actions.
        return int(qval.max(1)[1][0].item())

    def get_qvals(self, state):
        '''
        Compute the Q-values of a single state (a batch axis is added) without
        tracking gradients. Returns a tensor of shape (1, num_actions).
        '''
        with torch.no_grad():
            # Converting state data into tensor
            state_t = torch.FloatTensor(np.float32(state)).unsqueeze(0).to(device=self.device)
            return self.forward(state_t)

    def calc_TD_Loss(self, batch_size, model, target_model, optimiser, experience, discount_factor):
        """Run one optimisation step on a batch sampled from ``experience``.

        The target is computed Double-DQN style: ``model`` picks the best next
        action and ``target_model`` supplies that action's value.

        batch_size: number of experiences to sample.
        model: network being trained (its parameters are held by ``optimiser``).
        target_model: network used to evaluate the next-state value.
        optimiser: torch optimiser stepping ``model``'s parameters.
        experience: replay buffer exposing ``sample_batch(batch_size)``.
        discount_factor: discount applied to the next-state value.

        Returns None. Nothing happens if the buffer yields an empty batch.
        """
        # Extract a sample of batches from the replay buffer that was passed in
        states, actions, rewards, dones, next_states = experience.sample_batch(batch_size)
        if len(states) == 0:
            # Not enough experiences stored yet; nothing to learn from.
            return

        # Convert the sampled columns to tensors on this network's device
        states_t = torch.as_tensor(np.asarray(states, dtype=np.float32), device=self.device)
        next_states_t = torch.as_tensor(np.asarray(next_states, dtype=np.float32), device=self.device)
        rewards_t = torch.as_tensor(np.asarray(rewards, dtype=np.float32), device=self.device)
        actions_t = torch.as_tensor(np.asarray(actions, dtype=np.int64), device=self.device)
        done_t = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=self.device)

        # Q(s, a) for the actions that were actually taken: (batch, A) -> (batch,)
        qvals = model(states_t).gather(1, actions_t.unsqueeze(1)).squeeze(1)

        # The target is treated as a constant, so no gradients are tracked here.
        with torch.no_grad():
            best_next_actions = model(next_states_t).max(1)[1].unsqueeze(1)
            next_qval = target_model(next_states_t).gather(1, best_next_actions).squeeze(1)
            # (1 - done) removes the bootstrap term for terminal transitions
            expected_qvals = rewards_t + discount_factor * next_qval * (1 - done_t)

        # Calculate loss with Mean-Square Loss Function:
        loss = F.mse_loss(qvals, expected_qvals.to(device=model.device))

        # Backpropagate and update the model:
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()


In [None]:
def video_display():
    """Embed the first ``video/*.mp4`` file found into the notebook output.

    The file is read, base64-encoded and shown through an HTML ``<video>``
    element. If no matching file exists, "Could not find video" is printed.
    """
    mp4_videos = glob.glob('video/*.mp4')
    if not mp4_videos:
        print("Could not find video")
        return

    # Read-only binary mode is sufficient, and the context manager guarantees
    # the file handle is closed again.
    with open(mp4_videos[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env_Video(env):
    """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``.

    ``force=True`` is passed through to ``Monitor`` unchanged.
    """
    return Monitor(env, './video', force=True)


def plot_save_Results(training_rewards, path):
    """Clear the current cell output and plot the training rewards.

    training_rewards: sequence of reward values, plotted in order.
    path: accepted for interface compatibility; this function does not use it.
    """
    clear_output(wait=True)
    plt.figure(figsize=(12, 8))
    plt.plot(training_rewards, label='Rewards')
    plt.ylabel('Rewards')
    plt.xlabel('Episodes')
    plt.show()


def save_weights(model, path, file_name=None):
    """Save ``model.state_dict()`` to ``<path>/<file_name>``.

    model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    path: directory to write into; it is created if it does not exist yet.
    file_name: name of the weights file; defaults to
        'your_models_trained_weights.pt' when None.
    """
    if file_name is None:
        file_name = 'your_models_trained_weights.pt'
    # torch.save does not create missing directories, so make sure the target
    # directory exists first. An empty path means "current directory".
    if path:
        os.makedirs(path, exist_ok=True)
    weights_path = os.path.join(path, file_name)
    torch.save(model.state_dict(), weights_path)
# NOTE(review): this re-defines video_display, which already appears earlier
# in this file with the same body; one of the two definitions can be removed.
def video_display():
    """Embed the first ``video/*.mp4`` file found into the notebook output.

    The file is read, base64-encoded and shown through an HTML ``<video>``
    element. If no matching file exists, "Could not find video" is printed.
    """
    mp4_videos = glob.glob('video/*.mp4')
    if not mp4_videos:
        print("Could not find video")
        return

    # Read-only binary mode is sufficient, and the context manager guarantees
    # the file handle is closed again.
    with open(mp4_videos[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


# NOTE(review): this re-defines wrap_env_Video, which already appears earlier
# in this file with the same body; one of the two definitions can be removed.
def wrap_env_Video(env):
    """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``.

    ``force=True`` is passed through to ``Monitor`` unchanged.
    """
    return Monitor(env, './video', force=True)


# NOTE(review): this re-defines plot_save_Results, which already appears earlier
# in this file with the same body; one of the two definitions can be removed.
def plot_save_Results(training_rewards, path):
    """Clear the current cell output and plot the training rewards.

    training_rewards: sequence of reward values, plotted in order.
    path: accepted for interface compatibility; this function does not use it.
    """
    clear_output(wait=True)
    plt.figure(figsize=(12, 8))
    plt.plot(training_rewards, label='Rewards')
    plt.ylabel('Rewards')
    plt.xlabel('Episodes')
    plt.show()


# NOTE(review): this re-defines save_weights, which already appears earlier in
# this file; one of the two definitions can be removed.
def save_weights(model, path, file_name=None):
    """Save ``model.state_dict()`` to ``<path>/<file_name>``.

    model: object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
    path: directory to write into; it is created if it does not exist yet.
    file_name: name of the weights file; defaults to
        'your_models_trained_weights.pt' when None.
    """
    if file_name is None:
        file_name = 'your_models_trained_weights.pt'
    # torch.save does not create missing directories, so make sure the target
    # directory exists first. An empty path means "current directory".
    if path:
        os.makedirs(path, exist_ok=True)
    weights_path = os.path.join(path, file_name)
    torch.save(model.state_dict(), weights_path)


In [None]:
class Agent:
    """DQN agent bundling an environment, a policy/target network pair, a
    replay buffer and the training / evaluation loops.
    """

    def __init__(self, environment, device, display_progress=True, fp="model.pt"):
        """Create the networks, optimiser and replay buffer.

        environment: gym-style environment (``reset()``, ``step()``,
            ``observation_space``, ``action_space``).
        device: torch device both networks are moved to.
        display_progress: stored on the instance; the training loop does not
            currently consult it.
        fp: directory the weight files ('Weights_Policy.pt' and
            'Weights_Target.pt') are saved to and loaded from.
        """
        self.environment = environment
        self.fp = fp  # Directory the weight files are written to / read from

        # Hyperparameters - I strongly suggest you play around with these to find better ones!
        self.memory_size = 20000  # Maximum amount of experiences that can be stored
        self.burn_in = 10000  # How many experiences must be stored before the network learns (via TD Loss)
        self.save_freq = 50  # How often (in episodes) the model gets saved to a .pt file
        self.max_episodes = 350  # The total number of episodes the agent will play in training
        self.target_update_freq = 1000  # How often (in frames) the policy network is copied to the target network
        self.batch_size = 32  # The amount of experiences used to train the network at one time
        self.discount_factor = 0.99
        self.learning_rate = 0.0001

        # Epsilon parameters
        self.epsilon_start = 1
        self.epsilon_final = 0.01
        self.epsilon_decay = 100000  # Time steps to go from start to final
        # Linear decay 1 - t / epsilon_decay; train() clamps it at epsilon_final.
        self.epsilon_by_frame = lambda time_step: -(time_step - self.epsilon_decay) / self.epsilon_decay

        self.experience = ReplayMemory(self.memory_size)

        self.Policy_Net = DQN(environment, device)
        self.Policy_Net.to(self.Policy_Net.device)

        # The target network starts as an exact copy of the policy network and
        # is only ever updated by copying weights, so it stays in eval mode.
        self.Target_Net = DQN(environment, device)
        self.Target_Net.to(self.Policy_Net.device)
        self.Target_Net.load_state_dict(self.Policy_Net.state_dict())
        self.Target_Net.eval()
        self.optimiser = optim.Adam(params=self.Policy_Net.parameters(), lr=self.learning_rate)
        self.display_progress = display_progress

    def train(self):
        """Play ``max_episodes`` episodes, learning from replayed experience.

        Each time step: pick an epsilon-greedy action, store the transition,
        run one TD update once the burn-in is complete, and periodically sync
        the target network. Weights are saved every ``save_freq`` episodes and
        once more at the end, when the reward curve is also plotted.
        """
        num_ep = 0
        frame_idx = 0
        training_rewards = []
        rewards = 0
        # Learning starts once the buffer holds the burn-in amount, and never
        # with fewer experiences than one batch needs.
        min_experiences = max(self.burn_in, self.batch_size)

        # The observation returned by reset() is the initial state.
        state = self.environment.reset()
        self.Policy_Net.train()

        while True:
            # Epsilon for this frame, never below epsilon_final
            gen_epsilon = max(self.epsilon_by_frame(frame_idx), self.epsilon_final)

            # Act, observe and remember the transition
            action = self.Policy_Net.get_action(state, gen_epsilon)
            next_state, reward, done, _ = self.environment.step(action)
            self.experience.append(state, action, reward, done, next_state)
            rewards += reward
            state = next_state
            frame_idx += 1

            # Train only after the burn-in period
            if len(self.experience) >= min_experiences:
                self.Policy_Net.calc_TD_Loss(
                    self.batch_size,
                    model=self.Policy_Net,
                    target_model=self.Target_Net,
                    optimiser=self.optimiser,
                    experience=self.experience,
                    discount_factor=self.discount_factor,
                )

            if done:
                # Episode finished: record its total reward and start a new one
                num_ep += 1
                training_rewards.append(rewards)
                state = self.environment.reset()
                rewards = 0

                if num_ep % self.save_freq == 0:
                    save_weights(model=self.Policy_Net, path=self.fp, file_name='Weights_Policy.pt')
                    save_weights(model=self.Target_Net, path=self.fp, file_name='Weights_Target.pt')
                    # Periodic progress plotting is disabled; the reward curve
                    # is plotted once when training finishes.

            # Copy the policy weights to the target network every target_update_freq frames
            if frame_idx % self.target_update_freq == 0 and frame_idx != 0:
                self.Target_Net.load_state_dict(self.Policy_Net.state_dict())
                self.Target_Net.eval()

            # Training is complete after max_episodes episodes
            if num_ep >= self.max_episodes:
                save_weights(model=self.Policy_Net, path=self.fp, file_name='Weights_Policy.pt')
                save_weights(model=self.Target_Net, path=self.fp, file_name='Weights_Target.pt')
                plot_save_Results(training_rewards, self.fp)
                break

    def learn(self):
        """Run ``train()`` and print how long it took (public entry point)."""
        start_time = time.time()
        self.train()
        end_time = time.time()
        diff_time = end_time - start_time
        print(f'Time taken to complete training: {diff_time}')

    def evaluate(self, game_speed=0.1):
        """Play one greedy episode with the saved policy weights and show the video.

        game_speed: seconds to sleep between frames, to slow the game down.
        """
        Policy_weights_path = os.path.join(self.fp, 'Weights_Policy.pt')  # Getting the path to where the weights are saved

        # map_location lets weights saved on one device (e.g. a GPU) be loaded
        # onto whatever device this agent's network lives on.
        saved_state = torch.load(Policy_weights_path, map_location=self.Policy_Net.device)
        self.Policy_Net.load_state_dict(saved_state)
        self.Policy_Net.eval()

        # Include Video Wrapper for video output
        environment = wrap_env_Video(self.environment)
        state = environment.reset()
        # set epsilon to 0 (we don't want any randomness)
        gen_epsilon = 0

        while True:
            environment.render()
            time.sleep(game_speed)
            action = self.Policy_Net.get_action(state, gen_epsilon)
            state, _, done, _ = environment.step(action)
            if done:
                # Closing the environment before the video is displayed
                environment.close()
                break
        video_display()

    def plot_view(self):
        """Plot the first four slices (``observation[0..3]``) of the observation
        returned after one random action, i.e. the view seen by the agent."""
        self.environment.reset()

        # Just a normal agent without any learning
        action = self.environment.action_space.sample()
        observation, _, _, _ = self.environment.step(action)

        plt.figure(figsize=(20, 5))
        for i in range(4):
            plt.subplot(1, 4, i + 1)
            plt.imshow(observation[i], cmap=plt.get_cmap('gray'))


In [None]:
from pyvirtualdisplay import Display
from common.wrappers import make_atari, wrap_deepmind, wrap_pytorch

# Start a non-visible virtual display for rendering. It is kept under its own
# name so that the IPython `display` import below does not overwrite the handle.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display


# Loading the device, setting it to run on a GPU if available and on the CPU otherwise
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Initialising the environment
environment = make_atari("PongNoFrameskip-v4")
# Adding required wrappers
environment = wrap_pytorch(wrap_deepmind(environment, frame_stack=True))

# Initialising Agent. The instance gets a lower-case name so the Agent class is
# not rebound, which would make this cell fail when it is run a second time.
agent = Agent(environment, device, fp="model-pong.pt")
agent.learn()
#agent.evaluate()
