In [1]:
%pip install wandb

Collecting wandb
  Downloading wandb-0.12.16-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 5.2 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.5.11-py2.py3-none-any.whl (144 kB)
[K     |████████████████████████████████| 144 kB 62.4 MB/s 
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 58.4 MB/s 
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.8-py3-none-any.whl (9.5 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.4 MB/s 
Collecting smmap<6,>

In [2]:
%pip install -U gym>=0.21.0
%pip install -U gym[atari,accept-rom-license]

Collecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting ale-py~=0.7.4
  Downloading ale_py-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.1 MB/s 
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.4.2.tar.gz (9.8 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (PEP 517) ... [?25l[?25hdone
  Created wheel for AutoROM.accept-rom-license: filename=AutoROM.accept_rom_license-0.4.2-py3-none-any.whl size=441027 sha256=2da2a60b82a37c2756e0451de815e26a5bddbd04f194bf1f9d81b509358d0bf9
  Stored in directory: /root/.cache/pip/wheels/87/67/2e/6147e7912fe37f5408b80d07527dab807c1d25f5c403a9538a
Successfully bu

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import wandb
from collections import deque
import random
import numpy as np
import gym
import cv2
import time
import copy

In [4]:
# Start a Weights & Biases run under the "Breakout-Pytorch" project; the training loop
# later sends per-episode metrics with wandb.log.
wandb.init(
  project="Breakout-Pytorch",
  tags=["DQN", "CNN", "RL"],
)

  class HTTPHeaders(collections.MutableMapping):


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# Device the agent's networks and training tensors are placed on: CUDA when available, otherwise CPU.
GPU = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
class DDQN(nn.Module):
    """Dueling Q-network over a stack of 4 image frames.

    Three convolution + batch-norm layers extract features, which are flattened
    and fed to two separate heads: an advantage head (one output per action) and
    a state-value head (a single output). The heads are combined into Q-values
    in ``forward``.

    Args:
        h: Height of the input frames in pixels.
        w: Width of the input frames in pixels.
        output_size: Number of actions, i.e. the number of Q-values produced.
    """

    def __init__(self, h, w, output_size):
        super(DDQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=8, stride=4)
        self.bn1 = nn.BatchNorm2d(32)
        convw, convh = self.conv2d_size_calc(w, h, kernel_size=8, stride=4)

        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.bn2 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=4, stride=2)

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.bn3 = nn.BatchNorm2d(64)
        convw, convh = self.conv2d_size_calc(convw, convh, kernel_size=3, stride=1)

        # Flattened size of the last conv layer's output (64 channels).
        linear_input_size = convw * convh * 64

        # Advantage head: one value per action.
        self.Alinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Alrelu = nn.LeakyReLU()
        self.Alinear2 = nn.Linear(in_features=128, out_features=output_size)

        # State-value head: a single scalar per sample.
        self.Vlinear1 = nn.Linear(in_features=linear_input_size, out_features=128)
        self.Vlrelu = nn.LeakyReLU()
        self.Vlinear2 = nn.Linear(in_features=128, out_features=1)

    def init_weights(self, m):
        """Weight initialiser intended for ``module.apply``.

        Linear and Conv2d weights get Kaiming-normal initialisation; Conv2d
        biases are additionally set to 0.1. Other module types are left as is.
        """
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

        if isinstance(m, nn.Conv2d) and m.bias is not None:
            m.bias.data.fill_(0.1)

    def conv2d_size_calc(self, w, h, kernel_size=5, stride=2):
        """Return the (width, height) produced by a convolution on a w x h input.

        The formula assumes no padding and a dilation of 1, which matches how
        the conv layers of this network are constructed.
        """
        next_w = (w - (kernel_size - 1) - 1) // stride + 1
        next_h = (h - (kernel_size - 1) - 1) // stride + 1
        return next_w, next_h

    def forward(self, x):
        """Compute Q-values for a batch of frame stacks.

        Args:
            x: Tensor of shape (batch, 4, h, w).

        Returns:
            Tensor of shape (batch, output_size) with one Q-value per action.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))

        x = x.view(x.size(0), -1)  # Flatten each sample, keep the batch dimension

        Ax = self.Alrelu(self.Alinear1(x))
        Ax = self.Alinear2(Ax)  # No activation on last layer

        Vx = self.Vlrelu(self.Vlinear1(x))
        Vx = self.Vlinear2(Vx)  # No activation on last layer

        # Dueling aggregation: centre each sample's advantages on that sample's
        # own mean over the action dimension. Averaging over the whole tensor
        # would make a sample's Q-values depend on the other samples in the batch.
        q = Vx + (Ax - Ax.mean(dim=1, keepdim=True))

        return q

In [7]:
#AGENT COMBINE CELL
class DDDQNAgent:
    """Dueling Double-DQN agent.

    Owns the online and target networks, the replay buffer, the optimizer and
    the epsilon-greedy exploration state, and provides the frame preprocessing
    applied to raw environment observations.
    """

    # Minimum number of stored transitions before train() starts updating weights.
    MIN_REPLAY_SIZE = 40000
    # Number of transitions sampled from the replay buffer per training step.
    BATCH_SIZE = 64
    # Epsilon stops decaying once it is no longer above this floor.
    EPSILON_MIN = 0.05
    # Multiplicative factor applied to epsilon by each updateEpsilon() call.
    EPSILON_DECAY = 0.99

    def __init__(self, environment):
        """Set the hyperparameters and build the networks for our agent.

        Args:
            environment: Environment exposing ``action_space.n``.
        """
        # Discount rate
        self.gamma = 0.99

        # Initial exploration parameter epsilon
        self.epsilon = 1

        # Number of actions available to the agent
        self.n_actions = environment.action_space.n

        # Replay memory (oldest transitions are dropped once full)
        self.replay_buffer = deque(maxlen=50000)

        # Online network, initialised first so that the target network can be
        # created as an exact copy of it.
        self.Net = DDQN(h=84, w=84, output_size=self.n_actions)
        self.main_model = self.Net.to(GPU)
        self.Net.apply(self.Net.init_weights)

        # Target network: a copy of the online network, kept in eval mode.
        self.target_model = copy.deepcopy(self.main_model).to(GPU)
        self.target_model.eval()

        # Optimizer updates the online network only
        self.optimizer = optim.Adam(self.main_model.parameters(), lr=0.0003)

    def convert_to_grey(self, image):
        """Convert the image to greyscale."""
        # NOTE(review): the conversion code assumes BGR channel order; if the
        # environment delivers RGB frames the red/blue weights are swapped - confirm.
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def crop_image(self, image, h_start=34, h_end=194, w_start=0, w_end=160):
        """Crop the image to rows [h_start, h_end) and columns [w_start, w_end)."""
        return image[h_start:h_end, w_start:w_end]

    def resize_reshape(self, image, height=84, width=84):
        """Resize a single-channel image and return it with shape (height, width)."""
        # cv2.resize takes its target size as (width, height) but returns an
        # array indexed (row, column), i.e. (height, width).
        image = cv2.resize(image, (width, height))
        return image.reshape(height, width)

    def preProcess(self, image):
        """Turn a raw frame into an 84x84 greyscale array scaled to [0, 1].

        The result is float32: the networks consume float32 tensors anyway, and
        it halves the memory of every frame kept in the replay buffer.
        """
        img = self.convert_to_grey(image)
        img = self.crop_image(img)
        img = self.resize_reshape(img)

        return (img / 255).astype(np.float32)

    def random_action(self):
        """Return a uniformly random action index."""
        return random.randrange(self.n_actions)

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a random action is returned; otherwise the
        action with the highest Q-value from the online network.

        Args:
            state: Stack of preprocessed frames without a batch dimension.

        Returns:
            The chosen action index as a Python int.
        """
        if np.random.rand() > self.epsilon:
            with torch.no_grad():
                state = torch.tensor(state, dtype=torch.float, device=GPU)
                state = state.unsqueeze(0)  # add the batch dimension
                qs = self.main_model(state)
                action = torch.argmax(qs).item()
            return action
        return self.random_action()

    def predict(self, online, state):
        """Run ``state`` through the online network if ``online`` is truthy, else the target network."""
        if online:
            return self.main_model(state)
        return self.target_model(state)

    def train(self):
        """Run one Double-DQN update on a random minibatch from the replay buffer.

        Returns:
            ``(loss, max_q)``: the minibatch loss as a Python float and the
            largest Q-value the online network predicted for the sampled
            states. ``(0, 0)`` is returned, and no update is made, while the
            buffer holds fewer than ``MIN_REPLAY_SIZE`` transitions.
        """
        # Delay learning until the agent has sufficient experience.
        if len(self.replay_buffer) < self.MIN_REPLAY_SIZE:
            return 0, 0

        # Take a minibatch from the replay buffer.
        random_samples = random.sample(self.replay_buffer, self.BATCH_SIZE)
        state, action, reward, next_state, done = zip(*random_samples)

        # Each stored state already carries a leading batch axis of size 1,
        # so concatenating yields one batched array.
        state = torch.tensor(np.concatenate(state), dtype=torch.float, device=GPU)
        next_state = torch.tensor(np.concatenate(next_state), dtype=torch.float, device=GPU)
        action = torch.tensor(action, dtype=torch.long, device=GPU)
        reward = torch.tensor(reward, dtype=torch.float, device=GPU)
        done = torch.tensor(done, dtype=torch.float, device=GPU)

        # Q-values of the actions that were actually taken.
        state_q_values = self.predict(True, state)
        state_q_val_max = torch.max(state_q_values).item()
        selected_q_value = state_q_values.gather(1, action.unsqueeze(1)).squeeze(1)

        # Double DQN target: the online network picks the next action, the
        # target network evaluates it. No gradient flows through the target,
        # so none is recorded.
        with torch.no_grad():
            next_states_q_values = self.predict(True, next_state)
            next_states_target_q_values = self.predict(False, next_state)
            best_next_action = next_states_q_values.max(1)[1].unsqueeze(1)
            next_states_target_q_value = next_states_target_q_values.gather(1, best_next_action).squeeze(1)
            # (1 - done) removes the bootstrap term for terminal transitions.
            bellQ = reward + self.gamma * next_states_target_q_value * (1 - done)

        # Mean squared difference between predicted and target Q-values.
        difference_q_value = selected_q_value - bellQ
        loss = (difference_q_value ** 2).mean()

        # Use the loss to update the online network's weights.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Return a plain float so callers can accumulate and log it without
        # holding on to tensors.
        return loss.item(), state_q_val_max

    def storeResults(self, state, action, reward, nextState, done):
        """Append one transition to the replay buffer.

        A leading axis of size 1 is added to both states so that sampled
        transitions can be concatenated directly into a batch.
        """
        self.replay_buffer.append([state[None, :], action, reward, nextState[None, :], done])

    def updateEpsilon(self):
        """Decay epsilon by ``EPSILON_DECAY`` while it is still above ``EPSILON_MIN``."""
        if self.epsilon > self.EPSILON_MIN:
            self.epsilon *= self.EPSILON_DECAY


In [9]:
env = gym.make('BreakoutDeterministic-v4')
agent = DDDQNAgent(env)
rolling_100 = deque(maxlen=100)  # rewards of the most recent 100 episodes
total_iter = 1

for episode in range(100000):  # Upper bound on the total number of episodes.

    # Grab the first frame and stack four copies of it. Stacking frames is standard
    # practice for Atari games because it lets the network infer velocity.
    state = env.reset()
    state = agent.preProcess(state)
    state = np.stack((state, state, state, state))

    # Per-episode totals
    Max_Qs = 0
    episode_reward = 0
    episode_loss = 0

    for step in range(100000):  # Very generous cap on the number of steps in one episode.

        # Take an action, observe the environment's response, and store the
        # transition for experience replay. The newest frame goes first in the stack.
        action = agent.act(state)
        next_state, reward, dead, _ = env.step(action)
        next_state = agent.preProcess(next_state)
        next_state = np.stack((next_state, state[0], state[1], state[2]))
        agent.storeResults(state, action, reward, next_state, dead)
        state = next_state
        episode_reward += reward

        # Update the agent's weights. float() keeps the running total a plain
        # number even if train() hands back a 0-d tensor.
        loss, max_q = agent.train()
        episode_loss += float(loss)
        Max_Qs += max_q

        total_iter += 1

        # Decay epsilon as training progresses (exploration/exploitation trade-off).
        if total_iter % 1000 == 0:
            agent.updateEpsilon()

        if dead:
            # Episode has ended.

            # Get episode statistics ready. `step` is zero-based, so the episode
            # took step + 1 steps; this also avoids dividing by zero when the
            # episode ends on its very first step.
            rolling_100.append(episode_reward)
            rolling_avg = np.mean(rolling_100)
            avg_max_q_val = Max_Qs / (step + 1)

            # Log for monitoring
            print(f"Current Episode = {episode},  Episode Reward = {episode_reward}, Rolling Average = {rolling_avg}, Current Epsilon = {agent.epsilon}, Total Step = {total_iter}")

            # Send to WandB for remote monitoring and summary graphing.
            wandb.log({
                'Episode': episode,
                "Average reward": rolling_avg,
                "Total Loss": episode_loss,
                "Average maxQ": round(avg_max_q_val, 2),
                "total_step": total_iter,
            })

            # Save the model weights every 20 episodes to store progress / use later to play.
            if episode % 20 == 0:
                file_name = "./DQNBreakOut-" + str(episode) + "-" + str(round(rolling_avg)) + '.pkl'
                torch.save(agent.main_model.state_dict(), file_name)

            # Sync the target network with the online network at the end of every episode.
            agent.target_model.load_state_dict(agent.main_model.state_dict())

            break


Current Episode = 0,  Episode Reward = 3.0, Rolling Average = 3.0, Current Epsilon = 1, Total Step = 263
Current Episode = 1,  Episode Reward = 1.0, Rolling Average = 2.0, Current Epsilon = 1, Total Step = 430
Current Episode = 2,  Episode Reward = 0.0, Rolling Average = 1.3333333333333333, Current Epsilon = 1, Total Step = 564
Current Episode = 3,  Episode Reward = 0.0, Rolling Average = 1.0, Current Epsilon = 1, Total Step = 702
Current Episode = 4,  Episode Reward = 0.0, Rolling Average = 0.8, Current Epsilon = 1, Total Step = 837
Current Episode = 5,  Episode Reward = 0.0, Rolling Average = 0.6666666666666666, Current Epsilon = 1, Total Step = 978
Current Episode = 6,  Episode Reward = 3.0, Rolling Average = 1.0, Current Epsilon = 0.99, Total Step = 1224
Current Episode = 7,  Episode Reward = 3.0, Rolling Average = 1.25, Current Epsilon = 0.99, Total Step = 1489
Current Episode = 8,  Episode Reward = 0.0, Rolling Average = 1.1111111111111112, Current Epsilon = 0.99, Total Step = 16

KeyboardInterrupt: ignored

Acknowledgements:

  Playing Atari with Deep Reinforcement Learning - DeepMind - Available at: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

  Official PyTorch DQN tutorial - Available at: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
  
  Neural Network and Hyperparameters informed by:
  https://lzzmm.github.io/2021/11/05/breakout/

  Adaptation of model to breakout learned from:

  https://github.com/AdrianHsu/breakout-Deep-Q-Network

  https://keras.io/examples/rl/deep_q_network_breakout/

  https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26

  Use of pytorch in this environment informed by:

  https://www.mlq.ai/deep-reinforcement-learning-pytorch-implementation/

  https://github.com/iKintosh/DQN-breakout-Pytorch

  https://github.com/bhctsntrk/OpenAIPong-DQN

  https://github.com/jasonbian97/Deep-Q-Learning-Atari-Pytorch


  Prioritised replay researched from (Not able to implement):

  https://github.com/sfyzsr/Reinforcement-Learning-for-Atari

