# Deep Q-Learning 

Install dependencies for AI gym to run properly (shouldn't take more than a minute). If running on google cloud or running locally, only need to run once. Colab may require installing everytime the vm shuts down.

In [None]:
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg

In [None]:
!pip3 install --upgrade setuptools --user
!pip3 install ez_setup 
!pip3 install gym[atari] 
!pip3 install gym[accept-rom-license] 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *

import matplotlib.pyplot as plt

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://www.gymlibrary.dev/environments/atari/breakout/. 

In breakout, we will use 3 actions "fire", "left", and "right". "fire" is only used to reset the game when a life is lost, "left" moves the agent left and "right" moves the agent right.

In [3]:
env = gym.make('BreakoutDeterministic-v4')
state = env.reset()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
number_lives = find_max_lives(env)
state_size = env.observation_space.shape
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [5]:
double_dqn = True # set to True if using double DQN agent

if double_dqn:
    from agent_double import Agent
else:
    from agent import Agent

agent = Agent(action_size)
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0

### Main Training Loop

In this training loop, we do not render the screen because it slows down training signficantly. To watch the agent play the game, run the code in next section "Visualize Agent Performance"

In [None]:
rewards, episodes = [], []
best_eval_reward = 0
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state, HISTORY_SIZE)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life
        if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        next_state, reward, done, _, info = env.step(action + 1)
        
        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['lives'])

        life = info['lives']
        r = reward

        # Store the transition in memory 
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training after random sample generation
        if(frame >= train_frame):
            agent.train_policy_net(frame)
            # Update the target network only for Double DQN only
            if double_dqn and (frame % update_target_network_frequency)== 0:
                agent.update_target_net()
        score += reward
        history[:4, :, :] = history[1:, :, :]
            
        if done:
            evaluation_reward.append(score)
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards') 
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_double_dqn.png") # save graph for training visualization
            
            # every episode, plot the play time
            print("epis:", e, "  score:", score, "  mem len:",
                  len(agent.memory), "  epsilon:", round(agent.epsilon, 4), "   steps:", step,
                  "   lr:", round(agent.optimizer.param_groups[0]['lr'], 7), "    reward:", round(np.mean(evaluation_reward), 2))

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if np.mean(evaluation_reward) > 5 and np.mean(evaluation_reward) > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_double_dqn.pth")
                best_eval_reward = np.mean(evaluation_reward)


  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


epis: 0   score: 1.0   mem len: 151   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.0
epis: 1   score: 1.0   mem len: 322   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.0
epis: 2   score: 0.0   mem len: 445   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 0.67
epis: 3   score: 3.0   mem len: 691   epsilon: 1.0    steps: 246    lr: 0.0001     reward: 1.25
epis: 4   score: 0.0   mem len: 814   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.0
epis: 5   score: 0.0   mem len: 937   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 0.83
epis: 6   score: 3.0   mem len: 1163   epsilon: 1.0    steps: 226    lr: 0.0001     reward: 1.14
epis: 7   score: 1.0   mem len: 1332   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.12
epis: 8   score: 3.0   mem len: 1598   epsilon: 1.0    steps: 266    lr: 0.0001     reward: 1.33
epis: 9   score: 0.0   mem len: 1720   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.2
epis: 10   score: 1.0   mem len: 1871   

epis: 84   score: 2.0   mem len: 16851   epsilon: 1.0    steps: 200    lr: 0.0001     reward: 1.81
epis: 85   score: 2.0   mem len: 17070   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.81
epis: 86   score: 1.0   mem len: 17239   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.8
epis: 87   score: 0.0   mem len: 17362   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.78
epis: 88   score: 4.0   mem len: 17603   epsilon: 1.0    steps: 241    lr: 0.0001     reward: 1.81
epis: 89   score: 2.0   mem len: 17819   epsilon: 1.0    steps: 216    lr: 0.0001     reward: 1.81
epis: 90   score: 1.0   mem len: 17990   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.8
epis: 91   score: 0.0   mem len: 18112   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.78
epis: 92   score: 2.0   mem len: 18331   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.78
epis: 93   score: 1.0   mem len: 18482   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.78
epis: 94   s

epis: 167   score: 1.0   mem len: 31955   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.6
epis: 168   score: 0.0   mem len: 32077   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.58
epis: 169   score: 2.0   mem len: 32296   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.57
epis: 170   score: 1.0   mem len: 32465   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.52
epis: 171   score: 2.0   mem len: 32680   epsilon: 1.0    steps: 215    lr: 0.0001     reward: 1.53
epis: 172   score: 0.0   mem len: 32803   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.5
epis: 173   score: 2.0   mem len: 33023   epsilon: 1.0    steps: 220    lr: 0.0001     reward: 1.49
epis: 174   score: 2.0   mem len: 33220   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.5
epis: 175   score: 3.0   mem len: 33445   epsilon: 1.0    steps: 225    lr: 0.0001     reward: 1.51
epis: 176   score: 2.0   mem len: 33642   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.53
epi

epis: 249   score: 2.0   mem len: 46928   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.44
epis: 250   score: 3.0   mem len: 47155   epsilon: 1.0    steps: 227    lr: 0.0001     reward: 1.45
epis: 251   score: 1.0   mem len: 47305   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.46
epis: 252   score: 1.0   mem len: 47474   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.46
epis: 253   score: 1.0   mem len: 47643   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.45
epis: 254   score: 1.0   mem len: 47812   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.46
epis: 255   score: 0.0   mem len: 47935   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.44
epis: 256   score: 2.0   mem len: 48135   epsilon: 1.0    steps: 200    lr: 0.0001     reward: 1.46
epis: 257   score: 2.0   mem len: 48333   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.47
epis: 258   score: 1.0   mem len: 48503   epsilon: 1.0    steps: 170    lr: 0.0001     reward: 1.47


epis: 331   score: 2.0   mem len: 61770   epsilon: 1.0    steps: 223    lr: 0.0001     reward: 1.37
epis: 332   score: 0.0   mem len: 61892   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.37
epis: 333   score: 2.0   mem len: 62090   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.38
epis: 334   score: 2.0   mem len: 62287   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.39
epis: 335   score: 0.0   mem len: 62410   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.36
epis: 336   score: 1.0   mem len: 62561   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.37
epis: 337   score: 5.0   mem len: 62887   epsilon: 1.0    steps: 326    lr: 0.0001     reward: 1.41
epis: 338   score: 1.0   mem len: 63056   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.4
epis: 339   score: 1.0   mem len: 63206   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.4
epis: 340   score: 0.0   mem len: 63329   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.39
ep

epis: 414   score: 1.0   mem len: 76806   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.42
epis: 415   score: 5.0   mem len: 77153   epsilon: 1.0    steps: 347    lr: 0.0001     reward: 1.47
epis: 416   score: 0.0   mem len: 77276   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.47
epis: 417   score: 4.0   mem len: 77573   epsilon: 1.0    steps: 297    lr: 0.0001     reward: 1.48
epis: 418   score: 1.0   mem len: 77741   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.48
epis: 419   score: 0.0   mem len: 77864   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.44
epis: 420   score: 4.0   mem len: 78157   epsilon: 1.0    steps: 293    lr: 0.0001     reward: 1.48
epis: 421   score: 0.0   mem len: 78280   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.43
epis: 422   score: 0.0   mem len: 78403   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.43
epis: 423   score: 0.0   mem len: 78526   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
e

epis: 497   score: 1.0   mem len: 92097   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.56
epis: 498   score: 0.0   mem len: 92220   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.54
epis: 499   score: 2.0   mem len: 92418   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.56
epis: 500   score: 1.0   mem len: 92587   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.54
epis: 501   score: 4.0   mem len: 92882   epsilon: 1.0    steps: 295    lr: 0.0001     reward: 1.57
epis: 502   score: 0.0   mem len: 93005   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.55
epis: 503   score: 2.0   mem len: 93203   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.56
epis: 504   score: 0.0   mem len: 93326   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.56
epis: 505   score: 1.0   mem len: 93495   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.57
epis: 506   score: 2.0   mem len: 93677   epsilon: 1.0    steps: 182    lr: 0.0001     reward: 1.52


  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()
  next_q_values[mask] = self.target_net(non_final_next_states).gather(1, next_action.unsqueeze(1)).cuda()[mask]


epis: 541   score: 0.0   mem len: 100100   epsilon: 0.9998    steps: 123    lr: 0.0001     reward: 1.52
epis: 542   score: 1.0   mem len: 100251   epsilon: 0.9995    steps: 151    lr: 0.0001     reward: 1.52
epis: 543   score: 0.0   mem len: 100374   epsilon: 0.9993    steps: 123    lr: 0.0001     reward: 1.5
epis: 544   score: 1.0   mem len: 100525   epsilon: 0.999    steps: 151    lr: 0.0001     reward: 1.51
epis: 545   score: 1.0   mem len: 100693   epsilon: 0.9986    steps: 168    lr: 0.0001     reward: 1.51
epis: 546   score: 1.0   mem len: 100862   epsilon: 0.9983    steps: 169    lr: 0.0001     reward: 1.49
epis: 547   score: 3.0   mem len: 101108   epsilon: 0.9978    steps: 246    lr: 0.0001     reward: 1.51
epis: 548   score: 1.0   mem len: 101277   epsilon: 0.9975    steps: 169    lr: 0.0001     reward: 1.48
epis: 549   score: 1.0   mem len: 101428   epsilon: 0.9972    steps: 151    lr: 0.0001     reward: 1.48
epis: 550   score: 1.0   mem len: 101579   epsilon: 0.9969    step

epis: 620   score: 3.0   mem len: 114181   epsilon: 0.9719    steps: 244    lr: 0.0001     reward: 1.47
epis: 621   score: 0.0   mem len: 114304   epsilon: 0.9717    steps: 123    lr: 0.0001     reward: 1.47
epis: 622   score: 0.0   mem len: 114427   epsilon: 0.9714    steps: 123    lr: 0.0001     reward: 1.46
epis: 623   score: 3.0   mem len: 114677   epsilon: 0.9709    steps: 250    lr: 0.0001     reward: 1.47
epis: 624   score: 4.0   mem len: 114977   epsilon: 0.9703    steps: 300    lr: 0.0001     reward: 1.51
epis: 625   score: 2.0   mem len: 115178   epsilon: 0.9699    steps: 201    lr: 0.0001     reward: 1.52
epis: 626   score: 0.0   mem len: 115301   epsilon: 0.9697    steps: 123    lr: 0.0001     reward: 1.5
epis: 627   score: 0.0   mem len: 115424   epsilon: 0.9695    steps: 123    lr: 0.0001     reward: 1.48
epis: 628   score: 1.0   mem len: 115594   epsilon: 0.9691    steps: 170    lr: 0.0001     reward: 1.46
epis: 629   score: 4.0   mem len: 115888   epsilon: 0.9685    ste

epis: 699   score: 0.0   mem len: 129078   epsilon: 0.9424    steps: 122    lr: 0.0001     reward: 1.54
epis: 700   score: 2.0   mem len: 129277   epsilon: 0.942    steps: 199    lr: 0.0001     reward: 1.55
epis: 701   score: 0.0   mem len: 129400   epsilon: 0.9418    steps: 123    lr: 0.0001     reward: 1.55
epis: 702   score: 0.0   mem len: 129523   epsilon: 0.9415    steps: 123    lr: 0.0001     reward: 1.54
epis: 703   score: 1.0   mem len: 129674   epsilon: 0.9412    steps: 151    lr: 0.0001     reward: 1.55
epis: 704   score: 0.0   mem len: 129797   epsilon: 0.941    steps: 123    lr: 0.0001     reward: 1.55
epis: 705   score: 1.0   mem len: 129948   epsilon: 0.9407    steps: 151    lr: 0.0001     reward: 1.55
epis: 706   score: 1.0   mem len: 130118   epsilon: 0.9404    steps: 170    lr: 0.0001     reward: 1.51
epis: 707   score: 1.0   mem len: 130289   epsilon: 0.94    steps: 171    lr: 0.0001     reward: 1.47
epis: 708   score: 5.0   mem len: 130609   epsilon: 0.9394    steps:

epis: 778   score: 0.0   mem len: 143503   epsilon: 0.9139    steps: 123    lr: 0.0001     reward: 1.64
epis: 779   score: 3.0   mem len: 143749   epsilon: 0.9134    steps: 246    lr: 0.0001     reward: 1.62
epis: 780   score: 1.0   mem len: 143918   epsilon: 0.913    steps: 169    lr: 0.0001     reward: 1.62
epis: 781   score: 2.0   mem len: 144136   epsilon: 0.9126    steps: 218    lr: 0.0001     reward: 1.62
epis: 782   score: 1.0   mem len: 144287   epsilon: 0.9123    steps: 151    lr: 0.0001     reward: 1.62
epis: 783   score: 3.0   mem len: 144533   epsilon: 0.9118    steps: 246    lr: 0.0001     reward: 1.63
epis: 784   score: 3.0   mem len: 144760   epsilon: 0.9114    steps: 227    lr: 0.0001     reward: 1.62
epis: 785   score: 4.0   mem len: 145055   epsilon: 0.9108    steps: 295    lr: 0.0001     reward: 1.64
epis: 786   score: 3.0   mem len: 145300   epsilon: 0.9103    steps: 245    lr: 0.0001     reward: 1.65
epis: 787   score: 0.0   mem len: 145422   epsilon: 0.9101    ste

epis: 857   score: 0.0   mem len: 158194   epsilon: 0.8848    steps: 123    lr: 0.0001     reward: 1.55
epis: 858   score: 0.0   mem len: 158316   epsilon: 0.8845    steps: 122    lr: 0.0001     reward: 1.55
epis: 859   score: 0.0   mem len: 158439   epsilon: 0.8843    steps: 123    lr: 0.0001     reward: 1.53
epis: 860   score: 3.0   mem len: 158684   epsilon: 0.8838    steps: 245    lr: 0.0001     reward: 1.53
epis: 861   score: 2.0   mem len: 158866   epsilon: 0.8834    steps: 182    lr: 0.0001     reward: 1.54
epis: 862   score: 3.0   mem len: 159115   epsilon: 0.883    steps: 249    lr: 0.0001     reward: 1.55
epis: 863   score: 3.0   mem len: 159384   epsilon: 0.8824    steps: 269    lr: 0.0001     reward: 1.58
epis: 864   score: 0.0   mem len: 159506   epsilon: 0.8822    steps: 122    lr: 0.0001     reward: 1.56
epis: 865   score: 1.0   mem len: 159674   epsilon: 0.8818    steps: 168    lr: 0.0001     reward: 1.57
epis: 866   score: 2.0   mem len: 159893   epsilon: 0.8814    ste

epis: 936   score: 2.0   mem len: 174699   epsilon: 0.8521    steps: 182    lr: 0.0001     reward: 2.1
epis: 937   score: 3.0   mem len: 174946   epsilon: 0.8516    steps: 247    lr: 0.0001     reward: 2.1
epis: 938   score: 2.0   mem len: 175147   epsilon: 0.8512    steps: 201    lr: 0.0001     reward: 2.11
epis: 939   score: 3.0   mem len: 175391   epsilon: 0.8507    steps: 244    lr: 0.0001     reward: 2.13
epis: 940   score: 7.0   mem len: 175801   epsilon: 0.8499    steps: 410    lr: 0.0001     reward: 2.2
epis: 941   score: 5.0   mem len: 176114   epsilon: 0.8493    steps: 313    lr: 0.0001     reward: 2.21
epis: 942   score: 2.0   mem len: 176336   epsilon: 0.8489    steps: 222    lr: 0.0001     reward: 2.22
epis: 943   score: 0.0   mem len: 176459   epsilon: 0.8486    steps: 123    lr: 0.0001     reward: 2.21
epis: 944   score: 2.0   mem len: 176639   epsilon: 0.8483    steps: 180    lr: 0.0001     reward: 2.2
epis: 945   score: 3.0   mem len: 176883   epsilon: 0.8478    steps:

epis: 1015   score: 2.0   mem len: 191983   epsilon: 0.8179    steps: 200    lr: 0.0001     reward: 2.35
epis: 1016   score: 1.0   mem len: 192153   epsilon: 0.8175    steps: 170    lr: 0.0001     reward: 2.35
epis: 1017   score: 1.0   mem len: 192321   epsilon: 0.8172    steps: 168    lr: 0.0001     reward: 2.35
epis: 1018   score: 4.0   mem len: 192631   epsilon: 0.8166    steps: 310    lr: 0.0001     reward: 2.35
epis: 1019   score: 4.0   mem len: 192947   epsilon: 0.816    steps: 316    lr: 0.0001     reward: 2.35
epis: 1020   score: 1.0   mem len: 193118   epsilon: 0.8156    steps: 171    lr: 0.0001     reward: 2.36
epis: 1021   score: 3.0   mem len: 193345   epsilon: 0.8152    steps: 227    lr: 0.0001     reward: 2.36
epis: 1022   score: 1.0   mem len: 193496   epsilon: 0.8149    steps: 151    lr: 0.0001     reward: 2.35
epis: 1023   score: 2.0   mem len: 193694   epsilon: 0.8145    steps: 198    lr: 0.0001     reward: 2.34
epis: 1024   score: 2.0   mem len: 193892   epsilon: 0.8

epis: 1094   score: 2.0   mem len: 209306   epsilon: 0.7836    steps: 198    lr: 4e-05     reward: 2.47
epis: 1095   score: 1.0   mem len: 209457   epsilon: 0.7833    steps: 151    lr: 4e-05     reward: 2.44
epis: 1096   score: 1.0   mem len: 209626   epsilon: 0.7829    steps: 169    lr: 4e-05     reward: 2.45
epis: 1097   score: 7.0   mem len: 210039   epsilon: 0.7821    steps: 413    lr: 4e-05     reward: 2.52
epis: 1098   score: 1.0   mem len: 210209   epsilon: 0.7818    steps: 170    lr: 4e-05     reward: 2.53
epis: 1099   score: 3.0   mem len: 210439   epsilon: 0.7813    steps: 230    lr: 4e-05     reward: 2.49
epis: 1100   score: 3.0   mem len: 210684   epsilon: 0.7808    steps: 245    lr: 4e-05     reward: 2.5
epis: 1101   score: 1.0   mem len: 210852   epsilon: 0.7805    steps: 168    lr: 4e-05     reward: 2.49
epis: 1102   score: 3.0   mem len: 211100   epsilon: 0.78    steps: 248    lr: 4e-05     reward: 2.5
epis: 1103   score: 2.0   mem len: 211300   epsilon: 0.7796    steps

epis: 1173   score: 4.0   mem len: 227927   epsilon: 0.7467    steps: 274    lr: 4e-05     reward: 2.82
epis: 1174   score: 2.0   mem len: 228126   epsilon: 0.7463    steps: 199    lr: 4e-05     reward: 2.82
epis: 1175   score: 5.0   mem len: 228472   epsilon: 0.7456    steps: 346    lr: 4e-05     reward: 2.84
epis: 1176   score: 3.0   mem len: 228716   epsilon: 0.7451    steps: 244    lr: 4e-05     reward: 2.83
epis: 1177   score: 4.0   mem len: 229012   epsilon: 0.7446    steps: 296    lr: 4e-05     reward: 2.84
epis: 1178   score: 5.0   mem len: 229354   epsilon: 0.7439    steps: 342    lr: 4e-05     reward: 2.85
epis: 1179   score: 4.0   mem len: 229630   epsilon: 0.7433    steps: 276    lr: 4e-05     reward: 2.88
epis: 1180   score: 3.0   mem len: 229856   epsilon: 0.7429    steps: 226    lr: 4e-05     reward: 2.9
epis: 1181   score: 4.0   mem len: 230154   epsilon: 0.7423    steps: 298    lr: 4e-05     reward: 2.91
epis: 1182   score: 3.0   mem len: 230400   epsilon: 0.7418    st

# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
torch.save(agent.policy_net, "./save_model/breakout_double_dqn_latest.pth")

In [None]:
from gym.wrappers import RecordVideo # If importing monitor raises issues, try using `from gym.wrappers import RecordVideo`
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("%s | Step: %d %s" % ("Agent Playing",step, info))
    plt.axis('off')

    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())
    
# Recording the game and replaying the game afterwards
def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
    

def wrap_env(env):
    env = RecordVideo(env, './video')
    return env

In [None]:
display = Display(visible=0, size=(300, 200))
display.start()

# Load agent
# agent.load_policy_net("./save_model/breakout_dqn.pth")
agent.epsilon = 0.0 # Set agent to only exploit the best action

env = gym.make('BreakoutDeterministic-v4')
env = wrap_env(env)

done = False
score = 0
step = 0
state = env.reset()
next_state = state
life = number_lives
history = np.zeros([5, 84, 84], dtype=np.uint8)
get_init_state(history, state)

while not done:
    
    # Render breakout
    env.render()
#     show_state(env,step) # uncommenting this provides another way to visualize the game

    step += 1
    frame += 1

    # Perform a fire action if ball is no longer on screen
    if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
        action = 0
    else:
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
    state = next_state
    
    next_state, reward, done, _, info = env.step(action + 1)
        
    frame_next_state = get_frame(next_state)
    history[4, :, :] = frame_next_state
    terminal_state = check_live(life, info['ale.lives'])
        
    life = info['ale.lives']
    r = np.clip(reward, -1, 1) 
    r = reward

    # Store the transition in memory 
    agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
    # Start training after random sample generation
    score += reward
    
    history[:4, :, :] = history[1:, :, :]
env.close()
show_video()
display.stop()