# Deep Q-Learning 

Install the dependencies AI Gym needs to run properly (this shouldn't take more than a minute). If you are running on Google Cloud or locally, you only need to run this once. Colab may require reinstalling every time the VM shuts down.

In [None]:
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg

In [None]:
!pip3 install --upgrade setuptools --user
!pip3 install ez_setup 
!pip3 install gym[atari] 
!pip3 install gym[accept-rom-license] 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay, as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the imported
# project modules (agent.py, model.py, utils.py, config.py) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import os
import random
import sys
from collections import deque
from copy import deepcopy
from datetime import datetime

import gym
import matplotlib.pyplot as plt
import numpy as np
import pylab
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://www.gymlibrary.dev/environments/atari/breakout/.

In breakout, we will use 3 actions "fire", "left", and "right". "fire" is only used to reset the game when a life is lost, "left" moves the agent left and "right" moves the agent right.

In [3]:
# Create the Breakout environment used for training.
env = gym.make('BreakoutDeterministic-v4')
# Initial reset. The training loop below calls env.reset() again at the start of
# every episode, so this value of `state` is overwritten before it is used there.
# NOTE(review): with the 5-value env.step() API used later in this notebook,
# reset() presumably returns an (observation, info) pair rather than a bare frame - confirm.
state = env.reset()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
# Life count reported by utils.find_max_lives; the loops below use it as the
# starting value of `life` for every episode.
number_lives = find_max_lives(env)
# Shape of a raw observation as declared by the environment.
state_size = env.observation_space.shape
# The agent picks one of 3 actions (indices 0-2). The loops below pass `action + 1`
# to env.step(), so agent index 0 is the one used for "fire".
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__, and the corresponding neural network is defined in __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [5]:
# Selects which agent implementation is imported below; the training loop also
# reads this flag to decide whether to call agent.update_target_net().
double_dqn = False # set to True if using double DQN agent

# Both modules expose a class named `Agent`, so the rest of the notebook is
# independent of which variant was chosen.
if double_dqn:
    from agent_double import Agent
else:
    from agent import Agent

agent = Agent(action_size)
# Rolling window of recent episode scores; its mean is the "Evaluation Reward".
# `evaluation_reward_length` is not defined in this notebook - presumably it comes
# from `from config import *`.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# Global count of environment steps taken so far (incremented in the training loop).
frame = 0
# NOTE(review): `memory_size` is not read anywhere in the visible notebook cells.
memory_size = 0

### Main Training Loop

In this training loop, we do not render the screen because it slows down training significantly. To watch the agent play the game, run the code in the next section, "Visualize Agent Performance".

In [None]:
# Main DQN training loop: play EPISODES games, store every transition in the
# agent's replay memory, train once `train_frame` steps have been collected, and
# after each episode log progress, refresh the reward plot and checkpoint the
# best-scoring policy network.
rewards, episodes = [], []
best_eval_reward = 0

# Create the output folders up front so that savefig()/torch.save() cannot fail
# with a missing-directory error partway through a long training run.
os.makedirs("./save_graph", exist_ok=True)
os.makedirs("./save_model", exist_ok=True)

for e in range(EPISODES):
    done = False
    score = 0

    # Rows 0-3 hold the frame stack fed to the network; row 4 receives the newest frame.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state, HISTORY_SIZE)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life
        # (detected as two consecutive frames being identical in their first 189 rows).
        #
        # On step 2 `state` is still the raw env.reset() result, which appears to be an
        # (observation, info) tuple under this gym API. Comparing an ndarray against it
        # only "worked" through a deprecated NumPy fallback that yielded a single False
        # (hence the warnings this cell used to print) and therefore always selected
        # FIRE. The isinstance guard keeps that outcome explicit, without the warning
        # and without raising on newer NumPy releases.
        if step > 1 and (
            not isinstance(state, np.ndarray)
            or len(np.unique(next_state[:189] == state[:189])) < 2
        ):
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        # Agent actions 0..2 are shifted by one to obtain the environment action.
        next_state, reward, terminated, truncated, info = env.step(action + 1)
        # End the episode on a time-limit truncation as well as on game over, so a
        # truncated environment is never stepped again.
        done = terminated or truncated

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['lives'])

        life = info['lives']

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, reward, terminal_state)
        # Start training after random sample generation
        if frame >= train_frame:
            agent.train_policy_net(frame)
            # Update the target network only for Double DQN
            if double_dqn and (frame % update_target_network_frequency) == 0:
                agent.update_target_net()
        score += reward
        # Slide the window: rows 0-3 now hold the four most recent frames.
        history[:4, :, :] = history[1:, :, :]

        if done:
            evaluation_reward.append(score)
            mean_eval_reward = np.mean(evaluation_reward)
            rewards.append(mean_eval_reward)
            episodes.append(e)
            # Clear the figure first: otherwise every episode adds another full line
            # to the same axes and plotting/saving gets slower as training goes on.
            pylab.clf()
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards')
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_dqn.png") # save graph for training visualization

            # every episode, log the training progress
            print("epis:", e, "  score:", score, "  mem len:",
                  len(agent.memory), "  epsilon:", round(agent.epsilon, 4), "   steps:", step,
                  "   lr:", round(agent.optimizer.param_groups[0]['lr'], 7), "    reward:", round(mean_eval_reward, 2))

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if mean_eval_reward > 5 and mean_eval_reward > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_dqn.pth")
                best_eval_reward = mean_eval_reward


  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


epis: 0   score: 0.0   mem len: 122   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 0.0
epis: 1   score: 2.0   mem len: 339   epsilon: 1.0    steps: 217    lr: 0.0001     reward: 1.0
epis: 2   score: 3.0   mem len: 567   epsilon: 1.0    steps: 228    lr: 0.0001     reward: 1.67
epis: 3   score: 2.0   mem len: 766   epsilon: 1.0    steps: 199    lr: 0.0001     reward: 1.75
epis: 4   score: 1.0   mem len: 917   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.6
epis: 5   score: 1.0   mem len: 1089   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.5
epis: 6   score: 2.0   mem len: 1271   epsilon: 1.0    steps: 182    lr: 0.0001     reward: 1.57
epis: 7   score: 1.0   mem len: 1439   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.5
epis: 8   score: 0.0   mem len: 1562   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.33
epis: 9   score: 0.0   mem len: 1685   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.2
epis: 10   score: 1.0   mem len: 1836   e

epis: 84   score: 0.0   mem len: 14894   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.32
epis: 85   score: 1.0   mem len: 15045   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.31
epis: 86   score: 1.0   mem len: 15197   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.31
epis: 87   score: 4.0   mem len: 15494   epsilon: 1.0    steps: 297    lr: 0.0001     reward: 1.34
epis: 88   score: 2.0   mem len: 15713   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.35
epis: 89   score: 1.0   mem len: 15864   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.34
epis: 90   score: 1.0   mem len: 16014   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.34
epis: 91   score: 1.0   mem len: 16183   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.34
epis: 92   score: 0.0   mem len: 16306   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.32
epis: 93   score: 2.0   mem len: 16524   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 1.33
epis: 94  

epis: 167   score: 1.0   mem len: 29929   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.39
epis: 168   score: 2.0   mem len: 30126   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.38
epis: 169   score: 2.0   mem len: 30324   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.38
epis: 170   score: 4.0   mem len: 30600   epsilon: 1.0    steps: 276    lr: 0.0001     reward: 1.42
epis: 171   score: 0.0   mem len: 30723   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.41
epis: 172   score: 1.0   mem len: 30892   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.39
epis: 173   score: 2.0   mem len: 31110   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 1.41
epis: 174   score: 0.0   mem len: 31233   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.39
epis: 175   score: 1.0   mem len: 31384   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.4
epis: 176   score: 2.0   mem len: 31602   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 1.41
e

epis: 249   score: 1.0   mem len: 44668   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.4
epis: 250   score: 0.0   mem len: 44791   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
epis: 251   score: 0.0   mem len: 44914   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.37
epis: 252   score: 2.0   mem len: 45112   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.38
epis: 253   score: 0.0   mem len: 45235   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.36
epis: 254   score: 2.0   mem len: 45433   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.36
epis: 255   score: 0.0   mem len: 45556   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.35
epis: 256   score: 1.0   mem len: 45724   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.35
epis: 257   score: 0.0   mem len: 45847   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.31
epis: 258   score: 0.0   mem len: 45970   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.3
epi

epis: 332   score: 2.0   mem len: 59786   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 1.39
epis: 333   score: 3.0   mem len: 60050   epsilon: 1.0    steps: 264    lr: 0.0001     reward: 1.42
epis: 334   score: 3.0   mem len: 60297   epsilon: 1.0    steps: 247    lr: 0.0001     reward: 1.43
epis: 335   score: 0.0   mem len: 60419   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.43
epis: 336   score: 1.0   mem len: 60572   epsilon: 1.0    steps: 153    lr: 0.0001     reward: 1.43
epis: 337   score: 2.0   mem len: 60792   epsilon: 1.0    steps: 220    lr: 0.0001     reward: 1.44
epis: 338   score: 0.0   mem len: 60915   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.44
epis: 339   score: 0.0   mem len: 61037   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.44
epis: 340   score: 2.0   mem len: 61235   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.45
epis: 341   score: 1.0   mem len: 61406   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.46


epis: 414   score: 2.0   mem len: 75112   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.7
epis: 415   score: 1.0   mem len: 75281   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.69
epis: 416   score: 2.0   mem len: 75479   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.66
epis: 417   score: 3.0   mem len: 75728   epsilon: 1.0    steps: 249    lr: 0.0001     reward: 1.67
epis: 418   score: 0.0   mem len: 75850   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.64
epis: 419   score: 0.0   mem len: 75973   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.62
epis: 420   score: 2.0   mem len: 76171   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.62
epis: 421   score: 0.0   mem len: 76294   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.58
epis: 422   score: 0.0   mem len: 76416   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.56
epis: 423   score: 3.0   mem len: 76642   epsilon: 1.0    steps: 226    lr: 0.0001     reward: 1.58
e

epis: 497   score: 3.0   mem len: 90654   epsilon: 1.0    steps: 246    lr: 0.0001     reward: 1.63
epis: 498   score: 1.0   mem len: 90806   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.61
epis: 499   score: 2.0   mem len: 91004   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.63
epis: 500   score: 5.0   mem len: 91311   epsilon: 1.0    steps: 307    lr: 0.0001     reward: 1.66
epis: 501   score: 0.0   mem len: 91434   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.66
epis: 502   score: 2.0   mem len: 91653   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.66
epis: 503   score: 3.0   mem len: 91878   epsilon: 1.0    steps: 225    lr: 0.0001     reward: 1.66
epis: 504   score: 1.0   mem len: 92046   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.64
epis: 505   score: 2.0   mem len: 92244   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.66
epis: 506   score: 0.0   mem len: 92367   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.62


  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()
  next_state_values[mask] = self.policy_net(non_final_next_states).max(1)[0].cuda()[mask]


epis: 543   score: 2.0   mem len: 100031   epsilon: 0.9999    steps: 198    lr: 0.0001     reward: 1.8
epis: 544   score: 1.0   mem len: 100202   epsilon: 0.9996    steps: 171    lr: 0.0001     reward: 1.81
epis: 545   score: 0.0   mem len: 100325   epsilon: 0.9994    steps: 123    lr: 0.0001     reward: 1.81
epis: 546   score: 1.0   mem len: 100475   epsilon: 0.9991    steps: 150    lr: 0.0001     reward: 1.82
epis: 547   score: 1.0   mem len: 100625   epsilon: 0.9988    steps: 150    lr: 0.0001     reward: 1.81
epis: 548   score: 2.0   mem len: 100824   epsilon: 0.9984    steps: 199    lr: 0.0001     reward: 1.81
epis: 549   score: 0.0   mem len: 100947   epsilon: 0.9981    steps: 123    lr: 0.0001     reward: 1.8
epis: 550   score: 1.0   mem len: 101116   epsilon: 0.9978    steps: 169    lr: 0.0001     reward: 1.79
epis: 551   score: 1.0   mem len: 101266   epsilon: 0.9975    steps: 150    lr: 0.0001     reward: 1.8
epis: 552   score: 4.0   mem len: 101564   epsilon: 0.9969    steps

epis: 622   score: 0.0   mem len: 114667   epsilon: 0.971    steps: 122    lr: 0.0001     reward: 1.65
epis: 623   score: 1.0   mem len: 114818   epsilon: 0.9707    steps: 151    lr: 0.0001     reward: 1.66
epis: 624   score: 4.0   mem len: 115112   epsilon: 0.9701    steps: 294    lr: 0.0001     reward: 1.7
epis: 625   score: 2.0   mem len: 115294   epsilon: 0.9697    steps: 182    lr: 0.0001     reward: 1.66
epis: 626   score: 2.0   mem len: 115476   epsilon: 0.9694    steps: 182    lr: 0.0001     reward: 1.64
epis: 627   score: 0.0   mem len: 115599   epsilon: 0.9691    steps: 123    lr: 0.0001     reward: 1.61
epis: 628   score: 2.0   mem len: 115799   epsilon: 0.9687    steps: 200    lr: 0.0001     reward: 1.58
epis: 629   score: 2.0   mem len: 115979   epsilon: 0.9684    steps: 180    lr: 0.0001     reward: 1.57
epis: 630   score: 1.0   mem len: 116151   epsilon: 0.968    steps: 172    lr: 0.0001     reward: 1.57
epis: 631   score: 0.0   mem len: 116273   epsilon: 0.9678    steps

epis: 701   score: 2.0   mem len: 128905   epsilon: 0.9428    steps: 218    lr: 0.0001     reward: 1.47
epis: 702   score: 2.0   mem len: 129103   epsilon: 0.9424    steps: 198    lr: 0.0001     reward: 1.49
epis: 703   score: 0.0   mem len: 129225   epsilon: 0.9421    steps: 122    lr: 0.0001     reward: 1.48
epis: 704   score: 2.0   mem len: 129423   epsilon: 0.9417    steps: 198    lr: 0.0001     reward: 1.47
epis: 705   score: 2.0   mem len: 129639   epsilon: 0.9413    steps: 216    lr: 0.0001     reward: 1.49
epis: 706   score: 3.0   mem len: 129868   epsilon: 0.9409    steps: 229    lr: 0.0001     reward: 1.51
epis: 707   score: 0.0   mem len: 129991   epsilon: 0.9406    steps: 123    lr: 0.0001     reward: 1.5
epis: 708   score: 2.0   mem len: 130189   epsilon: 0.9402    steps: 198    lr: 0.0001     reward: 1.48
epis: 709   score: 1.0   mem len: 130340   epsilon: 0.9399    steps: 151    lr: 0.0001     reward: 1.47
epis: 710   score: 1.0   mem len: 130509   epsilon: 0.9396    ste

epis: 781   score: 1.0   mem len: 142224   epsilon: 0.9164    steps: 169    lr: 0.0001     reward: 1.09
epis: 782   score: 0.0   mem len: 142346   epsilon: 0.9162    steps: 122    lr: 0.0001     reward: 1.08
epis: 783   score: 0.0   mem len: 142469   epsilon: 0.9159    steps: 123    lr: 0.0001     reward: 1.07
epis: 784   score: 0.0   mem len: 142592   epsilon: 0.9157    steps: 123    lr: 0.0001     reward: 1.05
epis: 785   score: 4.0   mem len: 142889   epsilon: 0.9151    steps: 297    lr: 0.0001     reward: 1.09
epis: 786   score: 0.0   mem len: 143012   epsilon: 0.9148    steps: 123    lr: 0.0001     reward: 1.08
epis: 787   score: 2.0   mem len: 143210   epsilon: 0.9144    steps: 198    lr: 0.0001     reward: 1.07
epis: 788   score: 2.0   mem len: 143407   epsilon: 0.9141    steps: 197    lr: 0.0001     reward: 1.08
epis: 789   score: 0.0   mem len: 143530   epsilon: 0.9138    steps: 123    lr: 0.0001     reward: 1.08
epis: 790   score: 2.0   mem len: 143746   epsilon: 0.9134    st

epis: 860   score: 3.0   mem len: 157574   epsilon: 0.886    steps: 247    lr: 0.0001     reward: 1.63
epis: 861   score: 0.0   mem len: 157697   epsilon: 0.8858    steps: 123    lr: 0.0001     reward: 1.59
epis: 862   score: 0.0   mem len: 157819   epsilon: 0.8855    steps: 122    lr: 0.0001     reward: 1.59
epis: 863   score: 2.0   mem len: 158017   epsilon: 0.8851    steps: 198    lr: 0.0001     reward: 1.59
epis: 864   score: 3.0   mem len: 158242   epsilon: 0.8847    steps: 225    lr: 0.0001     reward: 1.59
epis: 865   score: 2.0   mem len: 158440   epsilon: 0.8843    steps: 198    lr: 0.0001     reward: 1.6
epis: 866   score: 2.0   mem len: 158637   epsilon: 0.8839    steps: 197    lr: 0.0001     reward: 1.58
epis: 867   score: 0.0   mem len: 158759   epsilon: 0.8837    steps: 122    lr: 0.0001     reward: 1.57
epis: 868   score: 3.0   mem len: 158985   epsilon: 0.8832    steps: 226    lr: 0.0001     reward: 1.6
epis: 869   score: 1.0   mem len: 159154   epsilon: 0.8829    steps

epis: 939   score: 4.0   mem len: 172162   epsilon: 0.8571    steps: 296    lr: 0.0001     reward: 1.59
epis: 940   score: 0.0   mem len: 172285   epsilon: 0.8569    steps: 123    lr: 0.0001     reward: 1.58
epis: 941   score: 2.0   mem len: 172483   epsilon: 0.8565    steps: 198    lr: 0.0001     reward: 1.58
epis: 942   score: 1.0   mem len: 172655   epsilon: 0.8561    steps: 172    lr: 0.0001     reward: 1.57
epis: 943   score: 0.0   mem len: 172778   epsilon: 0.8559    steps: 123    lr: 0.0001     reward: 1.57
epis: 944   score: 1.0   mem len: 172929   epsilon: 0.8556    steps: 151    lr: 0.0001     reward: 1.55
epis: 945   score: 2.0   mem len: 173127   epsilon: 0.8552    steps: 198    lr: 0.0001     reward: 1.56
epis: 946   score: 1.0   mem len: 173296   epsilon: 0.8549    steps: 169    lr: 0.0001     reward: 1.53
epis: 947   score: 2.0   mem len: 173494   epsilon: 0.8545    steps: 198    lr: 0.0001     reward: 1.54
epis: 948   score: 2.0   mem len: 173691   epsilon: 0.8541    st

epis: 1018   score: 1.0   mem len: 187003   epsilon: 0.8277    steps: 169    lr: 0.0001     reward: 1.63
epis: 1019   score: 2.0   mem len: 187220   epsilon: 0.8273    steps: 217    lr: 0.0001     reward: 1.61
epis: 1020   score: 0.0   mem len: 187343   epsilon: 0.8271    steps: 123    lr: 0.0001     reward: 1.61
epis: 1021   score: 2.0   mem len: 187562   epsilon: 0.8266    steps: 219    lr: 0.0001     reward: 1.61
epis: 1022   score: 5.0   mem len: 187898   epsilon: 0.826    steps: 336    lr: 0.0001     reward: 1.65
epis: 1023   score: 3.0   mem len: 188124   epsilon: 0.8255    steps: 226    lr: 0.0001     reward: 1.67
epis: 1024   score: 4.0   mem len: 188441   epsilon: 0.8249    steps: 317    lr: 0.0001     reward: 1.71
epis: 1025   score: 0.0   mem len: 188564   epsilon: 0.8246    steps: 123    lr: 0.0001     reward: 1.7
epis: 1026   score: 3.0   mem len: 188812   epsilon: 0.8242    steps: 248    lr: 0.0001     reward: 1.71
epis: 1027   score: 2.0   mem len: 188994   epsilon: 0.82

epis: 1097   score: 2.0   mem len: 202416   epsilon: 0.7972    steps: 218    lr: 4e-05     reward: 1.77
epis: 1098   score: 4.0   mem len: 202691   epsilon: 0.7967    steps: 275    lr: 4e-05     reward: 1.81
epis: 1099   score: 5.0   mem len: 203055   epsilon: 0.7959    steps: 364    lr: 4e-05     reward: 1.86
epis: 1100   score: 1.0   mem len: 203224   epsilon: 0.7956    steps: 169    lr: 4e-05     reward: 1.86
epis: 1101   score: 0.0   mem len: 203346   epsilon: 0.7954    steps: 122    lr: 4e-05     reward: 1.85
epis: 1102   score: 3.0   mem len: 203611   epsilon: 0.7948    steps: 265    lr: 4e-05     reward: 1.88
epis: 1103   score: 4.0   mem len: 203888   epsilon: 0.7943    steps: 277    lr: 4e-05     reward: 1.89
epis: 1104   score: 2.0   mem len: 204068   epsilon: 0.7939    steps: 180    lr: 4e-05     reward: 1.88
epis: 1105   score: 1.0   mem len: 204239   epsilon: 0.7936    steps: 171    lr: 4e-05     reward: 1.88
epis: 1106   score: 2.0   mem len: 204436   epsilon: 0.7932    s

epis: 1176   score: 0.0   mem len: 218921   epsilon: 0.7645    steps: 123    lr: 4e-05     reward: 2.15
epis: 1177   score: 2.0   mem len: 219141   epsilon: 0.7641    steps: 220    lr: 4e-05     reward: 2.15
epis: 1178   score: 1.0   mem len: 219292   epsilon: 0.7638    steps: 151    lr: 4e-05     reward: 2.12
epis: 1179   score: 1.0   mem len: 219460   epsilon: 0.7635    steps: 168    lr: 4e-05     reward: 2.09
epis: 1180   score: 6.0   mem len: 219799   epsilon: 0.7628    steps: 339    lr: 4e-05     reward: 2.11
epis: 1181   score: 0.0   mem len: 219922   epsilon: 0.7626    steps: 123    lr: 4e-05     reward: 2.08
epis: 1182   score: 3.0   mem len: 220152   epsilon: 0.7621    steps: 230    lr: 4e-05     reward: 2.08
epis: 1183   score: 5.0   mem len: 220455   epsilon: 0.7615    steps: 303    lr: 4e-05     reward: 2.09
epis: 1184   score: 5.0   mem len: 220766   epsilon: 0.7609    steps: 311    lr: 4e-05     reward: 2.12
epis: 1185   score: 1.0   mem len: 220917   epsilon: 0.7606    s

epis: 1255   score: 1.0   mem len: 235452   epsilon: 0.7318    steps: 151    lr: 4e-05     reward: 2.32
epis: 1256   score: 3.0   mem len: 235678   epsilon: 0.7314    steps: 226    lr: 4e-05     reward: 2.33
epis: 1257   score: 2.0   mem len: 235878   epsilon: 0.731    steps: 200    lr: 4e-05     reward: 2.33
epis: 1258   score: 5.0   mem len: 236200   epsilon: 0.7303    steps: 322    lr: 4e-05     reward: 2.34
epis: 1259   score: 5.0   mem len: 236534   epsilon: 0.7297    steps: 334    lr: 4e-05     reward: 2.37
epis: 1260   score: 1.0   mem len: 236685   epsilon: 0.7294    steps: 151    lr: 4e-05     reward: 2.37
epis: 1261   score: 3.0   mem len: 236911   epsilon: 0.7289    steps: 226    lr: 4e-05     reward: 2.34
epis: 1262   score: 2.0   mem len: 237111   epsilon: 0.7285    steps: 200    lr: 4e-05     reward: 2.34
epis: 1263   score: 7.0   mem len: 237527   epsilon: 0.7277    steps: 416    lr: 4e-05     reward: 2.37
epis: 1264   score: 3.0   mem len: 237753   epsilon: 0.7272    st

epis: 1334   score: 5.0   mem len: 253539   epsilon: 0.696    steps: 346    lr: 4e-05     reward: 2.57
epis: 1335   score: 2.0   mem len: 253736   epsilon: 0.6956    steps: 197    lr: 4e-05     reward: 2.57
epis: 1336   score: 5.0   mem len: 254017   epsilon: 0.695    steps: 281    lr: 4e-05     reward: 2.6
epis: 1337   score: 2.0   mem len: 254215   epsilon: 0.6947    steps: 198    lr: 4e-05     reward: 2.6
epis: 1338   score: 5.0   mem len: 254540   epsilon: 0.694    steps: 325    lr: 4e-05     reward: 2.63
epis: 1339   score: 6.0   mem len: 254935   epsilon: 0.6932    steps: 395    lr: 4e-05     reward: 2.68
epis: 1340   score: 3.0   mem len: 255182   epsilon: 0.6927    steps: 247    lr: 4e-05     reward: 2.69
epis: 1341   score: 1.0   mem len: 255333   epsilon: 0.6924    steps: 151    lr: 4e-05     reward: 2.65
epis: 1342   score: 4.0   mem len: 255649   epsilon: 0.6918    steps: 316    lr: 4e-05     reward: 2.67
epis: 1343   score: 3.0   mem len: 255875   epsilon: 0.6914    steps:

epis: 1413   score: 1.0   mem len: 272390   epsilon: 0.6587    steps: 150    lr: 4e-05     reward: 2.86
epis: 1414   score: 4.0   mem len: 272666   epsilon: 0.6581    steps: 276    lr: 4e-05     reward: 2.9
epis: 1415   score: 4.0   mem len: 272941   epsilon: 0.6576    steps: 275    lr: 4e-05     reward: 2.92
epis: 1416   score: 3.0   mem len: 273191   epsilon: 0.6571    steps: 250    lr: 4e-05     reward: 2.92
epis: 1417   score: 2.0   mem len: 273370   epsilon: 0.6567    steps: 179    lr: 4e-05     reward: 2.93
epis: 1418   score: 2.0   mem len: 273568   epsilon: 0.6563    steps: 198    lr: 4e-05     reward: 2.91
epis: 1419   score: 2.0   mem len: 273749   epsilon: 0.656    steps: 181    lr: 4e-05     reward: 2.92
epis: 1420   score: 0.0   mem len: 273872   epsilon: 0.6557    steps: 123    lr: 4e-05     reward: 2.91
epis: 1421   score: 4.0   mem len: 274148   epsilon: 0.6552    steps: 276    lr: 4e-05     reward: 2.9
epis: 1422   score: 4.0   mem len: 274403   epsilon: 0.6547    step

epis: 1492   score: 3.0   mem len: 291655   epsilon: 0.6205    steps: 226    lr: 4e-05     reward: 3.32
epis: 1493   score: 4.0   mem len: 291935   epsilon: 0.62    steps: 280    lr: 4e-05     reward: 3.34
epis: 1494   score: 3.0   mem len: 292160   epsilon: 0.6195    steps: 225    lr: 4e-05     reward: 3.35
epis: 1495   score: 4.0   mem len: 292453   epsilon: 0.6189    steps: 293    lr: 4e-05     reward: 3.37
epis: 1496   score: 3.0   mem len: 292682   epsilon: 0.6185    steps: 229    lr: 4e-05     reward: 3.35
epis: 1497   score: 2.0   mem len: 292880   epsilon: 0.6181    steps: 198    lr: 4e-05     reward: 3.35
epis: 1498   score: 3.0   mem len: 293106   epsilon: 0.6176    steps: 226    lr: 4e-05     reward: 3.34
epis: 1499   score: 3.0   mem len: 293332   epsilon: 0.6172    steps: 226    lr: 4e-05     reward: 3.3
epis: 1500   score: 5.0   mem len: 293620   epsilon: 0.6166    steps: 288    lr: 4e-05     reward: 3.32
epis: 1501   score: 1.0   mem len: 293771   epsilon: 0.6163    step

epis: 1571   score: 2.0   mem len: 310811   epsilon: 0.5826    steps: 180    lr: 1.6e-05     reward: 3.25
epis: 1572   score: 5.0   mem len: 311127   epsilon: 0.582    steps: 316    lr: 1.6e-05     reward: 3.28
epis: 1573   score: 2.0   mem len: 311309   epsilon: 0.5816    steps: 182    lr: 1.6e-05     reward: 3.28
epis: 1574   score: 4.0   mem len: 311603   epsilon: 0.581    steps: 294    lr: 1.6e-05     reward: 3.29
epis: 1575   score: 3.0   mem len: 311816   epsilon: 0.5806    steps: 213    lr: 1.6e-05     reward: 3.26
epis: 1576   score: 5.0   mem len: 312120   epsilon: 0.58    steps: 304    lr: 1.6e-05     reward: 3.25
epis: 1577   score: 5.0   mem len: 312446   epsilon: 0.5794    steps: 326    lr: 1.6e-05     reward: 3.27
epis: 1578   score: 4.0   mem len: 312721   epsilon: 0.5788    steps: 275    lr: 1.6e-05     reward: 3.26
epis: 1579   score: 3.0   mem len: 312947   epsilon: 0.5784    steps: 226    lr: 1.6e-05     reward: 3.25
epis: 1580   score: 6.0   mem len: 313310   epsilo

epis: 1649   score: 3.0   mem len: 332692   epsilon: 0.5393    steps: 230    lr: 1.6e-05     reward: 4.03
epis: 1650   score: 3.0   mem len: 332904   epsilon: 0.5388    steps: 212    lr: 1.6e-05     reward: 4.04
epis: 1651   score: 3.0   mem len: 333114   epsilon: 0.5384    steps: 210    lr: 1.6e-05     reward: 4.03
epis: 1652   score: 3.0   mem len: 333348   epsilon: 0.538    steps: 234    lr: 1.6e-05     reward: 4.04
epis: 1653   score: 5.0   mem len: 333672   epsilon: 0.5373    steps: 324    lr: 1.6e-05     reward: 4.07
epis: 1654   score: 1.0   mem len: 333841   epsilon: 0.537    steps: 169    lr: 1.6e-05     reward: 4.04
epis: 1655   score: 4.0   mem len: 334116   epsilon: 0.5364    steps: 275    lr: 1.6e-05     reward: 4.02
epis: 1656   score: 3.0   mem len: 334385   epsilon: 0.5359    steps: 269    lr: 1.6e-05     reward: 4.01
epis: 1657   score: 6.0   mem len: 334697   epsilon: 0.5353    steps: 312    lr: 1.6e-05     reward: 4.03
epis: 1658   score: 4.0   mem len: 334952   epsi

epis: 1727   score: 4.0   mem len: 355226   epsilon: 0.4947    steps: 275    lr: 1.6e-05     reward: 4.55
epis: 1728   score: 1.0   mem len: 355394   epsilon: 0.4943    steps: 168    lr: 1.6e-05     reward: 4.52
epis: 1729   score: 4.0   mem len: 355669   epsilon: 0.4938    steps: 275    lr: 1.6e-05     reward: 4.51
epis: 1730   score: 2.0   mem len: 355851   epsilon: 0.4934    steps: 182    lr: 1.6e-05     reward: 4.51
epis: 1731   score: 5.0   mem len: 356139   epsilon: 0.4928    steps: 288    lr: 1.6e-05     reward: 4.48
epis: 1732   score: 7.0   mem len: 356480   epsilon: 0.4922    steps: 341    lr: 1.6e-05     reward: 4.5
epis: 1733   score: 12.0   mem len: 356918   epsilon: 0.4913    steps: 438    lr: 1.6e-05     reward: 4.6
epis: 1734   score: 3.0   mem len: 357150   epsilon: 0.4908    steps: 232    lr: 1.6e-05     reward: 4.56
epis: 1735   score: 5.0   mem len: 357431   epsilon: 0.4903    steps: 281    lr: 1.6e-05     reward: 4.55
epis: 1736   score: 5.0   mem len: 357710   eps

epis: 1805   score: 4.0   mem len: 378158   epsilon: 0.4492    steps: 277    lr: 1.6e-05     reward: 4.85
epis: 1806   score: 5.0   mem len: 378482   epsilon: 0.4486    steps: 324    lr: 1.6e-05     reward: 4.86
epis: 1807   score: 3.0   mem len: 378708   epsilon: 0.4482    steps: 226    lr: 1.6e-05     reward: 4.84
epis: 1808   score: 6.0   mem len: 379040   epsilon: 0.4475    steps: 332    lr: 1.6e-05     reward: 4.86
epis: 1809   score: 4.0   mem len: 379322   epsilon: 0.4469    steps: 282    lr: 1.6e-05     reward: 4.87
epis: 1810   score: 5.0   mem len: 379664   epsilon: 0.4463    steps: 342    lr: 1.6e-05     reward: 4.86
epis: 1811   score: 3.0   mem len: 379873   epsilon: 0.4458    steps: 209    lr: 1.6e-05     reward: 4.84


# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
# Save the current policy network before running the visualisation cells below.
# The folder is created first so the save cannot fail because it is missing.
os.makedirs("./save_model", exist_ok=True)
torch.save(agent.policy_net, "./save_model/breakout_dqn_latest.pth")

In [None]:
from gym.wrappers import RecordVideo # RecordVideo is used here in place of the older `Monitor` wrapper
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    """Draw the current game frame inline, replacing the previous cell output.

    Args:
        env: Environment to render. It must be able to produce an RGB frame
            (with the newer gym API: created with ``render_mode='rgb_array'``).
        step: Step counter shown in the figure title.
        info: Extra text appended to the figure title.
    """
    try:
        # Older gym API: the render mode is passed per call.
        rgb_frame = env.render(mode='rgb_array')
    except TypeError:
        # Newer gym API: render() takes no `mode`; it is fixed when the env is created.
        rgb_frame = env.render()

    plt.figure(3)
    plt.clf()
    plt.imshow(rgb_frame)
    plt.title("%s | Step: %d %s" % ("Agent Playing",step, info))
    plt.axis('off')

    # wait=True delays clearing until the new output is ready.
    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())
    
# Recording the game and replaying the game afterwards
def show_video():
    """Embed a recorded episode from ./video as an inline HTML5 video.

    The first ``video/*.mp4`` file in sorted name order is shown, so the choice
    is deterministic. Prints "Could not find video" when no recording exists.
    """
    mp4list = sorted(glob.glob('video/*.mp4'))
    if not mp4list:
        print("Could not find video")
        return

    # Read-only binary mode is sufficient, and the context manager closes the
    # file handle as soon as the bytes have been read.
    with io.open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
    """Return `env` wrapped in RecordVideo, with './video' as the recording folder."""
    return RecordVideo(env, './video')

In [None]:
# Play one greedy episode with the current agent, record it with RecordVideo and
# show the resulting clip inline.
display = Display(visible=0, size=(300, 200))
display.start()

# Load agent
# agent.load_policy_net("./save_model/breakout_dqn.pth")
agent.epsilon = 0.0 # Set agent to only exploit the best action

# With the 5-value env.step() API used in this notebook, the render mode has to be
# chosen when the environment is created; RecordVideo needs RGB frames to record.
env = gym.make('BreakoutDeterministic-v4', render_mode='rgb_array')
env = wrap_env(env)

done = False
score = 0
step = 0
state = env.reset()
next_state = state
history = np.zeros([5, 84, 84], dtype=np.uint8)
# Same call signature as in the training loop.
get_init_state(history, state, HISTORY_SIZE)

try:
    while not done:
        # show_state(env, step) # uncommenting this provides another way to visualize the game

        step += 1

        # Perform a fire action if ball is no longer on screen.
        # On step 2 `state` is still the raw env.reset() result (apparently an
        # (observation, info) tuple), which cannot be compared element-wise with a
        # frame; FIRE is selected in that case, matching what the bare comparison
        # evaluated to via a deprecated NumPy fallback.
        if step > 1 and (
            not isinstance(state, np.ndarray)
            or len(np.unique(next_state[:189] == state[:189])) < 2
        ):
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state

        next_state, reward, terminated, truncated, info = env.step(action + 1)
        # Stop on time-limit truncation as well as on game over.
        done = terminated or truncated

        history[4, :, :] = get_frame(next_state)

        # Evaluation only: the global training `frame` counter is not advanced and
        # nothing is pushed to the replay memory, so watching the agent does not
        # alter the training state. This also removes the lookup of the stale
        # 'ale.lives' info key (the training loop reads 'lives').
        score += reward

        # Slide the window: rows 0-3 now hold the four most recent frames.
        history[:4, :, :] = history[1:, :, :]
finally:
    # Always release the emulator/recorder and the virtual display, even if the
    # episode raised, so the cell can be run again.
    env.close()
    display.stop()
show_video()