# Deep Q-Learning 

Install the dependencies that OpenAI Gym needs to run properly (this shouldn't take more than a minute). If you are running on Google Cloud or locally, you only need to do this once. Colab may require reinstalling every time the VM shuts down.

In [None]:
# Install the Python packages `gym` and `pyvirtualdisplay` with pip, then the
# system packages xvfb, python-opengl and ffmpeg with apt-get (needs sudo).
# pyvirtualdisplay's `Display` is used by the visualization cell at the end of
# this notebook; presumably xvfb/ffmpeg back the virtual display and the video
# recording there — TODO confirm.
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg

In [None]:
# Upgrade setuptools for the current user, then install ez_setup and Gym's
# `atari` and `accept-rom-license` extras.
# NOTE(review): no versions are pinned, while the code below relies on the
# 5-value return of env.step() — confirm the Gym version installed matches.
!pip3 install --upgrade setuptools --user
!pip3 install ez_setup 
!pip3 install gym[atari] 
!pip3 install gym[accept-rom-license] 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in the breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# modules imported below (utils, model, config, agent) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *

import matplotlib.pyplot as plt

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ so you can see what the environment looks like. For further documentation of the environment, refer to https://www.gymlibrary.dev/environments/atari/breakout/. 

In Breakout, we will use 3 actions: "fire", "left", and "right". "fire" is only used to reset the game when a life is lost, "left" moves the agent left, and "right" moves the agent right.

In [8]:
# Create the Breakout environment and reset it to get the starting state.
# NOTE(review): the loops below unpack five values from env.step(); under that
# Gym API reset() may return an (observation, info) pair rather than a bare
# frame — confirm what `state` holds here.
env = gym.make('BreakoutDeterministic-v4')
state = env.reset()

In [9]:
# Values shared by the training and visualization cells below.
# number_lives comes from utils.find_max_lives(env); presumably the life count
# at the start of a game — TODO confirm against utils.py.
number_lives = find_max_lives(env)
# Shape of one raw observation as reported by the environment.
state_size = env.observation_space.shape
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in __agent.py__. The corresponding neural network is defined in __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [10]:
double_dqn = True # set to True if using double DQN agent

# Both modules are expected to export a class named `Agent`; the flag decides
# which implementation that name is bound to for the rest of the notebook.
if double_dqn:
    from agent_double import Agent
else:
    from agent import Agent

# Build the agent for the 3-action space defined above.
agent = Agent(action_size)
# Bounded window of per-episode scores; the training loop averages it to get the
# "Evaluation Reward". evaluation_reward_length is presumably provided by
# `from config import *` — TODO confirm.
evaluation_reward = deque(maxlen=evaluation_reward_length)
# Global count of environment steps; incremented once per step by the loops below.
frame = 0
# Not read by any cell visible in this notebook.
memory_size = 0

### Main Training Loop

In this training loop, we do not render the screen because it slows down training significantly. To watch the agent play the game, run the code in the next section, "Visualize Agent Performance".

In [None]:
# Train the agent for EPISODES games. After every game: add the score to the
# rolling evaluation window, redraw and save the learning curve, print a
# progress line, and checkpoint the policy net whenever the rolling mean
# exceeds both 5 and the best value seen so far.
rewards, episodes = [], []
best_eval_reward = 0
for e in range(EPISODES):
    done = False
    score = 0

    # Five 84x84 uint8 frame slots: slots 0-3 are what the agent sees, slot 4
    # receives the newest frame until the buffer is shifted at the end of a step.
    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    # NOTE(review): the warning logged for the fire check below suggests that
    # `state` is not a plain frame array right after reset() (e.g. an
    # (observation, info) pair) — confirm get_init_state handles what reset() returns.
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state, HISTORY_SIZE)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life
        if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        # The agent's actions are 0-based; the environment is stepped with action + 1.
        # The episode is over when the game terminates OR the environment
        # truncates it; continuing to step after a truncation is not valid.
        next_state, reward, terminated, truncated, info = env.step(action + 1)
        done = terminated or truncated

        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['lives'])

        life = info['lives']
        r = reward

        # Store the transition in memory
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training after random sample generation
        if(frame >= train_frame):
            agent.train_policy_net(frame)
            # Update the target network only for Double DQN only
            if double_dqn and (frame % update_target_network_frequency)== 0:
                agent.update_target_net()
        score += reward
        # Slide the window: drop the oldest frame, keep the four newest.
        history[:4, :, :] = history[1:, :, :]

        if done:
            evaluation_reward.append(score)
            # Computed once per episode and reused for the curve, the log line
            # and the checkpoint decision.
            mean_eval_reward = np.mean(evaluation_reward)
            rewards.append(mean_eval_reward)
            episodes.append(e)
            # Clear the figure before redrawing; otherwise every episode adds
            # another full copy of the curve to the same figure and each
            # savefig gets slower as training progresses.
            pylab.clf()
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards')
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_double_dqn.png") # save graph for training visualization

            # every episode, plot the play time
            print("epis:", e, "  score:", score, "  mem len:",
                  len(agent.memory), "  epsilon:", round(agent.epsilon, 4), "   steps:", step,
                  "   lr:", round(agent.optimizer.param_groups[0]['lr'], 7), "    reward:", round(mean_eval_reward, 2))

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if mean_eval_reward > 5 and mean_eval_reward > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_double_dqn.pth")
                best_eval_reward = mean_eval_reward


  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


epis: 0   score: 0.0   mem len: 122   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 0.0
epis: 1   score: 2.0   mem len: 342   epsilon: 1.0    steps: 220    lr: 0.0001     reward: 1.0
epis: 2   score: 1.0   mem len: 493   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.0
epis: 3   score: 4.0   mem len: 768   epsilon: 1.0    steps: 275    lr: 0.0001     reward: 1.75
epis: 4   score: 3.0   mem len: 1017   epsilon: 1.0    steps: 249    lr: 0.0001     reward: 2.0
epis: 5   score: 2.0   mem len: 1215   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 2.0
epis: 6   score: 4.0   mem len: 1474   epsilon: 1.0    steps: 259    lr: 0.0001     reward: 2.29
epis: 7   score: 0.0   mem len: 1597   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 2.0
epis: 8   score: 0.0   mem len: 1720   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.78
epis: 9   score: 5.0   mem len: 2046   epsilon: 1.0    steps: 326    lr: 0.0001     reward: 2.1
epis: 10   score: 3.0   mem len: 2293   e

epis: 84   score: 4.0   mem len: 15504   epsilon: 1.0    steps: 297    lr: 0.0001     reward: 1.51
epis: 85   score: 7.0   mem len: 15964   epsilon: 1.0    steps: 460    lr: 0.0001     reward: 1.57
epis: 86   score: 4.0   mem len: 16226   epsilon: 1.0    steps: 262    lr: 0.0001     reward: 1.6
epis: 87   score: 1.0   mem len: 16378   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.59
epis: 88   score: 2.0   mem len: 16594   epsilon: 1.0    steps: 216    lr: 0.0001     reward: 1.6
epis: 89   score: 1.0   mem len: 16766   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.59
epis: 90   score: 1.0   mem len: 16916   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.58
epis: 91   score: 2.0   mem len: 17113   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.59
epis: 92   score: 1.0   mem len: 17282   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.58
epis: 93   score: 2.0   mem len: 17480   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.59
epis: 94   s

epis: 167   score: 0.0   mem len: 31472   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.68
epis: 168   score: 0.0   mem len: 31594   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.67
epis: 169   score: 4.0   mem len: 31890   epsilon: 1.0    steps: 296    lr: 0.0001     reward: 1.71
epis: 170   score: 0.0   mem len: 32012   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.68
epis: 171   score: 2.0   mem len: 32209   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.68
epis: 172   score: 3.0   mem len: 32479   epsilon: 1.0    steps: 270    lr: 0.0001     reward: 1.7
epis: 173   score: 0.0   mem len: 32602   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.7
epis: 174   score: 1.0   mem len: 32754   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.7
epis: 175   score: 2.0   mem len: 32956   epsilon: 1.0    steps: 202    lr: 0.0001     reward: 1.72
epis: 176   score: 1.0   mem len: 33124   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.71
epi

epis: 250   score: 0.0   mem len: 46774   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.58
epis: 251   score: 3.0   mem len: 47021   epsilon: 1.0    steps: 247    lr: 0.0001     reward: 1.61
epis: 252   score: 2.0   mem len: 47240   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.6
epis: 253   score: 1.0   mem len: 47391   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.59
epis: 254   score: 3.0   mem len: 47656   epsilon: 1.0    steps: 265    lr: 0.0001     reward: 1.61
epis: 255   score: 0.0   mem len: 47778   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.61
epis: 256   score: 2.0   mem len: 47995   epsilon: 1.0    steps: 217    lr: 0.0001     reward: 1.61
epis: 257   score: 1.0   mem len: 48146   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.61
epis: 258   score: 0.0   mem len: 48268   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.6
epis: 259   score: 3.0   mem len: 48496   epsilon: 1.0    steps: 228    lr: 0.0001     reward: 1.6
epi

epis: 333   score: 3.0   mem len: 61840   epsilon: 1.0    steps: 233    lr: 0.0001     reward: 1.46
epis: 334   score: 2.0   mem len: 62038   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.48
epis: 335   score: 4.0   mem len: 62354   epsilon: 1.0    steps: 316    lr: 0.0001     reward: 1.5
epis: 336   score: 2.0   mem len: 62553   epsilon: 1.0    steps: 199    lr: 0.0001     reward: 1.49
epis: 337   score: 0.0   mem len: 62676   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.48
epis: 338   score: 0.0   mem len: 62799   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.46
epis: 339   score: 0.0   mem len: 62922   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.45
epis: 340   score: 1.0   mem len: 63073   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.46
epis: 341   score: 3.0   mem len: 63303   epsilon: 1.0    steps: 230    lr: 0.0001     reward: 1.49
epis: 342   score: 2.0   mem len: 63500   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.5
ep

epis: 416   score: 1.0   mem len: 77909   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.7
epis: 417   score: 0.0   mem len: 78032   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.69
epis: 418   score: 1.0   mem len: 78183   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.66
epis: 419   score: 0.0   mem len: 78306   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.63
epis: 420   score: 0.0   mem len: 78429   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.63
epis: 421   score: 1.0   mem len: 78598   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.64
epis: 422   score: 0.0   mem len: 78720   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.62
epis: 423   score: 1.0   mem len: 78888   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.61
epis: 424   score: 0.0   mem len: 79010   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.6
epis: 425   score: 2.0   mem len: 79207   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.6
epi

epis: 499   score: 0.0   mem len: 92083   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.33
epis: 500   score: 0.0   mem len: 92206   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.32
epis: 501   score: 0.0   mem len: 92329   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.29
epis: 502   score: 1.0   mem len: 92480   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.28
epis: 503   score: 3.0   mem len: 92727   epsilon: 1.0    steps: 247    lr: 0.0001     reward: 1.31
epis: 504   score: 1.0   mem len: 92895   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.32
epis: 505   score: 2.0   mem len: 93093   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.32
epis: 506   score: 1.0   mem len: 93262   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.3
epis: 507   score: 1.0   mem len: 93413   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.27
epis: 508   score: 1.0   mem len: 93565   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.26
e

  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()
  next_states_values[mask] = self.target_net(non_final_next_states).max(1)[0].cuda()[mask]


epis: 544   score: 2.0   mem len: 100145   epsilon: 0.9997    steps: 197    lr: 0.0001     reward: 1.37
epis: 545   score: 2.0   mem len: 100345   epsilon: 0.9993    steps: 200    lr: 0.0001     reward: 1.37
epis: 546   score: 0.0   mem len: 100468   epsilon: 0.9991    steps: 123    lr: 0.0001     reward: 1.34
epis: 547   score: 3.0   mem len: 100696   epsilon: 0.9986    steps: 228    lr: 0.0001     reward: 1.35
epis: 548   score: 0.0   mem len: 100819   epsilon: 0.9984    steps: 123    lr: 0.0001     reward: 1.33
epis: 549   score: 2.0   mem len: 101017   epsilon: 0.998    steps: 198    lr: 0.0001     reward: 1.35
epis: 550   score: 0.0   mem len: 101140   epsilon: 0.9977    steps: 123    lr: 0.0001     reward: 1.33
epis: 551   score: 5.0   mem len: 101484   epsilon: 0.9971    steps: 344    lr: 0.0001     reward: 1.3
epis: 552   score: 0.0   mem len: 101606   epsilon: 0.9968    steps: 122    lr: 0.0001     reward: 1.3
epis: 553   score: 3.0   mem len: 101856   epsilon: 0.9963    steps

epis: 623   score: 5.0   mem len: 115045   epsilon: 0.9702    steps: 292    lr: 0.0001     reward: 1.54
epis: 624   score: 0.0   mem len: 115167   epsilon: 0.97    steps: 122    lr: 0.0001     reward: 1.52
epis: 625   score: 1.0   mem len: 115318   epsilon: 0.9697    steps: 151    lr: 0.0001     reward: 1.52
epis: 626   score: 2.0   mem len: 115516   epsilon: 0.9693    steps: 198    lr: 0.0001     reward: 1.51
epis: 627   score: 1.0   mem len: 115688   epsilon: 0.9689    steps: 172    lr: 0.0001     reward: 1.52
epis: 628   score: 2.0   mem len: 115886   epsilon: 0.9685    steps: 198    lr: 0.0001     reward: 1.52
epis: 629   score: 1.0   mem len: 116055   epsilon: 0.9682    steps: 169    lr: 0.0001     reward: 1.51
epis: 630   score: 0.0   mem len: 116178   epsilon: 0.968    steps: 123    lr: 0.0001     reward: 1.49
epis: 631   score: 3.0   mem len: 116439   epsilon: 0.9674    steps: 261    lr: 0.0001     reward: 1.51
epis: 632   score: 2.0   mem len: 116643   epsilon: 0.967    steps:

epis: 702   score: 1.0   mem len: 128977   epsilon: 0.9426    steps: 168    lr: 0.0001     reward: 1.32
epis: 703   score: 0.0   mem len: 129100   epsilon: 0.9424    steps: 123    lr: 0.0001     reward: 1.32
epis: 704   score: 2.0   mem len: 129298   epsilon: 0.942    steps: 198    lr: 0.0001     reward: 1.34
epis: 705   score: 0.0   mem len: 129420   epsilon: 0.9417    steps: 122    lr: 0.0001     reward: 1.32
epis: 706   score: 2.0   mem len: 129638   epsilon: 0.9413    steps: 218    lr: 0.0001     reward: 1.33
epis: 707   score: 1.0   mem len: 129789   epsilon: 0.941    steps: 151    lr: 0.0001     reward: 1.34
epis: 708   score: 0.0   mem len: 129911   epsilon: 0.9408    steps: 122    lr: 0.0001     reward: 1.33
epis: 709   score: 2.0   mem len: 130128   epsilon: 0.9403    steps: 217    lr: 0.0001     reward: 1.33
epis: 710   score: 3.0   mem len: 130398   epsilon: 0.9398    steps: 270    lr: 0.0001     reward: 1.34
epis: 711   score: 4.0   mem len: 130641   epsilon: 0.9393    step

epis: 781   score: 2.0   mem len: 143821   epsilon: 0.9132    steps: 219    lr: 0.0001     reward: 1.53
epis: 782   score: 3.0   mem len: 144068   epsilon: 0.9127    steps: 247    lr: 0.0001     reward: 1.53
epis: 783   score: 2.0   mem len: 144266   epsilon: 0.9124    steps: 198    lr: 0.0001     reward: 1.54
epis: 784   score: 3.0   mem len: 144533   epsilon: 0.9118    steps: 267    lr: 0.0001     reward: 1.57
epis: 785   score: 2.0   mem len: 144713   epsilon: 0.9115    steps: 180    lr: 0.0001     reward: 1.57
epis: 786   score: 2.0   mem len: 144910   epsilon: 0.9111    steps: 197    lr: 0.0001     reward: 1.53
epis: 787   score: 4.0   mem len: 145222   epsilon: 0.9105    steps: 312    lr: 0.0001     reward: 1.55
epis: 788   score: 0.0   mem len: 145345   epsilon: 0.9102    steps: 123    lr: 0.0001     reward: 1.54
epis: 789   score: 3.0   mem len: 145593   epsilon: 0.9097    steps: 248    lr: 0.0001     reward: 1.57
epis: 790   score: 1.0   mem len: 145744   epsilon: 0.9094    st

epis: 860   score: 1.0   mem len: 158757   epsilon: 0.8837    steps: 151    lr: 0.0001     reward: 1.65
epis: 861   score: 1.0   mem len: 158928   epsilon: 0.8833    steps: 171    lr: 0.0001     reward: 1.61
epis: 862   score: 2.0   mem len: 159128   epsilon: 0.8829    steps: 200    lr: 0.0001     reward: 1.61
epis: 863   score: 0.0   mem len: 159251   epsilon: 0.8827    steps: 123    lr: 0.0001     reward: 1.59
epis: 864   score: 4.0   mem len: 159524   epsilon: 0.8821    steps: 273    lr: 0.0001     reward: 1.62
epis: 865   score: 2.0   mem len: 159742   epsilon: 0.8817    steps: 218    lr: 0.0001     reward: 1.64
epis: 866   score: 0.0   mem len: 159864   epsilon: 0.8815    steps: 122    lr: 0.0001     reward: 1.62
epis: 867   score: 1.0   mem len: 160034   epsilon: 0.8811    steps: 170    lr: 0.0001     reward: 1.6
epis: 868   score: 7.0   mem len: 160354   epsilon: 0.8805    steps: 320    lr: 0.0001     reward: 1.65
epis: 869   score: 0.0   mem len: 160477   epsilon: 0.8803    ste

epis: 939   score: 2.0   mem len: 173094   epsilon: 0.8553    steps: 197    lr: 0.0001     reward: 1.55
epis: 940   score: 2.0   mem len: 173276   epsilon: 0.8549    steps: 182    lr: 0.0001     reward: 1.56
epis: 941   score: 2.0   mem len: 173492   epsilon: 0.8545    steps: 216    lr: 0.0001     reward: 1.56
epis: 942   score: 5.0   mem len: 173789   epsilon: 0.8539    steps: 297    lr: 0.0001     reward: 1.6
epis: 943   score: 0.0   mem len: 173912   epsilon: 0.8537    steps: 123    lr: 0.0001     reward: 1.58
epis: 944   score: 1.0   mem len: 174063   epsilon: 0.8534    steps: 151    lr: 0.0001     reward: 1.56
epis: 945   score: 3.0   mem len: 174327   epsilon: 0.8528    steps: 264    lr: 0.0001     reward: 1.56
epis: 946   score: 0.0   mem len: 174450   epsilon: 0.8526    steps: 123    lr: 0.0001     reward: 1.53
epis: 947   score: 4.0   mem len: 174723   epsilon: 0.852    steps: 273    lr: 0.0001     reward: 1.57
epis: 948   score: 5.0   mem len: 175050   epsilon: 0.8514    step

epis: 1018   score: 4.0   mem len: 187115   epsilon: 0.8275    steps: 258    lr: 0.0001     reward: 1.45
epis: 1019   score: 3.0   mem len: 187361   epsilon: 0.827    steps: 246    lr: 0.0001     reward: 1.46
epis: 1020   score: 5.0   mem len: 187651   epsilon: 0.8264    steps: 290    lr: 0.0001     reward: 1.5
epis: 1021   score: 1.0   mem len: 187802   epsilon: 0.8262    steps: 151    lr: 0.0001     reward: 1.5
epis: 1022   score: 0.0   mem len: 187925   epsilon: 0.8259    steps: 123    lr: 0.0001     reward: 1.5
epis: 1023   score: 2.0   mem len: 188123   epsilon: 0.8255    steps: 198    lr: 0.0001     reward: 1.5
epis: 1024   score: 2.0   mem len: 188303   epsilon: 0.8252    steps: 180    lr: 0.0001     reward: 1.49
epis: 1025   score: 2.0   mem len: 188501   epsilon: 0.8248    steps: 198    lr: 0.0001     reward: 1.49
epis: 1026   score: 2.0   mem len: 188699   epsilon: 0.8244    steps: 198    lr: 0.0001     reward: 1.5
epis: 1027   score: 0.0   mem len: 188821   epsilon: 0.8241  

epis: 1097   score: 3.0   mem len: 202156   epsilon: 0.7977    steps: 228    lr: 4e-05     reward: 1.67
epis: 1098   score: 3.0   mem len: 202425   epsilon: 0.7972    steps: 269    lr: 4e-05     reward: 1.66
epis: 1099   score: 3.0   mem len: 202672   epsilon: 0.7967    steps: 247    lr: 4e-05     reward: 1.66
epis: 1100   score: 2.0   mem len: 202870   epsilon: 0.7963    steps: 198    lr: 4e-05     reward: 1.68
epis: 1101   score: 2.0   mem len: 203068   epsilon: 0.7959    steps: 198    lr: 4e-05     reward: 1.7
epis: 1102   score: 3.0   mem len: 203293   epsilon: 0.7955    steps: 225    lr: 4e-05     reward: 1.72
epis: 1103   score: 2.0   mem len: 203515   epsilon: 0.795    steps: 222    lr: 4e-05     reward: 1.69
epis: 1104   score: 2.0   mem len: 203712   epsilon: 0.7946    steps: 197    lr: 4e-05     reward: 1.71
epis: 1105   score: 2.0   mem len: 203910   epsilon: 0.7943    steps: 198    lr: 4e-05     reward: 1.7
epis: 1106   score: 4.0   mem len: 204227   epsilon: 0.7936    step

# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
# Persist the current policy network object (the object itself, not only its
# state_dict) so it survives a kernel crash in the visualization cells below.
torch.save(agent.policy_net, "./save_model/breakout_double_dqn_latest.pth")

In [None]:
from gym.wrappers import RecordVideo # If importing monitor raises issues, try using `from gym.wrappers import RecordVideo`
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    """Draw the environment's current frame inline, replacing the previous output.

    Args:
        env: Environment whose ``render`` call yields an RGB image array.
        step: Step counter shown in the figure title.
        info: Extra text appended to the figure title.
    """
    # Older Gym versions take the render mode per call; newer ones fix it when
    # the environment is created and reject the ``mode`` keyword.
    try:
        rgb_frame = env.render(mode='rgb_array')
    except TypeError:
        rgb_frame = env.render()

    # Figure 3 is reused and cleared on every call, so repeated calls redraw in
    # place instead of opening new figures.
    plt.figure(3)
    plt.clf()
    plt.imshow(rgb_frame)
    plt.title("%s | Step: %d %s" % ("Agent Playing",step, info))
    plt.axis('off')

    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())
    
# Recording the game and replaying the game afterwards
def show_video():
    """Embed a recorded episode from ``video/`` in the notebook output.

    Takes the first ``video/*.mp4`` path returned by ``glob`` (its ordering is
    not specified), base64-encodes the file and displays it in an HTML
    ``<video>`` element. Prints "Could not find video" when no MP4 exists.
    """
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only access is enough, and the context manager closes the handle
        # even if reading fails.
        with io.open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
    

def wrap_env(env):
    """Return ``env`` wrapped in ``RecordVideo``, targeting the ``./video`` folder.

    ``show_video`` looks for its MP4 files in that same folder.
    """
    return RecordVideo(env, './video')

In [None]:
# Play one episode with the agent acting greedily (epsilon = 0) in a
# RecordVideo-wrapped environment on a virtual display, then embed the video.
display = Display(visible=0, size=(300, 200))
display.start()

try:
    # Load agent
    # agent.load_policy_net("./save_model/breakout_dqn.pth")
    agent.epsilon = 0.0 # Set agent to only exploit the best action

    # Under the Gym API this notebook uses (5-value step()), the render mode is
    # fixed when the environment is created; without an image-producing mode
    # the RecordVideo wrapper has no frames to record.
    env = gym.make('BreakoutDeterministic-v4', render_mode='rgb_array')
    env = wrap_env(env)

    try:
        done = False
        score = 0
        step = 0
        state = env.reset()
        next_state = state
        life = number_lives
        history = np.zeros([5, 84, 84], dtype=np.uint8)
        # Same three-argument call as in the training loop.
        get_init_state(history, state, HISTORY_SIZE)

        while not done:

            # Render breakout
            env.render()
            # show_state(env,step) # uncommenting this provides another way to visualize the game

            step += 1
            # NOTE(review): this advances the global training frame counter and
            # the push below adds evaluation transitions to the replay memory;
            # kept as in the original — confirm that is intended.
            frame += 1

            # Perform a fire action if ball is no longer on screen
            if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
                action = 0
            else:
                action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
            state = next_state

            # Stop on either termination or truncation, as in a finished game
            # or an environment-imposed cut-off.
            next_state, reward, terminated, truncated, info = env.step(action + 1)
            done = terminated or truncated

            frame_next_state = get_frame(next_state)
            history[4, :, :] = frame_next_state
            # The training loop, which uses the same 5-value step() API, reads
            # the life count from info['lives'].
            terminal_state = check_live(life, info['lives'])

            life = info['lives']
            r = reward

            # Store the transition in memory
            agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
            score += reward

            # Slide the window: drop the oldest frame, keep the four newest.
            history[:4, :, :] = history[1:, :, :]
    finally:
        # Close the environment even if the episode loop raised.
        env.close()
    show_video()
finally:
    # Always shut the virtual display down so a failed run does not leave it running.
    display.stop()