# Deep Q-Learning 

Install dependencies for AI gym to run properly (shouldn't take more than a minute). If running on google cloud or running locally, only need to run once. Colab may require installing everytime the vm shuts down.

In [None]:
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg

In [None]:
!pip3 install --upgrade setuptools --user
!pip3 install ez_setup 
!pip3 install gym[atari] 
!pip3 install gym[accept-rom-license] 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *

import matplotlib.pyplot as plt

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://www.gymlibrary.dev/environments/atari/breakout/. 

In breakout, we will use 3 actions "fire", "left", and "right". "fire" is only used to reset the game when a life is lost, "left" moves the agent left and "right" moves the agent right.

In [3]:
env = gym.make('BreakoutDeterministic-v4')
state = env.reset()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
number_lives = find_max_lives(env)
state_size = env.observation_space.shape
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [5]:
double_dqn = False # set to True if using double DQN agent

if double_dqn:
    from agent_double import Agent
else:
    from agent import Agent

agent = Agent(action_size)
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0

### Main Training Loop

In this training loop, we do not render the screen because it slows down training signficantly. To watch the agent play the game, run the code in next section "Visualize Agent Performance"

In [None]:
rewards, episodes = [], []
best_eval_reward = 0
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state, HISTORY_SIZE)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life
        if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        next_state, reward, done, _, info = env.step(action + 1)
        
        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['lives'])

        life = info['lives']
        r = reward

        # Store the transition in memory 
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training after random sample generation
        if(frame >= train_frame):
            agent.train_policy_net(frame)
            # Update the target network only for Double DQN only
            if (frame % update_target_network_frequency)== 0:
                agent.update_target_net()
        score += reward
        history[:4, :, :] = history[1:, :, :]
            
        if done:
            evaluation_reward.append(score)
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards') 
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_dqn.png") # save graph for training visualization
            
            # every episode, plot the play time
            print("epis:", e, "  score:", score, "  mem len:",
                  len(agent.memory), "  epsilon:", round(agent.epsilon, 4), "   steps:", step,
                  "   lr:", round(agent.optimizer.param_groups[0]['lr'], 7), "    reward:", round(np.mean(evaluation_reward), 2))

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if np.mean(evaluation_reward) > 5 and np.mean(evaluation_reward) > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_dqn.pth")
                best_eval_reward = np.mean(evaluation_reward)


  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


epis: 0   score: 2.0   mem len: 218   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 2.0
epis: 1   score: 1.0   mem len: 369   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.5
epis: 2   score: 2.0   mem len: 567   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.67
epis: 3   score: 1.0   mem len: 739   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.5
epis: 4   score: 1.0   mem len: 889   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.4
epis: 5   score: 1.0   mem len: 1039   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.33
epis: 6   score: 0.0   mem len: 1162   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.14
epis: 7   score: 0.0   mem len: 1284   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.0
epis: 8   score: 0.0   mem len: 1407   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 0.89
epis: 9   score: 2.0   mem len: 1625   epsilon: 1.0    steps: 218    lr: 0.0001     reward: 1.0
epis: 10   score: 2.0   mem len: 1822   e

epis: 84   score: 0.0   mem len: 15676   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.54
epis: 85   score: 1.0   mem len: 15845   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.53
epis: 86   score: 0.0   mem len: 15968   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.52
epis: 87   score: 2.0   mem len: 16189   epsilon: 1.0    steps: 221    lr: 0.0001     reward: 1.52
epis: 88   score: 0.0   mem len: 16312   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.51
epis: 89   score: 1.0   mem len: 16480   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.5
epis: 90   score: 0.0   mem len: 16602   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.48
epis: 91   score: 0.0   mem len: 16725   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.47
epis: 92   score: 3.0   mem len: 16996   epsilon: 1.0    steps: 271    lr: 0.0001     reward: 1.48
epis: 93   score: 1.0   mem len: 17147   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.48
epis: 94   

epis: 167   score: 3.0   mem len: 30814   epsilon: 1.0    steps: 230    lr: 0.0001     reward: 1.44
epis: 168   score: 1.0   mem len: 30982   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.43
epis: 169   score: 1.0   mem len: 31134   epsilon: 1.0    steps: 152    lr: 0.0001     reward: 1.4
epis: 170   score: 0.0   mem len: 31257   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.39
epis: 171   score: 2.0   mem len: 31454   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.4
epis: 172   score: 3.0   mem len: 31700   epsilon: 1.0    steps: 246    lr: 0.0001     reward: 1.4
epis: 173   score: 2.0   mem len: 31897   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.42
epis: 174   score: 0.0   mem len: 32020   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.41
epis: 175   score: 0.0   mem len: 32143   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
epis: 176   score: 1.0   mem len: 32294   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.39
epis

epis: 250   score: 1.0   mem len: 46102   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.53
epis: 251   score: 0.0   mem len: 46225   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.53
epis: 252   score: 0.0   mem len: 46348   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.53
epis: 253   score: 1.0   mem len: 46498   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.54
epis: 254   score: 2.0   mem len: 46678   epsilon: 1.0    steps: 180    lr: 0.0001     reward: 1.54
epis: 255   score: 0.0   mem len: 46800   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.51
epis: 256   score: 3.0   mem len: 47029   epsilon: 1.0    steps: 229    lr: 0.0001     reward: 1.49
epis: 257   score: 1.0   mem len: 47197   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.5
epis: 258   score: 1.0   mem len: 47369   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.51
epis: 259   score: 0.0   mem len: 47491   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.49
e

epis: 333   score: 0.0   mem len: 60890   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.49
epis: 334   score: 2.0   mem len: 61109   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.48
epis: 335   score: 3.0   mem len: 61356   epsilon: 1.0    steps: 247    lr: 0.0001     reward: 1.51
epis: 336   score: 0.0   mem len: 61479   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.45
epis: 337   score: 1.0   mem len: 61651   epsilon: 1.0    steps: 172    lr: 0.0001     reward: 1.44
epis: 338   score: 2.0   mem len: 61867   epsilon: 1.0    steps: 216    lr: 0.0001     reward: 1.44
epis: 339   score: 0.0   mem len: 61990   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.42
epis: 340   score: 2.0   mem len: 62188   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.41
epis: 341   score: 0.0   mem len: 62311   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
epis: 342   score: 0.0   mem len: 62434   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
ep

epis: 416   score: 0.0   mem len: 75520   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.29
epis: 417   score: 4.0   mem len: 75817   epsilon: 1.0    steps: 297    lr: 0.0001     reward: 1.32
epis: 418   score: 4.0   mem len: 76096   epsilon: 1.0    steps: 279    lr: 0.0001     reward: 1.36
epis: 419   score: 3.0   mem len: 76321   epsilon: 1.0    steps: 225    lr: 0.0001     reward: 1.38
epis: 420   score: 0.0   mem len: 76444   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.37
epis: 421   score: 0.0   mem len: 76566   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.36
epis: 422   score: 0.0   mem len: 76689   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.36
epis: 423   score: 0.0   mem len: 76811   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.32
epis: 424   score: 0.0   mem len: 76934   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.31
epis: 425   score: 0.0   mem len: 77056   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.29


epis: 499   score: 3.0   mem len: 90381   epsilon: 1.0    steps: 248    lr: 0.0001     reward: 1.29
epis: 500   score: 1.0   mem len: 90531   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.29
epis: 501   score: 0.0   mem len: 90653   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.29
epis: 502   score: 3.0   mem len: 90900   epsilon: 1.0    steps: 247    lr: 0.0001     reward: 1.32
epis: 503   score: 1.0   mem len: 91069   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.31
epis: 504   score: 1.0   mem len: 91240   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.32
epis: 505   score: 0.0   mem len: 91363   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.31
epis: 506   score: 4.0   mem len: 91622   epsilon: 1.0    steps: 259    lr: 0.0001     reward: 1.34
epis: 507   score: 3.0   mem len: 91868   epsilon: 1.0    steps: 246    lr: 0.0001     reward: 1.35
epis: 508   score: 0.0   mem len: 91991   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.35


  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()


epis: 554   score: 2.0   mem len: 100137   epsilon: 0.9997    steps: 197    lr: 0.0001     reward: 1.45
epis: 555   score: 0.0   mem len: 100260   epsilon: 0.9995    steps: 123    lr: 0.0001     reward: 1.42
epis: 556   score: 1.0   mem len: 100411   epsilon: 0.9992    steps: 151    lr: 0.0001     reward: 1.43
epis: 557   score: 4.0   mem len: 100687   epsilon: 0.9986    steps: 276    lr: 0.0001     reward: 1.45
epis: 558   score: 3.0   mem len: 100955   epsilon: 0.9981    steps: 268    lr: 0.0001     reward: 1.46
epis: 559   score: 0.0   mem len: 101078   epsilon: 0.9979    steps: 123    lr: 0.0001     reward: 1.44
epis: 560   score: 0.0   mem len: 101201   epsilon: 0.9976    steps: 123    lr: 0.0001     reward: 1.42
epis: 561   score: 2.0   mem len: 101419   epsilon: 0.9972    steps: 218    lr: 0.0001     reward: 1.43
epis: 562   score: 1.0   mem len: 101570   epsilon: 0.9969    steps: 151    lr: 0.0001     reward: 1.41
epis: 563   score: 2.0   mem len: 101767   epsilon: 0.9965    st

epis: 633   score: 2.0   mem len: 114790   epsilon: 0.9707    steps: 221    lr: 0.0001     reward: 1.55
epis: 634   score: 3.0   mem len: 115025   epsilon: 0.9702    steps: 235    lr: 0.0001     reward: 1.56
epis: 635   score: 1.0   mem len: 115193   epsilon: 0.9699    steps: 168    lr: 0.0001     reward: 1.56
epis: 636   score: 0.0   mem len: 115316   epsilon: 0.9697    steps: 123    lr: 0.0001     reward: 1.56
epis: 637   score: 3.0   mem len: 115584   epsilon: 0.9691    steps: 268    lr: 0.0001     reward: 1.59
epis: 638   score: 1.0   mem len: 115752   epsilon: 0.9688    steps: 168    lr: 0.0001     reward: 1.6
epis: 639   score: 2.0   mem len: 115949   epsilon: 0.9684    steps: 197    lr: 0.0001     reward: 1.59
epis: 640   score: 0.0   mem len: 116071   epsilon: 0.9682    steps: 122    lr: 0.0001     reward: 1.56
epis: 641   score: 1.0   mem len: 116242   epsilon: 0.9678    steps: 171    lr: 0.0001     reward: 1.54
epis: 642   score: 1.0   mem len: 116412   epsilon: 0.9675    ste

epis: 712   score: 3.0   mem len: 128316   epsilon: 0.9439    steps: 211    lr: 0.0001     reward: 1.31
epis: 713   score: 3.0   mem len: 128545   epsilon: 0.9435    steps: 229    lr: 0.0001     reward: 1.31
epis: 714   score: 1.0   mem len: 128695   epsilon: 0.9432    steps: 150    lr: 0.0001     reward: 1.32
epis: 715   score: 2.0   mem len: 128913   epsilon: 0.9428    steps: 218    lr: 0.0001     reward: 1.33
epis: 716   score: 2.0   mem len: 129111   epsilon: 0.9424    steps: 198    lr: 0.0001     reward: 1.34
epis: 717   score: 1.0   mem len: 129280   epsilon: 0.942    steps: 169    lr: 0.0001     reward: 1.32
epis: 718   score: 1.0   mem len: 129431   epsilon: 0.9417    steps: 151    lr: 0.0001     reward: 1.32
epis: 719   score: 2.0   mem len: 129648   epsilon: 0.9413    steps: 217    lr: 0.0001     reward: 1.3
epis: 720   score: 1.0   mem len: 129816   epsilon: 0.941    steps: 168    lr: 0.0001     reward: 1.27
epis: 721   score: 0.0   mem len: 129939   epsilon: 0.9407    steps

epis: 791   score: 2.0   mem len: 142463   epsilon: 0.9159    steps: 197    lr: 0.0001     reward: 1.26
epis: 792   score: 4.0   mem len: 142735   epsilon: 0.9154    steps: 272    lr: 0.0001     reward: 1.28
epis: 793   score: 1.0   mem len: 142905   epsilon: 0.915    steps: 170    lr: 0.0001     reward: 1.26
epis: 794   score: 2.0   mem len: 143103   epsilon: 0.9147    steps: 198    lr: 0.0001     reward: 1.27
epis: 795   score: 4.0   mem len: 143381   epsilon: 0.9141    steps: 278    lr: 0.0001     reward: 1.31
epis: 796   score: 1.0   mem len: 143532   epsilon: 0.9138    steps: 151    lr: 0.0001     reward: 1.32
epis: 797   score: 0.0   mem len: 143654   epsilon: 0.9136    steps: 122    lr: 0.0001     reward: 1.32
epis: 798   score: 0.0   mem len: 143776   epsilon: 0.9133    steps: 122    lr: 0.0001     reward: 1.32
epis: 799   score: 1.0   mem len: 143946   epsilon: 0.913    steps: 170    lr: 0.0001     reward: 1.33
epis: 800   score: 2.0   mem len: 144144   epsilon: 0.9126    step

epis: 870   score: 0.0   mem len: 158327   epsilon: 0.8845    steps: 122    lr: 0.0001     reward: 1.89
epis: 871   score: 2.0   mem len: 158543   epsilon: 0.8841    steps: 216    lr: 0.0001     reward: 1.91
epis: 872   score: 2.0   mem len: 158761   epsilon: 0.8837    steps: 218    lr: 0.0001     reward: 1.89
epis: 873   score: 1.0   mem len: 158933   epsilon: 0.8833    steps: 172    lr: 0.0001     reward: 1.88
epis: 874   score: 2.0   mem len: 159132   epsilon: 0.8829    steps: 199    lr: 0.0001     reward: 1.9
epis: 875   score: 2.0   mem len: 159313   epsilon: 0.8826    steps: 181    lr: 0.0001     reward: 1.92
epis: 876   score: 6.0   mem len: 159668   epsilon: 0.8819    steps: 355    lr: 0.0001     reward: 1.98
epis: 877   score: 4.0   mem len: 159919   epsilon: 0.8814    steps: 251    lr: 0.0001     reward: 2.02
epis: 878   score: 3.0   mem len: 160183   epsilon: 0.8808    steps: 264    lr: 0.0001     reward: 2.01
epis: 879   score: 3.0   mem len: 160451   epsilon: 0.8803    ste

epis: 949   score: 5.0   mem len: 174655   epsilon: 0.8522    steps: 306    lr: 0.0001     reward: 2.07
epis: 950   score: 3.0   mem len: 174881   epsilon: 0.8517    steps: 226    lr: 0.0001     reward: 2.08
epis: 951   score: 6.0   mem len: 175292   epsilon: 0.8509    steps: 411    lr: 0.0001     reward: 2.14
epis: 952   score: 3.0   mem len: 175539   epsilon: 0.8504    steps: 247    lr: 0.0001     reward: 2.16
epis: 953   score: 2.0   mem len: 175737   epsilon: 0.85    steps: 198    lr: 0.0001     reward: 2.16
epis: 954   score: 0.0   mem len: 175860   epsilon: 0.8498    steps: 123    lr: 0.0001     reward: 2.12
epis: 955   score: 3.0   mem len: 176086   epsilon: 0.8493    steps: 226    lr: 0.0001     reward: 2.11
epis: 956   score: 1.0   mem len: 176255   epsilon: 0.849    steps: 169    lr: 0.0001     reward: 2.08
epis: 957   score: 0.0   mem len: 176378   epsilon: 0.8488    steps: 123    lr: 0.0001     reward: 2.07
epis: 958   score: 2.0   mem len: 176577   epsilon: 0.8484    steps

epis: 1028   score: 5.0   mem len: 191693   epsilon: 0.8184    steps: 341    lr: 0.0001     reward: 2.26
epis: 1029   score: 2.0   mem len: 191891   epsilon: 0.8181    steps: 198    lr: 0.0001     reward: 2.24
epis: 1030   score: 1.0   mem len: 192059   epsilon: 0.8177    steps: 168    lr: 0.0001     reward: 2.23
epis: 1031   score: 6.0   mem len: 192380   epsilon: 0.8171    steps: 321    lr: 0.0001     reward: 2.28
epis: 1032   score: 2.0   mem len: 192579   epsilon: 0.8167    steps: 199    lr: 0.0001     reward: 2.29
epis: 1033   score: 2.0   mem len: 192777   epsilon: 0.8163    steps: 198    lr: 0.0001     reward: 2.31
epis: 1034   score: 0.0   mem len: 192900   epsilon: 0.8161    steps: 123    lr: 0.0001     reward: 2.29
epis: 1035   score: 3.0   mem len: 193146   epsilon: 0.8156    steps: 246    lr: 0.0001     reward: 2.3
epis: 1036   score: 0.0   mem len: 193269   epsilon: 0.8153    steps: 123    lr: 0.0001     reward: 2.3
epis: 1037   score: 1.0   mem len: 193420   epsilon: 0.81

epis: 1107   score: 6.0   mem len: 209831   epsilon: 0.7825    steps: 351    lr: 4e-05     reward: 2.6
epis: 1108   score: 3.0   mem len: 210062   epsilon: 0.7821    steps: 231    lr: 4e-05     reward: 2.59
epis: 1109   score: 2.0   mem len: 210243   epsilon: 0.7817    steps: 181    lr: 4e-05     reward: 2.59
epis: 1110   score: 4.0   mem len: 210521   epsilon: 0.7812    steps: 278    lr: 4e-05     reward: 2.63
epis: 1111   score: 4.0   mem len: 210813   epsilon: 0.7806    steps: 292    lr: 4e-05     reward: 2.65
epis: 1112   score: 3.0   mem len: 211038   epsilon: 0.7801    steps: 225    lr: 4e-05     reward: 2.65
epis: 1113   score: 8.0   mem len: 211506   epsilon: 0.7792    steps: 468    lr: 4e-05     reward: 2.73
epis: 1114   score: 2.0   mem len: 211723   epsilon: 0.7788    steps: 217    lr: 4e-05     reward: 2.7
epis: 1115   score: 7.0   mem len: 212115   epsilon: 0.778    steps: 392    lr: 4e-05     reward: 2.77
epis: 1116   score: 1.0   mem len: 212285   epsilon: 0.7777    step

epis: 1186   score: 2.0   mem len: 230906   epsilon: 0.7408    steps: 199    lr: 4e-05     reward: 3.57
epis: 1187   score: 2.0   mem len: 231088   epsilon: 0.7404    steps: 182    lr: 4e-05     reward: 3.55
epis: 1188   score: 8.0   mem len: 231520   epsilon: 0.7396    steps: 432    lr: 4e-05     reward: 3.61
epis: 1189   score: 4.0   mem len: 231812   epsilon: 0.739    steps: 292    lr: 4e-05     reward: 3.62
epis: 1190   score: 1.0   mem len: 231981   epsilon: 0.7387    steps: 169    lr: 4e-05     reward: 3.59
epis: 1191   score: 2.0   mem len: 232163   epsilon: 0.7383    steps: 182    lr: 4e-05     reward: 3.59
epis: 1192   score: 5.0   mem len: 232485   epsilon: 0.7377    steps: 322    lr: 4e-05     reward: 3.6
epis: 1193   score: 2.0   mem len: 232683   epsilon: 0.7373    steps: 198    lr: 4e-05     reward: 3.57
epis: 1194   score: 5.0   mem len: 233009   epsilon: 0.7366    steps: 326    lr: 4e-05     reward: 3.59
epis: 1195   score: 4.0   mem len: 233266   epsilon: 0.7361    ste

epis: 1265   score: 2.0   mem len: 252894   epsilon: 0.6973    steps: 197    lr: 4e-05     reward: 4.16
epis: 1266   score: 4.0   mem len: 253137   epsilon: 0.6968    steps: 243    lr: 4e-05     reward: 4.12
epis: 1267   score: 6.0   mem len: 253494   epsilon: 0.6961    steps: 357    lr: 4e-05     reward: 4.12
epis: 1268   score: 3.0   mem len: 253723   epsilon: 0.6956    steps: 229    lr: 4e-05     reward: 4.14
epis: 1269   score: 3.0   mem len: 253971   epsilon: 0.6951    steps: 248    lr: 4e-05     reward: 4.07
epis: 1270   score: 4.0   mem len: 254246   epsilon: 0.6946    steps: 275    lr: 4e-05     reward: 4.05
epis: 1271   score: 4.0   mem len: 254539   epsilon: 0.694    steps: 293    lr: 4e-05     reward: 4.05
epis: 1272   score: 3.0   mem len: 254788   epsilon: 0.6935    steps: 249    lr: 4e-05     reward: 4.0
epis: 1273   score: 2.0   mem len: 254986   epsilon: 0.6931    steps: 198    lr: 4e-05     reward: 3.98
epis: 1274   score: 3.0   mem len: 255230   epsilon: 0.6926    ste

epis: 1344   score: 7.0   mem len: 276119   epsilon: 0.6513    steps: 404    lr: 4e-05     reward: 4.47
epis: 1345   score: 8.0   mem len: 276558   epsilon: 0.6504    steps: 439    lr: 4e-05     reward: 4.52
epis: 1346   score: 11.0   mem len: 277034   epsilon: 0.6495    steps: 476    lr: 4e-05     reward: 4.62
epis: 1347   score: 2.0   mem len: 277232   epsilon: 0.6491    steps: 198    lr: 4e-05     reward: 4.6
epis: 1348   score: 3.0   mem len: 277477   epsilon: 0.6486    steps: 245    lr: 4e-05     reward: 4.57
epis: 1349   score: 4.0   mem len: 277718   epsilon: 0.6481    steps: 241    lr: 4e-05     reward: 4.56
epis: 1350   score: 6.0   mem len: 278091   epsilon: 0.6474    steps: 373    lr: 4e-05     reward: 4.56
epis: 1351   score: 11.0   mem len: 278516   epsilon: 0.6465    steps: 425    lr: 4e-05     reward: 4.63
epis: 1352   score: 7.0   mem len: 278888   epsilon: 0.6458    steps: 372    lr: 4e-05     reward: 4.65
epis: 1353   score: 2.0   mem len: 279086   epsilon: 0.6454    

epis: 1423   score: 4.0   mem len: 301566   epsilon: 0.6009    steps: 263    lr: 1.6e-05     reward: 5.16
epis: 1424   score: 7.0   mem len: 301923   epsilon: 0.6002    steps: 357    lr: 1.6e-05     reward: 5.19
epis: 1425   score: 6.0   mem len: 302279   epsilon: 0.5995    steps: 356    lr: 1.6e-05     reward: 5.22
epis: 1426   score: 6.0   mem len: 302671   epsilon: 0.5987    steps: 392    lr: 1.6e-05     reward: 5.26
epis: 1427   score: 6.0   mem len: 303033   epsilon: 0.598    steps: 362    lr: 1.6e-05     reward: 5.28
epis: 1428   score: 12.0   mem len: 303590   epsilon: 0.5969    steps: 557    lr: 1.6e-05     reward: 5.35
epis: 1429   score: 6.0   mem len: 304004   epsilon: 0.5961    steps: 414    lr: 1.6e-05     reward: 5.36
epis: 1430   score: 7.0   mem len: 304379   epsilon: 0.5953    steps: 375    lr: 1.6e-05     reward: 5.4
epis: 1431   score: 5.0   mem len: 304703   epsilon: 0.5947    steps: 324    lr: 1.6e-05     reward: 5.42
epis: 1432   score: 5.0   mem len: 305016   eps

epis: 1501   score: 5.0   mem len: 328204   epsilon: 0.5482    steps: 341    lr: 1.6e-05     reward: 5.73
epis: 1502   score: 6.0   mem len: 328532   epsilon: 0.5475    steps: 328    lr: 1.6e-05     reward: 5.73
epis: 1503   score: 3.0   mem len: 328761   epsilon: 0.5471    steps: 229    lr: 1.6e-05     reward: 5.74
epis: 1504   score: 6.0   mem len: 329125   epsilon: 0.5463    steps: 364    lr: 1.6e-05     reward: 5.72
epis: 1505   score: 6.0   mem len: 329502   epsilon: 0.5456    steps: 377    lr: 1.6e-05     reward: 5.72
epis: 1506   score: 5.0   mem len: 329817   epsilon: 0.545    steps: 315    lr: 1.6e-05     reward: 5.71
epis: 1507   score: 11.0   mem len: 330377   epsilon: 0.5439    steps: 560    lr: 1.6e-05     reward: 5.71
epis: 1508   score: 4.0   mem len: 330619   epsilon: 0.5434    steps: 242    lr: 1.6e-05     reward: 5.66
epis: 1509   score: 2.0   mem len: 330800   epsilon: 0.543    steps: 181    lr: 1.6e-05     reward: 5.63
epis: 1510   score: 10.0   mem len: 331319   ep

epis: 1579   score: 8.0   mem len: 358697   epsilon: 0.4878    steps: 427    lr: 1.6e-05     reward: 7.15
epis: 1580   score: 8.0   mem len: 359174   epsilon: 0.4868    steps: 477    lr: 1.6e-05     reward: 7.19
epis: 1581   score: 4.0   mem len: 359434   epsilon: 0.4863    steps: 260    lr: 1.6e-05     reward: 7.14
epis: 1582   score: 6.0   mem len: 359794   epsilon: 0.4856    steps: 360    lr: 1.6e-05     reward: 7.12
epis: 1583   score: 3.0   mem len: 360006   epsilon: 0.4852    steps: 212    lr: 1.6e-05     reward: 7.08
epis: 1584   score: 9.0   mem len: 360441   epsilon: 0.4843    steps: 435    lr: 1.6e-05     reward: 7.07
epis: 1585   score: 9.0   mem len: 360908   epsilon: 0.4834    steps: 467    lr: 1.6e-05     reward: 7.08
epis: 1586   score: 3.0   mem len: 361138   epsilon: 0.4829    steps: 230    lr: 1.6e-05     reward: 7.05
epis: 1587   score: 5.0   mem len: 361481   epsilon: 0.4823    steps: 343    lr: 1.6e-05     reward: 7.05
epis: 1588   score: 9.0   mem len: 361944   ep

epis: 1657   score: 7.0   mem len: 390673   epsilon: 0.4245    steps: 388    lr: 1.6e-05     reward: 7.71
epis: 1658   score: 8.0   mem len: 391102   epsilon: 0.4236    steps: 429    lr: 1.6e-05     reward: 7.72
epis: 1659   score: 12.0   mem len: 391537   epsilon: 0.4228    steps: 435    lr: 1.6e-05     reward: 7.76
epis: 1660   score: 8.0   mem len: 392014   epsilon: 0.4218    steps: 477    lr: 1.6e-05     reward: 7.79
epis: 1661   score: 6.0   mem len: 392389   epsilon: 0.4211    steps: 375    lr: 1.6e-05     reward: 7.81
epis: 1662   score: 12.0   mem len: 393023   epsilon: 0.4198    steps: 634    lr: 1.6e-05     reward: 7.78
epis: 1663   score: 9.0   mem len: 393470   epsilon: 0.4189    steps: 447    lr: 1.6e-05     reward: 7.79
epis: 1664   score: 9.0   mem len: 393944   epsilon: 0.418    steps: 474    lr: 1.6e-05     reward: 7.87
epis: 1665   score: 7.0   mem len: 394367   epsilon: 0.4172    steps: 423    lr: 1.6e-05     reward: 7.88
epis: 1666   score: 7.0   mem len: 394767   e

epis: 1735   score: 11.0   mem len: 425260   epsilon: 0.356    steps: 535    lr: 6.4e-06     reward: 8.59
epis: 1736   score: 14.0   mem len: 425918   epsilon: 0.3547    steps: 658    lr: 6.4e-06     reward: 8.7
epis: 1737   score: 11.0   mem len: 426432   epsilon: 0.3537    steps: 514    lr: 6.4e-06     reward: 8.68
epis: 1738   score: 9.0   mem len: 426902   epsilon: 0.3527    steps: 470    lr: 6.4e-06     reward: 8.6
epis: 1739   score: 11.0   mem len: 427408   epsilon: 0.3517    steps: 506    lr: 6.4e-06     reward: 8.64
epis: 1740   score: 7.0   mem len: 427783   epsilon: 0.351    steps: 375    lr: 6.4e-06     reward: 8.63
epis: 1741   score: 12.0   mem len: 428341   epsilon: 0.3499    steps: 558    lr: 6.4e-06     reward: 8.67
epis: 1742   score: 9.0   mem len: 428796   epsilon: 0.349    steps: 455    lr: 6.4e-06     reward: 8.69
epis: 1743   score: 6.0   mem len: 429175   epsilon: 0.3482    steps: 379    lr: 6.4e-06     reward: 8.69
epis: 1744   score: 7.0   mem len: 429583   ep

epis: 1813   score: 17.0   mem len: 461807   epsilon: 0.2836    steps: 702    lr: 6.4e-06     reward: 9.37
epis: 1814   score: 13.0   mem len: 462393   epsilon: 0.2825    steps: 586    lr: 6.4e-06     reward: 9.46
epis: 1815   score: 11.0   mem len: 462880   epsilon: 0.2815    steps: 487    lr: 6.4e-06     reward: 9.42
epis: 1816   score: 12.0   mem len: 463418   epsilon: 0.2804    steps: 538    lr: 6.4e-06     reward: 9.47
epis: 1817   score: 17.0   mem len: 464056   epsilon: 0.2792    steps: 638    lr: 6.4e-06     reward: 9.5
epis: 1818   score: 8.0   mem len: 464506   epsilon: 0.2783    steps: 450    lr: 6.4e-06     reward: 9.5
epis: 1819   score: 7.0   mem len: 464879   epsilon: 0.2775    steps: 373    lr: 6.4e-06     reward: 9.48
epis: 1820   score: 12.0   mem len: 465449   epsilon: 0.2764    steps: 570    lr: 6.4e-06     reward: 9.5
epis: 1821   score: 18.0   mem len: 466116   epsilon: 0.2751    steps: 667    lr: 6.4e-06     reward: 9.62
epis: 1822   score: 9.0   mem len: 466575 

epis: 1890   score: 6.0   mem len: 501224   epsilon: 0.2056    steps: 361    lr: 2.6e-06     reward: 10.83
epis: 1891   score: 17.0   mem len: 501859   epsilon: 0.2043    steps: 635    lr: 2.6e-06     reward: 10.87
epis: 1892   score: 18.0   mem len: 502576   epsilon: 0.2029    steps: 717    lr: 2.6e-06     reward: 10.95
epis: 1893   score: 8.0   mem len: 503009   epsilon: 0.202    steps: 433    lr: 2.6e-06     reward: 10.93
epis: 1894   score: 13.0   mem len: 503516   epsilon: 0.201    steps: 507    lr: 2.6e-06     reward: 10.98
epis: 1895   score: 7.0   mem len: 503927   epsilon: 0.2002    steps: 411    lr: 2.6e-06     reward: 10.95
epis: 1896   score: 11.0   mem len: 504516   epsilon: 0.1991    steps: 589    lr: 2.6e-06     reward: 10.89
epis: 1897   score: 15.0   mem len: 505091   epsilon: 0.1979    steps: 575    lr: 2.6e-06     reward: 10.99
epis: 1898   score: 16.0   mem len: 505637   epsilon: 0.1968    steps: 546    lr: 2.6e-06     reward: 11.06
epis: 1899   score: 13.0   mem le

epis: 1967   score: 11.0   mem len: 542000   epsilon: 0.1248    steps: 602    lr: 2.6e-06     reward: 11.42
epis: 1968   score: 13.0   mem len: 542651   epsilon: 0.1235    steps: 651    lr: 2.6e-06     reward: 11.47
epis: 1969   score: 11.0   mem len: 543199   epsilon: 0.1225    steps: 548    lr: 2.6e-06     reward: 11.4
epis: 1970   score: 16.0   mem len: 543806   epsilon: 0.1213    steps: 607    lr: 2.6e-06     reward: 11.44
epis: 1971   score: 13.0   mem len: 544433   epsilon: 0.12    steps: 627    lr: 2.6e-06     reward: 11.48
epis: 1972   score: 7.0   mem len: 544815   epsilon: 0.1193    steps: 382    lr: 2.6e-06     reward: 11.39
epis: 1973   score: 14.0   mem len: 545326   epsilon: 0.1183    steps: 511    lr: 2.6e-06     reward: 11.48
epis: 1974   score: 9.0   mem len: 545834   epsilon: 0.1172    steps: 508    lr: 2.6e-06     reward: 11.46
epis: 1975   score: 5.0   mem len: 546182   epsilon: 0.1166    steps: 348    lr: 2.6e-06     reward: 11.36
epis: 1976   score: 11.0   mem len

epis: 2044   score: 12.0   mem len: 582190   epsilon: 0.0453    steps: 526    lr: 2.6e-06     reward: 11.5
epis: 2045   score: 12.0   mem len: 582771   epsilon: 0.0441    steps: 581    lr: 2.6e-06     reward: 11.52
epis: 2046   score: 13.0   mem len: 583269   epsilon: 0.0431    steps: 498    lr: 2.6e-06     reward: 11.48
epis: 2047   score: 14.0   mem len: 583895   epsilon: 0.0419    steps: 626    lr: 2.6e-06     reward: 11.54
epis: 2048   score: 14.0   mem len: 584559   epsilon: 0.0406    steps: 664    lr: 2.6e-06     reward: 11.62
epis: 2049   score: 15.0   mem len: 585129   epsilon: 0.0394    steps: 570    lr: 2.6e-06     reward: 11.63
epis: 2050   score: 11.0   mem len: 585668   epsilon: 0.0384    steps: 539    lr: 2.6e-06     reward: 11.62
epis: 2051   score: 10.0   mem len: 586189   epsilon: 0.0373    steps: 521    lr: 2.6e-06     reward: 11.62
epis: 2052   score: 9.0   mem len: 586681   epsilon: 0.0364    steps: 492    lr: 2.6e-06     reward: 11.63
epis: 2053   score: 12.0   mem

epis: 2122   score: 16.0   mem len: 626901   epsilon: 0.01    steps: 605    lr: 1e-06     reward: 13.14
epis: 2123   score: 13.0   mem len: 627478   epsilon: 0.01    steps: 577    lr: 1e-06     reward: 13.13
epis: 2124   score: 9.0   mem len: 627949   epsilon: 0.01    steps: 471    lr: 1e-06     reward: 13.13
epis: 2125   score: 7.0   mem len: 628351   epsilon: 0.01    steps: 402    lr: 1e-06     reward: 13.11
epis: 2126   score: 15.0   mem len: 629015   epsilon: 0.01    steps: 664    lr: 1e-06     reward: 13.04
epis: 2127   score: 11.0   mem len: 629523   epsilon: 0.01    steps: 508    lr: 1e-06     reward: 13.04
epis: 2128   score: 15.0   mem len: 630132   epsilon: 0.01    steps: 609    lr: 1e-06     reward: 13.11
epis: 2129   score: 13.0   mem len: 630707   epsilon: 0.01    steps: 575    lr: 1e-06     reward: 13.17
epis: 2130   score: 8.0   mem len: 631181   epsilon: 0.01    steps: 474    lr: 1e-06     reward: 13.08
epis: 2131   score: 13.0   mem len: 631774   epsilon: 0.01    steps

epis: 2201   score: 12.0   mem len: 671690   epsilon: 0.01    steps: 545    lr: 1e-06     reward: 13.08
epis: 2202   score: 20.0   mem len: 672430   epsilon: 0.01    steps: 740    lr: 1e-06     reward: 13.11
epis: 2203   score: 10.0   mem len: 672985   epsilon: 0.01    steps: 555    lr: 1e-06     reward: 13.07
epis: 2204   score: 10.0   mem len: 673471   epsilon: 0.01    steps: 486    lr: 1e-06     reward: 12.99
epis: 2205   score: 13.0   mem len: 674078   epsilon: 0.01    steps: 607    lr: 1e-06     reward: 13.04
epis: 2206   score: 10.0   mem len: 674592   epsilon: 0.01    steps: 514    lr: 1e-06     reward: 12.91
epis: 2207   score: 18.0   mem len: 675139   epsilon: 0.01    steps: 547    lr: 1e-06     reward: 12.99
epis: 2208   score: 11.0   mem len: 675710   epsilon: 0.01    steps: 571    lr: 1e-06     reward: 12.93
epis: 2209   score: 8.0   mem len: 676142   epsilon: 0.01    steps: 432    lr: 1e-06     reward: 12.96
epis: 2210   score: 9.0   mem len: 676643   epsilon: 0.01    step

epis: 2280   score: 11.0   mem len: 716785   epsilon: 0.01    steps: 536    lr: 4e-07     reward: 12.92
epis: 2281   score: 7.0   mem len: 717196   epsilon: 0.01    steps: 411    lr: 4e-07     reward: 12.89
epis: 2282   score: 11.0   mem len: 717753   epsilon: 0.01    steps: 557    lr: 4e-07     reward: 12.83
epis: 2283   score: 16.0   mem len: 718309   epsilon: 0.01    steps: 556    lr: 4e-07     reward: 12.91
epis: 2284   score: 15.0   mem len: 718952   epsilon: 0.01    steps: 643    lr: 4e-07     reward: 12.98
epis: 2285   score: 12.0   mem len: 719558   epsilon: 0.01    steps: 606    lr: 4e-07     reward: 12.96
epis: 2286   score: 14.0   mem len: 720224   epsilon: 0.01    steps: 666    lr: 4e-07     reward: 12.9
epis: 2287   score: 12.0   mem len: 720839   epsilon: 0.01    steps: 615    lr: 4e-07     reward: 12.94
epis: 2288   score: 11.0   mem len: 721430   epsilon: 0.01    steps: 591    lr: 4e-07     reward: 12.94
epis: 2289   score: 13.0   mem len: 722076   epsilon: 0.01    step

epis: 2359   score: 12.0   mem len: 764069   epsilon: 0.01    steps: 577    lr: 4e-07     reward: 14.0
epis: 2360   score: 15.0   mem len: 764656   epsilon: 0.01    steps: 587    lr: 4e-07     reward: 14.03
epis: 2361   score: 15.0   mem len: 765237   epsilon: 0.01    steps: 581    lr: 4e-07     reward: 14.09
epis: 2362   score: 8.0   mem len: 765711   epsilon: 0.01    steps: 474    lr: 4e-07     reward: 14.02
epis: 2363   score: 15.0   mem len: 766315   epsilon: 0.01    steps: 604    lr: 4e-07     reward: 14.05
epis: 2364   score: 18.0   mem len: 767039   epsilon: 0.01    steps: 724    lr: 4e-07     reward: 14.09
epis: 2365   score: 22.0   mem len: 767773   epsilon: 0.01    steps: 734    lr: 4e-07     reward: 14.25
epis: 2366   score: 11.0   mem len: 768350   epsilon: 0.01    steps: 577    lr: 4e-07     reward: 14.24
epis: 2367   score: 13.0   mem len: 768883   epsilon: 0.01    steps: 533    lr: 4e-07     reward: 14.27
epis: 2368   score: 12.0   mem len: 769420   epsilon: 0.01    step

epis: 2438   score: 24.0   mem len: 810901   epsilon: 0.01    steps: 696    lr: 2e-07     reward: 13.92
epis: 2439   score: 15.0   mem len: 811482   epsilon: 0.01    steps: 581    lr: 2e-07     reward: 13.92
epis: 2440   score: 18.0   mem len: 812281   epsilon: 0.01    steps: 799    lr: 2e-07     reward: 13.99
epis: 2441   score: 13.0   mem len: 812810   epsilon: 0.01    steps: 529    lr: 2e-07     reward: 14.05
epis: 2442   score: 15.0   mem len: 813513   epsilon: 0.01    steps: 703    lr: 2e-07     reward: 14.0
epis: 2443   score: 20.0   mem len: 814248   epsilon: 0.01    steps: 735    lr: 2e-07     reward: 14.06
epis: 2444   score: 12.0   mem len: 814835   epsilon: 0.01    steps: 587    lr: 2e-07     reward: 14.12
epis: 2445   score: 19.0   mem len: 815578   epsilon: 0.01    steps: 743    lr: 2e-07     reward: 14.25
epis: 2446   score: 14.0   mem len: 816221   epsilon: 0.01    steps: 643    lr: 2e-07     reward: 14.17
epis: 2447   score: 13.0   mem len: 816816   epsilon: 0.01    ste

epis: 2517   score: 13.0   mem len: 857903   epsilon: 0.01    steps: 555    lr: 2e-07     reward: 13.82
epis: 2518   score: 19.0   mem len: 858591   epsilon: 0.01    steps: 688    lr: 2e-07     reward: 13.9
epis: 2519   score: 17.0   mem len: 859189   epsilon: 0.01    steps: 598    lr: 2e-07     reward: 13.91
epis: 2520   score: 11.0   mem len: 859752   epsilon: 0.01    steps: 563    lr: 2e-07     reward: 13.9
epis: 2521   score: 13.0   mem len: 860331   epsilon: 0.01    steps: 579    lr: 2e-07     reward: 13.92
epis: 2522   score: 10.0   mem len: 860828   epsilon: 0.01    steps: 497    lr: 2e-07     reward: 13.88
epis: 2523   score: 11.0   mem len: 861352   epsilon: 0.01    steps: 524    lr: 2e-07     reward: 13.84
epis: 2524   score: 10.0   mem len: 861891   epsilon: 0.01    steps: 539    lr: 2e-07     reward: 13.83
epis: 2525   score: 11.0   mem len: 862415   epsilon: 0.01    steps: 524    lr: 2e-07     reward: 13.8
epis: 2526   score: 12.0   mem len: 862984   epsilon: 0.01    steps

epis: 2596   score: 7.0   mem len: 904270   epsilon: 0.01    steps: 423    lr: 1e-07     reward: 13.55
epis: 2597   score: 7.0   mem len: 904693   epsilon: 0.01    steps: 423    lr: 1e-07     reward: 13.5
epis: 2598   score: 12.0   mem len: 905304   epsilon: 0.01    steps: 611    lr: 1e-07     reward: 13.49
epis: 2599   score: 15.0   mem len: 905986   epsilon: 0.01    steps: 682    lr: 1e-07     reward: 13.55
epis: 2600   score: 18.0   mem len: 906700   epsilon: 0.01    steps: 714    lr: 1e-07     reward: 13.58
epis: 2601   score: 8.0   mem len: 907151   epsilon: 0.01    steps: 451    lr: 1e-07     reward: 13.47
epis: 2602   score: 6.0   mem len: 907547   epsilon: 0.01    steps: 396    lr: 1e-07     reward: 13.37
epis: 2603   score: 8.0   mem len: 908022   epsilon: 0.01    steps: 475    lr: 1e-07     reward: 13.27
epis: 2604   score: 12.0   mem len: 908633   epsilon: 0.01    steps: 611    lr: 1e-07     reward: 13.28
epis: 2605   score: 12.0   mem len: 909142   epsilon: 0.01    steps: 5

epis: 2675   score: 17.0   mem len: 948706   epsilon: 0.01    steps: 586    lr: 1e-07     reward: 12.73
epis: 2676   score: 8.0   mem len: 949157   epsilon: 0.01    steps: 451    lr: 1e-07     reward: 12.67
epis: 2677   score: 13.0   mem len: 949693   epsilon: 0.01    steps: 536    lr: 1e-07     reward: 12.69
epis: 2678   score: 10.0   mem len: 950237   epsilon: 0.01    steps: 544    lr: 1e-07     reward: 12.63
epis: 2679   score: 9.0   mem len: 950693   epsilon: 0.01    steps: 456    lr: 1e-07     reward: 12.61
epis: 2680   score: 15.0   mem len: 951301   epsilon: 0.01    steps: 608    lr: 1e-07     reward: 12.55
epis: 2681   score: 13.0   mem len: 951917   epsilon: 0.01    steps: 616    lr: 1e-07     reward: 12.54
epis: 2682   score: 13.0   mem len: 952502   epsilon: 0.01    steps: 585    lr: 1e-07     reward: 12.56
epis: 2683   score: 9.0   mem len: 952989   epsilon: 0.01    steps: 487    lr: 1e-07     reward: 12.5
epis: 2684   score: 17.0   mem len: 953668   epsilon: 0.01    steps:

epis: 2755   score: 12.0   mem len: 993220   epsilon: 0.01    steps: 546    lr: 1e-07     reward: 12.67
epis: 2756   score: 12.0   mem len: 993766   epsilon: 0.01    steps: 546    lr: 1e-07     reward: 12.7
epis: 2757   score: 9.0   mem len: 994306   epsilon: 0.01    steps: 540    lr: 1e-07     reward: 12.66
epis: 2758   score: 18.0   mem len: 995086   epsilon: 0.01    steps: 780    lr: 1e-07     reward: 12.69
epis: 2759   score: 9.0   mem len: 995544   epsilon: 0.01    steps: 458    lr: 1e-07     reward: 12.53
epis: 2760   score: 12.0   mem len: 996101   epsilon: 0.01    steps: 557    lr: 1e-07     reward: 12.52
epis: 2761   score: 7.0   mem len: 996491   epsilon: 0.01    steps: 390    lr: 1e-07     reward: 12.5
epis: 2762   score: 15.0   mem len: 997060   epsilon: 0.01    steps: 569    lr: 1e-07     reward: 12.53
epis: 2763   score: 15.0   mem len: 997609   epsilon: 0.01    steps: 549    lr: 1e-07     reward: 12.57
epis: 2764   score: 8.0   mem len: 998061   epsilon: 0.01    steps: 4

epis: 2835   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 477    lr: 0.0     reward: 13.55
epis: 2836   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 464    lr: 0.0     reward: 13.51
epis: 2837   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 436    lr: 0.0     reward: 13.45
epis: 2838   score: 15.0   mem len: 1000000   epsilon: 0.01    steps: 569    lr: 0.0     reward: 13.47
epis: 2839   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 589    lr: 0.0     reward: 13.45
epis: 2840   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 637    lr: 0.0     reward: 13.44
epis: 2841   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 637    lr: 0.0     reward: 13.42
epis: 2842   score: 10.0   mem len: 1000000   epsilon: 0.01    steps: 489    lr: 0.0     reward: 13.43
epis: 2843   score: 15.0   mem len: 1000000   epsilon: 0.01    steps: 713    lr: 0.0     reward: 13.43
epis: 2844   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 457   

epis: 2915   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 457    lr: 0.0     reward: 12.21
epis: 2916   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 538    lr: 0.0     reward: 12.15
epis: 2917   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 538    lr: 0.0     reward: 12.15
epis: 2918   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 507    lr: 0.0     reward: 12.06
epis: 2919   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 538    lr: 0.0     reward: 12.08
epis: 2920   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 416    lr: 0.0     reward: 12.06
epis: 2921   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 557    lr: 0.0     reward: 11.96
epis: 2922   score: 21.0   mem len: 1000000   epsilon: 0.01    steps: 725    lr: 0.0     reward: 12.06
epis: 2923   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 455    lr: 0.0     reward: 11.98
epis: 2924   score: 10.0   mem len: 1000000   epsilon: 0.01    steps: 449   

epis: 2995   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 639    lr: 0.0     reward: 11.23
epis: 2996   score: 21.0   mem len: 1000000   epsilon: 0.01    steps: 666    lr: 0.0     reward: 11.36
epis: 2997   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 637    lr: 0.0     reward: 11.37
epis: 2998   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 457    lr: 0.0     reward: 11.36
epis: 2999   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 436    lr: 0.0     reward: 11.35
epis: 3000   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 595    lr: 0.0     reward: 11.36
epis: 3001   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 628    lr: 0.0     reward: 11.36
epis: 3002   score: 10.0   mem len: 1000000   epsilon: 0.01    steps: 479    lr: 0.0     reward: 11.35
epis: 3003   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 515    lr: 0.0     reward: 11.38
epis: 3004   score: 16.0   mem len: 1000000   epsilon: 0.01    steps: 640  

# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
torch.save(agent.policy_net, "./save_model/breakout_dqn_latest.pth")

In [None]:
from gym.wrappers import RecordVideo # If importing monitor raises issues, try using `from gym.wrappers import RecordVideo`
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("%s | Step: %d %s" % ("Agent Playing",step, info))
    plt.axis('off')

    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())
    
# Recording the game and replaying the game afterwards
def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
    

def wrap_env(env):
    env = RecordVideo(env, './video')
    return env

In [None]:
display = Display(visible=0, size=(300, 200))
display.start()

# Load agent
# agent.load_policy_net("./save_model/breakout_dqn.pth")
agent.epsilon = 0.0 # Set agent to only exploit the best action

env = gym.make('BreakoutDeterministic-v4')
env = wrap_env(env)

done = False
score = 0
step = 0
state = env.reset()
next_state = state
life = number_lives
history = np.zeros([5, 84, 84], dtype=np.uint8)
get_init_state(history, state)

while not done:
    
    # Render breakout
    env.render()
#     show_state(env,step) # uncommenting this provides another way to visualize the game

    step += 1
    frame += 1

    # Perform a fire action if ball is no longer on screen
    if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
        action = 0
    else:
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
    state = next_state
    
    next_state, reward, done, _, info = env.step(action + 1)
        
    frame_next_state = get_frame(next_state)
    history[4, :, :] = frame_next_state
    terminal_state = check_live(life, info['ale.lives'])
        
    life = info['ale.lives']
    r = np.clip(reward, -1, 1) 
    r = reward

    # Store the transition in memory 
    agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
    # Start training after random sample generation
    score += reward
    
    history[:4, :, :] = history[1:, :, :]
env.close()
show_video()
display.stop()