# Deep Q-Learning 

Install dependencies for AI gym to run properly (shouldn't take more than a minute). If running on google cloud or running locally, only need to run once. Colab may require installing everytime the vm shuts down.

In [None]:
!pip3 install gym pyvirtualdisplay
!sudo apt-get install -y xvfb python-opengl ffmpeg

In [None]:
!pip3 install --upgrade setuptools --user
!pip3 install ez_setup 
!pip3 install gym[atari] 
!pip3 install gym[accept-rom-license] 

For this assignment we will implement the Deep Q-Learning algorithm with Experience Replay as described in breakthrough paper __"Playing Atari with Deep Reinforcement Learning"__. We will train an agent to play the famous game of __Breakout__.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import sys
import gym
import torch
import pylab
import random
import numpy as np
from collections import deque
from datetime import datetime
from copy import deepcopy
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from utils import find_max_lives, check_live, get_frame, get_init_state
from model import DQN
from config import *

import matplotlib.pyplot as plt

## Understanding the environment

In the following cell, we initialize our game of __Breakout__ and you can see how the environment looks like. For further documentation of the of the environment refer to https://www.gymlibrary.dev/environments/atari/breakout/. 

In breakout, we will use 3 actions "fire", "left", and "right". "fire" is only used to reset the game when a life is lost, "left" moves the agent left and "right" moves the agent right.

In [3]:
env = gym.make('BreakoutDeterministic-v4')
state = env.reset()

A.L.E: Arcade Learning Environment (version 0.8.1+53f58b7)
[Powered by Stella]


In [4]:
number_lives = find_max_lives(env)
state_size = env.observation_space.shape
action_size = 3 #fire, left, and right

## Creating a DQN Agent

Here we create a DQN Agent. This agent is defined in the __agent.py__. The corresponding neural network is defined in the __model.py__. Once you've created a working DQN agent, use the code in agent.py to create a double DQN agent in __agent_double.py__. Set the flag "double_dqn" to True to train the double DQN agent.

__Evaluation Reward__ : The average reward received in the past 100 episodes/games.

__Frame__ : Number of frames processed in total.

__Memory Size__ : The current size of the replay memory.

In [5]:
double_dqn = True # set to True if using double DQN agent

if double_dqn:
    from agent_double import Agent
else:
    from agent import Agent

agent = Agent(action_size)
evaluation_reward = deque(maxlen=evaluation_reward_length)
frame = 0
memory_size = 0

### Main Training Loop

In this training loop, we do not render the screen because it slows down training signficantly. To watch the agent play the game, run the code in next section "Visualize Agent Performance"

In [None]:
rewards, episodes = [], []
best_eval_reward = 0
for e in range(EPISODES):
    done = False
    score = 0

    history = np.zeros([5, 84, 84], dtype=np.uint8)
    step = 0
    state = env.reset()
    next_state = state
    life = number_lives

    get_init_state(history, state, HISTORY_SIZE)

    while not done:
        step += 1
        frame += 1

        # Perform a fire action if ball is no longer on screen to continue onto next life
        if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
            action = 0
        else:
            action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
        state = next_state
        next_state, reward, done, _, info = env.step(action + 1)
        
        frame_next_state = get_frame(next_state)
        history[4, :, :] = frame_next_state
        terminal_state = check_live(life, info['lives'])

        life = info['lives']
        r = reward

        # Store the transition in memory 
        agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
        # Start training after random sample generation
        if(frame >= train_frame):
            agent.train_policy_net(frame)
            # Update the target network only for Double DQN only
            if double_dqn and (frame % update_target_network_frequency)== 0:
                agent.update_target_net()
        score += reward
        history[:4, :, :] = history[1:, :, :]
            
        if done:
            evaluation_reward.append(score)
            rewards.append(np.mean(evaluation_reward))
            episodes.append(e)
            pylab.plot(episodes, rewards, 'b')
            pylab.xlabel('Episodes')
            pylab.ylabel('Rewards') 
            pylab.title('Episodes vs Reward')
            pylab.savefig("./save_graph/breakout_double_dqn.png") # save graph for training visualization
            
            # every episode, plot the play time
            print("epis:", e, "  score:", score, "  mem len:",
                  len(agent.memory), "  epsilon:", round(agent.epsilon, 4), "   steps:", step,
                  "   lr:", round(agent.optimizer.param_groups[0]['lr'], 7), "    reward:", round(np.mean(evaluation_reward), 2))

            # if the mean of scores of last 100 episode is bigger than 5 save model
            ### Change this save condition to whatever you prefer ###
            if np.mean(evaluation_reward) > 5 and np.mean(evaluation_reward) > best_eval_reward:
                torch.save(agent.policy_net, "./save_model/breakout_double_dqn.pth")
                best_eval_reward = np.mean(evaluation_reward)


  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


epis: 0   score: 1.0   mem len: 151   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.0
epis: 1   score: 1.0   mem len: 322   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.0
epis: 2   score: 0.0   mem len: 445   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 0.67
epis: 3   score: 3.0   mem len: 691   epsilon: 1.0    steps: 246    lr: 0.0001     reward: 1.25
epis: 4   score: 0.0   mem len: 814   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.0
epis: 5   score: 0.0   mem len: 937   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 0.83
epis: 6   score: 3.0   mem len: 1163   epsilon: 1.0    steps: 226    lr: 0.0001     reward: 1.14
epis: 7   score: 1.0   mem len: 1332   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.12
epis: 8   score: 3.0   mem len: 1598   epsilon: 1.0    steps: 266    lr: 0.0001     reward: 1.33
epis: 9   score: 0.0   mem len: 1720   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.2
epis: 10   score: 1.0   mem len: 1871   

epis: 84   score: 2.0   mem len: 16851   epsilon: 1.0    steps: 200    lr: 0.0001     reward: 1.81
epis: 85   score: 2.0   mem len: 17070   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.81
epis: 86   score: 1.0   mem len: 17239   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.8
epis: 87   score: 0.0   mem len: 17362   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.78
epis: 88   score: 4.0   mem len: 17603   epsilon: 1.0    steps: 241    lr: 0.0001     reward: 1.81
epis: 89   score: 2.0   mem len: 17819   epsilon: 1.0    steps: 216    lr: 0.0001     reward: 1.81
epis: 90   score: 1.0   mem len: 17990   epsilon: 1.0    steps: 171    lr: 0.0001     reward: 1.8
epis: 91   score: 0.0   mem len: 18112   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.78
epis: 92   score: 2.0   mem len: 18331   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.78
epis: 93   score: 1.0   mem len: 18482   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.78
epis: 94   s

epis: 167   score: 1.0   mem len: 31955   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.6
epis: 168   score: 0.0   mem len: 32077   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.58
epis: 169   score: 2.0   mem len: 32296   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.57
epis: 170   score: 1.0   mem len: 32465   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.52
epis: 171   score: 2.0   mem len: 32680   epsilon: 1.0    steps: 215    lr: 0.0001     reward: 1.53
epis: 172   score: 0.0   mem len: 32803   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.5
epis: 173   score: 2.0   mem len: 33023   epsilon: 1.0    steps: 220    lr: 0.0001     reward: 1.49
epis: 174   score: 2.0   mem len: 33220   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.5
epis: 175   score: 3.0   mem len: 33445   epsilon: 1.0    steps: 225    lr: 0.0001     reward: 1.51
epis: 176   score: 2.0   mem len: 33642   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.53
epi

epis: 249   score: 2.0   mem len: 46928   epsilon: 1.0    steps: 219    lr: 0.0001     reward: 1.44
epis: 250   score: 3.0   mem len: 47155   epsilon: 1.0    steps: 227    lr: 0.0001     reward: 1.45
epis: 251   score: 1.0   mem len: 47305   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.46
epis: 252   score: 1.0   mem len: 47474   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.46
epis: 253   score: 1.0   mem len: 47643   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.45
epis: 254   score: 1.0   mem len: 47812   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.46
epis: 255   score: 0.0   mem len: 47935   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.44
epis: 256   score: 2.0   mem len: 48135   epsilon: 1.0    steps: 200    lr: 0.0001     reward: 1.46
epis: 257   score: 2.0   mem len: 48333   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.47
epis: 258   score: 1.0   mem len: 48503   epsilon: 1.0    steps: 170    lr: 0.0001     reward: 1.47


epis: 331   score: 2.0   mem len: 61770   epsilon: 1.0    steps: 223    lr: 0.0001     reward: 1.37
epis: 332   score: 0.0   mem len: 61892   epsilon: 1.0    steps: 122    lr: 0.0001     reward: 1.37
epis: 333   score: 2.0   mem len: 62090   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.38
epis: 334   score: 2.0   mem len: 62287   epsilon: 1.0    steps: 197    lr: 0.0001     reward: 1.39
epis: 335   score: 0.0   mem len: 62410   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.36
epis: 336   score: 1.0   mem len: 62561   epsilon: 1.0    steps: 151    lr: 0.0001     reward: 1.37
epis: 337   score: 5.0   mem len: 62887   epsilon: 1.0    steps: 326    lr: 0.0001     reward: 1.41
epis: 338   score: 1.0   mem len: 63056   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.4
epis: 339   score: 1.0   mem len: 63206   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.4
epis: 340   score: 0.0   mem len: 63329   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.39
ep

epis: 414   score: 1.0   mem len: 76806   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.42
epis: 415   score: 5.0   mem len: 77153   epsilon: 1.0    steps: 347    lr: 0.0001     reward: 1.47
epis: 416   score: 0.0   mem len: 77276   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.47
epis: 417   score: 4.0   mem len: 77573   epsilon: 1.0    steps: 297    lr: 0.0001     reward: 1.48
epis: 418   score: 1.0   mem len: 77741   epsilon: 1.0    steps: 168    lr: 0.0001     reward: 1.48
epis: 419   score: 0.0   mem len: 77864   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.44
epis: 420   score: 4.0   mem len: 78157   epsilon: 1.0    steps: 293    lr: 0.0001     reward: 1.48
epis: 421   score: 0.0   mem len: 78280   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.43
epis: 422   score: 0.0   mem len: 78403   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.43
epis: 423   score: 0.0   mem len: 78526   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.4
e

epis: 497   score: 1.0   mem len: 92097   epsilon: 1.0    steps: 150    lr: 0.0001     reward: 1.56
epis: 498   score: 0.0   mem len: 92220   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.54
epis: 499   score: 2.0   mem len: 92418   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.56
epis: 500   score: 1.0   mem len: 92587   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.54
epis: 501   score: 4.0   mem len: 92882   epsilon: 1.0    steps: 295    lr: 0.0001     reward: 1.57
epis: 502   score: 0.0   mem len: 93005   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.55
epis: 503   score: 2.0   mem len: 93203   epsilon: 1.0    steps: 198    lr: 0.0001     reward: 1.56
epis: 504   score: 0.0   mem len: 93326   epsilon: 1.0    steps: 123    lr: 0.0001     reward: 1.56
epis: 505   score: 1.0   mem len: 93495   epsilon: 1.0    steps: 169    lr: 0.0001     reward: 1.57
epis: 506   score: 2.0   mem len: 93677   epsilon: 1.0    steps: 182    lr: 0.0001     reward: 1.52


  sample = np.array(sample)
  mini_batch = np.array(mini_batch).transpose()
  next_q_values[mask] = self.target_net(non_final_next_states).gather(1, next_action.unsqueeze(1)).cuda()[mask]


epis: 541   score: 0.0   mem len: 100100   epsilon: 0.9998    steps: 123    lr: 0.0001     reward: 1.52
epis: 542   score: 1.0   mem len: 100251   epsilon: 0.9995    steps: 151    lr: 0.0001     reward: 1.52
epis: 543   score: 0.0   mem len: 100374   epsilon: 0.9993    steps: 123    lr: 0.0001     reward: 1.5
epis: 544   score: 1.0   mem len: 100525   epsilon: 0.999    steps: 151    lr: 0.0001     reward: 1.51
epis: 545   score: 1.0   mem len: 100693   epsilon: 0.9986    steps: 168    lr: 0.0001     reward: 1.51
epis: 546   score: 1.0   mem len: 100862   epsilon: 0.9983    steps: 169    lr: 0.0001     reward: 1.49
epis: 547   score: 3.0   mem len: 101108   epsilon: 0.9978    steps: 246    lr: 0.0001     reward: 1.51
epis: 548   score: 1.0   mem len: 101277   epsilon: 0.9975    steps: 169    lr: 0.0001     reward: 1.48
epis: 549   score: 1.0   mem len: 101428   epsilon: 0.9972    steps: 151    lr: 0.0001     reward: 1.48
epis: 550   score: 1.0   mem len: 101579   epsilon: 0.9969    step

epis: 620   score: 3.0   mem len: 114181   epsilon: 0.9719    steps: 244    lr: 0.0001     reward: 1.47
epis: 621   score: 0.0   mem len: 114304   epsilon: 0.9717    steps: 123    lr: 0.0001     reward: 1.47
epis: 622   score: 0.0   mem len: 114427   epsilon: 0.9714    steps: 123    lr: 0.0001     reward: 1.46
epis: 623   score: 3.0   mem len: 114677   epsilon: 0.9709    steps: 250    lr: 0.0001     reward: 1.47
epis: 624   score: 4.0   mem len: 114977   epsilon: 0.9703    steps: 300    lr: 0.0001     reward: 1.51
epis: 625   score: 2.0   mem len: 115178   epsilon: 0.9699    steps: 201    lr: 0.0001     reward: 1.52
epis: 626   score: 0.0   mem len: 115301   epsilon: 0.9697    steps: 123    lr: 0.0001     reward: 1.5
epis: 627   score: 0.0   mem len: 115424   epsilon: 0.9695    steps: 123    lr: 0.0001     reward: 1.48
epis: 628   score: 1.0   mem len: 115594   epsilon: 0.9691    steps: 170    lr: 0.0001     reward: 1.46
epis: 629   score: 4.0   mem len: 115888   epsilon: 0.9685    ste

epis: 699   score: 0.0   mem len: 129078   epsilon: 0.9424    steps: 122    lr: 0.0001     reward: 1.54
epis: 700   score: 2.0   mem len: 129277   epsilon: 0.942    steps: 199    lr: 0.0001     reward: 1.55
epis: 701   score: 0.0   mem len: 129400   epsilon: 0.9418    steps: 123    lr: 0.0001     reward: 1.55
epis: 702   score: 0.0   mem len: 129523   epsilon: 0.9415    steps: 123    lr: 0.0001     reward: 1.54
epis: 703   score: 1.0   mem len: 129674   epsilon: 0.9412    steps: 151    lr: 0.0001     reward: 1.55
epis: 704   score: 0.0   mem len: 129797   epsilon: 0.941    steps: 123    lr: 0.0001     reward: 1.55
epis: 705   score: 1.0   mem len: 129948   epsilon: 0.9407    steps: 151    lr: 0.0001     reward: 1.55
epis: 706   score: 1.0   mem len: 130118   epsilon: 0.9404    steps: 170    lr: 0.0001     reward: 1.51
epis: 707   score: 1.0   mem len: 130289   epsilon: 0.94    steps: 171    lr: 0.0001     reward: 1.47
epis: 708   score: 5.0   mem len: 130609   epsilon: 0.9394    steps:

epis: 778   score: 0.0   mem len: 143503   epsilon: 0.9139    steps: 123    lr: 0.0001     reward: 1.64
epis: 779   score: 3.0   mem len: 143749   epsilon: 0.9134    steps: 246    lr: 0.0001     reward: 1.62
epis: 780   score: 1.0   mem len: 143918   epsilon: 0.913    steps: 169    lr: 0.0001     reward: 1.62
epis: 781   score: 2.0   mem len: 144136   epsilon: 0.9126    steps: 218    lr: 0.0001     reward: 1.62
epis: 782   score: 1.0   mem len: 144287   epsilon: 0.9123    steps: 151    lr: 0.0001     reward: 1.62
epis: 783   score: 3.0   mem len: 144533   epsilon: 0.9118    steps: 246    lr: 0.0001     reward: 1.63
epis: 784   score: 3.0   mem len: 144760   epsilon: 0.9114    steps: 227    lr: 0.0001     reward: 1.62
epis: 785   score: 4.0   mem len: 145055   epsilon: 0.9108    steps: 295    lr: 0.0001     reward: 1.64
epis: 786   score: 3.0   mem len: 145300   epsilon: 0.9103    steps: 245    lr: 0.0001     reward: 1.65
epis: 787   score: 0.0   mem len: 145422   epsilon: 0.9101    ste

epis: 857   score: 0.0   mem len: 158194   epsilon: 0.8848    steps: 123    lr: 0.0001     reward: 1.55
epis: 858   score: 0.0   mem len: 158316   epsilon: 0.8845    steps: 122    lr: 0.0001     reward: 1.55
epis: 859   score: 0.0   mem len: 158439   epsilon: 0.8843    steps: 123    lr: 0.0001     reward: 1.53
epis: 860   score: 3.0   mem len: 158684   epsilon: 0.8838    steps: 245    lr: 0.0001     reward: 1.53
epis: 861   score: 2.0   mem len: 158866   epsilon: 0.8834    steps: 182    lr: 0.0001     reward: 1.54
epis: 862   score: 3.0   mem len: 159115   epsilon: 0.883    steps: 249    lr: 0.0001     reward: 1.55
epis: 863   score: 3.0   mem len: 159384   epsilon: 0.8824    steps: 269    lr: 0.0001     reward: 1.58
epis: 864   score: 0.0   mem len: 159506   epsilon: 0.8822    steps: 122    lr: 0.0001     reward: 1.56
epis: 865   score: 1.0   mem len: 159674   epsilon: 0.8818    steps: 168    lr: 0.0001     reward: 1.57
epis: 866   score: 2.0   mem len: 159893   epsilon: 0.8814    ste

epis: 936   score: 2.0   mem len: 174699   epsilon: 0.8521    steps: 182    lr: 0.0001     reward: 2.1
epis: 937   score: 3.0   mem len: 174946   epsilon: 0.8516    steps: 247    lr: 0.0001     reward: 2.1
epis: 938   score: 2.0   mem len: 175147   epsilon: 0.8512    steps: 201    lr: 0.0001     reward: 2.11
epis: 939   score: 3.0   mem len: 175391   epsilon: 0.8507    steps: 244    lr: 0.0001     reward: 2.13
epis: 940   score: 7.0   mem len: 175801   epsilon: 0.8499    steps: 410    lr: 0.0001     reward: 2.2
epis: 941   score: 5.0   mem len: 176114   epsilon: 0.8493    steps: 313    lr: 0.0001     reward: 2.21
epis: 942   score: 2.0   mem len: 176336   epsilon: 0.8489    steps: 222    lr: 0.0001     reward: 2.22
epis: 943   score: 0.0   mem len: 176459   epsilon: 0.8486    steps: 123    lr: 0.0001     reward: 2.21
epis: 944   score: 2.0   mem len: 176639   epsilon: 0.8483    steps: 180    lr: 0.0001     reward: 2.2
epis: 945   score: 3.0   mem len: 176883   epsilon: 0.8478    steps:

epis: 1015   score: 2.0   mem len: 191983   epsilon: 0.8179    steps: 200    lr: 0.0001     reward: 2.35
epis: 1016   score: 1.0   mem len: 192153   epsilon: 0.8175    steps: 170    lr: 0.0001     reward: 2.35
epis: 1017   score: 1.0   mem len: 192321   epsilon: 0.8172    steps: 168    lr: 0.0001     reward: 2.35
epis: 1018   score: 4.0   mem len: 192631   epsilon: 0.8166    steps: 310    lr: 0.0001     reward: 2.35
epis: 1019   score: 4.0   mem len: 192947   epsilon: 0.816    steps: 316    lr: 0.0001     reward: 2.35
epis: 1020   score: 1.0   mem len: 193118   epsilon: 0.8156    steps: 171    lr: 0.0001     reward: 2.36
epis: 1021   score: 3.0   mem len: 193345   epsilon: 0.8152    steps: 227    lr: 0.0001     reward: 2.36
epis: 1022   score: 1.0   mem len: 193496   epsilon: 0.8149    steps: 151    lr: 0.0001     reward: 2.35
epis: 1023   score: 2.0   mem len: 193694   epsilon: 0.8145    steps: 198    lr: 0.0001     reward: 2.34
epis: 1024   score: 2.0   mem len: 193892   epsilon: 0.8

epis: 1094   score: 2.0   mem len: 209306   epsilon: 0.7836    steps: 198    lr: 4e-05     reward: 2.47
epis: 1095   score: 1.0   mem len: 209457   epsilon: 0.7833    steps: 151    lr: 4e-05     reward: 2.44
epis: 1096   score: 1.0   mem len: 209626   epsilon: 0.7829    steps: 169    lr: 4e-05     reward: 2.45
epis: 1097   score: 7.0   mem len: 210039   epsilon: 0.7821    steps: 413    lr: 4e-05     reward: 2.52
epis: 1098   score: 1.0   mem len: 210209   epsilon: 0.7818    steps: 170    lr: 4e-05     reward: 2.53
epis: 1099   score: 3.0   mem len: 210439   epsilon: 0.7813    steps: 230    lr: 4e-05     reward: 2.49
epis: 1100   score: 3.0   mem len: 210684   epsilon: 0.7808    steps: 245    lr: 4e-05     reward: 2.5
epis: 1101   score: 1.0   mem len: 210852   epsilon: 0.7805    steps: 168    lr: 4e-05     reward: 2.49
epis: 1102   score: 3.0   mem len: 211100   epsilon: 0.78    steps: 248    lr: 4e-05     reward: 2.5
epis: 1103   score: 2.0   mem len: 211300   epsilon: 0.7796    steps

epis: 1173   score: 4.0   mem len: 227927   epsilon: 0.7467    steps: 274    lr: 4e-05     reward: 2.82
epis: 1174   score: 2.0   mem len: 228126   epsilon: 0.7463    steps: 199    lr: 4e-05     reward: 2.82
epis: 1175   score: 5.0   mem len: 228472   epsilon: 0.7456    steps: 346    lr: 4e-05     reward: 2.84
epis: 1176   score: 3.0   mem len: 228716   epsilon: 0.7451    steps: 244    lr: 4e-05     reward: 2.83
epis: 1177   score: 4.0   mem len: 229012   epsilon: 0.7446    steps: 296    lr: 4e-05     reward: 2.84
epis: 1178   score: 5.0   mem len: 229354   epsilon: 0.7439    steps: 342    lr: 4e-05     reward: 2.85
epis: 1179   score: 4.0   mem len: 229630   epsilon: 0.7433    steps: 276    lr: 4e-05     reward: 2.88
epis: 1180   score: 3.0   mem len: 229856   epsilon: 0.7429    steps: 226    lr: 4e-05     reward: 2.9
epis: 1181   score: 4.0   mem len: 230154   epsilon: 0.7423    steps: 298    lr: 4e-05     reward: 2.91
epis: 1182   score: 3.0   mem len: 230400   epsilon: 0.7418    st

epis: 1252   score: 4.0   mem len: 247564   epsilon: 0.7078    steps: 278    lr: 4e-05     reward: 3.32
epis: 1253   score: 5.0   mem len: 247908   epsilon: 0.7071    steps: 344    lr: 4e-05     reward: 3.34
epis: 1254   score: 0.0   mem len: 248031   epsilon: 0.7069    steps: 123    lr: 4e-05     reward: 3.34
epis: 1255   score: 1.0   mem len: 248181   epsilon: 0.7066    steps: 150    lr: 4e-05     reward: 3.34
epis: 1256   score: 4.0   mem len: 248476   epsilon: 0.706    steps: 295    lr: 4e-05     reward: 3.35
epis: 1257   score: 0.0   mem len: 248598   epsilon: 0.7058    steps: 122    lr: 4e-05     reward: 3.33
epis: 1258   score: 1.0   mem len: 248768   epsilon: 0.7054    steps: 170    lr: 4e-05     reward: 3.29
epis: 1259   score: 3.0   mem len: 248993   epsilon: 0.705    steps: 225    lr: 4e-05     reward: 3.3
epis: 1260   score: 3.0   mem len: 249219   epsilon: 0.7045    steps: 226    lr: 4e-05     reward: 3.29
epis: 1261   score: 1.0   mem len: 249388   epsilon: 0.7042    step

epis: 1331   score: 4.0   mem len: 266101   epsilon: 0.6711    steps: 256    lr: 4e-05     reward: 3.04
epis: 1332   score: 5.0   mem len: 266408   epsilon: 0.6705    steps: 307    lr: 4e-05     reward: 3.08
epis: 1333   score: 3.0   mem len: 266657   epsilon: 0.67    steps: 249    lr: 4e-05     reward: 3.07
epis: 1334   score: 4.0   mem len: 266953   epsilon: 0.6694    steps: 296    lr: 4e-05     reward: 3.09
epis: 1335   score: 3.0   mem len: 267162   epsilon: 0.669    steps: 209    lr: 4e-05     reward: 3.09
epis: 1336   score: 2.0   mem len: 267380   epsilon: 0.6686    steps: 218    lr: 4e-05     reward: 3.05
epis: 1337   score: 2.0   mem len: 267577   epsilon: 0.6682    steps: 197    lr: 4e-05     reward: 3.01
epis: 1338   score: 3.0   mem len: 267790   epsilon: 0.6678    steps: 213    lr: 4e-05     reward: 2.99
epis: 1339   score: 5.0   mem len: 268126   epsilon: 0.6671    steps: 336    lr: 4e-05     reward: 3.03
epis: 1340   score: 3.0   mem len: 268371   epsilon: 0.6666    step

epis: 1410   score: 3.0   mem len: 286551   epsilon: 0.6306    steps: 226    lr: 4e-05     reward: 3.27
epis: 1411   score: 2.0   mem len: 286751   epsilon: 0.6302    steps: 200    lr: 4e-05     reward: 3.26
epis: 1412   score: 6.0   mem len: 287103   epsilon: 0.6295    steps: 352    lr: 4e-05     reward: 3.3
epis: 1413   score: 5.0   mem len: 287410   epsilon: 0.6289    steps: 307    lr: 4e-05     reward: 3.31
epis: 1414   score: 3.0   mem len: 287623   epsilon: 0.6285    steps: 213    lr: 4e-05     reward: 3.3
epis: 1415   score: 5.0   mem len: 287934   epsilon: 0.6279    steps: 311    lr: 4e-05     reward: 3.35
epis: 1416   score: 2.0   mem len: 288132   epsilon: 0.6275    steps: 198    lr: 4e-05     reward: 3.34
epis: 1417   score: 7.0   mem len: 288562   epsilon: 0.6266    steps: 430    lr: 4e-05     reward: 3.36
epis: 1418   score: 5.0   mem len: 288921   epsilon: 0.6259    steps: 359    lr: 4e-05     reward: 3.4
epis: 1419   score: 3.0   mem len: 289146   epsilon: 0.6255    step

epis: 1489   score: 3.0   mem len: 308352   epsilon: 0.5875    steps: 226    lr: 1.6e-05     reward: 3.98
epis: 1490   score: 2.0   mem len: 308549   epsilon: 0.5871    steps: 197    lr: 1.6e-05     reward: 3.99
epis: 1491   score: 8.0   mem len: 308955   epsilon: 0.5863    steps: 406    lr: 1.6e-05     reward: 4.0
epis: 1492   score: 5.0   mem len: 309281   epsilon: 0.5856    steps: 326    lr: 1.6e-05     reward: 3.97
epis: 1493   score: 4.0   mem len: 309557   epsilon: 0.5851    steps: 276    lr: 1.6e-05     reward: 3.99
epis: 1494   score: 6.0   mem len: 309891   epsilon: 0.5844    steps: 334    lr: 1.6e-05     reward: 4.03
epis: 1495   score: 7.0   mem len: 310272   epsilon: 0.5837    steps: 381    lr: 1.6e-05     reward: 4.09
epis: 1496   score: 6.0   mem len: 310628   epsilon: 0.583    steps: 356    lr: 1.6e-05     reward: 4.12
epis: 1497   score: 8.0   mem len: 311078   epsilon: 0.5821    steps: 450    lr: 1.6e-05     reward: 4.15
epis: 1498   score: 2.0   mem len: 311258   epsi

epis: 1567   score: 8.0   mem len: 332181   epsilon: 0.5403    steps: 432    lr: 1.6e-05     reward: 4.68
epis: 1568   score: 14.0   mem len: 332732   epsilon: 0.5392    steps: 551    lr: 1.6e-05     reward: 4.74
epis: 1569   score: 8.0   mem len: 333212   epsilon: 0.5382    steps: 480    lr: 1.6e-05     reward: 4.79
epis: 1570   score: 5.0   mem len: 333535   epsilon: 0.5376    steps: 323    lr: 1.6e-05     reward: 4.8
epis: 1571   score: 7.0   mem len: 333882   epsilon: 0.5369    steps: 347    lr: 1.6e-05     reward: 4.77
epis: 1572   score: 10.0   mem len: 334359   epsilon: 0.536    steps: 477    lr: 1.6e-05     reward: 4.85
epis: 1573   score: 2.0   mem len: 334541   epsilon: 0.5356    steps: 182    lr: 1.6e-05     reward: 4.83
epis: 1574   score: 4.0   mem len: 334800   epsilon: 0.5351    steps: 259    lr: 1.6e-05     reward: 4.82
epis: 1575   score: 5.0   mem len: 335092   epsilon: 0.5345    steps: 292    lr: 1.6e-05     reward: 4.83
epis: 1576   score: 3.0   mem len: 335304   ep

epis: 1645   score: 7.0   mem len: 356861   epsilon: 0.4914    steps: 246    lr: 1.6e-05     reward: 5.13
epis: 1646   score: 5.0   mem len: 357186   epsilon: 0.4908    steps: 325    lr: 1.6e-05     reward: 5.14
epis: 1647   score: 5.0   mem len: 357495   epsilon: 0.4902    steps: 309    lr: 1.6e-05     reward: 5.15
epis: 1648   score: 6.0   mem len: 357851   epsilon: 0.4895    steps: 356    lr: 1.6e-05     reward: 5.15
epis: 1649   score: 12.0   mem len: 358364   epsilon: 0.4884    steps: 513    lr: 1.6e-05     reward: 5.2
epis: 1650   score: 2.0   mem len: 358546   epsilon: 0.4881    steps: 182    lr: 1.6e-05     reward: 5.18
epis: 1651   score: 5.0   mem len: 358852   epsilon: 0.4875    steps: 306    lr: 1.6e-05     reward: 5.16
epis: 1652   score: 3.0   mem len: 359065   epsilon: 0.487    steps: 213    lr: 1.6e-05     reward: 5.11
epis: 1653   score: 3.0   mem len: 359313   epsilon: 0.4866    steps: 248    lr: 1.6e-05     reward: 5.08
epis: 1654   score: 6.0   mem len: 359711   eps

epis: 1723   score: 6.0   mem len: 382918   epsilon: 0.4398    steps: 335    lr: 1.6e-05     reward: 5.55
epis: 1724   score: 10.0   mem len: 383258   epsilon: 0.4391    steps: 340    lr: 1.6e-05     reward: 5.59
epis: 1725   score: 5.0   mem len: 383549   epsilon: 0.4386    steps: 291    lr: 1.6e-05     reward: 5.57
epis: 1726   score: 5.0   mem len: 383863   epsilon: 0.4379    steps: 314    lr: 1.6e-05     reward: 5.53
epis: 1727   score: 11.0   mem len: 384286   epsilon: 0.4371    steps: 423    lr: 1.6e-05     reward: 5.61
epis: 1728   score: 5.0   mem len: 384571   epsilon: 0.4365    steps: 285    lr: 1.6e-05     reward: 5.62
epis: 1729   score: 6.0   mem len: 384927   epsilon: 0.4358    steps: 356    lr: 1.6e-05     reward: 5.64
epis: 1730   score: 4.0   mem len: 385186   epsilon: 0.4353    steps: 259    lr: 1.6e-05     reward: 5.62
epis: 1731   score: 8.0   mem len: 385664   epsilon: 0.4344    steps: 478    lr: 1.6e-05     reward: 5.62
epis: 1732   score: 6.0   mem len: 386021   

epis: 1801   score: 5.0   mem len: 409351   epsilon: 0.3875    steps: 307    lr: 6.4e-06     reward: 6.08
epis: 1802   score: 3.0   mem len: 409561   epsilon: 0.3871    steps: 210    lr: 6.4e-06     reward: 6.03
epis: 1803   score: 4.0   mem len: 409802   epsilon: 0.3866    steps: 241    lr: 6.4e-06     reward: 6.0
epis: 1804   score: 7.0   mem len: 410207   epsilon: 0.3858    steps: 405    lr: 6.4e-06     reward: 5.96
epis: 1805   score: 5.0   mem len: 410553   epsilon: 0.3851    steps: 346    lr: 6.4e-06     reward: 5.88
epis: 1806   score: 10.0   mem len: 411090   epsilon: 0.384    steps: 537    lr: 6.4e-06     reward: 5.88
epis: 1807   score: 7.0   mem len: 411484   epsilon: 0.3833    steps: 394    lr: 6.4e-06     reward: 5.9
epis: 1808   score: 9.0   mem len: 411955   epsilon: 0.3823    steps: 471    lr: 6.4e-06     reward: 5.95
epis: 1809   score: 4.0   mem len: 412195   epsilon: 0.3819    steps: 240    lr: 6.4e-06     reward: 5.9
epis: 1810   score: 3.0   mem len: 412406   epsil

epis: 1879   score: 5.0   mem len: 436813   epsilon: 0.3331    steps: 305    lr: 6.4e-06     reward: 6.16
epis: 1880   score: 3.0   mem len: 437044   epsilon: 0.3327    steps: 231    lr: 6.4e-06     reward: 6.08
epis: 1881   score: 5.0   mem len: 437367   epsilon: 0.332    steps: 323    lr: 6.4e-06     reward: 6.04
epis: 1882   score: 7.0   mem len: 437734   epsilon: 0.3313    steps: 367    lr: 6.4e-06     reward: 6.08
epis: 1883   score: 8.0   mem len: 438141   epsilon: 0.3305    steps: 407    lr: 6.4e-06     reward: 6.1
epis: 1884   score: 8.0   mem len: 438422   epsilon: 0.3299    steps: 281    lr: 6.4e-06     reward: 6.14
epis: 1885   score: 4.0   mem len: 438697   epsilon: 0.3294    steps: 275    lr: 6.4e-06     reward: 6.13
epis: 1886   score: 10.0   mem len: 439155   epsilon: 0.3285    steps: 458    lr: 6.4e-06     reward: 6.12
epis: 1887   score: 4.0   mem len: 439431   epsilon: 0.3279    steps: 276    lr: 6.4e-06     reward: 6.13
epis: 1888   score: 12.0   mem len: 439921   ep

epis: 1957   score: 6.0   mem len: 465004   epsilon: 0.2773    steps: 359    lr: 6.4e-06     reward: 6.5
epis: 1958   score: 10.0   mem len: 465533   epsilon: 0.2762    steps: 529    lr: 6.4e-06     reward: 6.5
epis: 1959   score: 5.0   mem len: 465806   epsilon: 0.2757    steps: 273    lr: 6.4e-06     reward: 6.51
epis: 1960   score: 4.0   mem len: 466045   epsilon: 0.2752    steps: 239    lr: 6.4e-06     reward: 6.48
epis: 1961   score: 7.0   mem len: 466426   epsilon: 0.2745    steps: 381    lr: 6.4e-06     reward: 6.52
epis: 1962   score: 7.0   mem len: 466789   epsilon: 0.2738    steps: 363    lr: 6.4e-06     reward: 6.53
epis: 1963   score: 4.0   mem len: 467088   epsilon: 0.2732    steps: 299    lr: 6.4e-06     reward: 6.53
epis: 1964   score: 8.0   mem len: 467529   epsilon: 0.2723    steps: 441    lr: 6.4e-06     reward: 6.5
epis: 1965   score: 5.0   mem len: 467823   epsilon: 0.2717    steps: 294    lr: 6.4e-06     reward: 6.48
epis: 1966   score: 10.0   mem len: 468316   eps

epis: 2035   score: 7.0   mem len: 494982   epsilon: 0.2179    steps: 385    lr: 6.4e-06     reward: 6.86
epis: 2036   score: 5.0   mem len: 495287   epsilon: 0.2173    steps: 305    lr: 6.4e-06     reward: 6.84
epis: 2037   score: 6.0   mem len: 495665   epsilon: 0.2166    steps: 378    lr: 6.4e-06     reward: 6.86
epis: 2038   score: 12.0   mem len: 496119   epsilon: 0.2157    steps: 454    lr: 6.4e-06     reward: 6.88
epis: 2039   score: 11.0   mem len: 496685   epsilon: 0.2146    steps: 566    lr: 6.4e-06     reward: 6.94
epis: 2040   score: 9.0   mem len: 497143   epsilon: 0.2137    steps: 458    lr: 6.4e-06     reward: 6.96
epis: 2041   score: 8.0   mem len: 497596   epsilon: 0.2128    steps: 453    lr: 6.4e-06     reward: 7.0
epis: 2042   score: 8.0   mem len: 497893   epsilon: 0.2122    steps: 297    lr: 6.4e-06     reward: 7.02
epis: 2043   score: 9.0   mem len: 498348   epsilon: 0.2113    steps: 455    lr: 6.4e-06     reward: 7.01
epis: 2044   score: 5.0   mem len: 498654   e

epis: 2113   score: 5.0   mem len: 525753   epsilon: 0.157    steps: 307    lr: 2.6e-06     reward: 7.33
epis: 2114   score: 9.0   mem len: 526205   epsilon: 0.1561    steps: 452    lr: 2.6e-06     reward: 7.34
epis: 2115   score: 5.0   mem len: 526492   epsilon: 0.1555    steps: 287    lr: 2.6e-06     reward: 7.31
epis: 2116   score: 9.0   mem len: 526954   epsilon: 0.1546    steps: 462    lr: 2.6e-06     reward: 7.32
epis: 2117   score: 10.0   mem len: 527433   epsilon: 0.1537    steps: 479    lr: 2.6e-06     reward: 7.35
epis: 2118   score: 9.0   mem len: 527900   epsilon: 0.1528    steps: 467    lr: 2.6e-06     reward: 7.34
epis: 2119   score: 8.0   mem len: 528302   epsilon: 0.152    steps: 402    lr: 2.6e-06     reward: 7.37
epis: 2120   score: 8.0   mem len: 528714   epsilon: 0.1511    steps: 412    lr: 2.6e-06     reward: 7.39
epis: 2121   score: 11.0   mem len: 529261   epsilon: 0.1501    steps: 547    lr: 2.6e-06     reward: 7.44
epis: 2122   score: 7.0   mem len: 529645   ep

epis: 2191   score: 15.0   mem len: 557359   epsilon: 0.0944    steps: 598    lr: 2.6e-06     reward: 7.61
epis: 2192   score: 10.0   mem len: 557832   epsilon: 0.0935    steps: 473    lr: 2.6e-06     reward: 7.62
epis: 2193   score: 6.0   mem len: 558186   epsilon: 0.0928    steps: 354    lr: 2.6e-06     reward: 7.65
epis: 2194   score: 8.0   mem len: 558614   epsilon: 0.0919    steps: 428    lr: 2.6e-06     reward: 7.67
epis: 2195   score: 6.0   mem len: 558975   epsilon: 0.0912    steps: 361    lr: 2.6e-06     reward: 7.66
epis: 2196   score: 8.0   mem len: 559371   epsilon: 0.0904    steps: 396    lr: 2.6e-06     reward: 7.7
epis: 2197   score: 6.0   mem len: 559725   epsilon: 0.0897    steps: 354    lr: 2.6e-06     reward: 7.71
epis: 2198   score: 4.0   mem len: 560004   epsilon: 0.0892    steps: 279    lr: 2.6e-06     reward: 7.69
epis: 2199   score: 3.0   mem len: 560217   epsilon: 0.0888    steps: 213    lr: 2.6e-06     reward: 7.65
epis: 2200   score: 6.0   mem len: 560591   e

epis: 2269   score: 11.0   mem len: 589199   epsilon: 0.0314    steps: 518    lr: 2.6e-06     reward: 7.75
epis: 2270   score: 5.0   mem len: 589490   epsilon: 0.0308    steps: 291    lr: 2.6e-06     reward: 7.74
epis: 2271   score: 10.0   mem len: 589963   epsilon: 0.0299    steps: 473    lr: 2.6e-06     reward: 7.79
epis: 2272   score: 10.0   mem len: 590428   epsilon: 0.029    steps: 465    lr: 2.6e-06     reward: 7.85
epis: 2273   score: 6.0   mem len: 590776   epsilon: 0.0283    steps: 348    lr: 2.6e-06     reward: 7.82
epis: 2274   score: 14.0   mem len: 591252   epsilon: 0.0273    steps: 476    lr: 2.6e-06     reward: 7.85
epis: 2275   score: 7.0   mem len: 591645   epsilon: 0.0265    steps: 393    lr: 2.6e-06     reward: 7.89
epis: 2276   score: 5.0   mem len: 591951   epsilon: 0.0259    steps: 306    lr: 2.6e-06     reward: 7.87
epis: 2277   score: 4.0   mem len: 592211   epsilon: 0.0254    steps: 260    lr: 2.6e-06     reward: 7.85
epis: 2278   score: 4.0   mem len: 592471  

epis: 2349   score: 8.0   mem len: 621393   epsilon: 0.01    steps: 417    lr: 1e-06     reward: 7.95
epis: 2350   score: 10.0   mem len: 621903   epsilon: 0.01    steps: 510    lr: 1e-06     reward: 8.02
epis: 2351   score: 5.0   mem len: 622198   epsilon: 0.01    steps: 295    lr: 1e-06     reward: 8.0
epis: 2352   score: 11.0   mem len: 622775   epsilon: 0.01    steps: 577    lr: 1e-06     reward: 8.05
epis: 2353   score: 10.0   mem len: 623262   epsilon: 0.01    steps: 487    lr: 1e-06     reward: 8.04
epis: 2354   score: 8.0   mem len: 623740   epsilon: 0.01    steps: 478    lr: 1e-06     reward: 8.02
epis: 2355   score: 11.0   mem len: 624267   epsilon: 0.01    steps: 527    lr: 1e-06     reward: 7.99
epis: 2356   score: 8.0   mem len: 624692   epsilon: 0.01    steps: 425    lr: 1e-06     reward: 8.0
epis: 2357   score: 7.0   mem len: 625083   epsilon: 0.01    steps: 391    lr: 1e-06     reward: 7.98
epis: 2358   score: 5.0   mem len: 625409   epsilon: 0.01    steps: 326    lr: 1

epis: 2430   score: 5.0   mem len: 657884   epsilon: 0.01    steps: 331    lr: 1e-06     reward: 9.41
epis: 2431   score: 9.0   mem len: 658355   epsilon: 0.01    steps: 471    lr: 1e-06     reward: 9.37
epis: 2432   score: 9.0   mem len: 658826   epsilon: 0.01    steps: 471    lr: 1e-06     reward: 9.37
epis: 2433   score: 3.0   mem len: 659057   epsilon: 0.01    steps: 231    lr: 1e-06     reward: 9.29
epis: 2434   score: 3.0   mem len: 659289   epsilon: 0.01    steps: 232    lr: 1e-06     reward: 9.29
epis: 2435   score: 7.0   mem len: 659646   epsilon: 0.01    steps: 357    lr: 1e-06     reward: 9.27
epis: 2436   score: 5.0   mem len: 659958   epsilon: 0.01    steps: 312    lr: 1e-06     reward: 9.24
epis: 2437   score: 3.0   mem len: 660190   epsilon: 0.01    steps: 232    lr: 1e-06     reward: 9.21
epis: 2438   score: 6.0   mem len: 660536   epsilon: 0.01    steps: 346    lr: 1e-06     reward: 9.18
epis: 2439   score: 7.0   mem len: 660905   epsilon: 0.01    steps: 369    lr: 1e-

epis: 2510   score: 16.0   mem len: 694267   epsilon: 0.01    steps: 596    lr: 1e-06     reward: 9.57
epis: 2511   score: 17.0   mem len: 694802   epsilon: 0.01    steps: 535    lr: 1e-06     reward: 9.65
epis: 2512   score: 19.0   mem len: 695535   epsilon: 0.01    steps: 733    lr: 1e-06     reward: 9.62
epis: 2513   score: 17.0   mem len: 696112   epsilon: 0.01    steps: 577    lr: 1e-06     reward: 9.74
epis: 2514   score: 6.0   mem len: 696451   epsilon: 0.01    steps: 339    lr: 1e-06     reward: 9.69
epis: 2515   score: 9.0   mem len: 696917   epsilon: 0.01    steps: 466    lr: 1e-06     reward: 9.69
epis: 2516   score: 6.0   mem len: 697236   epsilon: 0.01    steps: 319    lr: 1e-06     reward: 9.64
epis: 2517   score: 10.0   mem len: 697722   epsilon: 0.01    steps: 486    lr: 1e-06     reward: 9.71
epis: 2518   score: 10.0   mem len: 698226   epsilon: 0.01    steps: 504    lr: 1e-06     reward: 9.73
epis: 2519   score: 7.0   mem len: 698576   epsilon: 0.01    steps: 350    l

epis: 2590   score: 11.0   mem len: 732555   epsilon: 0.01    steps: 554    lr: 4e-07     reward: 10.71
epis: 2591   score: 9.0   mem len: 733023   epsilon: 0.01    steps: 468    lr: 4e-07     reward: 10.7
epis: 2592   score: 10.0   mem len: 733510   epsilon: 0.01    steps: 487    lr: 4e-07     reward: 10.68
epis: 2593   score: 5.0   mem len: 733785   epsilon: 0.01    steps: 275    lr: 4e-07     reward: 10.63
epis: 2594   score: 7.0   mem len: 734135   epsilon: 0.01    steps: 350    lr: 4e-07     reward: 10.6
epis: 2595   score: 5.0   mem len: 734443   epsilon: 0.01    steps: 308    lr: 4e-07     reward: 10.5
epis: 2596   score: 9.0   mem len: 734883   epsilon: 0.01    steps: 440    lr: 4e-07     reward: 10.49
epis: 2597   score: 10.0   mem len: 735402   epsilon: 0.01    steps: 519    lr: 4e-07     reward: 10.49
epis: 2598   score: 9.0   mem len: 735870   epsilon: 0.01    steps: 468    lr: 4e-07     reward: 10.54
epis: 2599   score: 10.0   mem len: 736356   epsilon: 0.01    steps: 486 

epis: 2670   score: 11.0   mem len: 767572   epsilon: 0.01    steps: 573    lr: 4e-07     reward: 9.24
epis: 2671   score: 9.0   mem len: 768044   epsilon: 0.01    steps: 472    lr: 4e-07     reward: 9.2
epis: 2672   score: 5.0   mem len: 768372   epsilon: 0.01    steps: 328    lr: 4e-07     reward: 9.14
epis: 2673   score: 8.0   mem len: 768772   epsilon: 0.01    steps: 400    lr: 4e-07     reward: 9.1
epis: 2674   score: 10.0   mem len: 769291   epsilon: 0.01    steps: 519    lr: 4e-07     reward: 9.05
epis: 2675   score: 11.0   mem len: 769838   epsilon: 0.01    steps: 547    lr: 4e-07     reward: 8.94
epis: 2676   score: 15.0   mem len: 770438   epsilon: 0.01    steps: 600    lr: 4e-07     reward: 8.98
epis: 2677   score: 18.0   mem len: 771068   epsilon: 0.01    steps: 630    lr: 4e-07     reward: 9.1
epis: 2678   score: 10.0   mem len: 771532   epsilon: 0.01    steps: 464    lr: 4e-07     reward: 9.12
epis: 2679   score: 6.0   mem len: 771856   epsilon: 0.01    steps: 324    lr: 

epis: 2751   score: 9.0   mem len: 804146   epsilon: 0.01    steps: 471    lr: 2e-07     reward: 8.92
epis: 2752   score: 9.0   mem len: 804618   epsilon: 0.01    steps: 472    lr: 2e-07     reward: 8.89
epis: 2753   score: 10.0   mem len: 805122   epsilon: 0.01    steps: 504    lr: 2e-07     reward: 8.89
epis: 2754   score: 10.0   mem len: 805626   epsilon: 0.01    steps: 504    lr: 2e-07     reward: 8.9
epis: 2755   score: 10.0   mem len: 806130   epsilon: 0.01    steps: 504    lr: 2e-07     reward: 8.86
epis: 2756   score: 6.0   mem len: 806454   epsilon: 0.01    steps: 324    lr: 2e-07     reward: 8.86
epis: 2757   score: 5.0   mem len: 806729   epsilon: 0.01    steps: 275    lr: 2e-07     reward: 8.81
epis: 2758   score: 5.0   mem len: 807004   epsilon: 0.01    steps: 275    lr: 2e-07     reward: 8.8
epis: 2759   score: 10.0   mem len: 807498   epsilon: 0.01    steps: 494    lr: 2e-07     reward: 8.84
epis: 2760   score: 8.0   mem len: 807908   epsilon: 0.01    steps: 410    lr: 2

epis: 2832   score: 7.0   mem len: 836414   epsilon: 0.01    steps: 393    lr: 2e-07     reward: 7.73
epis: 2833   score: 10.0   mem len: 836914   epsilon: 0.01    steps: 500    lr: 2e-07     reward: 7.71
epis: 2834   score: 9.0   mem len: 837311   epsilon: 0.01    steps: 397    lr: 2e-07     reward: 7.74
epis: 2835   score: 12.0   mem len: 837872   epsilon: 0.01    steps: 561    lr: 2e-07     reward: 7.74
epis: 2836   score: 10.0   mem len: 838335   epsilon: 0.01    steps: 463    lr: 2e-07     reward: 7.76
epis: 2837   score: 8.0   mem len: 838742   epsilon: 0.01    steps: 407    lr: 2e-07     reward: 7.76
epis: 2838   score: 11.0   mem len: 839262   epsilon: 0.01    steps: 520    lr: 2e-07     reward: 7.74
epis: 2839   score: 8.0   mem len: 839691   epsilon: 0.01    steps: 429    lr: 2e-07     reward: 7.76
epis: 2840   score: 9.0   mem len: 840152   epsilon: 0.01    steps: 461    lr: 2e-07     reward: 7.75
epis: 2841   score: 11.0   mem len: 840672   epsilon: 0.01    steps: 520    lr

epis: 2913   score: 15.0   mem len: 873303   epsilon: 0.01    steps: 532    lr: 2e-07     reward: 8.86
epis: 2914   score: 10.0   mem len: 873792   epsilon: 0.01    steps: 489    lr: 2e-07     reward: 8.91
epis: 2915   score: 16.0   mem len: 874343   epsilon: 0.01    steps: 551    lr: 2e-07     reward: 9.02
epis: 2916   score: 11.0   mem len: 874843   epsilon: 0.01    steps: 500    lr: 2e-07     reward: 9.08
epis: 2917   score: 16.0   mem len: 875394   epsilon: 0.01    steps: 551    lr: 2e-07     reward: 9.12
epis: 2918   score: 10.0   mem len: 875852   epsilon: 0.01    steps: 458    lr: 2e-07     reward: 9.14
epis: 2919   score: 14.0   mem len: 876467   epsilon: 0.01    steps: 615    lr: 2e-07     reward: 9.15
epis: 2920   score: 9.0   mem len: 876913   epsilon: 0.01    steps: 446    lr: 2e-07     reward: 9.15
epis: 2921   score: 8.0   mem len: 877359   epsilon: 0.01    steps: 446    lr: 2e-07     reward: 9.14
epis: 2922   score: 11.0   mem len: 877903   epsilon: 0.01    steps: 544   

epis: 2993   score: 9.0   mem len: 912358   epsilon: 0.01    steps: 454    lr: 1e-07     reward: 10.08
epis: 2994   score: 10.0   mem len: 912816   epsilon: 0.01    steps: 458    lr: 1e-07     reward: 10.09
epis: 2995   score: 11.0   mem len: 913360   epsilon: 0.01    steps: 544    lr: 1e-07     reward: 10.1
epis: 2996   score: 6.0   mem len: 913702   epsilon: 0.01    steps: 342    lr: 1e-07     reward: 10.07
epis: 2997   score: 12.0   mem len: 914291   epsilon: 0.01    steps: 589    lr: 1e-07     reward: 10.07
epis: 2998   score: 10.0   mem len: 914749   epsilon: 0.01    steps: 458    lr: 1e-07     reward: 10.09
epis: 2999   score: 11.0   mem len: 915185   epsilon: 0.01    steps: 436    lr: 1e-07     reward: 10.14
epis: 3000   score: 10.0   mem len: 915643   epsilon: 0.01    steps: 458    lr: 1e-07     reward: 10.16
epis: 3001   score: 12.0   mem len: 916224   epsilon: 0.01    steps: 581    lr: 1e-07     reward: 10.15
epis: 3002   score: 12.0   mem len: 916775   epsilon: 0.01    steps

epis: 3073   score: 9.0   mem len: 951793   epsilon: 0.01    steps: 454    lr: 1e-07     reward: 10.22
epis: 3074   score: 9.0   mem len: 952247   epsilon: 0.01    steps: 454    lr: 1e-07     reward: 10.22
epis: 3075   score: 18.0   mem len: 952908   epsilon: 0.01    steps: 661    lr: 1e-07     reward: 10.3
epis: 3076   score: 15.0   mem len: 953434   epsilon: 0.01    steps: 526    lr: 1e-07     reward: 10.36
epis: 3077   score: 14.0   mem len: 953918   epsilon: 0.01    steps: 484    lr: 1e-07     reward: 10.4
epis: 3078   score: 15.0   mem len: 954444   epsilon: 0.01    steps: 526    lr: 1e-07     reward: 10.44
epis: 3079   score: 15.0   mem len: 954970   epsilon: 0.01    steps: 526    lr: 1e-07     reward: 10.49
epis: 3080   score: 11.0   mem len: 955490   epsilon: 0.01    steps: 520    lr: 1e-07     reward: 10.5
epis: 3081   score: 6.0   mem len: 955832   epsilon: 0.01    steps: 342    lr: 1e-07     reward: 10.45
epis: 3082   score: 23.0   mem len: 956432   epsilon: 0.01    steps: 6

epis: 3153   score: 9.0   mem len: 991258   epsilon: 0.01    steps: 450    lr: 1e-07     reward: 11.19
epis: 3154   score: 10.0   mem len: 991729   epsilon: 0.01    steps: 471    lr: 1e-07     reward: 11.18
epis: 3155   score: 10.0   mem len: 992200   epsilon: 0.01    steps: 471    lr: 1e-07     reward: 11.19
epis: 3156   score: 10.0   mem len: 992671   epsilon: 0.01    steps: 471    lr: 1e-07     reward: 11.2
epis: 3157   score: 10.0   mem len: 993142   epsilon: 0.01    steps: 471    lr: 1e-07     reward: 11.19
epis: 3158   score: 8.0   mem len: 993544   epsilon: 0.01    steps: 402    lr: 1e-07     reward: 11.16
epis: 3159   score: 12.0   mem len: 994089   epsilon: 0.01    steps: 545    lr: 1e-07     reward: 11.18
epis: 3160   score: 16.0   mem len: 994640   epsilon: 0.01    steps: 551    lr: 1e-07     reward: 11.25
epis: 3161   score: 11.0   mem len: 995197   epsilon: 0.01    steps: 557    lr: 1e-07     reward: 11.26
epis: 3162   score: 16.0   mem len: 995826   epsilon: 0.01    steps

epis: 3233   score: 7.0   mem len: 1000000   epsilon: 0.01    steps: 388    lr: 0.0     reward: 10.14
epis: 3234   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 608    lr: 0.0     reward: 10.15
epis: 3235   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 597    lr: 0.0     reward: 10.18
epis: 3236   score: 7.0   mem len: 1000000   epsilon: 0.01    steps: 388    lr: 0.0     reward: 10.15
epis: 3237   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 437    lr: 0.0     reward: 10.17
epis: 3238   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 476    lr: 0.0     reward: 10.12
epis: 3239   score: 7.0   mem len: 1000000   epsilon: 0.01    steps: 388    lr: 0.0     reward: 10.11
epis: 3240   score: 7.0   mem len: 1000000   epsilon: 0.01    steps: 388    lr: 0.0     reward: 10.09
epis: 3241   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 459    lr: 0.0     reward: 10.02
epis: 3242   score: 7.0   mem len: 1000000   epsilon: 0.01    steps: 387    lr: 

epis: 3313   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 570    lr: 0.0     reward: 10.46
epis: 3314   score: 8.0   mem len: 1000000   epsilon: 0.01    steps: 445    lr: 0.0     reward: 10.43
epis: 3315   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 570    lr: 0.0     reward: 10.5
epis: 3316   score: 4.0   mem len: 1000000   epsilon: 0.01    steps: 246    lr: 0.0     reward: 10.47
epis: 3317   score: 4.0   mem len: 1000000   epsilon: 0.01    steps: 246    lr: 0.0     reward: 10.44
epis: 3318   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 570    lr: 0.0     reward: 10.49
epis: 3319   score: 12.0   mem len: 1000000   epsilon: 0.01    steps: 570    lr: 0.0     reward: 10.54
epis: 3320   score: 11.0   mem len: 1000000   epsilon: 0.01    steps: 502    lr: 0.0     reward: 10.54
epis: 3321   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 437    lr: 0.0     reward: 10.51
epis: 3322   score: 13.0   mem len: 1000000   epsilon: 0.01    steps: 437    

epis: 3393   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 454    lr: 0.0     reward: 10.4
epis: 3394   score: 9.0   mem len: 1000000   epsilon: 0.01    steps: 472    lr: 0.0     reward: 10.37


# Visualize Agent Performance

BE AWARE THIS CODE BELOW MAY CRASH THE KERNEL IF YOU RUN THE SAME CELL TWICE.

Please save your model before running this portion of the code.

In [None]:
torch.save(agent.policy_net, "./save_model/breakout_double_dqn_latest.pth")

In [6]:
from gym.wrappers import RecordVideo # If importing monitor raises issues, try using `from gym.wrappers import RecordVideo`
import glob
import io
import base64

from IPython.display import HTML
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display

# Displaying the game live
def show_state(env, step=0, info=""):
    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title("%s | Step: %d %s" % ("Agent Playing",step, info))
    plt.axis('off')

    ipythondisplay.clear_output(wait=True)
    ipythondisplay.display(plt.gcf())
    
# Recording the game and replaying the game afterwards
def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")
    

def wrap_env(env):
    env = RecordVideo(env, './video')
    return env

In [7]:
display = Display(visible=0, size=(300, 200))
display.start()

# Load agent
agent.load_policy_net("./save_model/breakout_double_dqn.pth")
agent.epsilon = 0.0 # Set agent to only exploit the best action

# env = gym.make('BreakoutDeterministic-v4')
env = gym.make('BreakoutDeterministic-v4', render_mode='rgb_array')
env = wrap_env(env)

done = False
score = 0
step = 0
state = env.reset()
next_state = state
life = number_lives
history = np.zeros([5, 84, 84], dtype=np.uint8)
get_init_state(history, state, HISTORY_SIZE)

while not done:
    
    # Render breakout
#     env.render()
#     show_state(env,step) # uncommenting this provides another way to visualize the game

    step += 1
    frame += 1

    # Perform a fire action if ball is no longer on screen
    if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
        action = 0
    else:
        action = agent.get_action(np.float32(history[:4, :, :]) / 255.)
    state = next_state
    
    next_state, reward, done, _, info = env.step(action + 1)
        
    frame_next_state = get_frame(next_state)
    history[4, :, :] = frame_next_state
    terminal_state = check_live(life, info['lives'])
        
    life = info['lives']
    r = np.clip(reward, -1, 1) 
    r = reward

    # Store the transition in memory 
    agent.memory.push(deepcopy(frame_next_state), action, r, terminal_state)
    # Start training after random sample generation
    score += reward
    
    history[:4, :, :] = history[1:, :, :]
env.close()
show_video()
display.stop()

  logger.warn(
  logger.warn(
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:
  if step > 1 and len(np.unique(next_state[:189] == state[:189])) < 2:


Moviepy - Building video /home/kaiwenjon/Documents/Spring2023/Deep-Learning-for-CV/spring2023/MP5/assignment5_materials/assignment5_materials/video/rl-video-episode-0.mp4.
Moviepy - Writing video /home/kaiwenjon/Documents/Spring2023/Deep-Learning-for-CV/spring2023/MP5/assignment5_materials/assignment5_materials/video/rl-video-episode-0.mp4



                                                                                                                      

Moviepy - Done !
Moviepy - video ready /home/kaiwenjon/Documents/Spring2023/Deep-Learning-for-CV/spring2023/MP5/assignment5_materials/assignment5_materials/video/rl-video-episode-0.mp4




<pyvirtualdisplay.display.Display at 0x7fbd1a36ee50>