In [1]:
import matplotlib.pyplot as plt

from keras.models import Sequential, load_model
from keras.layers import Dense, Convolution2D, Flatten
from keras.optimizers import RMSprop
import tensorflow as tf

import gym
import numpy as np
import math
import cv2
import random
from collections import deque, namedtuple

from wrappers import wrap_deepmind


Using TensorFlow backend.


In [2]:
env = wrap_deepmind(gym.make('BreakoutNoFrameskip-v4'))

In [3]:
env = gym.make('BreakoutNoFrameskip-v4')

n_actions = env.action_space.n


class DeepQNet():
    def __init__(self, n_actions, height, width):
        self.model = Sequential()
        self.model.add(Convolution2D(filters=32, kernel_size=8, strides=4, activation='relu', input_shape=(height, width, 4)))
        self.model.add(Convolution2D(filters=64, kernel_size=4, strides=2, activation='relu'))
        self.model.add(Convolution2D(filters=64, kernel_size=3, strides=1, activation='relu'))

        self.model.add(Flatten())
        self.model.add(Dense(units=512, activation='relu'))
        self.model.add(Dense(units=n_actions))
        self.model.compile(loss='mse', optimizer=RMSprop(lr=0.00025,
                                             rho=0.95,
                                             epsilon=0.01))

In [4]:
Experience = namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])



class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        indices = np.random.choice(len(self.buffer), batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])
        return np.array(states), np.array(actions), np.array(rewards, dtype=np.float32), \
               np.array(dones, dtype=np.uint8), np.array(next_states)

In [5]:
def get_epsilon(current_step):
    rate = (EPS_END-EPS_START)/MEM_SIZE
    eps_threshold = rate * current_step + EPS_START
    if eps_threshold < EPS_END:
        return EPS_END
    return eps_threshold


def select_action(state, steps_done, eval=False):
    global EPSILON

    # This equation is for the decaying epsilon
    eps_threshold = get_epsilon(steps_done)

    if eval:
        eps_threshold = EPS_END

    r = np.random.rand()

    EPSILON = eps_threshold

    # We select an action with an espilon greedy policy
    if r > eps_threshold:
        # Return the action with the maximum Q value for the current state
        return np.argmax(policy_net.model.predict(state)[0])
    else:
        return random.randrange(n_actions)

In [6]:
def optimize_model():
    if memory.__len__() < BATCH_SIZE:
        return 0
    states, actions, rewards, dones, next_states = memory.sample(BATCH_SIZE)
       
    state_action_values = policy_net.model.predict(states)
    next_state_values = np.amax(target_net.model.predict(next_states), axis=1)
    next_state_values[dones] = 0.0
    
    expected_state_action_values = next_state_values * GAMMA + rewards
    state_action_values[:,actions] = expected_state_action_values
    # Compute Huber loss
    history = policy_net.model.fit(states, state_action_values, verbose=False)

    return history.history['loss'][0]

In [7]:
BATCH_SIZE = 32
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
NUMBER_OF_FRAMES = 5000000
TARGET_UPDATE = 10000


HEIGHT = 84
WIDTH = 84
EPSILON = 0
MEM_SIZE = 1000000

policy_net = DeepQNet(n_actions, HEIGHT, WIDTH)
target_net = DeepQNet(n_actions, HEIGHT, WIDTH)

memory = ReplayMemory(int(MEM_SIZE))

In [11]:
def train_model(num_frames):
    env = gym.make('BreakoutNoFrameskip-v4')
    env = wrap_deepmind(env)
    cumulative_frames = 0
    
    highest_score = 0
    current_cum_loss = []
    current_game_score = 0
    new_game = True
    games = 0
    
    losses = []
    scores = []
    while cumulative_frames < num_frames:
        if new_game:
            print("============================")
            print("Game: {} | Frame {}".format(games, cumulative_frames))
            new_game = False
        state = env.reset()
        done = False

        while not done:
            action = select_action(state.__array__().reshape(-1, 84, 84, 4), cumulative_frames)

            next_state, reward, done, info = env.step(action)
            
            memory.append(Experience(state, action, reward, done, next_state))

            state = next_state
            loss = optimize_model()
            
            current_cum_loss.append(loss)
            
            current_game_score += reward
            cumulative_frames += 1
            
            if info['ale.lives'] == 0:
                if highest_score < current_game_score:
                    highest_score = current_game_score
                    
                current_loss = np.mean(current_cum_loss)
                losses.append(current_loss)
                scores.append(current_game_score)
                
                print("Current game score: {}".format(current_game_score))
                print("Current loss: {}".format(current_loss))
                print("Highest Score: {}".format(highest_score))
                print("Average loss last 50 games: {}".format(np.mean(losses[-50:])))
                print("Average score last 50 games: {}".format(np.mean(scores[-50:])))
                
                current_game_score = 0
                current_cum_loss = []
                new_game = True
                games += 1
                

        if cumulative_frames % TARGET_UPDATE == 0:
            target_net.model.set_weights(policy_net.model.get_weights())

    target_net.model.save('my_model.h5')
    return losses, scores

In [None]:

loss, score = train_model(10000000)

Game: 0 | Frame 0
Current game score: 0.0
Current loss: 0.4921145833813046
Highest Score: 0
Average loss last 50 games: 0.4921145833813046
Average score last 50 games: 0.0
Game: 1 | Frame 475
Current game score: 0.0
Current loss: 0.5731401985099441
Highest Score: 0
Average loss last 50 games: 0.5326273909456244
Average score last 50 games: 0.0
Game: 2 | Frame 950
Current game score: 0.0
Current loss: 0.4629847489179749
Highest Score: 0
Average loss last 50 games: 0.509413176936408
Average score last 50 games: 0.0
Game: 3 | Frame 1425
Current game score: 1.0
Current loss: 0.42594967315096804
Highest Score: 1.0
Average loss last 50 games: 0.48854730099004795
Average score last 50 games: 0.25
Game: 4 | Frame 2014
Current game score: 0.0
Current loss: 0.5029596574006504
Highest Score: 1.0
Average loss last 50 games: 0.49142977227216844
Average score last 50 games: 0.2
Game: 5 | Frame 2489
Current game score: 0.0
Current loss: 0.45387893924195516
Highest Score: 1.0
Average loss last 50 game

Current game score: 1.0
Current loss: 0.38800616440680186
Highest Score: 3.0
Average loss last 50 games: 0.3941856599327892
Average score last 50 games: 0.41025641025641024
Game: 39 | Frame 21187
Current game score: 2.0
Current loss: 0.4107142587610562
Highest Score: 3.0
Average loss last 50 games: 0.3945988749034959
Average score last 50 games: 0.45
Game: 40 | Frame 21972
Current game score: 0.0
Current loss: 0.33335270262862504
Highest Score: 3.0
Average loss last 50 games: 0.393105065823621
Average score last 50 games: 0.43902439024390244
Game: 41 | Frame 22447
Current game score: 0.0
Current loss: 0.34874801772793657
Highest Score: 3.0
Average loss last 50 games: 0.39204894563086656
Average score last 50 games: 0.42857142857142855
Game: 42 | Frame 22922
Current game score: 0.0
Current loss: 0.3525430474136221
Highest Score: 3.0
Average loss last 50 games: 0.39113020381186087
Average score last 50 games: 0.4186046511627907
Game: 43 | Frame 23397
Current game score: 1.0
Current loss:

Current game score: 0.0
Current loss: 0.27821371003985407
Highest Score: 3.0
Average loss last 50 games: 0.3394580203134657
Average score last 50 games: 0.2
Game: 78 | Frame 41046
Current game score: 0.0
Current loss: 0.2830079308995291
Highest Score: 3.0
Average loss last 50 games: 0.33766578990561136
Average score last 50 games: 0.2
Game: 79 | Frame 41521
Current game score: 3.0
Current loss: 0.2945897733146636
Highest Score: 3.0
Average loss last 50 games: 0.3358514807192426
Average score last 50 games: 0.26
Game: 80 | Frame 42484
Current game score: 0.0
Current loss: 0.3190014656986061
Highest Score: 3.0
Average loss last 50 games: 0.33478469488678725
Average score last 50 games: 0.26
Game: 81 | Frame 42959
Current game score: 0.0
Current loss: 0.2831857586416759
Highest Score: 3.0
Average loss last 50 games: 0.33300549585136074
Average score last 50 games: 0.26
Game: 82 | Frame 43434
Current game score: 0.0
Current loss: 0.2948702116173349
Highest Score: 3.0
Average loss last 50 g

Current game score: 2.0
Current loss: 0.38151429281114546
Highest Score: 5.0
Average loss last 50 games: 0.3085315774365357
Average score last 50 games: 0.62
Game: 118 | Frame 64630
Current game score: 2.0
Current loss: 0.37055517312925246
Highest Score: 5.0
Average loss last 50 games: 0.3096857841786078
Average score last 50 games: 0.66
Game: 119 | Frame 65495
Current game score: 0.0
Current loss: 0.37249573771302635
Highest Score: 5.0
Average loss last 50 games: 0.30904121725432604
Average score last 50 games: 0.66
Game: 120 | Frame 65970
Current game score: 0.0
Current loss: 0.3199014390907005
Highest Score: 5.0
Average loss last 50 games: 0.3096399399270984
Average score last 50 games: 0.64
Game: 121 | Frame 66445
Current game score: 0.0
Current loss: 0.33006059987666575
Highest Score: 5.0
Average loss last 50 games: 0.3100801840755544
Average score last 50 games: 0.64
Game: 122 | Frame 66920
Current game score: 1.0
Current loss: 0.3325191186292839
Highest Score: 5.0
Average loss l