In [None]:
from __future__ import division

import gym
import tensorflow as tf
import numpy as np
import random
import tensorflow.contrib.slim as slim
import matplotlib.pyplot as plt
import skimage.transform as skim
import os
%matplotlib inline

In [None]:
class Qnetwork():
    """Dueling Q-network over a stack of four 84x84 grey-scale frames.

    Builds the placeholders, the convolutional trunk, the advantage/value
    heads, the combined Q-value output and the Adam training op in the
    current default TensorFlow graph.

    Args:
        h_size: Number of features produced by flattening the last
            convolutional layer. With the 84x84x4 input and the layer settings
            below the trunk yields 20x20x16 -> 5x5x32 -> 2x2x64, i.e. 256.
        num_actions: Number of discrete actions the network scores.
            Defaults to 4, the value that used to be hard-coded.
    """
    def __init__(self,h_size,num_actions=4):
        #Flattened input: four 84x84 frames (4*7056 = 28224 values per sample).
        self.scalarInput = tf.placeholder(shape=[None,28224],dtype=tf.float32)
        #The notebook builds every input with np.hstack over four [1,7056] frames, so the
        #values are laid out frame after frame. Reshape to [batch,frame,row,col] first and
        #then move the frame axis last; reshaping straight to [-1,84,84,4] would interleave
        #pixels from different rows into the channel axis.
        self.imageIn = tf.transpose(tf.reshape(self.scalarInput, shape=[-1,4,84,84]), perm=[0,2,3,1])
        #Convolutional trunk (no biases, VALID padding).
        self.conv1 = slim.conv2d(inputs=self.imageIn,num_outputs=16,kernel_size=[8,8],stride=[4,4],
                                padding='VALID', biases_initializer=None)
        self.conv2 = slim.conv2d(inputs=self.conv1,num_outputs=32,kernel_size=[6,6],stride=[3,3],
                                padding='VALID', biases_initializer=None)
        self.conv3 = slim.conv2d(inputs=self.conv2,num_outputs=64,kernel_size=[4,4],stride=[1,1],
                                padding='VALID', biases_initializer=None)

        #Both heads read the same flattened conv3 features.
        self.res = self.conv3
        self.streamA = slim.flatten(self.res)
        self.streamV = slim.flatten(self.res)
        xavier_init = tf.contrib.layers.xavier_initializer()
        self.AW = tf.Variable(xavier_init([h_size,num_actions]))
        self.VW = tf.Variable(xavier_init([h_size,1]))
        #Advantage of each action in the state.
        self.Advantage = tf.matmul(self.streamA,self.AW)
        #Scalar value of the state.
        self.Value = tf.matmul(self.streamV,self.VW)

        #Combine for final Q-values: value plus mean-centred advantage.
        self.Qout = self.Value + tf.subtract(self.Advantage,tf.reduce_mean(self.Advantage,axis=1,keepdims=True))
        #Action with the greatest Q-value.
        self.predict = tf.argmax(self.Qout,1)

        #Loss is the mean squared difference between target Q-values and the Q-values of the actions taken.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,num_actions,dtype=tf.float32)

        #Pick out the Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(tf.multiply(self.Qout, self.actions_onehot), axis=1)

        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.00025)
        self.updateModel = self.trainer.minimize(self.loss)

In [None]:
#Class for handling experience gained.
class experience_buffer():
    """Fixed-capacity store of experiences that drops the oldest entries first.

    Args:
        buffer_size: Maximum number of experiences kept in the buffer.
    """
    def __init__(self, buffer_size = 12500):
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append every item of `experience`, then discard the oldest entries over capacity.

        Trimming after the extend also covers a batch that is itself longer
        than `buffer_size`; only its most recent items are kept, so the
        buffer never grows past its capacity.
        """
        self.buffer.extend(experience)
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self, size):
        """Return `size` experiences drawn without replacement as a [size,5] array.

        Each stored experience must hold five fields for the reshape to
        succeed. `random.sample` raises ValueError when `size` exceeds the
        number of stored experiences.
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

In [None]:
def processState(states):
    """Turn a raw observation into one flattened 84x84 grey-scale frame.

    The first three channels are collapsed to grey-scale with a weighted sum,
    the image is resized to 110x84, rows 26-109 are kept, and the result is
    returned with shape [1,7056].
    """
    luma_weights = [0.299, 0.587, 0.114]
    grey = np.dot(states[...,:3], luma_weights)
    shrunk = skim.resize(grey,[110,84],mode='constant')
    #Keep the bottom 84 rows and all 84 columns.
    cropped = shrunk[26:110,0:84]
    return np.reshape(cropped,[1,7056])

In [None]:
#Slow update of target network parameters to minimize main Q-network Q-value overestimates.
def updateTargetGraph(tfVars,tau):
    """Build the ops that move each target variable toward its main-network counterpart.

    Args:
        tfVars: List whose first half holds the main-network variables and
            whose second half holds the target-network variables in the same
            order. With an odd length the last element is ignored.
        tau: Blend rate; each target becomes tau*main + (1-tau)*target.

    Returns:
        A list with one assign op per variable pair (empty for an empty input).
    """
    half = len(tfVars)//2
    op_holder = []
    for main_var,target_var in zip(tfVars[:half],tfVars[half:2*half]):
        op_holder.append(target_var.assign((main_var.value()*tau) + ((1-tau)*target_var.value())))
    #Return only after every pair has been processed; returning inside the loop
    #would yield an op for the first variable alone.
    return op_holder

def updateTarget(op_holder,sess):
    """Run every op in `op_holder` on `sess`, one call per op, in order."""
    for update_op in op_holder:
        sess.run(update_op)

In [None]:
#Hyper-parameters and run settings.

#How many experiences to use for each training step.
batch_size = 32
#How often (in environment steps) to perform a training step.
update_freq = 4
#Discount factor on the target Q-values.
y = 0.99
#Starting chance of a random action.
startE = 1
#Final chance of a random action.
endE = 0.1
#How many steps of training to reduce startE to endE.
annealing_steps = 1000000.
#How many episodes of the game environment to train the network with.
num_episodes = 10000000
#How many steps of random actions to take before training begins.
pre_train_steps = 10000
#Whether to load a saved model before training.
load_model = False
#The path to save the model to.
path = './4dqn'
#The size of the final convolutional layer before splitting.
h_size = 256
#Rate at which the target network is moved toward the primary network.
tau = 0.001

In [None]:
#Build the environment and the two networks in a fresh graph (main first, then target).
env = gym.make('Breakout-v0')
tf.reset_default_graph()
mainQN = Qnetwork(h_size)
targetQN = Qnetwork(h_size)

init = tf.global_variables_initializer()

save = tf.train.Saver()

trainables = tf.trainable_variables()

#updateTargetGraph pairs the first half of this list (main) with the second half (target).
targetOps = updateTargetGraph(trainables,tau)

myBuffer = experience_buffer()
#Blank frame used to pad the frame stack at the start of an episode.
sf = np.zeros([1,7056])

#Set the rate of random action decrease.
e = startE
stepDrop = (startE - endE)/annealing_steps

#Create lists to contain total reward and steps per episode.
jList = []
rList = []

total_steps = 0

#Make a path for our model to be saved in.
if not os.path.exists(path):
    os.makedirs(path)

#Training the Q-network.
with tf.Session() as sess:
    sess.run(init)
    #Load latest saved model in path if load_model is set.
    if load_model:
        print('Loading Model...')
        ckpt = tf.train.get_checkpoint_state(path)
        #get_checkpoint_state returns None when the directory holds no checkpoint.
        if ckpt is None:
            raise IOError('No checkpoint found in ' + path)
        save.restore(sess,ckpt.model_checkpoint_path)
    for i in range(num_episodes):
        #Frame stack: three blank frames followed by the first real observation.
        internal_buffer = [sf,sf,sf]
        episodeBuffer = experience_buffer()
        s = env.reset()
        s = processState(s)
        internal_buffer.append(s)
        d = False
        rAll = 0
        j = 0
        #Episode.
        while not d:
            #Count the steps of this episode so jList records real episode lengths.
            j += 1
            #Choose an action greedily (with e chance of random action) from the Q-network.
            if np.random.rand(1) < e or total_steps < pre_train_steps:
                a = np.random.randint(4)
            else:
                a = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:np.hstack(internal_buffer[-4:])})[0]
            #Execute action a and receive new state observation, reward and done flag.
            s1,r,d,info = env.step(a)
            s1 = processState(s1)
            internal_buffer.append(s1)
            total_steps += 1
            #Store (previous 4-frame stack, action, reward, new 4-frame stack, done).
            episodeBuffer.add(np.reshape(np.array([np.hstack(internal_buffer[-5:-1]),a,r,
                                                   np.hstack(internal_buffer[-4:]),d]),[1,5]))
            #Only the last five frames are ever read, so drop older ones to bound memory use.
            del internal_buffer[:-5]
            #Decrease e linearly over training when pre-train steps are over.
            if total_steps > pre_train_steps:
                if e > endE:
                    e -= stepDrop
                #Update main Q-network and target Q-network every update_freq steps.
                if total_steps % update_freq == 0:
                    trainBatch = myBuffer.sample(batch_size)
                    #Double DQN: the main network picks the next action, the target network scores it.
                    Q1 = sess.run(mainQN.predict,feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,3])})
                    Q2 = sess.run(targetQN.Qout,feed_dict={targetQN.scalarInput:np.vstack(trainBatch[:,3])})
                    #1 for non-terminal transitions, 0 for terminal ones.
                    end_multiplier = -(trainBatch[:,4] - 1)
                    doubleQ = Q2[range(batch_size),Q1]
                    targetQ = trainBatch[:,2] + (y*doubleQ * end_multiplier)
                    _ = sess.run(mainQN.updateModel, feed_dict={mainQN.scalarInput:np.vstack(trainBatch[:,0]),
                                                                   mainQN.targetQ:targetQ, mainQN.actions:trainBatch[:,1]})
                    updateTarget(targetOps,sess)
            rAll += r

        #Experiences reach the shared replay buffer only once the episode has ended.
        myBuffer.add(episodeBuffer.buffer)
        jList.append(j)
        rList.append(rAll)
        #Periodically save the model.
        if i % 1000 == 0 and i != 0:
            print('-------------------------------------------------')
            print(i,np.mean(rList[-1000:]))
            save.save(sess,path+'/model-'+str(i)+'.ckpt')
            print("Saved Model")
        if len(rList) % 10 == 0:
            print(i,np.mean(rList[-10:]), e)
    #Final checkpoint; the Saver object in this notebook is named `save`.
    save.save(sess,path+'/model-'+str(i)+'.ckpt')
#sum(rList)/num_episodes is the average episode reward, not a percentage.
print("Mean reward per episode: " + str(sum(rList)/num_episodes))

In [None]:
#Build the evaluation network in a fresh graph. Added to the graph that already holds
#mainQN and targetQN, it would receive new auto-generated variable names that the
#checkpoint does not contain.
#NOTE(review): relies on TF's default naming giving this network the same variable names
#as mainQN had in the training graph - confirm against a real checkpoint.
tf.reset_default_graph()
evalQN = Qnetwork(h_size)
#Number of evaluation games to play.
evalBatchSize = 10000
env = gym.make('Breakout-v0')

save = tf.train.Saver()
#Blank frame used to pad the frame stack at the start of an episode.
sf = np.zeros([1,7056])

#Total reward of each evaluation game.
evalrList = []


#Tests the latest saved policy in path over a sample of evalBatchSize games.
with tf.Session() as sess:
    ckpt = tf.train.get_checkpoint_state(path)
    #get_checkpoint_state returns None when the directory holds no checkpoint.
    if ckpt is None:
        raise IOError('No checkpoint found in ' + path)
    print('Loading Model', ckpt.model_checkpoint_path + '...')
    save.restore(sess,ckpt.model_checkpoint_path)
    for i in range(evalBatchSize):
        s = env.reset()
        s = processState(s)
        eval_frame_buffer = [sf,sf,sf]
        eval_frame_buffer.append(s)
        d = False
        evalrAll = 0
        while not d:
            #Greedy action from the restored network; [0] unwraps the single-sample batch.
            a = sess.run(evalQN.predict, feed_dict={evalQN.scalarInput:np.hstack(eval_frame_buffer[-4:])})[0]
            s1, r, d, info = env.step(a)
            s1 = processState(s1)
            eval_frame_buffer.append(s1)
            #Only the last four frames are read, so drop older ones to bound memory use.
            del eval_frame_buffer[:-4]
            evalrAll += r
        #Record the game's total reward so the plot below has data.
        evalrList.append(evalrAll)

#Average the rewards over consecutive groups of 100 games and plot the averages.
rMat = np.resize(np.array(evalrList),[len(evalrList)//100,100])
rMean = np.average(rMat,1)
plt.plot(rMean)
plt.xlabel('Block of 100 evaluation games')
plt.ylabel('Mean reward per game')
                
            