In [1]:
import scipy
import scipy.misc
import numpy as np
import random
import tensorflow as tf
import gym
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
class env_wrapper():
    """Thin wrapper around a gym environment.

    Observations pass through `processState` and actions through
    `processAction` (both are identity functions here), so a subclass can
    override either hook without touching the stepping logic.
    """

    def __init__(self, game_title, actions):
        """Create the gym environment and reset it once.

        Args:
            game_title: environment id handed to `gym.make`.
            actions: sequence of action labels; only its length is used,
                stored as `n_actions`.

        Raises:
            Exception: whatever `gym.make` raised, re-raised after the error
                message has been printed.
        """
        # NOTE(review): hard-coded and not read by any visible code;
        # presumably the observation length of the target game -- confirm.
        self.state_size = 8
        self.game_title = game_title
        self.actions = actions
        self.n_actions = len(self.actions)
        try:
            self.env = gym.make(self.game_title)
        except Exception:
            # Propagate the failure: returning here would hand the caller a
            # half-built wrapper with no `env` attribute, which only fails
            # later with an unrelated AttributeError.
            print ("ERROR : Wrong game title.")
            raise
        self.env.reset()

    def processState(self, state):
        """Hook for observation preprocessing; returns `state` unchanged."""
        return state

    def processAction(self, action):
        """Hook for action preprocessing; returns `action` unchanged."""
        return action

    def make_reset(self):
        """Reset the environment and return the processed first observation."""
        state = self.env.reset()

        return self.processState(state)

    def make_step(self, action, render = False):
        """Advance the environment by one step.

        Args:
            action: action to apply, passed through `processAction` first.
            render: when true, `env.render()` is called after the step.

        Returns:
            The 4-tuple returned by `env.step`, with its first element
            passed through `processState`.
        """
        action = self.processAction(action)

        state, ret1, ret2, ret3 = self.env.step(action)

        if render:
            self.env.render()

        return self.processState(state), ret1, ret2, ret3

In [4]:
# Labels of the discrete actions. They are passed to env_wrapper, which only
# uses how many there are.
action_names = [
    "DO_NOTHING",
    "FIRE_LEFT",
    "FIRE_MAIN",
    "FIRE_RIGHT",
]
n_actions = len(action_names)

In [5]:
# Environment id that env_wrapper hands to gym.make.
GAME_TITLE = "LunarLander-v2"

In [6]:
class AV_qnetwork():
    """Dueling (advantage/value) Q-network built on the default TF graph.

    The input is projected to `h_size` features, split into two equal halves
    that feed an advantage head (`n_actions` outputs) and a value head (one
    output), and recombined as Q = V + (A - mean(A)). No bias terms or
    activation functions are applied between the layers.

    Variables are created in the order IW, AW, VW.
    """

    def __init__(self, h_size, n_actions, state_size=8):
        """Build placeholders, the forward pass and the training op.

        Args:
            h_size: width of the first projection; must be even because it
                is split into two equal streams.
            n_actions: number of discrete actions (width of the Q output).
            state_size: length of one observation vector (default 8, the
                value that was previously hard-coded).
        """
        # Floor division keeps the stream width an int under true division.
        stream_size = h_size // 2

        self.scalarInput = tf.placeholder(shape=[None, state_size],dtype=tf.float32)
        self.IW = tf.Variable(tf.random_normal([state_size,h_size]))
        self.inp = tf.matmul(self.scalarInput,self.IW)

        # NOTE(review): (axis, num_split, value) is the pre-1.0 tf.split
        # argument order; it must be swapped when moving to TF >= 1.0.
        self.streamAC,self.streamVC = tf.split(1,2,self.inp)
        self.streamA = tf.contrib.layers.flatten(self.streamAC)
        self.streamV = tf.contrib.layers.flatten(self.streamVC)
        self.AW = tf.Variable(tf.random_normal([stream_size, n_actions]))
        self.VW = tf.Variable(tf.random_normal([stream_size,1]))
        self.Advantage = tf.matmul(self.streamA,self.AW)
        self.Value = tf.matmul(self.streamV,self.VW)

        # Arithmetic operators build the same graph ops as tf.sub / tf.mul
        # and remain available after those functions were removed.
        mean_advantage = tf.reduce_mean(self.Advantage,reduction_indices=1,keep_dims=True)
        self.Qout = self.Value + (self.Advantage - mean_advantage)
        self.predict = tf.argmax(self.Qout,1)

        # Training inputs: one target value and one taken action per sample.
        self.targetQ = tf.placeholder(shape=[None],dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None],dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions,n_actions,dtype=tf.float32)

        # Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(self.Qout * self.actions_onehot, reduction_indices=1)

        self.td_error = tf.square(self.targetQ - self.Q)
        self.loss = tf.reduce_mean(self.td_error)
        self.trainer = tf.train.AdamOptimizer(learning_rate=0.0001)
        self.updateModel = self.trainer.minimize(self.loss)

In [7]:
class experience_buffer():
    """FIFO replay buffer that keeps at most `buffer_size` experiences."""

    def __init__(self, buffer_size = 50000):
        """Create an empty buffer.

        Args:
            buffer_size: maximum number of experiences retained.
        """
        self.buffer = []
        self.buffer_size = buffer_size

    def add(self,experience):
        """Append every item of `experience`, evicting the oldest on overflow.

        Args:
            experience: iterable of experiences; each item becomes one
                buffer entry. Any iterable works, not only sized sequences.
        """
        self.buffer.extend(experience)
        # Trim after extending so the cap also holds when a single call
        # supplies more items than the buffer can hold.
        overflow = len(self.buffer) - self.buffer_size
        if overflow > 0:
            del self.buffer[:overflow]

    def sample(self,size):
        """Draw `size` distinct experiences uniformly at random.

        Args:
            size: number of experiences to draw.

        Returns:
            Array of shape [size, 5], one row per sampled experience.

        Raises:
            ValueError: from `random.sample` when `size` exceeds the number
                of stored experiences.
        """
        return np.reshape(np.array(random.sample(self.buffer,size)),[size,5])

In [8]:
# Hyper-parameters and run options read by DDQL.__init__.
lparams = dict(
    batch_size=35,          # transitions sampled per main-network update
    MQN_updatefreq=6,       # main network is trained every this many steps
    TQN_updatefreq=6000,    # target network is synced every this many steps
    saver_freq=100,         # a checkpoint is written every this many episodes
    y=0.99,                 # discount applied to the bootstrapped Q target
    startE=1,               # initial epsilon of the epsilon-greedy policy
    endE=0.1,               # epsilon stops decaying once it reaches this
    anneling_steps=50000,   # steps over which epsilon falls from startE to endE
    num_episodes=1,         # not read by DDQL; train() takes its own argument
    pre_train_steps=50000,  # purely random steps taken before any training
    load_model=False,       # restore the checkpoint found under `path` first
    path="./dqn",           # checkpoint directory
    h_size=256,             # hidden width handed to AV_qnetwork
    gamma=0.99,             # discount used when folding episode rewards
    render=True,            # render the environment on every step
)

In [9]:
class DDQL():
    """Double-DQN agent with a main and a target `AV_qnetwork`.

    Actions are chosen epsilon-greedily from the main network. Finished
    episodes are pushed into a replay buffer, and the main network is trained
    on sampled batches whose targets use the main network to pick the next
    action and the target network to evaluate it.
    """

    def __init__(self, lparams, env):
        """Read the hyper-parameters and build the TF graph.

        Args:
            lparams: dict providing the keys batch_size, MQN_updatefreq,
                TQN_updatefreq, saver_freq, y, startE, endE, anneling_steps,
                pre_train_steps, load_model, path, h_size, gamma and render.
            env: environment wrapper exposing `n_actions`, `make_reset()`
                and `make_step(action, render)`.

        Side effects:
            Resets the default TF graph and creates the checkpoint directory
            `path` when it does not exist.
        """
        self.batch_size = lparams["batch_size"]
        self.MQN_updatefreq = lparams["MQN_updatefreq"]
        self.TQN_updatefreq = lparams["TQN_updatefreq"]
        self.saver_freq = lparams["saver_freq"]
        self.y = lparams["y"]
        self.startE = lparams["startE"]
        self.endE = lparams["endE"]
        self.anneling_steps = lparams["anneling_steps"]
        self.pre_train_steps = lparams["pre_train_steps"]
        self.load_model = lparams["load_model"]
        self.path = lparams["path"]
        self.h_size = lparams["h_size"]
        self.gamma = lparams["gamma"]
        self.render = lparams["render"]

        self.env = env

        tf.reset_default_graph()
        self.mainQN = AV_qnetwork(self.h_size, self.env.n_actions)
        self.targetQN = AV_qnetwork(self.h_size, self.env.n_actions)
        self.init = tf.initialize_all_variables()
        self.saver = tf.train.Saver()
        self.trainables = tf.trainable_variables()

        # The main network is built first, so the first half of the trainable
        # variables belongs to it and the second half to the target network.
        # The copy ops are created once here; creating them on every sync
        # would add new nodes to the graph each time.
        half = len(self.trainables) // 2
        self.target_update_ops = [
            target_var.assign(main_var.value())
            for main_var, target_var in zip(self.trainables[:half],
                                            self.trainables[half:])
        ]

        self.algBuffer = experience_buffer()

        self.epsilon = self.startE
        # Linear decay: epsilon drops by this much per post-warm-up step.
        self.stepDrop = (self.startE - self.endE) / self.anneling_steps

        self.jList = []   # index of the last step of each episode
        self.rList = []   # undiscounted total reward of each episode
        self.total_steps = 0

        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def updateTarget(self, session):
        """Copy the main network's trainable variables into the target network.

        Args:
            session: TF session in which the copy ops are run.
        """
        session.run(self.target_update_ops)

    def discount_rewards(self, r):
        """Return the discounted reward-to-go for every step of an episode.

        Args:
            r: 1-D array of per-step rewards.

        Returns:
            Float array of the same shape where entry t equals
            r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...
        """
        r = np.asarray(r)
        # Always accumulate in float so integer rewards are not truncated.
        discounted_r = np.zeros(r.shape, dtype=np.float64)
        running_add = 0.0
        for t in reversed(range(r.size)):
            running_add = running_add * self.gamma + r[t]
            discounted_r[t] = running_add
        return discounted_r

    def choose_action(self, state, session):
        """Pick an action epsilon-greedily.

        A uniformly random action is returned with probability `epsilon`,
        and always during the first `pre_train_steps` steps; otherwise the
        main network's argmax action for `state` is returned.

        Args:
            state: a single observation.
            session: TF session used to evaluate the main network.
        """
        if np.random.rand(1) < self.epsilon or self.total_steps < self.pre_train_steps:
            a = np.random.randint(0, self.env.n_actions)
        else:
            a = session.run(self.mainQN.predict,feed_dict={self.mainQN.scalarInput:[state]})[0]

        return a

    def _train_on_batch(self, sess):
        """Run one Double-DQN update of the main network on a sampled batch."""
        # Columns: 0 state, 1 action, 2 reward, 3 next state, 4 done flag.
        trainBatch = self.algBuffer.sample(self.batch_size)
        next_states = np.vstack(trainBatch[:,3])
        # The main network selects the next action, the target network
        # evaluates it.
        Q1 = sess.run(self.mainQN.predict,feed_dict={self.mainQN.scalarInput:next_states})
        Q2 = sess.run(self.targetQN.Qout,feed_dict={self.targetQN.scalarInput:next_states})
        # 1 for non-terminal transitions, 0 for terminal ones.
        end_multiplier = -(trainBatch[:,4] - 1)
        doubleQ = Q2[np.arange(self.batch_size),Q1]
        targetQ = trainBatch[:,2] + (self.y * doubleQ * end_multiplier)
        sess.run(self.mainQN.updateModel,
                 feed_dict={self.mainQN.scalarInput:np.vstack(trainBatch[:,0]),
                            self.mainQN.targetQ:targetQ,
                            self.mainQN.actions:trainBatch[:,1]})

    def train(self, num_episodes, frame_limit):
        """Run the training loop.

        Args:
            num_episodes: number of episodes to play.
            frame_limit: maximum number of steps per episode.

        Raises:
            ValueError: when `load_model` is set but no checkpoint exists
                under `path`.

        Side effects:
            Writes a checkpoint every `saver_freq` episodes and once after
            the last episode, and prints progress every 10 episodes.
        """
        with tf.Session() as sess:
            # Initialise first: running the initializer after a restore
            # would overwrite the weights that were just loaded.
            sess.run(self.init)
            if self.load_model == True:
                print('Loading Model...')
                ckpt = tf.train.get_checkpoint_state(self.path)
                if ckpt is None or not ckpt.model_checkpoint_path:
                    raise ValueError("No checkpoint found in %s" % self.path)
                self.saver.restore(sess,ckpt.model_checkpoint_path)
            self.updateTarget(sess)

            last_episode = None
            for i in tqdm(range(num_episodes)):
                last_episode = i
                episodeBuffer = experience_buffer()
                state = self.env.make_reset()

                for iteration in range(frame_limit):
                    action = self.choose_action(state, sess)
                    new_state, reward, gameover, _ = self.env.make_step(action, self.render)
                    self.total_steps += 1

                    # dtype=object keeps the mixed array/scalar record as
                    # five cells instead of asking numpy to infer a shape.
                    transition = np.array([state, action, reward, new_state, gameover],
                                          dtype=object)
                    episodeBuffer.add(np.reshape(transition,[1,5]))

                    if self.total_steps > self.pre_train_steps:
                        if self.epsilon > self.endE:
                            self.epsilon -= self.stepDrop

                        if self.total_steps % (self.TQN_updatefreq) == 0:
                            print("Target network updated.")
                            self.updateTarget(sess)

                        if self.total_steps % (self.MQN_updatefreq) == 0:
                            self._train_on_batch(sess)

                    state = new_state

                    if gameover:
                        break

                bufferArray = np.array(episodeBuffer.buffer)
                episodeRewards = bufferArray[:,2]
                total_reward = np.sum(episodeRewards)
                # NOTE(review): the stored reward column is replaced by the
                # discounted reward-to-go, and _train_on_batch then adds a
                # bootstrapped next-state value on top of it -- confirm this
                # combination is intended.
                bufferArray[:,2] = self.discount_rewards(episodeRewards)
                # zip wraps each row in a 1-tuple; sample() reshapes the
                # batch back to [size, 5]. list() keeps the result sized on
                # Python 3 as well.
                self.algBuffer.add(list(zip(bufferArray)))
                self.jList.append(iteration)
                self.rList.append(total_reward)
                if i % self.saver_freq == 0:
                    self.saver.save(sess,self.path+'/model-'+str(i)+'.cptk')
                    print("Saved Model")
                if len(self.rList) % 10 == 0:
                    print("total: %s  rlist: %s" % (self.total_steps, self.rList[-10:]))
                    print("mean:  %s  e:  %s" % (np.mean(self.rList[-10:]), self.epsilon))
                    print(self.jList[-10:])
            # Nothing to save when no episode was played.
            if last_episode is not None:
                self.saver.save(sess,self.path+'/model-'+str(last_episode)+'.cptk')

In [10]:
# Build the environment wrapper; action_names only determines n_actions.
envhandle = env_wrapper(GAME_TITLE, action_names)

[2016-12-01 14:46:34,952] Making new env: LunarLander-v2


In [11]:
# Build the agent: this resets the default TF graph and creates the main and
# target Q-networks.
ddql = DDQL(lparams, envhandle)

In [None]:
# Train for 1000 episodes of at most 1000 steps each. The episode count comes
# from this call; lparams["num_episodes"] is not read by DDQL.
ddql.train(num_episodes = 1000, frame_limit = 1000)

  0%|          | 1/1000 [00:01<29:38,  1.78s/it]

Saved Model


  1%|          | 10/1000 [00:15<27:23,  1.66s/it]

total: 889  rlist: [-120.32550079920317, -272.32822856698465, -255.61325231581989, -154.34624580734726, -170.07726667698006, -177.94052757979819, -169.68043348023764, -369.32183755348558, -156.6047513916663, -61.99741464945]
mean:  -190.823545882  e:  1
[66, 86, 60, 90, 100, 87, 79, 107, 91, 113]


In [None]:
# Show the done flag (column 4) of one randomly sampled stored transition.
# The per-episode buffer is local to DDQL.train, so read the agent's replay
# buffer instead.
ddql.algBuffer.sample(1)[0][4]

In [None]:
# Moving average of the per-episode total reward. The rewards live on the
# agent, and the number of windows follows from how many episodes were played.
reward_window = 10
pList = [np.average(ddql.rList[i:i + reward_window])
         for i in range(max(len(ddql.rList) - reward_window + 1, 0))]

In [None]:
# Plot the averaged rewards with labelled axes so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(range(1, len(pList) + 1), pList)
ax.set_xlabel("Window index")
ax.set_ylabel("Mean total reward per window")
ax.set_title("Averaged episode reward");

In [None]:
# Roll out the greedy policy of a saved model for five monitored episodes,
# using the agent (`ddql`) and environment wrapper (`envhandle`) built above.
with tf.Session() as sess:
    print('Loading Model...')
    # To load the newest checkpoint written by this notebook's run instead:
    #   ckpt = tf.train.get_checkpoint_state(ddql.path)
    #   ddql.saver.restore(sess, ckpt.model_checkpoint_path)
    # NOTE(review): this checkpoint belongs to a different run directory than
    # lparams["path"] -- confirm it is the model meant to be evaluated.
    ddql.saver.restore(sess,"./dqn3"+'/model-'+str(1199)+'.cptk')

    for i in tqdm(range(5)):
        envhandle.env.monitor.start('./tmp/carracing1-e' + str(i))
        # make_reset / make_step already apply processState.
        s = envhandle.make_reset()
        rAll = 0
        j = 0
        while (j < 1000):
            j+=1
            a = sess.run(ddql.mainQN.predict,feed_dict={ddql.mainQN.scalarInput:[s]})[0]
            s,r,d,info = envhandle.make_step(a, render = True)
            rAll += r
            # Stop stepping once the environment reports the episode is over.
            if d:
                break
        envhandle.env.monitor.close()
        print("episode %d total reward: %s" % (i, rAll))