In [1]:
import tensorflow as tf
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm_notebook as tqdm
from collections import *
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

Using TensorFlow backend.


In [2]:

env = gym.make("MsPacman-v0")
obs=env.reset()


In [3]:
img = env.render(mode='rgb_array')  
n_outputs=env.action_space.n
n_outputs

9

In [4]:
def preprocess_img(img):
  
    img=img[:180,:,:]/255
    img=cv2.resize(img,(88,88))
    X=np.zeros((88,88,1))
    X[:,:,0]=np.dot(img[...,:3], [0.2989, 0.5870, 0.1140])
    return X


In [13]:
tf.reset_default_graph()

def QNetwork(X,scope):
  
  #Making structure of Q_Network and collecting weights
    
    initializer=tf.contrib.layers.variance_scaling_initializer()
    with tf.variable_scope(scope) as scope:
    
        conv1=tf.layers.conv2d(inputs=X,kernel_size=(8,8),strides=(4,4),filters=32,padding='SAME',kernel_initializer=initializer)
    
        conv2=tf.layers.conv2d(inputs=conv1,filters=64,kernel_size=(4,4),strides=(2,2),padding='SAME',kernel_initializer=initializer)
    
        conv3=tf.layers.conv2d(inputs=conv2,filters=64,kernel_size=(3,3),strides=(1,1),padding='SAME',kernel_initializer=initializer)
    
        flat=tf.layers.flatten(conv3)
    
        fc = tf.contrib.layers.fully_connected(flat, num_outputs=128, weights_initializer=initializer)
    
        output=tf.contrib.layers.fully_connected(fc, num_outputs=n_outputs,activation_fn=None,weights_initializer=initializer)
    
        var = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)}
    
        return var,output

In [14]:
epsilon=0.5
eps_min=0.05
eps_max=1
K=0.00001
num_episodes=1000
steps_per_action={}
global_step=0
train_step=5
start_step=200
batch_size=32
discount_factor=0.8
learning_rate=0.01

In [15]:
def best_action(action,steps):
  
    epsilon=max(eps_min,eps_max-K*steps)
  
    if np.random.rand()<epsilon:
        return np.random.randint(n_outputs)
  
    else:
        return action

In [16]:
def add(frame,action,reward,next_frame,done):
    
    if(len(replay_memory)==memory_capacity):
        replay_memory.popleft()
      
    replay_memory.append([frame,action,reward,next_frame,done])

In [17]:
memory_capacity=1000000
replay_memory=deque([])       #Holding frame set

def sample(batch_size):
  
    perm=np.random.permutation(len(replay_memory))[:batch_size]
    store=np.array(replay_memory)[perm]
    
    return store[:,0],store[:,1],store[:,2],store[:,3],store[:,4]

In [18]:
img_shape=(88,88)
tf.reset_default_graph()
X=tf.placeholder(tf.float32,shape=(None,img_shape[0],img_shape[1],1))

In [19]:
# we build our Q network, which takes the input X and generates Q values for all the actions in the state
mainQ, mainQ_outputs = QNetwork(X, 'mainQ')
# similarly we build our target Q network
targetQ, targetQ_outputs = QNetwork(X, 'targetQ')

In [20]:
#stores action array batch-wise
X_action=tf.placeholder(dtype=tf.int32,shape=(None,))
#computes Q(s,a) value for training
Q_action=tf.reduce_sum(mainQ_outputs*tf.one_hot(X_action,n_outputs),axis=-1,keepdims=True)

In [21]:
#copy weights from mainQ to targetQ
copy_op = [tf.assign(target_name, mainQ[var_name]) for var_name, target_name in targetQ.items()]
copy_weights = tf.group(*copy_op)

In [22]:
y=tf.placeholder(tf.float32,shape=(None,1))
cost=tf.reduce_mean(tf.square(y-Q_action))
optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
init=tf.global_variables_initializer()

Instructions for updating:
Use tf.cast instead.


In [None]:
with tf.Session() as sess:
  
    init.run()
  
    #for each episode i
    for i in tqdm(range(num_episodes)):
    
        obs=env.reset()
        done=False
        reward_per_episode=0
        epoch=0
     
        while not done:
            #Reduce image size and convert it to black and white
            obs=preprocess_img(obs)
      
            actions=mainQ_outputs.eval(feed_dict={X:[obs]})
      
            action=np.argmax(actions,axis=-1)
        
      
            action=best_action(action,global_step)
      
           # steps_per_action[action]=steps_per_action[action]+1
            #print(action)
            next_obs,reward,done,_= env.step(action)
      
            next_obs=preprocess_img(next_obs)
      
            add(obs,action,reward,next_obs,done)
      
            if global_step%train_step==0 and global_step>start_step:
        
                obs_arr,action_arr,reward_arr,next_obs_arr,done_arr=sample(batch_size)
        
                obs_arr=[x for x in obs_arr]
        
                next_obs_arr=[x for x in next_obs_arr]
        
                y_o=reward_arr+discount_factor*np.max(targetQ_outputs.eval(feed_dict={X:next_obs_arr}))*(1-done_arr)
                        
                
                train_loss,opt=sess.run([cost,optimizer],feed_dict={X:obs_arr,y:np.expand_dims(y_o,axis=-1),X_action:action_arr})
                
            if global_step%50==0 and global_step>start_step:
                
                copy_weights.run()
                
            global_step+=1
            reward_per_episode+=reward
            epoch+=1
            
            
            
        print('epoch',epoch,'reward',reward_per_episode)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

epoch 1245 reward 670.0
epoch 599 reward 200.0
epoch 693 reward 220.0
epoch 660 reward 140.0
epoch 753 reward 150.0
epoch 540 reward 180.0
epoch 631 reward 260.0
epoch 512 reward 100.0
epoch 713 reward 230.0
epoch 865 reward 460.0
epoch 717 reward 240.0
epoch 751 reward 190.0
epoch 622 reward 210.0
epoch 621 reward 210.0
epoch 536 reward 130.0
epoch 646 reward 200.0
epoch 672 reward 240.0
epoch 758 reward 360.0
epoch 762 reward 260.0
epoch 618 reward 240.0
epoch 768 reward 250.0
epoch 829 reward 1740.0
epoch 533 reward 210.0
epoch 626 reward 250.0
epoch 603 reward 250.0
epoch 482 reward 150.0
epoch 717 reward 300.0
epoch 1100 reward 510.0
epoch 655 reward 220.0
epoch 520 reward 120.0
epoch 695 reward 290.0
epoch 691 reward 270.0
epoch 711 reward 310.0
epoch 531 reward 190.0
epoch 628 reward 150.0
epoch 482 reward 110.0
epoch 522 reward 150.0
epoch 715 reward 240.0
epoch 531 reward 190.0
epoch 602 reward 190.0
epoch 1003 reward 370.0
epoch 715 reward 330.0
epoch 638 reward 370.0
epoch 6

array([3, 8, 6, 7, array([0]), 3, 0, array([1]), array([0])], dtype=object)

206

205