In [2]:
import gym
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow.contrib.slim as slim

In [3]:
env = gym.make('CartPole-v0')
state_size = 4 # [Position, Velocity, Pole Angle, Pole Velocity]
action_size = 2 # [Left, Right] 
#state_size = 2 # [Position, Velocity, Pole Angle, Pole Velocity]
#action_size = 3 # [Left, Right] 

#Reward = 1 for each step taken including termination step

In [4]:
gamma = 0.99

def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r


In [5]:
class Agent():
    def __init__(self, lr, s_size, a_size,h_size):
        
        #Feedforward
        self.state_in= tf.placeholder(shape=[None,s_size],dtype=tf.float32)
        hidden = slim.fully_connected(self.state_in,h_size,biases_initializer=None,activation_fn=tf.nn.relu)
        self.output = slim.fully_connected(hidden,a_size,activation_fn=tf.nn.softmax,biases_initializer=None)
        self.chosen_action = tf.argmax(self.output,1)

        #Training Procedure
        
        self.reward_holder = tf.placeholder(shape = [None], dtype= tf.float32)
        self.action_holder = tf.placeholder(shape = [None], dtype=tf.int32)
        
        self.indexes = tf.range(0,tf.shape(self.output)[0]) * tf.shape(self.output)[1]+ self.action_holder
        
        self.responsible_outputs = tf.gather(tf.reshape(self.output, [-1]), self.indexes)

        self.loss = -tf.reduce_mean(tf.log(self.responsible_outputs)*self.reward_holder)
        
        tvars = tf.trainable_variables()
        self.gradient_holders = []
        for idx,var in enumerate(tvars):
            placeholder = tf.placeholder(tf.float32,name=str(idx)+'_holder')
            self.gradient_holders.append(placeholder)
        
        self.gradients = tf.gradients(self.loss,tvars)
        
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders,tvars))

In [6]:
tf.reset_default_graph() #Clear the Tensorflow graph.

myAgent = Agent(lr=1e-2,s_size=state_size,a_size=action_size,h_size=8) #Load the agent.

total_episodes = 2000 #Set total number of episodes to train agent on.
max_ep = 999
update_frequency = 5

init = tf.global_variables_initializer()

# Launch the tensorflow graph
sess = tf.Session()
sess.run(init)
i = 0
total_reward = []
total_length = []

#Reset the buffer or variable initiated by slim.fully_connected
gradBuffer = sess.run(tf.trainable_variables())
for ix,grad in enumerate(gradBuffer):
    gradBuffer[ix] = grad * 0
while i < total_episodes:
    s = env.reset()
    running_reward = 0
    ep_history = []
    for j in range(max_ep):
        #Probabilistically pick an action given our network outputs.        
        a_dist = sess.run(myAgent.output,feed_dict={myAgent.state_in:[s]})
        a = np.random.choice(a_dist[0],p=a_dist[0])
        a = np.argmax(a_dist == a)
        s1,r,d,_ = env.step(a) #Get our reward for taking an action given a bandit.
        if i % 100 == 0 and (total_episodes - i) < 540:
            env.render()
        ep_history.append([s,a,r,s1])
        s = s1
        running_reward += r
        if d == True:
            #Update the network.
            ep_history = np.array(ep_history)
            
            ep_history[:,2] = discount_rewards(ep_history[:,2])
            feed_dict={myAgent.reward_holder:ep_history[:,2],
                    myAgent.action_holder:ep_history[:,1],myAgent.state_in:np.vstack(ep_history[:,0])}
           # print(feed_dict)
            grads = sess.run(myAgent.gradients, feed_dict=feed_dict)
            for idx,grad in enumerate(grads):
                gradBuffer[idx] += grad

            if i % update_frequency == 0 and i != 0:
                feed_dict= dictionary = dict(zip(myAgent.gradient_holders, gradBuffer))
                _ = sess.run(myAgent.update_batch, feed_dict=feed_dict)
                for ix,grad in enumerate(gradBuffer):
                    gradBuffer[ix] = grad * 0

            total_reward.append(running_reward)
            total_length.append(j)
            break

    
        #Update our running tally of scores.
    if i % 100 == 0:
        print(np.mean(total_reward[-100:]))
    i += 1

saver = tf.train.Saver()

#save_path = saver.save(sess,'/home/dan/JupyterNotebook/QL2Data.ckpt')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


12.0
24.39
30.12
38.76
48.0
66.07
97.91
114.39
126.15
132.22
167.46
175.56
177.72
178.98
190.78
191.84
175.91
169.25
186.11
196.27


In [7]:
d = False
s = env.reset()
i = 0
while not d:
    a = np.random.randint(2)
    s1,r,d,_ = env.step(a)
    env.render()
    i+=1

In [8]:
d = False
s = env.reset() 
i = 0
while not d:
    a = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
    a = a[0]
    s1, r, d,_ = env.step(a)
    s = s1
    env.render()
    i+=1
            

In [9]:
env.close()

In [10]:
#Vanilla RNNaccording to Lecture slides
#-------------------------------------
# h = Hidden state
# x = input vector (observation) 4x1
# y = output vector (action 0 or 1) 2x1
# p = probability output

# a_t = W * h_{t-1} * U * x_t + b
# h_t = tanh(a)
# o_t = V*h_t + c
# p_t = softmax(o_t)

class VanillaRNNAgent():
    def __init__(self,lr,obs_size_action_size):
        
        x = tf.placeholder(shape=[obs_size], dtype = float32)
        h = tf.placeholder()

