# Policy Gradient Learning with Cart Pole V0

In [2]:
import gym
import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
env = gym.make("CartPole-v0")  # Create the CartPole environment
# Watch a short simulation driven by random actions
env.reset()  # Initialise the environment before the first step
rewards = []  # Reward received at every simulated step
for _ in range(100):
    # Redraw one frame of the environment; the default mode pops up a window.
    env.render()
    # Take a random action; step() advances the simulation by one time step.
    # `state` is also known as the observation.
    state, reward, done, info = env.step(env.action_space.sample())
    rewards.append(reward)
    if done:
        # The episode is over: start a new one instead of stepping a finished
        # environment, which gym documents as undefined behaviour.
        env.reset()
env.close()  # Close the environment and clear the memory




In [4]:
# Training schedule
train_episodes = 3000  # Number of episodes (one episode is one game)
max_steps = 900        # Upper bound on the steps taken in one episode
batch_size = 5

# Optimisation
gamma = 0.99           # Discount rate
learning_rate = 0.001

# Network shape
hidden_size = 64       # Hidden neurons
action_size = 2        # Two possible actions: left / right
input_size = 4         # Four values are given by the state

In [5]:
# Build Deep Neural Network
class PGAgent():
    """Policy-gradient agent backed by a one-hidden-layer softmax network.

    The constructor builds everything in the current default TensorFlow graph
    (TF1 graph mode): the policy network, the loss, the gradient tensors and an
    op that applies externally accumulated gradients.

    Attributes exposed for the training loop:
        inputs: float32 placeholder of shape [None, input_size] for states.
        output_layer: softmax action probabilities, shape [None, action_size].
        actions: int32 placeholder of shape [None] with the chosen actions.
        rewards: float32 placeholder of shape [None] weighting each action.
        loss: scalar policy-gradient loss.
        gradients: gradients of `loss` w.r.t. every trainable variable.
        gradient_holders: one placeholder per trainable variable, used to feed
            gradients that were accumulated outside the graph.
        update_batch: op applying the fed gradients with RMSProp.
    """

    def __init__(self, input_size, action_size, hidden_size, learning_rate, gamma):
        """Build the network, loss and update ops.

        Args:
            input_size: number of values in one state vector.
            action_size: number of discrete actions.
            hidden_size: number of units in the hidden layer.
            learning_rate: learning rate given to the RMSProp optimizer.
            gamma: discount rate; stored on the instance, not used in the graph.
        """
        self.input_size = input_size
        self.action_size = action_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.gamma = gamma

        # Lower bound applied to probabilities before taking a logarithm, so a
        # saturated softmax (probability exactly 0) cannot produce log(0) = -inf
        # and turn the loss and gradients into NaN.
        min_probability = 1e-10

        # Make the NN
        # A placeholder reserves a slot in the graph while it is being built;
        # no data is passed in yet, the values are fed when the graph is run.
        self.inputs = tf.placeholder(tf.float32,
                                     shape=[None, input_size])

        # Using ELU is much better than using ReLU
        self.hidden_layer_1 = tf.contrib.layers.fully_connected(
            inputs=self.inputs,
            num_outputs=hidden_size,
            activation_fn=tf.nn.elu,
            weights_initializer=tf.random_normal_initializer())

        self.output_layer = tf.contrib.layers.fully_connected(
            inputs=self.hidden_layer_1,
            num_outputs=action_size,
            activation_fn=tf.nn.softmax)

        # Log prob output (clipped to stay finite)
        self.output_log_prob = tf.log(
            tf.clip_by_value(self.output_layer, min_probability, 1.0))

        # LOSS Function : feed the reward and chosen action in the DNN
        self.actions = tf.placeholder(tf.int32, shape=[None])
        self.rewards = tf.placeholder(tf.float32, shape=[None])

        # Get the probability of the action taken at each step: flatten the
        # [batch, action] probability matrix and index it at
        # row * action_count + action.
        output_shape = tf.shape(self.output_layer)
        self.indices = tf.range(0, output_shape[0]) * output_shape[1] + self.actions

        self.actions_probability = tf.gather(tf.reshape(self.output_layer, [-1]), self.indices)

        chosen_log_prob = tf.log(
            tf.clip_by_value(self.actions_probability, min_probability, 1.0))
        self.loss = -tf.reduce_mean(chosen_log_prob * self.rewards)

        # Collect some gradients after some training episodes outside the graph
        # and then apply them. Each holder carries the shape of its variable.
        # NOTE: tf.trainable_variables() lists every trainable variable of the
        # default graph, so the graph should contain only this agent.
        tvars = tf.trainable_variables()
        self.gradient_holders = [
            tf.placeholder(tf.float32, shape=var.get_shape(), name=str(idx) + '_holder')
            for idx, var in enumerate(tvars)
        ]

        self.gradients = tf.gradients(self.loss, tvars)

        # OPTIMIZER
        # Compared with AdaGrad's accumulated gradient history, RMSProp adds a
        # decay coefficient that controls how much of that history is kept.
        # Better to use RMSProp
        optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        self.update_batch = optimizer.apply_gradients(zip(self.gradient_holders, tvars))
        

In [6]:
# Weight rewards differently : weight immediate rewards higher than delayed reward

def discount_rewards(r, discount=None):
    """Return the discounted cumulative reward for every step of an episode.

    Element ``t`` of the result is ``r[t] + rate * r[t+1] + rate**2 * r[t+2] + ...``,
    so immediate rewards weigh more than delayed ones.

    Args:
        r: 1-D sequence (list or array) of per-step rewards.
        discount: discount rate to use; when ``None`` the module-level
            ``gamma`` is used.

    Returns:
        A float64 NumPy array with the same length as ``r``. The computation is
        always done in floating point, so integer rewards are not truncated.
    """
    rate = gamma if discount is None else discount
    step_rewards = np.asarray(r, dtype=np.float64)

    # Init discount reward vector
    discounted_reward = np.zeros(step_rewards.shape, dtype=np.float64)

    # Running_add: discounted sum of the rewards from step t to the end
    running_add = 0.0

    # Walk backwards so each step reuses the sum already built for later steps
    for t in reversed(range(step_rewards.size)):
        running_add = running_add * rate + step_rewards[t]
        discounted_reward[t] = running_add
    return discounted_reward

In [7]:
# Train the agent
# Clear the graph
tf.reset_default_graph()
agent = PGAgent(input_size, action_size, hidden_size, learning_rate, gamma)
# Launch the tensorflow graph
with tf.Session() as sess:
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    nb_episodes = 0

    # Per-episode statistics: total reward and number of steps
    total_reward = []
    total_length = []

    # Gradient accumulator: one zero-filled array per trainable variable,
    # shaped like the variable it belongs to.
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

    # While we have episodes to train
    while nb_episodes < train_episodes:
        state = env.reset()
        running_reward = 0
        # History of the current episode, kept as separate homogeneous lists
        # (a single mixed list would become a ragged object array).
        episode_states = []
        episode_actions = []
        episode_rewards = []

        for step in range(max_steps):
            # Probabilistically pick an action given our network outputs.
            # Within the session, data is fed to the placeholders when the
            # model is run.
            action_distribution = sess.run(agent.output_layer, feed_dict={agent.inputs: [state]})
            action_probs = action_distribution[0]
            # Sample the action index directly; matching on the sampled
            # probability value would always return index 0 on ties.
            action = np.random.choice(len(action_probs), p=action_probs)

            state_1, reward, done, info = env.step(action)

            # Append this step to the history of the episode
            episode_states.append(state)
            episode_actions.append(action)
            episode_rewards.append(reward)

            # Now we are in this state (state is now state 1)
            state = state_1

            running_reward += reward

            # Also close the episode when the step budget runs out, so it is
            # not silently dropped from the update and the statistics.
            if done or step == max_steps - 1:
                # Update the network
                returns = np.asarray(
                    discount_rewards(np.array(episode_rewards, dtype=np.float64)),
                    dtype=np.float64)
                # Standardise the returns within the episode (zero mean, unit
                # variance). This is a common variance-reduction baseline for
                # REINFORCE; the epsilon guards against a zero deviation.
                returns = (returns - returns.mean()) / (returns.std() + 1e-8)

                feed_dict = {agent.rewards: returns,
                             agent.actions: np.array(episode_actions, dtype=np.int32),
                             agent.inputs: np.vstack(episode_states)}
                grads = sess.run(agent.gradients, feed_dict=feed_dict)

                for idx, grad in enumerate(grads):
                    gradBuffer[idx] += grad

                # Apply the accumulated gradients after every `batch_size`
                # episodes (episodes are counted from 0, hence the +1).
                if (nb_episodes + 1) % batch_size == 0:
                    feed_dict = dict(zip(agent.gradient_holders, gradBuffer))
                    _ = sess.run(agent.update_batch, feed_dict=feed_dict)
                    for buffered in gradBuffer:
                        buffered.fill(0)

                total_reward.append(running_reward)
                total_length.append(step + 1)  # step is a 0-based index
                break

        # For each 100 episodes
        if nb_episodes % 100 == 0:
            print("Episode: {}".format(nb_episodes),
                    "Total reward: {}".format(np.mean(total_reward[-100:])))
        nb_episodes += 1

    saver.save(sess, "checkpoints/cartPoleGame.ckpt")
        
        
  



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.




Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






Episode: 0 Total reward: 10.0
Episode: 100 Total reward: 10.38
Episode: 200 Total reward: 10.38
Episode: 300 Total reward: 10.37
Episode: 400 Total reward: 9.87
Episode: 500 Total reward: 10.12
Episode: 600 Total reward: 10.17
Episode: 700 Total reward: 9.66
Episode: 800 Total reward: 9.62
Episode: 900 Total reward: 9.82
Episode: 1000 Total reward: 9.57
Episode: 1100 Total reward: 9.69
Episode: 1200 Total reward: 9.79
Episode: 1300 Total reward: 10.02
Episode: 1400 Total reward: 9.98
Episode: 1500 Total reward: 10.22
Episode: 1600 Total reward: 9.94
Episode: 1700 Total reward: 9.82
Episode: 1800 Total reward: 9.71
Episode: 1900 Total reward: 9.78
Episode: 2000 Total reward: 9.75
Episode: 2100 Total reward: 9.91
Episode: 2200 Total reward: 9.8
Episode: 2300 Total reward: 10.0
Episode: 2400 Total reward: 9.82
Episode: 2500 Total reward: 9.9
Episode: 2600 Total reward: 9.71
Episode: 2700 Total reward: 9.64
Episode: 2800 Total reward: 9.58
Episode: 2900 Total reward: 9.65


In [21]:
test_episodes = 60
test_max_steps = 400
with tf.Session() as sess:
    # Load the weights saved by the training cell into this new session
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    for episode in range(test_episodes):
        # Start every episode from the observation returned by reset(), rather
        # than from whatever `state` an earlier cell left behind.
        state = env.reset()
        for t in range(test_max_steps):
            env.render()
            # Probabilistically pick an action given our network outputs
            action_distribution = sess.run(agent.output_layer, feed_dict={agent.inputs: [state]})
            action_probs = action_distribution[0]
            # Sample the action index directly from the policy distribution
            action = np.random.choice(len(action_probs), p=action_probs)
            state, reward, done, info = env.step(action)  # Next state
            if done:
                break

env.close()

INFO:tensorflow:Restoring parameters from checkpoints\cartPoleGame.ckpt
