In [2]:
import tensorflow as tf
import numpy as np
import gym
from math import sqrt
from gym.spaces import Discrete, Box

In [3]:
# define the Multilayer perceptron here
def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    """Stack dense layers on top of ``x`` and return the final layer's output.

    Args:
        x: input tensor fed to the first dense layer.
        sizes: sequence of layer widths; every entry but the last is a hidden
            layer, the last entry is the output layer.
        activation: activation applied to each hidden layer.
        output_activation: activation applied to the output layer
            (``None`` leaves the output linear).

    Returns:
        The tensor produced by the output dense layer.
    """
    net = x
    # Hidden layers: all widths except the final one.
    for width in sizes[:-1]:
        net = tf.layers.dense(inputs=net, units=width, activation=activation)
    # Output layer uses its own (possibly absent) activation.
    out_width = sizes[-1]
    return tf.layers.dense(inputs=net, units=out_width, activation=output_activation)

In [4]:
# compute the reward-to-go (sum of rewards from each timestep to the end of the episode)
def reward_to_go(rews):
    """Return, for every timestep, the sum of rewards from that step onward.

    Args:
        rews: sequence of per-step rewards for one episode.

    Returns:
        A numpy array shaped and typed like ``rews`` where entry ``t`` equals
        ``rews[t] + rews[t+1] + ... + rews[-1]``.
    """
    rtgs = np.zeros_like(rews)
    last = len(rews) - 1
    # Walk backwards so each entry can reuse the already-computed suffix sum.
    for t in range(last, -1, -1):
        tail = rtgs[t + 1] if t < last else 0
        rtgs[t] = rews[t] + tail
    return rtgs
    

In [5]:
def train(env_name='CartPole-v0',hidden_sizes=[32],lr=0.001,epochs=50,batch_size=5000,render=False):
    """Train a categorical policy with the reward-to-go policy gradient.

    Args:
        env_name: environment id passed to ``gym.make``.
        hidden_sizes: widths of the policy network's hidden layers. The list is
            only read (a new list is built from it), never mutated.
        lr: learning rate for the Adam optimizer.
        epochs: number of epochs; each epoch collects one batch of experience
            and takes a single gradient step.
        batch_size: experience collection for an epoch stops at the first
            episode boundary where more than this many steps were gathered.
        render: if True, render the first episode of every epoch.

    Raises:
        AssertionError: if the observation space is not a ``Box`` or the
            action space is not ``Discrete``.
    """
    # make the environment
    env = gym.make(env_name)
    try:
        # Validate the spaces BEFORE touching shape[0] / .n so that an
        # unsupported environment fails with the explanatory message.
        assert isinstance(env.observation_space,Box),"works only for env with continuous obs spaces"
        assert isinstance(env.action_space,Discrete),"works only for env with discrete action spaces"
        obs_dim = env.observation_space.shape[0]
        n_acts = env.action_space.n
        print('observation space :: ',obs_dim,'action space :: ',env.action_space)

        # make core of the policy network
        obs_ph = tf.placeholder(shape=(None,obs_dim),dtype=tf.float32)
        logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts])

        # sample one action per observation from the categorical policy
        actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1),axis=1)

        # make loss function whose gradient for the right data is the policy gradient
        weights_ph = tf.placeholder(shape=(None,),dtype=tf.float32)
        act_ph = tf.placeholder(shape=(None,),dtype=tf.int32)
        action_masks = tf.one_hot(act_ph,n_acts)
        # log-probability of the action that was actually taken
        log_probs = tf.reduce_sum(action_masks*tf.nn.log_softmax(logits),axis=1)

        loss = -tf.reduce_mean(weights_ph*log_probs)

        # optimizer step
        train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

        # The context manager closes the session when training ends or fails,
        # so re-running this function does not leak sessions.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())

            # for training policy
            def train_one_epoch():
                """Collect one batch of episodes, then take one gradient step."""
                # make some empty lists for logging
                batch_obs = [] # for observations
                batch_acts = [] # for actions
                batch_weights = [] # for reward-to-go weighting in policy gradient
                batch_rets = [] # for measuring episode returns
                batch_lens = [] # for measuring episode lengths

                # reset episode specific variables
                obs = env.reset() # first observation comes from starting distribution
                ep_rews = [] # list for rewards accrued throughout the ep

                # render first episode of each epoch
                finished_rendering_this_episode = False
                while True:
                    # rendering
                    if (not finished_rendering_this_episode) and render:
                        env.render()
                    # save obs
                    batch_obs.append(obs.copy())

                    # act in the environment
                    act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0]

                    obs,rew,done,_ = env.step(act)

                    # save action, reward (one entry per environment step)
                    batch_acts.append(act)
                    ep_rews.append(rew)
                    if done:
                        # if episode is over, record info about the episode
                        ep_ret,ep_len = sum(ep_rews),len(ep_rews)
                        batch_rets.append(ep_ret)
                        batch_lens.append(ep_len)

                        # weight for each logprob(a_t|s_t) is the reward-to-go from t
                        batch_weights += list(reward_to_go(ep_rews))

                        # reset episode specific variables
                        obs,ep_rews = env.reset(),[]

                        # wont render again this epoch
                        finished_rendering_this_episode = True

                        # end experience loop if we have enough of it
                        if len(batch_obs)>batch_size:
                            break

                # take a single policy gradient update step
                batch_loss,_ = sess.run(
                    [loss,train_op],
                    feed_dict={
                        obs_ph:np.array(batch_obs),
                        act_ph:np.array(batch_acts),
                        weights_ph:np.array(batch_weights),
                    },
                )
                return batch_loss,batch_rets,batch_lens

            # training loop
            for i in range(epochs):
                batch_loss,batch_rets,batch_lens = train_one_epoch()

                print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
                        (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
    finally:
        # Release the environment (and any render window) even on failure.
        env.close()

In [None]:
# Launch training on CartPole with a six-hidden-layer policy network,
# rendering the first episode of every epoch.
train(
    env_name='CartPole-v0',
    hidden_sizes=[32, 64, 1024, 1024, 64, 32],
    lr=0.001,
    epochs=1000,
    batch_size=5000,
    render=True,
)

  result = entry_point.load(False)


observation space ::  4 action space ::  Discrete(2)
