In [1]:
import tensorflow as tf
import numpy as np
import gym

In [2]:
def compute_discounted_rewards(rewards, gamma):
    """Return the standardised discounted returns of one episode.

    For every step ``t`` the discounted return is
    ``G_t = r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...``.
    The returns are then centred on their mean and, when they are not all
    equal, divided by their standard deviation.

    Args:
        rewards: 1-D sequence of per-step rewards, in chronological order.
        gamma: discount factor applied to future rewards.

    Returns:
        A float64 ``numpy.ndarray`` with one entry per reward. An empty input
        gives an empty array. When every return is identical (for example a
        single-step episode) the result is all zeros instead of NaN.
    """
    # `np.float` was removed from NumPy; use the explicit 64-bit float type.
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted_rewards = np.zeros_like(rewards)

    if discounted_rewards.size == 0:
        # Nothing to discount; also avoids mean/std of an empty array (NaN).
        return discounted_rewards

    # Walk the episode backwards so each return reuses the one after it.
    cum = 0.
    for idx in range(len(rewards) - 1, -1, -1):
        cum = gamma * cum + rewards[idx]
        discounted_rewards[idx] = cum

    discounted_rewards -= np.mean(discounted_rewards)

    # Only rescale when there is real spread: dividing by a zero standard
    # deviation would turn every entry into NaN.
    std = np.std(discounted_rewards)
    if std > np.finfo(discounted_rewards.dtype).eps:
        discounted_rewards /= std

    return discounted_rewards

In [3]:
class PolicyGradientGraph():
    """TensorFlow 1.x graph for a small softmax policy trained by policy gradient.

    The network is three dense layers (10 ReLU units, ``nb_actions`` ReLU
    units, ``nb_actions`` linear logits). All ops are created inside
    ``tf.variable_scope(scope_name)`` in the current default graph.

    Attributes exposed to callers:
        input: float32 placeholder of shape ``(None, state_size)``.
        action: float32 placeholder of shape ``(None, nb_actions)``; expected
            to hold one-hot rows selecting the action that was taken.
        discounted_reward: float32 placeholder of shape ``(None,)``.
        output_distribution: softmax over the logits (action probabilities).
        log_likelihood / neg_log_likelihood: per-sample (negative)
            log-probability of the selected action.
        loss: mean of ``neg_log_likelihood * discounted_reward``.
        optimizer / train_op: Adam optimizer and its minimisation op.
    """

    def __init__(self, state_size, nb_actions, learning_rate, scope_name):
        """Build the policy network, the loss and the training op.

        Args:
            state_size: number of features in one observation.
            nb_actions: number of discrete actions (size of the output layer).
            learning_rate: learning rate passed to ``tf.train.AdamOptimizer``.
            scope_name: name of the variable scope that wraps every op.
        """
        self.state_size = state_size
        self.nb_actions = nb_actions
        self.learning_rate = learning_rate
        self.scope_name = scope_name

        with tf.variable_scope(self.scope_name):
            self.input = tf.placeholder(tf.float32, (None, self.state_size), name="input")
            self.action = tf.placeholder(tf.float32, (None, self.nb_actions), name="action")

            self.discounted_reward = tf.placeholder(tf.float32, (None,), name="discounted_reward")

            self.fc1 = tf.layers.dense(self.input,
                                       10,
                                       activation=tf.nn.relu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc1")

            self.fc2 = tf.layers.dense(self.fc1,
                                       self.nb_actions,
                                       activation=tf.nn.relu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc2")

            # Final layer is linear: its outputs are the logits of the policy.
            self.fc3 = tf.layers.dense(self.fc2,
                                       self.nb_actions,
                                       activation=None,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                       name="fc3")

            self.output_distribution = tf.nn.softmax(self.fc3)

            # Log-probabilities are taken straight from the logits.
            # log(softmax(x)) breaks down once a probability underflows to 0:
            # log(0) is -inf, and -inf * 0 in the one-hot mask below is NaN,
            # which then propagates into the loss and the gradients.
            self.log_probabilities = tf.nn.log_softmax(self.fc3)

            # The one-hot action row keeps only the log-probability of the
            # action that was actually taken.
            self.log_likelihood = tf.reduce_sum(tf.multiply(self.log_probabilities,
                                                            self.action),
                                                axis=1)
            self.neg_log_likelihood = tf.math.negative(self.log_likelihood)

            # Weight each step's negative log-likelihood by its return.
            self.loss = tf.reduce_mean(self.neg_log_likelihood * self.discounted_reward)

            # Optimization operation
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
            self.train_op = self.optimizer.minimize(self.loss)

In [4]:
class PolicyGradientTrainer:
    """Plays episodes in a gym-style environment and trains a policy graph on them.

    One training update is performed per finished episode, using the
    standardised discounted returns of that episode as per-step weights.
    """

    def __init__(self, graph, env, possible_actions, gamma, ckpt_file):
        """Store the collaborators and create the checkpoint saver.

        Args:
            graph: object exposing the ``input``, ``action``,
                ``discounted_reward``, ``output_distribution``, ``loss`` and
                ``train_op`` tensors/ops (e.g. ``PolicyGradientGraph``).
            env: environment with ``reset()``, ``step(action)`` and
                ``render()``; ``step`` must return a 4-tuple
                ``(state, reward, done, info)``.
            possible_actions: array whose row ``i`` is the encoding fed to the
                graph for action index ``i`` (a one-hot identity matrix in
                this notebook).
            gamma: discount factor for future rewards.
            ckpt_file: default path used when saving/restoring checkpoints.
        """
        self.graph = graph
        self.env = env
        self.possible_actions = possible_actions
        self.gamma = gamma
        self.ckpt_file = ckpt_file
        # Created without arguments, so the graph's variables must already
        # exist in the default graph when the trainer is constructed.
        self.saver = tf.train.Saver()

    def choose_action(self, input_state, session):
        """Sample an action index from the policy for a single state.

        Args:
            input_state: one observation (without batch dimension).
            session: session used to evaluate ``graph.output_distribution``.

        Returns:
            An integer index into ``possible_actions``.
        """
        # The graph expects a batch, so add a leading batch axis of size 1.
        batched_state = np.expand_dims(input_state, axis=0)
        output_distribution = session.run(self.graph.output_distribution,
                                          feed_dict={self.graph.input: batched_state})

        action = np.random.choice(np.arange(len(self.possible_actions)),
                                  p=np.ravel(output_distribution))

        return action

    def run_episode(self, max_step, render, session):
        """Play one episode without training and print its total reward.

        Args:
            max_step: maximum number of environment steps to play.
            render: when true, call ``env.render()`` after every step.
            session: session used to evaluate the policy.
        """
        state = self.env.reset()
        total_reward = 0

        for _ in range(max_step):
            action = self.choose_action(state, session)

            # Apply action to env and get next state, reward, and done bool
            state, reward, done, _ = self.env.step(action)

            if render:
                self.env.render()

            total_reward += reward

            if done:
                break

        print("Reward on episode: %f" % total_reward)

    def train_on_episode(self, states, actions, discounted_rewards, session):
        """Run one optimisation step on the data of a whole episode.

        Args:
            states: sequence of observations, one per step.
            actions: encoded actions (rows of ``possible_actions``), one per step.
            discounted_rewards: per-step weights for the log-likelihood.
            session: session in which ``train_op`` is executed.

        Returns:
            The loss value evaluated in the same ``session.run`` call as the
            training op.
        """
        loss, _ = session.run([self.graph.loss, self.graph.train_op],
                              feed_dict={self.graph.input: states,
                                         self.graph.action: actions,
                                         self.graph.discounted_reward: discounted_rewards})

        return loss

    def play_and_learn(self, n_episodes, print_eval, save_after, session):
        """Play ``n_episodes`` episodes, training once after each of them.

        Args:
            n_episodes: number of episodes to play.
            print_eval: print the episode reward every ``print_eval`` episodes.
            save_after: save a checkpoint every ``save_after`` episodes.
            session: session used for acting, training and checkpointing.
        """
        for ep in range(1, n_episodes + 1):
            state = self.env.reset()
            states, actions, rewards = [], [], []
            done = False

            while not done:
                action = self.choose_action(state, session)

                # Apply action to env and get next state, reward, and done bool
                next_state, reward, done, _ = self.env.step(action)

                # Record the state the action was chosen in, not the successor.
                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            discounted_rewards = compute_discounted_rewards(rewards, self.gamma)
            # Map the sampled indices to the encoding the graph expects.
            encoded_actions = np.asarray(self.possible_actions)[actions]
            self.train_on_episode(states, encoded_actions, discounted_rewards, session)

            if ep % print_eval == 0:
                print("Reward on last episode: %f" % np.sum(rewards))

            if ep % save_after == 0:
                # Save with the session that was passed in, not a notebook global.
                self.saver.save(session, self.ckpt_file)
                print("Saved model after %i episodes." % ep)

    def restore(self, session, ckpt_file=None):
        """Load saved variables into ``session``.

        Args:
            session: session whose variables are overwritten.
            ckpt_file: optional checkpoint path; defaults to the path given at
                construction time.
        """
        path = self.ckpt_file if ckpt_file is None else ckpt_file
        self.saver.restore(session, path)

In [8]:
# Close the environment left over from an earlier run of this notebook, if any.
# `env` is only created in the next cell, so on a fresh kernel there is nothing
# to close and an unguarded call would raise NameError.
if "env" in globals():
    env.close()

In [9]:
# Create the CartPole-v0 environment used by the cells below.
env = gym.make('CartPole-v0')
# Assign 2500 to the private `_max_episode_steps` attribute.
# NOTE(review): presumably this raises the per-episode step cap enforced by
# gym's time-limit wrapper; it is not public API, so confirm it still takes
# effect on the installed gym version.
env._max_episode_steps = 2500

In [10]:
# --- Problem dimensions -----------------------------------------------------
# Width of the network's input placeholder (features per observation).
state_size = 4

# Number of discrete actions, and one encoding row per action index.
nb_actions = env.action_space.n
possible_actions = np.identity(nb_actions, dtype=int)

# --- Optimisation hyper-parameters -------------------------------------------
learning_rate, gamma = 0.01, 0.95

# --- Training schedule and checkpointing --------------------------------------
n_episodes_learning = 600

# Report the episode reward and write a checkpoint at the same cadence.
print_eval = save_after = 200
ckpt_file = "./models/model.ckpt"

In [11]:
# Start from an empty default graph before building the network in this cell.
tf.reset_default_graph()

sess = tf.Session()

# Build the policy network, then the trainer that drives it.
pg_graph = PolicyGradientGraph(
    state_size=state_size,
    nb_actions=nb_actions,
    learning_rate=learning_rate,
    scope_name="pg_graph",
)
pg_trainer = PolicyGradientTrainer(
    graph=pg_graph,
    env=env,
    possible_actions=possible_actions,
    gamma=gamma,
    ckpt_file=ckpt_file,
)

# Initialise variables only after the whole graph (optimizer included) exists.
sess.run(tf.global_variables_initializer())

pg_trainer.play_and_learn(
    n_episodes=n_episodes_learning,
    print_eval=print_eval,
    save_after=save_after,
    session=sess,
)

Reward on last episode: 215.000000
Saved model after 200 episodes.
Reward on last episode: 425.000000
Saved model after 400 episodes.
Reward on last episode: 2500.000000
Saved model after 600 episodes.


In [13]:
# Test: reload the saved weights and play one rendered episode.

# restore() takes the session only; the checkpoint path is the one the trainer
# was constructed with, so passing `ckpt_file` as a second positional argument
# would raise TypeError.
pg_trainer.restore(sess)
pg_trainer.run_episode(max_step=2500, render=True,
                       session=sess)

INFO:tensorflow:Restoring parameters from ./models/model.ckpt
Reward on episode: 2500.000000
