In [None]:
import gym
import os
import time
import numpy as np
import tensorflow as tf
from utils.ValueHistory import ValueHistory

In [None]:
class REINFORCE_Agent:
    """Monte Carlo policy-gradient (REINFORCE) agent with a learned value baseline.

    The agent collects one episode of (state, action, reward) tuples through
    `observe`, then `train` performs a single policy-gradient step weighted by
    the advantage (discounted reward-to-go minus the value baseline) and fits
    the value estimator on samples drawn from the value history.

    Args:
        sess: TensorFlow session used for every graph execution of this agent.
        input_shape: shape of a single observation (without the batch axis).
        action_num: number of discrete actions.
        gamma: discount factor applied to future rewards.
        batch_size: number of samples drawn per value-estimator update.
        lr: learning rate of the policy (actor) Adam optimizer.
    """

    def __init__(self, 
                 sess, 
                 input_shape, 
                 action_num,
                 gamma = 0.99,
                 batch_size=32,
                 lr=1e-5):
        self.sess = sess
        self.input_shape = list(input_shape)
        self.action_num = action_num
        self.gamma = gamma
        self.batch_size = batch_size
        
        self.value_estimator = ValueEstimator(sess, input_shape)
        # presumably 5e6 is the maximum number of stored samples -- confirm against utils.ValueHistory
        self.value_history = ValueHistory(5e6)
        
        # bookkeeping (s,a,r) tuple in an episode
        self._reset()
        self.ep_states = tf.placeholder(tf.float32, [None] + self.input_shape)
        self.ep_actions = tf.placeholder(tf.int32, [None]) # actions taken in an episode
        self.ep_advantage = tf.placeholder(tf.float32, [None]) # advantage for each (s,a) pair in an episode
        
        self.action_logits = self._build_network("Actor", self.ep_states)
        self.action_prob = tf.nn.softmax(self.action_logits)
        
        with tf.variable_scope("Optimizer"):
            optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            # cross entropy against the taken action equals -log pi(a|s)
            neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.action_logits, labels=self.ep_actions)
            self.loss = tf.reduce_mean(neg_log_prob * self.ep_advantage)
            self.train_step = optimizer.minimize(self.loss) # minimizing negative grad log probability
                                                            # equivalent to maximizing positive grad log probability
        
    def _build_network(self, scope, X):
        """Build the policy MLP under `scope` and return unnormalized action logits."""
        with tf.variable_scope(scope):
            fc1 = tf.layers.dense(X, 128, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 256, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            fc4 = tf.layers.dense(fc3, 512, activation=tf.nn.relu)
            action_logits = tf.layers.dense(fc4, self.action_num)
        return action_logits
    
    def _reset(self):
        """Clear the per-episode buffers."""
        self.reward_list = []
        self.state_list = []
        self.action_list = []
        
    def act(self, state, training=True):
        """Pick an action for a single observation.

        Samples from the policy distribution when `training` is true,
        otherwise returns the most probable action.
        """
        state = np.reshape(state, [1] + self.input_shape)
        # use the session handed to the constructor, not a notebook-level global
        probs = self.sess.run(self.action_prob, feed_dict={self.ep_states:state})[0]
        if training:
            action = np.random.choice(self.action_num, p=probs)
        else:
            action = np.argmax(probs)
        return action
    
    def observe(self, state, action, reward):
        """Record one transition of the current episode."""
        self.reward_list.append(reward)
        self.state_list.append(state)
        self.action_list.append(action)
    
    def train(self):
        """Run one policy update on the recorded episode, fit the baseline, and reset.

        Does nothing except clear the buffers when no transition was recorded.
        """
        if not self.reward_list:
            # nothing observed: feeding empty batches would only produce NaN losses
            self._reset()
            return

        discounted_reward = self._caculate_reward()
        # flatten so a single-step episode still yields one baseline per state
        baseline = np.reshape(self.value_estimator.predict(self.state_list), [-1])
        advantages = discounted_reward - baseline
        
        self.sess.run(self.train_step, feed_dict={self.ep_states:self.state_list,
                                                  self.ep_actions:self.action_list,
                                                  self.ep_advantage:advantages})
        
        # store training data in history and fit value function
        for s, r in zip(self.state_list, discounted_reward):
            self.value_history.append(s, r)
        for _ in range(5):
            batch_states, batch_V = self.value_history.sample(self.batch_size)
            self.value_estimator.train(batch_states, batch_V)
            
        self._reset()
        
    def _caculate_reward(self):
        """Return the discounted reward-to-go for every step of the episode.

        Entry t is sum_{k >= 0} gamma**k * reward[t + k]. The result is a float
        array so integer rewards are not truncated.
        """
        rewards = np.asarray(self.reward_list, dtype=np.float64)
        d_reward = np.zeros_like(rewards)
        running = 0.0
        # backward recursion G_t = r_t + gamma * G_{t+1}: linear time and
        # only rewards from step t onward contribute (causality)
        for t in reversed(range(len(rewards))):
            running = rewards[t] + self.gamma * running
            d_reward[t] = running
        return d_reward

In [None]:
class ValueEstimator:
    """Monte Carlo state-value estimator (MLP regressor on observed returns).

    Args:
        sess: TensorFlow session used for prediction and training.
        input_shape: shape of a single observation (without the batch axis).
        lr: learning rate of the Adam optimizer.
    """

    def __init__(self, sess, input_shape, lr=1e-4):
        self.sess = sess
        
        self.batch_states = tf.placeholder(tf.float32, [None] + list(input_shape))
        self.V_target = tf.placeholder(tf.float32, [None])
        
        with tf.variable_scope("ValueEstimator"):
            fc1 = tf.layers.dense(self.batch_states, 128, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 256, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            fc4 = tf.layers.dense(fc3, 512, activation=tf.nn.relu)
            # squeeze only the unit output axis so a batch of one stays shape [1]
            self.V_estimate = tf.squeeze(tf.layers.dense(fc4, 1), axis=-1)
            
            mse_loss = tf.losses.mean_squared_error(self.V_target, self.V_estimate)
            # small L2 penalty restricted to this estimator's own variables
            l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'ValueEstimator' in v.name]) * 3e-6
            self.loss = mse_loss + l2_loss
            optimizer = tf.train.AdamOptimizer(learning_rate=lr)
            self.train_step = optimizer.minimize(self.loss)
            
    def predict(self, states):
        """Return the estimated value of each state in the batch `states`."""
        # use the session handed to the constructor, not a notebook-level global
        return self.sess.run(self.V_estimate, feed_dict={self.batch_states:states})
    
    def train(self, states, targets):
        """Run one optimizer step towards `targets` and return the loss of that step."""
        loss, _ = self.sess.run([self.loss, self.train_step], feed_dict={self.batch_states:states, self.V_target:targets})
        return loss

In [None]:
# Training cell: runs REINFORCE on CartPole-v1 until the kernel is interrupted,
# logging per-episode reward/length to TensorBoard and checkpointing every 200 episodes.
tf.reset_default_graph()

with tf.device("/gpu:0"):
    
    # soft placement lets ops fall back to another device when /gpu:0 cannot run them
    config = tf.ConfigProto(allow_soft_placement = True)
    with tf.Session(config=config) as sess:
        
        env = gym.make("CartPole-v1")
        state = env.reset()
        action_num = env.action_space.n
        input_shape = env.observation_space.shape
        agent = REINFORCE_Agent(sess, input_shape, action_num)
        
        # scalar graph nodes whose values are supplied through feed_dict when summaries are written
        episodic_reward = tf.get_variable("episodic_reward", (), trainable=False)
        episodic_step = tf.get_variable("episodic_step", (), trainable=False)
        tf.summary.scalar("episode_reward",episodic_reward)
        tf.summary.scalar("episode_step",episodic_step)
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter("logs/reinforce", sess.graph)
        saver = tf.train.Saver(max_to_keep=20)
        # make sure the checkpoint directory exists before the first save
        os.makedirs("reinforce", exist_ok=True)
        
        sess.run(tf.global_variables_initializer())
        
        e, episode_reward, global_step, episode_step = 0, 0, 0, 0
        
        try:
            # the loop has no exit condition; stop it by interrupting the kernel
            while True:
                
                action = agent.act(state)
                next_state, reward, done, info = env.step(action)
                # normalize reward
                reward /= 100
                agent.observe(state, action, reward)
                episode_reward += reward
                state = next_state
                episode_step += 1
                global_step += 1
                
                if done:
                    agent.train()
                    state = env.reset()
                    summary = sess.run(merged, feed_dict={episodic_reward:episode_reward, episodic_step:episode_step})
                    writer.add_summary(summary, global_step=e)
                    if e % 10 == 0:
                        writer.flush()
                    episode_reward = 0
                    episode_step = 0
                    e += 1
                
                    if e % 200 == 0:
                        saver.save(sess, "reinforce/model", global_step=e)
        finally:
            # runs on interrupt or error too: flush pending summaries and release the env
            writer.close()
            env.close()

In [None]:
# Evaluation cell: restores the latest checkpoint from ./reinforce and renders
# 10 greedy (argmax) episodes on CartPole-v1.
tf.reset_default_graph()

with tf.device("/gpu:0"):
    
    config = tf.ConfigProto(allow_soft_placement = True)
    with tf.Session(config=config) as sess:
        
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./reinforce/checkpoint'))
        if ckpt is None or not ckpt.model_checkpoint_path:
            # fail with a clear message instead of an AttributeError on None
            raise FileNotFoundError("No checkpoint found in ./reinforce; run the training cell first.")
        env = gym.make("CartPole-v1")
        try:
            action_num = env.action_space.n
            input_shape = env.observation_space.shape
            # the agent must be built before the Saver so its variables exist in the graph
            agent = REINFORCE_Agent(sess, input_shape, action_num, lr=0.001)
            saver = tf.train.Saver()
            saver.restore(sess, ckpt.model_checkpoint_path)
            
            for i in range(10):
                l = []
                state = env.reset()
                done = False
                while not done:
                    env.render()
                    time.sleep(0.01)  # slow the rendering down so it can be watched
                    action = agent.act(state, False)
                    state, reward, done, info = env.step(action)
                    l.append(reward)
                # NOTE(review): this prints the mean reward per step, not the episode
                # return; sum(l) may be the intended metric -- confirm
                print(sum(l)/len(l))
        finally:
            # always release the render window, even if an episode fails
            env.close()