## PG for Policy Optimization. Using OpenAi Gym and Tensorflow.

## Lab2 -> Deep Reinforcement Learning - John Schulman MLSS
http://rl-gym-doc.s3-website-us-west-2.amazonaws.com/mlss/lab2.html

1 -> Import modules

In [1]:
from __future__ import print_function
import tensorflow as tf
import numpy as np
import gym
from gym.spaces import Discrete, Box

2 -> Choose action accroding to probability distribution (Discrete)

In [2]:
def categorical_sample(p):
    cump = np.cumsum(p)
    return (cump > np.random.rand()).argmax()
def greedy_sample(p):
    return p.argmax()

3 -> Do an episode, get observations, actions and rewards

In [3]:
def get_episode(agent, env, episode_max_length, render=False):
    obss = []
    acts = []
    rews = []
    obs = env.reset()
    for i in range(episode_max_length):
        obss.append(obs.reshape(-1))
        act = agent.act(obs)
        acts.append(act)
        obs, rew, done, _ = env.step(act)
        rews.append(rew)
        if render: env.render()
        if done: break
    return {
        "obs": np.array(obss),
        "act": np.array(acts),
        "rew": np.array(rews)
    }

4 -> Compute discounted returns

In [4]:
def discounted_ret(rew, gamma):
    for i in range(len(rew)-1):
        rew[-(i+2)] += gamma*rew[-(i+1)]
    return rew

5 -> Define agent (Discrete & Continuous)
- init -> define policy network, linear model
    * Discrete: output probability for each action
    * Continuous: output action value
- act -> compute probability and choose (Discrete) or compute and clip (Continuous)
- learn -> main loop, to update parameters from env

In [5]:
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)
def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [6]:
class REINFORCEAgentDiscrete(object):
    def __init__(self, obs_space, act_space, **usercfg):
        obs_dim = obs_space.shape[0]
        self.act_dim = act_space.n
        self.cfg = dict(episode_max_length=100, timesteps_per_batch=10000,
                        n_iter=100, gamma=0.9, lr=0.05, hid_dim=20)
        self.cfg.update(usercfg)
        self.xs = tf.placeholder(tf.float32, [None, obs_dim])
        W1 = weight_variable([obs_dim, self.cfg['hid_dim']])
        b1 = bias_variable([self.cfg['hid_dim']])
        hs = tf.tanh(tf.matmul(self.xs, W1) + b1)
        W2 = weight_variable([self.cfg['hid_dim'], self.act_dim])
        b2 = bias_variable([self.act_dim])
        self.ps = tf.nn.softmax(tf.matmul(hs, W2) + b2)
        self.advs = tf.placeholder(tf.float32, [None])
        self.acts_onehot = tf.placeholder(tf.float32, [None, self.act_dim])
        loss = tf.reduce_mean(tf.log(tf.reduce_sum(self.ps*self.acts_onehot, 1))*self.advs)
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.cfg['lr'], 
                                              epsilon=1e-9).minimize(loss)
        
    def act(self, obs):
        p = self.sess.run(self.ps, feed_dict={self.xs:obs.reshape(1,-1)})
        # act = categorical_sample(p[0])
        act = greedy_sample(p[0])
        return act
        
    def learn(self, env):
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        for iteration in range(self.cfg["n_iter"]):
            timestep = 0
            episodes = []
            while (timestep < self.cfg["timesteps_per_batch"]):
                episode = get_episode(self, env, self.cfg["episode_max_length"])
                episodes.append(episode)
                timestep = timestep + len(episode["rew"])
            obss = np.concatenate([episode["obs"] for episode in episodes])
            acts = np.concatenate([episode["act"] for episode in episodes])
            acts_onehot = np.zeros([len(acts), self.act_dim])
            acts_onehot[np.arange(len(acts)), acts] = 1
            dis_rets = [discounted_ret(episode["rew"], self.cfg["gamma"]) for episode in episodes]
            timesteps_max_length = np.max([len(dis_ret) for dis_ret in dis_rets])
            dis_rets_padded = [np.append(dis_ret, np.zeros(timesteps_max_length - len(dis_ret))) for dis_ret in dis_rets]
            bs = np.mean(dis_rets_padded, axis=0)
            advs = np.concatenate([dis_ret - bs[:len(dis_ret)] for dis_ret in dis_rets])
            self.sess.run(self.optimizer, feed_dict={self.xs:obss, self.acts_onehot:acts_onehot, self.advs:advs})
            eprews = [np.sum(episode["rew"]) for episode in episodes]
            eplens = [len(episode) for episode in episodes]
            print("Iteration %i. \n NumEpisodes %i. NumTimeSteps %i. \n MaxRew %s. MeanRew %s+/-%s. MeanLen %s+/-%s." 
                  % (iteration+1, len(episodes), timestep+1, 
                     np.max(eprews), np.mean(eprews), np.std(eprews),
                     np.mean(eplens), np.std(eplens)))
            # get_episode(self, env, self.cfg["episode_max_length"], render=True)
        self.sess.close()

In [7]:
class REINFORCEAgentContinuous(object):
    def __init__(self, obs_space, act_space, **usercfg):
        obs_dim = obs_space.shape[0]
        act_dim = act_space.shape[0]
        self.act_space = act_space
        self.cfg = dict(episode_max_length=100, timesteps_per_batch=10000,
                        n_iter=100, gamma=0.9, lr=0.05, hid_dim=20)
        self.cfg.update(usercfg)
        self.xs = tf.placeholder(tf.float32, [None, obs_dim])
        self.W1 = weight_variable([obs_dim, self.cfg['hid_dim']])
        self.b1 = bias_variable([self.cfg['hid_dim']])
        hs = tf.tanh(tf.matmul(self.xs, self.W1) + self.b1)
        self.W2 = weight_variable([self.cfg['hid_dim'], act_dim])
        self.b2 = bias_variable([act_dim])
        self.acts = tf.matmul(hs, self.W2) + self.b2
        self.advs = tf.placeholder(tf.float32, [None])
        loss = tf.reduce_mean(tf.log(self.acts)*self.advs)
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.cfg['lr'], 
                                              epsilon=1e-9).minimize(loss)
        
    def act(self, obs):
        act = self.sess.run(self.acts, feed_dict={self.xs:obs.reshape(1,-1)})
        act = np.clip(act[0], self.act_space.low, self.act_space.high)
        return act
        
    def learn(self, env):
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        for iteration in range(self.cfg["n_iter"]):
            timestep = 0
            episodes = []
            while (timestep < self.cfg["timesteps_per_batch"]):
                episode = get_episode(self, env, self.cfg["episode_max_length"])
                episodes.append(episode)
                timestep = timestep + len(episode["rew"])
            obss = np.concatenate([episode["obs"] for episode in episodes])
            acts = np.concatenate([episode["act"] for episode in episodes])
            dis_rets = [discounted_ret(episode["rew"], self.cfg["gamma"]) for episode in episodes]
            timesteps_max_length = np.max([len(dis_ret) for dis_ret in dis_rets])
            dis_rets_padded = [np.append(dis_ret, np.zeros(timesteps_max_length - len(dis_ret))) for dis_ret in dis_rets]
            bs = np.mean(dis_rets_padded, axis=0)
            advs = np.concatenate([dis_ret - bs[:len(dis_ret)] for dis_ret in dis_rets])
            self.sess.run(self.optimizer, feed_dict={self.xs:obss, self.acts:acts, self.advs:advs})
            eprews = [np.sum(episode["rew"]) for episode in episodes]
            eplens = [len(episode) for episode in episodes]
            theta = self.sess.run(self.W1).reshape(-1)
            theta = np.append(theta, self.sess.run(self.b1).reshape(-1))
            theta = np.append(theta, self.sess.run(self.W2).reshape(-1))
            theta = np.append(theta, self.sess.run(self.b2).reshape(-1))
            print("Iteration %i. \n NumEpisodes %i. NumTimeSteps %i. \n MaxRew %s. MeanRew %s+/-%s. MeanLen %s+/-%s." 
                  % (iteration+1, len(episodes), timestep+1, 
                     np.max(eprews), np.mean(eprews), np.std(eprews),
                     np.mean(eplens), np.std(eplens)))
            print(" ThetaMean %s. ThetaStd %s" % (np.mean(theta), np.std(theta)))
            # get_episode(self, env, self.cfg["episode_max_length"], render=True)
        self.sess.close()

6 -> Main run

In [8]:
# env = gym.make("Acrobot-v1") # Discrete
env = gym.make("Pendulum-v0") # Continuous
if isinstance(env.action_space, Discrete):
    agent = REINFORCEAgentDiscrete(env.observation_space, env.action_space, 
                                   episode_max_length=env.spec.timestep_limit)
elif isinstance(env.action_space, Box):
    agent = REINFORCEAgentContinuous(env.observation_space, env.action_space, 
                                     episode_max_length=env.spec.timestep_limit)
else:
    raise NotImplementedError
agent.learn(env)

[2017-04-17 10:15:26,276] Making new env: Pendulum-v0


Iteration 1. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7515.51499279. MeanRew -12389.8192059+/-2014.81305478. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195458. ThetaStd 0.0789361
Iteration 2. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7614.51113772. MeanRew -13282.3333011+/-2823.73948515. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195458. ThetaStd 0.0789361
Iteration 3. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7531.97731184. MeanRew -12043.4531024+/-2498.28530066. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195459. ThetaStd 0.0789361
Iteration 4. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7628.78817855. MeanRew -12448.9913581+/-2778.90335197. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195458. ThetaStd 0.0789361
Iteration 5. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7470.60254188. MeanRew -12711.9485615+/-3015.09361482. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195458. ThetaStd 0.0789361
Iteration 6. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7492.35605448. MeanRew -12785.7719894+/-3130.12856909. Mean

Iteration 48. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7358.74763456. MeanRew -12438.0803054+/-2728.65886544. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195453. ThetaStd 0.0789365
Iteration 49. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7585.33391206. MeanRew -12781.0288429+/-2490.58298229. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195453. ThetaStd 0.0789365
Iteration 50. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7540.44044986. MeanRew -13043.6316356+/-2676.51875894. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195453. ThetaStd 0.0789365
Iteration 51. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7429.98645111. MeanRew -11739.144507+/-2735.2317265. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195452. ThetaStd 0.0789365
Iteration 52. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7524.66079614. MeanRew -13048.47924+/-2659.36685862. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195468. ThetaStd 0.0789353
Iteration 53. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7753.24042689. MeanRew -12770.2507449+/-2676.55193009. Me

Iteration 95. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -6445.15187868. MeanRew -12482.4543457+/-2585.97346251. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195502. ThetaStd 0.0789303
Iteration 96. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7713.69727801. MeanRew -12545.9142682+/-2462.81875944. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195508. ThetaStd 0.0789303
Iteration 97. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -9366.84729579. MeanRew -13439.0412477+/-2736.17905238. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195513. ThetaStd 0.0789307
Iteration 98. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7604.14719249. MeanRew -12810.555562+/-3169.02946115. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195514. ThetaStd 0.0789307
Iteration 99. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7587.10365705. MeanRew -12378.5853103+/-2270.21489578. MeanLen 3.0+/-0.0.
 ThetaMean 0.0195483. ThetaStd 0.0789332
Iteration 100. 
 NumEpisodes 50. NumTimeSteps 10001. 
 MaxRew -7588.50901464. MeanRew -12956.2984041+/-2543.60910491