### **装载云盘**

In [0]:
# Mount Google Drive at /content/drive (relies on the Colab-specific google.colab module).
from google.colab import drive
drive.mount('/content/drive')

### **安装tensorflow 2.0**

In [0]:
# Install the pinned TensorFlow 2.0 beta GPU build used by the rest of this notebook.
!pip install tensorflow-gpu==2.0.0-beta1

In [0]:
# Pin imgaug to work around a tensorlayer compatibility issue.
!pip install imgaug==0.2.6

In [0]:
# NOTE(review): tensorlayer is installed without a version pin, unlike the two installs
# before it -- consider pinning a release known to work with tensorflow 2.0.0-beta1.
!pip install tensorlayer

### **cd命令**

In [0]:
import os

# Use the absolute path under the Drive mount point so this cell can be re-run safely:
# a relative path only resolves while the working directory is still /content, and
# fails with FileNotFoundError once the directory has already been changed.
os.chdir("/content/drive/My Drive/RL_EA/Policy_Gradient")

### **查看当前路径**

In [0]:
# Print the current working directory to confirm the directory change took effect.
!pwd

/content/drive/My Drive/RL_EA/Policy_Gradient


# Proximal Policy Optimization


$$
on-Policy\\
discrete\ \&\ continuous
$$ 

</br>

**PPO对TRPO进行了一定的简化，而且效果和训练速度都比TRPO好。TRPO用到了二阶近似，而PPO只需要一阶梯度。**
</br>
* **TRPO:**

$$
 \max_{\theta}\mathbb{E}_{s\sim \rho_{\theta_{old}}\ ,\ a\sim \pi_{\theta_{old}}}\left[ \frac{\pi_{\theta}(a \mid s)}{\pi_{\theta_{old}}\ (a\mid s)}A_{\theta_{old}}\ (s,a)\right]\
$$
</br>
$$
subject \ to \quad \mathbb{E}_{s\sim \rho_{\theta_{old}}}\left[ D_{KL}(\pi_{\theta_{old}}\ (\cdot \mid s)\parallel \pi_{\theta}(\cdot \mid s))\right] \le \delta 
$$
</br>

* **PPO:**

1. **PPO1（baseline基本都用的PPO2）：**

> 关注点在KL惩罚项上，也可以理解成自动调整KL项的系数：

$$
 \max_{\theta}\mathbb{E}_{s\sim \rho_{\theta_{old}}\ ,\ a\sim \pi_{\theta_{old}}}\left[ \frac{\pi_{\theta}(a \mid s)}{\pi_{\theta_{old}}\ (a\mid s)}A_{\theta_{old}}\ (s,a)-\beta D_{KL}(\pi_{\theta_{old}}\ (\cdot \mid s)\parallel \pi_{\theta}(\cdot \mid s)) \right]\
$$

$$
\text{设} d=\mathbb{E}_{t}\big[D_{KL}(\pi_{\theta_{old}}\ (\cdot \mid s)\parallel \pi_{\theta}(\cdot \mid s)) \big]
$$

$$
\text{If}\ \ d < d_{targ}/1.5\ ,\ \beta \leftarrow\ \beta/2
$$
$$
\text{If}\ \ d > d_{targ}\times1.5\ ,\ \beta \leftarrow\ \beta\times2
$$
</br>
2. **PPO2：**

$$
ratio = \frac{\pi_{\theta}(a \mid s)}{\pi_{\theta_{old}}\ (a\mid s)}
$$

$$
 \max_{\theta}\mathbb{E}_{s\sim \rho_{\theta_{old}}\ ,\ a\sim \pi_{\theta_{old}}} \big[\min \big(ratioA_{\theta_{old}}\ (s,a)\ ,clip(ratio, 1-\epsilon, 1+\epsilon)A_{\theta_{old}}\ (s,a)\big)\big]\
$$
</br>
value（critic）网络的更新方式与一般做法相同：最小化折扣累计回报与预测的状态价值之间的均方误差（MSE）。

In [0]:
import os
import time


import matplotlib.pyplot as plt
import numpy as np


import gym
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl

In [0]:
##### hyper parameters #####
# --- environment / run control ---
ENV_NAME = 'Pendulum-v0'  # environment id; the original note lists CartPole-v0 and Pendulum-v0
RANDOMSEED = 1            # seed passed to env, numpy and tensorflow
EP_MAX = 2000             # number of training episodes
EP_LEN = 200              # steps per episode

# --- optimisation ---
GAMMA = 0.9               # reward discount factor
A_LR = 0.0002             # actor learning rate (original note: 0.0001)
C_LR = 0.0003             # critic learning rate (original note: 0.0002)
BATCH = 32                # an update is triggered every BATCH environment steps
A_UPDATE_STEPS = 10       # actor gradient steps per update
C_UPDATE_STEPS = 10       # critic gradient steps per update

# --- problem dimensions, declared up front rather than read from the env ---
S_DIM = 3                 # state dimension
A_DIM = 1                 # action dimension
EPS = 1e-8                # epsilon added to a denominator to avoid division by zero

# Surrogate objective: index 0 selects the KL-penalty variant, index 1 the clipped variant.
METHOD = (
  {'name': 'kl_pen', 'kl_target': 0.01, 'lam': 0.5},  # KL penalty
  {'name': 'clip', 'epsilon': 0.15},  # clipping range; the original source considers 0.2 a good value
)[1]


In [0]:
# PPO
class PPO:
  '''
  Proximal Policy Optimization agent with a Gaussian policy.

  Holds three TensorLayer models: a critic (state -> value), the actor being
  trained, and an "old" actor that is kept in eval mode and only ever written
  to by update_old_pi(); it serves as the reference policy in the probability
  ratio. The surrogate objective is selected by the module-level METHOD dict
  ('kl_pen' or anything else, which is treated as the clipped objective).
  '''

  def __init__(self):
    '''Build the critic, both actor networks and one Adam optimizer per trained network.'''
    # critic: S_DIM -> 100 (relu) -> 1
    tfs = tl.layers.Input([None, S_DIM], tf.float32, 'state')
    l1 = tl.layers.Dense(100, tf.nn.relu)(tfs)
    v = tl.layers.Dense(1)(l1)
    self.critic = tl.models.Model(tfs, v)
    self.critic.train()

    # actor: the trained policy and the reference ("old") policy share one architecture
    self.actor = self._build_anet('pi1', trainable=True)
    self.actor_old = self._build_anet('oldpi1', trainable=False)
    self.actor_opt = tf.optimizers.Adam(A_LR)
    self.critic_opt = tf.optimizers.Adam(C_LR)

  def _build_anet(self, name, trainable):
    '''
    Build a policy network that maps a state to the (mu, sigma) of a Normal distribution.

    :param name: prefix used for the model and layer names
    :param trainable: True puts the model in train mode, False in eval mode
    :return: a tl.models.Model with outputs [mu, sigma]
    '''
    tfs = tl.layers.Input([None, S_DIM], tf.float32, name + '_state')
    l1 = tl.layers.Dense(100, tf.nn.relu, name=name + '_l1')(tfs)
    a = tl.layers.Dense(A_DIM, tf.nn.tanh, name=name + '_a')(l1)
    # A continuous action space is described by a mean and a standard deviation.
    # tanh output is in [-1, 1]; scaling by 2 puts mu in [-2, 2], the same range
    # choose_action() clips to.
    mu = tl.layers.Lambda(lambda x: x * 2, name=name + '_lambda')(a)
    # softplus (a smooth relu) keeps sigma positive
    sigma = tl.layers.Dense(A_DIM, tf.nn.softplus, name=name + '_sigma')(l1)
    model = tl.models.Model(tfs, [mu, sigma], name)

    if trainable:
      model.train()
    else:
      model.eval()
    return model

  def a_train(self, tfs, tfa, tfadv):
    '''
    Run one gradient step on the actor.

    :param tfs: states (observations)
    :param tfa: actions taken in those states
    :param tfadv: advantage estimates for those state/action pairs
    :return: mean KL(old || new) when METHOD['name'] == 'kl_pen', otherwise None
    '''
    tfs = np.array(tfs, np.float32)
    tfa = np.array(tfa, np.float32)
    tfadv = np.array(tfadv, np.float32)

    # The old policy is a fixed reference and no gradient is taken with respect to
    # it, so evaluate it outside the tape instead of recording its forward pass.
    mu_old, sigma_old = self.actor_old(tfs)
    oldpi = tfp.distributions.Normal(mu_old, sigma_old)
    old_prob = oldpi.prob(tfa)

    kl_mean = None
    with tf.GradientTape() as tape:
      mu, sigma = self.actor(tfs)
      pi = tfp.distributions.Normal(mu, sigma)

      # probability ratio pi(a|s) / pi_old(a|s); EPS guards against a zero denominator
      ratio = pi.prob(tfa) / (old_prob + EPS)
      surr = ratio * tfadv
      if METHOD['name'] == 'kl_pen':
        # surrogate minus a KL penalty weighted by the adaptive coefficient 'lam'
        tflam = METHOD['lam']
        kl = tfp.distributions.kl_divergence(oldpi, pi)
        kl_mean = tf.reduce_mean(kl)
        aloss = -(tf.reduce_mean(surr - tflam * kl))
      else:
        # clipped surrogate: elementwise minimum of the raw and the clipped term
        clipped = tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon'])
        aloss = -tf.reduce_mean(tf.minimum(surr, clipped * tfadv))
    a_grad = tape.gradient(aloss, self.actor.trainable_weights)

    self.actor_opt.apply_gradients(zip(a_grad, self.actor.trainable_weights))

    return kl_mean

  def update_old_pi(self):
    '''Copy the current actor weights into the old actor, pairing weights by position.'''
    for p, oldp in zip(self.actor.trainable_weights, self.actor_old.trainable_weights):
      oldp.assign(p)

  def c_train(self, tfdc_r, s):
    '''
    Run one gradient step on the critic, minimising the mean squared error
    between its prediction and the target.

    :param tfdc_r: discounted cumulative reward used as the regression target
    :param s: states
    '''
    tfdc_r = np.array(tfdc_r, np.float32)
    with tf.GradientTape() as tape:
      v = self.critic(s)
      adv = tfdc_r - v
      closs = tf.reduce_mean(tf.square(adv))
    c_grad = tape.gradient(closs, self.critic.trainable_weights)
    self.critic_opt.apply_gradients(zip(c_grad, self.critic.trainable_weights))

  def cal_adv(self, tfs, tfdc_r):
    '''
    Compute advantages as discounted cumulative reward minus the critic's value.

    :param tfs: states
    :param tfdc_r: discounted cumulative reward
    :return: advantages as a numpy array
    '''
    tfdc_r = np.array(tfdc_r, np.float32)
    adv = tfdc_r - self.critic(tfs)
    return adv.numpy()

  def update(self, s, a, r):
    '''
    Update actor and critic from one batch of experience.

    The old policy is first synchronised with the current actor, advantages are
    computed once, then the actor is trained for up to A_UPDATE_STEPS steps and
    the critic for C_UPDATE_STEPS steps.

    :param s: states (numpy array)
    :param a: actions (numpy array)
    :param r: discounted cumulative rewards (numpy array)
    '''
    s, a, r = s.astype(np.float32), a.astype(np.float32), r.astype(np.float32)

    self.update_old_pi()
    adv = self.cal_adv(s, r)
    # adv = (adv-adv.mean())/(adv.std()+1e-6)

    # actor
    if METHOD['name'] == 'kl_pen':
      for _ in range(A_UPDATE_STEPS):
        kl = self.a_train(s, a, adv)
        if kl > 4 * METHOD['kl_target']:  # early stop, attributed by the author to the Google paper
          break
      # adaptive penalty coefficient, attributed by the author to the OpenAI paper
      if kl < METHOD['kl_target'] / 1.5:
        METHOD['lam'] /= 2
      elif kl > METHOD['kl_target'] * 1.5:
        METHOD['lam'] *= 2
      # keep the coefficient in a bounded range
      METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)
    else:
      for _ in range(A_UPDATE_STEPS):
        self.a_train(s, a, adv)

    # critic
    for _ in range(C_UPDATE_STEPS):
      self.c_train(r, s)

  def choose_action(self, s):
    '''
    Sample one action for a single state from the current policy.

    :param s: a single state (1-D numpy array)
    :return: the sampled action clipped to [-2, 2]
    '''
    s = s[np.newaxis, :].astype(np.float32)
    mu, sigma = self.actor(s)
    pi = tfp.distributions.Normal(mu, sigma)
    # sample(1) adds a leading sample axis; drop it, then take the single batch row
    a = tf.squeeze(pi.sample(1), axis=0)[0]
    return np.clip(a, -2, 2)

  def get_v(self, s):
    '''
    Return the critic's value estimate for a state as a plain Python float.

    A 1-D state is promoted to a batch of one. For a batch, the value of the
    first row is returned. Converting to float keeps framework tensors out of
    the caller's discounted-return arithmetic.

    :param s: a single state or a batch of states (numpy array)
    :return: float value estimate
    '''
    s = s.astype(np.float32)
    if s.ndim < 2:
      s = s[np.newaxis, :]
    return float(self.critic(s)[0, 0])

  def save_ckpt(self):
    '''Save the weights of all three networks as HDF5 files under ./model.'''
    os.makedirs('model', exist_ok=True)
    tl.files.save_weights_to_hdf5('model/ppo_actor.hdf5', self.actor)
    tl.files.save_weights_to_hdf5('model/ppo_actor_old.hdf5', self.actor_old)
    tl.files.save_weights_to_hdf5('model/ppo_critic.hdf5', self.critic)

  def load_ckpt(self):
    '''Load the weights of all three networks from the HDF5 files under ./model.'''
    tl.files.load_hdf5_to_weights_in_order('model/ppo_actor.hdf5', self.actor)
    tl.files.load_hdf5_to_weights_in_order('model/ppo_actor_old.hdf5', self.actor_old)
    tl.files.load_hdf5_to_weights_in_order('model/ppo_critic.hdf5', self.critic)

### **Main （初始化，训练，测试）**

In [0]:
# Build the environment and take its `.unwrapped` form; episode length is bounded by
# EP_LEN in the loops that step this env.
env = gym.make(ENV_NAME).unwrapped

# Seed the environment, numpy and tensorflow with the same value before the agent is
# constructed -- presumably so that network initialisation and sampling are reproducible.
env.seed(RANDOMSEED)
np.random.seed(RANDOMSEED)
tf.random.set_seed(RANDOMSEED)

ppo = PPO()

[TL] Input  state: [None, 3]
[TL] Dense  dense_1: 100 relu
[TL] Dense  dense_2: 1 No Activation
[TL] Input  pi1_state: [None, 3]
[TL] Dense  pi1_l1: 100 relu
[TL] Dense  pi1_a: 1 tanh
[TL] Lambda  pi1_lambda: func: <function PPO._build_anet.<locals>.<lambda> at 0x7f55cefbc620>, len_weights: 0
[TL] Dense  pi1_sigma: 1 softplus
[TL] Input  oldpi1_state: [None, 3]
[TL] Dense  oldpi1_l1: 100 relu
[TL] Dense  oldpi1_a: 1 tanh
[TL] Lambda  oldpi1_lambda: func: <function PPO._build_anet.<locals>.<lambda> at 0x7f55cefbc9d8>, len_weights: 0
[TL] Dense  oldpi1_sigma: 1 softplus


In [0]:
# train
all_ep_r = []  # exponentially smoothed episode reward, one entry per episode
for ep in range(EP_MAX):
  s = env.reset()
  buffer_s = []
  buffer_a = []
  buffer_r = []
  ep_r = 0
  t0 = time.time()
  for t in range(EP_LEN):  # one episode
    a = ppo.choose_action(s)
    s_, r, done, _ = env.step(a)
    buffer_s.append(s)
    buffer_a.append(a)
    buffer_r.append((r + 8) / 8)  # reward rescaling; the original source reports that it helps
    s = s_
    ep_r += r

    # update only when a batch is full or the episode has reached its last step
    if (t + 1) % BATCH != 0 and t != EP_LEN - 1:
      continue

    # Bootstrap from the critic's value of the newest state, then accumulate the
    # discounted return backwards through the buffered rewards.
    v_s_ = ppo.get_v(s_)
    discounted_r = []
    for r in reversed(buffer_r):
      v_s_ = r + GAMMA * v_s_
      discounted_r.append(v_s_)
    discounted_r = discounted_r[::-1]  # back to chronological order

    bs = np.vstack(buffer_s)
    ba = np.vstack(buffer_a)
    br = np.expand_dims(np.array(discounted_r), axis=1)
    buffer_s, buffer_a, buffer_r = [], [], []
    ppo.update(bs, ba, br)

  # moving average: the first episode seeds it, later ones blend in with weight 0.1
  all_ep_r.append(ep_r if ep == 0 else all_ep_r[-1] * 0.9 + ep_r * 0.1)
  print(
    'Episode:{}/{} | Episode Reward:{:.4f} | Running Time:{:.4f}'.format(
      ep, EP_MAX, ep_r, time.time() - t0
    )
  )

  # redraw the learning curve after every episode
  plt.ion()
  plt.cla()
  plt.title('PPO')
  plt.plot(np.arange(len(all_ep_r)), all_ep_r)
  plt.ylim(-2000, 0)
  plt.xlabel('Episode')
  plt.ylabel('Moving averaged episode reward')
  plt.show()
  plt.pause(0.1)

  # periodic checkpoint (this also fires on episode 0)
  if not ep % 500:
    ppo.save_ckpt()

# final checkpoint and a last, blocking display of the curve
ppo.save_ckpt()
plt.ioff()
plt.show()

In [0]:
# test
ppo.load_ckpt()

# Evaluate for a fixed number of episodes. An unbounded loop never lets this cell
# finish, and with rendering disabled it would not produce any output either.
TEST_EPISODES = 10
for test_ep in range(TEST_EPISODES):
  s = env.reset()
  test_ep_r = 0
  for i in range(EP_LEN):
    # env.render()  # enable where a display is available
    s, r, done, _ = env.step(ppo.choose_action(s))
    test_ep_r += r
    if done:
      break
  print('Test Episode:{}/{} | Episode Reward:{:.4f}'.format(test_ep, TEST_EPISODES, test_ep_r))