### **装载云盘**

In [0]:
# Mount Google Drive at /content/drive; Colab prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### **安装tensorflow 2.0**

In [0]:
# Install the GPU build of TensorFlow, pinned to the 2.0.0-beta1 pre-release.
!pip install tensorflow-gpu==2.0.0-beta1 

In [0]:
# Pinned because of a compatibility problem with tensorlayer.
!pip install imgaug==0.2.6

In [0]:
# Install TensorLayer.
# NOTE(review): no version is pinned here, so the installed release depends on
# when the cell is run -- consider pinning once a known-good version is confirmed.
!pip install tensorlayer

### **cd命令**

In [0]:
import os
# Use the absolute project path under the Drive mount point. The previous
# relative path only resolved while the working directory was still /content,
# so running this cell a second time failed.
os.chdir("/content/drive/My Drive/RL_EA/Actor_Critic")

### **查看当前路径**

In [0]:
# Print the working directory to confirm the chdir above took effect.
!pwd

/content/drive/My Drive/RL_EA/Actor_Critic


# Deep Deterministic Policy Gradient

</br>
$$
\text{off-policy}\\
\text{continuous}
$$
</br>

**DDPG学习两个approximator来估计$Q^*(s,a)$，$a^*(s)$。适合continuous动作空间。训练的时候加noise能增加前期explore的能力。Actor和Critic均有target网**

### Q-function相关核心公式：

</br>

$\text{Bellman Optimality Equation}$：
$$
Q^*(s,a) = \underset{s' \sim P}{{\mathrm E}}\left[r(s,a) + \gamma \max_{a'} Q^*(s', a')\right]
$$

</br>

用一个网络来近似最优$Q-function$：

</br>

&emsp;&emsp;一般情况：

$$
L(\phi,D)=\underset{(s,a,r,s',d) \sim D}{{\mathrm E}}\big[(Q_\phi(s,a)-(r+\gamma(1-d)\max_{a'}Q_\phi(s',a')))^2\big]
$$

</br>

&emsp;&emsp;DDPG(由于连续空间中不能很容易地罗列出所有的状态动作值，**因此对max操作进行了调整**)：

$$
L(\phi,D)=\underset{(s,a,r,s',d) \sim D}{{\mathrm E}}\big[(Q_\phi(s,a)-(r+\gamma(1-d)Q_{\phi_{targ}}(s',\mu_{\theta_{targ}}(s'))))^2\big]
$$

</br>

利用$Experience\ Replay\ Buffer$和$Target$网，target网络参数更新方式：

$$
\phi_{\text{targ}} \leftarrow \rho \phi_{\text{targ}} + (1 - \rho) \phi
$$

</br>

### Policy相关核心公式：

</br>

学的是一个确定性的策略$\mu_\theta(s)$，其选择的动作能最大化$Q_\phi(s,a)$，这里假设$Q-function$是针对动作可微的，那么$Policy$的学习通过下面公式来解决(将$Q-function$相关的参数当常量计算)：


$$
\max_\theta \underset{s \sim D}{\mathrm E}\big[Q_\phi(s,\mu_\theta(s)) \big]
$$

In [0]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
  
import gym
import tensorflow as tf
import tensorlayer as tl

In [0]:
##### hyper parameters #####

# environment id and the seed shared by every random source
ENV_NAME, RANDOMSEED = 'Pendulum-v0', 1

# optimisation settings
LR_A, LR_C = 0.001, 0.002  # learning rates: actor, critic
GAMMA = 0.9  # discount factor applied to the target critic's value
TAU = 0.01  # soft target update; the moving-average decay is 1 - TAU

# replay memory
MEMORY_CAPACITY, BATCH_SIZE = 10000, 32  # rows kept, rows sampled per update

# training schedule
MAX_EPISODES = 200  # number of training episodes
MAX_EP_STEPS = 200  # environment steps per episode
TEST_PER_EPISODES = 10  # run one evaluation episode every this many episodes
VAR = 3  # standard deviation of the exploration noise added to actions

In [0]:
##### DDPG #####
class DDPG:
  """Deep Deterministic Policy Gradient agent built from TensorLayer models.

  Holds an actor, a critic, a target copy of each, a fixed-size replay memory
  of (s, a, r, s') rows and one Adam optimizer per trained network.

  Relies on the module-level hyper parameters MEMORY_CAPACITY, BATCH_SIZE,
  GAMMA, TAU, LR_A and LR_C.
  """

  def __init__(self, a_dim, s_dim, a_bound):
    """Build the networks, the replay memory and the optimizers.

    Args:
      a_dim: number of action components.
      s_dim: number of state components.
      a_bound: factor multiplied onto the actor's tanh output.
    """
    # Replay memory: one float32 row per transition, laid out as (s, a, r, s').
    self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
    # Count of transitions stored so far; its modulo is the next write index.
    self.pointer = 0
    self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound

    W_init = tf.random_normal_initializer(mean=0, stddev=0.3)
    b_init = tf.constant_initializer(0.1)

    def get_actor(input_state_shape, name=''):
      """Return a model mapping a state batch to actions scaled by a_bound."""
      inputs = tl.layers.Input(input_state_shape, name='A_input')
      x = tl.layers.Dense(n_units=30, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='A_l1')(inputs)
      x = tl.layers.Dense(n_units=a_dim, act=tf.nn.tanh, W_init=W_init, b_init=b_init, name='A_a')(x)
      # Lambda layer: stretch the tanh output (in [-1, 1]) by a_bound.
      x = tl.layers.Lambda(lambda x: np.array(a_bound) * x)(x)
      return tl.models.Model(inputs=inputs, outputs=x, name='Actor' + name)

    def get_critic(input_state_shape, input_action_shape, name=''):
      """Return a model mapping a (state, action) batch to one Q(s, a) each."""
      s = tl.layers.Input(input_state_shape, name='C_s_input')
      a = tl.layers.Input(input_action_shape, name='C_a_input')
      # State and action are both inputs; they are concatenated along axis 1
      # and the network emits a single value per row.
      x = tl.layers.Concat(1)([s, a])
      x = tl.layers.Dense(n_units=60, act=tf.nn.relu, W_init=W_init, b_init=b_init, name='C_l1')(x)
      x = tl.layers.Dense(n_units=1, W_init=W_init, b_init=b_init, name='C_out')(x)
      return tl.models.Model(inputs=[s, a], outputs=x, name='Critic' + name)

    self.actor = get_actor([None, s_dim])
    self.critic = get_critic([None, s_dim], [None, a_dim])
    self.actor.train()
    self.critic.train()

    def copy_para(from_model, to_model):
      """Hard update used at initialisation: copy every trainable weight.

      from_model: the freshly built (online) model.
      to_model: the target model that receives the weights.
      """
      for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
        j.assign(i)

    self.actor_target = get_actor([None, s_dim], name='_target')
    copy_para(self.actor, self.actor_target)
    self.actor_target.eval()

    self.critic_target = get_critic([None, s_dim], [None, a_dim], name='_target')
    copy_para(self.critic, self.critic_target)
    self.critic_target.eval()

    # Kept for compatibility; no method of this class reads it.
    self.R = tl.layers.Input([None, 1], tf.float32, 'r')
    # Soft replacement of the target weights is done with a moving average.
    self.ema = tf.train.ExponentialMovingAverage(decay=1-TAU)

    self.actor_opt = tf.optimizers.Adam(LR_A)
    self.critic_opt = tf.optimizers.Adam(LR_C)

  def ema_update(self):
    """Soft-update both target networks from the online networks.

    Feeds the online actor and critic weights to the moving average, then
    assigns each averaged value to the matching target weight.
    """
    paras = self.actor.trainable_weights + self.critic.trainable_weights
    self.ema.apply(paras)
    for i, j in zip(self.actor_target.trainable_weights + self.critic_target.trainable_weights, paras):
      i.assign(self.ema.average(j))

  def choose_action(self, s):
    """Return the actor's output for the single state `s`.

    `s` is wrapped into a float32 batch of one; the first (only) row of the
    actor output is returned.
    """
    return self.actor(np.array([s], dtype=np.float32))[0]

  def _sample_batch(self):
    """Draw BATCH_SIZE stored transitions uniformly, with replacement.

    Only rows that have actually been written are eligible, so a call made
    before the memory is full never returns the all-zero rows it was
    initialised with. Once the memory has wrapped around, every row is
    eligible.

    Returns:
      A tuple (bs, ba, br, bs_) of state, action, reward and next-state
      arrays, or None when no transition has been stored yet.
    """
    n_stored = min(self.pointer, MEMORY_CAPACITY)
    if n_stored == 0:
      return None
    indices = np.random.choice(n_stored, size=BATCH_SIZE)
    bt = self.memory[indices, :]
    bs = bt[:, :self.s_dim]
    ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
    # The reward is the single column sitting just before the next state.
    br = bt[:, -self.s_dim - 1:-self.s_dim]
    bs_ = bt[:, -self.s_dim:]
    return bs, ba, br, bs_

  def learn(self):
    """Run one critic update, one actor update and one soft target update.

    Does nothing when the replay memory is still empty.
    """
    batch = self._sample_batch()
    if batch is None:
      return
    bs, ba, br, bs_ = batch

    # Critic: regress Q(s, a) onto r + GAMMA * Q_target(s', actor_target(s')).
    with tf.GradientTape() as tape:
      a_ = self.actor_target(bs_)
      q_ = self.critic_target([bs_, a_])
      y = br + GAMMA * q_
      q = self.critic([bs, ba])
      # NOTE(review): this loss is not reduced to a scalar here; presumably
      # tape.gradient then sums the per-sample terms -- confirm if the loss
      # scale matters.
      td_error = tf.losses.mean_squared_error(y, q)
    c_grads = tape.gradient(td_error, self.critic.trainable_weights)
    self.critic_opt.apply_gradients(zip(c_grads, self.critic.trainable_weights))

    # Actor: maximise the critic's value of the actor's own actions. Only the
    # actor weights are differentiated, so the critic acts as a constant.
    with tf.GradientTape() as tape:
      a = self.actor(bs)
      q = self.critic([bs, a])
      a_loss = -tf.reduce_mean(q)
    a_grad = tape.gradient(a_loss, self.actor.trainable_weights)
    self.actor_opt.apply_gradients(zip(a_grad, self.actor.trainable_weights))

    self.ema_update()

  def store_transition(self, s, a, r, s_):
    """Write one (s, a, r, s') transition, overwriting the oldest when full.

    Args:
      s: state array (converted to float32).
      a: action array.
      r: scalar reward.
      s_: next-state array (converted to float32).
    """
    s = s.astype(np.float32)
    s_ = s_.astype(np.float32)
    # r is a scalar, so it is wrapped in a list before the horizontal stack.
    transition = np.hstack((s, a, [r], s_))
    index = self.pointer % MEMORY_CAPACITY
    self.memory[index, :] = transition
    self.pointer += 1


  def save_ckpt(self):
    """Save the weights of all four networks as HDF5 files under 'model/'."""
    os.makedirs('model', exist_ok=True)
    tl.files.save_weights_to_hdf5('model/ddpg_actor.hdf5', self.actor)
    tl.files.save_weights_to_hdf5('model/ddpg_actor_target.hdf5', self.actor_target)
    tl.files.save_weights_to_hdf5('model/ddpg_critic.hdf5', self.critic)
    tl.files.save_weights_to_hdf5('model/ddpg_critic_target.hdf5', self.critic_target)


  def load_ckpt(self):
    """Load the weights of all four networks from the files under 'model/'."""
    tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor.hdf5', self.actor)
    tl.files.load_hdf5_to_weights_in_order('model/ddpg_actor_target.hdf5', self.actor_target)
    tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic.hdf5', self.critic)
    tl.files.load_hdf5_to_weights_in_order('model/ddpg_critic_target.hdf5', self.critic_target)


### **Main （初始化，训练，测试）**

In [0]:
# Initialisation: build the environment, seed every RNG, create the agent.
env = gym.make(ENV_NAME).unwrapped  # work on the raw environment, not the wrapper

# One shared seed for the environment, NumPy and TensorFlow.
env.seed(RANDOMSEED)
np.random.seed(RANDOMSEED)
tf.random.set_seed(RANDOMSEED)

# Problem sizes are read straight from the environment's spaces.
s_dim, a_dim = env.observation_space.shape[0], env.action_space.shape[0]
a_bound = env.action_space.high

for template, value in (
    ("observation dimension: {}", s_dim),
    ("action high: {}", a_bound),
    ("actions dimension: {}", a_dim),
):
  print(template.format(value))

ddpg = DDPG(a_dim, s_dim, a_bound)

observation dimension: 3
action high: [2.]
actions dimension: 1


In [0]:
# train
reward_buffer = []  # one evaluation-episode reward per test run
t0 = time.time()
for i in range(MAX_EPISODES):
  t1 = time.time()
  s = env.reset()
  ep_reward = 0
  for j in range(MAX_EP_STEPS):
    a = ddpg.choose_action(s)
    # Exploration: sample around the actor output with standard deviation VAR,
    # then clip to the environment's action limits instead of a fixed +/-2.
    a = np.clip(np.random.normal(a, VAR), -a_bound, a_bound)
    s_, r, done, info = env.step(a)

    # The stored reward is divided by 10; ep_reward keeps the raw value.
    ddpg.store_transition(s, a, r / 10, s_)

    # Updates start once more than MEMORY_CAPACITY transitions were stored.
    if ddpg.pointer > MEMORY_CAPACITY:
      ddpg.learn()

    s = s_
    ep_reward += r
    if j == MAX_EP_STEPS - 1:
      # '\r' together with end='' rewrites the same output line each episode.
      print(
          '\rEpisode:{}/{} | Episode Reward:{:.4f} | Running Time:{:.4f}'\
          .format(i, MAX_EPISODES, ep_reward, time.time() - t1), end=''
      )

  # Evaluation during training: every TEST_PER_EPISODES episodes, except i == 0.
  if i and not i % TEST_PER_EPISODES:
    t1 = time.time()
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
      # no exploration noise while evaluating
      a = ddpg.choose_action(s)
      s_, r, done, info = env.step(a)

      s = s_
      ep_reward += r
      if j == MAX_EP_STEPS - 1:
        print(
          '\rEpisode:{}/{} | Episode Reward:{:.4f} | Running Time:{:.4f}'\
          .format(i, MAX_EPISODES, ep_reward, time.time() - t1)
        )
        reward_buffer.append(ep_reward)

  if reward_buffer:
    plt.ion()
    plt.cla()
    plt.title('DDPG')
    # The k-th evaluation (k = 1, 2, ...) runs at episode k * TEST_PER_EPISODES.
    test_episodes = (np.arange(len(reward_buffer)) + 1) * TEST_PER_EPISODES
    plt.plot(test_episodes, reward_buffer)
    # The curve shows the raw reward summed over one evaluation episode.
    plt.xlabel('episode')
    plt.ylabel('test episode reward')
    plt.ylim(-2000, 0)
    plt.show()
    plt.pause(0.1)

plt.ioff()
plt.show()
print('\nRunning time: ', time.time() - t0)
ddpg.save_ckpt()

In [0]:
# test
ddpg.load_ckpt()

# The previous `while True` had no exit (the `break` only left the inner loop),
# so this cell never finished. Run a fixed number of evaluation episodes.
N_TEST_EPISODES = 10
for episode in range(N_TEST_EPISODES):
  s = env.reset()
  ep_reward = 0
  for step in range(MAX_EP_STEPS):
    # env.render()
    s, r, done, info = env.step(ddpg.choose_action(s))
    ep_reward += r
    if done:
      print('finished one complete time')
      break
  print('Test Episode:{}/{} | Episode Reward:{:.4f}'.format(episode + 1, N_TEST_EPISODES, ep_reward))