### **装载云盘**

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **安装tensorflow 2.0**

In [0]:
!pip install tensorflow-gpu==2.0.0-beta1 

In [0]:
# tensorlayer 兼容问题
!pip install imgaug==0.2.6

In [0]:
!pip install tensorlayer

### **cd命令**

In [0]:
import os
os.chdir("drive/My Drive/RL_EA/Actor_Critic")

### **查看当前路径**

In [0]:
!pwd

/content


# Twin Delayed DDPG (TD3)

</br>
$$
off-Policy\\
continous
$$
</br>

**DDPG学习两个approximator来估计$Q^*(s,a)$，$a^*(s)$。适合continous动作空间。训练的时候加noise能增加前期explore的能力。Actor和Critic均有target网**

**TD3 是基于DDPG进行改进的，添加了3个改进点：**


**（1）target policy smoothing：**

为了防止critic学习过程中进入了错误的“最优”点，对action加了点noise。

$$
a'(s')=clip(\mu_{\theta_{targ}} \ \ (s')+clip(\epsilon,-c,c),a_{Low},a_{High}), \ \ \epsilon\sim{\mathcal N}(0,\sigma)
$$

**（2）clipped double-Q learning：**

用两对critic网来计算Q值，取其中最小的一个进行update。

$$
y(r,s',d)=r+\gamma(1-d)\min_{i=1,2}Q_{\phi_{i,targ}} \ \ (s',a'(s'))
$$

**（3）delayed update of target and policy networks：**

为减少方差，policy更新得比Q慢一步，例如更新critic和actor的频率为2：1。
</br>

### Critic更新：

$$
L(\phi_i,D)=\underset{(s,a,r,s',d)' \sim D}{{\mathrm E}}\big[(Q_{\phi_i}(s,a)-y(r,s',d))^2\big]
$$


### Actor更新：

$$
\max_\theta \underset{s \sim D}{\mathrm E}\big[Q_{\phi_1}(s,\mu_\theta(s)) \big]
$$
*这里选择1号网络进行计算Q值。

In [0]:
import os
import time
import random

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
  
import gym
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl
from tensorlayer.layers import Dense
from tensorlayer.models import Model

In [0]:
tfd = tfp.distributions
Normal = tfd.Normal

tl.logging.set_verbosity(tl.logging.DEBUG)

random.seed(2)
np.random.seed(2)
tf.random.set_seed(2)

##### hyper parameters #####

ENV = 'Pendulum-v0'
action_range = 1.
max_frames = 40000
test_frames = 300
max_steps = 150
batch_size = 64
explore_steps = 500
update_itr = 3
hidden_dim = 32
q_lr = 3e-4
policy_lr = 3e-4
policy_target_update_interval = 3
explore_noise_scale = 1.0
eval_noise_scale = 0.5
reward_scale = 1.
replay_buffer_size = 5e5

In [0]:
# buffer
class ReplayBuffer:
  def __init__(self, capacity):
    self.capacity = capacity
    self.buffer = []
    self.position = 0
  
  def push(self, state, action, reward, next_state, done):
    if len(self.buffer) < self.capacity:
      self.buffer.append(None)
    self.buffer[self.position] = (state, action, reward, next_state, done)
    self.position = int((self.position + 1) % self.capacity)

  def sample(self, batch_size):
    batch = random.sample(self.buffer, batch_size)
    state, action, reward, next_state, done = map(np.stack, zip(*batch))
    '''
    * : sum(a,b) <=> batch=(a,b), sum(*batch)
    zip : a=[1,2], b=[2,3], zip(a,b) => [(1,2),(2,3)]
    map : map(square, [2,3]) => [4,9]
    stack : np.stack((1,2)) => array([1,2])
    '''
    return state, action, reward, next_state, done

  def __len__(self):
    return len(self.buffer)

In [0]:
# utils
class NormalizedActions(gym.ActionWrapper):
  # action _action
  def action(self, action):
    low = self.action_space.low
    high = self.action_space.high

    action = low + (action + 1.0) * 0.5 *(high - low) # ?
    action = np.clip(action, low, high)
    return action

  # reverse_action _reverse_action
  def reverse_action(self, action):
    low = self.action_space.low
    high = self.action_space.high

    action = 2 * (action - low) / (high - low) - 1
    action = np.clip(action, low, high)
    return action
  
def plot(frame_idx, rewards):
  clear_output(True)
  plt.figure(figsize=(20,5))
  plt.title('frame %s. reward:%s' % (frame_idx, rewards[-1]))
  plt.plot(rewards)
  plt.xlabel('Episode')
  plt.ylabel('Episode Reward')
  plt.savefig('td3.png')
  # plt.show()

In [0]:
# Network
class QNetwork(Model):

  def __init__(self, num_inputs, num_actions, hidden_dim, init_w=3e-3):
    super(QNetwork, self).__init__()
    input_dim = num_inputs + num_actions
    w_init = tf.random_uniform_initializer(-init_w, init_w)

    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=input_dim, name='q1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='q2')
    self.linear3 = Dense(n_units=1, W_init=w_init, in_channels=hidden_dim, name='q3')

  def forward(self, input):
    x = self.linear1(input)
    x = self.linear2(x)
    x = self.linear3(x)
    return x

class PolicyNetwork(Model):

  def __init__(self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3):
    super(PolicyNetwork, self).__init__()
    w_init = tf.random_uniform_initializer(-init_w, init_w)
    self.linear1 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=num_inputs, name='policy1')
    self.linear2 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy2')
    self.linear3 = Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init, in_channels=hidden_dim, name='policy3')

    self.output_linear = Dense(n_units=num_actions, W_init=w_init, \
    b_init=tf.random_uniform_initializer(-init_w, init_w), in_channels=hidden_dim, name='policy_output')
    self.action_range = action_range
    self.num_actions = num_actions

  def forward(self, state):
    x = self.linear1(state)
    x = self.linear2(x)
    x = self.linear3(x)

    output = tf.nn.tanh(self.output_linear(x))
    return output

  def evaluate(self, state, eval_noise_scale):
    '''
    target policy smooth
    '''
    state = state.astype(np.float32)
    action = self.forward(state)

    action = self.action_range * action

    # noise
    normal = Normal(0, 1)
    eval_noise_clip = 2 * eval_noise_scale
    noise = normal.sample(action.shape) * eval_noise_scale
    noise = tf.clip_by_value(noise, -eval_noise_clip, eval_noise_clip)
    action = action + noise

    return action

  def get_action(self, state, explore_noise_scale):
    action = self.forward([state])
    action = action.numpy()[0]

    # noise
    normal = Normal(0, 1)
    noise = normal.sample(action.shape) * explore_noise_scale
    action = self.action_range * action + noise

    return action.numpy()

  def sample_action(self, ):
    a = tf.random.uniform([self.num_actions], -1, 1)
    return self.action_range * a.numpy()

In [0]:
# TD3
class TD3_Trainer:

  def __init__(
      self, replay_buffer, hidden_dim, action_range, policy_target_update_interval=1, q_lr=3e-4, policy_lr=3e-4
  ):
    self.replay_buffer = replay_buffer
    
    self.q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net1 = QNetwork(state_dim, action_dim, hidden_dim)
    self.target_q_net2 = QNetwork(state_dim, action_dim, hidden_dim)
    self.policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    self.target_policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim, action_range)
    print('Q Network (1,2): ',self.q_net1)
    print('Policy Network: ',self.policy_net)

    self.target_q_net1 = self.target_ini(self.q_net1, self.target_q_net1)
    self.target_q_net2 = self.target_ini(self.q_net2, self.target_q_net2)
    self.target_policy_net = self.target_ini(self.policy_net, self.target_policy_net)

    self.update_cnt = 0
    self.policy_target_update_interval = policy_target_update_interval

    self.q_optimizer1 = tf.optimizers.Adam(q_lr)
    self.q_optimizer2 = tf.optimizers.Adam(q_lr)
    self.policy_optimizer = tf.optimizers.Adam(policy_lr)

  def target_ini(self, net, target_net):
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
      target_param.assign(param)
    return target_net

  def target_soft_update(self, net, target_net, soft_tau):
    for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
      target_param.assign(
          target_param * (1.0 - soft_tau) + param * soft_tau
      )
    return target_net

  def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

    reward = reward[:, np.newaxis]
    done = done[:, np.newaxis]

    new_next_action = self.target_policy_net.evaluate(
        next_state, eval_noise_scale=eval_noise_scale
    ) # clipped normal noise
    # norm reward
    reward = reward_scale * (reward - np.mean(reward, axis=0)) / np.std(reward, axis=0)

    target_q_input = tf.concat([next_state, new_next_action], 1)
    target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input))

    target_q_value = reward + (1 - done) * gamma * target_q_min
    q_input = tf.concat([state, action], 1)

    with tf.GradientTape() as q1_tape:
      predicted_q_value1 = self.q_net1(q_input)
      q_value_loss1 = tf.reduce_mean(tf.square(predicted_q_value1 - target_q_value))
    q1_grad = q1_tape.gradient(q_value_loss1, self.q_net1.trainable_weights)
    self.q_optimizer1.apply_gradients(zip(q1_grad, self.q_net1.trainable_weights))

    with tf.GradientTape() as q2_tape:
      predicted_q_value2 = self.q_net2(q_input)
      q_value_loss2 = tf.reduce_mean(tf.square(predicted_q_value2 - target_q_value))
    q2_grad = q2_tape.gradient(q_value_loss2, self.q_net2.trainable_weights)
    self.q_optimizer2.apply_gradients(zip(q2_grad, self.q_net2.trainable_weights))

    # policy
    if self.update_cnt % self.policy_target_update_interval == 0:
      with tf.GradientTape() as p_tape:
        new_action = self.policy_net.evaluate(
            state, eval_noise_scale=0.0
        ) # 无noise，确定性policy梯度
        new_q_input = tf.concat([state, new_action], 1)
        predicted_new_q_value = self.q_net1(new_q_input)
        policy_loss = -tf.reduce_mean(predicted_new_q_value)
        # 另一版本的实现是 Q值为 两个Q网络中最小的
      p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
      self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))

      # soft update
      self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
      self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
      self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)

  def save_weights(self):
    tl.files.save_npz(self.q_net1.trainable_weights, name='model_q_net1.npz')
    tl.files.save_npz(self.q_net2.trainable_weights, name='model_q_net2.npz')
    tl.files.save_npz(self.target_q_net1.trainable_weights, name='model_target_q_net1.npz')
    tl.files.save_npz(self.target_q_net2.trainable_weights, name='model_target_q_net2.npz')
    tl.files.save_npz(self.policy_net.trainable_weights, name='model_policy_net.npz')
    tl.files.save_npz(self.target_policy_net.trainable_weights, name='model_target_policy_net.npz')

  def load_weights(self):
    tl.files.load_and_assign_npz(name='model_q_net1.npz', network=self.q_net1)
    tl.files.load_and_assign_npz(name='model_q_net2.npz', network=self.q_net2)
    tl.files.load_and_assign_npz(name='model_target_q_net1.npz', network=self.target_q_net1)
    tl.files.load_and_assign_npz(name='model_target_q_net2.npz', network=self.target_q_net2)
    tl.files.load_and_assign_npz(name='model_policy_net.npz', network=self.policy_net)
    tl.files.load_and_assign_npz(name='model_target_policy_net.npz', network=self.target_policy_net)

### **Main （初始化，训练，测试）**

In [0]:
# 初始化
env = NormalizedActions(gym.make(ENV))
action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]
replay_buffer = ReplayBuffer(replay_buffer_size)
td3_trainer = TD3_Trainer(replay_buffer, hidden_dim=hidden_dim, policy_target_update_interval=policy_target_update_interval,\
action_range=action_range, q_lr=q_lr, policy_lr=policy_lr)
td3_trainer.q_net1.train()
td3_trainer.q_net2.train()
td3_trainer.target_q_net1.train()
td3_trainer.target_q_net2.train()
td3_trainer.policy_net.train()
td3_trainer.target_policy_net.train()


[TL] Dense  q1: 32 relu
[TL] Dense  q2: 32 relu
[TL] Dense  q3: 1 No Activation
[TL] Dense  q1: 32 relu
[TL] Dense  q2: 32 relu
[TL] Dense  q3: 1 No Activation
[TL] Dense  q1: 32 relu
[TL] Dense  q2: 32 relu
[TL] Dense  q3: 1 No Activation
[TL] Dense  q1: 32 relu
[TL] Dense  q2: 32 relu
[TL] Dense  q3: 1 No Activation
[TL] Dense  policy1: 32 relu
[TL] Dense  policy2: 32 relu
[TL] Dense  policy3: 32 relu
[TL] Dense  policy_output: 1 No Activation
[TL] Dense  policy1: 32 relu
[TL] Dense  policy2: 32 relu
[TL] Dense  policy3: 32 relu
[TL] Dense  policy_output: 1 No Activation
Q Network (1,2):  qnetwork(
  (q1): Dense(n_units=32, relu, in_channels='4', name='q1')
  (q2): Dense(n_units=32, relu, in_channels='32', name='q2')
  (q3): Dense(n_units=1, No Activation, in_channels='32', name='q3')
)
Policy Network:  policynetwork(
  (policy1): Dense(n_units=32, relu, in_channels='3', name='policy1')
  (policy2): Dense(n_units=32, relu, in_channels='32', name='policy2')
  (policy3): Dense(n_units=

In [0]:
# train
frame_idx = 0
rewards = []
t0 = time.time()
while frame_idx < max_frames:
  state = env.reset()
  state = state.astype(np.float32)
  episode_reward = 0
  if frame_idx < 1:
    print('intialize')
    # extra call 用来使内部func能够使用model.forward
    _ = td3_trainer.policy_net([state])
    _ = td3_trainer.target_policy_net([state])

  for step in range(max_steps):
    if frame_idx > explore_steps:
      action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0)
    else:
      action = td3_trainer.policy_net.sample_action()

    next_state, reward, done, _ = env.step(action)
    next_state = next_state.astype(np.float32)
    # env.render()
    done = 1 if done == True else 0
    
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward
    frame_idx += 1

    if len(replay_buffer) > batch_size:
      for i in range(update_itr):
        td3_trainer.update(batch_size, eval_noise_scale=0.5, reward_scale=1.)
    
    if frame_idx % 500 == 0:
      plot(frame_idx, rewards)
    
    if done:
      break
  episode = int(frame_idx / max_steps)  # 当前episode
  all_episodes = int(max_frames / max_steps)  # 所有episode
  print('Episode:{}/{} | Episode Reward:{:.4f} | Running Time:{:.4f}'\
  .format(episode, all_episodes, episode_reward, time.time() - t0))
  rewards.append(episode_reward)
td3_trainer.save_weights()

In [0]:
# test
frame_idx = 0
rewards = []
t0 = time.time()

td3_trainer.load_weights()

while frame_idx < test_frames:
  state = env.reset()
  state = state.astype(np.float32)
  episode_reward = 0
  if frame_idx < 1:
    print('initialize')
    _ = td3_trainer.policy_net([state])
    _ = td3_trainer.target_policy_net([state])

  for step in range(max_steps):
    action = td3_trainer.policy_net.get_action(state, explore_noise_scale=1.0)
    next_state, reward, done, _ = env.step(action)
    next_state = next_state.astype(np.float32)
    # env.render()
    done = 1 if done == True else 0

    state = next_state
    episode_reward += reward
    frame_idx += 1

    if done:
      break
  episode = int(frame_idx / max_steps)
  all_episodes = int(test_frames / max_steps)
  print('Episode:{}/{} | Episode Reward:{:.4f} | Running Time:{:.4f}'\
  .format(episode, all_episodes, episode_reward, time.time() - t0))
  rewards.append(episode_reward)

[TL] [*] Load model_q_net1.npz SUCCESS!
[TL] [*] Load model_q_net2.npz SUCCESS!
[TL] [*] Load model_target_q_net1.npz SUCCESS!
[TL] [*] Load model_target_q_net2.npz SUCCESS!
[TL] [*] Load model_policy_net.npz SUCCESS!
[TL] [*] Load model_target_policy_net.npz SUCCESS!
initialize
Episode:1/2 | Episode Reward:-342.6536 | Running Time:0.6592
Episode:2/2 | Episode Reward:-647.6249 | Running Time:1.2619
