### **装载云盘**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


### **安装tensorflow 2.0**

In [0]:
!pip install tensorflow-gpu==2.0.0-beta1 

In [0]:
# tensorlayer 兼容问题
!pip install imgaug==0.2.6

In [0]:
!pip install tensorlayer

### **cd命令**

In [0]:
import os
os.chdir("drive/My Drive/RL_EA/Actor_Critic")

### **查看当前路径**

In [6]:
!pwd

/content/drive/My Drive/RL_EA/Actor_Critic


# Actor Critic


$$
??-Policy\\
discrete\ \&\ continous
$$

优点：比PG收敛快。

缺点：Policy不容易收敛（DDPG用DQN的优点解决了这个问题）。

actor选择动作->critic评估动作->critic学习->actor学习

### 核心公式：

$$td-error:\delta=r_t(s_t,a_t)+\lambda v_w(s_{t+1})-v_w(s_t)$$

$$w_{t+1}=w_t+\alpha^w\delta_t\nabla_wv_w(s_t)$$

$$\theta_{t+1}=\theta_t+\alpha^{\theta}\nabla_{\theta}log\pi_{\theta}(a_t|s_t)\delta_t$$




In [0]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
  
import gym
import tensorflow as tf
import tensorlayer as tl

In [0]:
tl.logging.set_verbosity(tl.logging.DEBUG)

np.random.seed(3)
tf.random.set_seed(3)

##### hyper parameters #####

OUTPUT_GRAPH = False
MAX_EPISODE  = 3000
DISPLAY_REWARD_THRESHOLD = 100
MAX_EP_STEPS = 1000
RENDER = False
LAMBDA = 0.99 # TD error中的reward折扣因子
LR_A = 0.003 # actor lr
LR_C = 0.01 # critic lr

In [0]:
##### Actor_Critic #####
# Actor #
class Actor:

  def __init__(self, n_features, n_actions, lr=0.001):

    def get_model(inputs_shape):
      ni = tl.layers.Input(inputs_shape, name='state')
      nn = tl.layers.Dense(
          n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden1'
      )(ni)
      nn = tl.layers.Dense(
          n_units=10, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2'
      )(nn)
      nn = tl.layers.Dense(n_units=n_actions, name='actions')(nn)
      return tl.models.Model(inputs=ni, outputs=nn, name='Actor_1')

    self.model = get_model([None, n_features])
    self.model.train()
    self.optimizer = tf.optimizers.Adam(lr)

  def learn(self, s, a, td):
    with tf.GradientTape() as tape:
      # 对s及后面的s_加上[]是为了添加一维，来适配model网络的shape，否则就要用expand_dims了。
      _logits = self.model(np.array([s]))
      # actor loss 也就是最大化类似PG的reward，用TD error代替奖励。
      _exp_v = tl.rein.cross_entropy_reward_loss(logits=_logits, actions=[a], rewards=td[0])  #???
    grad = tape.gradient(_exp_v, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
    return _exp_v

  def choose_action(self, s):
    """
    从分布sample动作
    """
    _logits = self.model(np.array([s]))
    _probs = tf.nn.softmax(_logits).numpy()
    return tl.rein.choice_action_by_probs(_probs.ravel())

  def choose_action_greedy(self, s):
    """
    贪婪sample动作
    """
    _logits = self.model(np.array([s]))
    _probs = tf.nn.softmax(_logits).numpy()
    return np.argmax(_probs.ravel())

  def save_ckpt(self):
    if not os.path.exists('model'):
      os.makedirs('model')
    tl.files.save_npz(self.model.trainable_weights, name='model/model_actor.npz')

  def load_ckpt(self):
    tl.files.load_and_assign_npz(name='model/model_actor.npz', network=self.model)


In [0]:
# Critic #
class Critic:
  
  def __init__(self, n_features, lr=0.01):

    def get_model(inputs_shape):
      ni = tl.layers.Input(inputs_shape, name='state')
      nn = tl.layers.Dense(
          n_units=30, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden1'
      )(ni)
      nn = tl.layers.Dense(
          n_units=5, act=tf.nn.relu6, W_init=tf.random_uniform_initializer(0, 0.01), name='hidden2'
      )(nn)
      nn = tl.layers.Dense(n_units=1, name='value')(nn)
      return tl.models.Model(inputs=ni, outputs=nn, name='Critic_1')
    
    self.model = get_model([None, n_features])
    self.model.train()
    self.optimizer = tf.optimizers.Adam(lr)

  def learn(self, s, r, s_):
    v_ = self.model(np.array([s_]))
    with tf.GradientTape() as tape:
      v = self.model(np.array([s]))
      # TD error
      td_error = r + LAMBDA * v_ - v
      loss = tf.square(td_error)
    grad = tape.gradient(loss, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grad, self.model.trainable_weights))
    return td_error

  def save_ckpt(self):
    if not os.path.exists('model'):
      os.makedirs('model')
    tl.files.save_npz(self.model.trainable_weights, name='model/model_critic.npz')

  def load_ckpt(self):
    tl.files.load_and_assign_npz(name='model/model_critic.npz', network=self.model)


### **Main （初始化，训练，测试）**

In [23]:
# 初始化
env = gym.make('CartPole-v0')
env.seed(2)
N_F = env.observation_space.shape[0]
N_A = env.action_space.n

print("observation dimension: %d" % N_F)
print("observation high: %s" % env.observation_space.high)
print("observation low: %s" % env.observation_space.low)
print("num of actions: %d" % N_A)

actor = Actor(n_features=N_F, n_actions=N_A, lr=LR_A)
# 原文：we need a good teacher ,so the teacher should learn faster than the actor
critic = Critic(n_features=N_F, lr=LR_C)

I0830 06:10:55.931411 140300197140352 tl_logging.py:99] Input  state: [None, 4]
I0830 06:10:56.000565 140300197140352 tl_logging.py:99] Dense  hidden1: 30 relu6


observation dimension: 4
observation high: [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
observation low: [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
num of actions: 2


I0830 06:10:56.075567 140300197140352 tl_logging.py:99] Dense  hidden2: 10 relu6
I0830 06:10:56.235423 140300197140352 tl_logging.py:99] Dense  actions: 2 No Activation
I0830 06:10:56.315556 140300197140352 tl_logging.py:99] Input  state: [None, 4]
I0830 06:10:56.386860 140300197140352 tl_logging.py:99] Dense  hidden1: 30 relu6
I0830 06:10:56.463069 140300197140352 tl_logging.py:99] Dense  hidden2: 5 relu6
I0830 06:10:56.536109 140300197140352 tl_logging.py:99] Dense  value: 1 No Activation


In [26]:
# train
# 改变了两次seed，一次不收敛，一次200左右就收敛（在400左右突然又不收敛了？？r从150掉到负数）。
# AC方法不太稳定？
t0 = time.time()
for i_episode in range(MAX_EPISODE+1):
  s = env.reset().astype(np.float32)
  t = 0 # 本次episode中的step数
  all_r = []  # 所有steps的rewards
  while True:
    if RENDER:
      env.render()
    a = actor.choose_action(s)
    s_new, r, done, info = env.step(a)
    s_new = s_new.astype(np.float32)

    if done:
      r = -20 # 杆子倒了给个大的惩罚
    
    all_r.append(r)

    # 学习value func：gradient = grad[r + lambda * V(s_new) - V(s)]
    td_error = critic.learn(
        s, r, s_new
    )
    try:
      # 学习Policy：true_gradient = grad[logPi(s, a) * td_error]
      actor.learn(s, a, td_error)
    except KeyboardInterrupt:
      actor.save_ckpt()
      critic.save_ckpt()

    s = s_new
    t += 1

    # 第二个条件是为了防止一个episode花费时间太长，这个环境训练好的话杆子一直不会掉。
    if done or t >= MAX_EP_STEPS:
      ep_rs_sum = sum(all_r)

      if 'running_reward' not in globals():
        running_reward = ep_rs_sum
      else:
        running_reward = 0.95 * running_reward + 0.05 * ep_rs_sum
      
      print('Episode: {}/{} | Episode Reward:{:.4f} | Rinning Time:{:.4f}'\
            .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0))
      
      # 差不多训练好了的模型，直接就render看结果
      if t >= MAX_EP_STEPS:
        print('Early Stopping')
        s = env.reset().astype(np.float32)
        rall = 0
        while True:
          # env.render()
          a = actor.choose_action_greedy(s)
          s_new, r, done, info = env.step(a)
          # np.concatenate:根据axis堆叠数组。
          # 但在这里其实没什么意义，直接用s_new也行？
          s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32)
          rall += r
          s = s_new
          if done:
            print('reward', rall)
            s = env.reset().astype(np.float32)
            rall = 0
      break
actor.save_ckpt()
critic.save_ckpt()


Episode: 0/3000 | Episode Reward:-12.0000 | Rinning Time:0.2022
Episode: 1/3000 | Episode Reward:-11.0000 | Rinning Time:0.4240
Episode: 2/3000 | Episode Reward:-13.0000 | Rinning Time:0.6018
Episode: 3/3000 | Episode Reward:-12.0000 | Rinning Time:0.8104
Episode: 4/3000 | Episode Reward:-12.0000 | Rinning Time:1.0095
Episode: 5/3000 | Episode Reward:-12.0000 | Rinning Time:1.2173
Episode: 6/3000 | Episode Reward:-11.0000 | Rinning Time:1.4398
Episode: 7/3000 | Episode Reward:-11.0000 | Rinning Time:1.6564
Episode: 8/3000 | Episode Reward:-13.0000 | Rinning Time:1.8451
Episode: 9/3000 | Episode Reward:-12.0000 | Rinning Time:2.0633
Episode: 10/3000 | Episode Reward:-11.0000 | Rinning Time:2.2774
Episode: 11/3000 | Episode Reward:-11.0000 | Rinning Time:2.4986
Episode: 12/3000 | Episode Reward:-11.0000 | Rinning Time:2.7188
Episode: 13/3000 | Episode Reward:-12.0000 | Rinning Time:2.9157
Episode: 14/3000 | Episode Reward:-11.0000 | Rinning Time:3.1537
Episode: 15/3000 | Episode Reward:-

I0830 06:38:39.288285 140300197140352 tl_logging.py:99] [*] Saving TL weights into model/model_actor.npz
I0830 06:38:39.307972 140300197140352 tl_logging.py:99] [*] Saved
I0830 06:38:39.309779 140300197140352 tl_logging.py:99] [*] Saving TL weights into model/model_critic.npz
I0830 06:38:39.321506 140300197140352 tl_logging.py:99] [*] Saved


Episode: 366/3000 | Episode Reward:-11.0000 | Rinning Time:76.8047
Episode: 367/3000 | Episode Reward:-11.0000 | Rinning Time:77.0641
Episode: 368/3000 | Episode Reward:-11.0000 | Rinning Time:77.2876
Episode: 369/3000 | Episode Reward:-12.0000 | Rinning Time:77.4775
Episode: 370/3000 | Episode Reward:-12.0000 | Rinning Time:77.6808
Episode: 371/3000 | Episode Reward:-11.0000 | Rinning Time:77.9081
Episode: 372/3000 | Episode Reward:-12.0000 | Rinning Time:78.0979
Episode: 373/3000 | Episode Reward:-11.0000 | Rinning Time:78.3155
Episode: 374/3000 | Episode Reward:-11.0000 | Rinning Time:78.5414
Episode: 375/3000 | Episode Reward:-12.0000 | Rinning Time:78.7391
Episode: 376/3000 | Episode Reward:-11.0000 | Rinning Time:78.9683
Episode: 377/3000 | Episode Reward:-11.0000 | Rinning Time:79.1876
Episode: 378/3000 | Episode Reward:-11.0000 | Rinning Time:79.4098
Episode: 379/3000 | Episode Reward:-12.0000 | Rinning Time:79.6051
Episode: 380/3000 | Episode Reward:-11.0000 | Rinning Time:79.

KeyboardInterrupt: ignored

In [25]:
# test
actor.load_ckpt()
critic.load_ckpt()
t0 = time.time()

for i_episode in range(MAX_EPISODE+1):
  episode_time = time.time()
  s = env.reset().astype(np.float32)
  t = 0 # 本次episode中的step数
  all_r = []  # 所有steps的rewards
  while True:
    if RENDER:
      env.render()
    a = actor.choose_action(s)
    s_new, r, done, info = env.step(a)
    s_new = s_new.astype(np.float32)

    if done:
      r = -20 # 杆子倒了给个大的惩罚
    
    all_r.append(r)
    s = s_new
    t += 1

    # 第二个条件是为了防止一个episode花费时间太长，这个环境训练好的话杆子一直不会掉。
    if done or t >= MAX_EP_STEPS:
      ep_rs_sum = sum(all_r)

      if 'running_reward' not in globals():
        running_reward = ep_rs_sum
      else:
        running_reward = 0.95 * running_reward + 0.05 * ep_rs_sum
      
      print('Episode: {}/{} | Episode Reward:{:.4f} | Rinning Time:{:.4f}'\
            .format(i_episode, MAX_EPISODE, ep_rs_sum, time.time()-t0))
      
      # 差不多训练好了的模型，直接就render看结果
      if t >= MAX_EP_STEPS:
        print('Early Stopping')
        s = env.reset().astype(np.float32)
        rall = 0
        while True:
          # env.render()
          a = actor.choose_action_greedy(s)
          s_new, r, done, info = env.step(a)
          # np.concatenate:根据axis堆叠数组。
          s_new = np.concatenate((s_new[0:N_F], s[N_F:]), axis=0).astype(np.float32)
          rall += r
          s = s_new
          if done:
            print('reward', rall)
            s = env.reset().astype(np.float32)
            rall = 0
      break

I0830 06:33:43.289759 140300197140352 tl_logging.py:99] [*] Load model/model_actor.npz SUCCESS!
I0830 06:33:43.296303 140300197140352 tl_logging.py:99] [*] Load model/model_critic.npz SUCCESS!


Episode: 0/3000 | Episode Reward:-10.0000 | Rinning Time:0.0215
Episode: 1/3000 | Episode Reward:-9.0000 | Rinning Time:0.0380
Episode: 2/3000 | Episode Reward:-5.0000 | Rinning Time:0.0625
Episode: 3/3000 | Episode Reward:-7.0000 | Rinning Time:0.0820
Episode: 4/3000 | Episode Reward:-6.0000 | Rinning Time:0.1017
Episode: 5/3000 | Episode Reward:-10.0000 | Rinning Time:0.1230
Episode: 6/3000 | Episode Reward:-9.0000 | Rinning Time:0.1393
Episode: 7/3000 | Episode Reward:-6.0000 | Rinning Time:0.1587
Episode: 8/3000 | Episode Reward:-12.0000 | Rinning Time:0.1699
Episode: 9/3000 | Episode Reward:-13.0000 | Rinning Time:0.1801
Episode: 10/3000 | Episode Reward:-6.0000 | Rinning Time:0.1995
Episode: 11/3000 | Episode Reward:-9.0000 | Rinning Time:0.2143
Episode: 12/3000 | Episode Reward:-10.0000 | Rinning Time:0.2345
Episode: 13/3000 | Episode Reward:-7.0000 | Rinning Time:0.2526
Episode: 14/3000 | Episode Reward:-5.0000 | Rinning Time:0.2733
Episode: 15/3000 | Episode Reward:7.0000 | Ri