### **装载云盘**

In [0]:
from google.colab import drive
drive.mount('/content/drive')

### **安装tensorflow 2.0**

In [0]:
!pip install tensorflow-gpu==2.0.0-beta1 

In [0]:
# tensorlayer 兼容问题
!pip install imgaug==0.2.6

In [0]:
!pip install tensorlayer

### **cd命令**

In [0]:
import os
os.chdir("drive/My Drive/RL_EA/Deep_Q_Network")

### **查看当前路径**

In [0]:
!pwd

/content/drive/My Drive/RL_EA/Deep_Q_Network


# Deep Q Network

$$
off-Policy\\
discrete
$$ 

两个特点：

1）从 $Experience Replay Memory$ 中均匀采样。为了消除样本相关性。

2）延迟复制权值的 $Fixed Q-target$ 网络。提高稳定性，收敛性，消除估计$Q$和目标$Q$的相关性。

回顾$Q-Learning$的更新过程:

$$
Q(s,a)=Q(s,a)+\alpha\big[r+\gamma\max_{a'}Q(s',a')-Q(s,a)\big]
$$

可以看到$Q-Learning$就是让$Q$值接近目标$Q$值，那么$DQN$的$loss$：

$$
targetQ\:or\:y'=r + \gamma \max_{a'} Q(s',a';\theta_i^-)
$$
$$
L_i(\theta_i) = \mathbb{E}_{(s,a,r,s') \sim U(D)} \big[ \big( r + \gamma \max_{a'} Q(s',a';\theta_i^-) - Q(s, a; \theta_i) \big)^2 \big]
$$

In [0]:
import time
import os
import random

import numpy as np

import gym
import tensorflow as tf
import tensorlayer as tl
# 这个代码没用到 TargetQ 和 ReplayBuffer，只是用NN近似Q值

In [0]:
### hyper ###
gamma = 0.99 # 衰减因子
e = 0.1 # greedy系数
EPISODES = 10000
RENDER = False
BATCH = 32
RANDOMSEED = 1
DELAY_COUNT = 100 # 延迟复制target间隔
WARM_START = 100
running_reward = 0

# tabular 环境
ENV_NAME = 'FrozenLake-v0' 

In [0]:
### DQN ###
class DQN:
  def __init__(self, gamma, n_features, a_features, lr=0.02):
    self.gamma = gamma
    self.lr = lr
    self.a_features = a_features
    self.n_features = n_features

    self.model = self.network([None, n_features],"Q-Network-simple")
    self.model.train()

    self.optimizer = tf.optimizers.Adam(self.lr)


  def network(self, inputs_shape, name):
    with tf.name_scope('inputs'):
      self.obs = tl.layers.Input(inputs_shape, name='observation')
    nn = tl.layers.Dense(
        4, act=None, W_init=tf.random_uniform_initializer(0, 0.01), b_init=None, name='q_a_s'
        )(self.obs)
    return tl.models.Model(inputs=self.obs, outputs=nn, name=name)

  def to_one_hot(self, i, n_classes=None):
    """
    FrozenLake这个环境的observation是一个int数，范围在0~15，表示其位置，这里将其展开成一个vector。
    """
    a = np.zeros(n_classes, 'uint8')
    a[i] = 1
    return a

  def save_ckpt(self, model):
    if not os.path.exists('model_simple'):
      os.makedirs('model_simple')
    tl.files.save_npz(model.trainable_weights, name='model_simple/dqn_model.npz')

  def load_ckpt(self, model):
    tl.files.load_and_assign_npz(name='model_simple/dqn_model.npz', network=model)

  def huber_loss(self, x):
    return tf.where(tf.abs(x) < 1, tf.square(x) * 0.5, tf.abs(x) - 0.5)


  def new_reward(self, o, r):
    """
    for frozenlake env
    """
    y = int(o / 4)
    x = o - 4 * y
    pain = 0
    if o==5 or o==7 or o==11 or o==12:
      pain = -10
    if o==15:
      pain = 10
    return r - ((3 - x) + (3 - y)) + pain


In [0]:
# 初始化 #
np.random.seed(RANDOMSEED)
tf.random.set_seed(RANDOMSEED)

tl.logging.set_verbosity(tl.logging.DEBUG)
env = gym.make(ENV_NAME)
env.seed(RANDOMSEED)


print(env.action_space)
print(env.observation_space)

a_features = env.action_space.n
n_features = env.observation_space.n

Dqn = DQN(gamma, n_features, a_features)

I0829 03:20:00.473494 140388202448768 tl_logging.py:99] Input  observation: [None, 16]


Discrete(4)
Discrete(16)


I0829 03:20:00.543918 140388202448768 tl_logging.py:99] Dense  q_a_s: 4 No Activation
I0829 03:20:00.613805 140388202448768 tl_logging.py:99] Input  observation: [None, 16]
I0829 03:20:00.677609 140388202448768 tl_logging.py:99] Dense  q_a_s: 4 No Activation


In [0]:
# train #
t0 = time.time()
for i in range(EPISODES):
  # if i % (2*DELAY_COUNT) == 0:
  #   Dqn.sync()
  s = env.reset()
  rAll = 0
  for j in range(99):
    # 每轮episode最大step
    # 从DQN中贪婪地选择一个动作
    allQ = Dqn.model(np.asarray([Dqn.to_one_hot(s, 16)], dtype=np.float32)).numpy()
    a = np.argmax(allQ, 1)
    # 随机动作
    if np.random.rand(1) < e:
      a[0] = env.action_space.sample()
    # 执行动作得到下一步信息
    s1, r, d, _ = env.step(a[0])
    # 得到Q'
    # Q1 = Dqn.target_model(np.asarray([Dqn.to_one_hot(s1, 16)], dtype=np.float32)).numpy()
    Q1 = Dqn.model(np.asarray([Dqn.to_one_hot(s1, 16)], dtype=np.float32)).numpy()
    maxQ1 = np.max(Q1)
    targetQ = allQ
    targetQ[0, a[0]] = r + Dqn.gamma * maxQ1 
    
    with tf.GradientTape() as tape:
      _qvalues = Dqn.model(np.asarray([Dqn.to_one_hot(s, 16)], dtype=np.float32))
      # 只是让单个Q(s,a)变化
      _loss = tl.cost.mean_squared_error(targetQ, _qvalues, is_mean=False)

    
    grad = tape.gradient(_loss, Dqn.model.trainable_weights)
    Dqn.optimizer.apply_gradients(zip(grad, Dqn.model.trainable_weights))

    rAll += r
    s = s1
    if d:
      e = 1. / ((i / 50) + 10)
      break
  running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
  print('Episode:{}/{} | Episode Reward:{:.4f} | Running AVG Reward:{:.4f} | Running Time:{:.4f}'\
        .format(i, EPISODES, rAll, running_reward, time.time()-t0))
Dqn.save_ckpt(Dqn.model)

Episode:6207/10000 | Episode Reward:1.0000 | Running AVG Reward:0.6244 | Running Time:1064.0626
Episode:6208/10000 | Episode Reward:1.0000 | Running AVG Reward:0.6282 | Running Time:1064.1536
Episode:6209/10000 | Episode Reward:0.0000 | Running AVG Reward:0.6219 | Running Time:1064.4889
Episode:6210/10000 | Episode Reward:0.0000 | Running AVG Reward:0.6157 | Running Time:1064.7595
Episode:6211/10000 | Episode Reward:1.0000 | Running AVG Reward:0.6195 | Running Time:1064.8472
Episode:6212/10000 | Episode Reward:0.0000 | Running AVG Reward:0.6133 | Running Time:1065.0431
Episode:6213/10000 | Episode Reward:1.0000 | Running AVG Reward:0.6172 | Running Time:1065.1511
Episode:6214/10000 | Episode Reward:0.0000 | Running AVG Reward:0.6110 | Running Time:1065.2113
Episode:6215/10000 | Episode Reward:1.0000 | Running AVG Reward:0.6149 | Running Time:1065.3491
Episode:6216/10000 | Episode Reward:0.0000 | Running AVG Reward:0.6087 | Running Time:1065.5176
Episode:6217/10000 | Episode Reward:1.00

I0829 04:00:28.295628 140388202448768 tl_logging.py:99] [*] Saving TL weights into model/dqn_model.npz
I0829 04:00:28.300491 140388202448768 tl_logging.py:99] [*] Saved


Episode:9998/10000 | Episode Reward:1.0000 | Running AVG Reward:0.5798 | Running Time:1831.8023
Episode:9999/10000 | Episode Reward:1.0000 | Running AVG Reward:0.5840 | Running Time:1831.8967


In [0]:
# test
Dqn.load_ckpt(Dqn.model)
running_reward = 0
t0 = time.time()
for i in range(1000):
  # if i % (2*DELAY_COUNT) == 0:
  #   Dqn.sync()
  s = env.reset()
  rAll = 0
  for j in range(99):
    # 每轮episode最大step
    # 从DQN中贪婪地选择一个动作
    allQ = Dqn.model(np.asarray([Dqn.to_one_hot(s, 16)], dtype=np.float32)).numpy()
    a = np.argmax(allQ, 1)
    s1, r, d, _ = env.step(a[0])

    rAll += r
    s = s1
    if d:
      e = 1. / ((i / 50) + 10)
      break
  running_reward = rAll if running_reward is None else running_reward * 0.99 + rAll * 0.01
  print('Episode:{}/{} | Episode Reward:{:.4f} | Running AVG Reward:{:.4f} | Running Time:{:.4f}'\
        .format(i, 1000, rAll, running_reward, time.time()-t0))

I0829 05:50:23.885507 140388202448768 tl_logging.py:99] [*] Load model/dqn_model.npz SUCCESS!


Episode:0/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0100 | Running Time:0.0427
Episode:1/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0199 | Running Time:0.0508
Episode:2/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0297 | Running Time:0.0553
Episode:3/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0394 | Running Time:0.0822
Episode:4/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0490 | Running Time:0.0871
Episode:5/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0585 | Running Time:0.0972
Episode:6/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0679 | Running Time:0.1356
Episode:7/1000 | Episode Reward:0.0000 | Running AVG Reward:0.0673 | Running Time:0.1557
Episode:8/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0766 | Running Time:0.1703
Episode:9/1000 | Episode Reward:0.0000 | Running AVG Reward:0.0758 | Running Time:0.2079
Episode:10/1000 | Episode Reward:1.0000 | Running AVG Reward:0.0851 | Running Time:0.2159
Episode:11/1000 | Ep

In [0]:
ob=1
Dqn.model(np.expand_dims(Dqn.to_one_hot(ob), 0).astype('float32')).numpy()
# 0 left
# 1 down
# 2 right
# 3 up

array([[-115.95938 , -114.066124, -112.87696 , -115.04886 ]],
      dtype=float32)

In [0]:
ob_,r,done,_ = env.step(1)
print(r)
env.render()

0.0
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG


In [17]:
!mv model drive

mv: cannot create directory 'drive/model': Operation not supported
