1. 基础题
   - 使用三种时序差分价值迭代方法（SARSA，期望 SARSA，Q 学习）求解悬崖寻路策略 CliffWalking-v0
   - 随机数种子为学号后四位
   - 自行对超参数寻优
   - 训练好后，输出三种方法完全利用（epsilon=0）时的回合总奖励（total_reward）（评分比重70%）
   - 将代码和结果整理为ipynb格式（文件名“MTA.ipynb”），需要包含三种方法训练过程中每个回合的总奖励变化曲线（评分比重30%）

In [1]:
import numpy as np
import gym

np.random.seed(5235) # random seed: last four digits of the student ID

### 环境使用

引入环境

In [2]:
# Create the cliff-walking grid world used by every experiment below.
env = gym.make('CliffWalking-v0')
# Seed the environment with the same value passed to np.random.seed (the last
# four digits of the student ID, as the task statement requires) instead of an
# unrelated 0, so a single seed governs the whole notebook.
env.seed(5235)
# Report the sizes of the state/action spaces and the grid layout.
print('观测空间 = {}'.format(env.observation_space))
print('动作空间 = {}'.format(env.action_space))
print('状态数量 = {}, 动作数量 = {}'.format(env.nS, env.nA))
print('地图大小 = {}'.format(env.shape))

观测空间 = Discrete(48)
动作空间 = Discrete(4)
状态数量 = 48, 动作数量 = 4
地图大小 = (4, 12)


单回合测试

In [3]:
def play_once(env, policy):
    """Play one episode under a tabular stochastic policy, printing each step.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple ``(next_state, reward, done, info)``), ``nA`` and ``shape``.
        policy: table indexed by state; ``policy[state]`` is the probability
            vector over the ``env.nA`` actions.

    Returns:
        The sum of the rewards collected during the episode.
    """
    episode_return = 0
    current = env.reset()
    finished = False
    while not finished:
        # Convert the flat state index into grid coordinates for the log line.
        position = np.unravel_index(current, env.shape)
        print('状态 = {}, 位置 = {}'.format(current, position), end=' ')
        # Sample the action from the policy's distribution for this state.
        chosen = np.random.choice(env.nA, p=policy[current])
        successor, reward, finished, _ = env.step(chosen)
        print('动作 = {}, 奖励 = {}, 次态 = {}'.format(chosen, reward, successor))
        episode_return += reward
        current = successor
    return episode_return

目标：最优策略

In [4]:
# Hand-built reference policy on the grid. The rollout it produces shows
# action 1 stepping right, action 0 stepping up and action 2 stepping down.
actions = np.ones(env.shape, dtype=int)  # default everywhere: action 1
actions[-1, :] = 0                       # bottom row: action 0
actions[:, -1] = 2                       # last column: action 2 (set last, so it wins the corner)
# One-hot encode the per-state action so each row is a probability vector.
one_hot_rows = np.eye(4)
flat_actions = actions.reshape(-1)
optimal_policy = one_hot_rows[flat_actions]
# Roll the policy out once and report the episode's total reward.
total_reward = play_once(env, optimal_policy)
print('回合奖励 = {}'.format(total_reward))

状态 = 36, 位置 = (3, 0) 动作 = 0, 奖励 = -1, 次态 = 24
状态 = 24, 位置 = (2, 0) 动作 = 1, 奖励 = -1, 次态 = 25
状态 = 25, 位置 = (2, 1) 动作 = 1, 奖励 = -1, 次态 = 26
状态 = 26, 位置 = (2, 2) 动作 = 1, 奖励 = -1, 次态 = 27
状态 = 27, 位置 = (2, 3) 动作 = 1, 奖励 = -1, 次态 = 28
状态 = 28, 位置 = (2, 4) 动作 = 1, 奖励 = -1, 次态 = 29
状态 = 29, 位置 = (2, 5) 动作 = 1, 奖励 = -1, 次态 = 30
状态 = 30, 位置 = (2, 6) 动作 = 1, 奖励 = -1, 次态 = 31
状态 = 31, 位置 = (2, 7) 动作 = 1, 奖励 = -1, 次态 = 32
状态 = 32, 位置 = (2, 8) 动作 = 1, 奖励 = -1, 次态 = 33
状态 = 33, 位置 = (2, 9) 动作 = 1, 奖励 = -1, 次态 = 34
状态 = 34, 位置 = (2, 10) 动作 = 1, 奖励 = -1, 次态 = 35
状态 = 35, 位置 = (2, 11) 动作 = 2, 奖励 = -1, 次态 = 47
回合奖励 = -13


# SARSA

In [13]:
import matplotlib.pyplot as plt
import pandas as pd

In [19]:
class SARSAAgent:
    """Tabular SARSA learner with epsilon-greedy action selection."""

    def __init__(self, env, gamma=0.6, learning_rate=0.2, epsilon=.01):
        """Store the hyperparameters and create an all-zero Q-table.

        Args:
            env: environment whose ``observation_space.n`` and
                ``action_space.n`` give the Q-table dimensions.
            gamma: discount factor applied to the bootstrapped value.
            learning_rate: step size of the Q-value update.
            epsilon: probability of choosing a uniformly random action.
        """
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.action_n = n_actions
        # One row per state, one column per action.
        self.q = np.zeros((n_states, n_actions))

    def decide(self, state):
        """Return an action for ``state``: random with probability epsilon, else greedy."""
        explore = np.random.uniform() <= self.epsilon
        if explore:
            return np.random.randint(self.action_n)
        return self.q[state].argmax()

    def learn(self, state, action, reward, next_state, done, next_action):
        """Move Q(state, action) toward the one-step SARSA target.

        The target is ``reward + gamma * Q(next_state, next_action)``; the
        bootstrap term is zeroed when ``done`` is true.
        """
        bootstrap = self.gamma * \
                self.q[next_state, next_action] * (1. - done)
        target = reward + bootstrap
        delta = target - self.q[state, action]
        self.q[state, action] += self.learning_rate * delta

In [20]:
def play_sarsa(env, agent, train=False, render=False):
    """Run one episode in the SARSA interaction pattern.

    The next action is chosen before the update so the agent can learn from
    the full (state, action, reward, next state, next action) tuple.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` (returning a
            4-tuple ``(next_state, reward, done, info)``) and ``render()``.
        agent: object exposing ``decide(state)`` and
            ``learn(state, action, reward, next_state, done, next_action)``.
        train: when true, call ``agent.learn`` after every step.
        render: when true, call ``env.render()`` before every step.

    Returns:
        The sum of the rewards collected during the episode.
    """
    total = 0
    state = env.reset()
    act = agent.decide(state)
    finished = False
    while not finished:
        if render:
            env.render()
        following_state, reward, finished, _ = env.step(act)
        total += reward
        # Still queried on the terminal step, where the choice is not used
        # for acting; learn() receives it together with the done flag.
        following_act = agent.decide(following_state)
        if train:
            agent.learn(state, act, reward, following_state,
                    finished, following_act)
        state, act = following_state, following_act
    return total

In [21]:
# Fresh SARSA learner using the class's default hyperparameters.
agent = SARSAAgent(env)

# Training: run the learning episodes and keep each episode's total reward.
episodes = 3000
episode_rewards = [play_sarsa(env, agent, train=True)
                   for _ in range(episodes)]

# Learning curve: total reward of every training episode.
plt.plot(episode_rewards)

# Evaluation: switch exploration off so the agent acts purely greedily.
agent.epsilon = 0.

episode_rewards = [play_sarsa(env, agent) for _ in range(100)]
reward_sum = sum(episode_rewards)
n_eval = len(episode_rewards)
print('平均回合奖励 = {} / {} = {}'.format(reward_sum, n_eval,
        np.mean(episode_rewards)))

In [17]:
# Show the learned action-value table: one row per state, one column per action.
pd.DataFrame(agent.q)

Unnamed: 0,0,1,2,3
0,-7.791558,-7.730543,-7.857314,-7.797656
1,-7.5752,-7.475655,-7.535509,-7.629406
2,-7.288391,-7.201013,-7.398583,-7.584658
3,-7.008736,-6.895288,-7.108724,-7.189801
4,-6.736827,-6.541145,-6.949383,-6.760394
5,-6.397308,-6.141714,-6.471727,-6.442615
6,-5.893534,-5.705174,-6.065769,-5.920151
7,-5.406139,-5.225956,-5.570803,-5.955168
8,-4.951811,-4.692946,-5.184334,-5.161307
9,-4.432394,-4.100025,-4.216658,-4.397024


In [18]:
# Greedy policy read off the Q-table: pick the highest-valued action in each
# state, then one-hot encode it so every row is a probability vector.
greedy_actions = agent.q.argmax(axis=-1)
policy = np.eye(agent.action_n)[greedy_actions]
pd.DataFrame(policy)

Unnamed: 0,0,1,2,3
0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0
5,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0
7,0.0,1.0,0.0,0.0
8,0.0,1.0,0.0,0.0
9,0.0,1.0,0.0,0.0
