### 蒙特卡罗方法

In [3]:
import gymnasium as gym
import numpy as np
import time

# Create the training environment (no render mode, so training runs fast).
env = gym.make("FrozenLake-v1", is_slippery=False)

n_states = env.observation_space.n      # size of the discrete state space
n_actions = env.action_space.n          # size of the discrete action space

# Fix the random seeds so that "Restart Kernel -> Run All" reproduces the same
# episodes, Q table and greedy policy. The rollout below samples actions from
# NumPy's global RNG, so that is the generator that must be seeded.
random_seed = 42
np.random.seed(random_seed)
# Seed the environment's own RNG as well; later reset() calls without a seed
# continue from this seeded generator.
env.reset(seed=random_seed)

# Step 1: start from the uniform random policy (each row sums to 1).
policy = np.ones((n_states, n_actions)) / n_actions

# Step 2: Q table plus, for every (state, action) pair, the list of returns
# observed for that pair (averaged later to estimate Q).
Q = np.zeros((n_states, n_actions))
returns = {(s, a): [] for s in range(n_states) for a in range(n_actions)}

# Hyperparameters
num_episodes_for_evaluation = 20000  # many episodes to evaluate the random policy
gamma = 1.0  # undiscounted returns

print("正在使用随机策略生成 episodes 并评估 Q(s,a)...")

# Step 3: policy evaluation — roll out episodes under the current (random)
# policy and collect first-visit Monte Carlo returns for every (state, action).
for _ in range(num_episodes_for_evaluation):
    episode = []
    state, _ = env.reset()
    done = False

    # Generate one complete episode by sampling actions from the policy row
    # of the current state.
    while not done:
        action_probs = policy[state]
        action = np.random.choice(n_actions, p=action_probs)
        next_state, reward, terminated, truncated, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        done = terminated or truncated

    # Time index of the FIRST occurrence of each (state, action) in this
    # episode. The returns are accumulated backwards, so a plain "seen" set
    # would pick the last occurrence instead; that only coincides with
    # first-visit when every step of the episode has the same return.
    first_visit = {}
    for t, step in enumerate(episode):
        first_visit.setdefault((step[0], step[1]), t)

    # Walk the episode backwards so that G is the discounted return from t.
    G = 0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        G = gamma * G + reward
        if first_visit[(state, action)] == t:
            returns[(state, action)].append(G)

# Fill the Q table: each entry becomes the mean of the returns observed for
# that (state, action) pair. Pairs with no recorded return keep their current
# value (np.mean is never called on an empty list).
for (s, a), observed in returns.items():
    if len(observed) > 0:
        Q[s, a] = np.mean(observed)

print("Q 表评估完成。")

# Step 4: policy improvement — in every state pick the action with the largest
# Q value. np.argmax returns the lowest index on ties, so a row of equal values
# maps to action 0.
greedy_choice = np.argmax(Q, axis=1)
# Row i of the identity matrix is the one-hot vector for action i, so indexing
# it with the greedy actions yields the deterministic policy matrix.
new_policy = np.eye(n_actions)[greedy_choice]

print("策略改进完成。新策略为确定性 greedy 策略。")
print("每个状态选择的动作（0=左,1=下,2=右,3=上）:")
deterministic_actions = np.argmax(new_policy, axis=1)
print(deterministic_actions)

# Step 5: run one rendered episode with the greedy policy.
print("\n正在用新策略运行一个可视化 episode...")
env_test = gym.make("FrozenLake-v1", is_slippery=False, render_mode='human')
total_reward = 0

# try/finally guarantees that both environments are closed even if the rollout
# raises or the kernel is interrupted during the one-second-per-step loop.
try:
    state, _ = env_test.reset()
    done = False

    # Follow the greedy action until the environment reports the episode as
    # terminated or truncated.
    while not done:
        action = deterministic_actions[state]
        state, reward, terminated, truncated, _ = env_test.step(action)
        total_reward += reward
        done = terminated or truncated
        time.sleep(1)  # slow the rollout down so each move is visible

    print(f"测试 episode 总奖励: {total_reward}")
finally:
    env_test.close()
    env.close()

正在使用随机策略生成 episodes 并评估 Q(s,a)...
Q 表评估完成。
策略改进完成。新策略为确定性 greedy 策略。
每个状态选择的动作（0=左,1=下,2=右,3=上）:
[1 2 1 0 1 0 1 0 2 1 1 0 0 2 2 0]

正在用新策略运行一个可视化 episode...
测试 episode 总奖励: 1.0


In [None]:
import gymnasium as gym
import numpy as np
import random


# Build the training environment without a render mode to keep training fast.
env = gym.make("FrozenLake-v1", is_slippery=False)
n_states = env.observation_space.n      # number of discrete states
n_actions = env.action_space.n          # number of discrete actions


# Step 1: uniform random policy — every action equally likely in every state.
policy = np.full((n_states, n_actions), 1.0 / n_actions)

# Step 2: zero-initialised Q table and one empty list of observed returns per
# (state, action) pair.
Q = np.zeros_like(policy)
returns = dict(
    ((s, a), [])
    for s in range(n_states)
    for a in range(n_actions)
)
# NOTE(review): this dumps every (state, action) key with its empty list; it
# looks like leftover debug output — confirm whether it is still wanted.
print(returns)

# Hyperparameters
gamma = 1.0  # undiscounted returns
num_episodes_for_evaluation = 20000  # many episodes to evaluate the random policy


print("正在使用随机策略生成 episodes 并评估 Q(s,a)...")