In [14]:
import numpy as np
import random

# 定义网格世界环境
class GridWorld:
    """Deterministic grid world with border walls, obstacles and one goal.

    States are ``(row, col)`` tuples. Actions are integers:
    0 = up (row - 1), 1 = down (row + 1), 2 = left (col - 1),
    3 = right (col + 1), 4 = stay in place.

    Args:
        n_rows: Number of rows in the grid (default 5).
        n_cols: Number of columns in the grid (default 5).
        start: ``(row, col)`` cell the agent is placed on by ``reset``.
        goal: ``(row, col)`` cell that ends the episode when entered.
        obstacles: Iterable of ``(row, col)`` cells that cannot be entered.
            ``None`` selects the original layout
            ``[(2, 2), (3, 1), (1, 3), (0, 4)]``.

    Raises:
        ValueError: If the grid has no cells, or if ``start`` / ``goal``
            lies outside the grid or on an obstacle.
    """

    # (row, col) offset applied by each known action id.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1), 4: (0, 0)}
    # Action id that intentionally keeps the agent where it is.
    _STAY = 4

    def __init__(self, n_rows=5, n_cols=5, start=(0, 0), goal=(4, 4),
                 obstacles=None):
        if n_rows < 1 or n_cols < 1:
            raise ValueError("n_rows and n_cols must both be at least 1")
        self.n_rows = n_rows
        self.n_cols = n_cols
        self.start = tuple(start)
        self.goal = tuple(goal)
        if obstacles is None:
            obstacles = [(2, 2), (3, 1), (1, 3), (0, 4)]
        # Kept as a list of tuples so existing callers that iterate or index
        # ``env.obstacles`` keep working.
        self.obstacles = [tuple(cell) for cell in obstacles]

        # An unreachable goal would make every episode run forever, so the
        # two special cells are validated up front.
        for label, cell in (("start", self.start), ("goal", self.goal)):
            row, col = cell
            if not (0 <= row < n_rows and 0 <= col < n_cols):
                raise ValueError(f"{label} {cell} is outside the grid")
            if cell in self.obstacles:
                raise ValueError(f"{label} {cell} is on an obstacle")

        self.state = self.start

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = self.start
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Moves are clamped to the grid, so the agent can never leave it.

        Rewards:
            * -2  when a non-stay action leaves the state unchanged, i.e.
              the agent pushed against the border or passed an unknown
              action id;
            * -10 when the move would enter an obstacle (the agent stays
              where it was);
            * +10 when the resulting cell is the goal (``done`` is True);
            * -1  for every other step, including action 4 (stay).
        """
        row, col = self.state
        # Unknown action ids get a zero offset and are then penalised like a
        # wall bump by the first branch below.
        d_row, d_col = self._MOVES.get(action, (0, 0))
        candidate = (
            min(max(row + d_row, 0), self.n_rows - 1),
            min(max(col + d_col, 0), self.n_cols - 1),
        )

        if candidate == self.state and action != self._STAY:
            # The clamp cancelled the move (border) or the action is unknown.
            reward, done = -2, False
        elif candidate in self.obstacles:
            # Blocked by an obstacle: penalise and keep the current cell.
            reward, done = -10, False
            candidate = self.state
        elif candidate == self.goal:
            reward, done = 10, True
        else:
            reward, done = -1, False

        self.state = candidate
        return candidate, reward, done

    def get_possible_actions(self):
        """Return every action id, including 4 (stay in place)."""
        return [0, 1, 2, 3, 4]


# Q-learning 算法实现
def train_agent(env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.2):
    """Train a tabular Q-learning agent on ``env`` with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``n_rows``, ``n_cols``, ``reset()``,
            ``step(action) -> (next_state, reward, done)`` and
            ``get_possible_actions()``. States must be ``(row, col)`` tuples
            inside the grid. Action ids are used directly as indices into
            each Q row, so they are assumed to be ``0 .. k-1``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        ``(Q, rewards_per_episode)`` where ``Q`` maps every grid cell to a
        NumPy array of action values and ``rewards_per_episode`` lists the
        summed reward of each episode.
    """
    # Size each Q row from the environment instead of assuming five actions.
    n_actions = len(env.get_possible_actions())
    Q = {
        (i, j): np.zeros(n_actions)
        for i in range(env.n_rows)
        for j in range(env.n_cols)
    }

    rewards_per_episode = []

    for _ in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            # Epsilon-greedy action selection.
            if random.uniform(0, 1) < epsilon:
                action = random.choice(env.get_possible_actions())
            else:
                # Convert NumPy's integer scalar to a plain int for the env.
                action = int(np.argmax(Q[state]))

            next_state, reward, done = env.step(action)
            total_reward += reward

            # Q-learning target: a terminal transition has no future value,
            # so only bootstrap from the next state while the episode runs.
            future_value = 0.0 if done else np.max(Q[next_state])
            td_error = reward + gamma * future_value - Q[state][action]
            Q[state][action] += alpha * td_error

            state = next_state

        rewards_per_episode.append(total_reward)

    return Q, rewards_per_episode


# 输出策略函数：用箭头表示最优动作
def print_policy(Q, env):
    """Print the greedy policy stored in ``Q`` as a grid of symbols.

    Obstacle cells are shown as ``X`` and the goal cell as ``G``. Every
    other cell shows the symbol of its highest-valued action. One grid row
    is printed per line, with cells separated by single spaces.
    """
    glyphs = {0: '↑', 1: '↓', 2: '←', 3: '→', 4: '•'}

    def symbol_for(cell):
        # Obstacles take precedence over the goal; Q is only consulted for
        # ordinary cells.
        if cell in env.obstacles:
            return 'X'
        if cell == env.goal:
            return 'G'
        return glyphs[np.argmax(Q[cell])]

    # Render the whole grid first so nothing is printed if a lookup fails.
    rendered_rows = [
        ' '.join(symbol_for((r, c)) for c in range(env.n_cols))
        for r in range(env.n_rows)
    ]
    for line in rendered_rows:
        print(line)


# 主程序入口
if __name__ == '__main__':
    # Build the default grid world and learn a Q-table on it.
    env = GridWorld()
    Q, rewards = train_agent(
        env,
        episodes=10000,
        alpha=0.1,
        gamma=0.9,
        epsilon=0.2,
    )
    # Show the greedy action for every cell of the learned table.
    print("训练后的策略：")
    print_policy(Q, env)


训练后的策略：
↓ ← ← ← X
↓ ← ← X ↓
↓ ← X ↓ ↓
↓ X → ↓ ↓
→ → → → G
