### 使用网格世界实现DQN算法

In [None]:
import numpy as np
import torch
from collections import deque
from torch import nn
import random
from rl_utils.GNGridWorldEnv import GridWorldEnv

# Q-network definition
class NeuralNetwork(nn.Module):
    """Fully connected network with two ReLU hidden layers and a linear head.

    Args:
        input_dim: Number of features in each input sample.
        hidden_dim: Width shared by both hidden layers.
        output_dim: Number of values produced for each sample.
    """

    def __init__(self, input_dim=2, hidden_dim=128, output_dim=4):
        super().__init__()
        # The position of each layer inside the Sequential determines the
        # state_dict keys ("network.0.*", "network.2.*", "network.4.*"), so
        # the order must stay stable for saved weights to remain loadable.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        return self.network(x)

# Device selection: prefer CUDA, then MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
# The hasattr check covers torch builds whose torch.backends has no `mps`.
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"使用设备: {device}")

# Build the grid-world environment used for training
grid_world_size, obstacle_count = 10, 20
env = GridWorldEnv(obstacle_count=obstacle_count, size=grid_world_size)
# Number of actions, as reported by the environment's action space
n_actions = env.action_space.n

# Training hyperparameters
gamma = 0.99            # discount factor
lr = 0.001              # learning rate
batch_size = 64         # transitions sampled per training step
update_frequency = 100  # steps between target-network updates

# Experience replay buffer; the deque drops its oldest entry once full
replay_buffer = deque(maxlen=2000)

# Exploration schedule: initial rate, lower bound, and decay factor
epsilon, epsilon_min, epsilon_decay = 1.0, 0.01, 0.995

# Network initialisation (input dimension 2 corresponds to the x, y coordinates)
main_network = NeuralNetwork(
    input_dim=2,
    hidden_dim=1024,
    output_dim=4,
).to(device)
target_network = NeuralNetwork(
    input_dim=2,
    hidden_dim=1024,
    output_dim=4,
).to(device)

# Start the target network as an exact copy of the main network.
target_network.load_state_dict(main_network.state_dict())
# eval() only switches the module to evaluation mode; it does not disable
# gradient tracking. train() avoids gradients for the target values by
# calling this network inside torch.no_grad().
target_network.eval()

# Optimizer (updates the main network only) and loss function
optimizer = torch.optim.Adam(main_network.parameters(), lr=lr)
loss_fn = nn.MSELoss()

# Epsilon-greedy action selection
def select_action(state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    Args:
        state: Observation features, in any form ``torch.tensor`` accepts.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        int: A random action index in ``[0, n_actions - 1]`` when exploring,
        otherwise the index of the largest output of ``main_network``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, n_actions - 1)

    # unsqueeze(0) adds a leading batch dimension of size one.
    batch = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
    with torch.no_grad():
        greedy_index = main_network(batch).argmax()
    return greedy_index.item()

# One DQN optimisation step
def train():
    """Run a single gradient step on ``main_network`` from a replay minibatch.

    Samples ``batch_size`` transitions uniformly from ``replay_buffer`` and
    fits the main network's value for the action taken to the one-step
    target ``reward + (1 - done) * gamma * max(target_network(next_state))``.

    Returns:
        float | None: The minibatch loss, or ``None`` when the buffer holds
        fewer than ``batch_size`` transitions and no update was performed.
    """
    if len(replay_buffer) < batch_size:
        return None

    # Sample a random minibatch and regroup it field by field.
    batch = random.sample(replay_buffer, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Convert every field to a tensor on the training device; the state
    # tuples are stacked into a single ndarray before conversion.
    states = torch.tensor(np.array(states), dtype=torch.float32, device=device)
    actions = torch.tensor(actions, dtype=torch.long, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
    next_states = torch.tensor(np.array(next_states), dtype=torch.float32, device=device)
    dones = torch.tensor(dones, dtype=torch.float32, device=device)

    # Q-value of the action actually taken in each transition.
    # squeeze(1) removes only the gathered column, so the result keeps shape
    # (batch,) even when batch_size == 1. A bare squeeze() would collapse it
    # to a 0-d tensor and make the loss rely on broadcasting.
    current_q_values = main_network(states)
    current_q_value = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

    # Target values come from the target network and carry no gradient.
    with torch.no_grad():
        next_q_values = target_network(next_states)
        max_next_q = next_q_values.max(1)[0]
        # (1 - dones) removes the bootstrap term where the done flag is 1.
        target_q_values = rewards + (1 - dones) * gamma * max_next_q

    loss = loss_fn(current_q_value, target_q_values)

    # Optimisation step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# Training loop
num_episodes = 1000
max_steps_per_episode = 100  # hard cap on the length of one episode

global_step = 0        # environment steps taken across all episodes
episode_rewards = []   # total reward collected in each finished episode

for episode in range(num_episodes):
    # Reset the environment.
    # NOTE(review): the same seed is passed on every reset; whether start
    # positions still vary between episodes depends on how GridWorldEnv
    # combines the seed with 'enable_random_pos' - confirm.
    observation, _ = env.reset(seed=99, options={'enable_random_pos': True})
    state = observation['agent'] / grid_world_size  # scale the agent position by the grid size
    done = False
    total_reward = 0
    step_count = 0

    while not done and step_count < max_steps_per_episode:
        # Pick an action with the current exploration rate.
        action = select_action(state, epsilon)

        # Apply it to the environment.
        next_observation, reward, terminated, truncated, _ = env.step(action)
        next_state = next_observation['agent'] / grid_world_size
        done = terminated or truncated

        # Store the transition. Only `terminated` is recorded as the done
        # flag: train() drops the bootstrap term when the flag is 1, which is
        # right for a true terminal state but not for a truncated episode.
        # The loop itself still stops on either signal.
        replay_buffer.append(
            (state.copy(), action, reward, next_state.copy(), float(terminated))
        )

        # One optimisation step (a no-op until the buffer holds a full batch).
        train()

        # Advance the episode bookkeeping.
        state = next_state
        total_reward += reward
        step_count += 1
        global_step += 1

        # Periodically copy the main network's weights into the target network.
        if global_step % update_frequency == 0:
            target_network.load_state_dict(main_network.state_dict())

    # Decay the exploration rate once per episode, never below epsilon_min.
    epsilon = max(epsilon * epsilon_decay, epsilon_min)
    episode_rewards.append(total_reward)

    # Report progress every 10 episodes (mean over the last 10 at most).
    if episode % 10 == 0:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"回合 {episode}, 平均奖励: {avg_reward:.2f}, Epsilon: {epsilon:.4f}, 步数: {global_step}")

env.close()

# Save the trained weights of the main network
import os

model_save_path = '../data/deep-q-learning/dqn-model.pth'
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoint.
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
torch.save(main_network.state_dict(), model_save_path)

### 使用模型进行预测

In [None]:
import time
import torch
import random
from torch import nn
from rl_utils.GNGridWorldEnv import GridWorldEnv

# Q-network definition; it has to mirror the architecture of the saved model
class NeuralNetwork(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of features in each input sample.
        hidden_dim: Width shared by both hidden layers.
        output_dim: Number of values produced for each sample.
    """

    def __init__(self, input_dim=2, hidden_dim=128, output_dim=4):
        super().__init__()
        # Keep the attribute name and the layer order unchanged: together
        # they define the state_dict keys that load_state_dict matches on.
        stack = (
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        return self.network(x)

# Build the evaluation environment, rendered in 'human' mode
grid_world_size, obstacle_count = 10, 20
test_env = GridWorldEnv(
    size=grid_world_size,
    render_mode='human',
    obstacle_count=obstacle_count,
)
n_actions = test_env.action_space.n

epsilon_min = 0.01  # minimum exploration rate, used for the test rollout

# Reset the environment and initialise the episode bookkeeping
print("\n正在用学习到的策略运行测试 episode...")
observation, _ = test_env.reset(seed=99, options={'enable_random_pos': True})
state = observation['agent'] / grid_world_size  # scale the agent position by the grid size
total_reward = 0
done = False
# Device selection: start from the CPU and upgrade to CUDA or MPS if available
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# The hasattr check covers torch builds whose torch.backends has no `mps`.
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = "mps"
print(f"使用设备: {device}")
# Epsilon-greedy action selection for the test rollout
def select_action_test(state, epsilon):
    """Choose an action for ``state`` using the loaded ``predict_network``.

    Args:
        state: Observation features, in any form ``torch.tensor`` accepts.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        int: A random action index in ``[0, n_actions - 1]`` when exploring,
        otherwise the index of the largest output of ``predict_network``.
    """
    if not (random.random() < epsilon):
        # Greedy branch: add a batch dimension, then take the argmax output.
        features = torch.tensor(state, dtype=torch.float32, device=device)
        with torch.no_grad():
            scores = predict_network(features.unsqueeze(0))
        return scores.argmax().item()

    return random.randint(0, n_actions - 1)

# Load the saved model parameters
model_path = '../data/deep-q-learning/dqn-model.pth'
print(f"正在从 {model_path} 加载模型...")
# map_location places the loaded tensors on the selected device.
# NOTE(review): consider passing weights_only=True if this checkpoint could
# ever come from an untrusted source - confirm the minimum torch version first.
params = torch.load(model_path, map_location=device)

# Rebuild the network with the same layer sizes used for training, then
# restore the weights and switch to evaluation mode.
predict_network = NeuralNetwork(
    input_dim=2,
    hidden_dim=1024,
    output_dim=4,
).to(device)
predict_network.load_state_dict(params)
predict_network.eval()

# Roll out one episode with the loaded policy.
# The step cap mirrors the 100-step limit of the training loop: without it a
# policy that keeps cycling between the same cells would never leave the loop.
max_test_steps = 100
test_step_count = 0

try:
    while not done and test_step_count < max_test_steps:
        action = select_action_test(state, epsilon_min)  # use the minimum exploration rate
        next_observation, reward, terminated, truncated, _ = test_env.step(action)
        state = next_observation['agent'] / grid_world_size
        total_reward += reward
        done = terminated or truncated
        test_step_count += 1
        time.sleep(0.3)  # slow the rollout down so the rendering can be followed

    if not done:
        print(f"已达到最大步数 {max_test_steps}，测试 episode 提前结束")
    print(f"测试 episode 总奖励: {total_reward:.2f}")
finally:
    # Always release the environment, even if a step raised.
    test_env.close()
