In [21]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local RobotEnv / utils files) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 初始化

In [27]:
# Setup cell: imports, environment, and the sizes used to build the network.
# (No optimizer is created in this cell; it is constructed inside train().)
import torch
import random
from RobotEnv import RobotEnv
from utils import *  # NOTE(review): star import; GRUModel, Controller and Pool used below presumably come from here — confirm

env = RobotEnv(screen_width=400, screen_height=400)
# Length of the observation vector and the `.n` of the action space, read from the env.
observation_length = env.observation_space.shape[0]
action_length = env.action_space.n

# Sizes passed to GRUModel(state_size, hidden_size, state_size) when the network is built.
state_size = 8
hidden_size = 128
# NOTE: loading pretrained GRU weights is currently disabled (line kept for reference).
# gru.load_state_dict(torch.load('gru.pth'))

# New network: the GRU output is concatenated with the state and fed to the MLP.
class DQN_SP(torch.nn.Module):
    """Q-network whose MLP head sees the state concatenated with a GRU output.

    The module is built from the notebook-level ``observation_length``,
    ``action_length``, ``state_size`` and ``hidden_size`` values that are in
    scope when the class is instantiated.

    Attributes:
        state_size: Width of the last dimension fed to the GRU, captured at
            construction time so ``forward`` does not rely on a literal.
        mlp: Three ``Linear`` layers with ``ReLU`` in between; its input width
            is ``observation_length * 2`` and its output width is
            ``action_length``.
        gru: ``GRUModel(state_size, hidden_size, state_size)``.
    """

    def __init__(self):
        super().__init__()
        # Remember the GRU input width instead of hard-coding it in forward().
        self.state_size = state_size
        # NOTE(review): the input width observation_length * 2 assumes the GRU
        # output is as wide as the observation itself — TODO confirm against
        # GRUModel.
        self.mlp = torch.nn.Sequential(
            torch.nn.Linear(observation_length * 2, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, action_length),
        )
        self.gru = GRUModel(state_size, hidden_size, state_size)

    def forward(self, s):
        """Compute one Q-value per action for each row of ``s``.

        Args:
            s: State tensor; it is reshaped to ``(-1, 1, state_size)`` for the
                GRU and concatenated with the GRU output along ``dim=1``
                (assumes ``s`` is ``(batch, state_size)`` — TODO confirm).

        Returns:
            The MLP output, with ``action_length`` values per row.
        """
        # Present each state to the GRU with a middle dimension of size 1.
        gru_features = self.gru(s.reshape(-1, 1, self.state_size))
        mlp_input = torch.cat([s, gru_features], dim=1)
        return self.mlp(mlp_input)


# Online network and its delayed copy, starting from identical weights.
model = DQN_SP()

model_delay = DQN_SP()
# DQN_SP only holds `mlp` and `gru`, so one whole-model copy covers both.
model_delay.load_state_dict(model.state_dict())

# Freeze the GRU parameters in both networks.
# NOTE(review): no pretrained GRU weights are loaded in the visible code (the
# load line in the setup above is commented out), so unless GRUModel loads them
# itself the GRU stays frozen at its initial values — confirm this is intended.
for net in (model, model_delay):
    net.gru.requires_grad_(False)

# Smoke test: push one random state through the online network.
s = torch.randn(1, 8)
print(model(s))

tensor([[ 0.0331,  0.1150, -0.0534, -0.1486,  0.0619]],
       grad_fn=<AddmmBackward0>)


# 训练

In [28]:
from torch.utils.tensorboard import SummaryWriter
# Build the controller from the online model and the environment, and the pool
# that train() calls update() / sample() on.
controller = Controller(model, env)
pool = Pool(controller)
# Training
def train(
    total_steps=50_000_000,
    updates_per_collect=200,
    log_interval=100_000,
    lr=2e-4,
    gamma=0.99,
    eval_episodes=20,
    log_dir="./logs/DQN_SP",
):
    """Train the notebook-level ``model`` using ``pool`` and ``model_delay``.

    Each outer iteration calls ``pool.update()`` and adds its return value to
    the step counter (presumably the number of environment steps collected —
    confirm against Pool), then runs ``updates_per_collect`` gradient steps on
    batches from ``pool.sample()``. The target uses the online model to pick
    the next action and the delayed model to evaluate it. Once at least
    ``log_interval`` steps have passed since the last log, the delayed model is
    synced to the online model, the model is evaluated, and the results are
    written to TensorBoard.

    Args:
        total_steps: Stop once the accumulated step counter reaches this value.
        updates_per_collect: Gradient steps run after every ``pool.update()``.
        log_interval: Minimum number of steps between sync / evaluation / logging.
        lr: Learning rate for Adam.
        gamma: Discount factor applied to the bootstrapped target.
        eval_episodes: Number of ``play(mode="test")`` calls averaged per
            evaluation; must be positive.
        log_dir: Directory passed to ``SummaryWriter``.

    Returns:
        None. Progress is printed and written to TensorBoard.
    """
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = torch.nn.MSELoss()
    n_step = 0
    last_log_step = 0
    loss = None  # stays None only if updates_per_collect is 0
    writer = SummaryWriter(log_dir)
    # try/finally so the writer is closed even when the run is stopped with a
    # KeyboardInterrupt (the usual way a long notebook run ends).
    try:
        while n_step < total_steps:
            n_step += pool.update()
            # After each data update, run a fixed number of gradient steps.
            for _ in range(updates_per_collect):
                # Sample a batch of transitions.
                state, action, reward, next_state, terminated = pool.sample()
                # Q-value of the action that was actually taken.
                value = model(state).gather(dim=1, index=action)
                with torch.no_grad():
                    # The online model picks the action and the delayed model
                    # scores it, which reduces bootstrapping bias.
                    next_action = model(next_state).argmax(dim=1, keepdim=True)
                    target = model_delay(next_state).gather(dim=1, index=next_action)
                target = target * gamma * (1 - terminated) + reward
                loss = loss_fn(value, target)
                # Clear gradients before backward so gradients left over from
                # an interrupted earlier run cannot leak into this step.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Sync the delayed model, evaluate and log.
            if (n_step - last_log_step) >= log_interval:
                model_delay.load_state_dict(model.state_dict())
                # Average the last element returned by each test-mode play()
                # (presumably the episode return — confirm against Controller).
                test_result = (
                    sum(
                        pool.controller.play(mode="test")[-1]
                        for _ in range(eval_episodes)
                    )
                    / eval_episodes
                )
                print(f"step:{n_step},test_result:{test_result}")
                last_log_step = n_step
                # Write the step count, test result and latest loss to TensorBoard.
                writer.add_scalar("Step", n_step, global_step=n_step)
                writer.add_scalar("Test Result", test_result, global_step=n_step)
                if loss is not None:
                    writer.add_scalar("Loss", loss.item(), global_step=n_step)
    finally:
        writer.close()
# Start training. This runs until train()'s step budget is reached; interrupt
# the kernel to stop early.
train()

step:101102,test_result:-71.47250000000001
step:202133,test_result:-84.66
step:303837,test_result:-133.305
step:404828,test_result:-82.555
step:506183,test_result:-82.94000000000001
step:607226,test_result:-37.00749999999999
step:708402,test_result:-36.5775
step:809988,test_result:-129.73
step:911578,test_result:-108.82250000000003
step:1012836,test_result:-78.92


KeyboardInterrupt: 

In [None]:
# Save the model parameters (state_dict) to DQN.pth.
torch.save(model.state_dict(), "DQN.pth")

In [None]:
# Load the parameters stored in DQN.pth into the existing model instance.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load("DQN.pth"))

# 测试

In [None]:
# Recreate the environment and controller, then call play() once in test mode
# with show=True (presumably renders the rollout — confirm against Controller).
env = RobotEnv(screen_width=400, screen_height=400)
controller = Controller(model, env)
controller.play(mode="test", show=True)