In [1]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

In [2]:
# 定义Q网络
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNetwork, self).__init()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        q_values = self.fc3(x)
        return q_values

In [3]:
# 定义经验回放缓冲区
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
    
    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) == self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

In [4]:
# 定义DQN智能体
class DQNAgent:
    def __init__(self, state_dim, action_dim, capacity, batch_size, lr, gamma, epsilon):
        self.q_network = QNetwork(state_dim, action_dim)
        self.target_q_network = QNetwork(state_dim, action_dim)
        self.target_q_network.load_state_dict(self.q_network.state_dict())
        self.replay_buffer = ReplayBuffer(capacity)
        self.batch_size = batch_size
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.gamma = gamma
        self.epsilon = epsilon

    def select_action(self, state):
        if random.random() < self.epsilon:
            return random.randint(0, action_dim - 1)
        else:
            q_values = self.q_network(state)
            return torch.argmax(q_values).item()
    
    def update(self):
        if len(self.replay_buffer.buffer) < self.batch_size:
            return

        samples = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*samples)
        
        states = torch.tensor(states, dtype=torch.float)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float)
        next_states = torch.tensor(next_states, dtype=torch.float)
        dones = torch.tensor(dones, dtype=torch.float)

        q_values = self.q_network(states)
        next_q_values = self.target_q_network(next_states)

        target_q_values = rewards + (1 - dones) * self.gamma * torch.max(next_q_values, dim=1)[0]
        loss = nn.MSELoss()(q_values.gather(1, actions.unsqueeze(1)), target_q_values.unsqueeze(1))

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [6]:
# 初始化Gym环境
env = gym.make('Walker2d-v2')

# 获取状态和动作空间维度
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# 初始化DQN智能体
agent = DQNAgent(state_dim, action_dim, capacity=10000, batch_size=64, lr=0.001, gamma=0.99, epsilon=0.1)

  f"The environment {id} is out of date. You should consider "
  "This version of the mujoco environments depends "


AttributeError: 'super' object has no attribute '_QNetwork__init'

In [None]:
# 训练
num_episodes = 1000

for episode in range(num_episodes):
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float)
    total_reward = 0

    while True:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = torch.tensor(next_state, dtype=torch.float)
        agent.replay_buffer.push(state, action, reward, next_state, done)
        state = next_state
        agent.update()
        total_reward += reward

        if done:
            break

    if episode % 10 == 0:
        agent.target_q_network.load_state_dict(agent.q_network.state_dict())

    print(f"Episode: {episode}, Total Reward: {total_reward}")

In [None]:
# 测试
test_episodes = 10

for _ in range(test_episodes):
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float)
    total_reward = 0

    while True:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = torch.tensor(next_state, dtype=torch.float)
        state = next_state
        total_reward += reward

        if done:
            break

    print(f"Test Episode, Total Reward: {total_reward}")