In [42]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [43]:
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_dim)
    
    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


In [44]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
    
    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) == self.capacity:
            self.buffer.pop(0)
        self.buffer.append((state, action, reward, next_state, done))
    
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    
    def __len__(self):
        return len(self.buffer)


In [45]:
# 定义深度Q学习代理
class DQNAgent:
    def __init__(self, state_dim, action_dim, buffer_capacity=10000):
        self.q_network = QNetwork(state_dim, action_dim)
        self.target_network = QNetwork(state_dim, action_dim)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.buffer = ReplayBuffer(buffer_capacity)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=0.001)
    
    def select_action(self, state, epsilon):
        if random.random() < epsilon:
            return random.randint(0, action_dim - 1)
        else:
            with torch.no_grad():
                q_values = self.q_network(state)
                return torch.argmax(q_values).item()
    
    def update_q_network(self, batch_size, gamma):
        if len(self.buffer) < batch_size:
            return
        
        transitions = self.buffer.sample(batch_size)
        batch = list(zip(*transitions))
        
        state_batch = torch.stack(batch[0])
        action_batch = torch.tensor(batch[1], dtype=torch.long)  # 将整数值转换为张量
        reward_batch = torch.tensor(batch[2], dtype=torch.float32)

        next_state_batch = torch.stack(batch[3])
        done_batch = torch.tensor(batch[4], dtype=torch.int32)


        
        
        q_values = self.q_network(state_batch)
        next_q_values = self.target_network(next_state_batch)
        
        target_q_values = reward_batch + (1 - done_batch) * gamma * torch.max(next_q_values, dim=1).values
        
        loss = nn.MSELoss()(q_values.gather(1, action_batch.unsqueeze(1)), target_q_values.unsqueeze(1))
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.target_network.load_state_dict(self.q_network.state_dict())

In [46]:
# 主训练循环
env = gym.make('InvertedDoublePendulum-v2')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
agent = DQNAgent(state_dim, action_dim)

num_episodes = 1000
batch_size = 64
epsilon = 0.1
gamma = 0.99

for episode in range(num_episodes):
    state = env.reset()
    state = torch.tensor(state, dtype=torch.float32)
    done = False
    total_reward = 0

    while not done:
        action = agent.select_action(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        next_state = torch.tensor(next_state, dtype=torch.float32)
        agent.buffer.push(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

    agent.update_q_network(batch_size, gamma)
    
    print(f"Episode {episode+1}, Total Reward: {total_reward}")


Episode 1, Total Reward: 110.97026840733221
Episode 2, Total Reward: 101.71596762130999
Episode 3, Total Reward: 101.15432315754218
Episode 4, Total Reward: 91.7662225113128
Episode 5, Total Reward: 82.89536978894913
Episode 6, Total Reward: 119.76150838182193
Episode 7, Total Reward: 82.68979789493235
Episode 8, Total Reward: 92.01234014571934
Episode 9, Total Reward: 82.9089246847353
Episode 10, Total Reward: 119.98734136264964
Episode 11, Total Reward: 82.48982332086923
Episode 12, Total Reward: 91.77328195719448
Episode 13, Total Reward: 91.70271847036388
Episode 14, Total Reward: 92.25016624971556
Episode 15, Total Reward: 92.00862308013342
Episode 16, Total Reward: 129.46264521910575
Episode 17, Total Reward: 82.25670063359291
Episode 18, Total Reward: 73.48887441474602
Episode 19, Total Reward: 91.72000895716057
Episode 20, Total Reward: 82.60800183836409
Episode 21, Total Reward: 73.42485850890196
Episode 22, Total Reward: 82.64661426154419
Episode 23, Total Reward: 91.43859677

In [47]:
# 测试控制效果
state = env.reset()
state = torch.tensor(state, dtype=torch.float32)
done = False
total_reward = 0

while not done:
    action = agent.select_action(state, 0)  # 在测试中不再探索
    next_state, reward, done, _ = env.step(action)
    next_state = torch.tensor(next_state, dtype=torch.float32)
    state = next_state
    total_reward += reward

print(f"Testing Complete, Total Reward: {total_reward}")

Testing Complete, Total Reward: 92.17488756180087
