### Demo_DDPG

In [1]:
import numpy as np
import copy
import random
from collections import deque
import os
import time

import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

### 网络定义

In [2]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) transitions.

    Transitions live in a ``deque(maxlen=capacity)``, so once the buffer is
    full every ``add`` silently evicts the oldest transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        self.capacity = capacity
        self.buffer = deque(maxlen=self.capacity)

    def add(self, state, action, reward, next_state, done):
        """Append one transition as a 5-tuple."""
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of ``torch.FloatTensor`` objects
            ``(states, actions, rewards, next_states, dones)``; each field is
            stacked along a new leading batch axis.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        return (
            torch.FloatTensor(np.array(states)),
            torch.FloatTensor(np.array(actions)),
            torch.FloatTensor(np.array(rewards)),
            torch.FloatTensor(np.array(next_states)),
            torch.FloatTensor(np.array(dones))
            )

    def save(self, file_path):
        """Serialize every stored transition to ``file_path`` with ``np.save``.

        Missing parent directories are created first, because ``np.save`` does
        not create them and would otherwise raise ``FileNotFoundError``.
        Open file objects are passed through to ``np.save`` untouched.
        """
        if isinstance(file_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(file_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        # dtype=object is essential: the fields of a transition have different
        # shapes, so the array must hold them as Python objects.
        np.save(file_path, np.array(self.buffer, dtype=object))

    def load(self, file_path):
        """Replace the buffer contents with the transitions stored in ``file_path``.

        The loaded data is wrapped in ``deque(..., maxlen=self.capacity)``, so
        if the file holds more than ``capacity`` transitions only the last
        ``capacity`` of them are kept.
        """
        # SECURITY: allow_pickle=True unpickles arbitrary objects and can
        # execute code embedded in the file. Only load files you created.
        data = np.load(file_path, allow_pickle=True)
        self.buffer = deque(data.tolist(), maxlen=self.capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [3]:
# Actor: deterministic policy, state -> action
class Actor(nn.Module):
    """MLP policy: three 256-unit ReLU hidden layers followed by a Tanh head.

    The Tanh output lies in [-1, 1] and is multiplied by ``max_action`` in
    ``forward``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        width = 256
        stack = [nn.Linear(state_dim, width), nn.ReLU()]
        for _ in range(2):
            stack.extend((nn.Linear(width, width), nn.ReLU()))
        # Tanh squashes the raw output into [-1, 1].
        stack.extend((nn.Linear(width, action_dim), nn.Tanh()))
        self.model = nn.Sequential(*stack)
        self.max_action = max_action

    def forward(self, state):
        """Return the Tanh-squashed network output scaled by ``max_action``."""
        squashed = self.model(state)
        return squashed * self.max_action

In [4]:
# Critic: action-value estimate, (state, action) -> scalar Q
class Critic(nn.Module):
    """MLP Q-function: three 256-unit ReLU hidden layers and a linear scalar head.

    ``forward`` concatenates state and action along dim 1 before the first
    layer, so both inputs are expected to be batched 2-D tensors.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        width = 256
        stack = [nn.Linear(state_dim + action_dim, width), nn.ReLU()]
        for _ in range(2):
            stack.extend((nn.Linear(width, width), nn.ReLU()))
        stack.append(nn.Linear(width, 1))
        self.model = nn.Sequential(*stack)

    def forward(self, state, action):
        """Return Q(state, action) with one value per batch row."""
        joint = torch.cat((state, action), dim=1)
        return self.model(joint)

In [5]:
class DDPGAgent:
    """DDPG agent: an actor and a critic, each with a slowly tracking target copy.

    Args:
        state_dim: size of the state vector fed to the actor and critic.
        action_dim: size of the action vector produced by the actor.
        max_action: bound used to scale the actor output and to clip noisy actions.
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.
        gamma: discount factor used in the TD target.
        tau: interpolation factor for the soft target update.
        device: torch device; when None, CUDA is used if available, else CPU.
    """

    def __init__(self, state_dim, action_dim, max_action,
                 actor_lr=1e-3, critic_lr=1e-3, gamma=0.99, tau=0.005,
                 device=None):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
        print(f"training on: {self.device}")

        self.gamma = gamma
        self.tau = tau
        self.max_action = max_action

        # Actor network; the target starts as an exact copy.
        self.actor = Actor(state_dim, action_dim, max_action).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)

        # Critic network; the target starts as an exact copy.
        self.critic = Critic(state_dim, action_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

    def select_action(self, state, noise_std=0.1):
        """Return the actor's action for a single state, optionally with Gaussian noise.

        Args:
            state: one unbatched state (array-like); a batch axis is added here.
            noise_std: standard deviation of the zero-mean Gaussian exploration
                noise. Pass 0 for a deterministic action.

        Returns:
            1-D numpy array clipped to ``[-max_action, max_action]``.
        """
        state_t = torch.as_tensor(
            np.asarray(state, dtype=np.float32), device=self.device
        ).unsqueeze(0)
        # Inference only: no_grad avoids building an autograd graph every step.
        with torch.no_grad():
            action = self.actor(state_t).cpu().numpy().flatten()
        if noise_std > 0:
            action += np.random.normal(0, noise_std, size=action.shape)
        return np.clip(action, -self.max_action, self.max_action)

    def train(self, replay_buffer, batch_size=64):
        """Run one critic update, one actor update and one soft target update.

        Does nothing (returns None) while the buffer holds fewer than
        ``batch_size`` transitions.
        """
        if len(replay_buffer) < batch_size:
            return

        # Sample a batch of (s, a, r, s', done) transitions.
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
        states = states.to(self.device)
        actions = actions.to(self.device)
        # unsqueeze(1) turns the per-sample rewards/dones into column vectors
        # so they line up with the critic's (batch, 1) output.
        rewards = rewards.to(self.device).unsqueeze(1)
        next_states = next_states.to(self.device)
        dones = dones.to(self.device).unsqueeze(1)

        # TD target: r + gamma * (1 - done) * Q_target(s', mu_target(s')).
        # Computed without gradients so the target networks are never trained.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            target_q = self.critic_target(next_states, next_actions)
            target = rewards + self.gamma * (1 - dones) * target_q

        # Critic update: regress Q(s, a) onto the TD target.
        current_q = self.critic(states, actions)
        critic_loss = F.mse_loss(current_q, target)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor update: maximize Q(s, mu(s)), i.e. minimize its negation.
        # This backward pass also leaves gradients on the critic parameters;
        # they are cleared by critic_optimizer.zero_grad() on the next call.
        actor_loss = -self.critic(states, self.actor(states)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Let both target networks track the freshly updated online networks.
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)

    def soft_update(self, target_net, source_net):
        """Blend in place: target <- tau * source + (1 - tau) * target.

        Only ``parameters()`` are blended; module buffers are not touched.
        """
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.mul_(1.0 - self.tau).add_(param, alpha=self.tau)


### 测试环境选择

In [6]:
# basic
# Classic-control environment ids; leave exactly one assignment active.
# NOTE(review): the following cell assigns `game` again, so under Run All
# this selection is overridden.
# game = "Pendulum-v1"
game = "MountainCarContinuous-v0"

In [7]:
# mujoco series
# Final value of `game` under Run All; this is the environment id passed to gym.make below.
game = "HalfCheetah-v4"

In [8]:
# Build the environment and read the sizes the networks need from its spaces.
env = gym.make(game)
state_dim = env.observation_space.shape[0]      # state dimension, e.g. 3
action_dim = env.action_space.shape[0]          # action dimension, e.g. 1
# Only the first component of `high` is read, which assumes every action
# dimension shares the same bound and that the bound is symmetric about zero
# (TODO confirm for each environment).
max_action = float(env.action_space.high[0])    # maximum action magnitude, e.g. 2.0

In [9]:
# Echo the selected environment and the dimensions derived from it.
print(f"game: {game}, state_dim: {state_dim}, action_dim: {action_dim}, max_action: {max_action}\n")

game: HalfCheetah-v4, state_dim: 17, action_dim: 6, max_action: 1.0



### 网络定义 训练

In [10]:
# Build the agent; actor_lr (5e-4) is set below the constructor default (1e-3),
# the remaining hyperparameters equal the defaults.
agent = DDPGAgent(state_dim, action_dim, max_action, actor_lr=5e-4, critic_lr=1e-3, gamma=0.99, tau=0.005)

training on: cuda


In [11]:
# Experience store shared by warm-up and training; holds at most 1,000,000 transitions.
replay_buffer = ReplayBuffer(capacity=1000000)

超参数设定

In [12]:
# Run-level settings shared by every experiment.
env = gym.make(game) # rebuild the environment if needed
num_episodes = 1000  # number of training episodes
batch_size = 256     # transitions sampled per agent.train call
reward_list = []     # per-episode total reward, appended by the training loop

不同实验参数

In [None]:
# game = "Pendulum-v1"
max_steps = 300   # per-episode step cap used by the rollout loops
noise_rate = 0.1  # std of the Gaussian exploration noise passed to select_action
# Warm-up is not needed for this task. The warm-up settings are still defined
# (as zero) so the warm-up cell is a no-op instead of raising NameError on a
# fresh kernel or silently reusing values left over from another experiment.
warm_up_round = 0
warm_up_steps = 0
# warm up not needed

In [13]:
# game = "MountainCarContinuous-v0"
# Experiment-specific settings; run only the cell matching the selected `game`.
max_steps = 1000      # per-episode step cap used by the rollout loops
noise_rate = 0.6      # std of the Gaussian exploration noise passed to select_action
warm_up_round = 50    # number of random-policy episodes in the warm-up cell
warm_up_steps = 5000  # NOTE(review): not read by any cell visible in this notebook

In [13]:
# game = "HalfCheetah-v4"
# Experiment-specific settings; run only the cell matching the selected `game`.
max_steps = 1000       # per-episode step cap used by the rollout loops
noise_rate = 0.2       # std of the Gaussian exploration noise passed to select_action
warm_up_round = 20     # number of random-policy episodes in the warm-up cell
warm_up_steps = 10000  # NOTE(review): not read by any cell visible in this notebook

warm up(非必须)

对于奖励延迟（Delayed Reward）且成功信号稀疏（Sparse Success）的任务（例如 MountainCarContinuous），策略通常要先偶然成功若干次才会出现跳跃式提升，因此先用随机动作预热经验池是必要的。

In [18]:
# Warm-up: pre-fill the replay buffer with transitions from a uniformly random policy.
i = 0
t_inner = 0  # defined up front so the summary print works when warm_up_round == 0
for i in range(warm_up_round):
    state, _ = env.reset()
    done = False
    t_inner = 0
    while not done and t_inner < max_steps:
        action = env.action_space.sample()  # fully random action
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Store only true termination as the done flag: a time-limit truncation
        # is not a terminal state, so the critic should still bootstrap from it.
        replay_buffer.add(state, action, reward, next_state, float(terminated))
        state = next_state

        # Stop the rollout on either signal; stepping an environment after it
        # reported truncation is not allowed without a reset.
        done = terminated or truncated
        t_inner += 1

# "sample num" reports the step count of the last warm-up episode only.
print(f"warn up for {i+1} round, sample num: {t_inner}, buffer_len: {len(replay_buffer)}")

warn up for 20 round, sample num: 1000, buffer_len: 100000


In [19]:
# Main training loop: act with exploration noise, store the transition, and
# run one agent update every 5 environment steps.
for episode in range(num_episodes):
    state, _ = env.reset()
    total_reward = 0
    t = 0
    done = False

    while not done and t < max_steps:
        action = agent.select_action(state, noise_std=noise_rate)  # add exploration noise
        next_state, reward, terminated, truncated, _ = env.step(action)

        # Store only true termination as the done flag: a time-limit truncation
        # is not a terminal state, so the TD target should still bootstrap.
        replay_buffer.add(state, action, reward, next_state, float(terminated))
        state = next_state
        total_reward += reward

        if t % 5 == 0:
            agent.train(replay_buffer, batch_size)
        t += 1

        # End the episode on either signal; stepping an environment after it
        # reported truncation is not allowed without a reset.
        done = terminated or truncated

    reward_list.append(total_reward)

    if episode % 10 == 0:
        print(f"Episode {episode}, Reward: {total_reward:.2f}, step in this episide: {t}")

Episode 0, Reward: -444.72, step in this episode 1000
Episode 10, Reward: -475.05, step in this episode 1000
Episode 20, Reward: 619.07, step in this episode 1000
Episode 30, Reward: 1073.15, step in this episode 1000
Episode 40, Reward: 375.30, step in this episode 1000
Episode 50, Reward: 3195.97, step in this episode 1000
Episode 60, Reward: 3473.93, step in this episode 1000
Episode 70, Reward: 3335.27, step in this episode 1000
Episode 80, Reward: 4020.55, step in this episode 1000
Episode 90, Reward: 4657.38, step in this episode 1000
Episode 100, Reward: 4851.97, step in this episode 1000
Episode 110, Reward: 3000.96, step in this episode 1000
Episode 120, Reward: 5428.03, step in this episode 1000
Episode 130, Reward: 2460.15, step in this episode 1000
Episode 140, Reward: 5578.26, step in this episode 1000
Episode 150, Reward: 5538.78, step in this episode 1000
Episode 160, Reward: 3551.37, step in this episode 1000
Episode 170, Reward: 5871.05, step in this episode 1000
Episo

In [21]:
# Sanity check: number of transitions currently held in the replay buffer.
print(len(replay_buffer))

1000000


### 动画实际演示

In [23]:
# Recreate the environment with on-screen rendering.
env = gym.make(game, render_mode="human")

# NOTE: this reassigns the training-time `num_episodes`; restore it before
# re-running the training cell.
num_episodes = 1

# Roll out the learned policy without exploration noise. DDPGAgent has no
# epsilon attribute, so greediness is controlled solely by noise_std=0.0.
try:
    for ep in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        t = 0

        while not done and t < max_steps:
            action = agent.select_action(state, noise_std=0.0)
            state, reward, terminated, truncated, _ = env.step(action)
            # Stop on true termination or on time-limit truncation.
            done = terminated or truncated
            total_reward += reward
            t += 1

            # Draw the current frame in the render window.
            env.render()
            if t % 50 == 0:
                print(f"t = {t}, total_reward: {total_reward}")

        print(f"Episode {ep}, Reward: {total_reward}")
finally:
    # Always release the render window, even on an exception or interrupt.
    env.close()

t = 50, total_reward: 160.61568894756525
t = 100, total_reward: 604.506177932148
t = 150, total_reward: 1189.7368938019317
t = 200, total_reward: 1808.2301714178893
t = 250, total_reward: 2446.8659579739424
t = 300, total_reward: 3010.5387280742407
t = 350, total_reward: 3623.204237717452
t = 400, total_reward: 4207.047516260323
t = 450, total_reward: 4826.550915406012
t = 500, total_reward: 5514.469879357395
t = 550, total_reward: 6146.223352075212
t = 600, total_reward: 6787.770211345009
t = 650, total_reward: 7456.653418422533
t = 700, total_reward: 8058.802065633286
t = 750, total_reward: 8683.676346806109
t = 800, total_reward: 9294.872777812425
t = 850, total_reward: 9945.265529205937
t = 900, total_reward: 10584.671331676735
t = 950, total_reward: 11204.844287076165
t = 1000, total_reward: 11441.490042844776
Episode 0, Reward: 11441.490042844776


### 保存Buffer

In [24]:
# Destination of the saved replay buffer, relative to the current working directory.
buffer_path = "data/DDPGgenerated_offline_dataset.npy"

In [25]:
# Persist all collected transitions as an offline dataset and report how many were written.
replay_buffer.save(buffer_path)
print(f"replay buffer saved at: {buffer_path}, sample num: {len(replay_buffer)}")


replay buffer saved at: data/DDPGgenerated_offline_dataset.npy, sample num: 1000000


In [26]:
# Round-trip check: load the saved dataset into a fresh buffer.
# NOTE(review): the capacity here (100000) is ten times smaller than that of the
# buffer that was saved (1000000). ReplayBuffer.load wraps the data in
# deque(maxlen=capacity), so only the last 100000 transitions are kept.
# Confirm the truncation is intended.
new_buffer = ReplayBuffer(capacity=100000)
print(len(new_buffer))
new_buffer.load(buffer_path)
print(len(new_buffer))

0
100000


In [27]:
# Smoke test: draw one batch of 64 transitions from the reloaded buffer.
# The cell displays the full tuple of five tensors as its output.
new_buffer.sample(64)

(tensor([[  0.0644,  -0.3848,  -0.5941,  ...,  -0.6222,   0.7229,  -0.8768],
         [  0.0525,  -0.0702,   0.3193,  ...,  -7.6673,  -6.6357, -10.9752],
         [ -0.0258,  -0.1489,  -0.2472,  ..., -14.6382, -18.7319,  -8.1949],
         ...,
         [  0.1260,  -0.1067,   0.4556,  ...,  -1.2596, -19.8739, -14.8173],
         [  0.0996,   0.2511,   0.1155,  ...,  -3.4950,  19.7792,  -2.8418],
         [  0.0940,   0.1420,   0.2458,  ...,  10.8364,  18.7656,  10.8135]]),
 tensor([[-1.0000,  0.9429, -0.8383, -1.0000, -1.0000, -0.7076],
         [ 0.5264,  1.0000,  1.0000,  1.0000, -0.9785, -0.6702],
         [ 0.9912, -0.9062,  0.6992, -0.7888, -0.8813, -1.0000],
         [-0.8675, -1.0000, -0.6957, -0.6425,  0.9098,  0.6326],
         [ 1.0000, -0.4665,  0.8105, -0.3070, -1.0000,  0.9728],
         [ 1.0000,  1.0000,  1.0000, -1.0000, -1.0000, -0.7482],
         [ 0.9882,  0.7250,  1.0000, -1.0000, -0.8690, -1.0000],
         [ 0.9413, -1.0000,  1.0000, -1.0000,  1.0000, -1.0000],
  