In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import deque
import random
import torch.optim as optim
import numpy as np
import gym
import os
import torch
import numpy as np
import random
import Env

In [2]:
# 超参数
train_eps = 6000
test_eps = 20
max_steps = 5001
gamma = 0.99
batch_size = 128
device = torch.device('cuda')
tau = 1e-3

In [3]:
class Actor(nn.Module): #定义Actor网络
    def __init__(self, num_states, num_actions):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(num_states, 32)
        self.linear2 = nn.Linear(32, 64)
        self.linear3 = nn.Linear(64, 128)
        self.linear4 = nn.Linear(128, num_actions)

    def forward(self, x):
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = F.relu(self.linear3(x))
        x = self.linear4(x)
        return x

In [4]:
class Critic(nn.Module): #定义Critic网络
    def __init__(self, num_state_action, num_action_value = 1):
        super(Critic, self).__init__()
        self.linear1 = nn.Linear(num_state_action, 64)
        self.linear2 = nn.Linear(64, 128)
        self.linear3 = nn.Linear(128, num_action_value)

    def forward(self, state, action):
        # 按维数1拼接
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x

In [5]:
class ReplayBuffer:
    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self.buffer = deque(maxlen=self.capacity)
    def push(self,transitions):
        '''_summary_
        Args:
            trainsitions (tuple): _description_
        '''
        self.buffer.append(transitions)
    def sample(self, batch_size: int, sequential: bool = False):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)
        if sequential: # sequential sampling
            rand = random.randint(0, len(self.buffer) - batch_size)
            batch = [self.buffer[i] for i in range(rand, rand + batch_size)]
            return zip(*batch)
        else:
            batch = random.sample(self.buffer, batch_size)
            return zip(*batch)
    def clear(self):
        self.buffer.clear()
    def __len__(self):
        return len(self.buffer)

In [6]:
class DDPG:
    def __init__(self, device, action_space, state_space, batch_size, gamma, tau):
        self.device = device
        self.critic = Critic(action_space+state_space,1).to(device)
        self.actor = Actor(state_space,action_space).to(device)
        self.target_critic = Critic(action_space+state_space,1).to(device)
        self.target_actor = Actor(state_space,action_space).to(device)

        # 复制参数到目标网络
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.memory = ReplayBuffer(capacity= 100000)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def sample_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        action = self.actor(state)
        return action.detach().cpu().numpy()

    @torch.no_grad()
    def predict_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        action[0] = np.clip(action[0],a_min=-24,a_max=10.5)
        action[1] = np.clip(action[1],a_min=-25,a_max=25)
        action[2] = np.clip(action[2],a_min=-30,a_max=30)
        action[3] = np.clip(action[3],a_min=0,a_max=1)
        return action.cpu().numpy()

    def update(self):
        if len(self.memory) < self.batch_size:  # 当memory中不满足一个批量时，不更新策略
            return
        # 从经验回放中中随机采样一个批量的transition
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)
        # 转变为张量

        state= torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)

        # 计算actor_loss
        actor_loss = self.critic(state, self.actor(state))
        actor_loss = - actor_loss.mean()

        # print(actor_loss,actor_loss.shape)

        # 计算下一时刻的预测动作价值
        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())

        # 计算y_t
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        # 计算critic_loss
        actual_value = self.critic(state, action)
        critic_loss = nn.MSELoss()(actual_value, expected_value.detach())

        # print(critic_loss, critic_loss.shape)

        # 反向传播
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # 软更新
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - self.tau) +
                param.data * self.tau
            )
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(
                target_param.data * (1.0 - self.tau) +
                param.data * self.tau
            )

In [7]:
# class OUNoise(object): # Ornstein–Uhlenbeck噪声
#     def __init__(self, action_space, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
#         self.mu           = mu # OU噪声的参数
#         self.theta        = theta # OU噪声的参数
#         self.sigma        = max_sigma # OU噪声的参数
#         self.max_sigma    = max_sigma
#         self.min_sigma    = min_sigma
#         self.decay_period = decay_period
#         self.n_actions   = action_space
#         self.reset()
#     def reset(self):
#         self.obs = np.ones(self.n_actions) * self.mu
#     def evolve_obs(self):
#         x  = self.obs
#         dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.n_actions)
#         self.obs = x + dx
#         return self.obs
#     def get_action(self, action, t=0):
#         ou_obs = self.evolve_obs()
#         self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period) # sigma会逐渐衰减
#         action = action + ou_obs
#         action[0] = np.clip(action[0],a_min=-24,a_max=10.5)
#         action[1] = np.clip(action[1],a_min=-25,a_max=25)
#         action[2] = np.clip(action[2],a_min=-30,a_max=30)
#         action[3] = np.clip(action[3],a_min=0,a_max=1)
#         return torch.tensor(action,dtype=torch.float32)

In [8]:
def test(env, agent, test_eps, max_steps):
    print("Start Testing")
    rewards = [] # 记录所有回合的奖励
    for episode in range(test_eps):
        state = env.reset()
        ep_reward = 0
        for step in range(max_steps):
            action = agent.predict_action(state)
            next_state, reward, done = env.step(action,0.1)
            ep_reward += reward
            state = next_state
            if done:
                break
        rewards.append(ep_reward)
        print(f"Episode：{episode+1}/{test_eps}，Reward：{ep_reward:.2f}")
    print("Testing Complete")
    return rewards

In [9]:
def train(env, agent, train_eps, max_steps):
    print("Start Training")
    rewards = [] # 记录所有回合的奖励
    for episode in range(train_eps):
        state = env.reset()
        ep_reward = 0
        x = []
        y = []
        z = []
        for step in range(max_steps):
            action = agent.sample_action(state)
            action[0] = np.clip(action[0],a_min=-24,a_max=10.5)
            action[1] = np.clip(action[1],a_min=-25,a_max=25)
            action[2] = np.clip(action[2],a_min=-30,a_max=30)
            action[3] = np.clip(action[3],a_min=0,a_max=1)
            # print(action)
            next_state, reward, done = env.step(action,0.1)
            # print(reward)
            ep_reward += reward
            x.append(env.x_b)
            y.append(env.y_b)
            z.append(env.z_b)
            agent.memory.push((state, action, reward, next_state, done))
            agent.update()
            state = next_state
            if done:
                break
        # print(ep_reward)
        if (episode+1)%10 == 0:
            print(f"Episode：{episode+1}/{train_eps}，Reward：{ep_reward:.2f}")
        rewards.append(ep_reward)
        print("X",x)
        print('--------------------------------------------------')
        print("Y",y)
        print('--------------------------------------------------')
        print("Z",z)
        print('--------------------------------------------------')
    print("Training Complete")
    return rewards

In [10]:
def draw(rewards,tag):
    sns.set(style='whitegrid')
    fig = sns.relplot(y= rewards, kind= 'line', tag=tag)
    plt.legend()

In [11]:
env = Env.Plane(1,65,40,0,100)

agent = DDPG(device, env.action_space, env.state_space, batch_size, gamma, tau)
train_res = train(env,agent,train_eps,max_steps)
test_res = test(env,agent,test_eps,max_steps)



# 画出结果
draw(train_res,tag="train")
draw(test_res,tag="test")

Start Training
X [0.00011000000047497452, 0.00010953136222960893, 0.013213284690552973, 0.04681450911300781, 0.11319304295318725, 0.21222756691711547, 0.34376152344482547, 0.5076001483990491, 0.7035135466648877, 0.9312455851628125, 1.1905048091008008, 1.4809640605046095, 1.8022599656178298, 2.1539932210041823, 2.5357294518544022, 2.9470007378651446, 3.387308831889803, 3.8561297852589433, 4.352920814235384, 4.877130647380525, 5.428212829311067, 6.005644651134188, 6.608950992305452, 7.237735982616121, 7.891722293574983, 8.570800681789095, 9.275091500957185, 10.00501761027115, 10.761389919002228, 11.545504852016144, 12.35925321169632, 13.20523824282425, 14.086892028529816, 15.00857553072708, 15.975650115688017, 16.994469161708526, 18.072248740871125, 19.216741271693877, 20.435610957820586, 21.735377689082792, 23.11976642199295, 24.587382884700467, 26.1285318333699, 27.721277232845, 29.32709540911453, 30.886931415279083, 32.33740758486526, 33.561644073207546, 34.42610148974197, 34.79383163

  state= torch.FloatTensor(state).to(self.device)


X [0.00011000000047497452, 0.00010953136222960893, 0.013213284690552973, 0.04681450911300781, 0.11319304295318725, 0.21222756691711547, 0.34376152344482547, 0.5076001483990491, 0.7035135466648877, 0.9312455851628125, 1.1905048091008008, 1.4809640605046095, 1.8022599656178298, 2.1539932210041823, 2.5357294518544022, 2.9470007378651446, 3.387308831889803, 3.8561297852589433, 4.352920814235384, 4.877130647380525, 5.428212829311067, 6.005652805049593, 6.6089989144398515, 7.2379069287373365, 7.892200036723787, 8.571948905666048, 9.248704333980257, 9.919514938075716, 10.586142821986849, 11.251274295528109, 11.918789048869783, 12.594028754909212, 13.283926339824374, 13.997297807414705, 14.745030971248324, 15.540159602840122, 16.397726817805943, 17.3343935925557, 18.367755885799106, 19.513374038417513, 20.781526084621127, 22.17278613634842, 23.672608943660432, 25.24560575075882, 26.830855365474395, 28.340122028072052, 29.66602878161209, 30.673412509639434, 30.116211791713408, 30.12377481051223

KeyboardInterrupt: 