In [1]:
from turtle import forward
from xml.etree.ElementTree import tostring
import gym
import random
import torch
import torch.nn as TNN
import torch.nn.functional as TF
import numpy as np
from collections import deque
import torch.utils.data as Data
import matplotlib.pyplot as plt

In [2]:
# Hyperparameters and run configuration
ENV_NAME = 'MountainCarContinuous-v0'  # Gym environment id passed to gym.make
BUFFER_SIZE = 10 ** 6                  # maximum number of transitions kept in the replay buffer

GAMMA = 0.99                 # discount applied to the target critic's value in the TD target
BATCHSIZE = 64               # number of transitions sampled per network update
TEST = 5                     # evaluation rollouts averaged at each evaluation point
SAVINGPATH = "./modelDDPG/"  # directory prefix for saved actor checkpoints
EPISODE = 10 ** 4            # upper bound on the number of training episodes
TAU = 1e-3                   # soft-update weight used when tracking the target networks
STEP = 200                   # maximum environment steps per episode

In [3]:
# Ornstein-Uhlenbeck exploration noise generator
class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise process with one component per action dimension.

    Attributes:
        action_dim: number of noise components.
        mu: value the process is pulled towards.
        theta: strength of the pull towards ``mu``.
        sigma: scale of the Gaussian perturbation added at every step.
        X: current state of the process (NumPy array of length ``action_dim``).
    """

    def __init__(self, action_dim, mu = 0, theta = 0.15, sigma = 0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # Start the process at its mean.
        self.reset()

    def reset(self):
        """Put the process state back to ``mu`` in every component."""
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state as a torch tensor."""
        # Deterministic pull back towards the mean.
        pull = self.theta * (self.mu - self.X)
        # Random Gaussian perturbation drawn from NumPy's global RNG.
        kick = self.sigma * np.random.randn(len(self.X))
        self.X = self.X + (pull + kick)
        return torch.Tensor(self.X)

In [4]:
# Replay buffer
class ReplayBuffer():
    """Bounded FIFO store of transitions; the oldest entries are dropped once full.

    Attributes:
        buffer: ``deque`` holding ``[state, action, reward, next_state]`` lists.
        state_dim: first dimension of the environment's observation space.
        action_dim: first dimension of the environment's action space.
    """

    def __init__(self,env,buffersize):
        self.buffer = deque(maxlen=buffersize)
        observation_shape = env.observation_space.shape
        self.state_dim = observation_shape[0]
        action_shape = env.action_space.shape
        self.action_dim = action_shape[0]

    def append(self,content):
        """Store the first four items of ``content`` as one transition."""
        transition = [content[position] for position in range(4)]
        self.buffer.append(transition)

In [5]:
# Sample a minibatch
# and build the regression targets y for the critic
def sample_Batch(replaybuffer,Q_t,u_t,batch_size=BATCHSIZE):
    """Draw a random minibatch and compute the critic's TD targets.

    Args:
        replaybuffer: object whose ``buffer`` holds ``[state, action, reward,
            next_state]`` transitions.
        Q_t: target critic; called as ``Q_t(next_states, next_actions)`` and
            read for ``state_dim`` / ``action_dim``.
        u_t: target actor; its ``net`` maps next states to actions.
        batch_size: number of transitions to sample.

    Returns:
        Tuple ``(tensor_y, tensor_action, tensor_state)`` where ``tensor_y`` is
        ``reward + GAMMA * Q_t(next_state, u_t(next_state))`` with shape
        ``(batch_size, 1)``. ``tensor_y`` carries no autograd graph.

    Raises:
        ValueError: from ``random.sample`` if the buffer holds fewer than
            ``batch_size`` transitions.
    """
    # Uniformly sample batch_size transitions without replacement.
    minibatch = random.sample(replaybuffer.buffer,batch_size)

    state_batch = [data[0] for data in minibatch]
    action_batch = [data[1] for data in minibatch]
    reward_batch = [data[2] for data in minibatch]
    next_state_batch = [data[3] for data in minibatch]

    # Convert to tensors. States go through one contiguous NumPy array first:
    # building a tensor directly from a list of ndarrays is very slow.
    tensor_state = torch.as_tensor(
        np.asarray(state_batch, dtype=np.float32)).reshape(batch_size,Q_t.state_dim)
    tensor_action = torch.Tensor(action_batch).reshape(batch_size,Q_t.action_dim)
    tensor_r = torch.Tensor(reward_batch).reshape(batch_size,1)
    tensor_nextstate = torch.as_tensor(
        np.asarray(next_state_batch, dtype=np.float32)).reshape(batch_size,Q_t.state_dim)

    # The targets are constants for the critic regression, so they are built
    # without recording a graph through the target networks.
    # NOTE(review): transitions store no terminal flag, so the bootstrap term is
    # added for every sample, including ones whose episode ended.
    with torch.no_grad():
        tensor_u_t = u_t.net(tensor_nextstate)
        tensor_y = tensor_r + GAMMA * Q_t(tensor_nextstate,tensor_u_t)

    return tensor_y,tensor_action,tensor_state

In [6]:
# Parameterised policy (actor) network
# Input: a state; output: a deterministic action
class Policy_Net(TNN.Module):
    """Deterministic actor: maps a state to an action in ``[-bound, bound]``.

    Attributes:
        state_dim / action_dim: first dimension of the observation / action space.
        bound / bound_low: ``env.action_space.high`` / ``env.action_space.low``.
        net: MLP (state_dim -> 30 -> 20 -> action_dim) ending in ``tanh``.
        OU_Noise: exploration noise process added by ``action_with_noise``.
        optimizer: Adam optimizer over ``net``'s parameters (lr 1e-4).
    """

    def __init__(self,env):
        super().__init__()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.bound = env.action_space.high
        self.bound_low = env.action_space.low
        self.net = TNN.Sequential()
        self.net.add_module("fc1",TNN.Linear(self.state_dim,30))
        self.net.add_module("relu1",TNN.ReLU())
        self.net.add_module("fc2",TNN.Linear(30,20))
        self.net.add_module("relu2",TNN.ReLU())
        self.net.add_module("fc3",TNN.Linear(20,self.action_dim))
        self.net.add_module("tanh1",TNN.Tanh())
        # Small uniform initialisation of the output layer keeps the initial
        # tanh output close to zero.
        TNN.init.uniform_(self.net.fc3.weight,a=-0.003,b=0.003)
        TNN.init.uniform_(self.net.fc3.bias,a=-0.003,b=0.003)
        # Exploration noise process.
        self.OU_Noise = OrnsteinUhlenbeckActionNoise(self.action_dim)
        self.optimizer = torch.optim.Adam([{'params':self.net.parameters()}],lr=0.0001)

    def forward(self,x):
        """Return the raw ``tanh`` output of the network (in ``[-1, 1]``), with gradients."""
        return self.net(x)

    def action(self,state):
        """Return the action for ``state``, scaled by the action bound.

        This is the acting path, so it runs without autograd: the result can be
        stored in a replay buffer or converted to NumPy without keeping a graph
        alive. Use ``forward`` when gradients are needed.

        Args:
            state: array-like or tensor whose last dimension is ``state_dim``.

        Returns:
            Float tensor with no autograd history.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        with torch.no_grad():
            a = self.forward(state)
        # Element-wise scaling also works for multi-dimensional action spaces,
        # where a scalar ``.item()`` on the bound array would raise.
        return a * torch.as_tensor(self.bound, dtype=a.dtype)

    # Action with OU exploration noise
    def action_with_noise(self,state):
        """Return ``action(state)`` plus one sample of the OU noise process.

        The sum is not clipped to the action bounds.
        """
        return self.action(state) + self.OU_Noise.sample()

In [7]:
# Critic Net
# Input: a state and an action; output: the estimated value
class Critic_Net(TNN.Module):
    """State-action value network ``Q(state, action)``.

    The state passes through ``fc1``; the action is concatenated to that hidden
    representation before ``fc2``; ``fc3`` produces a single scalar per sample.

    Attributes:
        state_dim / action_dim: first dimension of the observation / action space.
        optimizer: Adam optimizer over all three layers (lr 1e-3).
        loss_fn: mean-squared-error loss used by ``update_criticNet``.
    """

    def __init__(self,env) :
        super().__init__()
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # Hidden layers
        self.fc1 = TNN.Linear(self.state_dim,20)
        self.fc2 = TNN.Linear(20+self.action_dim,20)
        self.fc3 = TNN.Linear(20,1)
        # Initialise the output layer uniformly in [-3e-4, 3e-4]. The range must
        # be symmetric: equal lower and upper bounds would set every weight to
        # the same constant.
        TNN.init.uniform_(self.fc3.weight,a=-0.0003,b=0.0003)
        TNN.init.uniform_(self.fc3.bias,a=-0.0003,b=0.0003)

        self.train_setup()

    def forward(self,state,action):
        """Return Q-values of shape ``(batch, 1)``.

        Args:
            state: tensor of shape ``(batch, state_dim)``.
            action: tensor of shape ``(batch, action_dim)``.
        """
        h1 = TF.relu(self.fc1(state))
        # Inject the action after the first state-only layer.
        cat = torch.cat((h1,action),dim=1)
        h2 = TF.relu(self.fc2(cat))
        out = self.fc3(h2)
        return out

    # Define the optimizer and the loss function used for training
    def train_setup(self):
        """Create the Adam optimizer (one parameter group per layer) and the MSE loss."""
        self.optimizer = torch.optim.Adam([
                    {'params':self.fc1.parameters()},
                    {'params':self.fc2.parameters()},
                    {'params':self.fc3.parameters()}],lr=0.001)

        self.loss_fn = TNN.MSELoss()

    # Train the critic against the targets y produced by the target networks
    def update_criticNet(self,tensor_state,tensor_action,tensor_y):
        """Take one gradient step on the MSE between ``Q(state, action)`` and ``tensor_y``.

        Args:
            tensor_state: states, shape ``(batch, state_dim)``.
            tensor_action: actions, shape ``(batch, action_dim)``.
            tensor_y: regression targets, shape ``(batch, 1)``.
        """
        x = self.forward(tensor_state,tensor_action)
        # Targets are treated as constants: no gradient flows back into
        # whatever produced them.
        loss = self.loss_fn(x,tensor_y.detach())

        # Backpropagate and step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [8]:
# Sample from the replay buffer and run one full iteration of the algorithm
def update_Network(replaybuffer,Q,Q_t,u,u_t):
    """Update critic, actor and both target networks from one minibatch.

    Does nothing while the buffer holds fewer than ``BATCHSIZE`` transitions.

    Args:
        replaybuffer: transition store sampled by ``sample_Batch``.
        Q / Q_t: online and target critic.
        u / u_t: online and target actor.
    """
    has_full_batch = len(replaybuffer.buffer) >= BATCHSIZE
    if not has_full_batch:
        return

    # Build the minibatch together with its TD targets.
    targets, actions, states = sample_Batch(replaybuffer,Q_t,u_t)

    # --- critic step ---
    Q.update_criticNet(states, actions, targets)

    # --- actor step ---
    # The critic scores the actor's own actions for the sampled states; the
    # mean score is negated because the optimizer minimises while the actor
    # should ascend the critic's value.
    actor_loss = (-Q(states, u(states))).mean()

    # Clear gradients on the critic (it lies on the backward path) and on the
    # actor before backpropagating; only the actor's optimizer steps.
    Q.optimizer.zero_grad()
    u.optimizer.zero_grad()
    actor_loss.backward()
    u.optimizer.step()

    # --- target networks: critic first, then actor ---
    for online_net, target_net in ((Q, Q_t), (u, u_t)):
        update_tNet(online_net, target_net, TAU)

In [9]:
# Update a target network; tau is the blending weight
def update_tNet(Net,Net_t,tau):
    """Move ``Net_t``'s weights towards ``Net``'s.

    Args:
        Net: source (online) network.
        Net_t: target network, modified in place.
        tau: weight of the source in the blend
            ``tau * source + (1 - tau) * target``. The value ``0`` is a
            sentinel meaning "copy the source weights verbatim", used to
            initialise a target network.
    """
    online_weights = Net.state_dict()

    # Sentinel: hard copy for target initialisation.
    if tau == 0:
        Net_t.load_state_dict(online_weights)
        return

    target_weights = Net_t.state_dict()
    blended = {
        name: online_weights[name]*tau + (1-tau)*target_weights[name]
        for name in online_weights
    }
    # Overwrite only the entries present in the source; anything else in the
    # target's state dict is loaded back unchanged.
    target_weights.update(blended)
    Net_t.load_state_dict(target_weights)


In [10]:

def main(mode='Train'):
    """Train a DDPG agent on ``ENV_NAME``, or replay a saved actor.

    Args:
        mode: ``'Train'`` (default) runs the training loop; any other value
            loads a saved actor checkpoint and runs a single episode.

    In training mode the actor is evaluated without noise every 100 episodes
    over ``TEST`` rollouts. Training stops, and the actor is saved under
    ``SAVINGPATH``, once the average evaluation reward exceeds 93.
    """
    import os  # local import: only needed here for checkpoint paths

    env = gym.make(ENV_NAME)
    replaybuffer = ReplayBuffer(env,BUFFER_SIZE)
    # Online networks
    Q = Critic_Net(env)
    u = Policy_Net(env)
    # Target networks
    Q_t = Critic_Net(env)
    u_t = Policy_Net(env)

    # Initialise the targets as exact copies (tau == 0 means hard copy)
    update_tNet(Q,Q_t,0)
    update_tNet(u,u_t,0)
    if mode == 'Train':
        ave_reward = []  # evaluation history, one entry per evaluation point
        for episode in range(EPISODE):
            # Reset the environment and restart the exploration noise so
            # noise state does not carry over between episodes.
            state = env.reset()
            u.OU_Noise.reset()
            # Training rollout
            for step in range(STEP):
                # Detach so the replay buffer stores plain values rather than
                # tensors that keep the actor's autograd graph alive.
                action = u.action_with_noise(state).detach()
                # Observe the transition
                next_state,reward,done,_ = env.step(action.numpy())
                replaybuffer.append([state,action,reward,next_state])
                # Update all four networks
                update_Network(replaybuffer,Q,Q_t,u,u_t)
                state = next_state
                if done :
                    break
            if episode%100 == 0:  # periodic evaluation without noise
                acc_reward = 0
                for i in range(TEST):
                    state = env.reset()
                    for step in range(STEP):
                        action = u.action(state)
                        state,reward,done,_ = env.step(action.detach().numpy())
                        acc_reward += reward
                        if done:
                            break
                mean_reward = acc_reward/TEST
                ave_reward.append(mean_reward)
                print('episode: ',episode,'Evaluation Average Reward:',mean_reward)
                if mean_reward >93:
                    # Save the actor; create the directory first so a long
                    # training run is not lost to a missing folder.
                    os.makedirs(SAVINGPATH, exist_ok=True)
                    torch.save(u.state_dict(),os.path.join(SAVINGPATH,"model-"+str(mean_reward)+".pth"))
                    break

    else :
        # Build the path from SAVINGPATH: the previous literal relied on a
        # backslash separator, which is an invalid escape and Windows-only.
        u.load_state_dict(torch.load(os.path.join(SAVINGPATH,"model-94.23124390777878.pth")))
        state = env.reset()
        acc_reward = 0
        while True:
            action = u.action(state)
            state,reward,done,_ = env.step(action.detach().numpy())
            acc_reward += reward
            if done:
                break
        print("total reward: {}".format(acc_reward))


if __name__ == '__main__':
    main()

episode:  0 Evaluation Average Reward: -0.00018443086012521413
episode:  100 Evaluation Average Reward: -0.12882516758247425
episode:  200 Evaluation Average Reward: -0.12119056573987108
episode:  300 Evaluation Average Reward: -0.005951791737562561
episode:  400 Evaluation Average Reward: -0.039328653384987235
episode:  500 Evaluation Average Reward: -0.047719049670496436
episode:  600 Evaluation Average Reward: -0.12340840088010363
episode:  700 Evaluation Average Reward: -0.7863463507293721
episode:  800 Evaluation Average Reward: -1.3954676846445642
episode:  900 Evaluation Average Reward: -12.735283124272883
episode:  1000 Evaluation Average Reward: -19.233619675667047
episode:  1100 Evaluation Average Reward: -19.982987305858238
episode:  1200 Evaluation Average Reward: 91.31608152489275
episode:  1300 Evaluation Average Reward: 88.85023502767196
episode:  1400 Evaluation Average Reward: -19.99999278307167
episode:  1500 Evaluation Average Reward: -18.156353624002385
episode:  16

KeyboardInterrupt: 

In [None]:
def main(episodes=None):
    """Load the saved actor and render it acting in the environment.

    NOTE: this redefines ``main`` from the training cell; after running this
    cell, ``main`` refers to the render loop.

    Args:
        episodes: number of episodes to play before returning. ``None``
            (default) plays until interrupted.

    Prints the total reward at the end of every episode. The environment is
    closed on exit, including when the loop is interrupted.
    """
    env = gym.make(ENV_NAME)
    # Build the actor and restore its weights
    u = Policy_Net(env)
    # Build the path from SAVINGPATH: the previous literal relied on a
    # backslash separator, which is an invalid escape and Windows-only.
    u.load_state_dict(torch.load(SAVINGPATH + "model-94.23124390777878.pth"))
    state = env.reset()
    acc_reward = 0
    finished = 0
    try:
        while episodes is None or finished < episodes:
            action = u.action(state)
            state,reward,done,_ = env.step(action.detach().numpy())
            acc_reward += reward
            env.render()
            if done:
                state = env.reset()
                print("total reward: {}".format(acc_reward))
                acc_reward=0
                finished += 1
    finally:
        # Release the render window even on KeyboardInterrupt.
        env.close()


if __name__ == '__main__':
    main()

total reward: 92.5034822225326
total reward: 94.210850786879
total reward: 94.38004282103113
total reward: 94.36252076554238
total reward: 94.24541889939272
total reward: 91.10457492914891
total reward: 93.34046744742642
total reward: 93.49390179729043
total reward: 94.35951246150044
total reward: 94.22223874419761
total reward: 94.24694256858206
total reward: 94.5733744021846
total reward: 94.24389989921806
total reward: 91.0324431312632
total reward: 94.07632175582212
total reward: 93.75615330343385
total reward: 94.3659897627396
total reward: 94.57191740453176
total reward: 89.36962892967787
total reward: 93.70021643669196
total reward: 94.25243958122559


KeyboardInterrupt: 