**文件构成：**
- Replayer Buffer
- 策略网络和价值函数网络构建
- Agent
- 各种初始化方法
- main方法

In [1]:
import os
import gym
import torch
import shutil
import random
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from pathlib import Path
from gym.spaces import Box
from config import get_config
from tensorboardX import SummaryWriter
from collections import namedtuple, deque
from torch.nn.utils import clip_grad_norm_

In [2]:
"""
ReplayBuffer的构建
"""
class ReplayBuffer:
    """存储轨迹转移数组"""

    def __init__(self, action_size, buffer_size, batch_size):
        self.action_size = action_size
        self.buffer = deque(maxlen=buffer_size)  # 一个buffer里能存多少条经验轨迹
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        self.device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")

    def add(self, state, action, reward, next_state, done):
        """往buffer里添加新的经验"""
        e = self.experience(state, action, reward, next_state, done)
        self.buffer.append(e)

    def sample(self):
        """从buffer里随机采样一个批次的轨迹样本"""
        experiences = random.sample(self.buffer, k=self.batch_size)  # 随机抽取batch_size个样本

        # 将变量类型从np转为tensor，并从CPU挪到GPU中进行加速计算
        states = torch.as_tensor(np.vstack([e.state for e in experiences if e is not None]),
                                 dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.vstack([e.action for e in experiences if e is not None]),
                                  dtype=torch.float32, device=self.device)
        rewards = torch.as_tensor(np.vstack([e.reward for e in experiences if e is not None]),
                                  dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.vstack([e.next_state for e in experiences if e is not None]),
                                      dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(np.vstack([e.done for e in experiences if e is not None]),
                                dtype=torch.float32, device=self.device)

        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

In [3]:
"""
神经网络的构建
"""
def init_weight(layer):
    if type(layer) == nn.Linear:
        nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')


class DQN(nn.Module):
    def __init__(self, state_size, action_size, hidden_size):
        super(DQN, self).__init__()
        self.net = nn.Sequential(nn.Linear(state_size, hidden_size), nn.ReLU(),
                                 nn.Linear(hidden_size, hidden_size), nn.ReLU(),
                                 nn.Linear(hidden_size, hidden_size), nn.Hardswish(),
                                 nn.Linear(hidden_size, action_size))
        self.net.apply(init_weight)

    def forward(self, state):
        return self.net(state)



In [4]:
class Agent:
    """与环境交互并且学习好的策略"""
    def __init__(self, state_size, action_size, hidden_size, config):
        self.state_size = state_size
        self.action_size = action_size
        self.config = config
        self.device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        self.q_net = DQN(state_size, action_size, hidden_size).to(self.device)
        self.q_target = DQN(state_size, action_size, hidden_size).to(self.device)

        # optimizer
        self.optimizer = optim.Adam(self.q_net.parameters(), lr=config.lr)

        # ReplayBuffer
        self.buffer = ReplayBuffer(action_size, buffer_size=config.buffer_size, batch_size=config.batch_size)

        # 训练时间步骤的初始化
        self.t_step = 0
        self.q_updates = 0
        self.action_step = 4
        self.last_action = None

        self.writer = SummaryWriter('result')

    def step(self, state, action, reward, next_state, done):
        """往buffer中保存经验， 并且使用随机抽样进行学习"""
        self.buffer.add(state, action, reward, next_state, done)  # 保存经验

        # 每隔多久更新一次
        self.t_step = self.t_step + 1
        if (self.t_step) % self.config.update_every == 0:
            # 如果buffe中的数据存储的够多了，就可以学习
            if len(self.buffer) > self.config.batch_size:
                experiences = self.buffer.sample()
                loss = self.learn(experiences)
                self.q_updates += 1
                self.writer.add_scalar('q_loss', loss, self.q_updates)

    def get_action(self, state, epsilon=0.):
        # 根据当前策略返回给定状态的操作，确定性策略，画面每更新4帧多一次动作
        state = np.array(state)
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)  # 增加一个维度给batch_size
        self.q_net.eval()
        action_values = self.q_net(state).detach()
        self.q_net.train()

        # Epsilon-greedy action selection
        if random.random() > epsilon:
            action = np.argmax(action_values.cpu().numpy())
            self.last_action = action
            return action
        else:
            action = random.choice(np.arange(self.action_size))
            self.last_action = action
            return action

    def learn(self, experiences):
        """
        使用一个批次的经验轨迹数据来更新值网络和策略网络
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) ：这个是基于真实值的标签
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        config = self.config
        gamma = self.config.gamma
        alpha = self.config.alpha
        entropy_tau = self.config.entropy_tau
        states, actions, rewards, next_states, dones = experiences
        # 从target网络模型里得到预测next的Q值(获得一堆Q值)
        q_targets_next = self.q_target(next_states).detach()

        # 用LogSumExp计算entropy，目的是维持数值的稳定性，详情见博客
        q_k_targets_next = q_targets_next
        v_k_targets_next = q_targets_next.max(1)[0].unsqueeze(-1)
        logSum = torch.logsumexp((q_k_targets_next - v_k_targets_next) / entropy_tau, 1).unsqueeze(-1)
        tau_log_pi_next = q_k_targets_next - v_k_targets_next - entropy_tau * logSum

        # 目标策略
        pi_target = F.softmax(q_targets_next / entropy_tau, dim=1)

        # q_targets的计算
        q_targets = (gamma * (pi_target * (q_targets_next - tau_log_pi_next) * (1-dones)).sum(1)).unsqueeze(-1)

        # 用logSum计算munchausen的addon
        q_k_targets = self.q_target(states).detach()
        v_k_targets = q_k_targets.max(1)[0].unsqueeze(-1)
        logSum = torch.logsumexp((q_k_targets - v_k_targets) / entropy_tau, 1).unsqueeze(-1)
        tau_log_pi = q_k_targets - v_k_targets - entropy_tau * logSum
        munchausen_addon = tau_log_pi.gather(1, actions.long())

        # 计算munchausen reward
        munchausen_reward = rewards + alpha * torch.clamp(munchausen_addon, min=-1, max=0)
        q_targets = q_targets + munchausen_reward

        # 用当前的状态去估计/预测Q值
        q_expected = self.q_net(states).gather(1, actions.long())

        # 计算loss,target是我们想去接近的（相当于真实值）
        loss = F.mse_loss(q_expected, q_targets)
        loss.backward()
        clip_grad_norm_(self.q_net.parameters(), max_norm=self.config.max_grad_norm)
        self.optimizer.step()

        # 软更新target
        self.soft_update(self.q_net, self.q_target, self.config.soft_update_tau)
        return loss.detach().cpu().numpy()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

In [5]:
class Init:
    def __init__(self, config):
        self.config = config
        self.state_size = 0
        self.action_size = 0
        torch.set_num_threads(self.config.num_threads)
        torch.set_default_dtype(torch.float32)

    def init_seed(self):
        np.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)
        if torch.backends.cudnn.enabled:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    def init_env(self):
        env = gym.make(self.config.env_name)
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0] if isinstance(env.action_space, Box) else env.action_space.n
        return env #, self.state_size, self.action_size

    def init_agent(self):
        agent = Agent(self.state_size, self.action_size, hidden_size=self.config.hidden_size, config=self.config)
        return agent

    def init_results_dir(self):
        results_dir = Path('./results') / self.config.env_name / self.config.algorithm
        # Todo: 如果存在相同名字直接覆盖
        # if not model_dir.exists():
        seed_dir = f'{self.config.algorithm}_{self.config.seed}'
        logs_dir = results_dir/seed_dir
        checkpoint_dir = logs_dir / 'checkpoint'
        if logs_dir.exists():
            shutil.rmtree(logs_dir, ignore_errors=True)
        os.makedirs(checkpoint_dir)
        writer = SummaryWriter(logs_dir)
        return logs_dir, checkpoint_dir, writer


In [6]:
from utils import epsilon_explore
def run(config):
    initialization = Init(config)
    initialization.init_seed()
    env = initialization.init_env()
    agent = initialization.init_agent()
    obs = env.reset()
    score = 0
    scores_window = deque(maxlen=100)
    frames = config.frames
    for frame in range(1, frames):
        epsilon = epsilon_explore(frame, frames)
        action = agent.get_action(obs, epsilon)
        next_obs, reward, done, _ = env.step(action)
        agent.step(obs, action, reward, next_obs, done)
        obs = next_obs
        score += reward

        if done:
            scores_window.append(score)
            agent.writer.add_scalar('Average 100', np.mean(scores_window), frame)
            obs = env.reset()
            score = 0



In [9]:
if __name__ == '__main__':
    parser = get_config()
    config = parser.parse_args()
    run(config)

usage: ipykernel_launcher.py [-h] [--algorithm ALGORITHM] [--run_num RUN_NUM]
                             [--n_episodes N_EPISODES]
                             [--num_threads NUM_THREADS] [--gamma GAMMA]
                             [--frames FRAMES] [--eps_frames EPS_FRAMES]
                             [--min_eps MIN_EPS]
                             [--experiment_name EXPERIMENT_NAME] [--seed SEED]
                             [--cuda] [--cuda_deterministic]
                             [--env_name ENV_NAME] [--action_step ACTION_STEP]
                             [--buffer_size BUFFER_SIZE]
                             [--batch_size BATCH_SIZE]
                             [--episode_length EPISODE_LENGTH]
                             [--n_step N_STEP]
                             [--soft_update_tau SOFT_UPDATE_TAU]
                             [--hidden_size HIDDEN_SIZE] [--layer_N LAYER_N]
                             [--lr LR] [--critic_lr CRITIC_LR]
                          

SystemExit: 2