In [7]:
# !pip install -r requirements.txt
# !pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
# !pip install torchdata==0.3.0
# !pip install torchtext==0.12
# !pip install spacy==3.2
# !pip install altair
# !pip install GPUtil
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [8]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [10]:
# Some convenience helper functions used throughout the notebook


def is_interactive_notebook():
    """Return True when this module's ``__name__`` is ``"__main__"``."""
    running_as_main = "__main__" == __name__
    return running_as_main


def show_example(fn, args=()):
    """Call ``fn(*args)`` and return its result when examples are enabled.

    The call only happens when this module's ``__name__`` is ``"__main__"``
    and the module-level ``RUN_EXAMPLES`` flag is truthy.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``. The
            default is an immutable empty tuple, so no mutable object is
            shared between calls.

    Returns:
        The value returned by ``fn(*args)``, or ``None`` when the example
        is skipped.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


def execute_example(fn, args=()):
    """Call ``fn(*args)`` for its side effects when examples are enabled.

    The call only happens when this module's ``__name__`` is ``"__main__"``
    and the module-level ``RUN_EXAMPLES`` flag is truthy. Unlike
    ``show_example``, the result of ``fn`` is discarded.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``. The
            default is an immutable empty tuple, so no mutable object is
            shared between calls.

    Returns:
        Always ``None``.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    Note that the base-class ``__init__`` is not called; the only state set
    up is ``param_groups``.
    """

    def __init__(self):
        # A single parameter group carrying a learning rate of 0.
        lr_group = {"lr": 0}
        self.param_groups = [lr_group]

    def step(self):
        """Do nothing and return ``None``."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing and return ``None``; ``set_to_none`` is ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Do nothing and return ``None``."""
        return None

- 结构

<div
    style="width: 600px; background-color: white; margin: 0 auto; padding: 20px"
>
    <img src="https://arxiv.org/html/1706.03762v7/extracted/1706.03762v7/Figures/ModalNet-21.png" alt="结构">
</div>

# 注解的 Transformer
[原文链接](https://nlp.seas.harvard.edu/annotated-transformer/)

# 模型结构

In [9]:
class EncoderDecoder(nn.Module):
    """
    A standard encoder-decoder architecture, usable as a base for many
    other models.

    The constructor stores five collaborators: ``encoder``, ``decoder``,
    ``src_embed``, ``tgt_embed`` and ``generator``. ``generator`` is kept as
    an attribute but is not called by any method of this class.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        # Assignment order is kept stable so submodule registration order
        # does not change.
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode the masked source, then decode the masked target against it."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and pass it, with ``src_mask``, through the encoder."""
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder over it and the encoder ``memory``."""
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

In [25]:
class Generator(nn.Module):
    """Standard generation step: a linear projection followed by log-softmax."""

    def __init__(self, d_model, vocab):
        super().__init__()
        # nn.Linear(d_model, vocab): weight is [vocab, d_model], bias is [vocab].
        # It maps the last dimension of its input from d_model to vocab,
        # e.g. [batch_size, seq_len, d_model] -> [batch_size, seq_len, vocab].
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Project ``x`` (last dim ``d_model``) and return log-probabilities
        taken over the last dimension."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

## 编码器和解码器栈

### 编码器

编码器由 $N=6$ 个相同的层组成。

In [26]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so each copy owns its own parameters
        replicas.append(copy.deepcopy(module))
    return replicas

In [29]:
class Encoder(nn.Module):
    """Core encoder: a stack of N cloned layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        super().__init__()
        # N deep copies of the template layer, then a norm sized by layer.size.
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass the input (and mask) through each layer in order, then
        normalize the output of the last layer."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        normalized = self.norm(hidden)
        return normalized

我们使用残差连接
[(cite)](https://arxiv.org/abs/1512.03385) 包裹每个子层，然后进行层归一化
[(cite)](https://arxiv.org/abs/1607.06450)。

In [30]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale
    (``gamma``) and shift (``beta``)."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # gamma starts at ones and beta at zeros; both have length `features`.
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift."""
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Keep the operation order (scale, then divide, then shift) so the
        # floating-point result is unchanged. eps is added to the std.
        scaled = self.gamma * (x - mu)
        return scaled / (sigma + self.eps) + self.beta


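也就是说，每个子层的输出是 $\mathrm{LayerNorm}(x +
\mathrm{Sublayer}(x))$，其中 $\mathrm{Sublayer}(x)$ 是子层本身实现的函数。
我们在每个子层的输出上应用 dropout
[(cite)](http://jmlr.org/papers/v15/srivastava14a.html)，然后再将其与子层输入相加并进行归一化。

注意：下面的代码实现采用的是先归一化的顺序，即 $x + \mathrm{Dropout}(\mathrm{Sublayer}(\mathrm{LayerNorm}(x)))$，与上面公式中归一化的位置不同。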

为了便于这些残差连接，模型中的所有子层以及嵌入层都产生维度为 $d_{\text{model}}=512$ 的输出。

In [33]:
class SublayerConnection(nn.Module):
    """
    Residual connection around a sublayer, with the norm applied first.

    Paper: output = LayerNorm(x + sublayer(x))
    Code:  output = x + dropout(sublayer(LayerNorm(x)))
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply the residual connection to any sublayer of the same size."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch