In [11]:
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

reference:

1. http://nlp.seas.harvard.edu/annotated-transformer/

2. https://luweikxy.gitbook.io/machine-learning-notes/self-attention-and-transformer

3. https://github.com/fubuki75/the-annotated-transformer-notes/blob/master/the%20annotated%20transformer%E5%AD%A6%E4%B9%A0%E7%AC%94%E8%AE%B0.md


### Transformer整体结构

Transformer整体上是encoder-decoder架构。

Encoder把**符号表示序列**映射为**连续表示序列**， $\left(x_{1}, \ldots, x_{n}\right) \to \mathbf{z}=\left(z_{1}, \ldots, z_{n}\right)$

Decoder根据$\mathbf{z}$，逐个元素地生成输出序列$\left(y_{1}, \ldots, y_{m}\right)$。
在生成下一个符号时，使用先前生成的符号作为额外的输入。

下面代码段展示了encoder-decoder的框架：

In [6]:
class EncoderDecoder(nn.Module):
    """
    Standard encoder-decoder architecture.

    The model embeds the source, encodes it into a "memory", embeds the
    target, and decodes the target against that memory. The ``generator``
    is stored on the module but is NOT applied inside ``forward``; callers
    apply it to the decoder output themselves.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        """
        Args:
            encoder: callable invoked as ``encoder(embedded_src, src_mask)``.
            decoder: callable invoked as
                ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
            src_embed: callable mapping ``src`` to its embedded form.
            tgt_embed: callable mapping ``tgt`` to its embedded form.
            generator: output head kept as an attribute for the caller.
        """
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

        self.src_embed = src_embed
        self.tgt_embed = tgt_embed

        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the resulting memory."""
        # The decoder's `memory` argument is the encoder output.
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run it through the decoder together with ``memory``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)
        

In [8]:
class Generator(nn.Module):
    """Standard output head: a linear projection followed by log-softmax."""

    def __init__(self, d_model, vocab):
        """Create the ``d_model -> vocab`` projection layer."""
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Project ``x`` and return log-probabilities over the last dimension."""
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)
        

Transformer也采用如上的encoder-decoder结构。
在decoder和encoder中分别使用了堆叠的self-attention和point-wise全连接层。

![image](./images/169628874-e9586707-02cc-439b-a0a2-7b5202d16c38.png)

In [3]:
def clones(module, N):
    """
    Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``.

    Each copy is produced by ``copy.deepcopy``, so the copies start with
    identical values but do not share parameters with each other or with
    the original.
    """
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)

关于ModuleList可以参考： https://zhuanlan.zhihu.com/p/64990232

TODO:

- [ ] 可以学习一下copy和deepcopy的区别


In [4]:
class Encoder(nn.Module):
    """Encoder core: a stack of ``N`` cloned layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        """
        Args:
            layer: prototype layer; it is deep-copied ``N`` times via ``clones``
                and must expose a ``size`` attribute used to size the norm.
            N: number of stacked layers.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with the same ``mask``) through every layer, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

对应上面的框架图，Encoder中的每个layer包括：MultiHead Attention，残差连接+LayerNorm， Feed Forward Network，残差连接+LayerNorm。

N层结束后，最后还有一个LayerNorm。

每个layer中的残差连接+LayerNorm在论文中表示为：LayerNorm(x + Sublayer(x))。注意：下面的代码实现为了简化采用了pre-norm的写法，即 x + Dropout(Sublayer(LayerNorm(x)))，先做LayerNorm再进入子层，最后做残差相加。

先看LayerNorm:

In [5]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable gain and bias."""

    def __init__(self, features, eps=1e-6):
        """
        Args:
            features: size of the last dimension being normalized.
            eps: constant added to the standard deviation to avoid division by zero.
        """
        super().__init__()
        # a_2 is the gain (initialized to ones), b_2 the bias (initialized to zeros).
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        # Note: eps is added to the std itself, not to the variance.
        denom = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / denom + self.b_2

再看下残差连接：

In [6]:
class SublayerConnection(nn.Module):
    """
    Residual connection wrapped around a sublayer.

    The input is normalized first, passed through the sublayer, run through
    dropout, and finally added back onto the un-normalized input.
    """

    def __init__(self, size, dropout):
        """
        Args:
            size: feature size handed to ``LayerNorm``.
            dropout: dropout probability applied to the sublayer output.
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply a residual connection to any single-argument ``sublayer`` of the same size."""
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

再回头看框架图，encoder中每层（共N层）中有两处sublayer，一个是MultiHeadAttention,一个是position-wise 全连接层。
下面我们就按照encoder框架图实现encoderlayer。

这里再放一下框架图，省得翻回去看。

![image](./images/169628874-e9586707-02cc-439b-a0a2-7b5202d16c38.png)

In [None]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        """
        Args:
            size: feature size, used by the residual wrappers and read by the
                enclosing stack through ``layer.size``.
            self_attn: attention module called as ``self_attn(q, k, v, mask)``.
            feed_forward: module called with a single tensor argument.
            dropout: dropout probability for both residual wrappers.
        """
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Run self-attention (query = key = value) and then the feed-forward block."""
        attn_wrapper, ff_wrapper = self.sublayer

        # SublayerConnection invokes its sublayer with ONE argument (the
        # normalized input), so the four-argument attention call is adapted
        # with a closure that also captures `mask`.
        def attend(normed):
            return self.self_attn(normed, normed, normed, mask)

        attended = attn_wrapper(x, attend)
        return ff_wrapper(attended, self.feed_forward)

### 下面看Decoder

有了Encoder基础，Decoder就会容易上手一些。不过还是值得注意二者输入输出的差别。

In [7]:
class Decoder(nn.Module):
    """Decoder core: a stack of ``N`` cloned layers followed by a final LayerNorm."""

    def __init__(self, layer, N):
        """
        Args:
            layer: prototype layer; it is deep-copied ``N`` times via ``clones``
                and must expose a ``size`` attribute used to size the norm.
            N: number of stacked layers.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer with the same memory and masks, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

区别于encoder, decoder有三个sublayer。

而且第一个MultiHeadAttention层是**masked** MultiHeadAttention，晚些我们会解释它的作用。

第一个attn是self attn，第二个是cross attn。

In [8]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, attention over memory, then feed-forward."""

    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        """
        Args:
            size: feature size, used by the residual wrappers and read by the
                enclosing stack through ``layer.size``.
            self_attn: attention module applied to the decoder input itself.
            src_attn: attention module whose keys/values are the memory.
            feed_forward: module called with a single tensor argument.
            dropout: dropout probability for the three residual wrappers.
        """
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply the three sublayers in order and return the result."""
        self_wrapper, cross_wrapper, ff_wrapper = self.sublayer

        # Each residual wrapper calls its sublayer with one argument, so the
        # attention calls are wrapped in closures that capture the masks and
        # (for the second one) the memory.
        def self_attend(normed):
            return self.self_attn(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Queries come from the decoder; keys and values are the memory.
            return self.src_attn(normed, memory, memory, src_mask)

        hidden = self_wrapper(x, self_attend)
        hidden = cross_wrapper(hidden, cross_attend)
        return ff_wrapper(hidden, self.feed_forward)

写到这里会发现有很多mask参数，所以有必要深究一下各种mask究竟是干嘛的。

这部分主要参考： https://luweikxy.gitbook.io/machine-learning-notes/self-attention-and-transformer

### 什么是Mask？

mask表示掩码，它对某些值进行掩盖，使其在参数更新时不产生效果。Transformer模型里面涉及两种mask，分别是 padding mask和sequence mask。 其中，padding mask在所有的scaled dot-product attention（也就是attention计算中）里面都需要用到，而sequence mask只有在Decoder的Self-Attention里面用到。

Padding是为了让每个batch的输入等长，而padding mask用来标记哪些位置是填充出来的。在计算attention时，这些填充的位置不应得到注意力，所以需要在padding mask为False的位置加上一个非常大的负数（负无穷），这样经过softmax之后，这些padding位置的attention score就趋近于0。（注意其实输入序列会有个max length：长度小于它的序列需要padding；长度大于它的序列只保留左侧max length以内的内容，多余的部分直接截断舍弃。）

Sequence mask是为了让decoder看不到“未来信息”。因为我们要序列化的预测，自然不能让decoder预知未来，而只能根据过去推测当前输出，所以需要这个mask。实现中，sequence mask是一个下三角矩阵，对角线及左下角为1，其余为0.

在transformer中，只有decoder的self-attention是需要同时使用padding mask和sequential mask的（二者之和作为attn mask），encoder的self-attention和decoder的cross attention均只使用padding mask作为attn mask。

我们看下sequential mask

In [9]:
def subsequent_mask(size):
    """
    Build the causal ("no peeking ahead") mask for decoder self-attention.

    Args:
        size: sequence length.

    Returns:
        A boolean tensor of shape ``(1, size, size)`` where entry ``[0, i, j]``
        is True when position ``i`` may attend to position ``j`` (``j <= i``)
        and False for future positions (``j > i``).
    """
    attn_shape = (1, size, size)
    # diagonal=1 keeps only the strictly-upper triangle, i.e. the future
    # positions. (diagonal=-1 would also mark the main diagonal as blocked,
    # leaving the first position with nothing to attend to.)
    future = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return future == 0

### Attention


![image](./images/169642008-0347600c-1bdf-41f9-b612-e369734fa2d7.png)

这个流程网上的帖子很多了。

Q和K要做点积来计算attention weight，所以二者的维度必须相等（均为$d_k$）；而V的维度$d_v$可以与$d_k$不同。

Transformer采用的attention是scaled dot-product attention，scaled是说QK乘完之后除以根号下$d_k$

$$\operatorname{Attention}(Q, K, V)=\operatorname{softmax}\left(\frac{Q K^{T}}{\sqrt{d_{k}}}\right) V$$

**说一下为什么要scaled**

当$d_k$很大时，$QK^T$结果的数值会变得很大，softmax会进入饱和区，导致其梯度极小。(因为softmax的梯度需要把分母平方一下)。所以我们需要把$QK^T$的结果除以$\sqrt{d_k}$进行缩放。


原文解释如下：

![image](./images/169643252-749f8ed2-1f13-4e95-bb1e-d6896eb5b7e2.png)

In [12]:
def attention(query, key, value, mask=None, dropout=None):
    """
    Scaled dot-product attention.

    Args:
        query, key, value: tensors; ``query`` and ``key`` must agree on their
            last dimension, which is used as the scaling factor ``d_k``.
        mask: optional tensor; positions where ``mask == 0`` have their score
            replaced by ``-1e9`` before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        A tuple ``(weighted_values, attention_weights)``.
    """
    scale = math.sqrt(query.size(-1))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        # A large negative score becomes ~0 probability after the softmax.
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = logits.softmax(dim=-1)
    if dropout is not None:
        weights = dropout(weights)

    return torch.matmul(weights, value), weights

MultiHeadAttention

In [13]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head attention: project query/key/value into ``h`` heads, run
    scaled dot-product attention on all heads at once, then merge the heads
    and apply a final linear projection.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """
        Args:
            h: number of attention heads; must divide ``d_model``.
            d_model: model (input and output) feature size.
            dropout: dropout probability applied to the attention weights.

        Raises:
            AssertionError: if ``d_model`` is not divisible by ``h``.
        """
        # Fixed: the second argument to super() must be the instance (`self`),
        # not the builtin `super`, otherwise construction raises TypeError.
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_k == d_v here.
        self.d_k = d_model // h
        self.h = h
        # Four d_model -> d_model linear layers: the first three project the
        # query, key and value; the last one is the output projection applied
        # after the heads are concatenated.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention weights from the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query, key, value: tensors whose first dimension is the batch and
                whose last dimension is ``d_model``.
            mask: optional mask; a head dimension is inserted at axis 1 so the
                same mask is shared by all heads.

        Returns:
            The output-projected attention result with last dimension ``d_model``.
        """
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)

        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k.
        #    zip() stops after three pairs, so only the first three linears
        #    are used here; each projection is reshaped to
        #    (nbatches, h, seq_len, d_k).
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(
            query, key, value, mask=mask, dropout=self.dropout
        )

        # 3) "Concat" the heads using a view and apply the final linear.
        #    transpose() yields a non-contiguous tensor, hence contiguous()
        #    before view().
        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )

        # Drop the local references to the projected tensors.
        del query
        del key
        del value
        return self.linears[-1](x)
        

In [14]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: input and output feature size.
            d_ff: hidden feature size between the two linear layers.
            dropout: dropout probability applied after the ReLU.
        """
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = self.w_1(x).relu()
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [15]:
class Embeddings(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        """
        Args:
            d_model: embedding dimension.
            vocab: number of entries in the lookup table.
        """
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for indices ``x`` and multiply by ``sqrt(d_model)``."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [16]:
class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the buffer ``pe``. Even feature columns hold sines and odd feature
    columns hold cosines of ``position * div_term``.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        """
        Args:
            d_model: feature size of the encoding (even or odd).
            dropout: dropout probability applied after adding the encoding.
            max_len: number of positions precomputed in the table.
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim the angles to the number of odd-indexed columns. For even
        # d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: stored with the module but not a parameter.
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` rows of the table to ``x``, then apply dropout."""
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

以上，我们把模型搭建的部分搞定了。

接下来我们分别搞定模型的初始化和inference。

目的是弄清楚数据流逻辑。

In [17]:
def make_model(
    src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
):
    """
    Assemble a full encoder-decoder model from hyperparameters.

    Args:
        src_vocab: source vocabulary size (for the source embedding).
        tgt_vocab: target vocabulary size (for the target embedding and generator).
        N: number of layers in both the encoder and the decoder stack.
        d_model: model feature size.
        d_ff: hidden size of the position-wise feed-forward blocks.
        h: number of attention heads.
        dropout: dropout probability used by the attention, feed-forward,
            positional-encoding and residual-wrapper modules.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been initialized with Xavier/Glorot uniform.
    """
    c = copy.deepcopy
    # Forward `dropout` so the attention-weight dropout follows the same
    # setting as the rest of the model instead of silently staying at the
    # MultiHeadAttention default.
    attn = MultiHeadAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Prototype modules are deep-copied at each use so that no two places in
    # the model share parameters.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab),
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    # Only tensors with more than one dimension (e.g. weight matrices) are
    # re-initialized; 1-D parameters keep their default initialization.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model

### Inference