In [1]:
# 后续代码所需的依赖
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time

class TokenEmbedding(nn.Embedding):
    """
    Token embedding layer built directly on ``torch.nn.Embedding``.

    Maps integer token ids to dense ``d_model``-dimensional vectors. The row
    selected by ``padding_idx`` is initialised to zeros, so padding tokens
    embed to the zero vector.
    """

    def __init__(self, vocab_size, d_model, padding_idx=1):
        """
        Create the embedding table.

        :param vocab_size: size of the vocabulary (number of rows in the table)
        :param d_model: model dimension (length of each embedding vector)
        :param padding_idx: token id treated as padding. Defaults to 1, the
            value that used to be hard-coded here; pass another id (or
            ``None``) when the tokenizer uses a different convention.
        """
        super(TokenEmbedding, self).__init__(vocab_size, d_model, padding_idx=padding_idx)

In [2]:
# Build a token-embedding table: 1000-entry vocabulary, 512-dimensional vectors.
tok_emb = TokenEmbedding(vocab_size=1000, d_model=512)
# Add up the element counts of every parameter tensor owned by the module.
num_params = sum(map(torch.numel, tok_emb.parameters()))
print("模块中的参数数量为:", num_params)

模块中的参数数量为: 512000


In [3]:
# x holds batch_size=2 sequences of seq_len=3 token ids.
# Id 1 is the padding index, so its embedding row is all zeros.
x = torch.tensor([[6, 5, 4], [3, 2, 1]], dtype=torch.long)
res = tok_emb(x)
print("res:", res)
print("res.shape:", res.shape)

res: tensor([[[ 0.1994,  0.9800,  1.3162,  ...,  0.5627,  0.4720,  0.9675],
         [ 0.5742, -0.6087,  0.5124,  ...,  1.2031,  0.7531, -0.6712],
         [ 1.5465, -0.5008,  0.9264,  ...,  0.0815, -0.2846,  0.0975]],

        [[ 0.7377, -0.0586, -1.3138,  ...,  0.4524, -0.2046,  1.6616],
         [ 1.1249, -1.7420, -1.6807,  ..., -0.0495,  0.3476,  1.1462],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward0>)
res.shape: torch.Size([2, 3, 512])


In [4]:
class PositionalEncoding(nn.Module):
    """
    Fixed (non-learned) sinusoidal positional encoding.

    The table has shape ``[max_len, d_model]``: even columns hold
    ``sin(pos / 10000^(2i / d_model))`` and odd columns hold the matching
    cosine. It is stored as a non-persistent buffer, so it follows the module
    through ``.to(device)`` / ``.cuda()`` / dtype casts, contributes no
    trainable parameters, and adds no key to ``state_dict()``.
    """
    def __init__(self, d_model, max_len):
        """
        Pre-compute the encoding table.

        :param d_model: model dimension (number of columns in the table)
        :param max_len: maximum sequence length (number of rows in the table)
        """
        super(PositionalEncoding, self).__init__()

        # Column vector of positions: [max_len, 1].
        pos = torch.arange(0, max_len).float().unsqueeze(dim=1)

        # `_2i` enumerates the even feature indices 0, 2, 4, ... (< d_model),
        # i.e. the "2i" term in the exponent of the formula.
        _2i = torch.arange(0, d_model, step=2).float()

        # Broadcasts to [max_len, ceil(d_model / 2)].
        angle = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angle)
        # When d_model is odd there is one fewer odd column than even columns,
        # so trim the angle matrix to match (a no-op for even d_model).
        encoding[:, 1::2] = torch.cos(angle[:, : d_model // 2])

        # A buffer is never a parameter, so no gradient is tracked for it.
        self.register_buffer("encoding", encoding, persistent=False)

    def forward(self, x):
        """
        Return the positional encodings for the positions present in ``x``.

        :param x: token-id tensor of shape ``[batch_size, seq_len]``; only its
            shape is used
        :return: tensor of shape ``[seq_len, d_model]`` (at most ``max_len``
            rows, because the table is sliced), meant to be broadcast-added to
            a ``[batch_size, seq_len, d_model]`` token embedding
        """
        # Unpacking requires x to be exactly 2-D.
        _, seq_len = x.size()

        return self.encoding[:seq_len, :]

In [5]:
# Positional-encoding table for d_model=512 and up to 512 positions.
pe = PositionalEncoding(d_model=512, max_len=512)
# The table is fixed rather than learned, so the module exposes no parameters.
num_params = sum(map(torch.numel, pe.parameters()))
print("模块中的参数数量为:", num_params)

模块中的参数数量为: 0


In [6]:
# x has batch_size=2 and seq_len=3.
x = torch.LongTensor([[6, 5, 4], [3, 2, 1]])
# Call the module instance rather than .forward() so that any registered
# hooks run; the result is the same.
res = pe(x)
print("res:", res)
# The returned shape is [seq_len = 3, d_model = 512].
print("res.shape:", res.shape)

res: tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  ...,  1.0000e+00,
          0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  8.2186e-01,  ...,  1.0000e+00,
          1.0366e-04,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  9.3641e-01,  ...,  1.0000e+00,
          2.0733e-04,  1.0000e+00]])
res.shape: torch.Size([3, 512])


In [7]:
class TransformerEmbedding(nn.Module):
    """
    Input embedding stage: token embedding plus positional encoding, then dropout.
    """

    def __init__(self, vocab_size, d_model, max_len, drop_prob):
        """
        Assemble the token embedding, the positional encoding and the dropout layer.

        :param vocab_size: size of the vocabulary
        :param d_model: model dimension
        :param max_len: maximum sequence length
        :param drop_prob: dropout probability applied to the summed embedding
        """
        super().__init__()
        self.tok_emb = TokenEmbedding(vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, max_len)
        self.drop_out = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
        Embed token ids and add positional information.

        :param x: token ids; the same tensor is handed to both sub-modules
        :return: dropout applied to ``tok_emb(x) + pos_emb(x)``
        """
        # The positional term broadcasts over the batch dimension.
        embedded = self.tok_emb(x) + self.pos_emb(x)
        return self.drop_out(embedded)

In [8]:
te = TransformerEmbedding(1000, 512, 512, 0.1)
# x has batch_size=2 and seq_len=3.
x = torch.LongTensor([[6, 5, 4], [3, 2, 1]])
# Call the module instance rather than .forward() so that any registered
# hooks run. The module is in training mode by default, so dropout is active
# and the printed values change from run to run.
res = te(x)
print("res:", res)
# The returned shape is [batch_size = 2, seq_len = 3, d_model = 512].
print("res.shape:", res.shape)

res: tensor([[[ 1.6016e+00,  0.0000e+00,  7.3131e-01,  ...,  3.4665e-01,
           1.2576e-01,  1.7100e+00],
         [ 5.4879e-01,  7.7141e-01,  1.6291e+00,  ...,  0.0000e+00,
          -0.0000e+00,  2.7418e+00],
         [ 0.0000e+00,  1.3316e+00,  0.0000e+00,  ..., -1.0642e+00,
          -6.8040e-02, -4.0913e-01]],

        [[-1.1485e+00,  3.1954e-01,  0.0000e+00,  ...,  0.0000e+00,
           1.0274e+00,  1.3045e+00],
         [-5.3425e-02,  2.9367e-01,  5.0048e-01,  ...,  1.2340e+00,
           1.0360e+00, -7.7892e-01],
         [ 1.0103e+00, -4.6239e-01,  1.0405e+00,  ...,  1.1111e+00,
           2.3036e-04,  1.1111e+00]]], grad_fn=<MulBackward0>)
res.shape: torch.Size([2, 3, 512])


In [9]:
class ScaleDotProductAttention(nn.Module):
    """
    Scaled dot-product attention for a single set of (query, key, value) tensors.
    """

    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None, e=1e-12):
        """
        Compute ``softmax(q @ k^T / sqrt(d_tensor)) @ v``.

        :param q: queries; used as ``[batch_size, head, length, d_tensor]``
        :param k: keys, a 4-D tensor ``[batch_size, head, length, d_tensor]``
        :param v: values, multiplied by the attention weights
        :param mask: optional mask; positions where ``mask == 0`` have their
            score replaced by -10000 before the softmax
        :param e: unused; kept so the signature stays unchanged
        :return: tuple ``(attended values, attention weights)``
        """
        # Only the per-head feature size is needed; unpacking also enforces 4-D keys.
        _, _, _, d_tensor = k.size()

        # 1. Query/key similarity, scaled by sqrt(d_tensor).
        score = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(d_tensor)

        # 2. Optional masking (the encoder passes a padding mask, the decoder
        #    additionally a causal one).
        if mask is not None:
            score = score.masked_fill(mask == 0, -10000)

        # 3. Normalise the scores over the key axis into weights in [0, 1].
        score = self.softmax(score)

        # 4. Weighted sum of the values.
        return score @ v, score

In [10]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head attention: project q/k/v, split the feature dimension into
    ``n_head`` heads, attend per head, merge the heads and project once more.
    """

    def __init__(self, d_model, n_head):
        """
        :param d_model: model dimension; must be divisible by ``n_head``
        :param n_head: number of attention heads
        :raises ValueError: if ``n_head`` is not positive or does not divide
            ``d_model`` (otherwise the failure would only surface later as an
            opaque ``view`` shape error inside ``split``)
        """
        super(MultiHeadAttention, self).__init__()
        if n_head <= 0 or d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_head ({n_head})"
            )
        self.n_head = n_head
        self.attention = ScaleDotProductAttention()
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        :param q: query input, ``[batch_size, length, d_model]``
        :param k: key input, ``[batch_size, length, d_model]``
        :param v: value input, ``[batch_size, length, d_model]``
        :param mask: optional mask forwarded unchanged to the attention module
        :return: attended output, ``[batch_size, length, d_model]``
        """
        # 1. Linear projections of the inputs.
        q, k, v = self.w_q(q), self.w_k(k), self.w_v(v)

        # 2. Split the feature dimension into heads.
        q, k, v = self.split(q), self.split(k), self.split(v)

        # 3. Per-head attention; the attention weights are not returned to the caller.
        out, _ = self.attention(q, k, v, mask=mask)

        # 4. Merge the heads back together and apply the output projection.
        out = self.concat(out)
        out = self.w_concat(out)

        return out

    def split(self, tensor):
        """
        Split the feature dimension into heads.

        :param tensor: [batch_size, length, d_model]
        :return: [batch_size, head, length, d_tensor]
        """
        batch_size, length, d_model = tensor.size()

        d_tensor = d_model // self.n_head
        tensor = tensor.view(batch_size, length, self.n_head, d_tensor).transpose(1, 2)

        return tensor

    def concat(self, tensor):
        """
        Merge the heads back into a single feature dimension (inverse of ``split``).

        :param tensor: [batch_size, head, length, d_tensor]
        :return: [batch_size, length, d_model]
        """
        batch_size, head, length, d_tensor = tensor.size()
        d_model = head * d_tensor

        # transpose() yields a non-contiguous view; view() needs contiguous memory.
        tensor = tensor.transpose(1, 2).contiguous().view(batch_size, length, d_model)
        return tensor

In [11]:
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward network applied over the last dimension:
    Linear(d_model -> hidden) -> ReLU -> Dropout -> Linear(hidden -> d_model).
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        """
        :param d_model: input and output feature size
        :param hidden: size of the inner layer
        :param drop_prob: dropout probability applied after the activation
        """
        super().__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        """
        :param x: input whose last dimension is ``d_model``
        :return: tensor with the same shape as ``x``
        """
        activated = self.relu(self.linear1(x))
        return self.linear2(self.dropout(activated))

In [12]:
class LayerNorm(nn.Module):
    """
    Layer normalisation over the last dimension with a learnable scale
    (``alpha``) and shift (``bias``).
    """

    def __init__(self, d_model, eps=1e-12):
        """
        :param d_model: size of the normalised (last) dimension
        :param eps: constant added to the variance for numerical stability
        """
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
        self.eps = eps

    def forward(self, x):
        """
        :param x: input whose last dimension is ``d_model``
        :return: normalised tensor with the same shape as ``x``
        """
        mu = x.mean(-1, keepdim=True)
        # Biased (population) variance, as layer normalisation prescribes.
        sigma_sq = x.var(-1, unbiased=False, keepdim=True)

        normalised = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return self.alpha * normalised + self.bias

In [13]:
class EncoderLayer(nn.Module):
    """
    One encoder block: self-attention and a feed-forward network, each followed
    by dropout, a residual connection and layer normalisation (post-norm).
    """

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        """
        :param d_model: model dimension
        :param ffn_hidden: inner size of the feed-forward network
        :param n_head: number of attention heads
        :param drop_prob: dropout probability used throughout the block
        """
        super().__init__()
        self.attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x, src_mask):
        """
        :param x: input sequence representation
        :param src_mask: mask passed to the self-attention
        :return: tensor produced by the second normalisation
        """
        # Self-attention sub-layer: attend, drop, add the residual, normalise.
        residual = x
        attended = self.attention(q=x, k=x, v=x, mask=src_mask)
        x = self.norm1(self.dropout1(attended) + residual)

        # Feed-forward sub-layer with the same dropout/residual/norm pattern.
        residual = x
        x = self.norm2(self.dropout2(self.ffn(x)) + residual)
        return x

In [14]:
class Encoder(nn.Module):
    """
    Transformer encoder: input embedding followed by a stack of ``n_layers``
    encoder layers.
    """

    def __init__(self, enc_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob):
        """
        :param enc_voc_size: source vocabulary size
        :param max_len: maximum sequence length for the positional encoding
        :param d_model: model dimension
        :param ffn_hidden: inner size of each feed-forward network
        :param n_head: number of attention heads per layer
        :param n_layers: number of stacked encoder layers
        :param drop_prob: dropout probability
        """
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=enc_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob)

        # Layers are created one after another, in stack order.
        self.layers = nn.ModuleList(
            EncoderLayer(d_model=d_model,
                         ffn_hidden=ffn_hidden,
                         n_head=n_head,
                         drop_prob=drop_prob)
            for _ in range(n_layers)
        )

    def forward(self, x, src_mask):
        """
        :param x: source token ids
        :param src_mask: mask forwarded to every encoder layer
        :return: output of the last encoder layer
        """
        hidden = self.emb(x)

        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)

        return hidden

In [15]:
class DecoderLayer(nn.Module):
    """
    One decoder block: masked self-attention, optional encoder-decoder
    attention and a feed-forward network. Each sub-layer is followed by
    dropout, a residual connection and layer normalisation.
    """

    def __init__(self, d_model, ffn_hidden, n_head, drop_prob):
        """
        :param d_model: model dimension
        :param ffn_hidden: inner size of the feed-forward network
        :param n_head: number of attention heads
        :param drop_prob: dropout probability used throughout the block
        """
        super().__init__()
        self.self_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm1 = LayerNorm(d_model=d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)

        self.enc_dec_attention = MultiHeadAttention(d_model=d_model, n_head=n_head)
        self.norm2 = LayerNorm(d_model=d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)

        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = LayerNorm(d_model=d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)

    def forward(self, dec, enc, trg_mask, src_mask):
        """
        :param dec: decoder-side input representation
        :param enc: encoder output, or ``None`` to skip the encoder-decoder attention
        :param trg_mask: mask for the decoder self-attention
        :param src_mask: mask for the encoder-decoder attention
        :return: tensor produced by the final normalisation
        """
        # 1-2. Self-attention over the decoder input, then dropout + residual + norm.
        residual = dec
        x = self.self_attention(q=dec, k=dec, v=dec, mask=trg_mask)
        x = self.norm1(self.dropout1(x) + residual)

        if enc is not None:
            # 3-4. Queries come from the decoder, keys/values from the encoder
            #      output; then dropout + residual + norm.
            residual = x
            x = self.enc_dec_attention(q=x, k=enc, v=enc, mask=src_mask)
            x = self.norm2(self.dropout2(x) + residual)

        # 5-6. Feed-forward network, then dropout + residual + norm.
        residual = x
        x = self.norm3(self.dropout3(self.ffn(x)) + residual)
        return x

In [16]:
class Decoder(nn.Module):
    """
    Transformer decoder: target embedding, a stack of ``n_layers`` decoder
    layers, and a final linear projection onto the target vocabulary.
    """

    def __init__(self, dec_voc_size, max_len, d_model, ffn_hidden, n_head, n_layers, drop_prob):
        """
        :param dec_voc_size: target vocabulary size (also the output width)
        :param max_len: maximum sequence length for the positional encoding
        :param d_model: model dimension
        :param ffn_hidden: inner size of each feed-forward network
        :param n_head: number of attention heads per layer
        :param n_layers: number of stacked decoder layers
        :param drop_prob: dropout probability
        """
        super().__init__()
        self.emb = TransformerEmbedding(vocab_size=dec_voc_size,
                                        d_model=d_model,
                                        max_len=max_len,
                                        drop_prob=drop_prob)

        # Layers are created one after another, in stack order.
        self.layers = nn.ModuleList(
            DecoderLayer(d_model=d_model,
                         ffn_hidden=ffn_hidden,
                         n_head=n_head,
                         drop_prob=drop_prob)
            for _ in range(n_layers)
        )

        self.linear = nn.Linear(d_model, dec_voc_size)

    def forward(self, trg, src, trg_mask, src_mask):
        """
        :param trg: target token ids
        :param src: encoder output handed to every decoder layer
        :param trg_mask: mask for the decoder self-attention
        :param src_mask: mask for the encoder-decoder attention
        :return: output of the final linear layer (width ``dec_voc_size``)
        """
        hidden = self.emb(trg)

        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, src, trg_mask, src_mask)

        # Final fully connected projection onto the target vocabulary.
        return self.linear(hidden)

In [17]:
class Transformer(nn.Module):
    """
    Encoder-decoder Transformer. Builds the padding / causal masks from the
    token ids and runs the encoder followed by the decoder.
    """

    def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size, dec_voc_size, d_model, n_head, max_len,
                 ffn_hidden, n_layers, drop_prob):
        """
        :param src_pad_idx: padding token id in the source vocabulary
        :param trg_pad_idx: padding token id in the target vocabulary
        :param trg_sos_idx: start-of-sequence token id of the target
            vocabulary (stored only; not used by this class)
        :param enc_voc_size: source vocabulary size
        :param dec_voc_size: target vocabulary size
        :param d_model: model dimension
        :param n_head: number of attention heads
        :param max_len: maximum sequence length
        :param ffn_hidden: inner size of the feed-forward networks
        :param n_layers: number of layers in both encoder and decoder
        :param drop_prob: dropout probability
        """
        super().__init__()
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.trg_sos_idx = trg_sos_idx
        self.encoder = Encoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               enc_voc_size=enc_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers)

        self.decoder = Decoder(d_model=d_model,
                               n_head=n_head,
                               max_len=max_len,
                               ffn_hidden=ffn_hidden,
                               dec_voc_size=dec_voc_size,
                               drop_prob=drop_prob,
                               n_layers=n_layers)

    def forward(self, src, trg):
        """
        :param src: source token ids — assumed ``[batch_size, src_len]``
        :param trg: target token ids — assumed ``[batch_size, trg_len]``
        :return: decoder output
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        output = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output

    def make_src_mask(self, src):
        """
        Build the source padding mask: ``False`` at padding positions.

        :param src: source token ids
        :return: boolean mask with two singleton dimensions inserted after the
            batch dimension (``[batch_size, 1, 1, src_len]`` for 2-D input)
        """
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        """
        Build the target mask as the AND of two masks:

        1. a padding mask that is ``False`` at padding positions;
        2. a lower-triangular matrix (``True`` on and below the diagonal) so
           that each position can only attend to itself and earlier positions,
           never to future ones.

        :param trg: target token ids, ``[batch_size, trg_len]``
        :return: boolean mask of shape ``[batch_size, 1, trg_len, trg_len]``
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(3)
        trg_len = trg.shape[1]
        # Create the causal mask on the same device as the input. The module
        # has no `device` attribute, so reading `self.device` would raise.
        # A bool mask replaces the deprecated ByteTensor.
        trg_sub_mask = torch.ones(trg_len, trg_len, device=trg.device).tril().bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

In [19]:
# Tokenizer / vocabulary settings. These depend on the tokenizer trained on
# the dataset, so plain example values are used here. The paper uses
# WMT14 EN-DE with enc_voc_size 32000 and dec_voc_size 25000. A different
# vocabulary changes the total parameter count, because both sizes determine
# the size of the embedding layers.
src_pad_idx = 1
trg_pad_idx = 1
trg_sos_idx = 2
enc_voc_size = 32000
dec_voc_size = 25000

# Hyper-parameters of the "base" model from the original paper.
d_model = 512
n_heads = 8
max_len = 256
ffn_hidden = 2048
n_layers = 6
drop_prob = 0.1

# Keyword arguments are listed in the order of the constructor signature.
model = Transformer(
    src_pad_idx=src_pad_idx,
    trg_pad_idx=trg_pad_idx,
    trg_sos_idx=trg_sos_idx,
    enc_voc_size=enc_voc_size,
    dec_voc_size=dec_voc_size,
    d_model=d_model,
    n_head=n_heads,
    max_len=max_len,
    ffn_hidden=ffn_hidden,
    n_layers=n_layers,
    drop_prob=drop_prob,
)

def count_parameters(model):
    """
    Count the trainable scalar parameters of a module.

    :param model: object exposing ``parameters()`` (e.g. an ``nn.Module``)
    :return: total number of elements over all parameters whose
        ``requires_grad`` flag is set
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Number of trainable parameters in the model
print(f'The model has {count_parameters(model):,} trainable parameters')

# Module structure of the model
print(model)



NameError: name 'device' is not defined