In [1]:
import copy
import math
from typing import Literal, Tuple
import os
import random
import tqdm
import string
import time

In [2]:
import torch
from torch import Tensor, ByteTensor, optim
import torch.nn as nn
from torch.optim import Adam

In [3]:
import matplotlib.pyplot as plt

### 超参数初始化

In [4]:
#####################################################################
# 超参数部分, 后面的参数名尽量与这部分保持一致
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # GPU配置
dtype = torch.float32  # 数据类型配置: 这里默认是 32 位浮点数
batch_size = 128  # 训练批次, 一批训练数据内包含句子的数量
max_len = 128  # 单句最大长度
d_model = 512  # 词嵌入向量维度
n_layers = 3  # 编码层/解码层数量
n_heads = 8  # 注意力头数: d_model = 512 / n_heads = 8 => 单头向量维度 64 , 即每个头 QKV 维度
ffn_hidden = 2048  # 前向传播维度: 一般是词嵌入向量维度 d_model 的 4 倍数
d_proj = ffn_hidden  # 跟前向传播维度一样
n_hidden = ffn_hidden  # 跟前向传播维度一样
drop_prob = 0.1  # dropout 提升鲁棒性，随机失活一些节点

In [5]:
# 优化器超参数设置
init_lr = 5e-6
factor = 0.9
adam_eps = 5e-9
patience = 10
warmup = 100
epoch = 150
clip = 1.0
weight_decay = 5e-4
inf = float('inf')

In [6]:
# 数据集制作的参数设置
n = 5000  # 生成的字符串数量
l = max_len  # 字符串的最大长度
src = "./datasets/src_string.txt"
trg = "./datasets/trg_string.txt"

In [7]:
save_plot_name = "results_ignore_padding_index_5e_6.png"

### 分词器设置

In [8]:
# 模仿 huggingface transformers 的分词器编写一个类
class Tokenizer:
    def __init__(self, max_len, sos_token, padding_token, eos_token, device):

        self.max_len = max_len
        self.sos_token = sos_token
        self.padding_token = padding_token
        self.eos_token = eos_token
        self.device = device

        self.VOCABULARY = []
        self.token2index = {}
        self.index2token = {}
        self.vocab = None

        self.padding_index, self.sos_index, self.eod_index = -1, -1, -1

    def from_pretrained(self, filepath: str):

        with open(filepath, 'r') as fp:
            for line in fp:
                self.VOCABULARY.append(line.strip())
        fp.close()

        self.vocab = len(self.VOCABULARY)

        for v, index in zip(self.VOCABULARY, range(len(self.VOCABULARY))):
            self.token2index[v] = index

        for v, index in zip(self.VOCABULARY, range(len(self.VOCABULARY))):
            self.index2token[index] = v

        self.padding_index = self.token2index[self.padding_token]
        self.sos_index = self.token2index[self.sos_token]
        self.eos_index = self.token2index[self.eos_token]

    def encode(self, sentences, return_tensor=False):
        encode_list = []
        for sentence in sentences:

            if len(sentence) <= self.max_len - 2:
                encode_list.append(
                    [self.token2index[self.sos_token]] +
                    [self.token2index[char] for char in sentence] +
                    [self.token2index[self.eos_token]] +
                    [self.token2index[self.padding_token]] * (self.max_len - 2 - len(sentence))
                )
            else:
                encode_list.append(
                    [self.token2index[self.sos_token]] +
                    [self.token2index[char] for char in sentence[0:self.max_len - 2]] +
                    [self.token2index[self.eos_token]]
                )

        if return_tensor:
            # encode_list = torch.tensor(encode_list).to(device=self.device, dtype=torch.int)
            encode_list = torch.tensor(encode_list).to(device=self.device)

        return encode_list

    def decode(self, sentences):
        decode_list = []

        if not isinstance(sentences[0], list):
            # 解码过程中, 如果只有单列表, 例如 [1, 2, 3] 则需要额外嵌套一层列表
            # 默认的解码都是一个批次的, 因此是双层列表嵌套
            sentences = [sentences]

        for sentence in sentences:
            decode_list.append([self.index2token[deco] for deco in sentence])
        return decode_list

    def generate(self, tokenList: Literal[Literal]):

        def single(tokenList):
            str_to_return = ''
            for char in tokenList:
                str_to_return += char
            return str_to_return

        return [single(token_list) for token_list in tokenList]

In [9]:
tokenizer = Tokenizer(max_len=max_len, sos_token='$', padding_token='&', eos_token='#', device=device)
tokenizer.from_pretrained("./VOCABULARY.txt")

In [10]:
vocab = tokenizer.vocab
padding_idx = tokenizer.padding_index  # padding token 的序列号
(vocab, padding_idx)

(55, 53)

### 模型设置

In [11]:
#####################################################################
# 嵌入部分
class Embedding(nn.Module):
    def __init__(self, vocab, max_len, d_model, dropout, device, dtype):
        """
        1. 嵌入部分包括词嵌入和位置编码, 二者相加 -> dropout -> 作为编码器或解码器的输入。
        2. 细节: 词嵌入会设置导数, 位置编码的索引张量不设置导数
        :param vocab: 单词表的数量
        :param max_len: 一句话的最大 token 长度
        :param d_model: 词嵌入向量维度
        :param dropout: 正则化率
        :param device: 张量存放设备
        :param dtype: 张量数据类型
        """
        super().__init__()

        self.vocab = vocab
        self.max_len = max_len
        self.d_model = d_model
        self.dropout = dropout
        self.device = device
        self.dtype = dtype

        # 词嵌入部分
        self.word_embedding = nn.Embedding(num_embeddings=self.vocab, embedding_dim=self.d_model)

        # 位置编码部分: (max_len, d_model) 的二维张量, 不需要导数
        self.position_embedding_map = torch.zeros(size=(self.max_len, self.d_model))
        self.position_embedding_map.requires_grad = False
        # 设置奇数维度和偶数维度的索引列表
        odd, even = torch.arange(1, self.d_model, 2), torch.arange(0, self.d_model, 2)
        # 设置一个句子 token 的全部索引
        # 这里 "unsqueeze(1)" 的作用是让句子 token 的位置索引可以广播
        ########################################################################################
        # 回忆: pytorch 的张量基本运算是按元素位置计算的, 因此两个相同 shape 的张量结果返回的也是相同 shape
        # 例如: 两个张量的维度都是 (4.) , 进行基本运算的结果就是 (4.)
        # 例如: 一个张量的维度是 (4.) , 另一个维度是 (4, 1) 那么 (4, 1) 会广播成 (4, 4) 再做运算
        # 分析: pos 的原本张量维度是 (self.max_len.) , even 和 odd 都是 (256.) 直接做运算会报错
        # 分析: 原本情况 pos 和 even 都被视为向量, 但是二者维度不匹配, 因此无法计算
        # 分析: pos 最后扩充一个维度时, (self.max_len, 1) , pos 会在最后这个 1 维度广播机制重复 256 次
        # 分析: 再将广播机制后的 (self.max_len, 256) 与 even 或者 odd 做运算
        ########################################################################################
        pos = torch.arange(0, self.max_len, 1).unsqueeze(1)
        # 根据 odd, even 和 pos 填充 self.position_embedding_map 张量内的元素
        self.position_embedding_map[:, even] = torch.sin(pos / (1e4 ** (even / self.d_model)))
        self.position_embedding_map[:, odd] = torch.cos(pos / (1e4 ** (even / self.d_model)))

        # 正则化部分
        self.dropout = nn.Dropout(self.dropout)

        # 对初始化好的 self.word_embedding 和 self.position_embedding_map 进行数据类型和设备设置
        self.word_embedding.to(device=self.device, dtype=self.dtype)
        self.position_embedding_map.to(device=self.device, dtype=self.dtype)
        self.dropout.to(device=self.device, dtype=self.dtype)

    def forward(self, x: Tensor):
        """
        对输入的离散稀疏 token 的编号计算得到连续稠密 embedding 向量
        :param x: 输入 tokens 的编号序列
        :return: 输出 embedding 张量
        """
        # batch_size 一批训练数据内包含句子的数量
        # max_len 一批训练数据内单句最大长度, 且 max_len <= 位置编码初始化设定的长度
        batch_size, max_len = x.shape

        # 词嵌入
        word_embedding = self.word_embedding(x)
        # 位置编码
        # debug: 最初维度要扩充一个维度, 以实现广播机制
        # debug: position_encoding 存储在 cpu 上, 还要将其转到 cuda 上
        position_encode = self.position_embedding_map[:max_len, :].unsqueeze(0).to(device=self.device, dtype=self.dtype)
        # 两者相加再 dropout 正则化
        encode = self.dropout(word_embedding + position_encode)

        return encode

In [12]:
#####################################################################
# 层归一化算法
class LayerNorm(nn.Module):
    def __init__(self, d_model, dtype, device, epsilon=1e-7):
        """
        :param d_model: 模型嵌入维度
        :param dtype: 数据类型
        :param device: 显卡设备
        :param epsilon: 设置的一个很小很小的数
        """
        super().__init__()

        self.d_model = d_model
        self.epsilon = epsilon
        self.dtype = dtype
        self.device = device

        # 初始化可以更新的全 0 全 1 张量参数
        self.A = nn.Parameter(torch.ones(self.d_model)).to(device=self.device, dtype=self.dtype)
        self.B = nn.Parameter(torch.zeros(self.d_model)).to(device=self.device, dtype=self.dtype)

    def forward(self, x):
        # 1. 不计算无偏的, 也就是方差公式除以 N 而不是 N-1
        # 2. keepdim=True, 保留原有维度, 便于进行广播机制
        mean = torch.mean(x, dim=-1, keepdim=True)
        var = torch.var(x, dim=-1, unbiased=False, keepdim=True)
        x = self.A * (x - mean) / (var + self.epsilon) + self.B
        return x

In [13]:
#####################################################################
# 编码器部分
class Encoder(nn.Module):
    def __init__(self, max_len, d_model, n_heads, d_proj, dropout, device, dtype):
        """
        :param max_len: 单句最大长度
        :param d_model: 词嵌入向量维度
        :param n_heads: 注意力头数
        :param d_proj: 投影层的维度
        :param dropout: 正则化率
        :param device: 张量存放设备
        :param dtype: 张量数据类型
        """
        super().__init__()

        self.max_len = max_len
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_proj = d_proj
        self.dropout = dropout
        self.device = device
        self.dtype = dtype

        # 计算出每个头的子空间维度
        self.d_per_head = int(self.d_model / self.n_heads)

        # 初始化 query key value 的线性投射层
        self.Wq = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wk = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wv = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)

        # 初始化头合并的投射层, 以及前馈神经网络 FFNN
        self.Wc = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wf = nn.Sequential(nn.Linear(self.d_model, self.d_proj),
                                nn.ReLU(),
                                nn.Linear(self.d_proj, self.d_model)).to(device=self.device, dtype=self.dtype)

        # 初始化正则化
        self.Dropout = nn.Dropout(self.dropout).to(device=self.device, dtype=self.dtype)

        # 初始化层归一化
        self.layerNorm = LayerNorm(self.d_model, self.dtype, self.device)

        # 对所有线性投射层的权重进行恺明初始化
        torch.nn.init.kaiming_uniform_(self.Wq.weight)
        torch.nn.init.kaiming_uniform_(self.Wk.weight)
        torch.nn.init.kaiming_uniform_(self.Wv.weight)
        torch.nn.init.kaiming_uniform_(self.Wc.weight)
        for layer in self.Wf:
            if isinstance(layer, nn.Linear):
                torch.nn.init.kaiming_uniform_(layer.weight)

    def forward(self, input_content: Tuple[Tensor, ByteTensor]):
        """
        :param input_content: 输入内容的元组, 包含两个元素: 一个是输入张量, 另一个是注意力掩码
        """
        x, mask = input_content

        # batch_size 一个批次的句子数量
        # max_len 句子的最大 token 数量
        # d_model 嵌入维度
        batch_size, max_len, d_model = x.shape

        # x 分别当做 query key value 输入线性投射层
        # x_q, x_k, x_v: (batch_size, max_len, d_model)
        x_q, x_k, x_v = self.Wq(x), self.Wk(x), self.Wv(x)

        # 分头, 并将头分出来
        # x_q, x_k, x_v: (batch_size, n_heads, max_len, d_per_head)
        x_q = x_q.view(batch_size, max_len, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        x_k = x_k.view(batch_size, max_len, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        x_v = x_v.view(batch_size, max_len, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)

        # 注意力机制计算
        # 做乘法注意力 temp: (batch_size, n_heads, max_len, max_len)
        temp = x_q @ x_k.permute(0, 1, 3, 2)
        # 除以缩放算子
        temp /= math.sqrt(self.d_per_head)
        ########################################################################################
        # 其中 mask 必须是一个 ByteTensor, shape 必须和 a 一样, 且元素只能是 0 或者 1 .
        # 将 mask 中为 1 的元素所在的索引, 在 a 中相同的的索引处替换为 value, mask value 必须同为 tensor
        # 这里的 mask 被掩码的是 1, 没被掩码的是 0
        ########################################################################################
        # 进行自编码器掩码操作
        temp.masked_fill(mask, -1 * torch.inf)
        # 进行 softmax 归一化, dim=-1 表示只对最后的维度, 就是嵌入维度做归一化
        # attention: (batch_size, n_heads, max_len, max_len)
        attention = torch.softmax(temp, dim=-1)
        # 将注意力分数乘以 value 张量
        # value: (batch_size, n_heads, max_len, d_per_head)
        value = attention @ x_v
        # 将每个头合并
        # 若在维度变换后还需要进行 reshape 操作的话, 需要在后面加 contiguous() 保持连续 
        # total_value: (batch_size, max_len, d_model)
        total_value = value.permute(0, 2, 1, 3).contiguous().reshape((batch_size, max_len, d_model))
        # 输入头合并投影层, 输出表示自注意力模块结束
        # total_value_: (batch_size, max_len, d_model)
        total_value_ = self.Wc(total_value)
        # 马上进行正则化
        total_value_ = self.Dropout(total_value_)
        # 进行残差连接和层归一化
        last = self.layerNorm(x + total_value_)
        return (last, mask)

In [14]:
#####################################################################
# 解码器部分
class Decoder(nn.Module):
    def __init__(self, max_len, d_model, n_heads, d_proj, dropout, device, dtype):
        """
        :param max_len: 单句最大长度
        :param d_model: 词嵌入向量维度
        :param n_heads: 注意力头数
        :param d_proj: 投影层的维度
        :param dropout: 正则化率
        :param device: 张量存放设备
        :param dtype: 张量数据类型
        """
        super().__init__()

        self.max_len = max_len
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_proj = d_proj
        self.dropout = dropout
        self.device = device
        self.dtype = dtype

        # 计算出每个头的子空间维度
        self.d_per_head = int(self.d_model / self.n_heads)

        # 初始化 query key value 的线性投射
        self.Wq = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wk = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wv = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)

        # 初始化头合并的投射以及前馈神经网络
        self.Wc = nn.Linear(self.d_model, self.d_model).to(device=self.device, dtype=self.dtype)
        self.Wf = nn.Sequential(nn.Linear(self.d_model, self.d_proj),
                                nn.ReLU(),
                                nn.Linear(self.d_proj, self.d_model)).to(device=self.device, dtype=self.dtype)

        # 初始化正则化
        self.Dropout = nn.Dropout(self.dropout).to(device=self.device, dtype=self.dtype)

        # 初始化层归一化
        self.layerNorm = LayerNorm(self.d_model, self.dtype, self.device)

        # 对所有线性投射层的权重进行恺明初始化
        torch.nn.init.kaiming_uniform_(self.Wq.weight)
        torch.nn.init.kaiming_uniform_(self.Wk.weight)
        torch.nn.init.kaiming_uniform_(self.Wv.weight)
        torch.nn.init.kaiming_uniform_(self.Wc.weight)
        for layer in self.Wf:
            if isinstance(layer, nn.Linear):
                torch.nn.init.kaiming_uniform_(layer.weight)

    def forward(self, input_content: Tuple[Tensor, Tensor, Tensor, ByteTensor, ByteTensor]):
        """
        :param input_content: 输入内容的元组, 包含五个元素: 三个是 query key 和 value , 另外两个是解码注意力掩码和编码解码掩码
        """
        q, k_, v_, cross_mask, decode_mask = input_content

        k = copy.copy(k_)
        v = copy.copy(v_)

        batch_size_q, max_len_q, d_model_q = q.shape
        batch_size_k, max_len_k, d_model_k = k.shape

        # x 分别当做 query key value 输入线性投射层
        # x_q, x_k, x_v: (batch_size, max_len, d_model)
        x_q, x_k, x_v = self.Wq(q), self.Wk(q), self.Wv(q)

        # 分头, 并将头分出来
        # x_q, x_k, x_v: (batch_size, n_heads, max_len, d_per_head)
        x_q = x_q.view(batch_size_q, max_len_q, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        x_k = x_k.view(batch_size_q, max_len_q, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        x_v = x_v.view(batch_size_q, max_len_q, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)

        # 注意力机制计算
        # 做乘法注意力 temp: (batch_size, n_heads, max_len, max_len)
        temp = x_q @ x_k.permute(0, 1, 3, 2)
        # 除以缩放算子
        temp /= math.sqrt(self.d_per_head)
        ########################################################################################
        # 其中 mask 必须是一个 ByteTensor, shape 必须和 a 一样, 且元素只能是 0 或者 1 .
        # 将 mask 中为 1 的元素所在的索引, 在 a 中相同的的索引处替换为 value, mask value 必须同为 tensor
        # 这里的 mask 被掩码的是 1, 没被掩码的是 0
        ########################################################################################
        # 进行自编码器掩码操作
        temp.masked_fill(decode_mask, -1 * torch.inf)
        # 进行 softmax 归一化, dim=-1 表示只对最后的维度, 就是嵌入维度做归一化
        # attention: (batch_size, n_heads, max_len, max_len)
        attention = torch.softmax(temp, dim=-1)
        # 将注意力分数乘以 value 张量
        # value: (batch_size, n_heads, max_len, d_per_head)
        value = attention @ x_v
        # 将每个头合并
        # total_value: (batch_size, max_len, d_model)
        total_value = value.permute(0, 2, 1, 3).contiguous().reshape((batch_size_q, max_len_q, d_model))
        # 输入头合并投影层, 输出表示自注意力模块结束
        # total_value_: (batch_size, max_len, d_model)
        total_value_ = self.Wc(total_value)
        # 马上进行正则化
        total_value_ = self.Dropout(total_value_)
        # 进行残差连接和层归一化
        last = self.layerNorm(q + total_value_)

        # --------------------------------------------
        last_ = copy.copy(last)
        last_ = last_.view(batch_size_q, max_len_q, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        k = k.view(batch_size_k, max_len_k, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)
        v = v.view(batch_size_k, max_len_k, self.n_heads, self.d_per_head).permute(0, 2, 1, 3)

        # 注意力机制计算
        temp = last_ @ k.permute(0, 1, 3, 2)
        # 除以缩放算子
        temp /= math.sqrt(self.d_per_head)
        # 进行编码-解码器掩码操作
        temp.masked_fill(cross_mask, -1 * torch.inf)
        # 进行 softmax 归一化
        attention = torch.softmax(temp, dim=-1)
        # 将注意力分数乘以 value 张量
        value = attention @ v
        # 将每个头合并
        total_value = value.permute(0, 2, 1, 3).contiguous().reshape((batch_size_q, max_len_q, d_model))
        # 输入头合并投影层, 输出表示互注意力模块结束
        total_value_ = self.Wc(total_value)
        # 马上进行正则化
        total_value_ = self.Dropout(total_value_)
        # 进行残差连接和层归一化
        last_decode = self.layerNorm(last + total_value_)

        return (last_decode, k_, v_, cross_mask, decode_mask)

In [15]:
#####################################################################
# 投射到词汇表
class ProjVocab(nn.Module):
    def __init__(self, vocab, d_model, d_proj, dropout, device, dtype):
        """
        :param vocab: 词汇表数量
        :param d_model: 模型嵌入维度
        :param d_proj: 线性投射层维度
        :param dropout: 正则化率
        :param device: 显卡设备
        :param dtype: 数据类型
        """
        super().__init__()

        self.vocab = vocab
        self.d_model = d_model
        self.d_proj = d_proj
        self.dropout = dropout
        self.device = device
        self.dtype = dtype

        self.projVocab = nn.Sequential(
            nn.Linear(self.d_model, self.d_proj),
            nn.ReLU(),
            nn.Linear(self.d_proj, self.vocab)
        ).to(device=self.device, dtype=self.dtype)

        for layer in self.projVocab:
            if isinstance(layer, nn.Linear):
                torch.nn.init.kaiming_uniform_(layer.weight)

    def forward(self, x):
        x = self.projVocab(x)
        # softmax 归一化处理, 含义是 "预测出来的, 在词汇表上每个 token 出现的概率"

        return x

In [16]:
#####################################################################
# 汇总成 Transformer
class Transformer(nn.Module):
    def __init__(self, max_len, d_model, n_heads, n_layers, d_proj, vocab, dropout, device, dtype, epsilon=1e-7):
        super().__init__()

        self.max_len = max_len
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.d_proj = d_proj
        self.vocab = vocab
        self.dropout = dropout
        self.device = device
        self.dtype = dtype
        self.epsilon = epsilon

        # 每个模块类实例的时候都进行了权重初始化, 设备和数据类型的声明
        # 初始化单个编码器和解码器
        single_encoder = Encoder(self.max_len, self.d_model, self.n_heads, self.d_proj, self.dropout, self.device,
                                 self.dtype)
        single_decoder = Decoder(self.max_len, self.d_model, self.n_heads, self.d_proj, self.dropout, self.device,
                                 self.dtype)

        # 初始化编码部分和解码部分的嵌入层
        self.embedding = Embedding(self.vocab, self.max_len, self.d_model, self.dropout, self.device, self.dtype)

        # 使用列表解包的方法构建整个网络
        # 注意: nn.Sequential 构建的网络只允许单个变量输入模型中, 因此在编码器和解码器中进行了打包和解包操作
        self.encoders = nn.Sequential(
            *[single_encoder for _ in range(self.n_layers)]
        )
        self.decoders = nn.Sequential(
            *[single_decoder for _ in range(self.n_layers)]
        )

        # 初始化线性投射层
        self.projVocab = ProjVocab(self.vocab, self.d_model, self.d_proj, self.dropout, self.device, self.dtype)

    def forward(self, src_seq, trg_seq):
        """
        训练过程的 transformer 的前向推理
        :param src_seq: 源序列
        :param trg_seq: 目标序列
        :return: 每个句子每个 token 的下一个预测的 token 
        """
        # 先获得编码器掩码, 解码器掩码和编码-解码掩码
        self_mask = self.make_mask(src_seq, src_seq, "encoder")
        cross_mask = self.make_mask(trg_seq, src_seq, "encoder-decoder")
        decoder_mask = self.make_mask(trg_seq, trg_seq, "decoder")

        # 计算源序列嵌入和目标序列嵌入
        en_emb = self.embedding(src_seq)
        de_emb = self.embedding(trg_seq)

        # 将源序列嵌入和编码器掩码送入编码器计算源序列的编码信息
        encodes_tuple = self.encoders((en_emb, self_mask))
        # 从计算结果的包中得到编码
        encodes, _ = encodes_tuple
        # 将目标序列的嵌入, 两个编码 (分别做 key 和 value), 编码-解码掩码 和 解码器掩码送入解码器计算每句话每个 token 的下一个预测
        decodes = self.decoders((de_emb, encodes, encodes, cross_mask, decoder_mask))
        # 从计算结果的包中得到解码
        last_decode, _, _, _, _ = decodes
        # 将解码内容送入投射层中获得在词汇表中每个 token 的预测概率
        vocab_pos = self.projVocab(last_decode)
        return vocab_pos

    def make_mask(self, q: Tensor, k: Tensor, type: Literal["encoder", "encoder-decoder", "decoder"]):
        """
        这个类方法用于构建编码器掩码, 解码器掩码和编码-解码掩码
        :param q: query 张量
        :param k: key 张量
        :param type: 选择是编码器掩码, 解码器掩码 和编码-解码掩码
        :return: 
        """
        max_len_q = q.shape[1]  # 获得 query 和 key 的每一句话的最大 token 长度
        max_len_k = k.shape[1]

        # qMask: (batch_size, max_len_q)
        qMask = q.ne(padding_idx)  # 过滤掉被 padding 的 token
        # qMask: (batch_size, 1, max_len_q, 1)
        qMask = qMask.unsqueeze(1).unsqueeze(3)
        # qMask: (batch_size, 1, max_len_q, max_len_k)
        qMask = qMask.repeat(1, 1, 1, max_len_k)

        # kMask: (batch_size, max_len_k)
        kMask = k.ne(padding_idx)
        # kMask: (batch_size, 1, 1, max_len_k)
        kMask = kMask.unsqueeze(1).unsqueeze(2)
        # kMask: (batch_size, 1, max_len_q, max_len_k)
        kMask = kMask.repeat(1, 1, max_len_q, 1)

        Mask = qMask & kMask

        # 如果是解码器注意力时, 需要设置一个同大小的下三角掩码, 然后做与运算
        if type == "decoder":
            trigl = torch.tril(torch.ones_like(Mask))
            Mask &= trigl

        return Mask

In [17]:
transformer = Transformer(max_len=max_len,
                          d_model=d_model,
                          n_heads=n_heads,
                          n_layers=n_layers,
                          d_proj=d_proj,
                          vocab=vocab,
                          dropout=drop_prob,
                          device=device,
                          dtype=dtype)

In [18]:
total = sum([param.nelement() for param in transformer.parameters()])
print("Number of parameter: %.2fM" % (total/1e6))

Number of parameter: 7.49M


In [19]:
transformer

Transformer(
  (embedding): Embedding(
    (word_embedding): Embedding(55, 512)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoders): Sequential(
    (0): Encoder(
      (Wq): Linear(in_features=512, out_features=512, bias=True)
      (Wk): Linear(in_features=512, out_features=512, bias=True)
      (Wv): Linear(in_features=512, out_features=512, bias=True)
      (Wc): Linear(in_features=512, out_features=512, bias=True)
      (Wf): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (Dropout): Dropout(p=0.1, inplace=False)
      (layerNorm): LayerNorm()
    )
    (1): Encoder(
      (Wq): Linear(in_features=512, out_features=512, bias=True)
      (Wk): Linear(in_features=512, out_features=512, bias=True)
      (Wv): Linear(in_features=512, out_features=512, bias=True)
      (Wc): Linear(in_features=512, out_features=512, bias=True)
      (Wf): Sequent

### 模型测试

In [20]:
def test(model, num_gene: int, test_src: str, device):
    model.eval()
    with torch.no_grad():
        

        test_src_index = tokenizer.encode(test_src, return_tensor=True)

        gene_trg = tokenizer.sos_token

        for _ in range(num_gene):
            gene_trg_index = tokenizer.encode(gene_trg, return_tensor=True)
    
            output = model(test_src_index, gene_trg_index)
            output_reshape = output.contiguous().view(-1, output.shape[-1])
    
            output_words = output.softmax(dim=-1).max(dim=-1)[1]
            output_words = tokenizer.decode(output_words.data.cpu().numpy().tolist())
            output_words = tokenizer.generate(output_words)[0]

            gene_trg += output_words
            
            del gene_trg_index

    return gene_trg

In [21]:
# 只能使用一次
torch.cuda.empty_cache()

In [22]:
trained_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
trained_model = torch.load("my_transformer.pth").to(trained_device)
total = sum([param.nelement() for param in trained_model.parameters()])
print("Number of parameter: %.2fM" % (total/1e6))
print(trained_model)

Number of parameter: 7.49M
Transformer(
  (embedding): Embedding(
    (word_embedding): Embedding(55, 512)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoders): Sequential(
    (0): Encoder(
      (Wq): Linear(in_features=512, out_features=512, bias=True)
      (Wk): Linear(in_features=512, out_features=512, bias=True)
      (Wv): Linear(in_features=512, out_features=512, bias=True)
      (Wc): Linear(in_features=512, out_features=512, bias=True)
      (Wf): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (Dropout): Dropout(p=0.1, inplace=False)
      (layerNorm): LayerNorm()
    )
    (1): Encoder(
      (Wq): Linear(in_features=512, out_features=512, bias=True)
      (Wk): Linear(in_features=512, out_features=512, bias=True)
      (Wv): Linear(in_features=512, out_features=512, bias=True)
      (Wc): Linear(in_features=512, out_features=512, bia

In [23]:
print(test(trained_model, 20, ["apple"], device))

OutOfMemoryError: CUDA out of memory. Tried to allocate 962.00 MiB (GPU 0; 10.75 GiB total capacity; 7.20 GiB already allocated; 499.88 MiB free; 9.42 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF