# 搭建 Transformer
- encoder + decoder
![](./img/TransformerBlock.png)

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import math


In [None]:
class TransformerMHA(nn.Module):
    """Multi-head attention: project q/k/v, attend per head, merge heads, project out."""

    def __init__(self, d_model, h):
        """
        Args:
            d_model: feature dimension of the input sequences.
            h: number of attention heads; must divide ``d_model`` evenly.
        """
        super(TransformerMHA, self).__init__()
        assert d_model % h == 0, "d_model must be divided by h (number of heads)"

        self.d_model = d_model
        self.h = h

        # Q/K/V projections plus the output projection applied after the heads are merged.
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, mask=None):
        """
        Args:
            q: queries, shape (B, seq_len_q, d_model).
            k: keys, shape (B, seq_len_kv, d_model).
            v: values, shape (B, seq_len_kv, d_model); seq_len_kv may differ
                from seq_len_q (cross-attention).
            mask: optional mask broadcastable to (B, h, seq_len_q, seq_len_kv);
                positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape (B, seq_len_q, d_model). The attention weights
            computed internally are discarded.
        """
        batch_size, seq_len_q, _ = q.size()
        # The key/value length is dimension 1; dimension 0 is the batch size.
        seq_len_kv = k.size(1)

        # Split d_model into (h, head_dim) and move the head axis forward:
        # (B, S, h, head_dim) -> (B, h, S, head_dim)
        Q = self.w_q(q).view(batch_size, seq_len_q, self.h, -1).transpose(1, 2)
        K = self.w_k(k).view(batch_size, seq_len_kv, self.h, -1).transpose(1, 2)
        V = self.w_v(v).view(batch_size, seq_len_kv, self.h, -1).transpose(1, 2)

        scaled_attention, _ = scaled_dot_product_attention(Q, K, V, mask=mask)

        # (B, h, S_q, head_dim) -> (B, S_q, h, head_dim) -> (B, S_q, d_model).
        # view() returns a new tensor, so its result must be assigned.
        concat_out = scaled_attention.transpose(1, 2).contiguous()
        concat_out = concat_out.view(batch_size, seq_len_q, self.d_model)

        out = self.fc_out(concat_out)
        return out

def scaled_dot_product_attention(Q, K, V, mask=None):
    """
    Scaled dot-product attention.

    Args:
        Q: queries, (batch_size, num_heads, seq_len_q, d_k)
        K: keys, (batch_size, num_heads, seq_len_kv, d_k)
        V: values, (batch_size, num_heads, seq_len_kv, d_v)
        mask: optional mask broadcastable to the score tensor, e.g.
            (batch_size, 1, seq_len_q, seq_len_kv),
            (1, 1, seq_len_q, seq_len_kv) or
            (batch_size, h, seq_len_q, seq_len_kv).
            Entries equal to 0 mark positions that must not be attended to.

    Returns:
        (output, attention_weights): the attention-weighted values and the
        softmax-normalised weights over the key axis.
    """
    scale = math.sqrt(Q.size(-1))

    # Negative indices are used for the transpose on purpose: they keep
    # addressing the last two axes no matter how many leading axes
    # (batch, heads, ...) the tensors carry.
    logits = (Q @ K.transpose(-2, -1)) / scale

    if mask is not None:
        # Masked positions get -inf so that softmax assigns them zero weight.
        logits = logits.masked_fill(mask == 0, float('-inf'))

    attention_weights = logits.softmax(dim=-1)
    return attention_weights @ V, attention_weights

In [3]:
class PositionwiseFeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: dimension of the input and output vectors.
            d_ff: dimension of the hidden layer.
            dropout: probability used by the dropout applied to the hidden
                activations (a regularisation measure against over-fitting).
        """
        super().__init__()
        self.proj_up = nn.Linear(d_model, d_ff)
        self.proj_down = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Expand to d_ff, apply the non-linearity, regularise, project back to d_model.
        hidden = F.relu(self.proj_up(x))
        return self.proj_down(self.dropout(hidden))

In [4]:
# Demo: a LayerNorm whose normalized_shape is the 2-D tuple (512, 512) with eps=1e-9.
LN = nn.LayerNorm((512,512), eps=1e-9)
LN  # bare expression so the notebook shows the module repr as the cell output

LayerNorm((512, 512), eps=1e-09, elementwise_affine=True)

## encoder part

In [5]:
class EncoderLayer(nn.Module):
    """One post-norm encoder layer: self-attention and FFN, each wrapped as LayerNorm(x + Dropout(sublayer(x)))."""

    def __init__(self, d_model, d_ff, h, dropout=0.1):
        """
        Args:
            d_model: embedding dimension.
            d_ff: hidden dimension of the feed-forward network.
            h: number of attention heads.
            dropout: dropout probability.
        """
        super(EncoderLayer, self).__init__()
        self.mha = TransformerMHA(d_model, h)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model

    def forward(self, x, src_mask):
        """
        Args:
            x: inputs, shape (batch_size, seq_len, d_model).
            src_mask: mask applied in self-attention.

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        # Self-attention: x serves as query, key and value; the mask is the
        # fourth argument of the attention module.
        x = self.dropout(self.mha(x, x, x, src_mask)) + x
        x = self.ln1(x)
        x = self.dropout(self.ffn(x)) + x
        x = self.ln2(x)

        return x

In [6]:
# Build one encoder layer with d_model=512, d_ff=2048, h=8; dropout keeps its 0.1 default.
encoder = EncoderLayer(512, 2048, 8)
encoder  # bare expression: the notebook prints the module structure

EncoderLayer(
  (mha): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (ffn): PositionwiseFeedForward(
    (proj_up): Linear(in_features=512, out_features=2048, bias=True)
    (proj_down): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

另一种模块化的实现：

In [7]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sub-layer: ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        feature_size: size of the feature dimension that is normalised.
        dropout: dropout probability applied to the sub-layer output.
        epsilon: small constant added inside LayerNorm to avoid division by zero.
    """

    def __init__(self, feature_size, dropout=0.1, epsilon=1e-9):
        super().__init__()
        self.norm = nn.LayerNorm(feature_size, eps=epsilon)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, sublayer):
        # Post-norm ordering: run the sub-layer, drop out its output,
        # add the residual, then normalise the sum.
        branch = self.dropout(sublayer(x))
        residual_sum = x + branch
        return self.norm(residual_sum)

In [8]:
class EncoderLayer2(nn.Module):
    """Encoder layer assembled from two SublayerConnection wrappers (self-attention, then FFN)."""

    def __init__(self, d_model, h, d_ff, dropout):
        """
        Args:
            d_model: embedding dimension.
            h: number of attention heads.
            d_ff: hidden dimension of the feed-forward network.
            dropout: dropout probability.
        """
        super().__init__()
        self.self_attn = TransformerMHA(d_model, h)  # multi-head self-attention
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)  # feed-forward network

        # One residual wrapper per sub-layer (the two residual connections
        # of the encoder in the architecture diagram).
        residual_wrappers = [SublayerConnection(d_model, dropout) for _ in range(2)]
        self.sublayers = nn.ModuleList(residual_wrappers)
        self.d_model = d_model

    def forward(self, x, src_mask):
        """
        Args:
            x: input tensor, shape (batch_size, seq_len, d_model).
            src_mask: source-sequence mask used by self-attention.

        Returns:
            Output of the encoder layer, shape (batch_size, seq_len, d_model).
        """
        def attend(inp):
            # Self-attention: the same tensor is query, key and value.
            return self.self_attn(inp, inp, inp, src_mask)

        attn_wrapper, ffn_wrapper = self.sublayers
        x = attn_wrapper(x, attend)
        return ffn_wrapper(x, self.feed_forward)

In [9]:
# Build the modular encoder layer; note the argument order here is (d_model, h, d_ff, dropout).
encoder = EncoderLayer2(512, 8, 2048, 0.1)
encoder  # bare expression: the notebook prints the module structure

EncoderLayer2(
  (self_attn): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (feed_forward): PositionwiseFeedForward(
    (proj_up): Linear(in_features=512, out_features=2048, bias=True)
    (proj_down): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sublayers): ModuleList(
    (0-1): 2 x SublayerConnection(
      (norm): LayerNorm((512,), eps=1e-09, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

# Decoder part

In [10]:
class DecoderLayer(nn.Module):
    """One post-norm decoder layer: masked self-attention, cross-attention, then FFN.

    Every sub-layer is wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.
    """

    def __init__(self, d_model, d_ff, h, dropout):
        """
        Args:
            d_model: embedding dimension.
            d_ff: hidden dimension of the feed-forward network.
            h: number of attention heads.
            dropout: dropout probability.
        """
        super(DecoderLayer, self).__init__()
        self.self_mha = TransformerMHA(d_model, h)
        # The attribute name (spelling included) is kept because it is part
        # of the module's parameter / state_dict keys.
        self.corss_attention = TransformerMHA(d_model, h)
        # Forward the configured dropout to the FFN as well, so the whole
        # layer uses a single dropout probability.
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.ln3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, menmory, src_mask, tgt_mask):
        """
        Args:
            x: decoder input, shape (batch_size, seq_len_tgt, d_model).
            menmory: encoder output ("memory"), shape
                (batch_size, seq_len_src, d_model). The parameter spelling is
                kept for backward compatibility.
            src_mask: source-sequence mask, used in cross-attention.
            tgt_mask: target-sequence mask, used in self-attention.

        Returns:
            Decoder-layer output, shape (batch_size, seq_len_tgt, d_model).
        """
        x = self.ln1(self.dropout(self.self_mha(x, x, x, tgt_mask)) + x)
        x = self.ln2(self.dropout(
            self.corss_attention(x, menmory, menmory, src_mask)
        ) + x)
        # The residual is added before normalisation, matching the two
        # sub-layers above.
        x = self.ln3(self.dropout(self.ffn(x)) + x)
        return x

In [11]:
# Build one decoder layer with d_model=512, d_ff=2048, h=8, dropout=0.1.
decoder = DecoderLayer(512, 2048, 8, 0.1)
decoder  # bare expression: the notebook prints the module structure

DecoderLayer(
  (self_mha): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (corss_attention): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (ffn): PositionwiseFeedForward(
    (proj_up): Linear(in_features=512, out_features=2048, bias=True)
    (proj_down): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (ln3): LayerNorm((512,), eps=1e-05, elemen

另一种实现

In [12]:
class DecoderLayer2(nn.Module):
    """Decoder layer assembled from three SublayerConnection wrappers."""

    def __init__(self, d_model, h, d_ff, dropout):
        """
        Args:
            d_model: embedding dimension.
            h: number of attention heads.
            d_ff: hidden dimension of the feed-forward network.
            dropout: dropout probability.
        """
        super().__init__()
        self.self_attn = TransformerMHA(d_model, h)  # masked multi-head self-attention
        self.cross_attn = TransformerMHA(d_model, h)  # multi-head cross-attention
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)  # feed-forward network

        # One residual wrapper per sub-layer (the three residual connections
        # of the decoder in the architecture diagram).
        residual_wrappers = [SublayerConnection(d_model, dropout) for _ in range(3)]
        self.sublayers = nn.ModuleList(residual_wrappers)
        self.d_model = d_model

    def forward(self, x, memory, src_mask, tgt_mask):
        """
        Args:
            x: decoder input, (batch_size, seq_len_tgt, d_model).
            memory: encoder output, (batch_size, seq_len_src, d_model).
            src_mask: source-sequence mask, used in cross-attention.
            tgt_mask: target-sequence mask, used in self-attention.

        Returns:
            Output of the decoder layer.
        """
        def masked_self_attention(inp):
            return self.self_attn(inp, inp, inp, tgt_mask)

        def cross_attention(inp):
            # Queries come from the decoder; keys and values from the encoder memory.
            return self.cross_attn(inp, memory, memory, src_mask)

        self_wrapper, cross_wrapper, ffn_wrapper = self.sublayers
        x = self_wrapper(x, masked_self_attention)
        x = cross_wrapper(x, cross_attention)
        return ffn_wrapper(x, self.feed_forward)

In [13]:
# Build the modular decoder layer; note the argument order here is (d_model, h, d_ff, dropout).
decoder = DecoderLayer2(512, 8, 2048, 0.1)
decoder  # bare expression: the notebook prints the module structure

DecoderLayer2(
  (self_attn): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (cross_attn): TransformerMHA(
    (w_q): Linear(in_features=512, out_features=512, bias=True)
    (w_k): Linear(in_features=512, out_features=512, bias=True)
    (w_v): Linear(in_features=512, out_features=512, bias=True)
    (fc_out): Linear(in_features=512, out_features=512, bias=True)
  )
  (feed_forward): PositionwiseFeedForward(
    (proj_up): Linear(in_features=512, out_features=2048, bias=True)
    (proj_down): Linear(in_features=2048, out_features=512, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sublayers): ModuleList(
    (0-2): 3 x SublayerConnection(
      (norm): LayerNorm((512,), eps=1e-09, elementwise_affine=True)
      (dropout): Dropout(p=0.1, in

# 编码器和解码器 

In [14]:
class Encoder(nn.Module):
    """Stack of N EncoderLayer modules followed by a final LayerNorm."""

    def __init__(self, d_model, d_ff, h, N, dropout=0.1):
        """
        Args:
            d_model: embedding dimension.
            d_ff: hidden dimension of the feed-forward network.
            h: number of attention heads.
            N: number of encoder layers.
            dropout: dropout probability.
        """
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(N):
            self.layers.append(EncoderLayer(d_model, d_ff, h, dropout))
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask):
        # Feed the input through every layer in order, sharing the same mask.
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        # Final layer normalisation on the stack output.
        return self.norm(hidden)
        

In [15]:
# Build a 6-layer encoder stack: d_model=512, d_ff=2048, h=8, N=6, dropout=0.1.
EncoderBlock = Encoder(512, 2048, 8, 6, 0.1)
EncoderBlock  # bare expression: the notebook prints the module structure

Encoder(
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (mha): TransformerMHA(
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (ffn): PositionwiseFeedForward(
        (proj_up): Linear(in_features=512, out_features=2048, bias=True)
        (proj_down): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

In [16]:
class Decoder(nn.Module):
    """Stack of N DecoderLayer modules followed by a final LayerNorm."""

    def __init__(self, d_model, d_ff, h, N, dropout=0.1):
        """
        Args:
            d_model: embedding dimension.
            d_ff: hidden dimension of the feed-forward network.
            h: number of attention heads.
            N: number of decoder layers.
            dropout: dropout probability.
        """
        super().__init__()
        self.layers = nn.ModuleList()
        for _ in range(N):
            self.layers.append(DecoderLayer(d_model, d_ff, h, dropout))
        self.norm = nn.LayerNorm(d_model)  # final layer normalisation

    def forward(self, x, memory, src_mask, tgt_mask):
        """
        Args:
            x: decoder input, (batch_size, seq_len_tgt, d_model).
            memory: encoder output, (batch_size, seq_len_src, d_model).
            src_mask: source-sequence mask for cross-attention.
            tgt_mask: target-sequence mask for self-attention.

        Returns:
            Output of the decoder stack after the final LayerNorm.
        """
        hidden = x
        for block in self.layers:
            # Every layer sees the same encoder memory and the same masks.
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [17]:
# Build a 6-layer decoder stack: d_model=512, d_ff=2048, h=8, N=6, dropout=0.1.
DecoderBlock = Decoder(512, 2048, 8, 6, 0.1)
DecoderBlock  # bare expression: the notebook prints the module structure

Decoder(
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_mha): TransformerMHA(
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (corss_attention): TransformerMHA(
        (w_q): Linear(in_features=512, out_features=512, bias=True)
        (w_k): Linear(in_features=512, out_features=512, bias=True)
        (w_v): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
      )
      (ffn): PositionwiseFeedForward(
        (proj_up): Linear(in_features=512, out_features=2048, bias=True)
        (proj_down): Linear(in_features=2048, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_

其他组件

In [18]:
def create_padding_mask(seq, pad_token_id=0):
    """Return a boolean mask that is True wherever ``seq`` is not the padding id.

    Two singleton axes are inserted after the first axis, so a
    (batch_size, seq_len) input yields a (batch_size, 1, 1, seq_len) mask.
    """
    is_real_token = seq != pad_token_id
    return is_real_token[:, None, None]

def create_look_ahead_mask(size):
    """Return a (size, size) boolean lower-triangular mask.

    Entry (i, j) is True when j <= i, i.e. position i may attend to itself
    and to earlier positions only.
    """
    lower_triangle = torch.ones(size, size).tril()
    return lower_triangle.bool()

In [None]:
class Embeddings(nn.Module):
    """
    Map token IDs to embedding vectors scaled by sqrt(d_model).

    Args:
        vocab_size: size of the vocabulary.
        d_model: dimension of the embedding vectors (hidden size).
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.scaled_factor = math.sqrt(d_model)

    def forward(self, x):
        # Look up the embeddings, then multiply by the constant scale factor.
        return self.embed(x) * self.scaled_factor

class PositionEmbeding(nn.Module):
    """Sinusoidal positional encoding added to the input, followed by dropout."""

    def __init__(self,d_model, dropout =0.1, max_len=5000):
        """
        Args:
            d_model: embedding dimension (even or odd).
            dropout: dropout probability applied after adding the encoding.
            max_len: largest sequence length for which encodings are precomputed.
        """
        super(PositionEmbeding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)  # "Residual Dropout", section 5.4 of the paper

        # Encoding table of shape (max_len, d_model); positions as a
        # (max_len, 1) column so they broadcast against the frequencies.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)

        # Frequencies:
        # pos / (10000^(2i/d_model))
        # = pos * exp(log(10000^(-2i/d_model)))
        # = pos * exp(-2i/d_model * log(10000))
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        # Even columns hold sin, odd columns hold cos. For an odd d_model
        # there is one cos column fewer than sin columns, so the angle table
        # is trimmed to d_model // 2 columns for the cos half.
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 so the table broadcasts over the batch:
        # input is (batch_size, seq_len, d_model).
        pe = pe.unsqueeze(0)

        # Registered as a buffer: saved and moved with the module, but not
        # updated as a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the first ``x.size(1)`` position encodings to ``x`` and apply dropout."""
        x = x + self.pe[:, :x.size(1), :]
        x = self.dropout(x)
        return x
    
class SourceEmbedding(nn.Module):
    """Embedding pipeline for the source (input) sequence: token embedding + positional encoding."""

    def __init__(self, d_model, src_vocab_size, dropout=0.1):
        """
        Args:
            d_model: dimension of the embedding vectors.
            src_vocab_size: size of the source-language vocabulary.
            dropout: dropout probability handed to the positional encoding.
        """
        super().__init__()
        self.embed = Embeddings(src_vocab_size, d_model)
        self.positional_encoding = PositionEmbeding(d_model, dropout)

    def forward(self, x):
        token_vectors = self.embed(x)
        # Result shape: (batch_size, seq_len_src, d_model)
        return self.positional_encoding(token_vectors)
    
class TargetEmbedding(nn.Module):
    """Embedding pipeline for the target (output) sequence: token embedding + positional encoding."""

    def __init__(self, d_model, tgt_vocab_size, dropout=0.1):
        """
        Args:
            d_model: dimension of the embedding vectors.
            tgt_vocab_size: size of the target-language vocabulary.
            dropout: dropout probability handed to the positional encoding.
        """
        super().__init__()
        self.embed = Embeddings(tgt_vocab_size, d_model)
        self.positional_encoding = PositionEmbeding(d_model, dropout)

    def forward(self, x):
        token_vectors = self.embed(x)
        # Result shape: (batch_size, seq_len_tgt, d_model)
        return self.positional_encoding(token_vectors)

# 最后一步构建完整的 Transformer

In [20]:
class Transformer(nn.Module):
    """Full encoder-decoder Transformer producing target-vocabulary logits."""

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, N, h, d_ff, dropout =0.1):
        """
        Args:
            src_vocab_size: size of the source-language vocabulary.
            tgt_vocab_size: size of the target-language vocabulary.
            d_model: embedding dimension.
            N: number of layers in the encoder and in the decoder.
            h: number of attention heads.
            d_ff: hidden dimension of the feed-forward networks.
            dropout: dropout probability.
        """
        super(Transformer, self).__init__()

        # Token embedding + positional encoding for inputs and outputs.
        self.src_embed = SourceEmbedding(d_model, src_vocab_size, dropout)
        self.tgt_embed = TargetEmbedding(d_model, tgt_vocab_size, dropout)

        # Encoder and decoder stacks.
        self.encoder = Encoder(d_model, d_ff, h, N, dropout)
        self.decoder = Decoder(d_model, d_ff, h, N, dropout)

        # Final projection to target-vocabulary logits.
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        """
        Args:
            src: source token IDs, shape (batch_size, seq_len_src).
            tgt: target token IDs, shape (batch_size, seq_len_tgt).

        Returns:
            Logits (no softmax applied), shape
            (batch_size, seq_len_tgt, tgt_vocab_size).
        """
        # Padding mask for the source. The look-ahead mask is built from the
        # target *length* (the helper takes an integer size, not the tensor)
        # and moved to the input's device so it can be used with the scores.
        src_mask = create_padding_mask(src)
        tgt_mask = create_look_ahead_mask(tgt.size(1)).to(tgt.device)

        # Encode the source, then decode the target against the encoder output.
        encoder_out = self.encoder(self.src_embed(src), src_mask)
        decoder_out = self.decoder(self.tgt_embed(tgt), encoder_out, src_mask, tgt_mask)

        out = self.fc_out(decoder_out)
        return out

In [21]:
# Vocabulary sizes (depend on the dataset)
src_vocab_size = 5000  # source-language vocabulary size
tgt_vocab_size = 5000  # target-language vocabulary size

# Transformer "base" hyper-parameters
d_model = 512      # embedding dimension
N = 6              # number of encoder and decoder layers
h = 8              # number of attention heads
d_ff = 2048        # hidden dimension of the feed-forward network
dropout = 0.1      # dropout probability

# Instantiate the model
model = Transformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=d_model,
    N=N,
    h=h,
    d_ff=d_ff,
    dropout=dropout
)

# Print the model architecture
print(model)

Transformer(
  (src_embed): SourceEmbedding(
    (embed): Embeddings(
      (embed): Embedding(5000, 512)
    )
    (positional_encoding): PositionEmbeding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (tgt_embed): TargetEmbedding(
    (embed): Embeddings(
      (embed): Embedding(5000, 512)
    )
    (positional_encoding): PositionEmbeding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (mha): TransformerMHA(
          (w_q): Linear(in_features=512, out_features=512, bias=True)
          (w_k): Linear(in_features=512, out_features=512, bias=True)
          (w_v): Linear(in_features=512, out_features=512, bias=True)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
        )
        (ffn): PositionwiseFeedForward(
          (proj_up): Linear(in_features=512, out_features=2048, bias=True)
          (proj_down): Linear(in_features=2048, out_features=512, b

**对照 PyTorch 内置的 `nn.Transformer`**

In [22]:
# Reference: PyTorch's built-in nn.Transformer configured with the same sizes
# (d_model=512, 8 heads, 6 encoder and 6 decoder layers, FFN width 2048).
transformer_model = nn.Transformer(
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    dim_feedforward=2048
)

# Print the module structure directly
print(transformer_model)



Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, o