### Assemble

```text
EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0-1): 2 x EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0-1): 2 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )
  (decoder): Decoder(
    (layers): ModuleList(
      (0-1): 2 x DecoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (src_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0-2): 3 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )
  (src_embed): Sequential(
    (0): Embeddings(
      (lut): Embedding(11, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (tgt_embed): Sequential(
    (0): Embeddings(
      (lut): Embedding(11, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (generator): Generator(
    (proj): Linear(in_features=512, out_features=11, bias=True)
  )
)
```

In [1]:
# def make_model(
#     src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1, log_on = False
# ):
#     "#SOL(辅助函数): 从超参数构建模型"
#     c = copy.deepcopy
#     attn = MultiHeadedAttention(h, d_model)
#     ff = PositionwiseFeedForward(d_model, d_ff, dropout)
#     position = PositionalEncoding(d_model, dropout)
#     model = EncoderDecoder(
#         Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N), # HL: encoder
#         Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N), # HL: decoder
#         nn.Sequential(Embeddings(d_model, src_vocab), c(position)), # HL: source embeddings
#         nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)), # HL: target embeddings
#         Generator(d_model, tgt_vocab), # HL: generator
#     )

#     # See https://arxiv.org/pdf/1502.01852.pdf (He et al., 2015, 关于权重初始化的讨论)
#     # NT: 初始化网络权重 (Glorot / fan_avg, 即 Xavier 初始化; Glorot & Bengio, 2010)
#     for p in model.parameters():
#         if p.dim() > 1:
#             nn.init.xavier_uniform_(p) # U(-limit, limit), limit = gain * \sqrt{6 / (fan_in + fan_out)}

#     # ✅ 如果需要 logging，给每个子模块加 log wrapper
#     if log_on:
#         for name, module in model.named_modules():
#             add_logging(module, enabled=True)
#     return model

from tools.make_model import make_model

In [2]:
# Smoke test: tokenize a small batch with the BERT tokenizer, build the
# encoder-decoder via make_model, and run a single forward pass.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example inputs: a batch of 16 sentences (several repeated) plus one long
# paragraph, so that padding=True has to pad the short ones to a common length.
# sentence = "I love machine learning"
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Hello, how are you doing today?",
    "Natural language processing is a fascinating field of study.",
    "The weather is really nice outside this morning.",
    "The quick brown fox jumps over the lazy dog.",
    "Hello, how are you doing today?",
    "Natural language processing is a fascinating field of study.",
    "The weather is really nice outside this morning.",
    "The quick brown fox jumps over the lazy dog.",
    "Hello, how are you doing today?",
    "Natural language processing is a fascinating field of study.",
    "The weather is really nice outside this morning.",
    "This is the sixteenth and final sentence in our batch.",
    "The weather is really nice outside this morning.",
    "This is the sixteenth and final sentence in our batch.",
    "The curious traveler walked along the ancient path, carrying a notebook filled with questions about history, philosophy, language, and the endless stories of civilizations that had risen and fallen. Each step he took revealed fragments of forgotten wisdom: the ruins of temples, broken columns, faded carvings, and scattered pottery, all whispering about the lives of people who once dreamed, loved, fought, and created with astonishing determination. He reflected on how knowledge was not a fixed possession but a flowing river, always reshaping itself as generations asked new questions and found new answers, often discovering contradictions that forced them to think deeper. The traveler ..."
]

# Tokenize the whole batch at once and return PyTorch tensors.
tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

print(tokens.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
# print(tokens["input_ids"].shape)   # [batch_size, seq_len]

# Source and target share the tokenizer's vocabulary.
src_vocab = tgt_vocab = tokenizer.vocab_size  # 30522
# log_on=True presumably enables the per-module [LOG] shape tracing seen in the
# cell output — confirm against tools.make_model.
model = make_model(src_vocab, tgt_vocab, N=6, d_model=768, d_ff=2048, h=8, dropout=0.1, log_on=True)

# (disabled) print the total number of model parameters
# print("模型参数数量:", sum(p.numel() for p in model.parameters()))

src = tokens["input_ids"]  # [batch_size, seq_len]
tgt = tokens["input_ids"]  # smoke test only: the target reuses the source token ids
# print(src.shape, tgt.shape)
# The masks have to be passed to the model as well.
src_mask = tokens["attention_mask"].unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
# print(src_mask.shape)
# No decoder mask for this test.
# NOTE(review): without a causal mask the decoder is presumably free to attend
# to later target positions — fine for a shape check, not for training.
tgt_mask = None

out = model(src, tgt, src_mask, tgt_mask)
# Original note says [batch, seq_len, tgt_vocab].
# NOTE(review): confirm — if EncoderDecoder.forward does not apply the
# generator, the last dimension would be d_model instead of tgt_vocab.
print("模型输出 shape:", out.shape)

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[LOG] EncoderDecoder.forward 输入: [torch.Size([16, 128]), torch.Size([16, 128]), torch.Size([16, 1, 1, 128])]
[LOG] Sequential.forward 输入: [torch.Size([16, 128])]
[LOG] Embeddings.forward 输入: [torch.Size([16, 128])]
[LOG] Embedding.forward 输入: [torch.Size([16, 128])]
[LOG] Embedding.forward 输出: torch.Size([16, 128, 768])
[LOG] Embeddings.forward 输出: torch.Size([16, 128, 768])
[LOG] PositionalEncoding.forward 输入: [torch.Size([16, 128, 768])]
[LOG] Dropout.forward 输入: [torch.Size([16, 128, 768])]
[LOG] Dropout.forward 输出: torch.Size([16, 128, 768])
[LOG] PositionalEncoding.forward 输出: torch.Size([16, 128, 768])
[LOG] Sequential.forward 输出: torch.Size([16, 128, 768])
[LOG] Encoder.forward 输入: [torch.Size([16, 128, 768]), torch.Size([16, 1, 1, 128])]
[LOG] EncoderLayer.forward 输入: [torch.Size([16, 128, 768]), torch.Size([16, 1, 1, 128])]
[LOG] SublayerConnection.forward 输入: [torch.Size([16, 128, 768])]
[LOG] LayerNorm.forward 输入: 

- `Transformer` 结构

<div
    style="width: 600px; background-color: white; margin: 0 auto; padding: 20px"
>
    <img src="https://arxiv.org/html/1706.03762v7/extracted/1706.03762v7/Figures/ModalNet-21.png" alt="结构">
</div>