## Attention Is All You Need

- `Transformer` 结构

<div
    style="width: 600px; background-color: white; margin: 0 auto; padding: 20px"
>
    <img src="https://arxiv.org/html/1706.03762v7/extracted/1706.03762v7/Figures/ModalNet-21.png" alt="结构">
</div>

### 第一步：分词器

In [11]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel # huggingface的tokenizer
from pprint import pprint

# STEP 1: tokenizer
# Load the pretrained BERT tokenizer and model; the model is used here only
# as the source of the input (token) embedding layer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")
sentences = ["I love machine learning!!", "Transformers are powerful models"]
encodings = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True) # list of sentences -> batch of token-id tensors
embedding_layer = model.get_input_embeddings()  # nn.Embedding(vocab_size, hidden_size)
# Print the results
print("原始句子:", sentences)
print("分词+嵌入结果(句子被分成最小单位[tokenizer]，每个单位对应一个词嵌入向量(这里使用的是bert的嵌入层1*768))[embedding]:")

# Walk every sentence token by token and show: token id --> token text -->
# first five embedding values plus the embedding's shape.
for token_ids in encodings["input_ids"]:
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    for tid, tok in zip(token_ids, tokens):
        embed = embedding_layer(tid.unsqueeze(0)) # look up the embedding for this single token id
        print(f"{tid.item():>5}  -->  {tok:>13} --> {embed.squeeze(0).detach().numpy()[:5]} ... {embed.shape}")
    print()

print("Attention Mask(0表示padding):\n", encodings["attention_mask"]) # NOTE: 0 marks padding positions
print("词表大小:", tokenizer.vocab_size)
print("嵌入层:", embedding_layer)
# Embed the whole batch in one call; this tensor is the output of step 1.
embeddings = embedding_layer(encodings["input_ids"])
print("句子转化后的token IDs:\n", encodings["input_ids"])
print("第一步的输出(输入句子的嵌入表示):\n", embeddings)
print(embeddings.shape, "<-- (batch_size, sequence_length, hidden_size)")

原始句子: ['I love machine learning!!', 'Transformers are powerful models']
分词+嵌入结果(句子被分成最小单位[tokenizer]，每个单位对应一个词嵌入向量(这里使用的是bert的嵌入层1*768))[embedding]:
  101  -->          [CLS] --> [ 0.0136303  -0.02649042 -0.02350313 -0.00778762  0.0085892 ] ... torch.Size([1, 768])
 1045  -->              i --> [-0.02108689  0.005904   -0.01792564 -0.00347791  0.02398458] ... torch.Size([1, 768])
 2293  -->           love --> [ 0.06090903 -0.01906963 -0.01657766  0.02639303  0.03516982] ... torch.Size([1, 768])
 3698  -->        machine --> [ 0.02177574  0.01318982 -0.04846223 -0.03202071 -0.02587842] ... torch.Size([1, 768])
 4083  -->       learning --> [-0.099035   -0.03934661 -0.01086605  0.01564737 -0.00137342] ... torch.Size([1, 768])
  999  -->              ! --> [ 0.02978682 -0.03725905 -0.0356083  -0.08191887 -0.03916362] ... torch.Size([1, 768])
  999  -->              ! --> [ 0.02978682 -0.03725905 -0.0356083  -0.08191887 -0.03916362] ... torch.Size([1, 768])
  102  -->          [SEP] --> [-

### 第二步：位置编码

In [4]:
# SOL: positional encoding的实现(完整可见PositionalEncoding.py)
import torch
import math
import pandas as pd
import altair as alt
def positional_encoding(d_model, max_len=5000):
    """Build the fixed sinusoidal positional-encoding table.

    Even feature indices hold ``sin(pos * w_i)`` and odd feature indices hold
    ``cos(pos * w_i)``, where ``w_i = exp(-(2 * i) * ln(10000) / d_model)``.

    Args:
        d_model: Size of the feature (embedding) dimension. Both even and odd
            values are supported.
        max_len: Maximum sequence length, i.e. the number of positions to
            precompute.

    Returns:
        A float tensor of shape ``(1, max_len, d_model)``; the leading axis of
        size 1 is added with ``unsqueeze(0)``.
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
    # One frequency per sin/cos pair: ceil(d_model / 2) entries.
    div_term = torch.exp(
        torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
    )
    # Compute the phase angles once and reuse them for both sin and cos.
    angles = position * div_term  # (max_len, ceil(d_model / 2))
    pe[:, 0::2] = torch.sin(angles)
    # There are only floor(d_model / 2) odd columns, so drop the unpaired last
    # frequency when d_model is odd (a no-op slice when d_model is even).
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe.unsqueeze(0)
# Build the table and confirm its shape is (1, max_len, d_model).
pe = positional_encoding(d_model=768, max_len=64)  # d_model=768 matches the 768-dim BERT token embeddings; 64 positions are precomputed
print(pe.shape)

torch.Size([1, 64, 768])


In [5]:
# print(embeddings)
import torch
import pandas as pd
import altair as alt
from tools.PositionalEncoding import PositionalEncoding

# NOTE: the previous step actually used max_len=8 and d_model=768; a 20-dim
# model is used here only so that the curves are easy to read.
pe = PositionalEncoding(d_model=20, dropout=0)
x = torch.zeros(1, 100, 20)  # stand-in input: one sentence, 100 tokens, 20-dim embedding per token
y = pe.forward(x)

# Embedding dimensions to draw (pick any you like).
dims_to_plot = [4, 5, 6, 7]

# Long-format table for Altair: one frame per dimension, stacked together.
positions = list(range(100))
frames = [
    pd.DataFrame({
        "embedding": y[0, :, dim],
        "position": positions,
        "dimension": dim
    })
    for dim in dims_to_plot
]
data = pd.concat(frames)

# Line chart: one coloured line per selected dimension.
lines = alt.Chart(data).mark_line()
lines = lines.encode(
    x="position",
    y="embedding",
    color=alt.Color("dimension:N", title="Dimension:N")
)
chart = lines.properties(width=800, height=400).interactive()

# Dashed red rule and text label marking position x=40.
rule_frame = pd.DataFrame({"x": [40]})
vline = alt.Chart(rule_frame).mark_rule(color="red", strokeDash=[5,5]).encode(x="x")
label_frame = pd.DataFrame({"x": [40], "label": ["x=40"]})
label = (
    alt.Chart(label_frame)
    .mark_text(align="left", dx=5, dy=-5, color="red")
    .encode(x="x", text="label")
)

chart_with_line = chart + vline + label

print("红色竖线表示在x=40这个位置，例如token(learning)不同维度需要加上的位置信息")
display(chart_with_line)
# Optional heatmap view of the same data (disabled):
# heatmap = (
#     alt.Chart(data)
#     .mark_rect()
#     .encode(
#         x="position:O",
#         y="dimension:O",
#         color="embedding:Q"
#     )
#     .properties(width=800, height=400)
# )

# display(heatmap)

红色竖线表示在x=40这个位置，例如token(learning)不同维度需要加上的位置信息
