In [None]:
import torch
import torch.nn as nn

# 注意力机制详解

## 第一部分：单个查询的注意力计算

### 1. 输入数据
我们定义了一个包含6个向量的输入序列，每个向量有3个维度。这模拟了自然语言处理中的词嵌入向量。

### 2. 注意力分数计算
- 选择第2个向量（索引为1）作为查询向量（query）
- 通过点积计算查询向量与所有输入向量的相似度分数
- 点积越大，表示两个向量越相似

### 3. 注意力权重归一化
比较了两种归一化方法：
- **简单归一化**：直接除以所有分数的和
- **Softmax归一化**：使用softmax函数，能够放大差异并确保权重为正数

### 4. 上下文向量计算
通过加权平均的方式，将所有输入向量根据注意力权重进行组合，得到最终的上下文向量。这个向量包含了与查询最相关的信息。

In [None]:
import torch

# Six toy input vectors with three features each (stand-ins for token embeddings).
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.55, 0.87, 0.64],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])

# The row at index 1 acts as the query for this whole cell.
query = inputs[1]

# Unnormalised attention scores: dot product of the query with every row.
attn_score_1 = torch.empty(len(inputs), dtype=torch.float32)
for i, x_i in enumerate(inputs):
    attn_score_1[i] = query.dot(x_i)
print(attn_score_1)

# Normalisation variant 1: divide each score by the sum of all scores.
attn_weight_1_tmp = torch.div(attn_score_1, attn_score_1.sum())
print(attn_weight_1_tmp)
print("Sum of attention weights:", attn_weight_1_tmp.sum())

# Normalisation variant 2: softmax over the score vector.
attn_weight_1 = torch.nn.functional.softmax(attn_score_1, dim=0)
print("Attention weights after softmax:", attn_weight_1)
print("Sum of attention weights after softmax:", attn_weight_1.sum())

# Context vector: accumulate every input row scaled by its softmax weight.
context_vector_1 = torch.zeros(inputs.size(1), dtype=torch.float32)
for i, x_i in enumerate(inputs):
    context_vector_1 += x_i * attn_weight_1[i]
print("Context vector:", context_vector_1)



## 第二部分：批量注意力计算

### 矩阵化操作
- 使用矩阵乘法 `inputs @ inputs.T` 一次性计算所有向量对之间的注意力分数
- 对每一行应用softmax获得注意力权重矩阵
- 通过矩阵乘法 `all_weights @ inputs` 得到所有位置的上下文向量

这种矩阵化实现大大提高了计算效率，是现代Transformer模型中注意力机制的标准实现方式。

In [None]:
# Dot product between every pair of input rows in a single matrix product.
all_scores = torch.matmul(inputs, inputs.T)
print("All scores (dot products):")
print(all_scores)

# Softmax along dim=1, so every row of the weight matrix sums to 1.
all_weights = torch.nn.functional.softmax(all_scores, dim=1)
print("All attention weights after softmax:")
print(all_weights)

# Each output row is the attention-weighted combination of all input rows.
all_attn_vectors = torch.matmul(all_weights, inputs)
print("All context vectors:")
print(all_attn_vectors)

## 第三部分：带可训练权重的自注意力

引入可训练的权重矩阵 `W_query`、`W_key`、`W_value`，以第2个向量（索引为1）作为查询，逐步计算缩放点积注意力的分数、权重和上下文向量。

In [None]:
# Step-by-step scaled dot-product attention with trainable projection matrices,
# using the input row at index 1 as the query token.
x_1 = inputs[1]
d_in = inputs.shape[1]  # feature size of each input row
d_out = 2               # feature size of the projected query/key/value vectors

torch.manual_seed(123)
W_query = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=True)
W_key = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=True)
W_value = torch.nn.Parameter(torch.randn(d_in, d_out),requires_grad=True)

# Projections of the single token x_1. The key/value here are projected from
# x_1 only (not from the whole `inputs` matrix), matching their singular names;
# they equal keys[1] / values[1] computed below.
query_1 = x_1 @ W_query
key_1 = x_1 @ W_key
value_1 = x_1 @ W_value
print("Query vector:", query_1)

# Projections of every input row: one key and one value vector per row.
keys = inputs @ W_key
values = inputs @ W_value
print("Keys matrix:", keys)
print("Values matrix:", values)

# Score of the query against its own key (row 1 of `keys`).
keys_1 = keys[1]
attn_score_11 = query_1.dot(keys_1)
print("Attention score for query 1 and key 1:", attn_score_11)

# Scores of the query against all keys at once.
attn_scores_1 = query_1 @ keys.T
print("All attention scores for query 1:")
print(attn_scores_1)

# Scale by sqrt(d_k) before the softmax, where d_k is the key feature size.
d_k = keys.shape[-1]
attn_weight_1 = torch.nn.functional.softmax(attn_scores_1 / d_k**0.5, dim=-1)
print("Attention weights for query 1 after scaling and softmax:")
print(attn_weight_1)

# Context vector: attention-weighted combination of the value vectors.
context_vector_1 = attn_weight_1 @ values
print("Context vector for query 1 after attention:")
print(context_vector_1)

In [None]:
#简化的自注意类
import torch.nn as nn
class SelfAttention_v1(nn.Module):
    """Single-head scaled dot-product self-attention using raw weight matrices.

    The query, key and value projections are stored as ``nn.Parameter``
    matrices of shape ``(d_in, d_out)`` initialised with ``torch.randn``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the projected vectors and of
            the returned context vectors.
    """

    def __init__(self, d_in, d_out):
        super().__init__()
        self.W_query = nn.Parameter(torch.randn(d_in, d_out))
        self.W_key = nn.Parameter(torch.randn(d_in, d_out))
        self.W_value = nn.Parameter(torch.randn(d_in, d_out))

    def forward(self, inputs):
        """Compute context vectors for every position of ``inputs``.

        Args:
            inputs: Tensor of shape ``(seq_len, d_in)`` or, with leading
                batch dimensions, ``(..., seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        query = inputs @ self.W_query
        keys = inputs @ self.W_key
        values = inputs @ self.W_value
        # Swap only the last two dimensions. `.T` reverses *all* dimensions,
        # which is only correct for 2-D input and breaks batched input.
        attn_scores = query @ keys.transpose(-2, -1)
        d_k = keys.shape[-1]
        # Scale by sqrt(d_k) before normalising over the key dimension.
        attn_weights = torch.nn.functional.softmax(attn_scores / d_k**0.5, dim=-1)
        context_vector = attn_weights @ values
        return context_vector

# Toy input: six rows with three features each.
inputs = torch.tensor([
    [0.43, 0.15, 0.89],
    [0.55, 0.87, 0.64],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
])

# Seed immediately before construction so the random weights are reproducible.
torch.manual_seed(123)
sa_v1 = SelfAttention_v1(d_in, d_out)

# One context vector per input row.
print(sa_v1(inputs))

In [None]:
##使用torch.nn.Linear初始化
class SelfAttention_v2(nn.Module):
    """Single-head scaled dot-product self-attention built on ``nn.Linear``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Size of the last dimension of the projected vectors and of
            the returned context vectors.
        qkv_bias: Whether the query/key/value linear layers have a bias term.
    """

    def __init__(self, d_in, d_out, qkv_bias=False):
        super().__init__()
        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)

    def forward(self, inputs):
        """Compute context vectors for every position of ``inputs``.

        Args:
            inputs: Tensor of shape ``(seq_len, d_in)`` or, with leading
                batch dimensions, ``(..., seq_len, d_in)``.

        Returns:
            Tensor of shape ``(..., seq_len, d_out)``.
        """
        query = self.W_q(inputs)
        keys = self.W_k(inputs)
        values = self.W_v(inputs)
        # Swap only the last two dimensions. `.T` reverses *all* dimensions,
        # which is only correct for 2-D input and breaks batched input.
        attn_scores = query @ keys.transpose(-2, -1)
        d_k = keys.shape[-1]
        # Scale by sqrt(d_k) before normalising over the key dimension.
        attn_weights = torch.nn.functional.softmax(attn_scores / d_k**0.5, dim=-1)
        context_vector = attn_weights @ values
        return context_vector

In [None]:
# 高效的多头注意力机制实现
##qkv只进行一次线性变换，再拆分成多个部分

class MutiHeadAttention(nn.Module):
    """Causal multi-head self-attention with one shared projection per Q/K/V.

    Queries, keys and values are each produced by a single ``nn.Linear`` of
    width ``d_out`` and then split into ``num_heads`` heads of size
    ``d_out // num_heads``.

    Args:
        d_in: Size of the last dimension of the input.
        d_out: Total output size across all heads; must be divisible by
            ``num_heads``.
        context_length: Side length of the causal mask buffer, i.e. the
            longest sequence ``forward`` accepts.
        num_heads: Number of attention heads.
        dropout: Probability passed to ``nn.Dropout``, applied to the
            attention weights.
        qkv_bias: Whether the query/key/value linear layers have a bias term.
            The output projection always uses ``nn.Linear``'s default bias.

    Raises:
        AssertionError: If ``d_out`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias=False):
        super().__init__()
        assert(d_out % num_heads == 0), "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        # Upper-triangular ones above the main diagonal mark the "future"
        # positions that each query must not attend to.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)
        )

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: Tensor of shape ``(batch_size, seq_len, d_in)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, d_out)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the mask's ``context_length``.
        """
        batch_size, seq_len, _ = x.shape

        # Slicing the mask cannot grow it. Without this check an over-long
        # sequence fails with an opaque broadcast error in `masked_fill`, or,
        # for a 1x1 mask, broadcasts silently and disables causal masking.
        context_length = self.mask.shape[0]
        if seq_len > context_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds context_length {context_length}"
            )

        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # (batch, seq, d_out) -> (batch, seq, heads, head_dim)
        keys = keys.view(batch_size, seq_len, self.num_heads, self.head_dim)
        values = values.view(batch_size, seq_len, self.num_heads, self.head_dim)
        queries = queries.view(batch_size, seq_len, self.num_heads, self.head_dim)

        # (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)

        # Per-head scores: (batch, heads, seq, seq)
        attn_scores = queries @ keys.transpose(2, 3)

        # -inf scores become zero weight after the softmax.
        mask_bool = self.mask[:seq_len, :seq_len].bool()
        attn_scores = attn_scores.masked_fill(mask_bool, float("-inf"))

        # Scale by sqrt(head_dim); masked entries stay -inf under the division.
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, d_out)
        context_vector = attn_weights @ values
        context_vector = context_vector.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_out)
        context_vector = self.out_proj(context_vector)
        return context_vector

class GPT2MultiHeadAttention(nn.Module):
    """Wrapper module that owns one ``MutiHeadAttention`` layer as ``self.attention``.

    All constructor arguments are forwarded unchanged to ``MutiHeadAttention``;
    the only difference is that ``dropout`` defaults to ``0.1`` here.
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout=0.1, qkv_bias=False):
        super().__init__()
        # Positional order follows MutiHeadAttention's signature.
        self.attention = MutiHeadAttention(
            d_in, d_out, context_length, num_heads, dropout, qkv_bias
        )

    def forward(self, x):
        """Return the wrapped attention layer's output for ``x``."""
        attended = self.attention(x)
        return attended

# Attention-layer settings used for this smoke test; keys match the
# keyword arguments of GPT2MultiHeadAttention.
gpt2_config = {
    'd_in': 768,
    'd_out': 768,
    'context_length': 1024,
    'num_heads': 12,
    'dropout': 0.1
}

# Seed before construction so the layer weights and the random test input
# are reproducible.
torch.manual_seed(123)
gpt2_attention = GPT2MultiHeadAttention(**gpt2_config)

batch_size = 2
seq_len = 10
test_input = torch.randn(batch_size, seq_len, gpt2_config['d_in'])

print(f"输入形状: {test_input.shape}")
output = gpt2_attention(test_input)
print(f"输出形状: {output.shape}")

# Count the parameters once and reuse the value in both reports below.
num_params = sum(p.numel() for p in gpt2_attention.parameters())
print(f"参数数量: {num_params:,}")

# Actually verify the result before announcing success.
assert output.shape == (batch_size, seq_len, gpt2_config['d_out']), output.shape

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print("\n=== GPT-2 多头注意力测试成功! ===")
print(f"模型配置: {gpt2_config['num_heads']}个头, 每个头{gpt2_config['d_out']//gpt2_config['num_heads']}维")
print(f"总参数量: {num_params:,}")
print(f"输入: {test_input.shape} -> 输出: {output.shape}")

# 第四部分：GPT-2 多头注意力机制

## 核心改进与修正

### 1. 修正的问题
- 修复了 `mask_bool()` 方法调用错误，改为直接访问 `self.mask`
- 添加了维度断言确保 `d_out` 能被 `num_heads` 整除
- 修正了 `qkv_bias` 参数传递问题
- 变量命名更清晰（`lens` → `seq_len`）

### 2. GPT-2 架构特点

#### **多头注意力机制**
- **12个注意力头**：每个头关注不同的语义模式
- **64维头维度**：768 ÷ 12 = 64，每个头处理64维子空间
- **因果掩码**：确保每个位置只能看到当前位置及其之前的信息，屏蔽所有未来位置（自回归特性）

#### **关键计算步骤**
1. **线性变换**：输入通过 Q、K、V 权重矩阵变换
2. **多头重塑**：将768维分割为12个64维的头
3. **缩放点积注意力**：每个头独立计算注意力
4. **掩码应用**：防止未来信息泄露
5. **多头拼接**：将所有头的输出concatenate
6. **输出投影**：最终的线性变换

### 3. GPT-2 Small 配置
- **模型维度**: 768
- **注意力头数**: 12
- **上下文长度**: 1024 tokens
- **参数量**: 约236万个参数（2,360,064，仅注意力层，Q/K/V 投影不含偏置）

### 4. 实际应用
这个实现遵循 GPT-2 的多头因果自注意力结构（注意：原始 GPT-2 的 Q、K、V 投影带偏置，如需与其权重对齐应设置 `qkv_bias=True`），可以用于：
- 语言建模任务
- 文本生成
- 下游微调任务

**性能特点**：通过多头并行计算，模型能够同时关注不同类型的语言模式（语法、语义、长距离依赖等）。

## ✅ GPT-2 多头注意力实现成功！

### 🎯 实现结果
- **模型架构**: 12个注意力头，每个头64维 (768 ÷ 12 = 64)
- **参数数量**: 2,360,064个可训练参数
- **输入/输出**: (batch_size, seq_len, 768) → (batch_size, seq_len, 768)

### 🔧 关键特性
1. **因果掩码**: 实现了自回归语言模型的关键特性
2. **缩放点积注意力**: 将注意力分数除以 $\sqrt{d_k}$，避免 softmax 饱和导致梯度过小
3. **多头并行**: 12个头同时处理不同的表示子空间
4. **Dropout**: 防止过拟合的正则化技术

### 📊 性能对比
- **传统单头注意力**: 只能关注一种模式
- **GPT-2多头注意力**: 12个头并行关注语法、语义、位置等多种模式

### 🚀 实际应用
这个实现遵循 OpenAI GPT-2 的多头因果自注意力结构（原始 GPT-2 的 Q、K、V 投影带偏置，对应 `qkv_bias=True`），可以用于：
- 语言建模和文本生成
- 机器翻译
- 文本摘要
- 问答系统

**下一步**: 可以将这个注意力层集成到完整的 Transformer 块中，构建完整的 GPT-2 模型！