In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM,AutoModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "../../model/Qwen3-0.6B"
model = AutoModel.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16,output_hidden_states=True,output_attentions=True).to("cuda:0")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)


In [3]:
sentence = "What is the capital of France?"
inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
outputs = model(**inputs)




In [4]:
outputs.attentions[0][0][0]

tensor([[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [6.6699e-01, 3.3276e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.1823e-01, 6.2646e-01, 2.5513e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.2690e-01, 3.1812e-01, 2.7539e-01, 7.9529e-02, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.2673e-02, 3.2642e-01, 2.1741e-01, 5.3978e-04, 4.0308e-01, 0.0000e+00,
         0.0000e+00],
        [2.3376e-01, 2.8076e-01, 1.4856e-01, 3.6255e-02, 2.8418e-01, 1.6449e-02,
         0.0000e+00],
        [2.3743e-01, 1.2128e-01, 2.6489e-01, 4.5896e-06, 1.4075e-01, 4.5896e-06,
         2.3560e-01]], device='cuda:0', dtype=torch.float16,
       grad_fn=<SelectBackward0>)

In [5]:
embeddings = model.get_input_embeddings()(inputs.input_ids)


In [26]:
embeddings.shape

torch.Size([1, 7, 1024])

In [27]:
model.layers[0].self_attn.q_proj.weight.shape

torch.Size([2048, 1024])

In [6]:
## no bias in Q, K, V
Q_parameter = model.layers[0].self_attn.q_proj.weight.T[:,:128] ## [1024,128]

K_parameter = model.layers[0].self_attn.k_proj.weight.T[:,:128] ## [1024,128]

V_parameter = model.layers[0].self_attn.v_proj.weight.T[:,:128]  ## [1024,128]


In [7]:

import torch
import math

def build_rope_freqs(seq_len, head_dim, base=10000):
    """
    构建用于 RoPE 的 sin 和 cos 位置编码表
    输出形状：(seq_len, head_dim // 2)
    """
    half_dim = head_dim // 2
    pos = torch.arange(seq_len, dtype=torch.float32)
    freqs = torch.pow(base, -torch.arange(0, half_dim, dtype=torch.float32) / half_dim)
    angles = torch.einsum('i,j->ij', pos, freqs)  # (seq_len, half_dim)

    sin = torch.sin(angles)  # (seq_len, half_dim)
    cos = torch.cos(angles)
    return sin, cos

def apply_rope(x, sin, cos):
    """
    对输入 x 应用 RoPE
    x: (batch, seq_len, head_dim)
    sin, cos: (seq_len, head_dim // 2)
    """
    x1 = x[..., ::2]  # 偶数维
    x2 = x[..., 1::2]  # 奇数维

    # 将 sin, cos 调整形状匹配 (1, seq_len, head_dim // 2)
    sin = sin.unsqueeze(0).to(x.device)
    cos = cos.unsqueeze(0).to(x.device)

    x_rotated = torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)
    return x_rotated

# 示例参数
batch = 1
seq_len = 7
head_dim = 128


# 1. 构建 RoPE 编码
sin, cos = build_rope_freqs(seq_len=seq_len, head_dim=head_dim)

# 2. 应用 RoPE


# # 3. 计算注意力分数（qk^T）-> (batch, seq_len, seq_len)
# attn_scores = torch.matmul(q_rope, k_rope.transpose(-2, -1)) / math.sqrt(head_dim)





In [None]:
q_rope = apply_rope(Q, sin, cos)
k_rope = apply_rope(K, sin, cos)

In [None]:
attn_scores = torch.matmul(q_rope, k_rope.transpose(-2, -1)) / math.sqrt(head_dim)

In [27]:
softmax(attn_scores)

tensor([[[0.1427, 0.1429, 0.1428, 0.1427, 0.1430, 0.1427, 0.1433],
         [0.1427, 0.1428, 0.1428, 0.1426, 0.1431, 0.1426, 0.1434],
         [0.1429, 0.1430, 0.1429, 0.1426, 0.1430, 0.1427, 0.1428],
         [0.1430, 0.1430, 0.1429, 0.1425, 0.1429, 0.1427, 0.1429],
         [0.1429, 0.1431, 0.1430, 0.1425, 0.1432, 0.1426, 0.1428],
         [0.1430, 0.1430, 0.1428, 0.1428, 0.1428, 0.1427, 0.1428],
         [0.1429, 0.1430, 0.1429, 0.1424, 0.1432, 0.1425, 0.1431]]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [9]:
import math
Q = embeddings[0] @ Q_parameter ## [7,128]
K = embeddings[0] @ K_parameter ## [7,128]
Q = Q.unsqueeze(0)  ## [1,7,128]
K = K.unsqueeze(0)  ## [1,7,128]

# QKt = Q @ K.T ## [7,7]
# QKt = QKt / (math.sqrt(128))  
# softmax = torch.nn.Softmax(dim=-1)
# attention = softmax(QKt)

In [None]:
QKt = q_rope[0] @ k_rope[0].T
QKt = QKt / (math.sqrt(128))  
softmax = torch.nn.Softmax(dim=-1)
attention = softmax(QKt)

In [26]:
attention 

tensor([[0.1427, 0.1429, 0.1428, 0.1427, 0.1430, 0.1427, 0.1433],
        [0.1427, 0.1428, 0.1428, 0.1426, 0.1431, 0.1426, 0.1434],
        [0.1429, 0.1430, 0.1429, 0.1426, 0.1430, 0.1427, 0.1428],
        [0.1430, 0.1430, 0.1429, 0.1425, 0.1429, 0.1427, 0.1429],
        [0.1429, 0.1431, 0.1430, 0.1425, 0.1432, 0.1426, 0.1428],
        [0.1430, 0.1430, 0.1428, 0.1428, 0.1428, 0.1427, 0.1428],
        [0.1429, 0.1430, 0.1429, 0.1424, 0.1432, 0.1425, 0.1431]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [None]:
attention

tensor([[0.1427, 0.1429, 0.1428, 0.1427, 0.1430, 0.1427, 0.1433],
        [0.1427, 0.1428, 0.1428, 0.1426, 0.1431, 0.1426, 0.1434],
        [0.1429, 0.1430, 0.1429, 0.1426, 0.1430, 0.1427, 0.1428],
        [0.1430, 0.1430, 0.1429, 0.1425, 0.1429, 0.1427, 0.1429],
        [0.1429, 0.1431, 0.1430, 0.1425, 0.1432, 0.1426, 0.1428],
        [0.1430, 0.1430, 0.1428, 0.1428, 0.1428, 0.1427, 0.1428],
        [0.1429, 0.1430, 0.1429, 0.1424, 0.1432, 0.1425, 0.1431]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [48]:
attention[:,0]

tensor([0.1427, 0.1428, 0.1428, 0.1431, 0.1428, 0.1429, 0.1429],
       device='cuda:0', dtype=torch.float16, grad_fn=<SelectBackward0>)

In [49]:
attention

tensor([[0.1427, 0.1429, 0.1428, 0.1427, 0.1429, 0.1428, 0.1431],
        [0.1428, 0.1429, 0.1429, 0.1427, 0.1431, 0.1428, 0.1427],
        [0.1428, 0.1429, 0.1428, 0.1426, 0.1431, 0.1426, 0.1431],
        [0.1431, 0.1429, 0.1428, 0.1425, 0.1429, 0.1426, 0.1431],
        [0.1428, 0.1432, 0.1429, 0.1425, 0.1432, 0.1427, 0.1427],
        [0.1429, 0.1429, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428],
        [0.1429, 0.1431, 0.1429, 0.1423, 0.1431, 0.1425, 0.1431]],
       device='cuda:0', dtype=torch.float16, grad_fn=<SoftmaxBackward0>)

In [50]:
sum(attention[:,0])

tensor(1., device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>)

In [51]:
model

Qwen3Model(
  (embed_tokens): Embedding(151936, 1024)
  (layers): ModuleList(
    (0-27): 28 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
        (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
        (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): Qwen3RMSNorm((102

In [61]:
model.config

Qwen3Config {
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "output_attentions": true,
  "output_hidden_states": true,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}