In [1]:
import torch
import torch.nn as nn
import torch.functional as F


In [2]:
torch.__version__

'2.5.1+cu118'

In [3]:
import torch
import sys

print(f"Python版本: {sys.version}")
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA是否可用: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"当前GPU设备: {torch.cuda.current_device()}")
    print(f"GPU设备名称: {torch.cuda.get_device_name()}")
    print(f"可用GPU数量: {torch.cuda.device_count()}")
else:
    print("CUDA不可用 - 可能是CPU版本的PyTorch")

Python版本: 3.10.18 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:08:55) [MSC v.1929 64 bit (AMD64)]
PyTorch版本: 2.5.1+cu118
CUDA是否可用: True
CUDA版本: 11.8
当前GPU设备: 0
GPU设备名称: NVIDIA GeForce RTX 4060 Laptop GPU
可用GPU数量: 1


In [51]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

def scaled_dot_attention(q, k, v, mask=None):
    """
    计算点积注意力 attention = softmax(qk.T/ sqrt(d_k))v

    args:
        q (batch_size, seq_len_q, d_k), 
        k (batch_size, seq_len_k, d_k), 
        v (batch_size, seq_len_v, d_v)
    return:
        scaled_attention, 
        attentionweight
    """
    d_k = k.size(-1)

    scores = q @ k.transpose(-1,-2) / math.sqrt(d_k)

    if mask is not None:
        scores = scores.masked_fill(mask ==0, float = "-inf")
        
    attention_weight = F.softmax(scores, dim=-1)
    scaled_attention = torch.matmul(attention_weight, v)

    return scaled_attention, attention_weight

In [45]:
q = torch.randn(1,6,4)
k = torch.randn(1,4,4)
v = torch.randn(1,4,5)

scaled_attention, attention_weight = scaled_dot_attention(q,k,v)

In [20]:
scaled_attention, attention_weight

(tensor([[[ 0.7149, -0.0963, -0.1478, -0.0407,  0.2928],
          [ 0.5236, -0.5029,  0.0528, -1.0003, -0.2526],
          [ 0.4343, -0.5855,  0.0497, -0.8911, -0.2477],
          [ 0.5488, -0.5088, -0.0192, -0.3413,  0.1439],
          [ 1.1156, -0.5996,  0.1530, -0.8195,  0.5239],
          [ 0.3151, -0.5996, -0.0101, -0.5088, -0.1551]]]),
 tensor([[[0.0738, 0.1798, 0.3316, 0.4147],
          [0.5277, 0.1932, 0.1021, 0.1770],
          [0.4690, 0.2728, 0.1006, 0.1576],
          [0.1818, 0.3203, 0.2664, 0.2315],
          [0.3385, 0.0814, 0.4508, 0.1294],
          [0.2845, 0.3918, 0.1331, 0.1906]]]))

In [54]:
class MHA(nn.Module):
    def __init__(self, d_model, h):
        super().__init__()
        assert d_model % h ==0, "d_model must be divided by head number (h)"
        self.d_model = d_model
        self.h = h
        self.h_dim = d_model // h

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model) 
        self.w_v = nn.Linear(d_model, d_model) 
        self.w_o = nn.Linear(d_model, d_model) 

    def forward(self, q, k, v, mask=None):
        """
        q shape is (batch_size, seq_len_q, d_model)
        k shape is (batch_size, seq_len_k, d_model)
        v shape is (batch_size, seq_len_v, d_model)
        and seq_lenk = seq_len_v
        """
        batch_size, seq_len_q, _ = q.size()
        seq_len_k = k.size(1)

        Q = self.w_q(q).view(batch_size, seq_len_q, self.h, -1).transpose(1,2)
        K = self.w_k(k).view(batch_size, seq_len_k, self.h, -1).transpose(1,2)
        V = self.w_v(v).view(batch_size, seq_len_k, self.h, -1).transpose(1,2)

         # 检查形状
        print(f"Q shape: {Q.shape}")  # 应该是 (batch_size, h, seq_len_q, d_k)
        print(f"K shape: {K.shape}")  # 应该是 (batch_size, h, seq_len_k, d_k)

        # 计算点积注意力 (b, h, seq_q, h_dim)
        scaled_attention, attention_weight = scaled_dot_attention(Q, K, V, mask=mask)

        concat_out = scaled_attention.transpose(1,2).contiguous()
        concat_scr = concat_out.view(batch_size, -1, self.d_model)

        out = self.w_o(concat_scr)

        return out, attention_weight


In [57]:
batch_size = 1
seq_len_q = 64
seq_len_kv = 32
d_model = 128
q = torch.randn(batch_size, seq_len_q, d_model)
k = torch.randn(batch_size, seq_len_kv, d_model)
v = k
model = MHA(128, 4)
model
# print("model:", model, "\nMHA:", model(q, k, v))

MHA(
  (w_q): Linear(in_features=128, out_features=128, bias=True)
  (w_k): Linear(in_features=128, out_features=128, bias=True)
  (w_v): Linear(in_features=128, out_features=128, bias=True)
  (w_o): Linear(in_features=128, out_features=128, bias=True)
)

In [58]:
model(q, k, v)

Q shape: torch.Size([1, 4, 64, 32])
K shape: torch.Size([1, 4, 32, 32])


(tensor([[[ 0.1304, -0.0331, -0.1133,  ..., -0.0386, -0.1027, -0.0802],
          [ 0.0993, -0.0205, -0.0985,  ..., -0.0653, -0.1236, -0.0706],
          [ 0.1306, -0.0144, -0.0764,  ..., -0.0397, -0.1126, -0.0456],
          ...,
          [ 0.0430, -0.0142, -0.1107,  ..., -0.0713, -0.0989, -0.0815],
          [ 0.1026, -0.0217, -0.0900,  ..., -0.0851, -0.1228, -0.0165],
          [ 0.0868,  0.0154, -0.0860,  ..., -0.0370, -0.1157, -0.0687]]],
        grad_fn=<ViewBackward0>),
 tensor([[[[0.0514, 0.0294, 0.0280,  ..., 0.0470, 0.0182, 0.0261],
           [0.0442, 0.0377, 0.0367,  ..., 0.0371, 0.0312, 0.0278],
           [0.0231, 0.0285, 0.0609,  ..., 0.0270, 0.0462, 0.0230],
           ...,
           [0.0252, 0.0344, 0.0221,  ..., 0.0223, 0.0332, 0.0270],
           [0.0144, 0.0128, 0.0547,  ..., 0.0211, 0.0145, 0.0484],
           [0.0211, 0.0338, 0.0591,  ..., 0.0509, 0.0146, 0.0203]],
 
          [[0.0464, 0.0240, 0.0463,  ..., 0.0408, 0.0315, 0.0150],
           [0.0528, 0.0215, 0