# Transformer架构的实现及注意力机制在目前模型的实现

## Step1 导包

In [3]:
import torch 
from torch import nn
from torch.nn import functional as F

## Step2 构建注意力机制

In [4]:
class Attention(nn.Module):
    def __init__(self,input_dim,output_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.W_q = nn.Linear(input_dim , output_dim , bias= False)
        self.W_k = nn.Linear(input_dim , output_dim , bias=False)
        self.W_v = nn.Linear(input_dim , output_dim , bias=False)
    def forward(self,input): # input的是一个n*embedding_dim的东西
        W_q_out = self.W_q(input)
        W_k_out = self.W_k(input)
        W_v_out = self.W_v(input)
        attention_weight = F.softmax(torch.matmul(W_q_out,W_k_out.T) / (self.output_dim**0.5))
        output = attention_weight @ W_v_out
        return output
input_ids = torch.tensor(
    [
        [0.12,0.21,0.32,0.14,0.25],
        [0.25,0.14,0.23,0.44,0.76],
        [1.21,2.33,0.79,-0.77,1.44],
        [1.47,-1.21,-0.99,-0.11,0.33],
        [0.12,0.22,0.32,0.12,-0.22]
    ]
)
input_dim = 5
output_dim = 3
attention = Attention(input_dim,output_dim)
output = attention(input_ids)
output



  attention_weight = F.softmax(torch.matmul(W_q_out,W_k_out.T) / (self.output_dim**0.5))


tensor([[-0.3072, -0.1173,  0.1296],
        [-0.3292, -0.1604,  0.1467],
        [-0.2711, -0.0496,  0.0951],
        [-0.2993, -0.0954,  0.1194],
        [-0.2919, -0.0879,  0.1171]], grad_fn=<MmBackward0>)

## Step2 加入因果机制后的注意力机制

In [9]:
class CausalAttention(nn.Module):

    def __init__(self, d_in, d_out, context_length,
                 dropout, qkv_bias=False):
        #初始化定义网络结构和参数
        super().__init__()
        self.d_out = d_out
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key   = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.dropout = nn.Dropout(dropout) # New
        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1)) # New
        #定义QKV并对进行dropout防止过拟合
        #注册mask向量,对未来进行负无穷的拟合
    def forward(self, x):
        b, num_tokens, d_in = x.shape # New batch dimension b
        #提取batch的大小、token的数量、跟宽度
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        #进行运算计算
        attn_scores = queries @ keys.transpose(1, 2) # Changed transpose
        #通过点积来计算attention的数值
        attn_scores.masked_fill_(  # New, _ ops are in-place
            self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)  # `:num_tokens` to account for cases where the number of tokens in the batch is smaller than the supported context_size
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1## 缩放因子 √d，用于稳定梯度
        )
        #在时间顺序上进行mask确保信息不会被泄露
        attn_weights = self.dropout(attn_weights) # New
        #防止过拟合的dropout处理方式
        context_vec = attn_weights @ values
        # 根据注意力权重计算上下文向量
        return context_vec
input_ids = torch.tensor(
    [[
        [0.12,0.21,0.32,0.14,0.25],
        [0.25,0.14,0.23,0.44,0.76],
        [1.21,2.33,0.79,-0.77,1.44],
        [1.47,-1.21,-0.99,-0.11,0.33],
        [0.12,0.22,0.32,0.12,-0.22]
    ],
        [
        [0.12,0.21,0.32,0.14,0.25],
        [0.25,0.14,0.23,0.44,0.76],
        [1.21,2.33,0.79,-0.77,1.44],
        [1.47,-1.21,-0.99,-0.11,0.33],
        [0.12,0.22,0.32,0.12,-0.22]
    ]]
)
causal_attention = CausalAttention(input_dim , output_dim,5,0.5)
output = causal_attention(input_ids)
output

tensor([[[ 0.1554, -0.3540, -0.1129],
         [ 0.1743, -0.3675, -0.0673],
         [-0.0281, -0.4370, -0.2596],
         [ 0.1820,  0.0597, -0.0215],
         [ 0.1316,  0.3815,  0.1657]],

        [[ 0.0000,  0.0000,  0.0000],
         [ 0.0779, -0.1776, -0.0566],
         [-0.0403, -0.4298, -0.2922],
         [ 0.1419,  0.1511,  0.0077],
         [-0.0361, -0.2825, -0.1986]]], grad_fn=<UnsafeViewBackward0>)