In [71]:
import math

import torch
from transformers import AutoModel,AutoTokenizer

In [2]:
# Local checkpoint directory of bert-base-uncased (path is relative to this notebook).
model_name = "../../model/bert-base-uncased"
# Fall back to CPU so the notebook still runs on machines without a CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ask the model to return per-layer hidden states and attention maps,
# which the later cells inspect.
model = AutoModel.from_pretrained(model_name,output_hidden_states=True,output_attentions=True).to(device)
# Single example sentence used throughout the notebook.
sentences = "this is a test sentence"

### model parameters

1.模型详细结构
- model

2.模型整体结构
- model.config

3.模型参数
- model.num_parameters()
- eps 相当于$\epsilon$
- elementwise_affine=True 表示 LayerNorm 带有可学习的逐元素仿射参数，即下式中的 weight 和 bias（不仅仅是 bias）
$$\mathrm{output}=\mathrm{weight}\cdot\frac{\mathrm{input}-\mu}{\sqrt{\sigma^2+\epsilon}}+\mathrm{bias}$$

- model.encoder.layer[0].output.LayerNorm.weight :参考norm层详细的参数（BERT 模型没有 model.norm 属性，LayerNorm 位于各子模块内部）

#### model(**tokens)
1.model(**tokens) 解析
- 输出得到last_hidden_state、pooler_output
- last_hidden_state维度:(batch_size,sequence_length,hidden_size)
- pooler_output维度:(batch_size,hidden_size)
- 可将两者用于下游任务，例如分类等任务
- pooler_output == model.pooler(last_hidden_state)


In [21]:
# Tokenise the sample sentence and run a single forward pass.
# NOTE: `input` shadows the Python builtin; the name is kept because a later
# cell reads `input.input_ids` / `input.token_type_ids`.
input = tokenizer(sentences,return_tensors="pt")
# Move the encoded batch to whatever device the model lives on instead of
# hard-coding "cuda", so inputs and weights can never end up on different devices.
output = model(**input.to(model.device))




### tokenizer 

1.查看分词情况:
- tokenizer.tokenize(sentences)

2.编码解码
- tokenizer.encode(sentences) ≈ tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences))，区别在于 encode 默认还会在首尾加上特殊符号 [CLS]、[SEP]
- tokenizer.decode(ids) ≈ 先 tokenizer.convert_ids_to_tokens(ids)，再把 token 拼接还原成字符串

3.特殊编码
- tokenizer.special_tokens_map

4.词汇表
- tokenizer.vocab

5.输出解析
- input_ids:对输入进行vocab表的映射，即将自然语言转换成模型理解的语言
- attention_mask:标记哪些位置需要被关注（1 表示真实token，0 表示padding，padding位置会被注意力忽略）
    - 如果有很多句子，len(attention_mask)等于最长句子的长度，目的是为了能够批次化输入，确保所有句子的输入长度都是一样的，跟padding=True相结合理解
- token_type_ids:用于区分不同的句子，len(token_type_ids)==len(attention_mask),一般用于句子对相关的任务，例如判断第二句是否为第一句的下一句（Next Sentence Prediction）

6.模型最大输入长度
- tokenizer.model_max_length

### attention解析


$$Attention(Q,K,V) = softmax(\frac{QK^{T}}{\sqrt {d_k}})V$$

- output.attentions[0].shape 维度:(batch_size,num_heads,tokens_length,tokens_length)
- model.encoder.layer[0].attention.self.query.weight.T[:,:64]
    - 表示的是第一头的参数

In [None]:
# Attention probabilities of layer 0, batch item 0, head 0.
output.attentions[0][0, 0]

tensor([[0.0777, 0.1372, 0.0554, 0.1955, 0.0559, 0.0735, 0.4049],
        [0.2865, 0.0584, 0.2193, 0.0441, 0.1009, 0.1757, 0.1150],
        [0.2156, 0.0881, 0.1929, 0.0755, 0.1443, 0.2072, 0.0762],
        [0.1419, 0.1552, 0.1616, 0.1128, 0.1338, 0.2101, 0.0847],
        [0.0772, 0.0861, 0.2310, 0.0518, 0.0810, 0.2867, 0.1864],
        [0.1058, 0.0736, 0.2011, 0.0416, 0.3280, 0.1485, 0.1014],
        [0.1396, 0.1613, 0.0944, 0.1794, 0.0649, 0.1575, 0.2029]],
       device='cuda:0', grad_fn=<SelectBackward0>)

In [None]:
# Run only the embedding sub-module on the tokenised sentence.
# Result shape noted by the author: torch.Size([1, 7, 768]).
embedding = model.embeddings(
    input_ids=input.input_ids,
    token_type_ids=input.token_type_ids,
)

In [None]:
# Slice out the query/key/value projection parameters used as head 0 of
# encoder layer 0. The weights are transposed so that `x @ W` works directly,
# and the first 64 output columns / bias entries are taken as the first head.
self_attn = model.encoder.layer[0].attention.self
head_dim = 64

Q_parameter = self_attn.query.weight.T[:, :head_dim]  # [768, 64]
Q_bias = self_attn.query.bias[:head_dim]  # [64]

K_parameter = self_attn.key.weight.T[:, :head_dim]  # [768, 64]
K_bias = self_attn.key.bias[:head_dim]  # [64]

V_parameter = self_attn.value.weight.T[:, :head_dim]  # [768, 64]
V_bias = self_attn.value.bias[:head_dim]  # [64]

In [None]:
# Reproduce head 0's attention probabilities by hand:
#   softmax(Q K^T / sqrt(d_k))
# `math` is imported in the notebook's top import cell.
Q = embedding[0] @ Q_parameter + Q_bias  # [7, 64] for the sample sentence
K = embedding[0] @ K_parameter + K_bias  # [7, 64]
# Read the per-head width from the tensor instead of hard-coding 64, so the
# scale stays correct if a different head size is sliced out above.
d_k = Q.shape[-1]
QKt = (Q @ K.T) / math.sqrt(d_k)  # [7, 7] scaled dot-product scores
softmax = torch.nn.Softmax(dim=-1)  # normalise each query row over the keys
attention = softmax(QKt)
attention

tensor([[0.0777, 0.1372, 0.0554, 0.1955, 0.0559, 0.0735, 0.4049],
        [0.2865, 0.0584, 0.2193, 0.0441, 0.1009, 0.1757, 0.1150],
        [0.2156, 0.0881, 0.1929, 0.0755, 0.1443, 0.2072, 0.0762],
        [0.1419, 0.1552, 0.1616, 0.1128, 0.1338, 0.2101, 0.0847],
        [0.0772, 0.0861, 0.2310, 0.0518, 0.0810, 0.2867, 0.1864],
        [0.1058, 0.0736, 0.2011, 0.0416, 0.3280, 0.1485, 0.1014],
        [0.1396, 0.1613, 0.0944, 0.1794, 0.0649, 0.1575, 0.2029]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [None]:
# Value projection for head 0, then weight the values by the attention probabilities.
V = embedding[0] @ V_parameter + V_bias  # [7, 64]
attention_embedding = attention @ V  # [7, 64]
# This is the context of a single head only; per the original note, what is
# actually passed on is the concatenation of all 12 heads.
# Display the shape so a re-run reproduces the output recorded for this cell.
attention_embedding.shape

torch.Size([7, 64])

### bert过程
![](./attention%20is%20all%20you%20need.jpg)

In [145]:
# First entry of the returned hidden states; used below as the input to
# encoder layer 0. Presumably this is the embedding-layer output — TODO confirm
# against the Transformers docs for `hidden_states`.
token_embedding = output.hidden_states[0]

In [157]:
# Walk through encoder layer 0 one sub-module at a time.
first_layer = model.encoder.layer[0]

# Multi-head self-attention; the call returns a tuple whose element 0 is the
# attention result tensor.
multi_head_attention = first_layer.attention.self(token_embedding)

# First Add & Norm: the `attention.output` sub-module receives the attention
# result together with the layer input (token_embedding).
add_norm = first_layer.attention.output(multi_head_attention[0], token_embedding)

# Feed Forward: the `intermediate` sub-module applied to the first Add & Norm result.
feed_forward = first_layer.intermediate(add_norm)

# Second Add & Norm: the `output` sub-module receives the feed-forward result
# together with the first Add & Norm result.
add_norm_ff = first_layer.output(feed_forward, add_norm)

In [151]:
# Display the value returned by the self-attention sub-module (a tuple wrapping one tensor).
multi_head_attention

(tensor([[[ 0.2979,  0.0801, -0.0037,  ..., -0.0142,  0.1290,  0.0828],
          [ 0.3935,  0.1356, -0.0920,  ...,  0.0211,  0.1677,  0.0011],
          [ 0.1696,  0.1449, -0.1039,  ...,  0.1604,  0.2172,  0.0310],
          ...,
          [-0.0617,  0.1968, -0.0669,  ...,  0.1126,  0.1933, -0.0204],
          [-0.2835,  0.1495, -0.0021,  ...,  0.0973,  0.1865, -0.0636],
          [ 0.2575,  0.1120, -0.1008,  ...,  0.0175,  0.1508,  0.0878]]],
        device='cuda:0', grad_fn=<ViewBackward0>),)

In [155]:
# Print the module structure of the first encoder layer.
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)