# Transformer难点理解与实现

### Transformer细节实现的难点
- word embedding
- position embedding
- mask
    - encoder self-attention mask
    - intra-attention mask
    - decoder self-attention mask
- multi-head self-attention

In [180]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F


## 1. word embedding
在实际任务中，怎么构建word embedding？

In [181]:
# 以序列建模为例,考虑source sequence和target sequence 

# 构建序列，序列的字符以词表的索引的形式

# 设置随机种子
torch.manual_seed(3)


batch_size = 2
max_num_src_words= max_num_tgt_words = 8 # 原序列和目标序列的单词数最大都是8

max_src_seq_len = max_tgt_seq_len = 5 # 最大序列长度

src_len = torch.randint(2,5,(batch_size,)).to(dtype=torch.int32) # 随机生成整形，第一个位置是最小值，第二个位置是最大值，第三个是tuple
tgt_len = torch.randint(2,5,(batch_size,)).to(dtype=torch.int32) 

print(f"src_len:\n{src_len}\n tgt_len:\n{tgt_len}")
# tensor([3, 4]) tensor([3, 2])
# 表示src有两个batch，第一个句子长度为3，第二个句子长度为4.
#  tgt有两个batch，第一个句子长度为3，第二个句子长度为2


# 根据长度生成原序列和目标序列，是单词索引构成的句子
src_seq = [torch.randint(1,max_num_src_words,(L,)) for L in src_len] # 单词的索引最小为1，最大为8，size就是src_len的长度
tgt_seq = [torch.randint(1,max_num_tgt_words,(L,)) for L in tgt_len]
print(f"src_seq:\n{src_seq}\n tgt_seq:\n{tgt_seq}")

#src_seq, tgt_seq
# ([tensor([1, 7, 3]), tensor([5, 6, 1, 7])],
#  [tensor([7, 1, 7]), tensor([7, 3])])
# 表示src第一个句子是1,7,3，第二个句子是5,6,1,7
# tgt第一个句子是7,1,7,第二个句子是7, 3

# 进行padding，pad到最大长度
# src_seq = [F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max_src_seq_len-L)) for L in src_len]
# tgt_seq = [F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max_tgt_seq_len-L)) for L in tgt_len]

src_seq_padded = [F.pad(seq,(0,max_src_seq_len-len(seq))) for seq in src_seq]
tgt_seq_padded = [F.pad(seq,(0,max_tgt_seq_len-len(seq))) for seq in tgt_seq]
print(f"src_seq_padded:\n{src_seq_padded} \ntgt_seq_padded:\n{tgt_seq_padded}")
# [tensor([1, 7, 3, 0, 0]), tensor([5, 6, 1, 7, 0])]  包含两个一维张量（1D tensors）的列表。每个张量都有5个元素，所以每个张量的维度是 [5]


# 把src_seq和tgt_seq从列表变成二维Tensor：(batch_size, max_seq_len)
# unsqueez变成二维，加一个0维，用torch.cat 在第0维cat起来

src_seq_tensor = [torch.unsqueeze(torch.tensor(seq),0) for seq in src_seq_padded]
#[tensor([[1, 7, 3, 0, 0]]), tensor([[5, 6, 1, 7, 0]])] 包含两个二维张量（2D tensors）的列表。每个张量的形状（dimension）是 [1, 5]
src_seq_cat = torch.cat(src_seq_tensor, dim=0)            
# tensor([[1, 7, 3, 0, 0],[5, 6, 1, 7, 0]]) 二维张量

tgt_seq_tensor = [torch.unsqueeze(torch.tensor(seq),0) for seq in tgt_seq_padded]
tgt_seq_cat = torch.cat(tgt_seq_tensor, dim=0) 

print(f"src_seq_cat:\n{src_seq_cat} \n tgt_seq_cat:\n{tgt_seq_cat}")


# 构造embedding
# 通过torch.nn.Embedding来构造embedding的表，根据索引就可以从这个表中取某一行
# 参数：num_embeddings, 单词表大小，注意这里是 max_num_src_words+ 1，因为0是pad的索引，前面 有padding填充的是0
#  model_dim，模型的特征大小，通常是512
model_dim = 8
src_embedding_table = nn.Embedding(max_num_src_words + 1 ,model_dim) # 随机初始化
tgt_embedding_table = nn.Embedding(max_num_tgt_words + 1 ,model_dim)
print(src_embedding_table.weight)
# 根据词的索引来查表得到embedding,传单词的索引
src_embedding = src_embedding_table(src_seq_cat)
tgt_embedding = tgt_embedding_table(tgt_seq_cat)

print(src_embedding)

src_len:
tensor([3, 4], dtype=torch.int32)
 tgt_len:
tensor([3, 2], dtype=torch.int32)
src_seq:
[tensor([1, 7, 3]), tensor([5, 6, 1, 7])]
 tgt_seq:
[tensor([7, 1, 7]), tensor([7, 3])]
src_seq_padded:
[tensor([1, 7, 3, 0, 0]), tensor([5, 6, 1, 7, 0])] 
tgt_seq_padded:
[tensor([7, 1, 7, 0, 0]), tensor([7, 3, 0, 0, 0])]
src_seq_cat:
tensor([[1, 7, 3, 0, 0],
        [5, 6, 1, 7, 0]]) 
 tgt_seq_cat:
tensor([[7, 1, 7, 0, 0],
        [7, 3, 0, 0, 0]])
Parameter containing:
tensor([[ 0.2615,  0.9311, -0.5145, -1.6517,  1.0460,  0.5222, -0.1668,  0.0530],
        [ 0.5638,  2.2566,  1.8693, -1.1952,  0.9979,  0.4592,  2.4364, -0.1468],
        [-0.4760, -0.2929, -0.3481,  0.3487,  0.0371, -0.0677,  0.4290, -0.8681],
        [-0.2712,  0.1416,  0.1295,  0.6814, -0.9583,  0.0639,  0.6589,  0.8195],
        [-0.4554,  2.2124, -0.3770, -0.1437,  0.6480, -2.3256,  1.2683, -0.2483],
        [ 0.9578, -1.2890, -1.6483,  0.8290, -0.8373, -0.5296,  1.3544,  1.3778],
        [-0.0752, -0.4233,  0.4217, -

  src_seq_tensor = [torch.unsqueeze(torch.tensor(seq),0) for seq in src_seq_padded]
  tgt_seq_tensor = [torch.unsqueeze(torch.tensor(seq),0) for seq in tgt_seq_padded]


## 2. position embedding
Transformer没有局部性假设，也没有有序性假设，所以需要一个位置信息。

$$ PE(pos,2i)=sin(pos/10000^{2i/d_{model}}) $$
$$ PE(pos,2i+1)=cos(pos/10000^{2i/d_{model}}) $$

PE是一个二维矩阵，行数是训练的序列的最大长度。每一列是d_model。
pos决定了行，i决定了列。要构造两个矩阵pos和i，pos的每一行都是一样的，i的每一列都是一样的。用矩阵相乘的方式来构造PE。


In [182]:
# position的最大长度
max_position_len = 5 
pos_mat = torch.arange(max_position_len).reshape((-1,1))
i_mat = torch.pow(10000,torch.arange(0,max_num_src_words,2).reshape((1,-1))/model_dim)

pos_mat,i_mat


(tensor([[0],
         [1],
         [2],
         [3],
         [4]]),
 tensor([[   1.,   10.,  100., 1000.]]))

In [183]:
# 定义pe_embedding_table，对奇偶数列进行赋值
pe_embedding_table = torch.zeros(max_position_len,model_dim)
pe_embedding_table[:,0::2]= torch.sin(pos_mat / i_mat)
pe_embedding_table[:,1::2]= torch.cos(pos_mat / i_mat)

pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad=False)
print(pe_embedding.weight)

# 根据词的位置的索引来查表得到embedding,位置索引就是0,1,2,3,4，这里千万不要传入word_index
src_pos = torch.Tensor(torch.arange(max(src_len))).to(torch.int32)
tgt_pos = torch.Tensor(torch.arange(max(tgt_len))).to(torch.int32)
print(f"src_pos 为{src_pos}")
src_pe_embedding = src_embedding_table(src_pos)
tgt_pe_embedding = tgt_embedding_table(tgt_pos)
print(src_pe_embedding)
print(tgt_pe_embedding)


Parameter containing:
tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
          9.9995e-01,  1.0000e-03,  1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
          9.9980e-01,  2.0000e-03,  1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
          9.9955e-01,  3.0000e-03,  1.0000e+00],
        [-7.5680e-01, -6.5364e-01,  3.8942e-01,  9.2106e-01,  3.9989e-02,
          9.9920e-01,  4.0000e-03,  9.9999e-01]])
src_pos 为tensor([0, 1, 2, 3], dtype=torch.int32)
tensor([[ 0.2615,  0.9311, -0.5145, -1.6517,  1.0460,  0.5222, -0.1668,  0.0530],
        [ 0.5638,  2.2566,  1.8693, -1.1952,  0.9979,  0.4592,  2.4364, -0.1468],
        [-0.4760, -0.2929, -0.3481,  0.3487,  0.0371, -0.0677,  0.4290, -0.8681],
        [-0.2712,  0.1416,  0.1295,  0.6814, -0.9583,  0.0639,  0.6

## 3. mask
mask的目的是为了让模型高效地训练好。一次训练会用多个样本，也就是mini-batch训练，由于序列长度不一样，就需要mask来保证得到的表征是有效的表征，也就是说不希望表征中有padding的那些符号的表征。

$$Attention = \frac{QK^T}{\sqrt{d_k}}V$$

In [184]:
# Scale的重要性演示，除以sqrt (dk)可以让分布不那么尖锐，方差更小
# softmax, 下面的例子prob2的分布非常不平衡，prob1分布均匀
score = torch.randn(5)
print(score)
alpha1 = 0.1
alpha2= 10

prob1 = F.softmax(score*alpha1, -1)
prob2 = F.softmax(score*alpha2, -1)
prob1,prob2
#(tensor([0.2232, 0.1983, 0.1800, 0.2066, 0.1919]),
# tensor([9.9954e-01, 7.3128e-06, 4.5446e-10, 4.5167e-04, 2.7027e-07]))#

# Jakobian 矩阵（Jacobian 矩阵用于计算损失函数相对于网络参数的梯度）,下面例子中jacob_mat2接近0

def softmax_func(score):
    return F.softmax(score)

jacob_mat1 = torch.autograd.functional.jacobian(softmax_func,score*alpha1)
jacob_mat2 = torch.autograd.functional.jacobian(softmax_func,score*alpha2)
jacob_mat1,jacob_mat2

tensor([ 0.0051, -1.6722,  0.8699, -2.2548, -0.4953])


  return F.softmax(score)


(tensor([[ 0.1679, -0.0385, -0.0497, -0.0363, -0.0433],
         [-0.0385,  0.1479, -0.0420, -0.0307, -0.0366],
         [-0.0497, -0.0420,  0.1786, -0.0396, -0.0473],
         [-0.0363, -0.0307, -0.0396,  0.1413, -0.0346],
         [-0.0433, -0.0366, -0.0473, -0.0346,  0.1618]]),
 tensor([[ 1.7549e-04, -1.5994e-15, -1.7549e-04, -4.7188e-18, -2.0666e-10],
         [-1.5994e-15,  9.1121e-12, -9.1105e-12, -2.4498e-25, -1.0729e-17],
         [-1.7549e-04, -9.1105e-12,  1.7670e-04, -2.6880e-14, -1.1772e-06],
         [-4.7188e-18, -2.4498e-25, -2.6880e-14,  2.6885e-14, -3.1654e-20],
         [-2.0666e-10, -1.0729e-17, -1.1772e-06, -3.1654e-20,  1.1774e-06]]))

### 3.1 encoder self-attention mask
序列自身对自身的关联性的计算，不涉及到因果，一次性输入src。

mask放在softmax里面，既被mask的值希望是负无穷，那么softmax的概率接近于0，单词之间的关联性为0。
mask矩阵元素值为1或者负无穷，shape为(batch_size,max_src_len, max_src_len) 

In [185]:
# 构造有效编码器的位置，valid_encoder_pos的shape是(batch,1,T)
valid_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0, max_src_seq_len-L)),0) for L in src_len ]), 2)
valid_encoder_pos_matrix = torch.bmm(valid_encoder_pos, valid_encoder_pos.transpose(1,2))
# src_seq的第一个的有效位置是前三个，第二个的有效位置是前四个
print(valid_encoder_pos_matrix.shape)
print(f"valid_encoder_pos_matrix:{valid_encoder_pos_matrix}")
invalid_encoder_pos_matrix = 1- valid_encoder_pos_matrix
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)
score = torch.randn(batch_size, max_src_seq_len,max_src_seq_len)
# 对score进行mask
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)

torch.Size([2, 5, 5])
valid_encoder_pos_matrix:tensor([[[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [0., 0., 0., 0., 0.]]])


In [186]:
print(f"score:{score}")
print(f"masked_score:{masked_score}")
print(f"prob:{prob}")

score:tensor([[[-0.6846,  0.0840, -0.9063, -1.4348,  0.6481],
         [-0.2415, -0.0999,  1.0256, -0.1243, -0.6120],
         [-1.2204,  1.6596,  0.8491,  0.0144,  0.9899],
         [ 0.7697, -0.1433,  0.2896, -1.2278,  0.7073],
         [ 0.5884,  0.1431, -0.9165, -0.4507, -0.6832]],

        [[ 0.0307,  0.3432, -0.6461, -0.3701, -1.3320],
         [ 0.6362, -0.3848, -0.2995, -0.2915, -1.0610],
         [ 0.5787,  1.4887, -1.2425,  0.4335,  0.8034],
         [ 0.8848, -0.5493, -0.6365, -1.6809, -0.6679],
         [ 0.4959,  0.3084,  0.7170, -0.6831,  1.9426]]])
masked_score:tensor([[[-6.8457e-01,  8.3978e-02, -9.0626e-01, -1.0000e+09, -1.0000e+09],
         [-2.4148e-01, -9.9878e-02,  1.0256e+00, -1.0000e+09, -1.0000e+09],
         [-1.2204e+00,  1.6596e+00,  8.4913e-01, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[ 3.0720e-02,  3.4323e-01,

### 3.2 intra-attention mask
序列decoder seq对decoder seq的关联性的计算，涉及到因果，需要加mask。  
Q @ K^T shape:[batch_size, tgt_seq_len, src_seq_len]

In [187]:

# 构造有效解码器的位置
valid_decoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0, max_tgt_seq_len-L)),0) for L in tgt_len ]), 2)
# 目标序列对原序列的交叉矩阵
valid_cross_pos_matrix = torch.bmm(valid_decoder_pos, valid_encoder_pos.transpose(1,2))
print(f"valid_decoder_pos:{valid_decoder_pos}")
print(f"valid_encoder_pos:{valid_encoder_pos}")
invalid_cross_pos_matrix = 1- valid_cross_pos_matrix
print(f"invalid_cross_pos_matrix:{invalid_cross_pos_matrix}")

valid_decoder_pos:tensor([[[1.],
         [1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [0.],
         [0.],
         [0.]]])
valid_encoder_pos:tensor([[[1.],
         [1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [0.]]])
invalid_cross_pos_matrix:tensor([[[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])


In [188]:
mask_decoder_cross_attention = invalid_cross_pos_matrix.to(torch.bool)
score = torch.randn(batch_size, max_src_seq_len,max_src_seq_len)
# 对score进行mask
masked_score = score.masked_fill(mask_decoder_cross_attention, -1e9)
prob = F.softmax(masked_score, -1)
prob

tensor([[[0.2028, 0.3452, 0.4520, 0.0000, 0.0000],
         [0.3709, 0.3258, 0.3033, 0.0000, 0.0000],
         [0.6650, 0.1965, 0.1385, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],

        [[0.2563, 0.4074, 0.2614, 0.0749, 0.0000],
         [0.2543, 0.3565, 0.2327, 0.1565, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]]])

### 3.3 decoder self-attention mask
计算encoder的输出和mask-mha的输出的关联性
这里是自回归，每个单词都在上一个单词预测的基础上。
mask是一个三角形的矩阵

In [189]:
# 对每个序列构建下三角，tgt第一个序列3*3，第二个序列2*2
tri_mat = [torch.tril(torch.ones(L,L)) for L in tgt_len]
print(tri_mat)

# 构建有效decoder tri matrix，进行padding, concat
valid_decoder_tri_matrix = torch.cat([torch.unsqueeze(F.pad(torch.tril(torch.ones((L,L))),
                            (0,max_tgt_seq_len-L,0,max_tgt_seq_len-L)),0) for L in tgt_len])


print(valid_decoder_tri_matrix)  
print(valid_decoder_tri_matrix.shape)


[tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]]), tensor([[1., 0.],
        [1., 1.]])]
tensor([[[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]])
torch.Size([2, 5, 5])


In [190]:
invalid_decoder_tri_matrix = 1- valid_decoder_tri_matrix
invalid_decoder_tri_matrix = invalid_decoder_tri_matrix.to(torch.bool)
print(invalid_decoder_tri_matrix)
score = torch.randn(batch_size,max_tgt_seq_len,max_tgt_seq_len)
masked_score = score.masked_fill(invalid_decoder_tri_matrix, -1e9)
prob = F.softmax(masked_score, -1)
prob

tensor([[[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [False, False, False,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[False,  True,  True,  True,  True],
         [False, False,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]]])


tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3884, 0.6116, 0.0000, 0.0000, 0.0000],
         [0.1540, 0.5055, 0.3405, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.8679, 0.1321, 0.0000, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]]])

## 4. multi-head attention

multi-head可以理解为batch，样本与样本之间独立，然后汇总去计算。

In [191]:
# scaled self attention
# Q和K都是(batch_sieze*num_head, seq+len, model_dim/num_head)
def scaled_dot_product_attention(Q,K,V,attn_mask):
    score = torch.bmm(Q,K.transpose(-2,-1))/torch.sqrt(model_dim)
    masked_score = score.masked_fill(attn_mask, -1e9)
    prob = F.softmax(masked_score, -1)
    context = torch.bmm(prob,V)
    return context


## 5. Masked loss计算

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(2,3,4)
# batchsize=2, seqlen=3, vocab_size =4
label = torch.randint(0,4,(2,3))

