In [169]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

# 考虑source sentence和target sentence，构建序列，序列的字符以其在词表中的索引的形式表示
batch_size=2

# 单词表大小
max_src_num=8
max_tgt_num=8
model_dim=8

# 序列最大长度
max_src_len=5
max_tgt_len=5

# 构造src与tgt尺寸不同的张量，通过后续的padding操作对齐
src_len=torch.Tensor([2,4]).to(torch.int32)
tgt_len=torch.Tensor([4,3]).to(torch.int32)
print(src_len)

# 单词索引构成的句子
src_seq=[torch.randint(1,max_src_num,(L,)) for L in src_len]
tgt_seq=[torch.randint(1,max_tgt_num,(L,)) for L in tgt_len]
print(src_seq)
# 使用F.pad进行填充
src_seq=[F.pad(torch.randint(1,max_src_num,(L,)),(0,max_src_len-L) )for L in src_len]
# 输出：[tensor([6, 5, 0, 0, 0]), tensor([1, 6, 3, 7, 0])]
src_seq=[torch.unsqueeze(F.pad(torch.randint(1,max_src_num,(L,)),(0,max_src_len-L)),0)\
                   for L in src_len]

print(src_seq)


tensor([2, 4], dtype=torch.int32)
[tensor([7, 5]), tensor([4, 7, 4, 6])]
[tensor([[6, 2, 0, 0, 0]]), tensor([[5, 7, 1, 5, 0]])]


In [170]:
# 将其变为二维张量，unsqueese将一维张量（5）变为二维（1,5）
src_seq=torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_src_num,(L,)),(0,max_src_len-L)),0)\
                   for L in src_len],0)
print(src_seq)


tensor([[7, 1, 0, 0, 0],
        [3, 4, 5, 5, 0]])


In [171]:
# 同理，将target也转为二维张量
tgt_seq=torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_tgt_num,(L,)),(0,max_tgt_len-L)),0)\
                   for L in tgt_len])
print(tgt_seq)

tensor([[6, 6, 5, 3, 0],
        [7, 4, 1, 0, 0]])


## 上述操作利用单词索引构造了源句子和目标句子，并且做了padding，填充值默认为0
## 下面进行embedding构造

In [172]:
# 构造embedding
model_dim=16
src_embedding_table=nn.Embedding(max_src_num,model_dim)# 调用nn.Embedding中forward得到权重
tgt_embedding_table=nn.Embedding(max_tgt_num,model_dim)
src_embedding=src_embedding_table(src_seq)
tgt_embedding=tgt_embedding_table(tgt_seq)

print(src_embedding_table.weight)# 得到一个table，是src词表的权重
print(src_seq)# src词表的索引
print(src_embedding)# 按照索引取出的权重
print(src_embedding.shape)# shape中前两维不变，最后一维由原来的标量变为一维张量，所以维数变为三维


Parameter containing:
tensor([[ 0.6528,  2.4097,  0.9570,  0.3858, -0.3041,  0.6134,  0.6142,  1.3916,
          0.6576,  0.9314, -1.1369,  0.6160,  1.0785,  0.8872, -0.9456, -2.9881],
        [-0.5512,  0.4620, -1.7225, -1.8100, -1.2250, -0.1286, -0.4998, -0.3149,
         -1.9457,  0.2144, -1.7817,  1.1369, -0.2190,  0.1554, -0.7529,  0.6534],
        [ 1.0048, -0.7240, -0.1027, -0.9836, -0.4643, -0.2430,  0.3013,  0.6516,
         -0.2913, -0.2158, -0.3863, -0.7110,  1.6927,  0.1301,  0.8376,  0.1422],
        [ 1.0934, -1.4516, -0.7795,  0.7219, -1.3705,  0.0106,  0.0246,  0.4021,
          1.8732,  0.0562,  3.2037, -0.0233,  0.1952,  1.4447, -0.4063,  0.4728],
        [-2.7953, -1.7910, -1.3457, -0.9195,  1.1181,  1.0691, -2.6980, -1.5794,
         -0.3142,  1.2090,  1.8347, -0.1617, -0.4771,  0.2918, -2.0525, -0.3232],
        [ 1.0037, -0.1365, -0.0463, -0.0243,  1.1619,  0.0221,  1.7008, -2.1778,
          0.1995,  0.7878, -1.0047,  1.5440,  0.8991,  1.6637, -1.3616,  0.1457],


### 构造PositionEmbedding，其公式如下
$$
PE_{(pos,2i)}=\sin\frac {pos}{10000^{2i/d_{model}}}\\
PE_{(pos,2i+1)}=\cos\frac {pos}{10000^{2i/d_{model}}}
$$

In [173]:
pos_mat=torch.arange(max_src_len).reshape((-1,1))
i_mat=torch.pow(10000,torch.arange(0,model_dim,2).reshape((1,-1))/model_dim)

print(pos_mat)
print(i_mat)




tensor([[0],
        [1],
        [2],
        [3],
        [4]])
tensor([[1.0000e+00, 3.1623e+00, 1.0000e+01, 3.1623e+01, 1.0000e+02, 3.1623e+02,
         1.0000e+03, 3.1623e+03]])


In [174]:
# 构建position embedding
pe_embedding_table=torch.zeros(max_src_len,model_dim)
pe_embedding_table[:,0::2]=torch.sin(pos_mat/i_mat)
pe_embedding_table[:,1::2]=torch.cos(pos_mat/i_mat)
print(pe_embedding_table)
print(pe_embedding_table.shape)

tensor([[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          1.0000e+00],
        [ 8.4147e-01,  5.4030e-01,  3.1098e-01,  9.5042e-01,  9.9833e-02,
          9.9500e-01,  3.1618e-02,  9.9950e-01,  9.9998e-03,  9.9995e-01,
          3.1623e-03,  9.9999e-01,  1.0000e-03,  1.0000e+00,  3.1623e-04,
          1.0000e+00],
        [ 9.0930e-01, -4.1615e-01,  5.9113e-01,  8.0658e-01,  1.9867e-01,
          9.8007e-01,  6.3203e-02,  9.9800e-01,  1.9999e-02,  9.9980e-01,
          6.3245e-03,  9.9998e-01,  2.0000e-03,  1.0000e+00,  6.3246e-04,
          1.0000e+00],
        [ 1.4112e-01, -9.8999e-01,  8.1265e-01,  5.8275e-01,  2.9552e-01,
          9.5534e-01,  9.4726e-02,  9.9550e-01,  2.9995e-02,  9.9955e-01,
          9.4867e-03,  9.9995e-01,  3.0000e-03,  1.0000e+00,  9.4868e-04,
          1.0000e+00],
        [-7.5680e-01

In [175]:
pe_embedding=nn.Embedding(max_src_len,model_dim)
pe_embedding.weight=nn.Parameter(pe_embedding_table,requires_grad=False)


# src_pe_embedding=pe_embedding(src_seq)# 错误语句，不能传输入索引，而应该传也就是序列的pos
src_pos=torch.cat([torch.unsqueeze(torch.arange((max_src_len)),0) for _ in src_len]).to(torch.long)
tgt_pos=torch.cat([torch.unsqueeze(torch.arange((max_tgt_len)),0) for _ in tgt_len]).to(torch.long)


# 得到position embedding
src_pe_embedding=pe_embedding(src_pos)

tgt_pe_embedding=pe_embedding(tgt_pos)
print(src_pe_embedding.shape)
print(tgt_pe_embedding)

word_embedding=src_pe_embedding+src_embedding
print(word_embedding.shape)

torch.Size([2, 5, 16])
tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,
           0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  3.1098e-01,  9.5042e-01,  9.9833e-02,
           9.9500e-01,  3.1618e-02,  9.9950e-01,  9.9998e-03,  9.9995e-01,
           3.1623e-03,  9.9999e-01,  1.0000e-03,  1.0000e+00,  3.1623e-04,
           1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  5.9113e-01,  8.0658e-01,  1.9867e-01,
           9.8007e-01,  6.3203e-02,  9.9800e-01,  1.9999e-02,  9.9980e-01,
           6.3245e-03,  9.9998e-01,  2.0000e-03,  1.0000e+00,  6.3246e-04,
           1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  8.1265e-01,  5.8275e-01,  2.9552e-01,
           9.5534e-01,  9.4726e-02,  9.9550e-01,  2.9995e-02,  9.9955e-01,
           9.4867e-03,  9.9995e-01,  3.0000e-03,  1.0000e+00,  9.4868e-04,
     

### Softmax演示，scale的重要性
在attention论文中作者使用scale dot-product attention对$QK^{T}$进行缩放，其主要目的就是为了将其方差固定在1，防止过大的方差导致的权重不平衡（大的越大，小的越小，像下面例子中所演示的）

In [176]:
score=torch.randn(5)
prob=F.softmax(score,0)
print(score)
print(prob)

tensor([ 1.0004, -0.2448, -1.9298,  1.0693,  0.6463])
tensor([0.3211, 0.0924, 0.0171, 0.3440, 0.2253])


In [177]:
# score的缩放在softmax上并不是线性的，而是大的越大小的越小

alpha1,alpha2=0.1,10
prob1,prob2=F.softmax(score*alpha1,0),F.softmax(score*alpha2,-1)
print(prob1,prob2)

tensor([0.2173, 0.1919, 0.1621, 0.2188, 0.2098]) tensor([3.3120e-01, 1.2951e-06, 6.2244e-14, 6.5920e-01, 9.5964e-03])


In [178]:
def softmax_func(score):
    return F.softmax(score)
jaco_mat1=torch.autograd.functional.jacobian(softmax_func,score*alpha1)
jaco_mat2=torch.autograd.functional.jacobian(softmax_func,score*alpha2)
print(jaco_mat1)
print(jaco_mat2)

tensor([[ 0.1701, -0.0417, -0.0352, -0.0476, -0.0456],
        [-0.0417,  0.1551, -0.0311, -0.0420, -0.0403],
        [-0.0352, -0.0311,  0.1358, -0.0355, -0.0340],
        [-0.0476, -0.0420, -0.0355,  0.1710, -0.0459],
        [-0.0456, -0.0403, -0.0340, -0.0459,  0.1658]])
tensor([[ 2.2151e-01, -4.2895e-07, -2.0616e-14, -2.1833e-01, -3.1784e-03],
        [-4.2895e-07,  1.2951e-06, -8.0615e-20, -8.5376e-07, -1.2429e-08],
        [-2.0616e-14, -8.0615e-20,  6.2244e-14, -4.1032e-14, -5.9732e-16],
        [-2.1833e-01, -8.5376e-07, -4.1032e-14,  2.2466e-01, -6.3260e-03],
        [-3.1784e-03, -1.2429e-08, -5.9732e-16, -6.3260e-03,  9.5043e-03]])


  


### 构造Encoder的self-attention mask

In [179]:
# mask_shape:[batch_size,max_src_len,max_src_len]
valid_encoder_pos=torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0,max_src_len-L)),0) for L in src_len]),2)
print(valid_encoder_pos)
print(valid_encoder_pos.shape)
# [tensor([1., 1., 0., 0., 0.]), tensor([1., 1., 1., 1., 0.])]
# 说明第一个句子有效位置为前两位，第二个句子有效位置为前四位


tensor([[[1.],
         [1.],
         [0.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [0.]]])
torch.Size([2, 5, 1])


In [180]:
valid_encoder_pos_mat=torch.bmm(valid_encoder_pos,valid_encoder_pos.transpose(1,2))
print(valid_encoder_pos_mat.shape)
print(src_len)
print(valid_encoder_pos_mat)

torch.Size([2, 5, 5])
tensor([2, 4], dtype=torch.int32)
tensor([[[1., 1., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 0.],
         [0., 0., 0., 0., 0.]]])


上述第一个样本矩阵的含义：src的句子长度为2，所以有效关联性的第一个单词和一二个单词，而跟三四单词的关联性为0

In [181]:
invalid_encoder_pos_mat=1-valid_encoder_pos_mat
print(invalid_encoder_pos_mat)# 无效矩阵，此时0代表有效1代表无效

tensor([[[0., 0., 1., 1., 1.],
         [0., 0., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 1.],
         [1., 1., 1., 1., 1.]]])


In [182]:
mask_encoder_self_attention=invalid_encoder_pos_mat.to(torch.bool)
print(mask_encoder_self_attention)# True代表此位置需要被mask掉，False表示此位置不能被mask掉 

tensor([[[False, False,  True,  True,  True],
         [False, False,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [False, False, False, False,  True],
         [ True,  True,  True,  True,  True]]])


In [183]:
score_demo=torch.randn(batch_size,max_src_len,max_src_len)

masked_score=score_demo.masked_fill(mask_encoder_self_attention,-np.inf)
p=F.softmax(masked_score,-1)

print(score_demo)
print(masked_score)
print(p)

tensor([[[-0.5316, -0.3219,  1.5766,  0.7516, -1.7724],
         [ 0.3198, -1.1277, -0.2506,  0.0061, -0.1857],
         [ 1.2191, -0.5813, -1.6802,  0.1388, -0.8864],
         [-2.3124, -0.5046, -0.5821,  0.6983, -0.1061],
         [-0.7098,  1.2676, -0.7041, -0.7373, -2.0713]],

        [[ 1.0207, -0.5792, -1.6169, -1.9377,  0.0970],
         [-0.0732,  0.1649, -1.2415,  0.8791, -0.7883],
         [ 0.8046, -0.2554,  0.9713,  0.6741,  0.5422],
         [-0.2889,  1.3921, -1.0500, -0.8623, -1.0440],
         [ 0.0082, -0.2510, -0.1469, -0.3241, -0.8230]]])
tensor([[[-0.5316, -0.3219,    -inf,    -inf,    -inf],
         [ 0.3198, -1.1277,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf]],

        [[ 1.0207, -0.5792, -1.6169, -1.9377,    -inf],
         [-0.0732,  0.1649, -1.2415,  0.8791,    -inf],
         [ 0.8046, -0.2554,  0.9713,  0.67

In [184]:
x = torch.tensor([1, 2, 3, 4])
y=torch.unsqueeze(x,0)
print(y)
print(x.shape,y.shape)

tensor([[1, 2, 3, 4]])
torch.Size([4]) torch.Size([1, 4])
