# word embedding

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
batch_size = 2

#确定词表大小
max_num_src_words = 8
max_num_tgt_words = 8
#确定特征大小
model_dim = 8
#确定序列最大长度
max_src_len = 5
max_tgt_len = 5

#确定序列长度
src_len = torch.Tensor([2,4]).to(torch.int32)
tgt_len = torch.Tensor([3,5]).to(torch.int32)

#确定序列，并padding
src_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_src_words,(L,)),(0,max_src_len-L)),0)\
            for L in src_len])
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1,max_num_tgt_words,(L,)),(0,max_tgt_len-L)),0)\
            for L in tgt_len])

#构造embedding  torch.nn.embedding,第0行weight要让给padding
src_embedding_table = nn.Embedding(max_num_src_words+1,\
                                   model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1,\
                                  model_dim)
#调用Embedding class的forward方法（实例后加括号）
src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

print(src_seq)
print(src_embedding.shape) #三维张量

tensor([[4, 6, 0, 0, 0],
        [1, 4, 7, 4, 0]])
torch.Size([2, 5, 8])


# Position Embedding

相当于一个大小为（max_position_len,model_dim）的矩阵,与每个句子的word_embedding相加

In [7]:
max_position_len = 5

#表示括号里的东西
 #reshape(-1,1)这里-1是指未设定行数，程序随机分配
pos_mat = torch.arange(max_position_len).reshape(-1,1)#（5，1）
i_mat = torch.pow(10000,torch.arange(0,\
        model_dim,2).reshape(1,-1)/model_dim).reshape(1,-1) #（1,4）
#计算PE

pe_embedding_table = torch.zeros((max_position_len,
                                  model_dim))
#偶数行
    #向量相除用到了广播机制
pe_embedding_table[:,0::2] = torch.sin(pos_mat/i_mat)
#奇数行
pe_embedding_table[:,1::2] = torch.cos(pos_mat/i_mat)

#实例化pe_embedding
pe_embedding = nn.Embedding(max_position_len,model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table,requires_grad=False)

#提取原始序列的position信息
#_只起到遍历作用，传进去的还是max(src_len)
# src_pos =torch.cat([torch.unsqueeze(torch.arange(max(src_len)),0) for _ in src_len]).to(torch.int32)
# tgt_pos =torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)),0) for _ in tgt_len]).to(torch.int32)
src_pos = torch.Tensor([[i for i in range(max_position_len)],[i for i in range(max_position_len)]]).to(torch.int32)
tgt_pos = torch.Tensor([[i for i in range(max_position_len)],[i for i in range(max_position_len)]]).to(torch.int32)
src_seq_embedding = pe_embedding(src_pos)
tgt_seq_embedding = pe_embedding(tgt_pos)
print(src_seq_embedding.shape)

torch.Size([2, 5, 8])


# Encoder Self-Attention Mask

原文中称为Scaled Dot-Product Attention:

$Attention(Q,K,V)=softmax(\frac{QK^T}{\sqrt{d_k}})V$

对于一批句子，如果Q大小为（2，5，8），Kt大小为（2，8，5），则它们想乘是一个（2，5，5）的张量

In [8]:
#scale的重要性

#概率层面
alpha1,alpha2 = 0.1,10
score = torch.randn(5)
prob1 = F.softmax(alpha1*score,-1)#-1代表维度
prob2 = F.softmax(alpha2*score,-1)
print(prob1,prob2)
#score乘以较大的数导致概率差别会很大（e-01代表0.1）

#雅可比层面（alpha2*score可能使梯度变得接近零，导致很难训练）
##记得补上##

tensor([0.1821, 0.1993, 0.2137, 0.2179, 0.1869]) tensor([1.4296e-08, 1.2058e-04, 1.2810e-01, 8.7178e-01, 1.9469e-07])


In [9]:
#构造Encoder Self-Attention Mask
#mask的shape,[batch_size,max(src_len),mac(src_len)],值为-或-inf
#src有效的position,padding都被记为0
valid_encoder_pos =torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0,max_src_len-L)),0) for L in src_len])
#再次扩维
valid_encoder_pos = torch.unsqueeze(valid_encoder_pos,2)
#bmm代表batch matrix multiply，得到有效邻接矩阵
valid_encoder_adjacency = torch.bmm(valid_encoder_pos\
                                   ,valid_encoder_pos.transpose(1,2))
#无效矩阵
invalid_encoder_pos_matrix = 1-valid_encoder_adjacency
#编码器自注意力mask矩阵
mask_encoder_self_attention = invalid_encoder_pos_matrix.to(torch.bool)


#初始化score
#(2,5,5)
score = torch.randn(batch_size,max_src_len,max_src_len)
print(score)
#padding部分不参与自注意力计算
#mask填充，需要一个bool类型的矩阵mask_encoder_self_attention
masked_score = score.masked_fill(mask_encoder_self_attention,-np.inf)
print(masked_score)
#进入mask得到注意力权重
prob = F.softmax(masked_score,-1)
print(prob)

tensor([[[-2.6170, -1.5600,  1.6287, -0.1512, -1.1252],
         [ 0.2792, -0.8092,  0.5152,  1.1749,  0.0832],
         [ 1.5149, -2.0074, -0.4100, -0.4935, -0.1545],
         [-0.3908, -0.7358, -0.2171,  0.3309,  0.3768],
         [-1.3045, -1.0203, -2.2219,  0.1250, -0.1415]],

        [[ 0.3674,  0.1926, -0.7320, -1.7360, -0.1688],
         [ 1.0316,  1.3850, -0.2825, -1.1772, -0.6705],
         [-1.2979,  0.9367, -0.6398, -0.3711, -0.3190],
         [-0.7012,  1.7016, -0.9971, -2.0819,  1.9150],
         [-0.5901,  0.6386,  1.7757,  1.4461,  0.6530]]])
tensor([[[-2.6170, -1.5600,    -inf,    -inf,    -inf],
         [ 0.2792, -0.8092,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf],
         [   -inf,    -inf,    -inf,    -inf,    -inf]],

        [[ 0.3674,  0.1926, -0.7320, -1.7360,    -inf],
         [ 1.0316,  1.3850, -0.2825, -1.1772,    -inf],
         [-1.2979,  0.9367, -0.6398, -0.37

# intra attention mask(cross Multi head Attention)

In [10]:
#计算Q*K^T shape(batch_size,tgt_seq_len,src_seq_len)
#Q shape(h,tgt_seq_len) K shape (h,src_seq_len)

#弄出valid_encoder_pos和valid_decoder_pos
valid_encoder_pos =torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0,max_src_len-L)),0) for L in src_len])
valid_encoder_pos = torch.unsqueeze(valid_encoder_pos,2)

valid_decoder_pos = torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0,max_tgt_len-L)),0) for L in tgt_len])
valid_decoder_pos = torch.unsqueeze(valid_decoder_pos,2)

valid_cross_pos_matrix = torch.bmm(valid_encoder_pos,valid_decoder_pos.transpose(1,2))
invalid_cross_pos_matrix = 1-valid_cross_pos_matrix
mask_cross_attention = invalid_encoder_pos_matrix.to(torch.bool)

print(valid_encoder_pos.shape)
print(valid_decoder_pos.shape)

print(valid_encoder_pos)
print(valid_decoder_pos)
print(valid_cross_pos_matrix)#仅同时没有padding时，才会为1

print(mask_cross_attention)

torch.Size([2, 5, 1])
torch.Size([2, 5, 1])
tensor([[[1.],
         [1.],
         [0.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [0.]]])
tensor([[[1.],
         [1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.],
         [1.]]])
tensor([[[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0.]]])
tensor([[[False, False,  True,  True,  True],
         [False, False,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True],
         [ True,  True,  True,  True,  True]],

        [[False, False, False, False,  True],
         [False, False, False, False,  True],
         [

In [11]:
score = torch.randn(batch_size,max_tgt_len,max_src_len)
masked_score = score.masked_fill(mask_cross_attention,-np.inf)
prob = F.softmax(masked_score,-1)
print(prob)

tensor([[[0.8148, 0.1852, 0.0000, 0.0000, 0.0000],
         [0.8557, 0.1443, 0.0000, 0.0000, 0.0000],
         [   nan,    nan,    nan,    nan,    nan],
         [   nan,    nan,    nan,    nan,    nan],
         [   nan,    nan,    nan,    nan,    nan]],

        [[0.1789, 0.2806, 0.0365, 0.5039, 0.0000],
         [0.1115, 0.6399, 0.0176, 0.2310, 0.0000],
         [0.1699, 0.2490, 0.3115, 0.2696, 0.0000],
         [0.0579, 0.1674, 0.0592, 0.7155, 0.0000],
         [   nan,    nan,    nan,    nan,    nan]]])


# Decoder self attention mask(自回归，强调因果律) 

In [12]:
#第一个是特殊字符

#在矩阵左右上下做padd，再unsqueeze,再cat
valid_decoder_tri_matrix = torch.cat([torch.unsqueeze(F.pad(torch.tril(torch.ones((L,L))),(0,max_tgt_len-L,0,max_tgt_len-L)),0)
              for L in tgt_len])

print(valid_decoder_tri_matrix.shape)
print(valid_decoder_tri_matrix)

invalid_decoder_tri_matrix = 1-valid_decoder_tri_matrix
invalid_decoder_tri_matrix = invalid_decoder_tri_matrix.to(torch.bool)

score = torch.randn(batch_size,max_tgt_len,max_tgt_len)

masked_score = score.masked_fill(invalid_decoder_tri_matrix,-1e9)

prob = F.softmax(masked_score,-1)

print(prob)

#构建scaled self-attention
def scaled_dot_product_attention(Q,K,V,attn_mask):
    score = torch.bmm(Q,K.transpose(-2,-1))/torch.sqrt(model_dim)
    masked_score = score.masked_fill(attn_mask,-1e9)
    prob = F.softmax(masked_score,-1)
    context = torch.bmm(prob,V)
    return context



torch.Size([2, 5, 5])
tensor([[[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1.]]])
tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1888, 0.8112, 0.0000, 0.0000, 0.0000],
         [0.1200, 0.1960, 0.6841, 0.0000, 0.0000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000],
         [0.2000, 0.2000, 0.2000, 0.2000, 0.2000]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.8265, 0.1735, 0.0000, 0.0000, 0.0000],
         [0.2251, 0.2024, 0.5725, 0.0000, 0.0000],
         [0.1525, 0.2636, 0.2060, 0.3778, 0.0000],
         [0.1115, 0.6125, 0.1186, 0.0419, 0.1155]]])


# loss_mask

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
#batchszie,seq_len,vocab_size
logits = torch.randn(2,3,4)
label = torch.randint(0,4,(2,3))
#交叉熵loss
logits = logits.transpose(1,2)
mean_loss = F.cross_entropy(logits,label)#六个单词平均交叉熵
loss = F.cross_entropy(logits,label,reduction='none')


#引入mask
max_tag_len = 3
tgt_len = torch.Tensor([2,3]).to(torch.int32)
mask = torch.cat([torch.unsqueeze(F.pad(torch.ones(L),(0,max_tag_len-L)),0) for L in tgt_len])
#loss mask
#element_wise_multiply
print(loss*mask)

#设置label为-100也能达到同样的loss mask效果，因为F.cross_entropy的参数ignore_index的存在
label[0,2] = -100
loss2 =  F.cross_entropy(logits,label,reduction='none')
print(loss2)

tensor([[1.4508, 1.7205, 0.0000],
        [1.0015, 3.0528, 1.2822]])
tensor([[1.4508, 1.7205, 0.0000],
        [1.0015, 3.0528, 1.2822]])
