In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# word embedding（序列模型）
# source sentence 和 target sentence

# 构建序列，以词表中的索引表示序列的字符
batch_size = 2
# 单词表的大小
max_num_src_words = 8
max_num_tgt_words = 8

# embedding后的维度，原论文中的512
model_dim = 8

# 序列最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5 
max_position_len = 5
 
# src_len = torch.randint(2, 5, (batch_size,))
# tgt_len = torch.randint(2, 5, (batch_size,))
src_len = torch.Tensor([2, 4]).to(torch.int32)
tgt_len = torch.Tensor([4, 3]).to(torch.int32)


# 以单词索引构成句子
src_seq = [torch.randint(1, max_num_src_words, (L,)) for L in src_len]
# 构建batch，seq需要padding成同样的长度，默认补0
src_seq = [F.pad(seq, (0, max(src_len)-len(seq))) for seq in src_seq]
# 将每个seq变成二维，以便拼接成二位张量batch输入
src_seq = [torch.unsqueeze(seq, 0) for seq in src_seq]
src_seq = torch.cat(src_seq)

# tgt数据同样操作，这里简写
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max(tgt_len)-L)), 0) for L in tgt_len])

# 构造embedding
# nn.Embedding用于初始化一个weight table，实际句子里的word根据自己的索引去取自己的weight
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)

src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

print(src_embedding_table.weight)
print(src_seq)
print(src_embedding)


Parameter containing:
tensor([[-0.3249,  1.4673, -0.5678,  0.4758, -0.1507, -0.7540,  1.1388, -0.1609],
        [ 0.1754,  0.2556,  1.3769,  1.4690, -0.3868,  0.2991,  1.0700, -1.7144],
        [-0.2577, -0.0318,  0.3799, -1.4091, -1.4868,  1.7403, -0.2968,  2.6475],
        [ 0.2403, -0.2809, -0.1320,  1.0894,  0.3982, -0.0417, -0.1605,  1.3249],
        [-1.5856,  0.0556, -1.1777,  1.2505,  1.0750,  0.8235,  1.6107, -1.5065],
        [ 0.1568, -0.1742,  0.2178, -1.0936, -0.5422, -2.2246,  0.1710, -1.3192],
        [ 0.5250,  0.1284,  1.4986,  0.7969, -0.0632,  0.5656, -1.4087, -0.3530],
        [ 0.1960,  0.1431, -1.0588, -1.2810, -1.4329,  0.5558,  1.8252,  0.8264],
        [-0.7362, -0.6730,  1.0818,  1.5130,  1.8261, -0.6710,  0.9524,  1.7513]],
       requires_grad=True)
tensor([[3, 7, 0, 0],
        [1, 4, 6, 7]])
tensor([[[ 0.2403, -0.2809, -0.1320,  1.0894,  0.3982, -0.0417, -0.1605,
           1.3249],
         [ 0.1960,  0.1431, -1.0588, -1.2810, -1.4329,  0.5558,  1.8252,
 

In [3]:
# position embedding

# 表示pos，从0到model_dim
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
# 表示10000^(2i/d_model)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)

pe_embedding_table = torch.zeros(max_position_len, model_dim)

# pos_mat会自动广播
pe_embedding_table[:, 0::2] = torch.sin(pos_mat/i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat/i_mat)
# print(pe_embedding_table)

# 使用pe_embedding_table构造nn.Embedding，借用其forward方法获取word对应的pe
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad = False)

# 传入pe_embedding的是word在句子中的位置索引
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len]).to(torch.int32)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

print(src_pe_embedding)
print(tgt_pe_embedding)


tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]

In [4]:
# softmax demo, why Scaled?
alpha1 = 0.1
alpha2 = 10
score = torch.randn(5)
prob1 = F.softmax(score*alpha1, -1)
prob2 = F.softmax(score*alpha2, -1)
prob1, prob2


(tensor([0.1886, 0.1609, 0.2342, 0.1948, 0.2215]),
 tensor([3.8058e-10, 4.8409e-17, 9.9631e-01, 9.8580e-09, 3.6889e-03]))

In [5]:
# encoder self-attention mask
# mask shape: [batch_size, max_src_len, max_src_len], 值为1或-inf

# 有效位置矩阵,有word为1,padding的为0
vaild_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) for L in src_len]), 2)

# 带批信息的两个矩阵相乘
# 自己乘自己的转置，可以得到每个位置和别的位置的邻接关系矩阵
vaild_encoder_pos_maxrix = torch.bmm(vaild_encoder_pos, vaild_encoder_pos.transpose(1, 2))
invaild_encoder_pos_matrix = 1 - vaild_encoder_pos_maxrix

# Ture表示需要对这个位置进行mask（padding的没有内容）
mask_encoder_self_attention = invaild_encoder_pos_matrix.to(torch.bool)

# demo
score = torch.randn(batch_size, max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)
print(src_len)
print(score)
print(masked_score)
print(prob)


tensor([2, 4], dtype=torch.int32)
tensor([[[-0.4269,  0.7850,  1.6566, -1.0907],
         [ 0.9207, -0.3187,  1.4922,  2.3232],
         [ 1.6359,  0.7555,  0.7687, -0.7684],
         [-0.2459,  1.2063,  0.0726, -0.9182]],

        [[ 0.8725,  0.4440, -0.7807,  0.7333],
         [ 1.8880, -0.1158,  0.1617,  0.7443],
         [ 0.6607,  0.9006, -0.1254, -1.0446],
         [ 0.6347,  0.7117, -0.0414,  1.0391]]])
tensor([[[-4.2693e-01,  7.8505e-01, -1.0000e+09, -1.0000e+09],
         [ 9.2069e-01, -3.1871e-01, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[ 8.7246e-01,  4.4404e-01, -7.8066e-01,  7.3331e-01],
         [ 1.8880e+00, -1.1581e-01,  1.6172e-01,  7.4427e-01],
         [ 6.6065e-01,  9.0062e-01, -1.2541e-01, -1.0446e+00],
         [ 6.3471e-01,  7.1175e-01, -4.1411e-02,  1.0391e+00]]])
tensor([[[0.2294, 0.7706, 0.0000, 0.0000],
         [0.7755, 0.2245, 0.0000, 0

In [12]:
# intra-attention mask
# Q * K^T :[batch_size, tgt_seq_len, src_seq_len]
vaild_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) for L in src_len]), 2)
vaild_decoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(tgt_len)-L)), 0) for L in tgt_len]), 2)
# print(vaild_encoder_pos)
# print(vaild_decoder_pos)
vaild_cross_pos_matrix = torch.bmm(vaild_decoder_pos, vaild_encoder_pos.transpose(1, 2))
invaild_cross_pos_matrix = 1 - vaild_cross_pos_matrix
mask_cross_attention = invaild_cross_pos_matrix.to(torch.bool)
print(mask_cross_attention)



tensor([[[1.],
         [1.],
         [0.],
         [0.]],

        [[1.],
         [1.],
         [1.],
         [1.]]])
tensor([[[1.],
         [1.],
         [1.],
         [1.]],

        [[1.],
         [1.],
         [1.],
         [0.]]])
tensor([[[False, False,  True,  True],
         [False, False,  True,  True],
         [False, False,  True,  True],
         [False, False,  True,  True]],

        [[False, False, False, False],
         [False, False, False, False],
         [False, False, False, False],
         [ True,  True,  True,  True]]])
