In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# word embedding（序列模型）
# source sentence 和 target sentence

# 构建序列，以词表中的索引表示序列的字符
batch_size = 2
# 单词表的大小
max_num_src_words = 8
max_num_tgt_words = 8

# embedding后的维度，原论文中的512
model_dim = 8

# 序列最大长度
max_src_seq_len = 5
max_tgt_seq_len = 5 
max_position_len = 5
 
# src_len = torch.randint(2, 5, (batch_size,))
# tgt_len = torch.randint(2, 5, (batch_size,))
src_len = torch.Tensor([2, 4]).to(torch.int32)
tgt_len = torch.Tensor([4, 3]).to(torch.int32)


# 以单词索引构成句子
src_seq = [torch.randint(1, max_num_src_words, (L,)) for L in src_len]
# 构建batch，seq需要padding成同样的长度，默认补0
src_seq = [F.pad(seq, (0, max(src_len)-len(seq))) for seq in src_seq]
# 将每个seq变成二维，以便拼接成二位张量batch输入
src_seq = [torch.unsqueeze(seq, 0) for seq in src_seq]
src_seq = torch.cat(src_seq)

# tgt数据同样操作，这里简写
tgt_seq = torch.cat([torch.unsqueeze(F.pad(torch.randint(1, max_num_tgt_words, (L,)), (0, max(tgt_len)-L)), 0) for L in tgt_len])

# 构造embedding
# nn.Embedding用于初始化一个weight table，实际句子里的word根据自己的索引去取自己的weight
src_embedding_table = nn.Embedding(max_num_src_words+1, model_dim)
tgt_embedding_table = nn.Embedding(max_num_tgt_words+1, model_dim)

src_embedding = src_embedding_table(src_seq)
tgt_embedding = tgt_embedding_table(tgt_seq)

print(src_embedding_table.weight)
print(src_seq)
print(src_embedding)


Parameter containing:
tensor([[ 0.9000, -0.6256,  0.6726, -0.1816,  0.7150,  0.8555, -0.2532, -0.2423],
        [ 0.1547, -1.3035, -1.6971, -0.4087, -0.5984, -0.5903,  1.1582, -0.0030],
        [ 1.1436,  0.0747,  0.8126,  1.3493,  0.8050,  0.4210, -1.3321,  0.9543],
        [-0.0671, -1.5617,  1.0926, -1.0108, -0.6117,  0.2592, -1.2393,  1.3988],
        [-0.9268,  1.6869,  1.3008,  0.2514, -2.7583, -0.6485,  0.8413,  0.8847],
        [ 0.5082,  0.3490,  1.7147,  0.3676,  0.0841, -0.1915,  0.3898,  2.0124],
        [ 0.4554, -0.4219, -0.8746, -0.2956, -1.8032,  0.2988,  0.7808,  2.2952],
        [ 1.0937,  0.0850, -1.6161, -1.0750,  0.2122,  1.9693,  0.5850,  0.3212],
        [-1.2821,  0.3039,  0.7122, -0.4522, -1.5173, -0.0640,  0.0337, -2.5136]],
       requires_grad=True)
tensor([[6, 3, 0, 0],
        [1, 6, 3, 3]])
tensor([[[ 0.4554, -0.4219, -0.8746, -0.2956, -1.8032,  0.2988,  0.7808,
           2.2952],
         [-0.0671, -1.5617,  1.0926, -1.0108, -0.6117,  0.2592, -1.2393,
 

In [3]:
# position embedding

# 表示pos，从0到model_dim
pos_mat = torch.arange(max_position_len).reshape(-1, 1)
# 表示10000^(2i/d_model)
i_mat = torch.pow(10000, torch.arange(0, model_dim, 2).reshape(1, -1) / model_dim)

pe_embedding_table = torch.zeros(max_position_len, model_dim)

# pos_mat会自动广播
pe_embedding_table[:, 0::2] = torch.sin(pos_mat/i_mat)
pe_embedding_table[:, 1::2] = torch.cos(pos_mat/i_mat)
# print(pe_embedding_table)

# 使用pe_embedding_table构造nn.Embedding，借用其forward方法获取word对应的pe
pe_embedding = nn.Embedding(max_position_len, model_dim)
pe_embedding.weight = nn.Parameter(pe_embedding_table, requires_grad = False)

# 传入pe_embedding的是word在句子中的位置索引
src_pos = torch.cat([torch.unsqueeze(torch.arange(max(src_len)), 0) for _ in src_len]).to(torch.int32)
tgt_pos = torch.cat([torch.unsqueeze(torch.arange(max(tgt_len)), 0) for _ in tgt_len]).to(torch.int32)

src_pe_embedding = pe_embedding(src_pos)
tgt_pe_embedding = pe_embedding(tgt_pos)

print(src_pe_embedding)
print(tgt_pe_embedding)


tensor([[[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]],

        [[ 0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
           1.0000e+00,  0.0000e+00,  1.0000e+00],
         [ 8.4147e-01,  5.4030e-01,  9.9833e-02,  9.9500e-01,  9.9998e-03,
           9.9995e-01,  1.0000e-03,  1.0000e+00],
         [ 9.0930e-01, -4.1615e-01,  1.9867e-01,  9.8007e-01,  1.9999e-02,
           9.9980e-01,  2.0000e-03,  1.0000e+00],
         [ 1.4112e-01, -9.8999e-01,  2.9552e-01,  9.5534e-01,  2.9995e-02,
           9.9955e-01,  3.0000e-03,  1.0000e+00]

In [4]:
# softmax demo, why Scaled?
alpha1 = 0.1
alpha2 = 10
score = torch.randn(5)
prob1 = F.softmax(score*alpha1, -1)
prob2 = F.softmax(score*alpha2, -1)
prob1, prob2


(tensor([0.1897, 0.2126, 0.1605, 0.2052, 0.2321]),
 tensor([1.7565e-09, 1.5540e-04, 9.3864e-17, 4.4152e-06, 9.9984e-01]))

In [5]:
# encoder self-attention mask
# mask shape: [batch_size, max_src_len, max_src_len], 值为1或-inf

# 有效位置矩阵,有word为1,padding的为0
vaild_encoder_pos = torch.unsqueeze(torch.cat([torch.unsqueeze(F.pad(torch.ones(L), (0, max(src_len)-L)), 0) for L in src_len]), 2)

# 带批信息的两个矩阵相乘
# 自己乘自己的转置，可以得到每个位置和别的位置的邻接关系矩阵
vaild_encoder_pos_maxrix = torch.bmm(vaild_encoder_pos, vaild_encoder_pos.transpose(1, 2))
invaild_encoder_pos_matrix = 1 - vaild_encoder_pos_maxrix

# Ture表示需要对这个位置进行mask（padding的没有内容）
mask_encoder_self_attention = invaild_encoder_pos_matrix.to(torch.bool)

# demo
score = torch.randn(batch_size, max(src_len), max(src_len))
masked_score = score.masked_fill(mask_encoder_self_attention, -1e9)
prob = F.softmax(masked_score, -1)
print(src_len)
print(score)
print(masked_score)
print(prob)


tensor([2, 4], dtype=torch.int32)
tensor([[[ 1.0832,  0.8467, -0.1441, -0.3483],
         [-0.6019, -2.1458,  1.5814,  0.3172],
         [-0.6616,  0.6113, -1.1589,  0.3891],
         [-0.0681,  0.5364,  0.4832, -1.0840]],

        [[-0.1853,  0.2207, -0.9035, -2.0175],
         [-2.0792,  0.7193,  0.8436,  0.9591],
         [ 0.6685, -1.0132,  0.2003,  2.2290],
         [-0.2823, -0.8534,  0.1689,  0.1783]]])
tensor([[[ 1.0832e+00,  8.4669e-01, -1.0000e+09, -1.0000e+09],
         [-6.0194e-01, -2.1458e+00, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
         [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]],

        [[-1.8526e-01,  2.2073e-01, -9.0348e-01, -2.0175e+00],
         [-2.0792e+00,  7.1931e-01,  8.4363e-01,  9.5911e-01],
         [ 6.6854e-01, -1.0132e+00,  2.0030e-01,  2.2290e+00],
         [-2.8232e-01, -8.5340e-01,  1.6885e-01,  1.7828e-01]]])
tensor([[[0.5589, 0.4411, 0.0000, 0.0000],
         [0.8240, 0.1760, 0.0000, 0