In [49]:
import math
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data


In [50]:
# 训练参数
device = 'cuda'
epochs = 100

src_len = 8  # 源句子最大长度 encoder_input max seq len
tgt_len = 7  # decoder_input(=output) max seq len

# transformer网络参数
d_model = 512  # Embedding Size (token embedding 和 position embedding 的编码维度)
d_ff = 2048  # Feed Farward Dimension (512->2048->512)
d_k = d_v = 64  # Dimension of K(=Q) and V (K和Q的维度是相同的，这里为了方便让V也等于Q)
n_layers = 6  # Number of encoder and decoder layer blocks
n_heads = 8  # Number of heads in Multi-Head Attention


Data

In [51]:
# 构建数据集

# 训练集
sentences = [
    # 中文和英语的单词个数不要求相同
    # encoder_input           decoder_input               decoder_output
    ['我 有 一 个 好 朋 友 P', 'S i have a good friend .', 'i have a good friend . E'],
    ['我 有 零 个 女 朋 友 P', 'S i have zero girl friend .', 'i have zero girl friend . E']
]

# 测试集
# 输入：”我 有 一 个 女 朋 友“
# 输出：”i have a girlfriend“

# 构建中文词表
src_vocab = {'P': 0, '我': 1, '有': 2, '一': 3,
             '个': 4, '好': 5, '朋': 6, '友': 7, '零': 8, '女': 9}
src_idx2word = {i: w for i, w in enumerate(src_vocab)}
src_vocab_size = len(src_vocab)

# 构建英文词表
tgt_vocab = {'P': 0, 'i': 1, 'have': 2, 'a': 3, 'good': 4,
             'friend': 5, 'zero': 6, 'girl': 7, 'S': 8, 'E': 9, '.': 10}
tgt_idx2word = {i: w for i, w in enumerate(tgt_vocab)}
tgt_vocab_size = len(tgt_vocab)

# 构建数据


def make_data(sentences):
    """把句子中的单词序列转换为单词对应词表中的索引序列"""
    encoder_inputs, decoder_inputs, decoder_outputs = [], [], []
    for i in range(len(sentences)):
        encoder_input = [[src_vocab[n] for n in sentences[i][0].split()]]
        decoder_input = [[tgt_vocab[n] for n in sentences[i][1].split()]]
        decoder_output = [[tgt_vocab[n] for n in sentences[i][2].split()]]

        encoder_inputs.extend(encoder_input)
        decoder_inputs.extend(decoder_input)
        decoder_outputs.extend(decoder_output)

    return torch.LongTensor(encoder_inputs), torch.LongTensor(decoder_inputs), torch.LongTensor(decoder_outputs)


encoder_inputs, decoder_inputs, decoder_outputs = make_data(sentences)
# encoder_inputs.shape[0],encoder_inputs,decoder_inputs,decoder_outputs


DataSet and DataLoader

In [52]:
class SentenceDataSet(Data.Dataset):
    """自定义DataSet"""

    def __init__(self, encoder_inputs, decoder_inputs, decoder_outputs):
        super(SentenceDataSet, self).__init__()
        self.encoder_inputs = encoder_inputs
        self.decoder_inputs = decoder_inputs
        self.decoder_outputs = decoder_outputs

    def __len__(self):
        return self.encoder_inputs.shape[0]

    def __getitem__(self, idx):
        return self.encoder_inputs[idx], self.decoder_inputs[idx], self.decoder_outputs[idx]


loader = Data.DataLoader(SentenceDataSet(
    encoder_inputs, decoder_inputs, decoder_outputs), 2, True)


Positional Encoding

In [53]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        i_mat = torch.pow(10000, torch.arange(0, d_model, 2) / d_model)
        pe[:, 0::2] = torch.sin(position / i_mat)
        pe[:, 1::2] = torch.cos(position / i_mat)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)  # 在模型中定义一个常量pe

    def forward(self, x):
        """x:[seq_len, batch_size, d_model]"""
        x = x+self.pe[:x.size(0), :]
        return self.dropout(x)


Attention Padding Mask

In [54]:
def get_attention_padding_mask(seq_q, seq_k):
    """在对value向量加权和的时候, 使得padding的位置的alpha_ij=0"""
    # 这里有两个输入是因为做cross attention时获取mask矩阵也用的这个方法，但来自enc和dec的seq长度不一定相同
    # 这里变量中的q和k只是表示两个序列，只是个名字，和注意力中的Q、K没有关系
    # seq_q:[batch_size, seq_len]
    # seq_k:[batch_size, seq_len]

    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()

    padding_attention_mask = seq_k.data.eq(0).unsqueeze(1)  # [batch_size, 1, seq_len]

    # [batch_size, len_q, len_k]
    return padding_attention_mask.expand(batch_size, len_q, len_k)

seq_q = torch.Tensor([[1,2,3,0,0]])
seq_k = torch.Tensor([[1,0]])

print(get_attention_padding_mask(seq_q, seq_k))


tensor([[[False,  True],
         [False,  True],
         [False,  True],
         [False,  True],
         [False,  True]]])


Attention Subsequence Mask

In [55]:
def get_attention_subsequence_mask(seq):
    """用于decoder_input的上三角mask"""
    # seq:[batch_size, tgt_len]

    # attn_shape:[batch_size, tgt_len, tgt_len]
    attention_shape = [seq.size(0), seq.size(1), seq.size(1)]

    # 返回一个上三角矩阵，第k条对角线以下元素全为0（主对角线为第0条）
    subsequence_mask = np.triu(np.ones(attention_shape), k=1)

    subsequence_mask = torch.from_numpy(subsequence_mask).byte()

    return subsequence_mask

seq_k = torch.Tensor([[1,2,3,4,0], [1,0,3,5,0]])
get_attention_subsequence_mask(seq_k)


tensor([[[0, 1, 1, 1, 1],
         [0, 0, 1, 1, 1],
         [0, 0, 0, 1, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0]],

        [[0, 1, 1, 1, 1],
         [0, 0, 1, 1, 1],
         [0, 0, 0, 1, 1],
         [0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0]]], dtype=torch.uint8)

Scaled Dot-Product Attention

In [56]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        """
        Q: [batch_size, n_heads, len_q, d_k]
        K: [batch_size, n_heads, len_k, d_k]
        V: [batch_size, n_heads, len_v(=len_k), d_v]
        attn_mask: [batch_size, n_heads, seq_len, seq_len]

        说明: 在encoder-decoder的Attention层中len_q(q1,..qt)和len_k(k1,...km)可能不同
        """
        scores = torch.matmul(Q, K.transpose(-1, -2) / np.sqrt(d_k)) # scores:[batch_size, n_heads, len_q, len_k]
        
        # 使用mask矩阵填充scores(将scores中对应attn_mask为True的位置变为-1e9)
        scores.masked_fill_(attn_mask, -1e9)

        attn = nn.Softmax(dim=-1)(scores) 

        # attn:[batch_size, n_heads, len_q, len_k]
        # V: [batch_size, n_heads, len_v(=len_k), d_v]
        context = torch.matmul(attn, V) # context: [batch_size, n_heads, len_q, d_v]
        
        # context：[[z1,z2,...],[...]]向量, attn为注意力稀疏矩阵（用于可视化的）
        return context, attn

        


Multi-Head Attention

In [57]:
class MultiHeadAttention(nn.Module):
    """此类可以用于:
    Encoder的Self-Attention
    Decoder的Masked Self-Attention
    Encoder-Decoder的Corss Attention"""
    # 输入: [seq_len, d_model]
    # 输出: [seq_len, d_model]

    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads, bias=False)  # Q和K的维度一定是相同的，不然无法做点积
        self.W_K = nn.Linear(d_model, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(d_model, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, d_model)

    def forward(self, input_Q, input_K, input_V, attn_mask):
        """
        input_Q: [batch_size, len_q, d_model]
        input_K: [batch_size, len_k, d_model]
        input_V: [batch_size, len_v(=len_k), d_model]
        attn_mask: [batch_size, seq_len, seq_len]   
        """

        residual, batch_size = input_Q, input_Q.size(0)

        # Trick: 多个head的参数矩阵是放在一起做线性变换的，然后再拆成多个head
        #          (batch_size, seq_len, dim)
        # -proj--> (batch_size, seq_len, dim_new)  dim_new: d_k(or d_v)
        # -split-> (batch_size, seq_len, head, W)
        # -trans-> (batch_size, head, seq_len, W)

        # Q: [batch_size, n_heads, len_q, d_k]
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        # K: [batch_size, n_heads, len_k, d_k]
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        # V: [batch_size, n_heads, len_v(=len_k), d_v]
        V = self.W_V(input_V).view(batch_size,- 1, n_heads, d_v).transpose(1, 2)

        # mask矩阵也要扩充成4维
        # attn_mask: [batch_size, seq_len, seq_len] --> [batch_size, n_heads, seq_len, seq_len]
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)

        # context: [batch_size, n_heads, len_q, d_v]
        # attn: [batch_size, n_heads, len_q, len_k]
        context, attn = ScaledDotProductAttention()(Q, K, V, attn_mask)

        # 将不同head的输出向量拼接在一起
        # context: [batch_size, n_heads, len_q, d_v] --> [batch_size, len_q, n_heads*d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)

        # 保证attention的输出仍是[seq_len, d_model]
        output = self.fc(context) # [batch_size, len_q, d_model]

        return nn.LayerNorm(d_model).to(device)(output + residual), attn



Position-wise Feed-Forward Networks

In [58]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, d_ff, bias = False),
            nn.ReLU(),
            nn.Linear(d_ff, d_model, bias = False)
        )
    
    def forward(self, inputs):
        """inputs: [batch_size, seq_len, d_model]"""

        residual = inputs
        output = self.fc(inputs)
        return nn.LayerNorm(d_model).to(device)(output + residual) # [batch_size, seq_len, d_model]


Encoder Layer

In [59]:
class Encoderlayer(nn.Module):
    def __init__(self):
        super(Encoderlayer, self).__init__()
        self.encoder_self_attn = MultiHeadAttention()
        self.ffn = PoswiseFeedForwardNet()

    def forward(self, encoder_inputs, encoder_self_attn_mask):
        """
        encoder_inputs: [batch_size, src_len, d_model]
        encoder_self_attn_mask: [batch_size, src_len, src_len]

        encoder_outputs: [batch_size, src_len, d_model]
        """
        # 3个encoder_inputs分别用于计算Q、K、V
        encoder_outputs, attn = self.encoder_self_attn(encoder_inputs, encoder_inputs, encoder_inputs, encoder_self_attn_mask)
        encoder_outputs = self.ffn(encoder_outputs)
        # encoder_outputs: [batch_size, src_len, d_model]
        return encoder_outputs, attn


Decoder Layer

In [60]:
class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.decoder_self_attn = MultiHeadAttention()
        self.decoder_encoder_att = MultiHeadAttention()
        self.ffn = PoswiseFeedForwardNet()

    def forward(self, decoder_inputs, encoder_outputs, decoder_self_attn_mask, decoder_encoder_attn_mask):
        """
        decoder_inputs:[batch_size, tgt_len, d_model]
        encoder_outputs:[batch_size, src_len, d_model]
        decoder_self_attn_mask: [batch_size, tgt_len, tgt_len]
        decoder_encoder_attn_mask: [batch_size, tgt_len, src_len]
        """
        # decoder_outputs: [batch_size, tgt_len, d_model]
        # decoder_self_attn: [batch_size, n_heads, tgt_len, tgt_len]
        decoder_outputs, decoder_self_attn = self.decoder_self_attn(decoder_inputs, decoder_inputs, decoder_inputs, decoder_self_attn_mask)
        # decoder_encoder_attn: [batch_size, n_heads, tgt_len, src_len]
        decoder_outputs, decoder_encoder_attn = self.decoder_encoder_att(decoder_inputs, encoder_outputs, encoder_outputs, decoder_encoder_attn_mask)
        decoder_outputs = self.ffn(decoder_outputs)

        return decoder_outputs, decoder_self_attn, decoder_encoder_attn # 后两个是用于可视化的

Encoder

In [61]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([Encoderlayer() for _ in range (n_layers)])
    
    def forward(self, encoder_inputs):
        """encoder_inputs: [batch_size, src_len]"""
        encoder_outputs = self.src_emb(encoder_inputs) # [batch_size, src_len, d_model]
        encoder_outputs = self.pos_emb(encoder_outputs.transpose(0, 1)).transpose(0, 1) # [batch_size, src_len, d_model]
        # Encoder输入序列的padding mask矩阵
        encoder_self_attn_mask = get_attention_padding_mask(encoder_inputs, encoder_inputs) # [batch_size, src_len, src_len]
        encoder_self_attns = [] # 保存attention值画热力图用
        for layer in self.layers:
            encoder_outputs, encoder_self_attn = layer(encoder_outputs, encoder_self_attn_mask)
            encoder_self_attns.append(encoder_self_attn) # 只是用于可视化
        return encoder_outputs, encoder_self_attns

        

Decoder

In [62]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    
    def forward(self, decoder_inputs, encoder_inputs, encoder_outputs):
        """
        decoder_inputs:[batch_size, tgt_len]
        encoder_inputs:[batch_size, src_len]
        encoder_outputs:[batch_size, src_len, d_model]
        """
        decoder_outputs = self.tgt_emb(decoder_inputs)
        decoder_outputs = self.pos_emb(decoder_outputs.transpose(0, 1)).transpose(0, 1).to(device) # [batch_size, tgt_len, d_model]
        
        # Decoder输入序列时的padding mask矩阵（这里例子中的decoder没有加pad，输入的句子长度刚好就是最大长度，但实际应用中都是需要加pad填充的）
        decoder_self_attn_pad_mask = get_attention_padding_mask(decoder_inputs, decoder_inputs).to(device) # [batch_size, tgt_len, tgt_len]

        # Masked Self-Attention
        decoder_self_attn_subsequence_mask = get_attention_subsequence_mask(decoder_inputs).to(device) # [batch_size, tgt_len, tgt_len]
        
        # 把两个mask矩阵相加，既屏蔽了padding信息，又屏蔽了未来时刻的信息
        # torch.gt用于逐元素比较两个矩阵，大于返回1，否则返回0
        decoder_self_attn_mask = torch.gt((decoder_self_attn_pad_mask+decoder_self_attn_subsequence_mask),0).to(device)

        # 用于Cross Attention的mask
        decoder_encoder_attn_mask = get_attention_padding_mask(decoder_inputs, encoder_inputs) # [batch_size, tgt_len, src_len]

        decoder_self_attns, decoder_encoder_attns = [], []
        for layer in self.layers:
            decoder_outputs, decoder_self_attn, decoder_encoder_attn = layer(decoder_outputs, encoder_outputs, decoder_self_attn_mask, decoder_encoder_attn_mask)

            decoder_self_attns.append(decoder_self_attn)
            decoder_encoder_attns.append(decoder_encoder_attn)
        
        # decoder_outputs: [batch_size, tgt_len, d_model]
        return decoder_outputs, decoder_self_attns, decoder_encoder_attns


Transformer

In [63]:
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.encoder = Encoder().to(device)
        self.decoder = Decoder().to(device)
        self.projection = nn.Linear(d_model, tgt_vocab_size, bias=False).to(device)

    def forward(self, encoder_inputs, decoder_inputs):
        """
           encoder_inputs: [batch_size, src_len]
           decoder_inputs: [batch_size, tgt_len]
        """

        encoder_outputs, encoder_self_attns = self.encoder(encoder_inputs)

        decoder_outputs, decoder_self_attns, decoder_encoder_attns = self.decoder(decoder_inputs, encoder_inputs, encoder_outputs)
        # decoder_outputs:[batch_size, tgt_len, d_model] --> decoder_logits:[batch_size, tgt_len, tgt_vocab_size]
        decoder_logits = self.projection(decoder_outputs)

        return decoder_logits.view(-1, decoder_logits.size(-1)), encoder_self_attns, decoder_self_attns, decoder_encoder_attns


Train

In [64]:
model = Transformer().to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.99)

In [65]:
for epoch in range(epochs):

    for encoder_inputs, decoder_inputs, decoder_outputs in loader:
        
        # encoder_inputs: [batch_size, src_len]
        # decoder_inputs: [batch_size, tgt_len]
        # decoder_outputs: [batch_size, tgt_len]
        
        encoder_inputs, decoder_inputs, decoder_outputs = encoder_inputs.to(device), decoder_inputs.to(device), decoder_outputs.to(device)

        # outputs: [batch_size*tgt_len, tgt_vocab_size]

        outputs, encoder_self_attns, decoder_self_attns, decoder_encoder_attns = model(encoder_inputs, decoder_inputs)

        # decoder_outputs.view(-1): [batch_size * tgt_len * tgt_vocab_size]
        loss = loss_fn(outputs, decoder_outputs.view(-1))

        print('Epoch:', '%04d'%(epoch+1), 'loss=','{:.6f}'.format(loss))

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()



Epoch: 0001 loss= 2.740895
Epoch: 0002 loss= 2.635435
Epoch: 0003 loss= 2.333206
Epoch: 0004 loss= 2.167202
Epoch: 0005 loss= 1.812859
Epoch: 0006 loss= 1.651775
Epoch: 0007 loss= 1.395540
Epoch: 0008 loss= 1.116695
Epoch: 0009 loss= 0.933818
Epoch: 0010 loss= 0.772003
Epoch: 0011 loss= 0.602071
Epoch: 0012 loss= 0.445967
Epoch: 0013 loss= 0.345385
Epoch: 0014 loss= 0.240165
Epoch: 0015 loss= 0.176040
Epoch: 0016 loss= 0.134159
Epoch: 0017 loss= 0.117009
Epoch: 0018 loss= 0.087560
Epoch: 0019 loss= 0.095686
Epoch: 0020 loss= 0.093119
Epoch: 0021 loss= 0.090021
Epoch: 0022 loss= 0.087446
Epoch: 0023 loss= 0.073897
Epoch: 0024 loss= 0.081835
Epoch: 0025 loss= 0.066849
Epoch: 0026 loss= 0.059229
Epoch: 0027 loss= 0.056727
Epoch: 0028 loss= 0.054261
Epoch: 0029 loss= 0.056589
Epoch: 0030 loss= 0.045825
Epoch: 0031 loss= 0.045650
Epoch: 0032 loss= 0.055623
Epoch: 0033 loss= 0.055970
Epoch: 0034 loss= 0.058349
Epoch: 0035 loss= 0.066749
Epoch: 0036 loss= 0.058795
Epoch: 0037 loss= 0.038042
E

Predict

In [66]:
def greedy_decoder(model, encoder_input, start_symbol):

    encoder_outputs, encoder_self_attns = model.encoder(encoder_input)
    decoder_input = torch.zeros(1, 0).type_as(encoder_input.data)
    terminal = False
    next_symbol = start_symbol
    while not terminal:
        decoder_input = torch.cat([decoder_input.to(device), torch.tensor([[next_symbol]], dtype=encoder_input.dtype).to(device)], -1)
        decoder_outputs, _, _ = model.decoder(decoder_input, encoder_input, encoder_outputs)
        projected = model.projection(decoder_outputs)
        prob = projected.squeeze(0).max(dim=-1, keepdim=False)[1]

        next_word = prob.data[-1]
        next_symbol = next_word
        if next_symbol == tgt_vocab["E"]:
            terminal = True
    greedy_decoder_predict = decoder_input[:,1:]
    return greedy_decoder_predict


In [68]:
s = [['我 有 零 个 好 朋 友 P', '', '']]
enc_inputs, dec_inputs, dec_outputs = make_data(s)
test_loader = Data.DataLoader(SentenceDataSet(enc_inputs, decoder_inputs, decoder_outputs), 2, True)
enc_inputs, _, _ = next(iter(test_loader))

for i in range(len(enc_inputs)):
    greedy_dec_predict = greedy_decoder(model, enc_inputs[i].view(1, -1).to(device), start_symbol=tgt_vocab["S"])
    print(enc_inputs[i], '->', greedy_dec_predict.squeeze())
    print([src_idx2word[t.item()] for t in enc_inputs[i]], '->', [tgt_idx2word[n.item()] for n in greedy_dec_predict.squeeze()])

tensor([1, 2, 8, 4, 5, 6, 7, 0]) -> tensor([ 1,  2,  6,  7,  5, 10], device='cuda:0')
['我', '有', '零', '个', '好', '朋', '友', 'P'] -> ['i', 'have', 'zero', 'girl', 'friend', '.']
