#### 构建演示语料库和词汇表

数据说明：
eg. '小明 喜欢 小冰', '<SOS> XiaoMing likes XiaoBing','XiaoMing likes XiaoBing <EOS>'
    
- 第一句，中文句子，作为输入序列提供给编码器
- 第二句，(<SOS>+目标语言)：英文句子，作为解码器的输入序列。句子以特殊符号<SOS>开头，表示句子的开始。<SOS>有助于解码器学会在何时开始生成目标句子。
- 第三句，(目标语言+<EOS>)：也是英文句子，作为解码器的目标输出序列，句子以特殊符号<EOS>结尾，表示句子结束。<EOS>有助于解码器学会在何时结束目标句子的生成。

In [12]:
sentences = [
    ['小明 喜欢 小冰', '<SOS> XiaoMing likes XiaoBing','XiaoMing likes XiaoBing <EOS>'],
    ['我 爱 学习 人工智能', '<SOS> I Love studying AI', 'I Love studying AI <EOS>'],
    ['深度学习 改变 世界', '<SOS> DL changed the world', 'DL changed the world <EOS>'],
    ['自然 语言 处理 很 强大', '<SOS> NLP is so powerful', 'NLP is so powerful <EOS>'],
    ['神经网络 非常 复杂', '<SOS> Neural-Nets are complex', 'Neural-Nets are complex <EOS>']
]
# 初始化中英文词汇表
word_list_cn, word_list_en = [],[]
for sent in sentences:
    word_list_cn.extend(sent[0].split())
    word_list_en.extend(sent[1].split())
    word_list_en.extend(sent[2].split())


word_list_cn = list(set(word_list_cn))   
word_list_en = list(set(word_list_en))

word2idx_cn = {word:idx for idx, word in enumerate(word_list_cn)}
word2idx_en = {word:idx for idx, word in enumerate(word_list_en)}

idx2word_cn = {idx:word for idx, word in enumerate(word_list_cn)}
idx2word_en = {idx:word for idx, word in enumerate(word_list_en)}

voc_size_cn = len(word_list_cn)
voc_size_en = len(word_list_en)

#### 生成Seq2Seq训练数据

为什么要有解码器输入张量？  
在训练阶段，向解码器提供这个信息，模型就能够以正确单词为基础来生成下一个单词，以提高训练速度

In [33]:
import random
import torch

In [37]:
def make_data(sentences):
    random_sentence = random.choice(sentences)
    
    encoder_input = [word2idx_cn[sent] for sent in random_sentence[0].split()]
    
    decoder_input = [word2idx_en[sent] for sent in random_sentence[1].split()]
    
    target = [word2idx_en[sent] for sent in random_sentence[2].split()]
    
    encoder_input = torch.LongTensor(encoder_input).unsqueeze(0)
    decoder_input = torch.LongTensor(decoder_input).unsqueeze(0)
    target = torch.LongTensor(target).unsqueeze(0)
    
    return random_sentence, encoder_input, decoder_input, target

In [38]:
random_sentence, encoder_input, decoder_input, target = make_data(sentences)
print(f"文本为:{random_sentence}")
print(f"编码器输入数据:{encoder_input}, 维度:{encoder_input.shape}")
print(f"解码器输入数据:{decoder_input}, 维度:{decoder_input.shape}")
print(f"目标数据:{target}, 维度:{target.shape}")

文本为:['深度学习 改变 世界', '<SOS> DL changed the world', 'DL changed the world <EOS>']
编码器输入数据:tensor([[ 0, 10, 13]]), 维度:torch.Size([1, 3])
解码器输入数据:tensor([[18,  9, 17,  8, 13]]), 维度:torch.Size([1, 5])
目标数据:tensor([[ 9, 17,  8, 13, 19]]), 维度:torch.Size([1, 5])


#### 构建编码器和解码器

In [41]:
import torch.nn as nn

In [91]:
class Encoder(nn.Module):
    def __init__(self,voc_size_cn, n_hidden):
        super().__init__()
        self.embedd = nn.Embedding(voc_size_cn, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, batch_first=True)
    
    def forward(self, inputs, hidden):
        embedd = self.embedd(inputs)
        output, hidden = self.rnn(embedd, hidden)
        return output, hidden

In [92]:
class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        
    def forward(self, encoder_context, decoder_context):
        """
        decoder_context做为Q,
        encpder_context作为K和V
        """
        # [btach, 一行token的个数, hidden] -> [btach, 一行token的个数, 一行token的个数]
        score = torch.matmul(decoder_context, encoder_context.transpose(-2,-1))
        atten_weight = nn.functional.softmax(score, dim=-1)
        # [btach, 一行token的个数, 一行token的个数] -> [btach, 一行token的个数,hidden]
        context = torch.matmul(atten_weight, encoder_context)
        return context, atten_weight

In [93]:
class DecoderWithAttention(nn.Module):
    def __init__(self, voc_size_en, n_hidden):
        super(Decoder, self).__init__()
        self.embedd = nn.Embedding(voc_size_en, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, batch_first=True)
        self.atten = Attention()
        self.linear = nn.Linear(2 * n_hidden, voc_size_en, bias=True)
    
    def forward(self, dec_input, hidden, enc_output):
        # [batch_size, token长度] -> [batch_size, token长度, n_hidden]
        embedd = self.embedd(dec_input)
        rnn_output, hidden = self.rnn(embedd, hidden)
        context, atten_weight = self.atten(enc_output, rnn_output)
        dec_output = torch.cat((output, context),dim=-1)
        dec_output = self.linear(dec_output)
        return dec_output, context, atten_weight

In [95]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, encoder_input, hidden, decoder_input):
        encoder_output, encoder_hidden = self.encoder(encoder_input, hidden)
        decoder_output, _, atten_weight = self.decoder(decoder_input, encoder_hidden, encoder_output)
        return decoder_output, atten_weight   

In [80]:
model = Decoder(voc_size_en,128)
x = torch.LongTensor([[1,2,3,4]])
n_hidden = torch.zeros(1, x.size(0),128)
print(model(x, n_hidden).shape)

torch.Size([1, 4, 20])


In [73]:
x = torch.randn(2, 3)  # 一个2x3的张量
y = torch.randn(2, 3)  # 另一个2x3的张量

In [75]:
torch.cat((x,y), 1).shape

torch.Size([2, 6])