In [1]:
import torch
import jieba
import nltk
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import sys
import math
import random
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from collections import Counter
from langconv import *

In [2]:
nltk.data.find('.')

FileSystemPathPointer('C:\\Users\\Administrator\\nltk_data')

In [3]:
dropout = 0.2
hidden_size = 100
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# DataPreprocessing

In [4]:
# 读入中英文数据

In [5]:
def load_data(in_file):
    en = []
    cn = []
    num_examples = 0
    with open(file=in_file, mode='r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split('\t')
            line[1] = Converter('zh-hans').convert(line[1])
            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + list(jieba.cut(line[1])) + ['EOS'])
    return en, cn

In [6]:
train_file = './nmt/en-cn/train.txt'
dev_file = './nmt/en-cn/dev.txt'
train_en, train_cn = load_data(in_file=train_file)
dev_en, dev_cn = load_data(in_file=dev_file)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.552 seconds.
Prefix dict has been built successfully.


In [7]:
print(len(train_en), len(train_cn))

14533 14533


In [8]:
print(train_en[:5])

[['BOS', 'anyone', 'can', 'do', 'that', '.', 'EOS'], ['BOS', 'how', 'about', 'another', 'piece', 'of', 'cake', '?', 'EOS'], ['BOS', 'she', 'married', 'him', '.', 'EOS'], ['BOS', 'i', 'do', "n't", 'like', 'learning', 'irregular', 'verbs', '.', 'EOS'], ['BOS', 'it', "'s", 'a', 'whole', 'new', 'ball', 'game', 'for', 'me', '.', 'EOS']]


In [9]:
print(train_cn[:5])

[['BOS', '任何人', '都', '可以', '做到', '。', 'EOS'], ['BOS', '要', '不要', '再', '来', '一块', '蛋糕', '？', 'EOS'], ['BOS', '她', '嫁给', '了', '他', '。', 'EOS'], ['BOS', '我', '不', '喜欢', '学习', '不规则', '动词', '。', 'EOS'], ['BOS', '这', '对', '我', '来说', '是', '个', '全新', '的', '球类', '游戏', '。', 'EOS']]


In [10]:
# 构建单词表

In [11]:
UNK_IDX = 0
PAD_IDX = 1

In [12]:
def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for sent in sentence:
            word_count[sent] += 1
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 2
    word_dict = {word[0]: index + 2 for index, word in enumerate(ls)}
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX
    return word_dict, total_words

In [13]:
en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}

In [14]:
# print(cn_dict)
# print(cn_total_words)
# print(inv_en_dict)
# print(inv_cn_dict)

In [15]:
# 将单词全部转变为数字

In [16]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
    return out_en_sentences, out_cn_sentences

In [17]:
train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)

In [18]:
k = 10000
print(' '.join([inv_en_dict[i] for i in train_en[k]]))
print(' '.join([inv_cn_dict[i] for i in train_cn[k]]))

BOS for what purpose did he come here ? EOS
BOS 他来 这里 的 目的 是 什么 ？ EOS


In [19]:
# 把全部句子分成batch

In [20]:
def get_minibatches(n, minibatch_size, shuffle=True):
    idx_list = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(idx_list)  # 初始batch开始索引位置
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))  # 所有batch放在一个大列表里
    return minibatches

In [21]:
get_minibatches(n=100, minibatch_size=15)  # test

[array([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
 array([90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),
 array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]),
 array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
 array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])]

In [22]:
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)
    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32')
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
    return x, x_lengths

In [23]:
test_seqs = [[2, 12, 167, 23, 114, 5, 27, 1755, 4, 3], [3, 9, 13, 45, 33, 9, 9]]
prepare_data(seqs=test_seqs)

(array([[   2,   12,  167,   23,  114,    5,   27, 1755,    4,    3],
        [   3,    9,   13,   45,   33,    9,    9,    0,    0,    0]]),
 array([10,  7]))

In [24]:
def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(n=len(en_sentences), minibatch_size=batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)  # mb_cn_sentences -> mb_en_sentences
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))  # 一个列表为一个batch数据，所有batch组成一个大列表数据
    return all_ex

In [25]:
batch_size = 64
train_data = gen_examples(en_sentences=train_en, cn_sentences=train_cn, batch_size=batch_size)
random.shuffle(train_data)
dev_data = gen_examples(en_sentences=dev_en, cn_sentences=dev_cn, batch_size=batch_size)

In [26]:
# train_data[0]

# NonAttention

## Model Building

### Encoder

In [27]:
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):  # lengths表示batch里每个句子的长度
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = pack_padded_sequence(
            input=embedded,
            lengths=sorted_len.long().cpu().data.numpy(),
            batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = pad_packed_sequence(sequence=packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        return out, hid[[-1]]  # 有时候num_layers层数多，需要取出最后一层        

### Decoder

In [28]:
class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]  # 隐藏层也要排序
        y_sorted = self.dropout(self.embed(y_sorted))
        packed_seq = pack_padded_sequence(
            input=y_sorted,
            lengths=sorted_len.long().cpu().data.numpy(),
            batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        output = F.log_softmax(self.out(output_seq), -1)
        return output, hid

### Seq2Seq

In [29]:
class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)  # 调用PlainEncoder里面forward的方法，返回out和hid
        output, hid = self.decoder(y=y, y_lengths=y_lengths, hid=hid)  # 调用PlainDecoder里面forward的方法
        return output, None
    
    def translate(self, x, x_lengths, y, max_length=10):  # 这里的y是BOS的数值索引，为2
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid = self.decoder(y=y, y_lengths=torch.ones(batch_size).long().to(y.device), hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
        return torch.cat(preds, 1), None

In [30]:
# masked cross entropy loss

In [34]:
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()
    
    def forward(self, input, target, mask):
        input = input.contiguous().view(-1, input.size(2))
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask  # gather(dim=1)对Tensor每行进行列索引
        output = torch.sum(output) / torch.sum(mask)
        return output

## Model training

In [32]:
# 传入中文和英文参数

In [33]:
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)
model = PlainSeq2Seq(encoder=encoder, decoder=decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = optim.Adam(model.parameters())

In [26]:
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
        print(f'Evaluation loss: {total_loss / total_num_words}')

In [27]:
def train(model, data, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]  # padding的位置设置为0，其他位置设置为1;max(), 记得加括号
            mb_out_mask = mb_out_mask.float()
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)  # 梯度裁剪
            optimizer.step()
            if it % 100 == 0:
                print(f'Epoch: {epoch}, Iteration: {it}, Loss: {loss.item()}')
        print(f'Epoch: {epoch}, Training loss: {total_loss / total_num_words}')
        if epoch % 10 == 0:
            evaluate(model, dev_data)

In [36]:
train(model=model, data=train_data, num_epochs=100)

Epoch: 0, Iteration: 0, Loss: 9.162109375
Epoch: 0, Iteration: 100, Loss: 5.5519208908081055
Epoch: 0, Iteration: 200, Loss: 5.0611348152160645
Epoch: 0, Training loss: 5.624579241444214
Evaluation loss: 4.950168485393913
Epoch: 1, Iteration: 0, Loss: 4.796090126037598
Epoch: 1, Iteration: 100, Loss: 5.11106014251709
Epoch: 1, Iteration: 200, Loss: 4.652934551239014
Epoch: 1, Training loss: 4.672120515636809
Epoch: 2, Iteration: 0, Loss: 4.415272235870361
Epoch: 2, Iteration: 100, Loss: 4.7995829582214355
Epoch: 2, Iteration: 200, Loss: 4.372103214263916
Epoch: 2, Training loss: 4.3419556214933035
Epoch: 3, Iteration: 0, Loss: 4.1346001625061035
Epoch: 3, Iteration: 100, Loss: 4.575839042663574
Epoch: 3, Iteration: 200, Loss: 4.130451202392578
Epoch: 3, Training loss: 4.10210233931212
Epoch: 4, Iteration: 0, Loss: 3.9095547199249268
Epoch: 4, Iteration: 100, Loss: 4.386038303375244
Epoch: 4, Iteration: 200, Loss: 3.888876438140869
Epoch: 4, Training loss: 3.9019035238671433
Epoch: 5, I

Epoch: 41, Iteration: 100, Loss: 1.9102275371551514
Epoch: 41, Iteration: 200, Loss: 1.4650654792785645
Epoch: 41, Training loss: 1.5175493052178421
Epoch: 42, Iteration: 0, Loss: 1.2017388343811035
Epoch: 42, Iteration: 100, Loss: 1.8336445093154907
Epoch: 42, Iteration: 200, Loss: 1.4173647165298462
Epoch: 42, Training loss: 1.4930325140211504
Epoch: 43, Iteration: 0, Loss: 1.1226239204406738
Epoch: 43, Iteration: 100, Loss: 1.8834588527679443
Epoch: 43, Iteration: 200, Loss: 1.4309792518615723
Epoch: 43, Training loss: 1.4700407962536666
Epoch: 44, Iteration: 0, Loss: 1.1705979108810425
Epoch: 44, Iteration: 100, Loss: 1.887696385383606
Epoch: 44, Iteration: 200, Loss: 1.3904147148132324
Epoch: 44, Training loss: 1.4500091486918358
Epoch: 45, Iteration: 0, Loss: 1.089949131011963
Epoch: 45, Iteration: 100, Loss: 1.8100453615188599
Epoch: 45, Iteration: 200, Loss: 1.3520302772521973
Epoch: 45, Training loss: 1.4237448310104193
Epoch: 46, Iteration: 0, Loss: 1.0702838897705078
Epoch: 

Epoch: 82, Iteration: 100, Loss: 1.2232677936553955
Epoch: 82, Iteration: 200, Loss: 0.7761495113372803
Epoch: 82, Training loss: 0.9018916189482964
Epoch: 83, Iteration: 0, Loss: 0.6450355052947998
Epoch: 83, Iteration: 100, Loss: 1.2015255689620972
Epoch: 83, Iteration: 200, Loss: 0.8658823370933533
Epoch: 83, Training loss: 0.8966411729968455
Epoch: 84, Iteration: 0, Loss: 0.6306508183479309
Epoch: 84, Iteration: 100, Loss: 1.1742701530456543
Epoch: 84, Iteration: 200, Loss: 0.7904707789421082
Epoch: 84, Training loss: 0.8889321643531246
Epoch: 85, Iteration: 0, Loss: 0.6847953796386719
Epoch: 85, Iteration: 100, Loss: 1.1852107048034668
Epoch: 85, Iteration: 200, Loss: 0.7653084993362427
Epoch: 85, Training loss: 0.8796683813478496
Epoch: 86, Iteration: 0, Loss: 0.6067969799041748
Epoch: 86, Iteration: 100, Loss: 1.156286597251892
Epoch: 86, Iteration: 200, Loss: 0.7906332612037659
Epoch: 86, Training loss: 0.868869749648312
Epoch: 87, Iteration: 0, Loss: 0.6766155958175659
Epoch: 

## Translation

In [28]:
def translate_dev(i):
    en_sent = ' '.join([inv_en_dict[w] for w in dev_en[i]])  # inv_cn_dict -> inv_en_dict
    cn_sent = ' '.join([inv_cn_dict[w] for w in dev_cn[i]])
    print(f"English: {en_sent}")  # en_dict -> en_sent
    print(f"Chinese: {' '.join(cn_sent)}")
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict['BOS']]]).long().to(device)
    translation, attn = model.translate(mb_x, mb_x_len, bos)
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != 'EOS':
            trans.append(word)
        else:
            break
    print(f"Translation: {''.join(trans)}")

In [61]:
for i in range(100, 110):
    translate_dev(i)
    print('===' * 10)

English: BOS you have nice skin . EOS
Chinese: B O S   你   的   皮 肤   真   好   。   E O S
Translation: 你有一双鞋。
English: BOS you 're UNK correct . EOS
Chinese: B O S   你   U N K   正 确   。   E O S
Translation: 你是不可抗拒的。
English: BOS everyone admired his courage . EOS
Chinese: B O S   每 个   人   都   佩 服   他   的   勇 气   。   E O S
Translation: 每个人都在谈论他的事。
English: BOS what time is it ? EOS
Chinese: B O S   几 点   了   ？   E O S
Translation: 怎么玩呢？
English: BOS i 'm free tonight . EOS
Chinese: B O S   我   今 晚   有 空   。   E O S
Translation: 我今晚有空。
English: BOS here is your book . EOS
Chinese: B O S   这   是   你   的   书   。   E O S
Translation: 你的书在这里。
English: BOS they are at lunch . EOS
Chinese: B O S   他 们   在   吃   午 饭   。   E O S
Translation: 他们在谈得很愉快。
English: BOS this chair is UNK . EOS
Chinese: B O S   这   把   椅 子   U N K   。   E O S
Translation: 这个是很好的。
English: BOS it 's pretty heavy . EOS
Chinese: B O S   它   U N K   。   E O S
Translation: 天快要下雨了。
English: BOS many attended his funeral . EOS


# LuongAttention

## Model Buinding

### Encoder

- Encoder模型的任务是把输入文字传入embedding层和GRU层，转换成一些hidden states作为后续的context vectors

In [29]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)
    
    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        hid = torch.cat([hid[-2], hid[-1]], dim=1)
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        return out, hid

### Attention

- 根据context vectors和当前的输出hidden states，计算输出

In [30]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)
    
    def forward(self, output, context, mask):
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)
        context_in = self.linear_in(context.view(batch_size * input_len, -1)).view(batch_size, input_len, -1)
        attn = torch.bmm(output, context_in.transpose(1, 2))
        attn.data.masked_fill(mask, -1e6)
        attn = F.softmax(attn, dim=2)
        context = torch.bmm(attn, context)
        output = torch.cat((context, output), dim=2)
        output = output.view(batch_size * output_len, -1)
        output = torch.tanh(self.linear_out(output))
        output = output.view(batch_size, output_len, -1)
        return output, attn

### Decoder

- decoder会根据已经翻译好的句子内容，和context vectors来决定下一个输出的单词

In [38]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def create_mask(self, x_len, y_len):
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=x_len.device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=x_len.device)[None, :] < y_len[:, None]
        mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask
    
    def forward(self, ctx, ctx_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        y_sorted = self.dropout(self.embed(y_sorted))
        packed_seq = pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        mask = self.create_mask(y_lengths, ctx_lengths)  # 在类中，可以在别的函数里调用其他函数
        output, attn = self.attention(output_seq, ctx, mask)
        output = F.log_softmax(self.out(output), -1)
        return output, hid, attn

### Seq2Seq

- 最后我们构建Seq2Seq模型把encoder，attention和decoder串到一起

In [39]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(ctx=encoder_out, ctx_lengths=x_lengths, y=y, y_lengths=y_lengths, hid=hid)
        return output, attn
    
    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(ctx=encoder_out, ctx_lengths=x_lengths, y=y, y_lengths=torch.ones(batch_size).long().to(y.device), hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)

## Model training

In [42]:
dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words, embed_size=embed_size, enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words, embed_size=embed_size, enc_hidden_size=hidden_size, dec_hidden_size=hidden_size, dropout=dropout)
model_attn = Seq2Seq(encoder=encoder, decoder=decoder)
model_attn = model_attn.to(device)  # RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'
loss_fn = LanguageModelCriterion().to(device)
optimizer = optim.Adam(model_attn.parameters())

In [43]:
train(model=model_attn, data=train_data, num_epochs=100)

Epoch: 0, Iteration: 0, Loss: 9.083841323852539
Epoch: 0, Iteration: 100, Loss: 5.233992099761963
Epoch: 0, Iteration: 200, Loss: 5.036600112915039
Epoch: 0, Training loss: 5.619496361293119
Evaluation loss: 5.0174953266245
Epoch: 1, Iteration: 0, Loss: 5.418123245239258
Epoch: 1, Iteration: 100, Loss: 4.573816299438477
Epoch: 1, Iteration: 200, Loss: 4.568813323974609
Epoch: 1, Training loss: 4.793363921525538
Epoch: 2, Iteration: 0, Loss: 4.965517044067383
Epoch: 2, Iteration: 100, Loss: 4.154123783111572
Epoch: 2, Iteration: 200, Loss: 4.20722770690918
Epoch: 2, Training loss: 4.404310847049115
Epoch: 3, Iteration: 0, Loss: 4.626755714416504
Epoch: 3, Iteration: 100, Loss: 3.8436853885650635
Epoch: 3, Iteration: 200, Loss: 3.9486076831817627
Epoch: 3, Training loss: 4.12138103870595
Epoch: 4, Iteration: 0, Loss: 4.387110233306885
Epoch: 4, Iteration: 100, Loss: 3.6054558753967285
Epoch: 4, Iteration: 200, Loss: 3.7147574424743652
Epoch: 4, Training loss: 3.8913391629439125
Epoch: 5,

Epoch: 41, Iteration: 0, Loss: 1.3922542333602905
Epoch: 41, Iteration: 100, Loss: 0.865725576877594
Epoch: 41, Iteration: 200, Loss: 0.9584619402885437
Epoch: 41, Training loss: 1.0918338428544667
Epoch: 42, Iteration: 0, Loss: 1.404598355293274
Epoch: 42, Iteration: 100, Loss: 0.8189520239830017
Epoch: 42, Iteration: 200, Loss: 0.9348135590553284
Epoch: 42, Training loss: 1.0675075024824014
Epoch: 43, Iteration: 0, Loss: 1.3821415901184082
Epoch: 43, Iteration: 100, Loss: 0.8094674348831177
Epoch: 43, Iteration: 200, Loss: 0.867322564125061
Epoch: 43, Training loss: 1.048121057894403
Epoch: 44, Iteration: 0, Loss: 1.3603922128677368
Epoch: 44, Iteration: 100, Loss: 0.746292769908905
Epoch: 44, Iteration: 200, Loss: 0.8881151080131531
Epoch: 44, Training loss: 1.0183486800628403
Epoch: 45, Iteration: 0, Loss: 1.3183605670928955
Epoch: 45, Iteration: 100, Loss: 0.7634983062744141
Epoch: 45, Iteration: 200, Loss: 0.9116721749305725
Epoch: 45, Training loss: 0.9990173028967627
Epoch: 46,

Epoch: 81, Training loss: 0.5283196900854777
Epoch: 82, Iteration: 0, Loss: 0.6960014700889587
Epoch: 82, Iteration: 100, Loss: 0.3309747576713562
Epoch: 82, Iteration: 200, Loss: 0.4342208206653595
Epoch: 82, Training loss: 0.5223976796397417
Epoch: 83, Iteration: 0, Loss: 0.6769198775291443
Epoch: 83, Iteration: 100, Loss: 0.34168413281440735
Epoch: 83, Iteration: 200, Loss: 0.42325451970100403
Epoch: 83, Training loss: 0.5158083271339596
Epoch: 84, Iteration: 0, Loss: 0.7354187369346619
Epoch: 84, Iteration: 100, Loss: 0.3864680528640747
Epoch: 84, Iteration: 200, Loss: 0.497295081615448
Epoch: 84, Training loss: 0.5058959165947498
Epoch: 85, Iteration: 0, Loss: 0.6709532141685486
Epoch: 85, Iteration: 100, Loss: 0.34118950366973877
Epoch: 85, Iteration: 200, Loss: 0.4417526125907898
Epoch: 85, Training loss: 0.5019440968690109
Epoch: 86, Iteration: 0, Loss: 0.676158607006073
Epoch: 86, Iteration: 100, Loss: 0.38896089792251587
Epoch: 86, Iteration: 200, Loss: 0.4313361942768097
Epo

## Translation

In [45]:
def translate_dev_attn(i):
    en_sent = ' '.join([inv_en_dict[w] for w in dev_en[i]])  # inv_cn_dict -> inv_en_dict
    cn_sent = ' '.join([inv_cn_dict[w] for w in dev_cn[i]])
    print(f"English: {en_sent}")  # en_dict -> en_sent
    print(f"Chinese: {' '.join(cn_sent)}")
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict['BOS']]]).long().to(device)
    translation, attn = model_attn.translate(mb_x, mb_x_len, bos)
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != 'EOS':
            trans.append(word)
        else:
            break
    print(f"Translation: {''.join(trans)}")

In [46]:
for i in range(100, 110):
    translate_dev_attn(i)
    print('===' * 10)

English: BOS you have nice skin . EOS
Chinese: B O S   你   的   皮 肤   真   好   。   E O S
Translation: 你有一双迷人的眼睛。
English: BOS you 're UNK correct . EOS
Chinese: B O S   你   U N K   正 确   。   E O S
Translation: 你是要到好。
English: BOS everyone admired his courage . EOS
Chinese: B O S   每 个   人   都   佩 服   他   的   勇 气   。   E O S
Translation: 每个人都能听他的情绪。
English: BOS what time is it ? EOS
Chinese: B O S   几 点   了   ？   E O S
Translation: 怎么看？
English: BOS i 'm free tonight . EOS
Chinese: B O S   我   今 晚   有 空   。   E O S
Translation: 我今晚有空。
English: BOS here is your book . EOS
Chinese: B O S   这   是   你   的   书   。   E O S
Translation: 你的书在这儿。
English: BOS they are at lunch . EOS
Chinese: B O S   他 们   在   吃   午 饭   。   E O S
Translation: 他们正在午餐。
English: BOS this chair is UNK . EOS
Chinese: B O S   这   把   椅 子   U N K   。   E O S
Translation: 这本书是假。
English: BOS it 's pretty heavy . EOS
Chinese: B O S   它   U N K   。   E O S
Translation: 它害怕了。
English: BOS many attended his funeral . EOS
Chin