In [1]:
import os
import sys
import math
from collections import Counter
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk

#忽略警告
import warnings

warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 数据预处理
### 加载数据
- 将每行数据分成两部分：英语&中文
- 在分割完成的英文和中文数据，开头加上"BOS",结尾加上"EOS"
- 英文数据使用nltk进行分词

In [2]:
def load_data(in_file):
    en=[]
    cn=[]
    num_examples=0
    with open(in_file,encoding='utf-8') as f:
        for line in f:
            line=line.strip().split("\t")
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            # split chinese sentence into characters
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn
train_file = "data/en-cn/train.txt"
dev_file = "data/en-cn/dev.txt"
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

### 构建单词表
`collections.Counter`用法
- counter工具用于支持便捷和快速的计数，例如：
``` python
from collections import Counter
cnt = Counter()
for word in ['red', 'blue', 'red', 'green', 'blue', 'blue']:
    cnt[word] += 1
print cnt
输出:Counter({'blue': 3, 'red': 2, 'green': 1})
```
- `counter.most_common(n)`从多到少返回一个长度为n的列表
``` python
输入：Counter('abracadabra').most_common(3)
输出：[('a', 5), ('r', 2), ('b', 2)]
```

In [3]:
UNK_IDX=0
PAD_IDX=1
def build_dict(sentences,max_words=50000):
    word_count=Counter()
    for sentence in sentences:
        for s in sentence:
            word_count[s]+=1
    ls=word_count.most_common(max_words)
    total_words = len(ls) + 2
    word2id = {w[0]: index+2 for index, w in enumerate(ls)}
    word2id["UNK"] = UNK_IDX
    word2id["PAD"] = PAD_IDX
    return word2id, total_words

en_word2id, en_total_words = build_dict(train_en)
cn_word2id, cn_total_words = build_dict(train_cn)
en_id2word = {v: k for k, v in en_word2id.items()}
cn_id2word = {v: k for k, v in cn_word2id.items()}

### 将单词转变为数字
- `dict.get(key,default=None)`
    - key -- 字典中要查找的键
    - default -- 如果指定键的值不存在时，返回该默认值
    ``` python 
    dict = {'Name': 'Runoob', 'Age': 27}
    print "Value : %s" %  dict.get('Age')
    print "Value : %s" %  dict.get('Sex', "Never")
    ```
    - 输出：
        * Value : 27
        - Value : Never

In [4]:
def encode(en_sentences, cn_sentences, en_word2id, cn_word2id, sort_by_len=True):

    length = len(en_sentences)
    out_en_sentences = [[en_word2id.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_word2id.get(w, 0) for w in sent] for sent in cn_sentences]

    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
       
    # 把中文和英文按照同样的顺序排序
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]
        
    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_word2id, cn_word2id)
dev_en, dev_cn = encode(dev_en, dev_cn, en_word2id, cn_word2id)

In [5]:
#查看数据集
k = 1000
print(" ".join([cn_id2word[i] for i in train_cn[k]]))
print(" ".join([en_id2word[i] for i in train_en[k]]))

BOS 我 有 很 多 才 能 。 EOS
BOS i have many abilities . EOS


### 将数据集划分为batch
- **`get_minibatches(n, batch_size, shuffle=True)`**
    - n：数据集的大小
    - batch_size：batch的大小
    - shuffle：是否对原数据进行打乱
    - 返回(batches)：列表(其中包含若干个长度为batch_size大小的数组)
- **`prepare_data(seqs)`**
    - seqs：一个batch的数据
    - 该函数是将一个batch的数据转换成矩阵的形式
    - 该矩阵的维度：高度为该batch中数据的个数；宽度为该batch中最长的数据的长度
    - 返回：x(该矩阵)，x_lengths(该batch中每条数据的长度)
- **`gen_examples(en_sentences, cn_sentences, batch_size)`**
    - en_sentences:英文数据
    - cn_sentences:中文数据
    - batch_size:每个batch的大小
    - 返回(all_ex):列表,其中包含若干个元组，每个元组有四个元素:
        - 英文数据矩阵
         ；英文数据长度
        ；中文数据矩阵
        ；中文数据长度

In [6]:
def get_minibatches(n, batch_size, shuffle=True):
    idx_list = np.arange(0, n, batch_size) # [0, 1, ..., n-1]
    if shuffle:
        np.random.shuffle(idx_list)
    batches = []
    for idx in idx_list:
        batches.append(np.arange(idx, min(idx + batch_size, n)))
    return batches

def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)
    max_len = np.max(lengths)

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype("int32")
    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq
    return x, x_lengths #x_mask

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    return all_ex

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
random.shuffle(train_data)
dev_data = gen_examples(dev_en, dev_cn, batch_size)

## Seq2Seq(无Attention机制)
### 定义模型(无Attention机制)
> ####  PlainEncoder()
>     
- **`torch.sort(input, dim=-1, descending=False, out=None) -> (Tensor, LongTensor)`**
    - [Pytorch官方文档](https://pytorch.org/docs/stable/torch.html#torch.sort)
    - 返回：排序后的数据；排序后的数据原本所处的位置
- **`nn.utils.rnn.pack_padded_sequence & pad_packed_sequence`**
    - [Pytorch官方文档](https://pytorch.org/docs/stable/nn.html#pack-padded-sequence)
    - 主要是为了解决RNN的输入数据，不等长的问题
- **`torch.Tensor.contiguous()`**
    - [Pytorch官方文档](https://pytorch.org/docs/stable/tensors.html?highlight=contiguous#torch.Tensor.contiguous)
    - 主要是整合数据，让数据在内存上连续存在之类的
- 返回hid[[-1]]是为了取最后一层的最后一个cell的输出

In [7]:
class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths):
        #这里要排序的原因，是因为对于pack_padded_sequence要求输入是排过序的
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        
        #此处相当于对一个tensor调了下顺序，然后在物理内存上再利用contiguous()将数据放在一起
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        
        return out, hid[[-1]]

class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
#         print(output_seq.shape)
        hid = hid[:, original_idx.long()].contiguous()

        output = F.log_softmax(self.out(output_seq), -1)
        
        return output, hid
    
class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y=y,
                    y_lengths=y_lengths,
                    hid=hid)
        return output, None
    
    def translate(self, x, x_lengths, y, max_length=10):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid = self.decoder(y=y,
                    y_lengths=torch.ones(batch_size).long().to(y.device),
                    hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            
        return torch.cat(preds, 1), None

### 定义损失函数

In [8]:
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: (batch_size * seq_len) * vocab_size
        input = input.contiguous().view(-1, input.size(2))
        # target: batch_size * 1
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask
        output = torch.sum(output) / torch.sum(mask)

        return output

### 定义训练参数

In [9]:
dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words,
                      hidden_size=hidden_size,
                      dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words,
                      hidden_size=hidden_size,
                      dropout=dropout)
model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

### 定义验证、训练以及测试函数

In [10]:
def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    with torch.no_grad():
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss/total_num_words)

In [11]:
def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(device).long()
            mb_y_len[mb_y_len<=0] = 1
            
            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()
            
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            
            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            
            # 更新模型
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()
            
            if it % 100 == 0:
                print("Epoch", epoch, "iteration", it, "loss", loss.item())

                
        print("Epoch", epoch, "Training loss", total_loss/total_num_words)
        if epoch % 5 == 0:
            evaluate(model, dev_data)

In [12]:
train(model, train_data, num_epochs=20)

Epoch 0 iteration 0 loss 8.10999870300293
Epoch 0 iteration 100 loss 5.194458484649658
Epoch 0 iteration 200 loss 5.378503322601318
Epoch 0 Training loss 5.472302010920261
Evaluation loss 4.846975916056102
Epoch 1 iteration 0 loss 5.308679103851318
Epoch 1 iteration 100 loss 4.5411224365234375
Epoch 1 iteration 200 loss 4.858755111694336
Epoch 1 Training loss 4.605002113540477
Epoch 2 iteration 0 loss 4.845099925994873
Epoch 2 iteration 100 loss 4.106603145599365
Epoch 2 iteration 200 loss 4.532668590545654
Epoch 2 Training loss 4.190401199153088
Epoch 3 iteration 0 loss 4.543196201324463
Epoch 3 iteration 100 loss 3.8407373428344727
Epoch 3 iteration 200 loss 4.312853813171387
Epoch 3 Training loss 3.9261740795265876
Epoch 4 iteration 0 loss 4.331034183502197
Epoch 4 iteration 100 loss 3.641451358795166
Epoch 4 iteration 200 loss 4.148542881011963
Epoch 4 Training loss 3.731367938867295
Epoch 5 iteration 0 loss 4.1790971755981445
Epoch 5 iteration 100 loss 3.4977612495422363
Epoch 5 i

In [13]:
def translate_dev(i):
    en_sent = " ".join([en_id2word[w] for w in dev_en[i]])
    print(en_sent)
    cn_sent = " ".join([cn_id2word[w] for w in dev_cn[i]])
    print("".join(cn_sent))

    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_word2id["BOS"]]]).long().to(device)

    translation, attn = model.translate(mb_x, mb_x_len, bos)
    translation = [cn_id2word[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))

for i in range(100,120):
    translate_dev(i)
    print()

BOS you have nice skin . EOS
BOS 你 的 皮 膚 真 好 。 EOS
你有一個好人。

BOS you 're UNK correct . EOS
BOS 你 部 分 正 确 。 EOS
你在做什麼事。

BOS everyone admired his courage . EOS
BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
他的父親是他的父親。

BOS what time is it ? EOS
BOS 几 点 了 ？ EOS
有什么时候？

BOS i 'm free tonight . EOS
BOS 我 今 晚 有 空 。 EOS
我今天下午。

BOS here is your book . EOS
BOS 這 是 你 的 書 。 EOS
你的車子有一本書。

BOS they are at lunch . EOS
BOS 他 们 在 吃 午 饭 。 EOS
他們在這裡。

BOS this chair is UNK . EOS
BOS 這 把 椅 子 很 UNK 。 EOS
这是个苹果。

BOS it 's pretty heavy . EOS
BOS 它 真 重 。 EOS
它是个好。

BOS many attended his funeral . EOS
BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
他的父親是他的。

BOS training will be provided . EOS
BOS 会 有 训 练 。 EOS
别再试试。

BOS someone is watching you . EOS
BOS 有 人 在 看 著 你 。 EOS
有人的你都是个好人。

BOS i slapped his face . EOS
BOS 我 摑 了 他 的 臉 。 EOS
我在他的房子。

BOS i like UNK music . EOS
BOS 我 喜 歡 流 行 音 樂 。 EOS
我喜歡爵士樂。

BOS tom had no children . EOS
BOS T o m 沒 有 孩 子 。 EOS
汤姆不知道。

BOS please lock the door . EOS
BOS 請 把 門 鎖 上 。 EOS
請把門關上。

BOS tom has calme

## Seq2Seq包含Attention机制

In [14]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        
        hid = torch.cat([hid[-2], hid[-1]], dim=1)
        hid = torch.tanh(self.fc(hid)).unsqueeze(0  )

        return out, hid

In [15]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        super(Attention, self).__init__()

        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size*2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size*2 + dec_hidden_size, dec_hidden_size)
        
    def forward(self, output, context, mask):
        # output: batch_size, output_len, dec_hidden_size
        # context: batch_size, context_len, 2*enc_hidden_size
        # 此处output就相当于原文公式中的 ht; context相当于原文公式中的 hs拔
        
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)
        
        context_in = self.linear_in(context.view(batch_size*input_len, -1)).view(                
            batch_size, input_len, -1) # batch_size, context_len, dec_hidden_size
        
        # context_in.transpose(1,2): batch_size, dec_hidden_size, context_len 
        # output: batch_size, output_len, dec_hidden_size
        attn = torch.bmm(output, context_in.transpose(1,2)) 
        # batch_size, output_len, context_len

        attn.data.masked_fill(mask, -1e6)

        attn = F.softmax(attn, dim=2) 
        # batch_size, output_len, context_len

        context = torch.bmm(attn, context) 
        # batch_size, output_len, enc_hidden_size
        
        output = torch.cat((context, output), dim=2) # batch_size, output_len, hidden_size*2

        output = output.view(batch_size*output_len, -1)
        output = torch.tanh(self.linear_out(output))
        output = output.view(batch_size, output_len, -1)
        return output, attn

In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # a mask of shape x_len * y_len
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=x_len.device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=x_len.device)[None, :] < y_len[:, None]
#         mask = (1 - x_mask[:, :, None] * y_mask[:, None, :]).byte()
        mask = (~x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask
    
    def forward(self, ctx, ctx_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]
        
        y_sorted = self.dropout(self.embed(y_sorted)) # batch_size, output_length, embed_size

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()

        mask = self.create_mask(y_lengths, ctx_lengths)

        output, attn = self.attention(output_seq, ctx, mask)
        output = F.log_softmax(self.out(output), -1)
        
        return output, hid, attn

In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(ctx=encoder_out, 
                    ctx_lengths=x_lengths,
                    y=y,
                    y_lengths=y_lengths,
                    hid=hid)
        return output, attn
    
    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(ctx=encoder_out, 
                    ctx_lengths=x_lengths,
                    y=y,
                    y_lengths=torch.ones(batch_size).long().to(y.device),
                    hid=hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)
        return torch.cat(preds, 1), torch.cat(attns, 1)

### 定义模型训练参数

In [18]:
dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words,
                       embed_size=embed_size,
                      enc_hidden_size=hidden_size,
                       dec_hidden_size=hidden_size,
                      dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                      embed_size=embed_size,
                      enc_hidden_size=hidden_size,
                       dec_hidden_size=hidden_size,
                      dropout=dropout)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

In [19]:
train(model, train_data, num_epochs=30)

Epoch 0 iteration 0 loss 8.074338912963867
Epoch 0 iteration 100 loss 5.239917755126953
Epoch 0 iteration 200 loss 5.501195430755615
Epoch 0 Training loss 5.4583675025075475
Evaluation loss 4.990075687667783
Epoch 1 iteration 0 loss 5.441702842712402
Epoch 1 iteration 100 loss 4.830410003662109
Epoch 1 iteration 200 loss 5.0964250564575195
Epoch 1 Training loss 4.8115485397896975
Epoch 2 iteration 0 loss 5.061765670776367
Epoch 2 iteration 100 loss 4.367208957672119
Epoch 2 iteration 200 loss 4.726598262786865
Epoch 2 Training loss 4.39293097572089
Epoch 3 iteration 0 loss 4.7129034996032715
Epoch 3 iteration 100 loss 4.004227161407471
Epoch 3 iteration 200 loss 4.442802906036377
Epoch 3 Training loss 4.052202010597128
Epoch 4 iteration 0 loss 4.454996585845947
Epoch 4 iteration 100 loss 3.7315001487731934
Epoch 4 iteration 200 loss 4.232832431793213
Epoch 4 Training loss 3.7945316203644133
Epoch 5 iteration 0 loss 4.212111949920654
Epoch 5 iteration 100 loss 3.5149412155151367
Epoch 5

In [20]:
for i in range(100,120):
    translate_dev(i)
    print()

BOS you have nice skin . EOS
BOS 你 的 皮 膚 真 好 。 EOS
你有很多。

BOS you 're UNK correct . EOS
BOS 你 部 分 正 确 。 EOS
你在看看的。

BOS everyone admired his courage . EOS
BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
每个人都是他的妻子。

BOS what time is it ? EOS
BOS 几 点 了 ？ EOS
它是什么时候？

BOS i 'm free tonight . EOS
BOS 我 今 晚 有 空 。 EOS
我今晚玩得很晚。

BOS here is your book . EOS
BOS 這 是 你 的 書 。 EOS
你这本书是你的。

BOS they are at lunch . EOS
BOS 他 们 在 吃 午 饭 。 EOS
他們在吃午饭。

BOS this chair is UNK . EOS
BOS 這 把 椅 子 很 UNK 。 EOS
这个苹果很大。

BOS it 's pretty heavy . EOS
BOS 它 真 重 。 EOS
它是一切的。

BOS many attended his funeral . EOS
BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
許多意外上有人叫他。

BOS training will be provided . EOS
BOS 会 有 训 练 。 EOS
停止。

BOS someone is watching you . EOS
BOS 有 人 在 看 著 你 。 EOS
只是你的。

BOS i slapped his face . EOS
BOS 我 摑 了 他 的 臉 。 EOS
我和他的姐姐。

BOS i like UNK music . EOS
BOS 我 喜 歡 流 行 音 樂 。 EOS
我喜歡爵士樂。

BOS tom had no children . EOS
BOS T o m 沒 有 孩 子 。 EOS
汤姆没有孩子。

BOS please lock the door . EOS
BOS 請 把 門 鎖 上 。 EOS
請關門。

BOS tom has calmed do

#### 胡言乱语         
- 对于本文所实现的两种机器翻译模型：
- **训练模型过程中** ：对于Encoder部分，每一个cell的输入都取自上一个cell理论上的真实输出，而不是实际上的输出，具体可看**`class Encoder`**和**`class PlainEncoder()`**部分
- **利用模型预测的过程**：对于Encoder部分，每一个cell的输入都取自上一个cell的实际输出，具体参考**`model.translate()`**