In [48]:
import torch.utils.data as data
import torch.nn as nn
import torch

import json
import os
import re
import numpy as np
import random

def set_seed(seed):
    """Seed every random number generator this notebook draws from.

    Applies ``seed`` to Python's ``random`` module, NumPy's global
    generator and PyTorch (CPU generator, the current CUDA device and all
    CUDA devices), in that order.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
# Fix all RNG seeds once, before any dataset, loader or model is created.
set_seed(9)

# Device that later cells move models and batches to (CPU only here).
device = torch.device('cpu')

# Seq2Seq

机器翻译，英文到中文的seq2seq实验。

# Data Preprocess

## Data Structure

提供训练集、测试集、验证集，每一行都是一句英文对应一句中文。其中中文利用jieba进行分词，英文使用subword-nmt将word转化为subword。如"loved"、"loving"、"loves"这三个单词，其本身的语义都是“爱”的意思。BPE通过训练，能够把上面的3个单词拆分成"lov"、"ed"、"ing"、"es"几部分，这样可以把词本身的意思和时态分开，有效地减小词表的大小。词与词之间用空格隔开，中英文之间用tab隔开。

````python
what were you doing in the at@@ tic ? 	你 在 閣樓 上 做 了 什麼 ？ 
````

字典部分，已经处理好中英的字典，放在json文件中，word2int，int2word都有。


## Preprocess - SeqDataset

需要做的事主要是：

- 特殊字元： < PAD >, < BOS >, < EOS >, < UNK >转化，分别用于填充，标记开始，标记结束，标记未知
- 长度规整，输入输出，需要规整到相同长度
- word to index，中英文分别处理。


In [2]:
class SeqDataset(data.Dataset):
    """Parallel English/Chinese corpus encoded as fixed-length index arrays.

    Files read from ``path``:
      * ``word2int_{en,cn}.json`` / ``int2word_{en,cn}.json`` - vocabularies
        (must contain ``<BOS>``, ``<EOS>``, ``<UNK>`` and ``<PAD>``);
      * ``{name}.txt`` - one sentence pair per line, English and Chinese
        separated by a tab, tokens separated by spaces.

    Attributes:
        data:   int array ``[n_sentences, sen_len]`` of English token ids.
        labels: int array ``[n_sentences, sen_len]`` of Chinese token ids.
        cn_vocab_size / en_vocab_size: sizes of the two word->index maps.
    """

    def __init__(self, path, name, sen_len):
        self.path = path  # directory holding the dictionaries and the corpus
        self.sen_len = sen_len  # every encoded sentence has exactly this length
        self.name = name  # corpus file stem, e.g. 'training'
        # load dict
        self.word2idx_cn, self.idx2word_cn = self.load_dict('cn')
        self.word2idx_en, self.idx2word_en = self.load_dict('en')
        # sentence to idx
        self.data, self.labels = self.load_data()
        self.cn_vocab_size = len(self.word2idx_cn)
        self.en_vocab_size = len(self.word2idx_en)

    def __getitem__(self, idx):
        """Return the ``(english_ids, chinese_ids)`` pair at ``idx``."""
        return self.data[idx], self.labels[idx]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def load_dict(self, lang):
        """Load ``(word2idx, idx2word)`` for ``lang`` ('en' or 'cn').

        ``idx2word`` comes straight from JSON, so its keys are strings.
        """
        with open(os.path.join(self.path, f'int2word_{lang}.json'), 'r', encoding='utf-8') as f:
            idx2word = json.load(f)
        with open(os.path.join(self.path, f'word2int_{lang}.json'), 'r', encoding='utf-8') as f:
            word2idx = json.load(f)
        return word2idx, idx2word

    def load_data(self):
        """Read ``{name}.txt`` and encode both sides to ``[n, sen_len]`` arrays.

        Each sentence becomes ``<BOS> tokens... <EOS>``; unknown words map to
        ``<UNK>``. Short sentences are right-padded with ``<PAD>``; long ones
        are truncated with the last slot overwritten by the final token so
        ``<EOS>`` survives. Blank lines are skipped.

        Raises:
            AssertionError: a non-blank line does not split into exactly two
                tab-separated fields.
        """
        def format_len(temp, sen_len, pad):
            if len(temp) > sen_len:
                # truncate, but keep the original last token (<EOS>)
                end = temp[-1]
                temp = temp[:sen_len]
                temp[-1] = end
            else:
                temp = np.pad(temp, (0, sen_len - len(temp)), constant_values=pad)
            return np.array(temp)

        def sentence_to_idxs(sens, word2idx, sen_len):
            BOS, EOS, UNK, PAD = word2idx['<BOS>'], word2idx['<EOS>'], word2idx['<UNK>'], word2idx['<PAD>']
            if not sens:
                # empty corpus: still return a well-formed 2-D array
                return np.empty((0, sen_len), dtype=np.int64)
            rows = []
            for sen in sens:
                ids = [BOS]
                # consecutive spaces yield empty strings; drop them
                ids.extend(word2idx.get(word, UNK) for word in sen.split(' ') if word)
                ids.append(EOS)
                rows.append(format_len(ids, sen_len, PAD))
            return np.stack(rows)

        # read data; a "blank" line still contains '\n', so test the stripped text
        with open(os.path.join(self.path, f'{self.name}.txt'), 'r', encoding='utf-8') as f:
            lines = [line for line in f if line.strip()]

        # split cn en
        en, cn = [], []
        for line in lines:
            temp = re.split('[\t\n]', line.strip())
            assert len(temp) == 2, f'expected "<english>\\t<chinese>", got {line!r}'
            en.append(temp[0])
            cn.append(temp[1])

        # word to idx
        data = sentence_to_idxs(en, self.word2idx_en, self.sen_len)
        labels = sentence_to_idxs(cn, self.word2idx_cn, self.sen_len)

        return data, labels

In [3]:
# Parameters for the dataset smoke test in the next cell.
path = './cmn-eng/'  # directory with the corpus and the JSON dictionaries
name = 'testing'     # corpus file stem, i.e. ./cmn-eng/testing.txt
lang = 'cn'          # NOTE(review): not referenced by any later cell in view

In [4]:
# TEST
train_set = SeqDataset(path, name, 10)
print(train_set.word2idx_cn['快樂'], train_set.idx2word_cn['847'], train_set.word2idx_en['happy'])
train_set.labels.shape

847 快樂 219


(2636, 10)

# Architecture

模型的主体，包含

- Encoder
- Decoder
- Seq2Seq
- Attention

## Encoder

In [5]:
class Encoder(nn.Module):
    """Bidirectional multi-layer GRU encoder over embedded source tokens.

    Args:
        en_vocab_size: number of rows in the source embedding table.
        emb_dim: embedding width.
        hid_dim: GRU hidden width per direction.
        n_layers: number of stacked GRU layers.
        dropout: probability used both on the embeddings and between GRU layers.
    """

    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim, self.n_layers = hid_dim, n_layers
        # Sub-modules are created in the order embedding -> rnn -> dropout so
        # that seeded initialisation and the printed layout stay the same.
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_):
        """Encode a batch of token ids.

        Args:
            input_: integer tensor ``[batch, seq_len]`` of source token ids.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is
            ``[batch, seq_len, hid_dim * 2]`` (top GRU layer, both directions)
            and ``hidden`` is ``[n_layers * 2, batch, hid_dim]``.
        """
        embedded = self.dropout(self.embedding(input_))  # [batch, seq_len, emb_dim]
        return self.rnn(embedded)

In [6]:
from pytorch_model_summary import summary

In [7]:
# Shape/parameter report for a toy encoder fed a dummy [2, 10] batch of ids.
print(summary(Encoder(100,256, 256, 2, 0.5).to(device), torch.zeros((2, 10), dtype = torch.long).to(device), show_hierarchical=True))

---------------------------------------------------------------------------------
      Layer (type)                  Output Shape         Param #     Tr. Param #
       Embedding-1                  [2, 10, 256]          25,600          25,600
         Dropout-2                  [2, 10, 256]               0               0
             GRU-3     [2, 10, 512], [4, 2, 256]       1,972,224       1,972,224
Total params: 1,997,824
Trainable params: 1,997,824
Non-trainable params: 0
---------------------------------------------------------------------------------



Encoder(
  (embedding): Embedding(100, 256), 25,600 params
  (rnn): GRU(256, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True), 1,972,224 params
  (dropout): Dropout(p=0.5, inplace=False), 0 params
), 1,997,824 params





## Attention

Attention的实现，主要是通过decoder当前时间步的信息 => seq len个权重。具体的实现方法有很多。我看了一些资料，小结一下。

Attention输入：

- encoder_outputs = [none, seq_len, hidden_dim], 注意hidden_dim和encoder中GRU的方向有关，可能得x2
- hidden = [num_layers <* num_directions>, batch, hidden_dim] 当前时间步的hidden output，默认情况下，decoder中GRU为单向, 如果使用encoder_hidden做decoder第一个时间步的输入，那么需要把双向的结果接起来，最后维度x2
- input = [batch, 1]，输入，因为decoder是单步执行，所以只传一个时间步上的值，经过embedding会变成[batch, 1, emb_dim]

notes：attention求法

- $\boldsymbol{h}_{t}^{\top} \boldsymbol{W} \overline{\boldsymbol{h}}_{s} \quad$ [Luong's multiplicative style]， 其中h分别为encoder_outputs和hidden
- $\boldsymbol{v}_{a}^{\top} \tanh \left(\boldsymbol{W}_{1} \boldsymbol{h}_{t}+\boldsymbol{W}_{2} \overline{\boldsymbol{h}}_{s}\right)$，其中v，w都为参数矩阵，也就是linear
- 只使用input和hidden进行concatenate，然后利用linear转为seq len个单元

最后使用softmax求出权重。

注意上述过程中，会出现维度不匹配问题，多半是层次数引起的，其实在该维度上，上述方法都可以直接广播复制，只关注最后的维度即可。


notes：

无需担心3d矩阵乘法问题，实际上以下代码只关注dim 2的值，对这个dim上的值做线性变化。

In [8]:
# Demo: nn.Linear only transforms the last dimension of a 3-D input, so
# [128, 10, 256] -> [128, 10, 10]; the leading dimensions pass through.
a = torch.zeros(128, 10, 256)
linear = nn.Linear(256,10)
b = linear(a)
print(b.shape)

torch.Size([128, 10, 10])


In [9]:
# [2, 10, 512], [4, 2, 256]

In [10]:
class Attention(nn.Module):
    """Additive attention: ``V(tanh(W1 * h_dec + W2 * h_enc))``.

    Args:
        num_layer: number of decoder GRU layers; their hidden states are
            concatenated to form the query.
        hidden_dim: decoder hidden width, which is also the width of the
            encoder outputs fed to ``W2``.
    """

    def __init__(self, num_layer, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.W1 = nn.Linear(num_layer * hidden_dim, hidden_dim)  # projects the query
        self.W2 = nn.Linear(hidden_dim, hidden_dim)              # projects each encoder step
        self.V = nn.Linear(hidden_dim, 1)                        # one score per encoder step

    def forward(self, input_, hidden, encoder_outputs):
        """Return the attention-weighted sum of the encoder outputs.

        Args:
            input_: decoder input for this step; not used by this scorer.
            hidden: decoder state ``[num_layer, batch, hidden_dim]``.
            encoder_outputs: ``[batch, seq_len, hidden_dim]``.

        Returns:
            Context tensor ``[batch, hidden_dim]``.
        """
        batch = hidden.size(1)
        # Lay the layers side by side: [batch, 1, num_layer * hidden_dim].
        query = hidden.transpose(0, 1).reshape(batch, -1).unsqueeze(1)
        # The query broadcasts over seq_len when added to the projected outputs.
        energy = self.V(torch.tanh(self.W1(query) + self.W2(encoder_outputs)))
        # Normalise over the sequence axis: [batch, seq_len, 1].
        weights = energy.softmax(dim=1)
        return (encoder_outputs * weights).sum(dim=1)

In [11]:
# TEST
num_layer = 2
hidden_dim = 256 * num_layer
input_ = torch.zeros((128, 1), dtype = torch.int64)
hidden = torch.zeros((num_layer, 128, hidden_dim))
encoder_outputs = torch.zeros((128, 10, hidden_dim))
att = Attention(num_layer, hidden_dim)
print(att(input_, hidden, encoder_outputs).shape)

torch.Size([128, 512])


## Decoder

Decoder任务比较简单，就是跑数据，不用管teacher forcing以及beam search，需要注意的是，decoder的输入是单个time step上的数据。

In [12]:
class Decoder(nn.Module):
    """Single-step GRU decoder with optional additive attention.

    Args:
        cn_vocab_size: target vocabulary size (embedding rows / logits width).
        emb_dim: target embedding width.
        hidden_dim: GRU hidden width (unidirectional).
        num_layer: number of stacked GRU layers.
        dropout: probability used on the embeddings and between GRU layers.
        isatt: when ``True`` an ``Attention`` context vector is concatenated
            to the embedding before it enters the GRU.
    """

    def __init__(self, cn_vocab_size,  emb_dim,  hidden_dim, num_layer,dropout, isatt):
        super().__init__()
        self.emb = nn.Embedding(cn_vocab_size, emb_dim)
        t_dim =  emb_dim
        self.isatt = False
        if isatt == True:
            self.isatt = isatt
            self.att = Attention(num_layer, hidden_dim)
            # the context vector is appended to the embedding
            t_dim += hidden_dim

        self.rnn = nn.GRU(t_dim, hidden_dim, num_layer, dropout = dropout, batch_first = True)
        self.dropout = nn.Dropout(dropout)
        # NOTE(review): three stacked Linear layers with no non-linearity in
        # between; kept as-is so existing checkpoints still load.
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.Linear(hidden_dim * 2, hidden_dim * 4),
            nn.Linear(hidden_dim * 4, cn_vocab_size)
        )
        self.cn_vocab_size = cn_vocab_size

    def forward(self, input_, hidden, encoder_outputs):
        '''
        Run one decoding step.

        input_ = [batch, 1], token ids fed at this step
        hidden = [num_layer, batch, hidden_dim], state carried from the previous step
        encoder_outputs = [batch, seq_len, hidden_dim]

        Returns (logits [batch, cn_vocab_size], new hidden [num_layer, batch, hidden_dim]).
        '''
        emb = self.emb(input_)
        emb = self.dropout(emb)
        in_cat = emb
        # [batch, 1, emb_dim]
        # attention
        if self.isatt == True:
            context = self.att(input_, hidden, encoder_outputs)
            # context = [batch, hidden_dim]
            context = context.unsqueeze(1)
            # [batch, 1, hidden_dim]
            in_cat = torch.cat([emb, context], dim = 2)
        # [batch, 1, hidden_dim + emb_dim]
        # Fix: the previous state must be passed to the GRU; without it the
        # recurrence restarts from zeros at every step and the decoder
        # forgets everything it has generated so far.
        out, hidden = self.rnn(in_cat, hidden.contiguous())

        # out = [batch, 1, hidden_dim]
        out = out.squeeze(1)
        # [batch, hidden_dim]
        out = self.fc(out)
        # [batch, cn_vocab_size]
        return out, hidden

In [13]:
# TEST
cn_vocab_size, en_vocab_size = 3000, 2500
emb_dim, num_layer =  128, 3
hidden_dim, dropout = 256, 0.5
decoder = Decoder(cn_vocab_size, emb_dim, hidden_dim, num_layer, dropout, True)

In [14]:
# Dummy inputs: token ids [2, 1], decoder state [3, 2, 256], encoder outputs [2, 10, 256].
print(summary(decoder, torch.zeros((2, 1), dtype = torch.long), torch.zeros(3, 2, 256), torch.zeros(2, 10, 256), show_hierarchical=True))

--------------------------------------------------------------------------------
      Layer (type)                 Output Shape         Param #     Tr. Param #
       Embedding-1                  [2, 1, 128]         384,000         384,000
         Dropout-2                  [2, 1, 128]               0               0
       Attention-3                     [2, 256]         262,913         262,913
             GRU-4     [2, 1, 256], [3, 2, 256]       1,282,560       1,282,560
          Linear-5                     [2, 512]         131,584         131,584
          Linear-6                    [2, 1024]         525,312         525,312
          Linear-7                    [2, 3000]       3,075,000       3,075,000
Total params: 5,661,369
Trainable params: 5,661,369
Non-trainable params: 0
--------------------------------------------------------------------------------



Decoder(
  (emb): Embedding(3000, 128), 384,000 params
  (att): Attention(
    (W1): Linear(in_features=768, out_featur

In [15]:
# Display the module tree of the toy decoder built above.
decoder

Decoder(
  (emb): Embedding(3000, 128)
  (att): Attention(
    (W1): Linear(in_features=768, out_features=256, bias=True)
    (W2): Linear(in_features=256, out_features=256, bias=True)
    (V): Linear(in_features=256, out_features=1, bias=True)
  )
  (rnn): GRU(384, 256, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Sequential(
    (0): Linear(in_features=256, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=1024, bias=True)
    (2): Linear(in_features=1024, out_features=3000, bias=True)
  )
)

## Seq2Seq

这个负责构建整个模型架构。

In [16]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    ``encoder(input_)`` must return ``(outputs, hidden)`` with ``hidden``
    shaped ``[num_layers * 2, batch, hid]`` (bidirectional). ``decoder`` must
    expose ``cn_vocab_size`` and map ``(x_dec [batch, 1], hidden,
    encoder_outputs)`` to ``(logits [batch, vocab], hidden)``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _decode(self, input_, target, teacher_force_rate):
        """Shared decoding loop behind ``forward`` and ``inference``."""
        encoder_outputs, encoder_hidden = self.encoder(input_)
        # encoder_outputs = [batch size, sequence len, hid dim * directions]
        # encoder_hidden =  [num_layers * directions, batch size, hid dim]
        n_states, batch, hid = encoder_hidden.size()
        # Separate (layer, direction), then join both directions of each
        # layer on the feature axis -> [num_layers, batch, hid * 2].
        hidden = encoder_hidden.view(n_states // 2, 2, batch, hid)
        hidden = torch.cat([hidden[:, 0], hidden[:, 1]], dim=2)

        steps = target.size(1)
        # Sized from the *target* length (source and target may differ) and
        # allocated on the input's device rather than a notebook global.
        outputs = torch.zeros(batch, steps, self.decoder.cn_vocab_size, device=input_.device)
        preds = []
        x_dec = target[:, 0]  # <BOS> column
        for step in range(1, steps):
            # x_dec is always 1-D here; the decoder expects [batch, 1]
            out, hidden = self.decoder(x_dec.unsqueeze(1), hidden, encoder_outputs)
            # out = [batch, cn_vocab_size]
            outputs[:, step, :] = out
            pred = out.argmax(1)  # [batch]
            preds.append(pred.unsqueeze(1))
            # random() lies in [0, 1): '<' gives exactly `teacher_force_rate`
            # probability, i.e. always at 1 and never at 0.
            teacher_force = random.random() < teacher_force_rate
            x_dec = target[:, step] if teacher_force else pred

        if preds:
            preds = torch.cat(preds, dim=1)
        else:
            # target holds only <BOS>: nothing to predict
            preds = target.new_zeros((batch, 0))
        return outputs, preds

    def forward(self, input_, target, teacher_force_rate):
        '''
        input_ = [batch, src_len], source (en) token ids
        target = [batch, tgt_len], reference (cn) token ids
        teacher_force_rate = probability of feeding the reference token,
            rather than the model's own prediction, at each step

        Returns:
            outputs = [batch, tgt_len, cn_vocab_size]; position 0 stays zero
            preds   = [batch, tgt_len - 1], argmax token at each step
        '''
        return self._decode(input_, target, teacher_force_rate)

    def inference(self, input_, target):
        '''
        Greedy decoding for evaluation.

        input_ = [batch, src_len], source (en) token ids
        target = [batch, tgt_len], used only for its <BOS> column and to fix
            the number of decoding steps

        The reference tokens are never fed back into the decoder (teacher
        forcing here would leak the answer into the predictions that BLEU is
        computed on). Returns the same pair as ``forward``.
        '''
        return self._decode(input_, target, 0)
    

In [17]:
# TEST
cn_vocab_size, en_vocab_size = train_set.cn_vocab_size,train_set.en_vocab_size
emb_dim, num_layer =  128, 3
hidden_dim, dropout = 256, 0.5
batch_size = 64
decoder = Decoder(cn_vocab_size, emb_dim, hidden_dim, num_layer, dropout, True)
encoder = Encoder(en_vocab_size, emb_dim, int(hidden_dim/2), num_layer, dropout)
seq2seq = Seq2Seq(encoder, decoder)
dataloader = data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
for input_, target in dataloader:
    break
input_ = torch.tensor(input_, dtype = torch.long)
target = torch.tensor(target, dtype = torch.long)
pred_probs,pred_labels =seq2seq(input_, target,1)

  if sys.path[0] == '':
  del sys.path[0]


# Utils

## Model

In [18]:
def save_model(model, store_model_path, step):
    """Write ``model.state_dict()`` to ``{store_model_path}/model_{step}.ckpt``.

    The target directory is created when missing, so a fresh checkout
    without a checkpoint folder does not make the first save fail.

    Args:
        model: module whose parameters are saved.
        store_model_path: directory for checkpoints.
        step: training step used in the file name.
    """
    os.makedirs(store_model_path, exist_ok=True)
    torch.save(model.state_dict(), f'{store_model_path}/model_{step}.ckpt')
    return

def load_model(model, load_model_path):
    """Load parameters from ``{load_model_path}.ckpt`` into ``model`` in place.

    The checkpoint is first mapped to CPU so that a file saved from a GPU run
    can be opened on a machine without CUDA; ``load_state_dict`` then copies
    the values into the model's existing parameters wherever they live.

    Args:
        model: module that receives the parameters.
        load_model_path: checkpoint path without the ``.ckpt`` suffix.

    Returns:
        The same ``model`` object.
    """
    print(f'Load model from {load_model_path}')
    state_dict = torch.load(f'{load_model_path}.ckpt', map_location='cpu')
    model.load_state_dict(state_dict)
    return model

def build_model(config, en_vocab_size, cn_vocab_size):
    """Assemble the Seq2Seq model and its Adam optimizer from ``config``.

    The decoder is twice as wide as ``config.hid_dim`` because it receives
    the concatenated states of the bidirectional encoder. When
    ``config.load_model`` is set, weights are restored from
    ``config.load_model_path``.

    Returns:
        ``(model, optimizer)`` with the model placed on ``device``.
    """
    # Build the model
    encoder = Encoder(
        en_vocab_size,
        config.emb_dim,
        config.hid_dim,
        config.n_layers,
        config.dropout,
    ).to(device)
    decoder = Decoder(
        cn_vocab_size,
        config.emb_dim,
        config.hid_dim * 2,
        config.n_layers,
        config.dropout,
        config.attention,
    ).to(device)
    model = Seq2Seq(encoder, decoder)
    print(model)

    # Build the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    print(optimizer)

    if config.load_model:
        model = load_model(model, config.load_model_path)

    return model.to(device), optimizer


## Other

In [19]:
def tokens2sentence(outputs, int2word):
    """Convert sequences of token ids into lists of words.

    Args:
        outputs: iterable of id sequences (lists or tensors).
        int2word: mapping from the *string* form of an id to its word.

    Returns:
        One word list per sequence, cut just before the first ``<EOS>``.
        Ids after ``<EOS>`` are never looked up.
    """
    def decode(tokens):
        words = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                return words
            words.append(word)
        return words

    return [decode(tokens) for tokens in outputs]

def infinite_iter(data_loader):
    """Yield items from ``data_loader`` forever, restarting it when exhausted.

    Each pass calls ``iter(data_loader)`` again. An empty loader therefore
    never yields and loops forever.
    """
    while True:
        for batch in data_loader:
            yield batch

## BLEU score

In [20]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction

def computebleu(sentences, targets):
    """Sum the unigram BLEU score of every (hypothesis, reference) pair.

    Args:
        sentences: list of hypothesis token lists.
        targets: list of reference token lists, same length as ``sentences``.

    Returns:
        The sum (not the mean) of the per-sentence scores; ``0`` for empty input.
    """
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        """Split multi-byte (e.g. Chinese) tokens into single characters."""
        pieces = []
        for token in sentence:
            keep_whole = (
                token == '<UNK>'
                or token.isdigit()
                or len(bytes(token[0], encoding='utf-8')) == 1
            )
            if keep_whole:
                pieces.append(token)
            else:
                pieces.extend(token)
        return pieces

    total = 0
    for hypothesis, reference in zip(sentences, targets):
        hypothesis = cut_token(hypothesis)
        reference = cut_token(reference)
        # `weights` are the n-gram weights (unigram only here). The reference
        # is wrapped in a list because sentence_bleu accepts several
        # references per hypothesis.
        total += sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0))
    return total


## PLOT

In [None]:
def plot(train_losses, val_losses, bleu_scores):
    """Show three line charts: training loss, validation loss and BLEU score.

    Args:
        train_losses: sequence of averaged training losses.
        val_losses: sequence of validation losses.
        bleu_scores: sequence of validation BLEU scores.
    """
    # `plt` was referenced without ever being imported in this notebook;
    # import it here so the function works on a fresh kernel.
    import matplotlib.pyplot as plt

    panels = (
        (train_losses, 'loss', 'train loss'),
        (val_losses, 'loss', 'validation loss'),
        (bleu_scores, 'BLEU score', 'BLEU score'),
    )
    for series, ylabel, title in panels:
        plt.figure()
        plt.plot(series)
        plt.xlabel('次數')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.show()

## Schedule Sampling

In [21]:
def schedule_sampling():
    """Return the teacher-forcing rate handed to the model while training.

    Currently a constant: ``1`` means the reference token is always fed to
    the decoder.
    """
    teacher_force_rate = 1
    return teacher_force_rate

# Train

## train epoch

In [22]:
def train(model, optimizer, train_iter, loss_function, total_steps, summary_steps, train_dataset):
    """Run ``summary_steps`` optimisation steps and report averaged losses.

    Args:
        model: Seq2Seq model returning ``(outputs, preds)``.
        optimizer: optimizer bound to ``model``'s parameters.
        train_iter: iterator yielding ``(sources, targets)`` batches.
        loss_function: criterion applied to flattened logits and labels.
        total_steps: steps already completed; only used in the progress line.
        summary_steps: number of steps to run in this call.
        train_dataset: not used inside this function.

    Returns:
        ``(model, optimizer, losses)`` where ``losses`` holds one mean loss
        per completed window of five steps.
    """
    report_every = 5
    model.train()
    model.zero_grad()
    window_loss = 0.0
    averaged_losses = []
    for done in range(1, summary_steps + 1):
        batch_src, batch_tgt = next(train_iter)
        batch_src = batch_src.to(device, torch.long)
        batch_tgt = batch_tgt.to(device, torch.long)
        logits, _ = model(batch_src, batch_tgt, schedule_sampling())

        # Position 0 is <BOS> and is never predicted, so drop it. The
        # criterion takes 2-D logits, so batch and time are flattened.
        vocab_size = logits.size(2)
        flat_logits = logits[:, 1:].reshape(-1, vocab_size)
        flat_labels = batch_tgt[:, 1:].reshape(-1)
        loss = loss_function(flat_logits, flat_labels)

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm: recurrent nets can hit very steep regions
        # where the gradient suddenly explodes.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        # Report the mean loss (and exp(loss) as perplexity) every five steps.
        window_loss += loss.item()
        if done % report_every == 0:
            window_loss = window_loss / report_every
            print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}      ".format(total_steps + done, window_loss, np.exp(window_loss)), end=" ")
            averaged_losses.append(window_loss)
            window_loss = 0.0

    return model, optimizer, averaged_losses

## Test

In [35]:
import time

In [41]:
def test(model, dataloader, loss_function):
    """Evaluate ``model`` on ``dataloader``.

    Args:
        model: Seq2Seq model exposing ``inference(sources, targets)``.
        dataloader: loader whose dataset provides ``idx2word_en`` and
            ``idx2word_cn``.
        loss_function: criterion applied to flattened logits and labels.

    Returns:
        ``(mean loss per batch, mean BLEU per sentence, result)`` where
        ``result`` is a list of ``(source, prediction, target)`` word lists.
        Also prints the elapsed wall-clock seconds.
    """
    model.eval()
    loss_sum, bleu_score= 0.0, 0.0
    n = 0
    result = []
    time_start = time.time()
    # Evaluation never back-propagates: disable autograd so no graph is
    # recorded for every decoding step.
    with torch.no_grad():
        for sources, targets in dataloader:
            sources, targets = sources.to(device, torch.long), targets.to(device, torch.long)
            batch_size = sources.size(0)
            outputs, preds = model.inference(sources, targets)

            # drop the <BOS> position and flatten for the criterion
            outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
            targets = targets[:, 1:].reshape(-1)

            loss = loss_function(outputs, targets)
            loss_sum += loss.item()

            # Turn the predictions back into words
            targets = targets.view(sources.size(0), -1) # restore [batch, len - 1]
            preds = tokens2sentence(preds, dataloader.dataset.idx2word_cn)
            sources = tokens2sentence(sources, dataloader.dataset.idx2word_en)
            targets = tokens2sentence(targets, dataloader.dataset.idx2word_cn)
            for source, pred, target in zip(sources, preds, targets):
                result.append((source, pred, target))
            # Accumulate the BLEU score
            bleu_score += computebleu(preds, targets)
            n += batch_size
    time_end = time.time()
    print(time_end - time_start)
    return loss_sum / len(dataloader), bleu_score / n, result


In [42]:
# Ad-hoc check: evaluate a freshly initialised (untrained) model on the
# validation split.
# NOTE(review): `config` is only created in the "Training" cell further down,
# so this cell fails on a fresh kernel unless that cell has been run first.
train_dataset = SeqDataset(config.data_path, 'training', config.max_output_len)
model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
# ignore_index=0 excludes label 0 from the loss (presumably <PAD> - confirm
# against word2int_cn.json).
loss_function = nn.CrossEntropyLoss(ignore_index=0)
val_dataset = SeqDataset(config.data_path, 'validation', config.max_output_len)
val_loader = data.DataLoader(val_dataset, batch_size=1)
test(model, val_loader, loss_function)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3922, 128)
    (rnn): GRU(128, 256, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(3805, 128)
    (rnn): GRU(128, 512, num_layers=3, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Sequential(
      (0): Linear(in_features=512, out_features=1024, bias=True)
      (1): Linear(in_features=1024, out_features=2048, bias=True)
      (2): Linear(in_features=2048, out_features=3805, bias=True)
    )
  )
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0
)
68.9969687461853


(8.245542758941651,
 0.0,
 [(['<BOS>',
    'she',
    're@@',
    'sted',
    'her',
    'head',
    'on',
    'her',
    'mother',
    "'s",
    'shoulder',
    '.'],
   ['賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭'],
   ['她', '<UNK>', '<UNK>', '在', '她', '母親', '的', '肩上', '。']),
  (['<BOS>', 'he', 'is', 'tall', '.'],
   ['賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
    '賭',
  

## Train Process

In [25]:
# notes: 大量参数的参数技巧，简化api，config存放可变参数，尽量不用全局变量，但config不接触底层
def train_process(config):
    """Train for ``config.num_steps`` steps, validating every ``summary_steps``.

    A checkpoint and the validation translations are written to
    ``config.store_model_path`` whenever the step count is a multiple of
    ``config.store_steps`` or training has finished.

    Returns:
        ``(train_losses, val_losses, bleu_scores)`` as lists.
    """
    # Training data
    train_dataset = SeqDataset(config.data_path, 'training', config.max_output_len)
    train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    train_iter = infinite_iter(train_loader)
    # Validation data, evaluated one sentence at a time
    val_dataset = SeqDataset(config.data_path, 'validation', config.max_output_len)
    val_loader = data.DataLoader(val_dataset, batch_size=1)
    # Model
    model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
    loss_function = nn.CrossEntropyLoss(ignore_index=0)

    train_losses, val_losses, bleu_scores = [], [], []
    total_steps = 0
    # Step-based schedule: there are no epochs, the infinite iterator just
    # keeps serving batches.
    while (total_steps < config.num_steps):
        # Train
        model, optimizer, loss = train(model, optimizer, train_iter, loss_function, total_steps, config.summary_steps, train_dataset)
        train_losses += loss
        # Validate
        val_loss, bleu_score, result = test(model, val_loader, loss_function)
        val_losses.append(val_loss)
        bleu_scores.append(bleu_score)

        total_steps += config.summary_steps
        print ("\r", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, blue score: {:.3f}       ".format(total_steps, val_loss, np.exp(val_loss), bleu_score))

        # Save the model and the translations. BLEU is not a precise enough
        # signal for automatic early stopping, so checkpoints are stored
        # periodically and the best one is picked by hand.
        if total_steps % config.store_steps == 0 or total_steps >= config.num_steps:
            save_model(model, config.store_model_path, total_steps)
            # Make sure the directory exists, and write UTF-8 explicitly: the
            # results contain Chinese text and the platform default encoding
            # may not be able to represent it.
            os.makedirs(config.store_model_path, exist_ok=True)
            with open(f'{config.store_model_path}/output_{total_steps}.txt', 'w', encoding='utf-8') as f:
                for line in result:
                    print (line, file=f)

    return train_losses, val_losses, bleu_scores

In [26]:
# #     train_dataset = SeqDataset(config.data_path, 'training', config.max_output_len)
#     train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
#     train_iter = infinite_iter(train_loader)
#     # 準備檢驗資料
#     # valid过程无法批量操作
#     val_dataset = SeqDataset(config.data_path, 'validation', config.max_output_len)
#     val_loader = data.DataLoader(val_dataset, batch_size=1)

## Test Process

In [27]:
def test_process(config):
    """Evaluate on the 'testing' split and dump translations to a file.

    The model is built from ``config``; set ``config.load_model`` and
    ``config.load_model_path`` to evaluate trained weights instead of a
    random initialisation.

    Returns:
        ``(test_loss, bleu_score)``.
    """
    # Test data
    test_dataset = SeqDataset(config.data_path, 'testing', config.max_output_len)
    # Evaluated one sentence at a time, which is slow
    test_loader = data.DataLoader(test_dataset, batch_size=1)
    # Model
    model, optimizer = build_model(config, test_dataset.en_vocab_size, test_dataset.cn_vocab_size)
    print ("Finish build model")
    loss_function = nn.CrossEntropyLoss(ignore_index=0)
    model.eval()
    # Run the evaluation
    test_loss, bleu_score, result = test(model, test_loader, loss_function)
    # Save the results; UTF-8 is explicit because the translations are
    # Chinese and the platform default encoding may not represent them.
    with open('./test_output.txt', 'w', encoding='utf-8') as f:
        for line in result:
            print (line, file=f)

    return test_loss, bleu_score


# Main

## Config

In [64]:
class configurations:
    """Hyper-parameters and paths for the default training run."""

    def __init__(self):
        # --- model / optimisation ---
        self.batch_size = 60
        self.emb_dim = 128
        self.hid_dim = 128
        self.n_layers = 3
        self.dropout = 0.5
        self.learning_rate = 5e-5

        # --- schedule ---
        self.max_output_len = 50    # fixed length of every encoded sentence
        self.num_steps = 3000       # total number of training steps
        self.store_steps = 100      # save a checkpoint every this many steps
        self.summary_steps = 100    # validate every this many steps

        # --- checkpoint loading ---
        self.load_model = False     # whether to restore weights before training
        self.load_model_path = None # checkpoint stem, e.g. "./ckpt/model_{step}"

        # --- architecture switch ---
        self.attention = True       # whether the decoder uses attention

        # --- locations, all relative to base_path ---
        root = './'
        self.base_path = root
        self.store_model_path = os.path.join(root, 'ckpt')    # where checkpoints are written
        self.data_path = os.path.join(root, 'cmn-eng')        # where the corpus lives

## Training

In [65]:
# Entry point: train with the default configuration, then plot the curves.
if __name__ == '__main__':
    config = configurations()
    print ('config:\n', vars(config))
    train_losses, val_dataset_losses, bleu_scores = train_process(config)
    # Fix: the first argument was misspelled `train_lossesn` (NameError).
    plot(train_losses, val_dataset_losses, bleu_scores)

config:
 {'batch_size': 60, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 3, 'dropout': 0.5, 'learning_rate': 5e-05, 'max_output_len': 50, 'num_steps': 3000, 'store_steps': 100, 'summary_steps': 100, 'load_model': False, 'load_model_path': None, 'attention': True, 'base_path': './', 'store_model_path': './ckpt', 'data_path': './cmn-eng'}


In [63]:
# summary
train_dataset = SeqDataset(config.data_path, 'training', config.max_output_len)
train_loader = data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
train_iter = infinite_iter(train_loader)
# 準備檢驗資料
# valid过程无法批量操作
val_dataset = SeqDataset(config.data_path, 'validation', config.max_output_len)
val_loader = data.DataLoader(val_dataset, batch_size=1)
# 建構模型
model, optimizer = build_model(config, train_dataset.en_vocab_size, train_dataset.cn_vocab_size)
input_,target = next(train_iter)
print(summary(model, input_.to(device, torch.long), target.to(device, torch.long), 1,show_hierarchical=True))

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(3922, 256)
    (rnn): GRU(256, 512, num_layers=3, batch_first=True, dropout=0.5, bidirectional=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (emb): Embedding(3805, 256)
    (att): Attention(
      (W1): Linear(in_features=3072, out_features=1024, bias=True)
      (W2): Linear(in_features=1024, out_features=1024, bias=True)
      (V): Linear(in_features=1024, out_features=1, bias=True)
    )
    (rnn): GRU(1280, 1024, num_layers=3, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (fc): Sequential(
      (0): Linear(in_features=1024, out_features=2048, bias=True)
      (1): Linear(in_features=2048, out_features=4096, bias=True)
      (2): Linear(in_features=4096, out_features=3805, bias=True)
    )
  )
)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 5e-05
    weight_decay: 0
)
---------------------------------------------

# 实验

## 无Teacher Force

In [None]:
class configurations:
    """Hyper-parameters for the larger experiment run (redefines the class above)."""

    def __init__(self):
        # --- model / optimisation ---
        self.batch_size = 60
        self.emb_dim = 256
        self.hid_dim = 512
        self.n_layers = 3
        self.dropout = 0.5
        self.learning_rate = 5e-5

        # --- schedule ---
        self.max_output_len = 50      # fixed length of every encoded sentence
        self.num_steps = 12000        # total number of training steps
        self.store_steps = 300        # save a checkpoint every this many steps
        self.summary_steps = 300      # validate every this many steps

        # --- checkpoints and data ---
        self.load_model = False       # whether to restore weights before training
        self.store_model_path = "./ckpt"   # where checkpoints are written
        self.load_model_path = None   # checkpoint stem, e.g. "./ckpt/model_{step}"
        self.data_path = "./cmn-eng"  # where the corpus lives

        # --- architecture switch ---
        self.attention = True         # whether the decoder uses attention
        
# Entry point for the experiment: train with the configuration above, then plot.
if __name__ == '__main__':
    config = configurations()
    print ('config:\n', vars(config))
    train_losses, val_dataset_losses, bleu_scores = train_process(config)
    # Fix: the first argument was misspelled `train_lossesn` (NameError).
    plot(train_losses, val_dataset_losses, bleu_scores)