Each team will build the following sequence of neural translation systems for two language pairs, Vietnamese (Vi)→English (En) and Chinese (Zh)→En (prepared corpora will be provided):

Recurrent neural network based encoder-decoder without attention
Recurrent neural network based encoder-decoder with attention
Replace the recurrent encoder with either convolutional or self-attention based encoder.
[Optional] Build either or both fully self-attention translation system or/and multilingual translation system.

You are expected to implement these on your own (if necessary), experiment them with both language pairs, report their performance (measured in terms of automatic evaluation metrics) and analyze their behaviours and properties.  


In [32]:
from __future__ import unicode_literals, print_function, division
import pickle as pkl
from io import open
import unicodedata
import string
import re
import random
import torch
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch.nn as nn

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

## Data Pre-processing

In [3]:
SOS_token = 0
EOS_token = 1
PAD_IDX = 2
UNK_IDX = 3
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS", 2:"UNK", 3:"PAD"}
        self.n_words = 4  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
import html
def normalizeString(s):
    s = re.sub(r"([.!?])", r" \1", s)
    s = html.unescape(s)
#     s = re.sub(r"([.!?])", r" \1", s)
#     s = re.sub(r"&apos;m", r"am", s)
#     s = re.sub(r"&apos;s", r"is", s)
#     s = re.sub(r"&apos;re", r"are", s)
#     s = re.sub(r"&apos;", r"", s)
    return s

In [5]:
def loadingLangs(sourcelang, targetlang, setname):
    input_ls = []
    output_ls = []
    print('Reading lines...')
    # Read the file 
    with open('data/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,sourcelang)) as f:
        for line in f.readlines():
            input_ls.append([normalizeString(word) for word in line.split()])
    with open('data/iwslt-%s-%s/%s.tok.%s'%(sourcelang, targetlang, setname,targetlang)) as f:
        for line in f.readlines():
            output_ls.append([normalizeString(word) for word in line.split()])
    pairs = list(zip(input_ls, output_ls))
    print('Read %s sentence pairs'%(len(input_ls)))
    input_lang = Lang(sourcelang)
    output_lang = Lang(targetlang)
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [7]:
source_tra, target_tra, pairs_tra = loadingLangs('zh', 'en', 'train')
source_val, target_val, pairs_val = loadingLangs('zh', 'en', 'dev')
source_tes, target_tes, pairs_tes = loadingLangs('zh', 'en', 'test')

Reading lines...
Read 213377 sentence pairs
Counting words...
Counted words:
zh 88918
en 69126
Reading lines...
Read 1261 sentence pairs
Counting words...
Counted words:
zh 6133
en 4018
Reading lines...
Read 1397 sentence pairs
Counting words...
Counted words:
zh 5215
en 3523


## Dataset 

In [8]:
# Hu
print("95% of chinese sentences length = {0}".format(np.percentile([len(x[0]) for x in pairs_tra], 95)))
print("95% of english sentences length = {0}".format(np.percentile([len(x[1]) for x in pairs_tra], 95)))
print(random.choice(pairs_tra))

95% of chinese sentences length = 44.0
95% of english sentences length = 48.0
(['你', '乘坐', '俄国', '的', '飞行', '飞行器', '飞行', '这', '是', '可以', '的', '因为', '苏联', '的', '太空', '计划', '缺乏', '资金', '一个', '座位', '可以', '得到', '两千', '两千万', '千万', '万美金', '美金', '对', '他们', '来说', '很', '不错'], ['You', 'can', 'fly', 'with', 'Russian', 'hardware', ' .', 'This', 'is', 'available', 'because', 'a', 'Russian', 'space', 'program', 'is', 'starving', ',', 'and', 'it', "'s", 'nice', 'for', 'them', 'to', 'get', '20', 'million', 'here', 'and', 'there', 'to', 'take', 'one', 'of', 'the', 'seats', ' .'])


In [9]:
MAX_SENT_LEN = 50

In [10]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] if word in lang.word2index else UNK_IDX for word in sentence]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair,source,target):
    input_lang = source
    output_lang = target
    input_tensor = tensorFromSentence(input_lang, pair[0]).reshape((-1))
    target_tensor = tensorFromSentence(output_lang, pair[1]).reshape((-1))
    return (input_tensor, input_tensor.shape[0], target_tensor, target_tensor.shape[0])

In [11]:
class NMTDataset(Dataset):
    def __init__(self, source, target, pairs):
        self.source = source
        self.target = target
        self.pairs = pairs
        
    def __len__(self):
        return len(self.pairs) #Hu
    
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        inp_ten, inp_len, tar_ten, tar_len = tensorsFromPair(self.pairs[key], self.source, self.target)
        item = {}
        item['inputtensor'] = inp_ten[:MAX_SENT_LEN]
        item['inputlen'] = min(inp_len, MAX_SENT_LEN)
        item['targettensor'] = tar_ten[:MAX_SENT_LEN]
        item['targetlen'] = min(tar_len, MAX_SENT_LEN)
        return item

In [14]:
train_data = NMTDataset(source_tra, target_tra, pairs_tra)
val_data = NMTDataset(source_val, target_val, pairs_val)
test_data = NMTDataset(source_tes, target_tes, pairs_tes)

In [15]:
train_data.__len__()

213377

## Dataloader

In [16]:
#collate function

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    src_data, tar_data, src_len, tar_len = [], [], [], []
    for datum in batch:        
        src_datum = np.pad(np.array(datum['inputtensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['inputlen'])),
                                mode="constant", constant_values=PAD_IDX)
        tar_datum = np.pad(np.array(datum['targettensor']),
                                pad_width=((0,MAX_SENT_LEN-datum['targetlen'])),
                                mode="constant", constant_values=PAD_IDX)
        src_data.append(src_datum)
        tar_data.append(tar_datum)
        src_len.append(datum['inputlen'])
        tar_len.append(datum['targetlen'])
        
        ### Hu Modified
    ind_dec_order = np.argsort(src_len)[::-1]
    src_data = np.array(src_data)[ind_dec_order]
    src_len = np.array(src_len)[ind_dec_order]
    tar_data = np.array(tar_data)[ind_dec_order]
    return [torch.from_numpy(np.array(src_data)),torch.from_numpy(np.array(tar_data)),
               torch.LongTensor(np.array(src_len)),torch.LongTensor(np.array(tar_len))]

In [17]:
BATCH_SIZE = 32
train_loader = torch.utils.data.DataLoader(train_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)
val_loader = torch.utils.data.DataLoader(val_data,
                                           batch_size=BATCH_SIZE,shuffle=False,collate_fn=collate_func)
test_loader = torch.utils.data.DataLoader(test_data,
                                           batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_func)

In [18]:
# sample data loader
for data in train_loader:
    print('input sentence batch: ')
    print('target sentence batch: ')
    print(data[1])
    print('input sentence len: ')
    print(data[2])
    print('target sentence len: ')
    print(data[3])
    break

input sentence batch: 
target sentence batch: 
tensor([[   52,    49,   244,  ...,   169,    21,     6],
        [  298,  1094,   773,  ...,   657,    21,  1949],
        [  328,   115,   674,  ...,     2,     2,     2],
        ...,
        [  158,     6,  1591,  ...,     2,     2,     2],
        [  148, 14298,   215,  ...,     2,     2,     2],
        [   52,   115,  1444,  ...,     2,     2,     2]])
input sentence len: 
tensor([50, 50, 40, 36, 36, 35, 34, 32, 31, 28, 26, 26, 25, 23, 22, 21, 20, 17,
        16, 13, 12, 11, 10, 10, 10,  9,  9,  8,  8,  7,  7,  6])
target sentence len: 
tensor([33, 30,  9, 21, 38,  6, 36,  7,  9, 50, 19, 20, 13, 13,  8, 29, 23, 40,
         6, 13,  9, 50, 26, 11, 16,  9, 11, 12, 33, 45, 39, 36])


----------------------------------------------

In [33]:
class EncoderRNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, 
                 vocab_size, dropout, bidirection = True, word_vecs = None):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.num_layers = num_layers
        if bidirection == True:
            self.directions = 2
        else: 
            self.directions = 1
        if word_vecs == None:
            self.embedding = nn.Embedding(vocab_size, emb_size,padding_idx=PAD_IDX)
        else:
            self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
            self.embedding.weight = nn.Parameter(word_vecs)
            self.embedding.requires_grad = False
        self.gru = nn.GRU(emb_size, hidden_size, num_layers, batch_first=True, 
                          dropout=dropout, bidirectional=bidirection)

    def forward(self, enc_input, lengths):
        # do we need pack padded
        batch_size, seq_len = enc_input.size() #seq_len will be used in cnn
        self.hidden = self.initHidden(batch_size)
        embedded = self.embedding(encoder_input).view(1, 1, -1)
        output = embedded
        output = torch.nn.utils.rnn.pack_padded_sequence(output, lengths, batch_first=True)
        output, hidden = self.gru(output, hidden)
        if self.directions == 2:
            hidden = torch.cat((hidden[0], hidden[1]), dim = 1)
        return output, hidden 

    def initHidden(self):
        return torch.zeros(self.num_layers * self.directions, batch_size,
                            self.hidden_size, self.hidden_size, device=device) 

In [34]:
class DecoderRNN(nn.Module):
    def __init__(self, output_size, emb_size, hidden_size, num_layers, dropout, bidirection = True, word_vecs = None):
        # output_size is the traget vocab size
        super(DecoderRNN, self).__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.num_layers = num_layers
        if bidirection == True:
            self.directions = 2
        else: 
            self.directions = 1
        
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=PAD_IDX)
        
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers, batch_first=True, 
                          dropout=dropout, bidirectional=bidirection)
        
        self.out = nn.Linear(self.directions * hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, dec_input, hidden):
        output = self.embedding(dec_input).view(1, 1, -1)
        for i in range(self.num_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        if self.directions == 2:
            hidden = torch.cat((hidden[0], hidden[1]), dim = 1)
        output = output.squeeze(0)
        output = self.softmax(self.out(output))
        return output, hidden

    def initHidden(self):
        return torch.zeros(self.num_layers * self.directions, 1, self.hidden_size, device=device)

In [37]:
class BahdanauAttnDecoderRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers, num_classes,
                 dropout, bidirection = True, word_vecs = None):
        super(BahdanauAttnDecoderRNN, self).__init__()
        
        # Define parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        # Define layers
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout)
        self.attn = nn.Linear(self.hidden_size*2, hidden_size)
        self.attn_combine = nn.Linear(self.hidden_size*2, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout_p)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        if bidirection == True:
            self.directions = 2
        else: 
            self.directions = 1
    
    def forward(self, dec_input, last_hidden, encoder_outputs):
        # Note that we will only be running forward for a single decoder time step, but will use all encoder outputs
        
        # Get the embedding of the current input word (last output word)
        word_embedded = self.embedding(dec_input).view(1, 1, -1) # S=1 x B x N
        word_embedded = self.dropout(word_embedded)
        
        ##############################
        # TODO: Implement Attention  #
        ##############################
        attn_weights = F.softmax(
            self.attn(torch.cat((word_embedded[0], last_hidden[0]), 1)))
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        
        for i in range(self.num_layers):
            output = F.relu(output)
#             output = torch.nn.utils.rnn.pack_padded_sequence(output, lengths, batch_first=True)
            output, hidden = self.gru(output, hidden)
        if self.directions == 2:
            hidden = torch.cat((hidden[0], hidden[1]), dim = 1)

        # Final output layer
        output = output.squeeze(0) # B x N
        output = F.log_softmax(self.out(output))
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights

In [39]:
class EncoderCNN(nn.Module):
    def __init__(self, emb_size, hidden_size, num_layers, ker_size,
                 vocab_size, dropout, word_vecs = None):
        super(EncoderCNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_size = emb_size
        self.num_layers = num_layers

        if word_vecs == None:
            self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
        else:
            self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=PAD_IDX)
            self.embedding.weight = nn.Parameter(word_vecs)
            self.embedding.requires_grad = False
        self.conv1 = nn.Conv1d(emb_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2)
        self.conv2 = nn.Conv1d(hidden_size, hidden_size, kernel_size=ker_size, padding=(ker_size-1)//2)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, enc_input, lengths):
        # do we need pack padded
        batch_size, seq_len = enc_input.size() #seq_len will be used in cnn
        embedded = self.embedding(encoder_input).view(1, 1, -1)
        hidden = self.conv1(embedded.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden.contiguous().view(-1, hidden_p.size(-1))).view(batch_size, seq_len, hidden.size(-1))

        hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
        hidden = F.relu(hidden_p.contiguous().view(-1, hidden.size(-1))).view(batch_size, seq_len, hidden.size(-1))
        hidden = torch.nn.functional.max_pool1d(hidden.transpose(1,2), enc_input.size()[-1]).transpose(1,2)
        hidden = self.linear1(hidden)
        hidden = F.relu(self.dropout(hidden))
        hidden = self.linear2(hidden)
        return hidden, hidden # to use the same training model, need two outputs


----------------------------------------------

Draft:  
Beam search reference: https://github.com/IBM/pytorch-seq2seq/blob/master/seq2seq/models/TopKDecoder.py  
BLEU score: https://github.com/MaximumEntropy/Seq2Seq-PyTorch/blob/master/evaluate.py  

In [41]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_SENT_LEN):
#     encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [42]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
def test_model(loader, model):
    """
    Help function that tests the model's performance on a dataset
    @param: loader - data loader for the dataset to test against
    """
    
    correct = 0
    total = 0
    loss = 0
    model.eval()
    for i, (p, h, lengths_p, lengths_h, labels) in enumerate(loader):
        p_batch, h_batch, length_batch_p, length_batch_h, label_batch = p, h, lengths_p, lengths_h, labels
        outputs = F.softmax(model(p_batch, h_batch, length_batch_p, length_batch_h), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        total += labels.size(0)
        loss += F.cross_entropy(outputs, labels).detach().numpy()
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return np.round((100 * correct / total), 4), np.round(loss / i, 4)

In [None]:
def plot_loss_acc(validation_acc_history,
                validation_loss_history,
                train_acc_history,
                train_loss_history):
    
    batches = np.arange(0, len(validation_acc_history))
    f, axs = plt.subplots(1, 2, figsize=(15,5))
    ax1 = axs[0]
    ax1.plot(batches, train_loss_history, label='Training loss')
    ax1.set_xlabel("number of batches")
    ax1.plot(batches, validation_loss_history, alpha=0.7, label='Validation Loss')
    ax1.legend(loc='upper right')

    ax2 = axs[1]
    ax2.plot(batches, train_acc_history, label='Training Accuracy')
    ax2.set_xlabel("number of batches")
    ax2.plot(batches, validation_acc_history, alpha=0.7, label='Validation Accuracy')
    ax2.legend(loc='upper right')
    plt.show()