In [None]:
import torch
from torch.utils.data import Dataset

class MTDataset(Dataset):
    """
    Pair up parallel sentences so that a torch DataLoader can batch them.

    Inputs:
    - input_matrix: indexable sequence with one row of values per input sentence
    - target_matrix: indexable sequence with the matching row per target sentence

    Return (per item):
    - tuple (input tensor, target tensor), each built with torch.Tensor
    """
    def __init__(self, input_matrix, target_matrix):
        # Row i of the input is paired with row i of the target; the number of
        # pairs is dictated by the input side.
        self.data = [
            (input_matrix[row], target_matrix[row])
            for row in range(len(input_matrix))
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source, target = self.data[idx]
        return (torch.Tensor(source), torch.Tensor(target))

In [None]:

import torch.nn as nn
import torch

class Encoder(nn.Module):
    """
    Bi-directional GRU encoder for the input sentences.

    Arguments:
    - vocab_size, embedding_dim, hidden_size: integers
    - modified: when False the decoder's initial state is the backward GRU state
      taken at the first time step; when True the backward and forward states are
      concatenated and projected back to hidden_size by a linear layer

    Inputs:
    - x: batch of input sentences as indices, size (batch, Tx)

    Returns:
    - out: per-timestep GRU output (forward and backward features concatenated)
    - enc_hidden: tensor with a leading dimension of 1, used to initialise the Decoder
    """
    def __init__(self, vocab_size, embedding_dim, hidden_size, modified=False):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.modified = modified
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        if modified:
            # Projects the concatenated [backward, forward] state to hidden_size.
            self.fc_hidden = nn.Linear(2 * hidden_size, hidden_size)

    def forward(self, x):
        out, hidden = self.gru(self.embedding(x))
        # Second half of the feature axis at time step 0 = backward direction.
        backward_state = out[:, 0, self.hidden_size:].unsqueeze(0)
        if not self.modified:
            return out, backward_state
        # hidden[0] is the forward direction's final state.
        forward_state = hidden[0].unsqueeze(0)
        merged = torch.cat((backward_state, forward_state), dim=-1)
        return out, self.fc_hidden(merged)
    
class Decoder(nn.Module):
    """
    Single-timestep GRU decoder with additive attention over the encoder output.

    Arguments:
    - hidden_size, vocab_size, embedding_dim: integers

    Inputs:
    - dec_input: indices of the current input tokens, one per batch element
    - hidden: hidden state produced by the previous timestep
    - enc_out: output from Encoder (last dimension is hidden_size * 2)

    Returns:
    - out: scores over the target vocabulary for this timestep
    - hidden: hidden state to pass to the next timestep
    """
    def __init__(self, hidden_size, vocab_size, embedding_dim):
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        # Alignment model: score = Va(tanh(Wa(hidden) + Ua(enc_out)))
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(2 * hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)
        self.softmax = nn.Softmax(dim=1)

        # Recurrent step: consumes [token embedding, attention context]
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim + 2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, dec_input, hidden, enc_out):
        seq_len = enc_out.shape[1]
        # Put batch first, then copy the state once per encoder position.
        query = hidden.permute(1, 0, 2).repeat(1, seq_len, 1)
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(enc_out)))
        # Normalise over the encoder positions (dim=1).
        weights = self.softmax(scores)
        context = torch.sum(weights * enc_out, dim=1).unsqueeze(1)
        embedded = self.embedding(dec_input.unsqueeze(1))
        step_input = torch.cat((embedded, context), dim=-1)
        out, hidden = self.gru(step_input, hidden.contiguous())
        return self.out(out), hidden

In [None]:
import string
import numpy as np
import random

# Every ASCII punctuation character followed by every decimal digit, one per list entry.
exclude = list(string.punctuation + string.digits)

class Language(object):
    """
    Hold the preprocessed sentences, vocabulary and index matrix of one language.

    Inputs:
    - sentence_list: a list containing all sentences (string format)
    - train: True if used for training phase, otherwise False
    - word2id, id2word: vocabulary built from an existing training set. Required
      for val/test sets (train = False); ignored (rebuilt) if train = True.

    Attributes set on the instance:
    - max_len: length (in tokens, <START>/<END> included) of the longest sentence
    - sentences: list of all sentences after adding <START>/<END>/<PAD>
    - word2id, id2word
    - vocab_size: number of distinct tokens in id2word
    - wordvec: numpy array with one row of token ids per sentence

    Raises:
    - ValueError: if train is False and word2id or id2word is missing
    """
    def __init__(self, sentence_list, train=True, word2id=None, id2word=None):
        # Fail early with a clear message instead of a TypeError from len(None)
        # deep inside get_vocab()/get_word_vectors().
        if not train and (word2id is None or id2word is None):
            raise ValueError('word2id and id2word are required when train=False')
        self.word2id = word2id
        self.id2word = id2word
        self.train = train
        self.preprocess(sentence_list)
        self.get_vocab()
        self.get_word_vectors()

    def preprocess(self, sentence_list):
        """
        Wrap every sentence in <START> ... <END>, record the longest length,
        then pad all sentences to that length with <PAD> tokens.
        """
        self.max_len = 0
        self.sentences = []
        for sen in sentence_list:
            sen = '<START> ' + sen + ' <END>'
            self.sentences.append(sen)
            self.max_len = max(self.max_len, len(sen.split()))
        self.padding()

    def padding(self):
        """
        Extend all sentences to max_len tokens by appending <PAD> tokens.
        """
        for i, sen in enumerate(self.sentences):
            missing = self.max_len - len(sen.split())
            self.sentences[i] = sen + ' <PAD>' * missing

    def get_vocab(self):
        """
        Build word2id / id2word (training only) and set vocab_size.

        Ids are assigned in order of first appearance in self.sentences.
        """
        if self.train:
            self.word2id = {}
            self.id2word = []
            for sen in self.sentences:
                for word in sen.split():
                    if word not in self.word2id:
                        self.word2id[word] = len(self.id2word)
                        self.id2word.append(word)
        self.vocab_size = len(self.id2word)

    def get_word_vectors(self):
        """
        Convert every sentence to a row of token ids and store them in wordvec.
        """
        rows = []
        for sen in self.sentences:
            ids = []
            for word in sen.split():
                if word in self.word2id:
                    ids.append(self.word2id[word])
                else:
                    # NOTE(review): out-of-vocabulary words get a *random* existing
                    # id (which may be a special token); kept for backward
                    # compatibility, but a dedicated unknown token would be cleaner.
                    ids.append(random.randint(0, self.vocab_size - 1))
            rows.append(ids)
        self.wordvec = np.array(rows)

In [None]:
import string
import random
import numpy as np
import torch

def generate_seed(seed):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy's global generator and torch (CPU and CUDA),
    then sets cuDNN's deterministic flag.

    Input:
    - seed: an integer for seed
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

def preprocess(inp_filename, target_filename, max_len):
    """
    Read two line-aligned text files and return the cleaned sentence pairs.

    Each line has ASCII punctuation and digits removed, is stripped of
    surrounding whitespace and lower-cased. Line i of the input file is paired
    with line i of the target file; pairing stops at the shorter file.

    Inputs:
    - inp_filename: path of the UTF-8 file holding input-language sentences
    - target_filename: path of the UTF-8 file holding target-language sentences
    - max_len: maximum number of whitespace-separated words allowed on either side

    Return:
    - sentences_inp, sentences_trg: two lists of equal length with the kept pairs

    A pair is dropped when either side is longer than max_len, or when either
    side is empty after cleaning (this also removes the spurious empty pair that
    a trailing newline produces).
    """
    # The `with` blocks close the files; no explicit close() is needed.
    with open(inp_filename, 'r', encoding='utf8') as f_inp:
        lines_inp = f_inp.read().split('\n')
    with open(target_filename, 'r', encoding='utf8') as f_trg:
        lines_trg = f_trg.read().split('\n')

    # One translation table instead of a per-character list lookup.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    def clean(line):
        return line.translate(strip_table).strip().lower()

    sentences_inp, sentences_trg = [], []
    for raw_inp, raw_trg in zip(lines_inp, lines_trg):
        sen_inp, sen_trg = clean(raw_inp), clean(raw_trg)
        len_inp = len(sen_inp.split())
        len_trg = len(sen_trg.split())
        if len_inp == 0 or len_trg == 0:
            # Nothing to translate from or to; not a usable pair.
            continue
        if len_inp <= max_len and len_trg <= max_len:
            sentences_inp.append(sen_inp)
            sentences_trg.append(sen_trg)
    return sentences_inp, sentences_trg

In [None]:
import torch
import numpy as np
from nltk.translate.bleu_score import corpus_bleu

def _ids_to_tokens(ids, id2word, stop_at_end):
    """Map token ids to words, skipping <END>/<PAD>; optionally stop at the first <END>."""
    tokens = []
    for idx in ids:
        word = id2word[idx]
        if word == '<END>':
            if stop_at_end:
                break
            continue
        if word != '<PAD>':
            tokens.append(word)
    return tokens


def validate(loader, encoder, decoder, id2word, device='cpu'):
    """
    Evaluate the model on a dataset with greedy decoding and corpus-level BLEU.

    Inputs:
    - loader: iterable of (x, y) batches (e.g. a DataLoader over the validation set)
    - encoder: an Encoder model
    - decoder: a Decoder model
    - id2word: id2word from target training set
    - device: 'cpu' or 'cuda'

    Return:
    - bleu: value returned by corpus_bleu (default weights) for the whole dataset

    Notes:
    - Each hypothesis is cut at the first generated <END>, matching translate();
      tokens produced after <END> are not part of the translation.
    - References keep the original filtering (drop <END>/<PAD>, no truncation).
    - Both models are switched to eval mode for the evaluation and to train mode
      before returning.
    """
    encoder.eval()
    decoder.eval()
    references, hypotheses = [], []
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.long)
            enc_out, enc_hidden = encoder(x)
            dec_hidden = enc_hidden
            dec_input = y[:, 0]

            # Column 0 is the first decoder input, so references start at column 1.
            for ref_ids in y[:, 1:].tolist():
                references.append([_ids_to_tokens(ref_ids, id2word, stop_at_end=False)])

            # Greedy decoding: feed the arg-max token back in at every step.
            steps = []
            for _ in range(1, y.size(1)):
                out, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
                dec_input = torch.max(out, dim=-1)[1].squeeze(1)
                steps.append(dec_input)

            if steps:
                # (batch,) per step -> (batch, steps)
                hyp_rows = torch.stack(steps, dim=1).tolist()
            else:
                # No decoding step possible: keep one (empty) hypothesis per reference.
                hyp_rows = [[] for _ in range(y.size(0))]
            for hyp_ids in hyp_rows:
                hypotheses.append(_ids_to_tokens(hyp_ids, id2word, stop_at_end=True))

    bleu = corpus_bleu(list_of_references=references, hypotheses=hypotheses)
    encoder.train()
    decoder.train()
    return bleu

In [None]:

def train(encoder, decoder, train_loader, val_loader, optimizer, criterion, id2word, lr_scheduler=None, num_epochs=1, print_every=100, device='cpu', early_stop=False):
    """
    Train the encoder/decoder with teacher forcing and keep the best weights.

    Inputs:
    - encoder, decoder
    - train_loader, val_loader: DataLoader for training set and validation set
    - optimizer: a torch.optim optimizer (e.g. torch.optim.Adam(...))
    - criterion: loss function (e.g. nn.CrossEntropyLoss())
    - id2word: id2word for target training set
    - lr_scheduler: learning rate scheduler (e.g. torch.optim.lr_scheduler.StepLR),
      stepped once per epoch; None to disable
    - num_epochs
    - print_every: print the training loss every `print_every` iterations
    - device: 'cpu' or 'cuda'
    - early_stop: if True, stop at the first epoch whose validation BLEU does not
      improve on the best one so far

    Return:
    - dict with keys 'encoder' and 'decoder' holding copies of the state dicts from
      the epoch with the highest validation BLEU (the initial weights if no epoch
      scored above 0)
    """
    # Local import keeps this cell self-contained.
    import copy

    def snapshot():
        # state_dict() returns references to the live parameters, which later
        # optimizer steps overwrite in place; deep-copy to freeze the values.
        return {'encoder': copy.deepcopy(encoder.state_dict()),
                'decoder': copy.deepcopy(decoder.state_dict())}

    encoder.train()
    decoder.train()
    best_bleu = 0
    best_statedict = snapshot()
    for epoch in range(num_epochs):
        print('Epoch ', epoch + 1)
        for i, (x, y) in enumerate(train_loader):
            x = x.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.long)
            optimizer.zero_grad()
            enc_out, enc_hidden = encoder(x)
            dec_hidden = enc_hidden
            dec_input = y[:, 0]
            num_steps = y.size(1) - 1
            loss = 0
            for t in range(1, y.size(1)):
                out, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
                # Teacher forcing: the next input is the ground-truth token.
                dec_input = y[:, t]
                loss += criterion(out.squeeze(1), y[:, t])
            loss.backward()
            optimizer.step()
            if i % print_every == 0:
                # The loss is summed over num_steps decoder steps; report the mean.
                print('Iter %d, loss = %f' % (i, loss.item() / num_steps))
        if lr_scheduler is not None:
            lr_scheduler.step()
        bleu = validate(val_loader, encoder, decoder, id2word, device)
        print('Validation BLEU score: %f\n' % bleu)
        if bleu > best_bleu:
            best_statedict = snapshot()
            best_bleu = bleu
        elif early_stop:
            print('=== BLEU begins to decrease, training exits ===')
            return best_statedict
    return best_statedict

In [None]:
import string
import torch

def translate(sentence, inp_word2id, trg_word2id, trg_id2word, encoder, decoder, trg_max_len, device='cpu'):
    """
    Generate a translation for one input sentence with greedy decoding.

    Inputs:
    - sentence: a sentence in string format
    - inp_word2id: word2id from input training set
    - trg_word2id: word2id from target training set
    - trg_id2word: id2word from target training set
    - encoder, decoder: Encoder, Decoder models (switched to eval mode here)
    - trg_max_len: max length for target sentence; at most trg_max_len - 1 words
      are generated
    - device: 'cpu' or 'cuda'; every tensor created here is placed on it

    Return a sentence (generated words joined by single spaces, without <END>)

    Raises:
    - KeyError: if a word of the cleaned sentence is not in inp_word2id
    """
    # Same cleaning as the training data: drop punctuation/digits, lower-case.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    cleaned = sentence.translate(strip_table).strip().lower()
    tokens = ('<START> ' + cleaned + ' <END>').split()
    token_ids = [inp_word2id[tok] for tok in tokens]
    sen_tensor = torch.tensor([token_ids], dtype=torch.long, device=device)

    encoder.eval()
    decoder.eval()
    with torch.no_grad():
        enc_out, enc_hidden = encoder(sen_tensor)
        dec_hidden = enc_hidden
        # Use the caller's device (previously hard-coded to 'cuda', which broke CPU runs).
        dec_input = torch.tensor([trg_word2id['<START>']], dtype=torch.long, device=device)
        output_list = []
        for _ in range(1, trg_max_len):
            out, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
            dec_input = torch.max(out, dim=-1)[1].squeeze(1)
            next_word = trg_id2word[dec_input.item()]
            if next_word == '<END>':
                break
            output_list.append(next_word)
        return ' '.join(output_list)

In [None]:
# Upper bound passed to preprocess() as max_len for every split.
MAX_LEN = 64

sentences_inp_train, sentences_trg_train = preprocess(
    '/kaggle/input/nlp-dataset/tokenization/train/train.vi',
    '/kaggle/input/nlp-dataset/tokenization/train/train.en',
    max_len=MAX_LEN,
)
sentences_inp_val, sentences_trg_val = preprocess(
    '/kaggle/input/nlp-dataset/tokenization/dev/dev.vi',
    '/kaggle/input/nlp-dataset/tokenization/dev/dev.en',
    max_len=MAX_LEN,
)
sentences_inp_test, sentences_trg_test = preprocess(
    '/kaggle/input/nlp-dataset/tokenization/test/test.vi',
    '/kaggle/input/nlp-dataset/tokenization/test/test.en',
    max_len=MAX_LEN,
)

In [None]:
# Training sets build their own vocabularies.
train_inp = Language(sentence_list=sentences_inp_train)
train_trg = Language(sentence_list=sentences_trg_train)

# Validation and test sets reuse the vocabularies of the training sets.
val_inp = Language(
    sentences_inp_val,
    train=False,
    word2id=train_inp.word2id,
    id2word=train_inp.id2word,
)
val_trg = Language(
    sentences_trg_val,
    train=False,
    word2id=train_trg.word2id,
    id2word=train_trg.id2word,
)

test_inp = Language(
    sentences_inp_test,
    train=False,
    word2id=train_inp.word2id,
    id2word=train_inp.id2word,
)
test_trg = Language(
    sentences_trg_test,
    train=False,
    word2id=train_trg.word2id,
    id2word=train_trg.id2word,
)

In [None]:
# One dataset per split, pairing input-language ids with target-language ids.
train_set = MTDataset(input_matrix=train_inp.wordvec, target_matrix=train_trg.wordvec)
val_set = MTDataset(input_matrix=val_inp.wordvec, target_matrix=val_trg.wordvec)
test_set = MTDataset(input_matrix=test_inp.wordvec, target_matrix=test_trg.wordvec)

In [None]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR

In [None]:
# Only the training data is shuffled; validation and test keep their order.
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=64)
test_loader = DataLoader(dataset=test_set, batch_size=64)

In [None]:
# Padded sentence lengths of the training data (input / target).
Tx = train_inp.max_len
Ty = train_trg.max_len

# Vocabulary sizes of the training data (input / target).
vocab_size_inp = train_inp.vocab_size
vocab_size_trg = train_trg.vocab_size

# Model dimensions.
embedding_dim = 256
hidden_size = 1024

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Seed handed to generate_seed() before the models are built, so that
# weight initialisation and shuffling are consistent between runs.
SEED: int = 5

In [None]:
# Seed first so that both models are initialised reproducibly.
generate_seed(SEED)
encoder_2 = Encoder(
    vocab_size=vocab_size_inp,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    modified=True,
).to(device)
decoder_2 = Decoder(
    hidden_size=hidden_size,
    vocab_size=vocab_size_trg,
    embedding_dim=embedding_dim,
).to(device)

In [None]:
# A single Adam optimizer updates the encoder and the decoder together.
optimizer_2 = torch.optim.Adam(
    params=[*encoder_2.parameters(), *decoder_2.parameters()],
)
criterion_2 = nn.CrossEntropyLoss()
# Multiply the learning rate by 0.2 every 2 scheduler steps.
scheduler_2 = StepLR(optimizer=optimizer_2, step_size=2, gamma=0.2)

In [None]:
# Train for 20 epochs, logging the loss every 200 iterations.
statedict_2 = train(
    encoder=encoder_2,
    decoder=decoder_2,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer_2,
    criterion=criterion_2,
    id2word=train_trg.id2word,
    lr_scheduler=scheduler_2,
    num_epochs=20,
    print_every=200,
    device=device,
)

In [None]:
# Persist the dict returned by train() (keys 'encoder' and 'decoder') to disk.
torch.save(statedict_2, 'statedict_2.pth')  # relative path: written to the current working directory

In [None]:
# Load the saved state dicts back into the models.
# map_location remaps the stored tensors onto the device selected for this run,
# so a checkpoint written on a GPU also loads on a CPU-only runtime.
statedict_2 = torch.load('statedict_2.pth', map_location=device)
encoder_2.load_state_dict(statedict_2['encoder'])
decoder_2.load_state_dict(statedict_2['decoder'])

In [None]:
# Report the test-set score scaled to the 0-100 range.
print(
    'Model 2 BLEU score: %.3f'
    % (100 * validate(test_loader, encoder_2, decoder_2, test_trg.id2word, device))
)

In [None]:
# Translate one sample sentence with the trained model.
sentence = "anh quốc và mỹ nhiều nước khác nữa nó cũng là nước khác và những nước khác nữa"
print("Sentence: " + sentence)

print(
    "Model 2: "
    + translate(
        sentence,
        inp_word2id=train_inp.word2id,
        trg_word2id=train_trg.word2id,
        trg_id2word=train_trg.id2word,
        encoder=encoder_2,
        decoder=decoder_2,
        trg_max_len=MAX_LEN,
        device=device,
    )
)