# Neural Machine Translation

In [1]:
import os
import sys
# Put the local 'pyfiles/' directory on the module search path before the
# project-local imports in the next cell (presumably where those modules
# live — confirm).
sys.path.append('pyfiles/')

In [2]:
import global_variables
import dataset_helper
import nnet_models_new

In [3]:
import copy
import time
from functools import partial

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [4]:
# Root directory; the per-language-pair directory `saved_models_dir` is
# created underneath it.
base_saved_models_dir = '.'

### The Dataset

We will work with an English-to-French dataset.

In [5]:
# Language pair: sentences are translated from `source_name` into `target_name`.
source_name, target_name = 'eng', 'fra'

# Training and validation files for that pair, both under 'data/'.
path_to_train_data = 'data/{}-{}_train.txt'.format(source_name, target_name)
path_to_val_data = 'data/{}-{}_val.txt'.format(source_name, target_name)

In [6]:
# Output directory for this language pair, named '<source>2<target>'
# (e.g. 'eng2fra') under `base_saved_models_dir`.
saved_models_dir = os.path.join(base_saved_models_dir,
                                '{}2{}'.format(source_name, target_name))

In [7]:
## See first 5 records

! head -5 'data/eng-fra_train.txt'

I think we may have something that you'd be interested in buying.	Je pense que nous avons peut-être quelque chose dont vous seriez intéressés de faire l'acquisition.
They got it.	Ils l'ont eue.
I'm glad to see you.	Je suis enchanté de vous rencontrer.
He got into his car in a hurry.	Il monta en vitesse dans sa voiture.
Do you like Mozart's music?	Aimez-vous la musique de Mozart ?


### Processing and making PyTorch Dataset

We have to make it a pair - (source, target) sentence pair. For this, we have to read the file and parse it accordingly. We might have to take care of some details there, like making sure that we strip off any non-required special characters or extra space. All those boring details aside (which you can see in dataset_helper.py) what are the other things we have to do?

We have to build a vocabulary and tokenize, as we have been doing. Here, we write a Language class, as in the previous labs, to take care of this for you. Once all of this is done and the text is tokenized, we write a PyTorch dataset object to help us handle the data efficiently during training.

In [8]:
# Path handed to LanguagePair as `lang_obj_path` for both the train and val splits.
saved_language_model_dir = os.path.join(saved_models_dir, 'lang_obj')

In [9]:
# One LanguagePair per split. Both splits receive the same `lang_obj_path`,
# and the 'train' object is constructed before the 'val' object.
dataset_dict = {
    split_name: dataset_helper.LanguagePair(source_name=source_name,
                                            target_name=target_name,
                                            filepath=split_path,
                                            lang_obj_path=saved_language_model_dir)
    for split_name, split_path in (('train', path_to_train_data),
                                   ('val', path_to_val_data))
}

The LanguagePair object we built has a DataFrame underneath. We see the first 5 rows of the dataframe below:

In [10]:
# First five rows of the training split's underlying DataFrame.
dataset_dict['train'].main_df.head(5)

Unnamed: 0,source_data,target_data,source_tokenized,source_len,target_tokenized,target_len,source_indized,target_indized
0,i think we may have something that you d be in...,je pense que nous avons peut etre quelque chos...,"[i, think, we, may, have, something, that, you...",15,"[je, pense, que, nous, avons, peut, etre, quel...",19,"[5, 66, 15, 93, 20, 98, 23, 6, 143, 27, 458, 1...","[7, 68, 8, 17, 36, 90, 24, 74, 71, 296, 6, 665..."
1,they got it .,ils l ont eue .,"[they, got, it, .]",5,"[ils, l, ont, eue, .]",6,"[45, 117, 11, 4, 1]","[52, 9, 88, 2946, 4, 1]"
2,i m glad to see you .,je suis enchante de vous rencontrer .,"[i, m, glad, to, see, you, .]",8,"[je, suis, enchante, de, vous, rencontrer, .]",8,"[5, 13, 478, 7, 92, 6, 4, 1]","[7, 35, 5158, 5, 6, 570, 4, 1]"
3,he got into his car in a hurry .,il monta en vitesse dans sa voiture .,"[he, got, into, his, car, in, a, hurry, .]",10,"[il, monta, en, vitesse, dans, sa, voiture, .]",9,"[12, 117, 67, 76, 108, 10, 14, 410, 4, 1]","[12, 4992, 18, 1354, 29, 155, 125, 4, 1]"
4,do you like mozart s music ?,aimez vous la musique de mozart ?,"[do, you, like, mozart, s, music, ?]",8,"[aimez, vous, la, musique, de, mozart, ?]",8,"[19, 6, 72, 2, 22, 349, 16, 1]","[633, 6, 13, 356, 5, 2, 19, 1]"


### vocabulary sizes and sentence lengths

In [11]:
### vocabulary sizes
# `n_words` of the source and target language objects attached to the training split.
print('source vocab: ', dataset_dict['train'].source_lang_obj.n_words , 
      'target vocab: ', dataset_dict['train'].target_lang_obj.n_words)

source vocab:  4969 target vocab:  6788


In [12]:
### source sentence lengths (longest / shortest in the training split)
print('max len: ', dataset_dict['train'].main_df['source_len'].max(), 
      'min len: ', dataset_dict['train'].main_df['source_len'].min() )

max len:  51 min len:  3


In [13]:
# Upper quantiles of the source-sentence length in the training split.
dataset_dict['train'].main_df['source_len'].quantile([0.5, 0.75, 0.9, 0.95, 0.99, 0.999])

0.500     8.0
0.750    10.0
0.900    12.0
0.950    13.0
0.990    17.0
0.999    22.0
Name: source_len, dtype: float64

The longest sentence has 51 tokens, while the $99.9$th percentile is only 22, so we probably don't need to keep sentences that long. How do we drop the extra words, i.e. clip each sentence at some MAX LEN? We can use the PyTorch collate function that we saw earlier to do this.

In [14]:
# Clip length: the 99.9th percentile of *source* sentence lengths in the training split.
# NOTE(review): this is derived from source lengths only; whether the collate
# function also applies it to target sentences is not visible here — confirm.
MAX_LEN = int(dataset_dict['train'].main_df['source_len'].quantile(0.999))
# Number of sentence pairs per batch for both data loaders.
batchSize = 32

In [15]:
# Batch iterators for both splits. Each uses the project's collate function
# with MAX_LEN bound as a keyword argument.
dataloader_dict = {
    # Training batches are reshuffled on every pass over the data.
    'train': DataLoader(dataset_dict['train'], batch_size=batchSize,
                        collate_fn=partial(dataset_helper.vocab_collate_func, MAX_LEN=MAX_LEN),
                        shuffle=True, num_workers=0),
    # Validation is evaluation-only: keep a fixed order so that scores computed
    # on it do not depend on how the batches happened to be composed.
    'val': DataLoader(dataset_dict['val'], batch_size=batchSize,
                      collate_fn=partial(dataset_helper.vocab_collate_func, MAX_LEN=MAX_LEN),
                      shuffle=False, num_workers=0),
}

The Seq2Seq Model
=================

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A [Sequence to Sequence network](http://arxiv.org/abs/1409.3215), or
seq2seq network, or [Encoder Decoder
network](https://arxiv.org/pdf/1406.1078v3.pdf), is a model
usually consisting of two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence. Essentially, all we need is some mechanism to read the source sentence and create an encoding and some mechanism to read the encoding and decode it to the target language. 

Unlike sequence prediction with a single RNN, where every input
corresponds to an output, the seq2seq model frees us from sequence
length and order, which makes it ideal for translation between two
languages.

Consider the sentence "Je ne suis pas le chat noir" → "I am not the
black cat". Most of the words in the input sentence have a direct
translation in the output sentence, but are in slightly different
orders, e.g. "chat noir" and "black cat". Because of the "ne/pas"
construction there is also one more word in the input sentence. It would
be difficult to produce a correct translation directly from the sequence
of input words.

With a seq2seq model the encoder creates a single vector which, in the
ideal case, encodes the "meaning" of the input sequence into a single
vector — a single point in some N dimensional space of sentences.




The Encoder
-----------

The encoder is anything which takes in a sentence and gives us a representation for the sentence. 

Usually, the encoder of a seq2seq network is a RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

However, we will first start with a BoW encoder and then move on to RNN based encoders

In [16]:
### configuration

# Vocabulary sizes come from the language objects attached to the training split.
source_vocab = dataset_dict['train'].source_lang_obj.n_words
target_vocab = dataset_dict['train'].target_lang_obj.n_words
hidden_size = 512
rnn_layers = 1
lr = 0.25
# NOTE(review): forwarded to seq2seq as `longest_label`; 1 looks very small if
# it bounds the decoded length — confirm against nnet_models_new.
longest_label = 1
gradient_clip = 0.3
# Use the GPU only when one is actually available so the notebook also runs
# on CPU-only machines (previously hard-coded to True).
use_cuda = torch.cuda.is_available()

num_epochs = 10

### BagOfWords Encoder

In [None]:
# Bag-of-words encoder over the source vocabulary.
# NOTE(review): nlayers=10 is unrelated to `rnn_layers`; what it controls
# inside BagOfWords is not visible here — confirm it is intended.
encoder_bow = nnet_models_new.BagOfWords(input_size = source_vocab,
                                    hidden_size = hidden_size, 
                                    nlayers=10, 
                                    reduce = "sum")

In [None]:
# Print the encoder object for inspection.
print(encoder_bow)

The Decoder
--------------------


The decoder is another RNN that takes the encoder output vector(s) and outputs a sequence of words to create the translation.

Decoder w/o Attention
------------------------
In the simplest seq2seq decoder we use only the last output of the encoder. This last output is sometimes called the context vector, as it encodes context from the entire sequence. This context vector is used as the initial hidden state of the decoder.

At every step of decoding, the decoder is given an input token and hidden state. The initial input token is the start-of-string <SOS> token, and the first hidden state is the context vector (the encoder's last hidden state).

In [None]:
# Decoder paired with the bag-of-words encoder; constructed from the target
# vocabulary size, the hidden size and the configured number of RNN layers.
decoder_bow = nnet_models_new.DecoderRNN(target_vocab, hidden_size, rnn_layers)

In [None]:
# Print the decoder object for inspection.
print(decoder_bow)

In [None]:
# Combine the bag-of-words encoder and the RNN decoder into one seq2seq object
# that also receives the optimisation settings from the configuration cell.
nmt_bow = nnet_models_new.seq2seq(encoder_bow, decoder_bow,
                              lr = lr,
                              use_cuda = use_cuda,
                              hiddensize = hidden_size,
                              # Was `hidden_size` (512): the layer count should be the
                              # `rnn_layers` value the decoder was built with.
                              numlayers = rnn_layers,
                              target_lang=dataset_dict['train'].target_lang_obj,
                              longest_label = longest_label,
                              clip = gradient_clip)

### Training Loop

In [None]:
def get_full_filepath(path, enc_type):
    """Return the checkpoint location for a given encoder tag.

    Args:
        path: directory that holds the checkpoint.
        enc_type: encoder tag (string) embedded in the file name.

    Returns:
        ``path`` joined with ``'nmt_enc_<enc_type>_dec_rnn.pth'``.
    """
    checkpoint_name = ''.join(('nmt_enc_', enc_type, '_dec_rnn.pth'))
    return os.path.join(path, checkpoint_name)

In [None]:
def save_models(nmt_model, path, enc_type):
    """Serialize ``nmt_model`` to ``<path>/nmt_enc_<enc_type>_dec_rnn.pth``.

    Args:
        nmt_model: object to store with ``torch.save``.
        path: target directory; created (including parents) if missing.
        enc_type: encoder tag (string) embedded in the file name.
    """
    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the directory in between.
    os.makedirs(path, exist_ok=True)
    # Same naming scheme as get_full_filepath, so saved checkpoints can be
    # located again with that helper.
    filename = 'nmt_enc_'+enc_type+'_dec_rnn.pth'
    torch.save(nmt_model, os.path.join(path, filename))
   

In [None]:
def train_model(dataloader, nmt, num_epochs=50, val_every=1, saved_model_path = '.', enc_type ='rnn'):
    """Train ``nmt``, checkpoint on the best validation BLEU, and return the model.

    Args:
        dataloader: mapping with a 'train' and a 'val' entry; the 'train' entry
            is iterated batch by batch and must support ``len()``.
        nmt: model object providing ``train_step``, ``get_bleu_score``,
            ``scheduler_step``, ``state_dict`` and ``load_state_dict``.
        num_epochs: number of passes over ``dataloader['train']``.
        val_every: validation runs on epochs where ``epoch % val_every == 0``.
        saved_model_path: directory passed on to ``save_models``.
        enc_type: encoder tag passed on to ``save_models``.

    Returns:
        ``nmt`` itself (trained in place), with the weights of the
        best-scoring validation epoch loaded. If no validation ran, the
        weights are left as they were after the last training step.
    """
    best_bleu = -1
    # Stays None until a validation pass improves on `best_bleu`; this also
    # covers num_epochs == 0, where no snapshot is ever taken.
    best_wts = None

    for epoch in range(num_epochs):

        start = time.time()
        running_loss = 0

        for data in dataloader['train']:
            # train_step returns a pair; only the loss (second item) is used.
            _, curr_loss = nmt.train_step(data)
            running_loss += curr_loss

        # Mean loss per batch for this epoch.
        epoch_loss = running_loss / len(dataloader['train'])

        print("epoch {} loss = {}, time = {}".format(epoch, epoch_loss,
                                                        time.time() - start))
        sys.stdout.flush()

        if epoch%val_every == 0:
            val_bleu_score = nmt.get_bleu_score(dataloader['val'])
            print('validation bleu: ', val_bleu_score)
            sys.stdout.flush()

            nmt.scheduler_step(val_bleu_score)

            if val_bleu_score > best_bleu:
                best_bleu = val_bleu_score
                # For a torch nn.Module, state_dict() hands back references to
                # the live tensors, so later training steps would silently
                # overwrite the "best" weights; take a real snapshot instead.
                best_wts = copy.deepcopy(nmt.state_dict())
                save_models(nmt, saved_model_path, enc_type)

        print('='*50)

    print("Training completed. Best BLEU is {}".format(best_bleu))

    if best_wts is not None:
        nmt.load_state_dict(best_wts)
    # Return the model itself: load_state_dict's own return value is only a
    # key-matching report, not the model.
    return nmt

### Training Bow Encoder GRU Decoder Model

In [None]:
# Set to True to retrain even if a saved checkpoint exists.
train_again = False
modelname = 'bow'
if os.path.exists(get_full_filepath(saved_models_dir, modelname)) and (not train_again):
    # NOTE(review): torch.load unpickles a complete model object here; only
    # load checkpoint files that were produced by this notebook.
    nmt_bow = torch.load(get_full_filepath(saved_models_dir, modelname))
else:
    # Save under `modelname` (was 'bow_test') so that the existence check
    # above actually finds the checkpoint on the next run.
    # train_model trains `nmt_bow` in place, so the name is not rebound to
    # its return value.
    train_model(dataloader_dict, nmt_bow,
                num_epochs = num_epochs,
                saved_model_path = saved_models_dir,
                enc_type = modelname)

### Check Performance

In [None]:
# BLEU score of the bag-of-words model on the validation loader.
print(nmt_bow.get_bleu_score(dataloader_dict['val']))

## RNN Encoder

In [None]:
# RNN encoder built from the source vocabulary size, the hidden size and the
# configured number of RNN layers.
encoder_rnn = nnet_models_new.EncoderRNN(source_vocab, hidden_size, rnn_layers)

In [None]:
# RNN decoder built with the same hidden size and layer count as `encoder_rnn`.
decoder_rnn = nnet_models_new.DecoderRNN(target_vocab, hidden_size, rnn_layers)

In [None]:
# Combine the RNN encoder and RNN decoder into one seq2seq object that also
# receives the optimisation settings from the configuration cell.
nmt_rnn = nnet_models_new.seq2seq(encoder_rnn, decoder_rnn,
                              lr = lr,
                              use_cuda = use_cuda,
                              hiddensize = hidden_size,
                              # Was `hidden_size` (512): the layer count should be the
                              # `rnn_layers` value the encoder/decoder were built with.
                              numlayers = rnn_layers,
                              target_lang=dataset_dict['train'].target_lang_obj,
                              longest_label = longest_label,
                              clip = gradient_clip)

In [None]:
# Set to False to reuse a saved checkpoint instead of retraining.
train_again = True
modelname = 'rnn'
if os.path.exists(get_full_filepath(saved_models_dir, modelname)) and (not train_again):
    # NOTE(review): torch.load unpickles a complete model object here; only
    # load checkpoint files that were produced by this notebook.
    nmt_rnn = torch.load(get_full_filepath(saved_models_dir, modelname))
else:
    # Save under `modelname` (was 'rnn_test') so that the existence check
    # above looks for the same file that training writes.
    # train_model trains `nmt_rnn` in place, so the name is not rebound to
    # its return value.
    train_model(dataloader_dict, nmt_rnn,
                num_epochs = num_epochs,
                saved_model_path = saved_models_dir,
                enc_type = modelname)

### Check Performance

In [None]:
# Show `nmt_rnn` as the cell output.
nmt_rnn

In [None]:
# BLEU score of the RNN encoder-decoder model on the validation loader.
print(nmt_rnn.get_bleu_score(dataloader_dict['val']))

## RNN Encoder + Source Side Attention

In [None]:
# Flags forwarded to Decoder_SelfAttn for this section: encoder (source-side)
# attention on, decoder self-attention off.
encoder_attention = True
self_attention = False

In [None]:
# NOTE(review): the layer count is the literal 1 here rather than `rnn_layers`
# (currently also 1) — confirm whether the attention decoder requires a
# single-layer encoder or whether this should follow the config value.
encoder_encoderattn = nnet_models_new.EncoderRNN(source_vocab, hidden_size, 1)

In [None]:
# Attention decoder sized to the target vocabulary; which attention types it
# uses is set by the two boolean flags passed in.
decoder_encoderattn = nnet_models_new.Decoder_SelfAttn(output_size=target_vocab,
                                 hidden_size=hidden_size, 
                                 encoder_attention = encoder_attention,
                                 self_attention = self_attention)

In [None]:
# Combine the RNN encoder and the attention decoder into one seq2seq object
# that also receives the optimisation settings from the configuration cell.
nmt_encoderattn = nnet_models_new.seq2seq(encoder_encoderattn, decoder_encoderattn,
                              lr = lr,
                              use_cuda = use_cuda,
                              hiddensize = hidden_size,
                              # Was `hidden_size` (512): the layer count should be
                              # `rnn_layers`, which matches the single-layer encoder.
                              numlayers = rnn_layers,
                              target_lang=dataset_dict['train'].target_lang_obj,
                              longest_label = longest_label,
                              clip = gradient_clip)

In [None]:
modelname = 'encoderattn'
if os.path.exists(get_full_filepath(saved_models_dir, modelname)):
    # NOTE(review): torch.load unpickles a complete model object here; only
    # load checkpoint files that were produced by this notebook.
    nmt_encoderattn = torch.load(get_full_filepath(saved_models_dir, modelname))
else:
    # Save under `modelname` (was 'encoderattn_test') so that the existence
    # check above actually finds the checkpoint on the next run.
    # train_model trains `nmt_encoderattn` in place, so the name is not
    # rebound to its return value.
    train_model(dataloader_dict, nmt_encoderattn,
                num_epochs = num_epochs,
                saved_model_path = saved_models_dir,
                enc_type = modelname)

### RNN Encoder, Self Attention Decoder

In [None]:
# Turn decoder self-attention on for this section. `encoder_attention` keeps
# whatever value an earlier cell assigned; it is not passed to the decoder below.
self_attention = True

In [None]:
# NOTE(review): the layer count is the literal 1 here rather than `rnn_layers`
# (currently also 1) — confirm which one is intended.
encoder_selfattn = nnet_models_new.EncoderRNN(source_vocab, hidden_size, 1)

In [None]:
# Attention decoder with self-attention enabled.
# NOTE(review): `encoder_attention` is not passed, so the constructor's
# default applies — confirm that default is the one wanted for this model.
decoder_selfattn = nnet_models_new.Decoder_SelfAttn(output_size=target_vocab,
                                 hidden_size=hidden_size, 
                                 self_attention = self_attention)

In [None]:
# Combine the RNN encoder and the self-attention decoder into one seq2seq
# object that also receives the optimisation settings from the configuration cell.
nmt_selfattn = nnet_models_new.seq2seq(encoder_selfattn, decoder_selfattn,
                              lr = lr,
                              use_cuda = use_cuda,
                              hiddensize = hidden_size,
                              # Was `hidden_size` (512): the layer count should be
                              # `rnn_layers`, which matches the single-layer encoder.
                              numlayers = rnn_layers,
                              target_lang=dataset_dict['train'].target_lang_obj,
                              longest_label = longest_label,
                              clip = gradient_clip)

In [None]:
modelname = 'selfattn'
if os.path.exists(get_full_filepath(saved_models_dir, modelname)):
    # NOTE(review): torch.load unpickles a complete model object here; only
    # load checkpoint files that were produced by this notebook.
    nmt_selfattn = torch.load(get_full_filepath(saved_models_dir, modelname))
else:
    # Save under `modelname` (was 'selfattn_test') so that the existence
    # check above actually finds the checkpoint on the next run.
    # train_model trains `nmt_selfattn` in place, so the name is not rebound
    # to its return value.
    train_model(dataloader_dict, nmt_selfattn,
                num_epochs = num_epochs,
                saved_model_path = saved_models_dir,
                enc_type = modelname)