Original notebook [here](https://github.com/omarsar/pytorch_neural_machine_translation_attention/blob/master/NMT_in_PyTorch.ipynb)

In [3]:
import torch
import torch.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata  # standard-library module; no installation needed
import re
import time

print(torch.__version__)

1.8.1+cu101


In [5]:
# use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda


## Getting data

In [9]:
# Download the Spanish-English sentence-pair archive from manythings.org
!wget http://www.manythings.org/anki/spa-eng.zip
# Extract it (the archive holds _about.txt and spa.txt)
!unzip spa-eng.zip
# Remove the archive and the _about.txt file; only spa.txt is kept
!rm spa-eng.zip
!rm _about.txt

--2021-04-22 17:16:49--  http://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3031::6815:37de, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4934182 (4.7M) [application/zip]
Saving to: ‘spa-eng.zip’


2021-04-22 17:16:51 (3.31 MB/s) - ‘spa-eng.zip’ saved [4934182/4934182]

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [22]:
# Read the whole corpus and split it into lines. The context manager
# closes the file handle as soon as the text has been read.
with open('spa.txt', encoding='UTF-8') as corpus_file:
    f = corpus_file.read().strip().split('\n')

In [23]:
# alias: the following cells refer to the list of corpus lines as `lines`
lines = f

In [24]:
# cap on the number of corpus lines kept (smaller to reduce computation)
num_examples = 30000

# one list of tab-separated fields per kept line
original_word_pairs = [line.split('\t') for line in lines[:num_examples]]

In [25]:
# one row per corpus line; its three tab-separated fields become the
# eng, es and attribution columns
data = pd.DataFrame(original_word_pairs, columns=["eng", "es", "attribution"])

In [26]:
# preview the first rows of the raw table
data.head()

Unnamed: 0,eng,es,attribution
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [27]:
# Drop the attribution column. Rebinding `data` (instead of inplace=True) and
# ignoring an already-missing column keeps this cell safe to re-run.
data = data.drop(columns='attribution', errors='ignore')

In [28]:
# preview again to check that only the eng and es columns remain
data.head()

Unnamed: 0,eng,es
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [29]:
# strips accent marks from a unicode string
def unicode_to_ascii(s):
    """
    Decomposes the string canonically (NFD) and drops every combining
    mark (Unicode category 'Mn'), so that e.g. "á" becomes "a".
    Characters without a decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def preprocess_sentence(w):
    """
    Lower-cases and trims the sentence, strips accents with
    unicode_to_ascii, separates punctuation from words and wraps the
    result in '<start>' / '<end>' markers.
    """
    cleaned = unicode_to_ascii(w.lower().strip())

    substitutions = (
        # put a space on both sides of each of ? . ! , ¿
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        (r"([?.!,¿])", r" \1 "),
        # squeeze runs of spaces (and double quotes, which are also in the class) into one space
        (r'[" "]+', " "),
        # every run of characters other than a-z, A-Z, "?", ".", "!", ",", "¿" becomes one space
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # the start and end tokens tell the model when to start and stop predicting
    return '<start> ' + cleaned.strip() + ' <end>'

## Data Exploration

In [30]:
# clean both language columns with preprocess_sentence, then show 10 random rows
data['eng'] = data['eng'].apply(preprocess_sentence)
data['es'] = data['es'].apply(preprocess_sentence)
data.sample(10)

Unnamed: 0,eng,es
7196,<start> it s all yours . <end>,<start> es todo tuyo . <end>
21464,<start> do you like english ? <end>,<start> ¿ te gusta el ingles ? <end>
24300,<start> tom is making faces . <end>,<start> tom esta haciendo muecas . <end>
15166,<start> i want to go back . <end>,<start> quiero volver . <end>
3604,<start> recess ended . <end>,<start> se acabo el recreo . <end>
19853,<start> think for a moment . <end>,<start> piensa por un momento . <end>
1373,<start> ignore tom . <end>,<start> ignora a tom . <end>
16381,<start> tom believed mary . <end>,<start> tom le creia a maria . <end>
5927,<start> watch closely . <end>,<start> vigila atentamente . <end>
21327,<start> can i use the phone ? <end>,<start> ¿ puedo usar el telefono ? <end>


## Building Vocabulary Index

In [31]:
# Builds, for one language, the token -> index lookup (e.g. "dad" -> 5)
# and the reverse index -> token lookup (e.g. 5 -> "dad").

class LanguageIndex:
    def __init__(self, lang):
        """
        lang is the list of phrases of one language; the index is built
        immediately on construction
        """
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()

    def create_index(self):
        """
        Fills vocab (sorted list of unique tokens), word2idx and idx2word.
        Tokens are obtained by splitting each phrase on single spaces.
        """
        # gather the unique tokens of every phrase
        for sentence in self.lang:
            self.vocab.update(sentence.split(' '))

        # fixed, alphabetical token order
        self.vocab = sorted(self.vocab)

        # index 0 is reserved for the padding token
        self.word2idx['<pad>'] = 0

        # real tokens are numbered from 1, right after the pad token
        self.word2idx.update(
            {token: position for position, token in enumerate(self.vocab, start=1)}
        )

        # reverse lookup derived from the forward one
        self.idx2word.update(
            {position: token for token, position in self.word2idx.items()}
        )

In [33]:
# build one vocabulary index per language (Spanish is the input, English the target)
es_sentences = data["es"].values.tolist()
eng_sentences = data["eng"].values.tolist()
inp_lang = LanguageIndex(es_sentences)
targ_lang = LanguageIndex(eng_sentences)

# replace every token of every sentence with its vocabulary index
input_tensor = [[inp_lang.word2idx[token] for token in sentence.split(' ')] for sentence in es_sentences]
target_tensor = [[targ_lang.word2idx[token] for token in sentence.split(' ')] for sentence in eng_sentences]
input_tensor[:10]

[[5, 9047, 3, 4],
 [5, 9166, 3, 4],
 [5, 9039, 3, 4],
 [5, 9046, 3, 4],
 [5, 4692, 3, 4],
 [5, 2279, 1, 4],
 [5, 2277, 1, 4],
 [5, 2276, 1, 4],
 [5, 2284, 1, 4],
 [5, 2284, 3, 4]]

In [34]:
# first ten vectorized target (English) sentences
target_tensor[:10]

[[5, 1815, 3, 4],
 [5, 1815, 3, 4],
 [5, 1815, 3, 4],
 [5, 1815, 3, 4],
 [5, 2008, 3, 4],
 [5, 3565, 1, 4],
 [5, 3565, 1, 4],
 [5, 3565, 1, 4],
 [5, 3565, 1, 4],
 [5, 3565, 3, 4]]

In [35]:
def max_length(tensor):
    """Returns the length of the longest sequence in tensor."""
    return max(map(len, tensor))

In [36]:
# longest vectorized sentence on the input side and on the target side
max_length_inp = max_length(input_tensor)
max_length_tar = max_length(target_tensor)

In [37]:
def pad_sequences(x, max_len):
    """
    Returns an int64 numpy array of length max_len holding the leading
    items of x; x is cut to max_len when longer and filled with
    trailing zeros when shorter.
    """
    padded = np.zeros(max_len, dtype=np.int64)
    keep = min(len(x), max_len)
    padded[:keep] = x[:keep]
    return padded

In [39]:
# pad every sequence to its side's max length; both names are rebound
# to new lists of fixed-length numpy arrays
input_tensor = [pad_sequences(sequence, max_length_inp) for sequence in input_tensor]
target_tensor = [pad_sequences(sequence, max_length_tar) for sequence in target_tensor]
len(target_tensor)

30000

In [40]:
# creating training and validation sets using 80-20 split;
# random_state pins the shuffle so a fresh Restart & Run All reproduces the same split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=42)

# show the size of each split
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(24000, 24000, 6000, 6000)

## Load Dataset into DataLoader for Batching

In [41]:
from torch.utils.data import Dataset, DataLoader

In [42]:
class MyData(Dataset):
    """
    Dataset of (input, target, input length) triples.

    X and y are parallel collections of index sequences. The length kept
    for each input is its number of non-zero entries, i.e. the positions
    that are not the padding index 0.
    """
    def __init__(self, X, y):
        self.data = X
        self.target = y
        # number of non-zero (non-padding) positions in each input sequence
        self.length = [np.count_nonzero(x) for x in X]

    def __getitem__(self, index):
        """Returns (input sequence, target sequence, input length) for index."""
        x = self.data[index]
        y = self.target[index]
        # was `selg.length`, which raised NameError on every item access
        x_len = self.length[index]
        return x, y, x_len

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.data)

## Parameters

In [43]:
# sizes derived from the training split
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 63  # NOTE(review): unusual batch size — confirm 63 (rather than 64) is intended
N_BATCH = BUFFER_SIZE // BATCH_SIZE  # number of full batches in the training split

# model dimensions
embedding_dim, units = 256, 1024

# vocabulary sizes; each count includes the <pad> entry
vocab_inp_size, vocab_tar_size = len(inp_lang.word2idx), len(targ_lang.word2idx)

train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

# shuffled training batches; the last incomplete batch is dropped
dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [None]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = nn.GRU(self.embedding_dim, self.enc_units)
    
    def forward(self, x, lens, device):
        x = self.embedding(x)
        ## complete the implementation