# *Major take away from Seq 2 Seq paper:*
1) Deeper LSTMs work better than shallow ones, hence authors used 4 layered LSTM

2) Use of separate LSTMs as well embeddings for encoder and decoder blocks

3) For Neural Machine Translation, reversing the order of sentence in source dataset for training produces short term dependencies, as the beginning of source sentence in read towards end, resulting in better overall score on test benchmarks

In [3]:
# !pip install --upgrade torch
# !pip install -U torchtext==0.8.0

In [4]:

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

import warnings
warnings.filterwarnings("ignore")

In [5]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [6]:
!python -m spacy download en
!python -m spacy download de
!python -m spacy download fr

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 29.5MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp37-none-any.whl size=14907057 sha256=f5f31daa2b1053992f7db5b6baaf51969729e0a198318b24bb00d104041e4d80
  Stored in directory: /tmp/pip-ephem-wheel-cache-hnpiu3i_/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Ins

In [7]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [48]:
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1] # reverse as per the enhanced performance observation in original Seq2Seq paper

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [49]:
SRC = Field(tokenize=tokenize_en,
            init_token= '<sos>',
            eos_token = '<eos>',
            lower=True)

TRG = Field(tokenize=tokenize_de,
            init_token='<sos>',
            eos_token = '<eos>',
            lower = True)

In [50]:
train_data, valid_data, test_data = Multi30k.splits(exts=('.en', '.de'),
                                                    fields = (SRC, TRG))

In [51]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [52]:
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [53]:
print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (de) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (en) vocabulary: 5893
Unique tokens in target (de) vocabulary: 7855


In [54]:
vars(train_data[0])

{'src': ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.'],
 'trg': ['.',
  'büsche',
  'vieler',
  'nähe',
  'der',
  'in',
  'freien',
  'im',
  'sind',
  'männer',
  'weiße',
  'junge',
  'zwei']}

In [55]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [56]:
device

device(type='cuda')

In [57]:
batch=128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data),
                                                                      batch_size=batch,
                                                                      device = device)

In [58]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))

        outputs, (hidden, cell) = self.rnn(embedded)

        return hidden, cell



In [59]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, emb_dim, n_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim , n_layers, dropout=dropout)

        self.fc = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        prediction = self.fc(output.squeeze(0))

        return prediction, hidden, cell



In [60]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        assert encoder.n_layers == decoder.n_layers, "Please make sure that the number of layers in LSTM for both Encoder and Decoder are same"
        assert encoder.hid_dim == decoder.hid_dim, "Please make sure that the hidden layer size in LSTM for both Encoder and Decoder are same"
    
    def forward(self, source, target, teacher_forcing=0.5):
        # src = [source length, batch_size],  target = [target length, batch_size]
        trg_len, batch_size = target.shape[0], target.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # decoder output tensor, shape = [target_sentence_length, batch_size, target vocab size],
        # each sample has different len where each token has same vector size(vocab size)
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # last hidden and cell states from the Encoder section are used as the
        # initial hidden inputs for the first iteration of Decoder time step
        hidden, cell = self.encoder(source)

        input_ = target[0, :] # the '<sos>' tokens as input for the decoder first time step

        for t in range(1, trg_len): # 1 to trg len as first token will be <sos>
            output_pred, hidden, cell = self.decoder(input_, hidden, cell) # input embedding, hidden and cell states -> prediction, hidden and cell as outputs

            outputs[t] = output_pred

            teacher = random.random() < teacher_forcing

            top_1 = output_pred.argmax(1) # get prediction with highest value

            # decide if to use predicted output as input to next time step or ground truth token
            input_ = target[t] if teacher else top_1
        return outputs


## Training Seq2Seq model

In [21]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

Enc_emd_dim = Dec_emd_dim =256

HIDDEN_DIM = 512
N_LAYERS = 2 # paper uses 4 though 
enc_dropout = dec_dropout = 0.5

enc = Encoder(input_dim=INPUT_DIM, emb_dim=Enc_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=enc_dropout)
dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=Dec_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=dec_dropout)
model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device) 

## Initialize weights

In [61]:
def init_weights(model):
    for name, param in model.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

model.apply(init_weights)

In [62]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [24]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 14,905,519 trainable parameters


In [63]:
# intialize an optimier for training

optimizer = optim.Adam(model.parameters())

### # as we can have <pad> tpoken as well in src and trg sentences and we should not calculate loss on it, hence we ignore <pad> topken while calculating loss

In [64]:
trg_pad_index = TRG.vocab.stoi[TRG.pad_token]

## we define crossentropy loss criterion for calculating loss

In [65]:
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_index)

##Training process:

1) get `source` and `target` sentences

2) ensure to make gradients to zero to avoid accumulation of gradients over time

3) get the output `y_hat` from model after feeding in `source` and `target`

4) calculate loss after changing dimensions of the input and output correctly, use `loss.backward()` to calculate gradients

5) clip the gradients to avoid `graident explosion`, a common issue with RNNs

6) update the model weights

7) calculate total loss

In [66]:
def train(model, iterator, optimizer, criterion, clip):
    """Train function to train the `model` with given `optimizer` using `criterion`"""
    model.train() # set model to train mode

    total_loss = 0

    for i, batch in enumerate(iterator):

        source, target = batch.src, batch.trg # get source and target batch

        # zero out previous gradients if any
        optimizer.zero_grad()

        # get output from model
        output = model(source, target)

        # get output dimension, target = [trg len , batch size], output = [trg len, batch size, target vocab/output dim]
        output_dim = output.shape[-1] # get target vocab size from last dimension

        # reshape output for usage in loss calculation
        # this line results in output = [target len * batch size, target vocab size]
        output = output[1:].view(-1, output_dim) # ignore `0` index values as it holds all zeros
        
        # change view of target to target_len*batch_size to make it 1-d vector, 
        # while ignoreing first token of each sample as it holds <sos> token
        target = target[1:].view(-1) 

        loss = criterion(output, target) # calculate loss

        loss.backward() # calculate gradients on the loss

        # clip to gradients to avoid gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # take a single optimization step for the optimizer
        optimizer.step()
        total_loss += loss.item() # accumulate loss per epoch

    return total_loss / len(iterator) # return avergae loss
         



In [67]:
def evaluate(model, iterator, criterion):
    
    model.eval() # set model to evaluate mode
    
    epoch_loss = 0
    
    with torch.no_grad(): # since we do not need to calculate graidents in evaluation mode
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            output = model(src, trg, 0) #since we are evaluating hence turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]
            output_dim = output.shape[-1] # get target vocab size from last dimension
            
            output = output[1:].view(-1, output_dim) # view for target len * batch size, output_dim
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [68]:
def epoch_time(start, end):
    total_time=end-start
    mins = total_time//60
    seconds = total_time % 60
    return mins, seconds

## make sure to save best model after check at every epoch

In [31]:
epochs = 20
clip = 1

best_valid_loss = float('-inf')

In [32]:
print("\n\n")
for epoch in range(epochs):
    start_time = time.perf_counter()

    train_loss = train(model, train_iterator, optimizer, criterion, clip)

    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.perf_counter()

    total_mins, total_secs = epoch_time(start=start_time, end=end_time)

    print(f'\nEpoch:---------------->{epoch+1:02} | Time: {total_mins}m and {round(total_secs, 2)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')





Epoch:---------------->01 | Time: 0.0m and 42.48s
	Train Loss: 5.187 | Train PPL: 178.910
	 Val. Loss: 5.241 |  Val. PPL: 188.790

Epoch:---------------->02 | Time: 0.0m and 43.32s
	Train Loss: 4.641 | Train PPL: 103.618
	 Val. Loss: 4.963 |  Val. PPL: 143.064

Epoch:---------------->03 | Time: 0.0m and 43.42s
	Train Loss: 4.370 | Train PPL:  79.030
	 Val. Loss: 4.886 |  Val. PPL: 132.438

Epoch:---------------->04 | Time: 0.0m and 43.34s
	Train Loss: 4.117 | Train PPL:  61.395
	 Val. Loss: 4.741 |  Val. PPL: 114.565

Epoch:---------------->05 | Time: 0.0m and 43.4s
	Train Loss: 3.884 | Train PPL:  48.611
	 Val. Loss: 4.624 |  Val. PPL: 101.945

Epoch:---------------->06 | Time: 0.0m and 43.71s
	Train Loss: 3.711 | Train PPL:  40.915
	 Val. Loss: 4.441 |  Val. PPL:  84.868

Epoch:---------------->07 | Time: 0.0m and 43.27s
	Train Loss: 3.578 | Train PPL:  35.800
	 Val. Loss: 4.369 |  Val. PPL:  78.994

Epoch:---------------->08 | Time: 0.0m and 43.33s
	Train Loss: 3.410 | Train PPL

# Different hyper-parameters -> Experiment number 2

In [74]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

Enc_emd_dim = Dec_emd_dim = 512 # previous one had 256

HIDDEN_DIM = 1024 # previously 512
N_LAYERS = 4 # previously 2
enc_dropout = dec_dropout = 0.5

enc = Encoder(input_dim=INPUT_DIM, emb_dim=Enc_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=enc_dropout)
dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=Dec_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=dec_dropout)
model_2 = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device) 

In [75]:
model_2.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 512)
    (rnn): LSTM(512, 1024, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7855, 512)
    (rnn): LSTM(512, 1024, num_layers=4, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=7855, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [76]:
print(f' model_2({model_2})\n has {count_parameters(model):,} trainable parameters')

 model_2(Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 512)
    (rnn): LSTM(512, 1024, num_layers=4, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7855, 512)
    (rnn): LSTM(512, 1024, num_layers=4, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=7855, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
))
 has 14,905,519 trainable parameters


In [77]:
epochs = 15
clip = 1
optimizer = optim.Adam(model_2.parameters())
best_valid_loss = float('-inf')

In [78]:
print("\n\n\nExperiment number:2 \n\n")
for epoch in range(epochs):
    start_time = time.perf_counter()

    train_loss = train(model_2, train_iterator, optimizer, criterion, clip)

    valid_loss = evaluate(model_2, valid_iterator, criterion)

    end_time = time.perf_counter()

    total_mins, total_secs = epoch_time(start=start_time, end=end_time)

    print(f'\nEpoch:---------------->{epoch+1:02} | Time: {total_mins}m and {round(total_secs, 2)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')




Experiment number:2 



Epoch:---------------->01 | Time: 2.0m and 29.76s
	Train Loss: 5.187 | Train PPL: 178.864
	 Val. Loss: 4.874 |  Val. PPL: 130.902

Epoch:---------------->02 | Time: 2.0m and 30.55s
	Train Loss: 4.830 | Train PPL: 125.177
	 Val. Loss: 4.824 |  Val. PPL: 124.518

Epoch:---------------->03 | Time: 2.0m and 30.87s
	Train Loss: 4.367 | Train PPL:  78.788
	 Val. Loss: 4.710 |  Val. PPL: 111.002

Epoch:---------------->04 | Time: 2.0m and 30.15s
	Train Loss: 4.071 | Train PPL:  58.629
	 Val. Loss: 4.694 |  Val. PPL: 109.333

Epoch:---------------->05 | Time: 2.0m and 30.91s
	Train Loss: 3.831 | Train PPL:  46.126
	 Val. Loss: 4.447 |  Val. PPL:  85.367

Epoch:---------------->06 | Time: 2.0m and 29.91s
	Train Loss: 3.642 | Train PPL:  38.176
	 Val. Loss: 4.345 |  Val. PPL:  77.116

Epoch:---------------->07 | Time: 2.0m and 31.32s
	Train Loss: 3.479 | Train PPL:  32.416
	 Val. Loss: 4.388 |  Val. PPL:  80.509

Epoch:---------------->08 | Time: 2.0m and 31.12s
	Train

## Clearly the experiment requires longer training to achieve better scores. More data would also be helpful as hidden and embeddings are much bigger in size than experiment number 1.

# Different hyper-parameters -> Experiment number 3

In [79]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

Enc_emd_dim = Dec_emd_dim = 256 # previous one had 512

HIDDEN_DIM = 512 # previously 1024
N_LAYERS = 4
enc_dropout = dec_dropout = 0.3

enc = Encoder(input_dim=INPUT_DIM, emb_dim=Enc_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=enc_dropout)
dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=Dec_emd_dim, hid_dim=HIDDEN_DIM, n_layers=N_LAYERS, dropout=dec_dropout)
model_3 = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device) 

optimizer = optim.Adam(model_3.parameters())

In [80]:
model_3.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.3)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.3)
    (fc): Linear(in_features=512, out_features=7855, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
)

In [81]:
print(f' model_3({model_3})\n has {count_parameters(model):,} trainable parameters')

 model_3(Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.3)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(7855, 256)
    (rnn): LSTM(256, 512, num_layers=4, dropout=0.3)
    (fc): Linear(in_features=512, out_features=7855, bias=True)
    (dropout): Dropout(p=0.3, inplace=False)
  )
))
 has 14,905,519 trainable parameters


In [82]:
epochs = 20
clip = 1

best_valid_loss = float('-inf')

In [83]:
print("\n\n\nExperiment number:3 \n\n")
for epoch in range(epochs):
    start_time = time.perf_counter()

    train_loss = train(model_3, train_iterator, optimizer, criterion, clip)

    valid_loss = evaluate(model_3, valid_iterator, criterion)

    end_time = time.perf_counter()

    total_mins, total_secs = epoch_time(start=start_time, end=end_time)

    print(f'\nEpoch:---------------->{epoch+1:02} | Time: {total_mins}m and {round(total_secs, 2)}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')




Experiment number:3 



Epoch:---------------->01 | Time: 1.0m and 0.09s
	Train Loss: 5.202 | Train PPL: 181.665
	 Val. Loss: 5.021 |  Val. PPL: 151.634

Epoch:---------------->02 | Time: 1.0m and 0.78s
	Train Loss: 4.790 | Train PPL: 120.268
	 Val. Loss: 4.925 |  Val. PPL: 137.736

Epoch:---------------->03 | Time: 1.0m and 1.42s
	Train Loss: 4.420 | Train PPL:  83.132
	 Val. Loss: 4.735 |  Val. PPL: 113.891

Epoch:---------------->04 | Time: 1.0m and 1.86s
	Train Loss: 4.131 | Train PPL:  62.210
	 Val. Loss: 4.638 |  Val. PPL: 103.344

Epoch:---------------->05 | Time: 1.0m and 1.99s
	Train Loss: 3.947 | Train PPL:  51.786
	 Val. Loss: 4.596 |  Val. PPL:  99.081

Epoch:---------------->06 | Time: 1.0m and 2.08s
	Train Loss: 3.784 | Train PPL:  43.991
	 Val. Loss: 4.582 |  Val. PPL:  97.756

Epoch:---------------->07 | Time: 1.0m and 2.4s
	Train Loss: 3.621 | Train PPL:  37.360
	 Val. Loss: 4.538 |  Val. PPL:  93.488

Epoch:---------------->08 | Time: 1.0m and 2.14s
	Train Loss: 3.