## Sequence to Sequence Model 
Here we implement a sequence-to-sequence (Seq2Seq) model for English-to-German translation. We explicitly implement the encoder and the decoder, train the combined model, and then evaluate it to measure its performance.

### Preparing the data
Here we import the dataset and preprocess it so that we can train our
model on it.

In [1]:
## Load the spaCy German and English pipelines; their tokenizers are used below to split sentences into tokens
import spacy
spacy_german = spacy.load('de_core_news_sm')
spacy_english = spacy.load('en_core_web_sm')

In [2]:
## Importing the datasets from torchtext to train our model
from torchtext.datasets import Multi30k
from torchtext import datasets
from torchtext import data

def tokenize_german(text):
    """Split a German string into a list of token strings with the spaCy German tokenizer."""
    doc = spacy_german.tokenizer(text)
    return [tok.text for tok in doc]

def tokenize_english(text):
    """Split an English string into token strings and return them in reversed order."""
    tokens = [tok.text for tok in spacy_english.tokenizer(text)]
    # The source sentence is fed to the model back to front.
    tokens.reverse()
    return tokens

## Each Field records how one side of the sentence pair is processed:
## its tokenizer, the <sos>/<eos> marker tokens and the lower-casing flag
SOURCE = data.Field(
    tokenize=tokenize_english,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)
TARGET = data.Field(
    tokenize=tokenize_german,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

## The '.en' files are paired with SOURCE and the '.de' files with TARGET
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.en', '.de'),
    fields=(SOURCE, TARGET),
)




In [4]:
## Report how many examples each split contains
print(f"Traning dataset size: {len(train_data.examples)}")
print(f"Validation dataset size: {len(valid_data.examples)}")
print(f"Testing dataset size: {len(test_data.examples)}")

Traning dataset size: 29000
Validation dataset size: 1014
Testing dataset size: 1000


In [5]:
## Build the source and target vocabularies from the training split only,
## passing min_freq = 2 as the token-frequency threshold
SOURCE.build_vocab(train_data, min_freq=2)
TARGET.build_vocab(train_data, min_freq=2)
print(f"English (Source) Vocabulary Size: {len(SOURCE.vocab)}")
print(f"German (Target) Vocabulary Size: {len(TARGET.vocab)}")

English (Source) Vocabulary Size: 5893
German (Target) Vocabulary Size: 7854


In [6]:
import torch
## Run on the GPU (CUDA) when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 32
## Iterators that yield mini-batches of each split, created for the chosen device
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    device=device,
)



### Building the Encoder and Decoder
Here we implement the encoder and the decoder that are used to build and train the model.

In [7]:
from torch import nn
## Hence building the Encoder and Decoder
class Encoder(nn.Module):
    """Embed a sequence of token indices and run it through a multi-layer LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_dims, emb_dims, hid_dims, n_layers, dropout):
        """Create the embedding, LSTM and dropout layers.

        Args:
            input_dims: number of rows of the embedding table.
            emb_dims: size of each embedding vector (the LSTM input size).
            hid_dims: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, applied to the embeddings and
                passed to the LSTM.
        """
        super().__init__()
        self.hid_dims, self.n_layers = hid_dims, n_layers
        self.embedding = nn.Embedding(num_embeddings=input_dims, embedding_dim=emb_dims)
        self.rnn = nn.LSTM(
            input_size=emb_dims,
            hidden_size=hid_dims,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return the final ``(hidden, cell)`` states produced for ``src``.

        ``src`` holds token indices; presumably shaped (src_len, batch), since
        the LSTM is not created with batch_first - confirm against the caller.
        """
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)
        _, final_states = self.rnn(rnn_input)
        hidden, cell_state = final_states
        return hidden, cell_state
    
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores every output-vocabulary entry.

    Each call consumes one token per batch element together with the previous
    LSTM states and returns the scores plus the updated states.
    """

    def __init__(self, output_dims, emb_dims, hid_dims, n_layers, dropout):
        """Create the embedding, LSTM, output projection and dropout layers.

        Args:
            output_dims: number of rows of the embedding table and width of
                the prediction produced by ``fc_out``.
            emb_dims: size of each embedding vector (the LSTM input size).
            hid_dims: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, applied to the embeddings and
                passed to the LSTM.
        """
        super().__init__()

        self.output_dims, self.hid_dims, self.n_layers = output_dims, hid_dims, n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dims, embedding_dim=emb_dims)
        self.rnn = nn.LSTM(
            input_size=emb_dims,
            hidden_size=hid_dims,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dims, out_features=output_dims)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, inputs, h, cell):
        """Decode one time step.

        Args:
            inputs: token indices for the current step, one per batch element.
            h: previous LSTM hidden state.
            cell: previous LSTM cell state.

        Returns:
            ``(pred, h, cell)``: the scores from ``fc_out`` and the updated
            hidden and cell states.
        """
        # The LSTM expects a leading sequence axis; add one of length 1.
        step_tokens = inputs.unsqueeze(0)
        step_embedded = self.dropout(self.embedding(step_tokens))

        rnn_out, (h, cell) = self.rnn(step_embedded, (h, cell))

        # Drop the length-1 sequence axis again before projecting.
        pred = self.fc_out(rnn_out.squeeze(0))
        return pred, h, cell

### Full Sequence to Sequence Model
Here we implement the full sequence-to-sequence model by combining the encoder and the decoder; it is then trained on the preprocessed dataset.

In [9]:
import random
## defining and implementing the full model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one time step at a time."""

    def __init__(self, encoder, decoder, device):
        """Store the encoder, the decoder and the device used for the output buffer."""
        super().__init__()

        self.encoder, self.decoder, self.device = encoder, decoder, device

    def forward(self, src, trg, teacher_forcing_rate=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return the per-step predictions.

        Args:
            src: source batch handed to the encoder.
            trg: target token indices, indexed as (target_length, batch).
            teacher_forcing_rate: probability, drawn independently at every
                step, of feeding the ground-truth token instead of the
                decoder's own best guess as the next input.

        Returns:
            Tensor of shape (target_length, batch, decoder.output_dims).
            Row 0 is never written and stays all zeros.
        """
        target_length, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dims

        outputs = torch.zeros(target_length, batch_size, vocab_size, device=self.device)

        h, cell = self.encoder(src)
        # The first decoder input is the first row of the target batch.
        inputs = trg[0, :]

        for t in range(1, target_length):
            output, h, cell = self.decoder(inputs, h, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            if random.random() < teacher_forcing_rate:
                inputs = trg[t]
            else:
                inputs = best_guess

        return outputs

## Model hyperparameters
## These can be tweaked to change the model's capacity and performance
input_dimensions = len(SOURCE.vocab)
output_dimensions = len(TARGET.vocab)
encoder_embedding_dimensions = 256
decoder_embedding_dimensions = 256
hidden_layer_dimensions = 512
number_of_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5

encod = Encoder(
    input_dims=input_dimensions,
    emb_dims=encoder_embedding_dimensions,
    hid_dims=hidden_layer_dimensions,
    n_layers=number_of_layers,
    dropout=encoder_dropout,
)
decod = Decoder(
    output_dims=output_dimensions,
    emb_dims=decoder_embedding_dimensions,
    hid_dims=hidden_layer_dimensions,
    n_layers=number_of_layers,
    dropout=decoder_dropout,
)

## Wrap both halves in the full model and move its parameters to the chosen device
model = Seq2Seq(encod, decod, device).to(device)

### Training the Model
Here we train the model using the helper functions defined below.

In [11]:
from torch import optim
import time
import numpy as np

## initialize weights for training
def initialize_weights(m):
    """Overwrite every parameter of ``m`` in place with values drawn uniformly from [-0.1, 0.1]."""
    low, high = -0.1, 0.1
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)
        
## `apply` calls initialize_weights on the model itself and on each of its submodules
model.apply(initialize_weights)
## Adam optimizer over all model parameters, with its default settings
optimizer = optim.Adam(model.parameters())
## Cross-entropy loss that ignores positions whose target is the padding token's index
criterion = nn.CrossEntropyLoss(ignore_index = TARGET.vocab.stoi[TARGET.pad_token])

## define the training function
def train(model,iterator,optimizer,criterion,clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: callable as ``model(src, trg)``; put into training mode here.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer that is zeroed and stepped once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.
        clip: maximum gradient norm handed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg
        optimizer.zero_grad()

        logits = model(source, target)
        vocab_size = logits.shape[-1]
        # Position 0 is excluded from the loss on both sides; the remaining
        # steps are flattened into one long batch of predictions/targets.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_logits, flat_target)
        batch_loss.backward()

        # Rescale the gradients so their overall norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(iterator)

## defining the evaluation function
def evaluate(model,iterator,criterion):
    """Return the mean batch loss over ``iterator`` without updating the model.

    The model is switched to evaluation mode, gradients are disabled, and the
    model is called with ``0`` as its third argument (the teacher-forcing
    rate of the Seq2Seq model in this file).

    Args:
        model: callable as ``model(src, trg, 0)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss called as ``criterion(predictions, targets)``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg
            logits = model(source, target, 0)
            vocab_size = logits.shape[-1]
            # Position 0 is excluded on both sides before flattening.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)
            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

## Number of passes over the training data, and the gradient-norm limit handed to train()
epochs = 10
grad_clip = 1

## Best validation loss seen so far; starting at infinity means any finite loss counts as an improvement
lowest_validation_loss = float('inf')

## training for the required number of epochs
for epoch in range(epochs):
    start_time = time.time()
    
    train_loss = train(model,train_iterator,optimizer,criterion,grad_clip)
    valid_loss = evaluate(model,valid_iterator,criterion)
    
    end_time = time.time()
    
    ## checkpoint the weights only when the validation loss improves, so 'seq2seq.pt' always holds the best epoch
    if valid_loss < lowest_validation_loss:
        lowest_validation_loss = valid_loss
        torch.save(model.state_dict(),'seq2seq.pt')
        
    ## report the epoch duration and both losses to confirm the model is training correctly
    print(f'Epoch: {epoch+1 : 02} | Time: {np.round(end_time-start_time,0)}s')
    print(f'\tTrain Loss: {train_loss: .4f}')
    print(f'\t Val. Loss: {valid_loss: .4f}')



Epoch:  1 | Time: 81.0s
	Train Loss:  4.7260
	 Val. Loss:  4.6445
Epoch:  2 | Time: 78.0s
	Train Loss:  4.0280
	 Val. Loss:  4.3099
Epoch:  3 | Time: 82.0s
	Train Loss:  3.6661
	 Val. Loss:  4.0707
Epoch:  4 | Time: 84.0s
	Train Loss:  3.3996
	 Val. Loss:  3.9362
Epoch:  5 | Time: 92.0s
	Train Loss:  3.1821
	 Val. Loss:  3.7796
Epoch:  6 | Time: 93.0s
	Train Loss:  2.9969
	 Val. Loss:  3.7215
Epoch:  7 | Time: 91.0s
	Train Loss:  2.8408
	 Val. Loss:  3.6440
Epoch:  8 | Time: 94.0s
	Train Loss:  2.6922
	 Val. Loss:  3.6399
Epoch:  9 | Time: 96.0s
	Train Loss:  2.5497
	 Val. Loss:  3.6287
Epoch:  10 | Time: 93.0s
	Train Loss:  2.4388
	 Val. Loss:  3.5828


### Evaluating the Model
Here we evaluate the trained model on the test set, using the test iterator built from the dataset above.

In [13]:
## Reload the checkpoint with the lowest validation loss and score it on the test set.
## map_location remaps the saved tensors onto the current device, so a checkpoint
## written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('seq2seq.pt', map_location = device))
test_loss = evaluate(model,test_iterator,criterion)
print(f'Test loss: {test_loss:.4f}')



Test loss: 3.5505


In [15]:
def translate(model, iterator, limit = 4):
    """Print the source, reference and predicted sentences for the first ``limit`` batches.

    The model is put into evaluation mode and called with a teacher-forcing
    rate of 0, so every decoder input after the first is the model's own
    previous prediction. The printing code indexes the vocabularies with one
    token per time step, so the iterator is expected to yield batches of
    size 1.

    Args:
        model: callable as ``model(src, trg, 0)`` that returns per-step scores
            indexed as (target_length, batch, vocab).
        iterator: iterable of batches exposing ``.src`` and ``.trg``.
        limit: number of batches to translate; iteration stops once it is reached.

    Returns:
        None. The results are only printed.
    """
    model.eval()

    with torch.no_grad():
        for i, batch in enumerate(iterator):
            # Stop as soon as enough examples were shown instead of walking
            # through (and discarding) the rest of the iterator.
            if i >= limit:
                break

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0)
            # Highest-scoring vocabulary index at every time step.
            preds = output.argmax(-1)

            # [1:-1] drops the first and last token of each sequence; the
            # source tokens are additionally flipped back with [::-1] because
            # tokenize_english stores them in reversed order.
            print('English Input: ' + str([SOURCE.vocab.itos[x] for x in src][1:-1][::-1]))
            print('Correct German Output: ' + str([TARGET.vocab.itos[x] for x in trg][1:-1]))
            print('Predicted German Output: ' + str([TARGET.vocab.itos[x] for x in preds][1:-1]))
            print('\n')
                
## Rebuild the iterators with batch_size = 1 so that every batch holds a single sentence pair;
## only the third one (over test_data) is kept
_, _, eval_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = 1, 
    device = device)

## translate prints its results and has no return statement, so `output` is None
output = translate(model, eval_iterator)




English Input: ['two', 'men', 'wearing', 'hats', '.']
Correct German Output: ['zwei', 'männer', 'mit', 'mützen', '.']
Predicted German Output: ['zwei', 'männer', 'mit', 'sonnenbrillen', '.']


English Input: ['young', 'woman', 'climbing', 'rock', 'face']
Correct German Output: ['junge', 'frau', 'klettert', 'auf', 'felswand']
Predicted German Output: ['eine', 'junge', 'frau', 'klettert', 'einen']


English Input: ['a', 'woman', 'is', 'playing', 'volleyball', '.']
Correct German Output: ['eine', 'frau', 'spielt', 'volleyball', '.']
Predicted German Output: ['eine', 'frau', 'spielt', 'tennis', '.']


English Input: ['three', 'men', 'are', 'walking', 'up', 'hill', '.']
Correct German Output: ['drei', 'männer', 'gehen', 'bergauf', '.']
Predicted German Output: ['drei', 'männer', 'gehen', 'einen', 'berg']


