In [None]:
!pip install torchtext

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import spacy
import numpy as np
import random
import math
import time

In [None]:
# Reproducibility: drive every random number generator used in this notebook
# (Python's `random`, NumPy, PyTorch on CPU and on the current CUDA device)
# from one shared seed.
SEED = 100
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Download spaCy's small English and German pipeline packages. They are loaded
# by name with spacy.load(...) further down, and their tokenizers are what this
# notebook uses to split sentences into tokens.
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [None]:
# Load the German and English spaCy pipelines (German first, then English).
spacy_de, spacy_en = (
    spacy.load(pipeline_name)
    for pipeline_name in ('de_core_news_sm', 'en_core_web_sm')
)

In [None]:
def tokenize_de(text):
    """Split German text into a list of token strings, in reversed order.

    The tokens come from the German spaCy tokenizer (`spacy_de`). The order is
    reversed because, per the original author's note, the seq2seq paper
    reports better results when the source sentence is fed in backwards.
    """
    tokens = [token.text for token in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """Split English text into a list of token strings, keeping their order.

    The tokens come from the English spaCy tokenizer (`spacy_en`).
    """
    words = []
    for token in spacy_en.tokenizer(text):
        words.append(token.text)
    return words

In [None]:
# Both fields share the same preprocessing: every token is lowercased and each
# sentence is wrapped in <sos> ... <eos> markers.
_shared_field_options = dict(init_token = '<sos>', eos_token = '<eos>', lower = True)

# SR is the source (German) field, TR is the target (English) field.
SR = Field(tokenize = tokenize_de, **_shared_field_options)
TR = Field(tokenize = tokenize_en, **_shared_field_options)

# Download and load the train, validation and test splits of the Multi30k
# dataset, pairing the '.de' side with SR and the '.en' side with TR.
train_data, valid_data, test_data = Multi30k.splits(
    exts = ('.de', '.en'),
    fields = (SR, TR),
)

In [None]:
# Report how many examples each split contains (one line per split).
print(
    f"Number of training examples: {len(train_data.examples)}",
    f"Number of validation examples: {len(valid_data.examples)}",
    f"Number of testing examples: {len(test_data.examples)}",
    sep = "\n",
)

In [None]:
# Build one vocabulary per language from the training split only.
# min_freq = 2: a token must occur at least twice to get its own vocabulary entry.
for _field in (SR, TR):
    _field.build_vocab(train_data, min_freq = 2)

print(
    f"Unique tokens in source (de) vocabulary: {len(SR.vocab)}",
    f"Unique tokens in target (en) vocabulary: {len(TR.vocab)}",
    sep = "\n",
)

In [None]:
BATCH_SIZE = 128

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# BucketIterator groups examples into batches in a way that minimizes the
# amount of padding needed in both the source and the target sentences.
_all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _all_splits,
    batch_size = BATCH_SIZE,
    device = device,
)

In [None]:
class Encoder(nn.Module):
    """LSTM encoder for the source sentence.

    The source token indices are turned into dense vectors by an embedding
    layer, dropout is applied, and the result is fed through a multi-layer
    LSTM. Only the LSTM's final hidden and cell states are returned; the
    per-step outputs are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        input_dim: number of rows in the embedding table (source vocabulary size)
        emb_dim:   size of each token embedding
        hid_dim:   size of the LSTM hidden and cell states
        n_layers:  number of stacked LSTM layers
        dropout:   dropout probability, used on the embeddings and passed to the LSTM
        """
        super().__init__()

        # Exposed as attributes so other code can read the encoder's geometry.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings = input_dim, embedding_dim = emb_dim)
        self.rnn = nn.LSTM(
            input_size = emb_dim,
            hidden_size = hid_dim,
            num_layers = n_layers,
            dropout = dropout,
        )
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        src: [src len, batch size] tensor of token indices.
        Returns (hidden, cell), each [n layers, batch size, hid dim].
        """
        # token_vectors = [src len, batch size, emb dim]
        token_vectors = self.embedding(src)

        # The per-step outputs of the LSTM are not needed, only its final state.
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state

        return hidden, cell

In [None]:
# To verify the dimensions of our tensors, push a small dummy batch through a
# throw-away Encoder and print the shape of every intermediate tensor.
# (src, embedded, outputs, hidden and cell only exist inside Encoder.forward,
# so a notebook cell cannot print them directly.)
#
# fork_rng(devices = []) restores the CPU random state on exit, so building the
# throw-away encoder does not advance the seeded random stream used afterwards;
# no_grad skips autograd bookkeeping for this check.
with torch.random.fork_rng(devices = []), torch.no_grad():
    shape_check_encoder = Encoder(input_dim = 10, emb_dim = 8, hid_dim = 16, n_layers = 2, dropout = 0.5)
    shape_check_encoder.eval()  # dropout is inactive in eval mode

    # src = [src len, batch size]
    shape_check_src = torch.zeros(7, 4, dtype = torch.long)
    print("src shape:", shape_check_src.size())

    # embedded = [src len, batch size, emb dim]
    shape_check_embedded = shape_check_encoder.dropout(shape_check_encoder.embedding(shape_check_src))
    print("embedded shape:", shape_check_embedded.size())

    shape_check_outputs, (shape_check_hidden, shape_check_cell) = shape_check_encoder.rnn(shape_check_embedded)

    # outputs = [src len, batch size, hid dim * n directions]
    print("outputs shape:", shape_check_outputs.size())

    # hidden = [n layers * n directions, batch size, hid dim]
    print("hidden shape:", shape_check_hidden.size())

    # cell = [n layers * n directions, batch size, hid dim]
    print("cell shape:", shape_check_cell.size())

In [None]:
class Decoder(nn.Module):
    """LSTM decoder that advances the target sequence by a single time step.

    Each call embeds one token per batch element, runs it through the LSTM
    starting from the given hidden/cell states, and projects the LSTM output
    to one score per entry of the output vocabulary.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        output_dim: number of rows in the embedding table and number of output scores
        emb_dim:    size of each token embedding
        hid_dim:    size of the LSTM hidden and cell states
        n_layers:   number of stacked LSTM layers
        dropout:    dropout probability, used on the embeddings and passed to the LSTM
        """
        super().__init__()

        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings = output_dim, embedding_dim = emb_dim)
        self.rnn = nn.LSTM(
            input_size = emb_dim,
            hidden_size = hid_dim,
            num_layers = n_layers,
            dropout = dropout,
        )
        self.fc_out = nn.Linear(in_features = hid_dim, out_features = output_dim)
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        input:        [batch size] tensor with the current token index per sequence
        hidden, cell: LSTM states, each [n layers, batch size, hid dim]
        Returns (prediction, hidden, cell) where prediction is
        [batch size, output dim] and hidden/cell are the updated states.
        """
        # The LSTM expects a leading sequence-length axis; the decoder handles
        # one token at a time, so add an axis of size 1: [1, batch size].
        step_tokens = input.unsqueeze(dim = 0)

        # step_vectors = [1, batch size, emb dim]
        step_vectors = self.dropout(self.embedding(step_tokens))

        step_output, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state

        # Drop the length-1 sequence axis before projecting onto the vocabulary.
        prediction = self.fc_out(step_output.squeeze(0))

        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    The encoder turns the source batch into final (hidden, cell) states, which
    become the decoder's initial states. The decoder is then stepped once per
    target position, fed either the ground-truth token (teacher forcing) or
    its own previous prediction.
    """

    def __init__(self, encoder, decoder, device):
        """
        encoder: module whose forward(src) returns (hidden, cell); must expose
                 `hid_dim` and `n_layers`
        decoder: module whose forward(input, hidden, cell) returns
                 (prediction, hidden, cell); must expose `hid_dim`, `n_layers`
                 and `output_dim`
        device:  device on which the output tensor is allocated

        Raises AssertionError if encoder and decoder disagree on hidden size
        or number of layers.
        """
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final states are handed to the decoder unchanged, so
        # both must have equal hidden dimensions and equal numbers of layers.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Decode `trg` step by step, conditioned on `src`.

        src: [src len, batch size]
        trg: [trg len, batch size]; row 0 is used as the first decoder input
             (the <sos> token)
        teacher_forcing_ratio: probability, drawn once per time step, of
             feeding the ground-truth token instead of the model's own
             prediction (0.5 -> ground truth about 50% of the time)

        Returns a [trg len, batch size, output dim] tensor of predictions.
        Row 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Tensor that collects the decoder output of every time step;
        # allocated directly on the target device.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device = self.device)

        # The last hidden/cell states of the encoder are the initial states of the decoder.
        hidden, cell = self.encoder(src)

        # First input to the decoder is the <sos> token.
        input = trg[0, :]

        for t in range(1, trg_len):

            # Feed the current token and the previous states; receive the
            # predictions for this step and the new states.
            output, hidden, cell = self.decoder(input, hidden, cell)

            # Store the predictions for position t.
            outputs[t] = output

            # Decide whether this step uses teacher forcing.
            teacher_force = random.random() < teacher_forcing_ratio

            # Highest-scoring token for each sequence in the batch.
            apex = output.argmax(1)

            # With teacher forcing the next input is the actual next token,
            # otherwise it is the predicted token.
            input = trg[t] if teacher_force else apex

        return outputs

Training the Seq2Seq Model

In [None]:
# Model hyperparameters.
INPUT_DIM, OUTPUT_DIM = len(SR.vocab), len(TR.vocab)  # source / target vocabulary sizes
ENC_EMB_DIM = DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = DEC_DROPOUT = 0.5

# Encoder and decoder share HID_DIM and N_LAYERS.
enc = Encoder(
    input_dim = INPUT_DIM,
    emb_dim = ENC_EMB_DIM,
    hid_dim = HID_DIM,
    n_layers = N_LAYERS,
    dropout = ENC_DROPOUT,
)
dec = Decoder(
    output_dim = OUTPUT_DIM,
    emb_dim = DEC_EMB_DIM,
    hid_dim = HID_DIM,
    n_layers = N_LAYERS,
    dropout = DEC_DROPOUT,
)

# Assemble the full model and move its parameters to the chosen device.
model = Seq2Seq(encoder = enc, decoder = dec, device = device).to(device)

def init_weights(m):
    """Re-initialise, in place, every parameter reachable from module `m`.

    Each parameter is filled with values drawn from a uniform distribution
    between -0.08 and +0.08, the range the original author attributes to the
    seq2seq paper. Returns None.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)
        
# Run init_weights over the model.
# NOTE(review): Module.apply calls the function on every submodule as well as
# on the model itself, and init_weights re-draws all parameters of whatever
# module it is given, so most parameters are drawn more than once here. The
# final values are still uniform in [-0.08, 0.08].
model.apply(init_weights)

def count_parameters(model):
    """Return the number of trainable scalar values in `model`.

    Only parameters with `requires_grad` set are counted; each contributes
    its number of elements.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

n_trainable_params = count_parameters(model)
print(f'The model has {n_trainable_params:,} trainable parameters')

# Adam with its default settings.
optimizer = optim.Adam(params = model.parameters())

# Positions whose target is the <pad> token must not contribute to the loss,
# so the index of <pad> is passed to CrossEntropyLoss as ignore_index.
TR_PAD_IDX = TR.vocab.stoi[TR.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TR_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Train `model` for one pass over `iterator` and return the mean batch loss.

    model:     called as model(src, trg); expected to return
               [trg len, batch size, output dim]
    iterator:  sized iterable of batches exposing `.src` and `.trg`
    optimizer: optimizer over the model's parameters, stepped once per batch
    criterion: loss taking ([N, output dim] predictions, [N] targets)
    clip:      maximum gradient norm, enforced before every optimizer step

    Returns the sum of the per-batch losses divided by len(iterator).
    """
    # Training mode: layers such as dropout and batch normalization behave
    # differently here than in evaluation mode.
    model.train()

    batch_losses = []

    for batch in iterator:
        src, trg = batch.src, batch.trg

        # Discard the gradients accumulated for the previous batch.
        optimizer.zero_grad()

        output = model(src, trg)

        # Size of the last axis of the model output (one score per vocabulary entry).
        vocab_size = output.shape[-1]

        # Drop position 0 (the <sos> slot) and flatten time and batch axes so
        # the criterion sees one row per target token.
        flat_predictions = output[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        # Loss for this batch.
        loss = criterion(flat_predictions, flat_targets)

        # Back-propagate, then cap the gradient norm at `clip` before stepping.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of `model` over `iterator` without training.

    model:     called as model(src, trg, 0), i.e. with teacher forcing off;
               expected to return [trg len, batch size, output dim]
    iterator:  sized iterable of batches exposing `.src` and `.trg`
    criterion: loss taking ([N, output dim] predictions, [N] targets)

    Returns the sum of the per-batch losses divided by len(iterator).
    """
    # Evaluation mode.
    model.eval()

    batch_losses = []

    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # Third argument 0 turns teacher forcing off.
            output = model(src, trg, 0)

            vocab_size = output.shape[-1]

            # Drop position 0 (the <sos> slot) and flatten time and batch axes.
            flat_predictions = output[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            batch_losses.append(criterion(flat_predictions, flat_targets).item())

    return sum(batch_losses) / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns (whole_minutes, leftover_seconds) as ints; fractional seconds are
    truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

n_epoch = 10   # number of passes over the training split
clipp = 1      # gradient-norm threshold handed to train()

# Lowest validation loss seen so far; starting at +infinity means any finite
# first-epoch loss counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(n_epoch):

    # Time one full training pass plus one validation pass.
    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, clipp)
    valid_loss = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the parameters only when the validation loss improves.
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seqmodel_1.pt')

    # Perplexity (PPL) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} \n Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} \n Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Valid Loss: {valid_loss:.3f} \n  Valid PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Load the saved parameters and evaluate the model on the test split.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU runtime still loads when only a CPU is available.
best_state = torch.load('seq2seqmodel_1.pt', map_location = device)
model.load_state_dict(best_state)
test_loss = evaluate(model, test_iterator, criterion)

print(f' Test Loss: {test_loss:.3f} \n Test PPL: {math.exp(test_loss):7.3f} ')