<a href="https://colab.research.google.com/github/GuptAmit725/NLP/blob/main/Seq2SeqAttention_GerToEng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><I>Installling necessary packages

In [None]:
!pip install torchtext==0.6.0
!pip install -U spacy


In [None]:
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

<h1><I>Importing necessary packages

In [5]:
import torch 
import torch.nn as nn
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy as sp
import random
import re
import string
import nltk
import UTILS, UTILS_ATT
nltk.download('punkt')
from torch.utils.tensorboard import SummaryWriter 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


<h1><I>Loading the dataset

In [6]:
import spacy
spacy_eng = sp.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

<h1><I>Tokenizing the data 

In [7]:
def tokenize_ger(text):
  return [tok.text for tok in spacy_ger.tokenizer(text)]

def tokenize_eng(text):
  return [tok.text for tok in spacy_eng.tokenizer(text)]

In [8]:
german = Field(tokenize=tokenize_ger, lower=True, init_token='<sos>', eos_token='<eos>')
english = Field(tokenize=tokenize_eng, lower=True, init_token='<sos>', eos_token='<eos>')

<h1><I>Splitting the data into train, validation and test

In [9]:
train_data, validation_data, test_data = Multi30k.splits( exts=('.de', '.en'), fields=(german,english))

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:02<00:00, 595kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 169kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 166kB/s]


In [10]:
type(spacy_eng)

spacy.lang.en.English

<h1><I>Building the vocabulary

In [11]:
german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)

<h1><I>CReating the classes of Encoder and decoder and creating model out of it

In [40]:
class Encoder(nn.Module):
  def __init__(self, input_size, embedding_size, 
               hidden_size,num_layers,d_rate):
    super(Encoder,self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.dropout = nn.Dropout(d_rate)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, bidirectional = True)
    self.fc_hidden = nn.Linear(hidden_size*2, hidden_size)
    self.fc_cell = nn.Linear(hidden_size*2,hidden_size)

  def forward(self, x):
    x = self.embedding(x)
    x = self.dropout(x)
    output, (hidden,cell) = self.lstm(x)
    hidden = self.fc_hidden(torch.cat((hidden[0:1],hidden[1:2]),dim=2))
    cell = self.fc_cell(torch.cat((cell[0:1],cell[1:2]),dim=2))
    return output,hidden, cell

class selfAttention(nn.Module):
  def __init__(self,hidden_size):
    super(selfAttention,self).__init__()
    self.energy = nn.Linear(hidden_size*3,1)
    self.softmax = nn.Softmax(dim=0)
    self.relu = nn.ReLU()

  def forward(self,hidden,enc_output):
    energy = self.relu(self.energy(torch.cat((hidden,enc_output),dim=2)))
    attention = self.softmax(energy) #(sequence_length,N,1) 
    context_vector = torch.einsum("snk,snl->knl",attention,enc_output) #(N,1,hidden_size*2)

    return context_vector



class Decoder(nn.Module):
  def __init__(self,input_size, 
               embedding_size, 
               hidden_size, 
               output_size, 
               num_layers, d_rate
               ):
    super(Decoder,self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.dropout = nn.Dropout(d_rate)
    self.lstm = nn.LSTM(hidden_size*2 + embedding_size, hidden_size,num_layers)# removing dropout as we have only one layer.
    self.fc = nn.Linear(hidden_size,output_size)
    self.Attention = selfAttention(hidden_size)

  def forward(self,x,encoder_output, hidden, cell):
    x = x.unsqueeze(0)
    x = self.embedding(x)
    x = self.dropout(x) #(1,N,embedding_size)

    sequence_length = encoder_output.shape[0]
    hidden_reshaped = hidden.repeat(sequence_length,1,1)
    context_vector = self.Attention(hidden_reshaped,encoder_output)
    output, (hidden, cell) = self.lstm(torch.cat((context_vector,x),dim=2),(hidden,cell))
    pred = self.fc(output).squeeze(0)

    return pred, hidden, cell

class model(nn.Module):
  def __init__(self, encoder, decoder):
    super(model,self).__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, src, trg, teacher_force_ratio=0.5):
    batch_size = src.shape[1]
    target_len = trg.shape[0]
    target_vocab_size = len(english.vocab)

    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
    # Since we are doing word by word prediction

    encoder_output,hidden, cell = self.encoder(src)

    #grabing start token
    x = trg[0]

    for t in range(1,target_len):
      output, hidden, cell = self.decoder(x,encoder_output,hidden, cell)
      outputs[t] = output
      best_guess = output.argmax(1)
      x = target[t] if random.random() < teacher_force_ratio else best_guess
      # In this line we are feeding actual target value and output of the previous decoder alternatively.
    return outputs




<h1><I>Model training

In [13]:
!mkdir 'ckpt'

In [14]:
def evaluate(model, iterator, criterion):
    
    model_.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        
        for i, batch in enumerate(iterator):
            
            src = batch.src
            trg = batch.trg
            
            output = model(src, trg, 0) # turn off teacher forcing.
            
            # trg = [sen_len, batch_size]
            # output = [sen_len, batch_size, output_dim]
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)
            
            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
            
    return epoch_loss / len(iterator)


In [42]:
import math
#training hyper parameters
epochs = 10
lr = 0.001
batch_size = 64

#Model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_enc = len(german.vocab)
input_size_dec = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_droput = 0.5
dec_dropout = 0.5
#tensorboard

writer = SummaryWriter(f'runs/loss_plot')
step = 0
loss = 0
validation_loss = 0
best_validation_loss = float('inf')
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

enc_net = Encoder(input_size_enc,
                  encoder_embedding_size, 
                  hidden_size, num_layers,
                  dec_dropout).to(device)

dec_net = Decoder(input_size_dec, 
                  decoder_embedding_size,
                  hidden_size, output_size,
                  num_layers, dec_dropout
                  ).to(device)


model_ = model(enc_net, dec_net).to(device)
optimizer = torch.optim.Adam(model_.parameters(),lr = lr)
sent = "Ich komme zu hause."
if load_model:
  load_checkpoint(torch.load('my_checkpoint.pth.ptar'), model_, optimizer)

for epoch in range(epochs): 
  print(f'Epoch [{epoch} / {epochs}]')
  checkpoint = {"state_dict": model_.state_dict(), "optimizer": optimizer.state_dict()}
  UTILS.save_checkpoint(checkpoint) 
  model_.eval()
  tr_sent = UTILS_ATT.translate_sentence(model_,sent,german, english,device)
  print(" ".join(tr_sent))
  model_.train()
  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)
    #print('INP SHAPE ===== ', inp_data.shape, target.shape)
    output = model_(inp_data, target)

    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model_.parameters(), max_norm = 1)

    optimizer.step()

    writer.add_scalar('Training Loss', loss, global_step=step)
    #print('TRAINING LOSS ====', loss)
    step += 1


  validation_loss = evaluate(model_,valid_iterator, criterion) 
  if validation_loss < best_validation_loss:
      best_validation_loss = validation_loss
      torch.save(model_.state_dict(), 'ckpt/Seq2SeqModel.pt')
  #print(f"Epoch: {epoch+1:02} | Time {epoch_mins}m {epoch_secs}s")
  print(f"\tTrain Loss: {loss:.3f} | Train PPL: {math.exp(loss):7.3f}")
  print(f"\tValid Loss: {validation_loss:.3f} | Valid PPL: {math.exp(validation_loss):7.3f}")

Epoch [0 / 10]
=> Saving checkpoint
strewn volcano headset boy travel travel travel travel lot mark engine hollister situation charcoal tree cereal brindle whatever consults spending leafless that furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise furniture merchandise
	Train Loss: 3.221 | Train PPL:  25.064
	Valid Loss: 3.954 | Valid PPL:  52.122
Epoch [1 / 10]
=> Saving checkpoint
<unk> are to to a . . <eos>
	Train Loss: 2.463 | Train PPL:  11.736
	Valid Loss: 3.709 | Valid PPL:  40.803
Epoch [2 / 10]
=> Saving checkpoint
construction workers are preparing to . <eos>
	Train Loss: 2.693 | Train PPL:  14.780
	Valid Loss: 3.609 | Valid PPL:  36.916
Epoch [3 / 10]
=> Saving checkpoint
a workers are preparing to be . <eos>
	Train Loss: 2.888 | Train PPL:  17

In [43]:
def test():
  best_model = model(enc_net, dec_net).to(device)
  best_model.load_state_dict(torch.load('ckpt/Seq2SeqModel.pt'))
  
  test_loss = evaluate(best_model, test_iterator, criterion)
  
  print(f"Test Loss : {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f}")
  return best_model
    
best_model = test()

Test Loss : 3.469 | Test PPL:  32.114
