<h1><I>Installing necessary packages

In [None]:
!pip install torchtext==0.4
!pip install -U spacy


In [None]:
# Download the small German and English spaCy pipelines; they are loaded by
# name ("de_core_news_sm" / "en_core_web_sm") further down in the notebook.
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

<h1><I>Importing necessary packages

In [12]:
import torch 
import torch.nn as nn
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy as sp
import random
import re
import string
import nltk
nltk.download('punkt')
from torch.utils.tensorboard import SummaryWriter 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


<h1><I>Loading the dataset

In [13]:
import spacy

# Load one spaCy pipeline per language; only their tokenizers are used by the
# tokenize_* helpers defined below.
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

<h1><I>Tokenizing the data 

In [14]:
def tokenize_ger(text):
  """Split a German string into a list of token strings using the spaCy tokenizer."""
  tokens = spacy_ger.tokenizer(text)
  return [token.text for token in tokens]

def tokenize_eng(text):
  """Split an English string into a list of token strings using the spaCy tokenizer."""
  tokens = spacy_eng.tokenizer(text)
  return [token.text for token in tokens]

In [15]:
# Source-side field: German text, tokenized with spaCy, lower-cased, and
# wrapped in <sos> / <eos> markers.
german = Field(
    tokenize=tokenize_ger,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

# Target-side field: same preprocessing, but with the English tokenizer.
english = Field(
    tokenize=tokenize_eng,
    lower=True,
    init_token='<sos>',
    eos_token='<eos>',
)

<h1><I>Splitting the data into train, validation and test

In [16]:
# Multi30k German -> English pairs: '.de' files are parsed with the `german`
# field (source) and '.en' files with the `english` field (target).
train_data, validation_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(german, english),
)

In [8]:
# Quick inspection of the object returned by loading the English pipeline.
# NOTE(review): this cell's execution count (8) is lower than that of the
# cells around it, so it was run out of order; consider removing it from the
# final notebook.
type(spacy_eng)

spacy.lang.en.English

<h1><I>Building the vocabulary

In [17]:
# Build both vocabularies from the training split only. Each vocabulary is
# capped at 10000 entries (max_size) and uses a minimum token frequency of 2
# (min_freq).
german.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)
english.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)

<h1><I>Creating the Encoder and Decoder classes and building the model from them

In [81]:
class Encoder(nn.Module):
  """LSTM encoder that turns source token indices into a final LSTM state.

  Args:
    input_size: number of entries in the embedding table (source vocab size).
    embedding_size: dimensionality of each token embedding.
    hidden_size: dimensionality of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    d_rate: dropout probability, applied to the embeddings and between
      LSTM layers.
  """

  def __init__(self, input_size, embedding_size,
               hidden_size, num_layers, d_rate):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.dropout = nn.Dropout(p=d_rate)
    self.embedding = nn.Embedding(num_embeddings=input_size,
                                  embedding_dim=embedding_size)
    self.lstm = nn.LSTM(input_size=embedding_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        dropout=d_rate)

  def forward(self, x):
    """Encode a batch of source sequences.

    Args:
      x: integer tensor of token indices. The LSTM is not batch_first, so
        dim 0 is treated as the sequence and dim 1 as the batch.

    Returns:
      (hidden, cell): the final hidden and cell states of the LSTM; the
      per-step outputs are discarded.
    """
    embedded = self.dropout(self.embedding(x))
    _, state = self.lstm(embedded)
    final_hidden, final_cell = state
    return final_hidden, final_cell

class Decoder(nn.Module):
  """Single-step LSTM decoder that predicts the next target token.

  Args:
    input_size: number of entries in the embedding table (target vocab size).
    embedding_size: dimensionality of each token embedding.
    hidden_size: dimensionality of the LSTM hidden and cell states; it must
      match the state handed in by the encoder.
    output_size: number of scores produced per example (target vocab size).
    num_layers: number of stacked LSTM layers.
    d_rate: dropout probability, applied to the embeddings and between
      LSTM layers.
  """

  def __init__(self, input_size,
               embedding_size,
               hidden_size,
               output_size,
               num_layers, d_rate
               ):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.dropout = nn.Dropout(d_rate)
    self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=d_rate)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x, hidden, cell):
    """Run one decoding step.

    Args:
      x: 1-D tensor with one token index per batch element.
      hidden: LSTM hidden state from the encoder or the previous step.
      cell: LSTM cell state from the encoder or the previous step.

    Returns:
      (pred, hidden, cell): unnormalised scores with one row per batch
      element and `output_size` columns, plus the updated LSTM state.
    """
    # The LSTM is not batch_first, so add a leading sequence dim of length 1.
    x = x.unsqueeze(0)
    embedded = self.dropout(self.embedding(x))

    # Feed the incoming state to the LSTM. Without it the LSTM starts from
    # zeros on every step and the decoder never sees the encoded source
    # sentence or its own history.
    output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

    # Drop the length-1 sequence dim again before returning the scores.
    pred = self.fc(output).squeeze(0)

    return pred, hidden, cell


class model(nn.Module):
  """Sequence-to-sequence wrapper that chains an encoder and a decoder.

  Args:
    encoder: module whose forward(src) returns a (hidden, cell) pair.
    decoder: module whose forward(x, hidden, cell) returns
      (scores, hidden, cell) and that exposes its output layer as `fc`.
  """

  def __init__(self, encoder, decoder):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, src, trg, teacher_force_ratio=0.5):
    """Decode `trg` one token at a time, conditioned on `src`.

    Args:
      src: source token indices; dim 1 is the batch.
      trg: target token indices; dim 0 is the sequence and trg[0] holds the
        start-of-sentence tokens.
      teacher_force_ratio: probability of feeding the ground-truth token,
        rather than the decoder's own best guess, as the next input.

    Returns:
      Tensor of shape (target_len, batch_size, target_vocab_size) holding the
      decoder scores for every step; row 0 is never written and stays zero.
    """
    batch_size = src.shape[1]
    target_len = trg.shape[0]
    # Take the vocabulary size and the device from the arguments and
    # sub-modules instead of from notebook globals, so the module also works
    # outside the training cell.
    target_vocab_size = self.decoder.fc.out_features

    outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                          device=src.device)

    hidden, cell = self.encoder(src)

    # The first decoder input is the start token of every sequence.
    x = trg[0]

    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[t] = output
      best_guess = output.argmax(1)
      # Teacher forcing: pick at random between the ground-truth token and
      # the model's own prediction as the next input. This must read `trg`,
      # the argument, not a global left over from the training loop.
      x = trg[t] if random.random() < teacher_force_ratio else best_guess
    return outputs




<h1><I>Model training

In [None]:
# Training hyperparameters
epochs = 20
lr = 0.0001
batch_size = 64

# Model hyperparameters
load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_enc = len(german.vocab)
input_size_dec = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 512
num_layers = 2
enc_droput = 0.5  # (sic) name kept so any other cell that reads it still works
dec_dropout = 0.55

# TensorBoard writer; `step` counts optimisation steps across all epochs.
writer = SummaryWriter('runs/loss_plot')
step = 0

# Bucket examples of similar source length together to reduce padding.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size = batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src),
    device = device
)

# The encoder uses its own dropout rate (it was previously given the
# decoder's rate, leaving `enc_droput` unused).
enc_net = Encoder(input_size_enc,
                  encoder_embedding_size,
                  hidden_size, num_layers,
                  enc_droput).to(device)

dec_net = Decoder(input_size_dec,
                  decoder_embedding_size,
                  hidden_size, output_size,
                  num_layers, dec_dropout
                  ).to(device)


model_ = model(enc_net, dec_net).to(device)

# The optimizer must be created after the model exists; building it earlier
# raises NameError on a fresh kernel.
optimizer = torch.optim.Adam(model_.parameters(), lr=lr)

# Padding positions are excluded from the loss.
pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
  # NOTE(review): load_checkpoint is not defined in the visible part of this
  # notebook; confirm it exists before enabling load_model.
  # torch.load unpickles the file, so only load checkpoints you trust.
  load_checkpoint(torch.load('my_checkpoint.pth.ptar'), model_, optimizer)

for epoch in range(epochs):
  print(f'Epoch [{epoch} / {epochs}]')
  # Checkpoint saving is currently disabled (no save_checkpoint helper is
  # visible in this notebook).

  model_.train()

  for batch_idx, batch in enumerate(train_iterator):
    inp_data = batch.src.to(device)
    target = batch.trg.to(device)

    output = model_(inp_data, target)

    # Drop step 0 (the <sos> position, never predicted) and flatten the
    # sequence and batch dims so CrossEntropyLoss sees (N, vocab) vs (N,).
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()
    loss = criterion(output, target)

    loss.backward()

    # Clip the gradient norm to limit exploding gradients in the LSTMs.
    torch.nn.utils.clip_grad_norm_(model_.parameters(), max_norm = 1)

    optimizer.step()

    # Log a plain Python float rather than the loss tensor.
    writer.add_scalar('Training Loss', loss.item(), global_step=step)
    step += 1

Epoch [0 / 20]
Epoch [1 / 20]
Epoch [2 / 20]
Epoch [3 / 20]
Epoch [4 / 20]
Epoch [5 / 20]
Epoch [6 / 20]
Epoch [7 / 20]
Epoch [8 / 20]
Epoch [9 / 20]
Epoch [10 / 20]
Epoch [11 / 20]
Epoch [12 / 20]
Epoch [13 / 20]
Epoch [14 / 20]
Epoch [15 / 20]
Epoch [16 / 20]
