<a href="https://colab.research.google.com/github/Lmalviya/machineTranslationTask/blob/main/machineTranslationUsingLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [221]:
!pip install datasets --quiet
!pip install scikit-learn --quiet

In [222]:
import re
import random
from tqdm import tqdm

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.utils.data import Dataset

from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [223]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

In [224]:
param = {
    'epochs': 1,
    'batchSize': 1,
    'lr': 5e-04,
    'shuffle': True,
    'hiddenSize': 128,
    'embeddingSize': 128,
}

In [225]:
UNK_ID = 0
SOS_ID = 1
PAD_ID = 2
EOS_ID = 3
MAX_LENGTH = 130

class Dictionalry:
  def __init__(self, name):
    self.name = name
    self.word2index = {"UNK": 0, "SOS": 1, "PAD": 2, "EOS": 3}
    self.word2count = {}
    self.index2word = {0: "UNK", 1: "SOS", 2: "PAD", 3: "EOS"}
    self.count = 4

  def cleaning(self, text):
    text = text.lower()
    textList = re.findall(r"\w+|[^\w\s]", text)
    textList = [word.strip() for word in textList]
    return textList

  def addWords(self, sentence):
    wordList = self.cleaning(sentence)
    for word in wordList:
      if word not in self.word2index.keys():
        self.word2index[word] = self.count
        self.index2word[self.count] = word
        self.word2count[word] = 1
        self.count += 1
      else:
        self.word2count[word] += 1

  def __name__(self):
    return self.name

  def __size__(self):
    return self.count-2

  def __wordFrequency__(self, word):
    return self.word2count[word]

  def __index__(self, word):
    return self.word2index[word]

  def __word__(self, index):
    return self.index2word[index]


langEn = Dictionalry("english_dictionary")
langFr = Dictionalry("french_dictionary")

## Download dataset and build the vocab

In [226]:
books = load_dataset("opus_books", "en-fr")

In [227]:
dataset_en_to_fr_list = []
for item in books['train']:
  tmpItem = {
      'english': item['translation']['en'],
      'french': item['translation']['fr']
  }
  dataset_en_to_fr_list.append(tmpItem)

dataset_df = pd.DataFrame(dataset_en_to_fr_list)
print(f'Number of data points: {len(dataset_df)}')
dataset_df.head()

Number of data points: 127085


Unnamed: 0,english,french
0,The Wanderer,Le grand Meaulnes
1,Alain-Fournier,Alain-Fournier
2,First Part,PREMIÈRE PARTIE
3,I,CHAPITRE PREMIER
4,THE BOARDER,LE PENSIONNAIRE


In [228]:
## Build the courpus
englishCorpus = ' '.join(dataset_df['english'].to_list())
frenchCorpus = ' '.join(dataset_df['french'].to_list())

## Build the dictionary
langEn.addWords(englishCorpus)
langFr.addWords(frenchCorpus)

In [229]:
param['vocab_en'] = langEn.__size__()
param['vocab_fr'] = langFr.__size__()
print(f'English vocab size: {langEn.__size__()}')
print(f'French vocab size: {langFr.__size__()}')

English vocab size: 55267
French vocab size: 68758


## Build dataloader

In [230]:
class preprocessing(Dataset):
  def __init__(self, dataFrame, vocab_en, vocab_fr):
    super(preprocessing, self).__init__()
    self.data = dataFrame
    self.vocab_en = vocab_en
    self.vocab_fr = vocab_fr


  def encode(self, text, vocab, maxlen=128):
    word2indexList = [SOS_ID]
    wordList = vocab.cleaning(text)
    for word in wordList:
      try:
        num = vocab.__index__(word)
      except:
        num = UNK_ID
      word2indexList.append(num)

    ## add SOS, EOS, UNK and PAD TOKENS
    if len(word2indexList) ==  maxlen+1:
      word2indexList.append(EOS_ID)
    elif len(word2indexList) >  maxlen+1:
      word2indexList = word2indexList[: maxlen+1]
      word2indexList.append(EOS_ID)
    else:
      tmpNum =  maxlen + 1 - len(word2indexList)
      for i in range(tmpNum):
        word2indexList.append(PAD_ID)
      word2indexList.append(EOS_ID)

    return word2indexList

  def __getitem__(self, index):
    langEnText = self.data.english[index]
    langFrText = self.data.french[index]
    word2IndexList_en = self.encode(langEnText, self.vocab_en)
    word2IndexList_fr = self.encode(langFrText, self.vocab_fr)
    return {
        "input_ids": torch.tensor(word2IndexList_en, dtype=torch.int),
        "target_ids": torch.tensor(word2IndexList_fr, dtype=torch.int)
    }

  def __len__(self):
    return len(self.data)


In [231]:
train_size = 0.80
train_dataset = dataset_df.sample(frac=train_size, random_state=200).reset_index(drop=True)
valid_dataset = dataset_df.drop(train_dataset.index).reset_index(drop=True)

print("Full dataset: {}".format(train_dataset.shape[0]+valid_dataset.shape[0]))
print("Train dataset: {}".format(train_dataset.shape))
print("Vaild dataset: {}".format(valid_dataset.shape))

train_dataset.head()

Full dataset: 127085
Train dataset: (101668, 2)
Vaild dataset: (25417, 2)


Unnamed: 0,english,french
0,I'll soon make you dry enough!',"Je vais bientôt vous faire sécher, je vous en ..."
1,"""Just one.",-- Une seule.
2,"I went off from the shop, as if driven along b...",Je m'écartai de la boutique comme repoussée pa...
3,"One could see, by the way in which her girdle ...",On voyait à la manière dont sa ceinture lui re...
4,"It evidently wanted to go on, and prognosticat...","Je sentais qu’il aurait aimé aller plus loin, ..."


In [232]:
train_set = preprocessing(train_dataset[:200], langEn, langFr)
validation_set = preprocessing(valid_dataset[:200], langEn, langFr)

trainLoaderParam = {'batch_size': param['batchSize'], 'shuffle': param['shuffle'], 'num_workers': 0}
valLoaderParam = {'batch_size': param['batchSize'], 'shuffle': param['shuffle'], 'num_workers': 0}

param['traindata'] = DataLoader(train_set, **trainLoaderParam)
param['valdata'] = DataLoader(validation_set, **valLoaderParam)

# Encoder-Decoder architecture

In [233]:
class EncoderLSTM(nn.Module):
  def __init__(self, vocabSize, hiddenSize, embeddingSize, dropout_p=0.1):
    super(EncoderLSTM, self).__init__()

    self.embedding = nn.Embedding(vocabSize, embeddingSize)
    self.LSTM = nn.LSTM(embeddingSize, hiddenSize, batch_first=True)
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, inputs):
    embedding = self.embedding(inputs)
    embedding = self.dropout(embedding)
    output, hidden = self.LSTM(embedding)
    return output, hidden

param['encoder'] = EncoderLSTM(param['vocab_en'], param['hiddenSize'], param['embeddingSize'])

In [234]:
class DecoderLSTM(nn.Module):
  def __init__(self, vocabSize, hiddenSize, embeddingSize, dropout_p=0.1):
    super(DecoderLSTM, self).__init__()

    self.embedding = nn.Embedding(vocabSize, embeddingSize)
    self.LSTM = nn.LSTM(embeddingSize, hiddenSize, batch_first=True)
    self.linear = nn.Linear(hiddenSize, vocabSize)
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
    batch_size = encoder_outputs.size(0)

    decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device)
    decoder_input.fill_(SOS_ID)  # Fill with the integer ID of the SOS token

    decoder_hidden = encoder_outputs
    decoder_output_list = []

    for i in range(MAX_LENGTH):
      decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
      decoder_output_list.append(decoder_output)

      if target_tensor is not None:
        ## Teacher forcing: Feed the target as the next input
        decoder_input = target_tensor[:, i].unsqueeze(1)
      else:
        ## Without teacher forcing: use its own predictions as the next input
        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze(-1).detach()

    decoder_output = torch.cat(decoder_output_list, dim=1)
    decoder_output = torch.argmax(decoder_output, dim=-1)
    return decoder_output, decoder_hidden

  def forward_step(self, inputs, hidden):
    embedding = self.embedding(inputs)
    output, hidden = self.LSTM(embedding)
    output = self.linear(output)
    return output, hidden

param['decoder'] = DecoderLSTM(param['vocab_fr'], param['hiddenSize'], param['embeddingSize'])

## Optimizer

In [235]:
param['encoderOptim'] = optim.Adam(param['encoder'].parameters(), lr=param['lr'])
param['decoderOptim'] = optim.Adam(param['decoder'].parameters(), lr=param['lr'])

## Loss function

In [236]:
def loss_fn(predected, targets):
  missed_predictions = torch.sum((predected != targets) & (targets != 0)& (targets != 1)& (targets != 2) & (targets != 3)).item()
  predict_words = torch.sum((targets != 0) & (targets != 1) & (targets != 2) & (targets != 3)).item()
  loss = missed_predictions / predict_words
  return torch.tensor(loss, requires_grad=True, dtype=torch.float)

## Training loop

In [237]:
def fit(param):
  best_loss = 1000
  encoder_model = param['encoder'].to(device)
  decoder_model = param['decoder'].to(device)
  encoder_optim = param['encoderOptim']
  decoder_optim = param['decoderOptim']
  traindata = param['traindata']
  valdata = param['valdata']

  plot_losses = []

  for epoch in range(param['epochs']):
    train_cumulative = 0.0
    encoder_model.train()
    decoder_model.train()
    for data in traindata:
      encoder_optim.zero_grad()
      decoder_optim.zero_grad()

      input_ids = data['input_ids'].to(device)
      target_ids = data['target_ids'].to(device)

      encoder_output, encoder_hidden = encoder_model(input_ids)
      decoder_output, _ = decoder_model(encoder_output, encoder_hidden, target_ids)
      # decoder_output, _ = decoder_model(encoder_output, encoder_hidden)

      loss = loss_fn(decoder_output, target_ids)
      loss.backward()

      encoder_optim.step()
      decoder_optim.step()
      train_cumulative += loss.item()

    avg_loss = train_cumulative/len(traindata)
    if epoch%100 == 0:
      print(f"\nTraining loss at epoch {epoch}: {avg_loss}")
    # plot_losses.append(train_cumulative)


In [238]:
fit(param)


Training loss at epoch 0: 1.0


## Testing

In [239]:
def test(param, tokezier, text, vocab_en, vocab_fr):
  encoder_model = param['encoder'].to(device)
  decoder_model = param['decoder'].to(device)
  input_ids = tokezier.encode(text, vocab_en)
  input_ids = torch.tensor(input_ids, dtype=torch.int).to(device)
  encoder_output, encoder_hidden = encoder_model(input_ids)
  decoder_output, _ = decoder_model(encoder_output, encoder_hidden)
  decoder_output = decoder_output.detach().cpu().numpy().tolist()

  wordList = []
  for index in decoder_output[0]:
    if index in [0, 1, 2, 3]:
      break
    word = vocab_fr.__word__(index)
    wordList.append(word)
  return ' '.join(wordList)


In [240]:
tokenizer = preprocessing(None, langEn, langFr)
text = "hi, i am lakhan from hello world."
print(test(param, tokenizer, text, langEn, langFr))