# Hindi to English Transliteration using Sequence to Sequence models

The purpose of this notebook is as follows:

- Use the Dakshina dataset to get Hindi and English word data for transliteration. A sample of the data is shown below:
```
अंकगणित	ankganit	3
अंकल	uncle	4
अंकुर	ankur	4
```
- Create an Encoder-Decoder setup using PyTorch, which will be trained and tested on this corpus
- Use an Encoder-Decoder with Attention and compare its performance against the model without attention




In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
import random

try:
  import pytorch_lightning as pl
except:
  !pip install --quiet pytorch-lightning>=1.5
  import pytorch_lightning as pl

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

def set_seed(seed):
  '''
  Seed every random number generator used in this notebook.

  seed : Integer handed to numpy, torch (CPU and CUDA) and Python's random module
  '''
  seeders = (
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed,
      torch.cuda.manual_seed_all,
      random.seed,
  )
  for apply_seed in seeders:
    apply_seed(seed)

set_seed(132)


## Get the dataset


In [None]:
!wget https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar

--2024-05-06 18:56:26--  https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.111.207, 142.251.16.207, 172.253.62.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.111.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2008340480 (1.9G) [application/x-tar]
Saving to: ‘dakshina_dataset_v1.0.tar’


2024-05-06 18:56:46 (96.4 MB/s) - ‘dakshina_dataset_v1.0.tar’ saved [2008340480/2008340480]



In [None]:
!tar -xf '/content/dakshina_dataset_v1.0.tar'

VAL_PATH = '/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.dev.tsv'
TRAIN_PATH = '/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv'
TEST_PATH = '/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv'

train_df = pd.read_csv(TRAIN_PATH, sep='\t', names=['Hindi', 'English', ''])
val_df = pd.read_csv(VAL_PATH, sep='\t', names=['Hindi', 'English', ''])
test_df = pd.read_csv(TEST_PATH, sep='\t', names=['Hindi', 'English', ''])

In [None]:
# Peek at the first five rows (head() defaults to n=5)
train_df.head()

Unnamed: 0,Hindi,English,Unnamed: 3
0,अं,an,3
1,अंकगणित,ankganit,3
2,अंकल,uncle,4
3,अंकुर,ankur,4
4,अंकुरण,ankuran,3


In [None]:
# Report the (rows, columns) shape of every split, one line per split
print('\n'.join(
    f'{label} data shape : {frame.shape}'
    for label, frame in (('Train', train_df), ('Val', val_df), ('Test', test_df))
))

Train data shape : (44204, 3)
Val data shape : (4358, 3)
Test data shape : (4502, 3)


In [None]:
#To tackle a problematic case of नं	-> nan
#pandas reads the romanization "nan" as a missing value; put the original string back
#so the target keeps the spelling found in the dataset instead of an altered one
train_df.fillna('nan', inplace=True)
val_df.fillna('nan', inplace=True)
test_df.fillna('nan', inplace=True)

In [None]:
# Per-column flag: True if the column still contains any missing value
train_df.isnull().any()

Hindi      False
English    False
           False
dtype: bool

Create the class `LangProcess`, which takes a language name and its words and builds an object holding the character-to-index mappings for that language, along with other information about the corpus (character counts, vocabulary size and maximum word length).

In [None]:
class LangProcess():
  '''
  Character-level vocabulary for one language.

  Attributes built while scanning the words:
    char_to_idx / idx_to_char : Two-way mapping between symbols and integer indices
    char_count : Number of occurrences of every character seen
    chars : Symbols in index order
    chars_size : Number of symbols, including SOS and EOS
    max_input_len : Length of the longest word plus one slot for EOS
  '''
  def __init__(self, lang_name, lang_data):
    '''
    lang_name : Name of the language
    lang_data : Data for this language (words)
    '''
    self.lang_name = lang_name
    #The two special tokens always take indices 0 and 1
    self.chars = ['SOS', 'EOS']
    self.char_to_idx = {token: position for position, token in enumerate(self.chars)}
    self.idx_to_char = {position: token for position, token in enumerate(self.chars)}
    self.char_count = {}
    self.chars_size = len(self.chars)
    self.max_input_len = 1 #Including EOS

    for word in lang_data:
      self.max_input_len = max(self.max_input_len, len(word) + 1)
      self.add_char(word)

  def add_char(self, word):
    '''
    Register every character of word, giving unseen characters the next free index.
    '''
    for symbol in word:
      if symbol not in self.char_to_idx:
        #First occurrence : extend both mappings and the ordered symbol list
        new_index = self.chars_size
        self.char_to_idx[symbol] = new_index
        self.idx_to_char[new_index] = symbol
        self.chars.append(symbol)
        self.chars_size = new_index + 1
        self.char_count[symbol] = 1
        continue
      #Known character : only the frequency changes
      self.char_count[symbol] += 1

  def indices_to_word(self, indices):
    '''
    Turn a sequence of indices back into a string, stopping at the first SOS (0) or EOS (1).
    '''
    decoded = []
    for index in indices:
      if index in (0, 1):
        break
      decoded.append(self.idx_to_char[index])
    return "".join(decoded)

In [None]:
# Build one vocabulary per language from the training split only;
# the dataframe column names double as the language names
input_lang, output_lang = (
    LangProcess(column, train_df[column]) for column in ('Hindi', 'English')
)

In [None]:
def display_stats(lang_obj):
  '''
  Print a short summary of a language object.

  lang_obj : Object exposing lang_name, char_to_idx, chars_size and max_input_len
  '''
  summary_lines = (
      f"For Language : {lang_obj.lang_name}",
      f"Corpus char - index mapping : {lang_obj.char_to_idx}",
      f"Character size : {lang_obj.chars_size}",
      f"Max input size : {lang_obj.max_input_len}",
  )
  for line in summary_lines:
    print(line)

# Summaries for the source and target vocabularies, separated by a divider line
display_stats(lang_obj=input_lang)
print("*********************************")
display_stats(lang_obj=output_lang)

For Language : Hindi
Corpus char - index mapping : {'SOS': 0, 'EOS': 1, 'अ': 2, 'ं': 3, 'क': 4, 'ग': 5, 'ण': 6, 'ि': 7, 'त': 8, 'ल': 9, 'ु': 10, 'र': 11, 'श': 12, 'द': 13, 'न': 14, 'े': 15, 'भ': 16, '्': 17, 'ष': 18, 'ा': 19, 'ी': 20, 'ठ': 21, 'य': 22, 'ो': 23, 'ू': 24, 'ज': 25, 'च': 26, 'म': 27, 'ट': 28, 'ड': 29, 'व': 30, 'ः': 31, 'ह': 32, 'प': 33, 'ृ': 34, 'स': 35, 'ध': 36, 'ै': 37, '़': 38, 'ब': 39, 'उ': 40, 'ॉ': 41, 'ई': 42, 'ख': 43, 'घ': 44, 'छ': 45, 'ञ': 46, 'फ': 47, 'ओ': 48, 'थ': 49, 'ढ': 50, 'झ': 51, 'ौ': 52, 'आ': 53, 'इ': 54, 'ँ': 55, 'ए': 56, 'ऊ': 57, 'ॅ': 58, 'ऋ': 59, 'ऑ': 60, 'ऐ': 61, 'औ': 62, 'ङ': 63, 'ॐ': 64}
Character size : 65
Max input size : 20
*********************************
For Language : English
Corpus char - index mapping : {'SOS': 0, 'EOS': 1, 'a': 2, 'n': 3, 'k': 4, 'g': 5, 'i': 6, 't': 7, 'u': 8, 'c': 9, 'l': 10, 'e': 11, 'r': 12, 's': 13, 'h': 14, 'd': 15, 'b': 16, 'y': 17, 'o': 18, 'j': 19, 'z': 20, 'm': 21, 'v': 22, 'w': 23, 'p': 24, 'f': 25, 'x': 26, 'q':

Now we need utility functions that convert each input word into a sequence of numbers, using the character-to-index mapping. This gives us two lists, one for Hindi and one for English, containing the numeric representation of each word as character indices.

In [None]:
def create_word_vector(lang_obj, word):
  '''
  Map every character of word to its index in lang_obj.char_to_idx.

  lang_obj : Object holding the char_to_idx mapping
  word : String to convert
  Returns a list of integer indices; a character missing from the mapping raises KeyError.
  '''
  lookup = lang_obj.char_to_idx
  indices = []
  for symbol in word:
    indices.append(lookup[symbol])
  return indices

def create_dataloader(data_df = train_df, batch_size=128, shuffle=True, num_workers=4):
  '''
  Convert a dataframe of word pairs into a DataLoader of padded index tensors.

  data_df : Dataframe with 'Hindi' and 'English' columns
  batch_size : Number of word pairs per batch
  shuffle : Whether the DataLoader reshuffles the data every epoch
  num_workers : Worker processes used by the DataLoader

  Returns a DataLoader yielding (input, target) LongTensor batches of shape
  (batch, input_lang.max_input_len) and (batch, output_lang.max_input_len).

  The tensors stay on the CPU. CUDA tensors should not be shared with DataLoader
  worker processes, and the training / evaluation loops already move each batch
  to the device.
  '''
  data_len = data_df.shape[0]
  print(f"Processing {data_len} entries.....")

  #One zero-initialised row per word, as wide as the longest training word plus EOS.
  #Unused trailing positions therefore keep index 0, which is also the SOS index.
  #int64 matches the LongTensor dtype expected by nn.Embedding, so no conversion copy is needed.
  input_data = np.zeros((data_len, input_lang.max_input_len), dtype=np.int64)
  output_data = np.zeros((data_len, output_lang.max_input_len), dtype=np.int64)

  for idx, (input_w, output_w) in enumerate(zip(data_df['Hindi'], data_df['English'])):
    #Every sequence is terminated with EOS before being written into its row
    input_indices = create_word_vector(input_lang, input_w) + [input_lang.char_to_idx['EOS']]
    output_indices = create_word_vector(output_lang, output_w) + [output_lang.char_to_idx['EOS']]

    input_data[idx,:len(input_indices)] = input_indices
    output_data[idx,:len(output_indices)] = output_indices

  dataset = TensorDataset(
      torch.from_numpy(input_data),
      torch.from_numpy(output_data)
      )

  #Pinned host memory speeds up the per-batch copy to the GPU; it is only useful when CUDA is present
  dataloader = DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
      num_workers=num_workers,
      pin_memory=torch.cuda.is_available()
      )

  return dataloader



In [None]:
# Only the training split is shuffled; validation and test keep the file order
train_dataloader = create_dataloader(data_df=train_df, batch_size=128, shuffle=True)
val_dataloader = create_dataloader(data_df=val_df, batch_size=128, shuffle=False)
test_dataloader = create_dataloader(data_df=test_df, batch_size=128, shuffle=False)

Processing 44204 entries.....




Processing 4358 entries.....
Processing 4502 entries.....


In [None]:
# Size of the target tensor in one training batch
next(iter(train_dataloader))[1].size()

  self.pid = os.fork()


torch.Size([128, 21])

In [None]:
# Size of the same target tensor once flattened to one dimension
next(iter(train_dataloader))[1].reshape(-1).shape

torch.Size([2688])

## Encoder-Decoder architecture using PyTorch



In [None]:
class Encoder(nn.Module):
  '''
  Embedding + dropout + GRU encoder for batches of character index sequences.

  input_size : Number of symbols in the input vocabulary
  hidden_size : Size of each embedding vector and of the GRU hidden state
  dropout : Dropout probability applied to the embedded characters
  '''
  def __init__(self, input_size, hidden_size, dropout=0.1):
    super().__init__()
    self.hidden_size = hidden_size
    #Lookup table : every character index becomes a learned vector of length hidden_size
    self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
    #batch_first=True : tensors are laid out as (batch, sequence, features)
    self.GRU = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
    self.dropout = nn.Dropout(p=dropout)

  def forward(self, input):
    '''
    input : LongTensor of character indices, one row per word

    Returns (output, hidden) exactly as produced by the GRU : the per-step outputs
    and the final hidden state.
    '''
    char_vectors = self.embedding(input)
    #Dropout on the embeddings acts as regularization during training
    regularized = self.dropout(char_vectors)
    #The GRU walks the whole sequence internally
    return self.GRU(regularized)



In [None]:
class Decoder(nn.Module):
  '''
  GRU decoder that emits one output character per step, starting from the SOS token.

  output_size : Number of symbols in the output vocabulary
  hidden_size : Size of the embedding vectors and of the GRU hidden state
  dropout : Probability for the dropout layer (the layer is created but not applied in forward)
  max_length : Number of decoding steps; None falls back to output_lang.max_input_len
  sos_index : Index of the start token; None falls back to output_lang.char_to_idx['SOS']
  '''
  def __init__(self, output_size, hidden_size, dropout=0.1, max_length=None, sos_index=None):

    super().__init__()
    self.hidden_size = hidden_size
    #Kept as given (possibly None) and resolved on every forward call
    self.max_length = max_length
    self.sos_index = sos_index
    self.embedding = nn.Embedding(output_size, hidden_size)
    self.GRU = nn.GRU(hidden_size, hidden_size, batch_first = True)
    self.linear = nn.Linear(hidden_size, output_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, encoder_output, hidden, target_outputs=None):
    '''
    encoder_output : Encoder outputs; only the batch size and the device are read here
    hidden : Final encoder hidden state (context vector), used as the initial decoder state
    target_outputs : Target indices (batch, steps). When given, teacher forcing is used;
                     when None, the decoder feeds back its own most likely token.

    Returns (log_probs, final_hidden, None). log_probs has shape (batch, steps, output_size).
    The trailing None is the placeholder for attention weights, so that this decoder and
    an attention decoder share one return shape.
    '''
    max_length = output_lang.max_input_len if self.max_length is None else self.max_length
    sos_index = output_lang.char_to_idx['SOS'] if self.sos_index is None else self.sos_index

    batch_size = encoder_output.shape[0]
    #Every sequence starts from SOS. Allocating on the encoder output's device keeps the
    #decoder working wherever the model and its inputs actually live.
    decoder_input = torch.full(
        (batch_size, 1), sos_index, dtype=torch.long, device=encoder_output.device
    )
    decoder_hidden = hidden
    decoder_outputs = []

    for idx in range(max_length):
      decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
      decoder_outputs.append(decoder_output)

      if target_outputs is not None:
        #Teacher forcing : the true token of this step is the input of the next step
        decoder_input = target_outputs[:,idx].unsqueeze(1)
      else:
        #Inference : feed the most likely token back in. decoder_output is
        #(batch, 1, output_size), so argmax over the last axis gives (batch, 1).
        decoder_input = decoder_output.argmax(dim=-1).detach()

    #List of max_length tensors (batch, 1, output_size) -> (batch, max_length, output_size)
    decoder_outputs = torch.cat(decoder_outputs, dim=1)
    #Log-probabilities over the vocabulary, the input expected by nn.NLLLoss
    decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)

    return decoder_outputs, decoder_hidden, None

  def forward_step(self, input, hidden):
    '''
    Run one decoding step : embed the previous token, apply ReLU, advance the GRU
    and project the GRU output to vocabulary logits.

    Returns (logits, hidden).
    '''
    embedded = self.embedding(input)
    embedded = F.relu(embedded)
    output, hidden = self.GRU(embedded, hidden)
    output = self.linear(output)
    return output, hidden

## Creating Training related functionalities

In [None]:
def train_epoch(dataloader, encoder, decoder, loss_fn, encoder_optimizer, decoder_optimizer):
  '''
  Train the encoder and decoder for one pass over dataloader.

  dataloader : Iterable of (input, target) index tensor batches
  encoder, decoder : Modules to train; the decoder is called with the targets (teacher forcing)
  loss_fn : Loss taking (predictions of shape (N, classes), labels of shape (N,))
  encoder_optimizer, decoder_optimizer : Optimizers for the two modules

  Returns the mean of the per-batch losses. Raises ZeroDivisionError if the
  dataloader yields no batches.
  '''
  #Make sure dropout is active, even if the modules were last used in eval mode
  encoder.train()
  decoder.train()

  #Send batches to wherever the model lives; fall back to the notebook-level
  #device for an encoder without parameters
  first_param = next(encoder.parameters(), None)
  batch_device = first_param.device if first_param is not None else device

  total_loss = []

  for data, target in dataloader:
    data = data.to(batch_device)
    target = target.to(batch_device)

    #Clear gradients left over from the previous batch
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_output, encoder_hidden = encoder(data)
    decoder_output, decoder_hidden, _ = decoder(encoder_output, encoder_hidden, target)

    #Flatten to (batch*steps, classes) and (batch*steps,) as expected by the loss
    loss = loss_fn(
        decoder_output.reshape(-1, decoder_output.size(-1)),
        target.reshape(-1)
    )

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    total_loss.append(loss.item())

  return sum(total_loss)/len(total_loss)


In [None]:
def train(encoder, decoder, encoder_optimizer, decoder_optimizer, loss_fn, epochs = 10, dataloader=None):
  '''
  Run train_epoch for a number of epochs, printing the mean loss of each one.

  encoder, decoder : Modules to train
  encoder_optimizer, decoder_optimizer : Optimizers for the two modules
  loss_fn : Loss function forwarded to train_epoch
  epochs : Number of passes over the data
  dataloader : Batches to train on; None falls back to the notebook-level train_dataloader

  Returns the list of per-epoch mean losses, so the curve can be plotted or compared.
  '''
  if dataloader is None:
    dataloader = train_dataloader

  epoch_losses = []
  for e in range(epochs):
    loss = train_epoch(dataloader, encoder, decoder, loss_fn, encoder_optimizer, decoder_optimizer)
    print(f"Epoch {e} : Loss {loss}")
    epoch_losses.append(loss)

  return epoch_losses



In [None]:
hidden_size = 128
learning_rate = 0.01

encoder = Encoder(input_lang.chars_size, hidden_size).to(device)
decoder = Decoder(output_lang.chars_size, hidden_size).to(device)

#One Adam optimizer per module, both with the same learning rate
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

#Padded target positions hold index 0, which is also the SOS index and never occurs
#as a real label (targets are character indices followed by EOS). Ignoring it keeps
#the padding out of the loss average and out of the gradients.
loss_fn = nn.NLLLoss(ignore_index=output_lang.char_to_idx['SOS'])

train(encoder, decoder, encoder_optimizer, decoder_optimizer, loss_fn, epochs=10)

  self.pid = os.fork()


Epoch 0 : Loss 0.5638245147776741
Epoch 1 : Loss 0.330467631448211
Epoch 2 : Loss 0.2822112492743255
Epoch 3 : Loss 0.25825673350364486
Epoch 4 : Loss 0.2421217335528032
Epoch 5 : Loss 0.2307418474141573
Epoch 6 : Loss 0.2248903665411679
Epoch 7 : Loss 0.21667454219450152
Epoch 8 : Loss 0.21285596937802487
Epoch 9 : Loss 0.20875625101300332


In [None]:
def test(encoder, decoder, loss_fn, dataloader):
  '''
  Evaluate the encoder and decoder on dataloader without teacher forcing.

  For every batch the loss is computed, and one randomly chosen example is printed
  with its input word, target word and predicted word.

  encoder, decoder : Trained modules
  loss_fn : Loss taking (predictions of shape (N, classes), labels of shape (N,))
  dataloader : Iterable of (input, target) index tensor batches

  Returns the mean of the per-batch losses, or None if the dataloader yields no batches.
  '''
  #Evaluation must run with dropout disabled. The previous modes are restored
  #afterwards so that a later training call is not affected.
  encoder_was_training = encoder.training
  decoder_was_training = decoder.training
  encoder.eval()
  decoder.eval()

  overall_loss = []
  try:
    with torch.no_grad():
      for source, target in dataloader:
        source = source.to(device)
        target = target.to(device)

        encoder_output, encoder_hidden = encoder(source)
        #No targets are passed, so the decoder feeds back its own predictions
        decoder_output, _, _ = decoder(encoder_output, encoder_hidden)

        loss = loss_fn(
          decoder_output.view(-1, decoder_output.size(-1)),
          target.view(-1)
        )

        overall_loss.append(loss.item())

        #Print 1 random input from batch along with its output and prediction
        idx = random.randint(0,source.shape[0]-1)
        #Most likely symbol at every decoding step of the chosen example
        pred_indices = decoder_output[idx].argmax(dim=-1).tolist()

        input_word = input_lang.indices_to_word(source[idx].tolist())
        target_word = output_lang.indices_to_word(target[idx].tolist())
        pred_word = output_lang.indices_to_word(pred_indices)
        print("***********************************")
        print(f"Input : {input_word}")
        print(f"Target : {target_word}")
        print(f"Pred : {pred_word}")
        print("***********************************")
  finally:
    encoder.train(encoder_was_training)
    decoder.train(decoder_was_training)

  return sum(overall_loss)/len(overall_loss) if overall_loss else None




In [None]:
# Evaluate the trained model on the held-out test split
test(encoder=encoder, decoder=decoder, loss_fn=loss_fn, dataloader=test_dataloader)

  self.pid = os.fork()


***********************************
Input : अंतः
Target : antaha
Pred : antoh
***********************************
***********************************
Input : अवैज्ञानिक
Target : avaigyanic
Pred : avagyanik
***********************************
***********************************
Input : आवाज़ों
Target : aawajon
Pred : aavajon
***********************************
***********************************
Input : ईए
Target : ea
Pred : in
***********************************
***********************************
Input : एलर्जिक
Target : allergic
Pred : eliruci
***********************************
***********************************
Input : करपात्री
Target : karpatri
Pred : karapatri
***********************************
***********************************
Input : किल्लत
Target : killat
Pred : killat
***********************************
***********************************
Input : गांठों
Target : ganthon
Pred : ganthon
***********************************
***********************************
Input : चढ़ने
Ta