# **Sequence to Sequence based Question Answering Model**

In [None]:
!pip install indic-nlp-library
!pip install torchtext==0.10.0
!pip install deep_translator

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting indic-nlp-library
  Downloading indic_nlp_library-0.81-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 6.0 MB/s 
[?25hCollecting morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Collecting sphinx-rtd-theme
  Downloading sphinx_rtd_theme-1.0.0-py2.py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 90.3 MB/s 
Collecting sphinx-argparse
  Downloading sphinx_argparse-0.3.2-py3-none-any.whl (12 kB)
Installing collected packages: sphinx-rtd-theme, sphinx-argparse, morfessor, indic-nlp-library
Successfully installed indic-nlp-library-0.81 morfessor-2.0.6 sphinx-argparse-0.3.2 sphinx-rtd-theme-1.0.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.10.0
  Downloading torchtext-0.10.0-cp37-cp37m-manylinux1_x86_64.whl (7.6 MB)
[K     |█████████████

- **Move inside directory**

In [None]:
cd "/content/drive/MyDrive/chaii-hindi-and-tamil-question-answering/"

/content/drive/MyDrive


- **Read dataset**

In [None]:
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/chaii-hindi-and-tamil-question-answering/clean_data.csv", index_col=0, encoding="utf-8")

- **Concat question and context**

In [None]:
def concat_que_context(col):
  """Join a question and its context into a single model-input string.

  Args:
    col: A row holding the question first and the context second (a pandas
      Series as passed by ``DataFrame.apply(axis=1)``, or a list/tuple).

  Returns:
    The string ``"<question> ? <context>"``.
  """
  # Take the first two values by iteration: integer `[]` lookup on a
  # label-indexed Series is deprecated in recent pandas releases.
  question, context = list(col)[:2]
  return question + " ? " + context
data['que_context'] = data[['question','context']].apply(concat_que_context, axis=1)

- **Display sample data**

In [None]:
data.head(2)

Unnamed: 0,id,context,question,answer,answer_start,que_context
0,004938454,இந்தியாவில் வங்கித்தொழில் பதினெட்டாம் நூற்றாண்...,இந்தியாவில் தாராளமயம் எப்போது தொடங்கியது,1990,10683,இந்தியாவில் தாராளமயம் எப்போது தொடங்கியது ? இந்...
1,9cbe4e227,ऍडविन पावल हबल अंग्रेज़ी edwin powell hubble ज...,एडविन पॉवेल हबल मृत्यु कब,२८ सितम्बर १९५३,79,एडविन पॉवेल हबल मृत्यु कब ? ऍडविन पावल हबल अंग...


- **Extract que-context and answer column**

In [None]:
data = data[['que_context','answer']]

In [None]:
data.head(2)

- **Check for null values**

In [None]:
data.isna().sum()

que_context    0
answer         0
dtype: int64

- **Split data into train, test and validation**

In [None]:
import os

# Directory the torchtext loader later reads its CSV splits from. It must stay
# equal to the `folder_path` defined in the import cell further down.
SPLIT_DIR = "/content/drive/MyDrive"

# Positional split: first 914 rows -> train, next 100 -> test, remainder -> validation.
train_data = data.iloc[:914].reset_index(drop=True)
test_data = data.iloc[914:1014].reset_index(drop=True)
val_data = data.iloc[1014:].reset_index(drop=True)

# load_dataset() reads train_data.csv / val_data.csv / test_data.csv from disk,
# but no cell in this notebook writes them. Create any file that is missing so
# the notebook survives Restart & Run All; files that already exist are left
# untouched (delete them to regenerate the splits).
# NOTE(review): assumes the original CSVs were written from these frames
# without the index column (the loader declares exactly two fields) - confirm.
for split_name, split_frame in (("train_data", train_data),
                                ("val_data", val_data),
                                ("test_data", test_data)):
    split_path = os.path.join(SPLIT_DIR, split_name + ".csv")
    if not os.path.isfile(split_path):
        split_frame.to_csv(split_path, index=False, encoding="utf-8")

print("Train dataset size: ", train_data.shape)
print("Test dataset size: ", test_data.shape)
print("Validation dataset size: ", val_data.shape)

Train dataset size:  (914, 2)
Test dataset size:  (100, 2)
Validation dataset size:  (92, 2)


- **Import libraries**

In [None]:
import os
import torch
import random
import pickle
from torchtext import vocab,data
from torchtext.vocab import Vectors
from torchtext.legacy.data import Field, TabularDataset, BucketIterator, Iterator
from indicnlp.tokenize import indic_tokenize 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import warnings
warnings.filterwarnings("ignore")

folder_path = "/content/drive/MyDrive"

- **Load and process data** 

In [None]:

def load_dataset(folder_path,device):
  """Create torchtext fields, datasets, vocabularies and batch iterators.

  Args:
    folder_path: Directory that holds train_data.csv, val_data.csv and
      test_data.csv; it is also used as the cache directory for the
      'vec_file.txt' vectors.
    device: Device handed to the batch iterators.

  Returns:
    The tuple (SRC, TRG, enc_embeddings_weights, dec_embeddings_weights,
    train_iterator, valid_iterator, test_iterator, train_data, valid_data,
    test_data).
  """
  pretrained_vectors = Vectors(name='vec_file.txt', cache=folder_path, unk_init=torch.Tensor.normal_)

  def tokenizer(text):
    # Tokenise with the Indic NLP trivial tokenizer.
    return indic_tokenize.trivial_tokenize(text)

  # Source (question + context) and target (answer) fields share the tokenizer
  # and the <sos>/<eos> markers; they differ only in their fixed length.
  marker_tokens = dict(init_token='<sos>', eos_token='<eos>')
  SRC = Field(tokenize=tokenizer, fix_length=384, **marker_tokens)
  TRG = Field(tokenize=tokenizer, fix_length=30, **marker_tokens)

  print(folder_path)
  train_data, valid_data, test_data = TabularDataset.splits(
      path=folder_path,
      train='train_data.csv',
      validation='val_data.csv',
      test='test_data.csv',
      format='csv',
      fields=[("que_context", SRC), ("answer", TRG)],
      skip_header=True)

  # The source vocabulary is built from the training split only, the target
  # vocabulary from all three splits.
  SRC.build_vocab(train_data, vectors=pretrained_vectors)
  TRG.build_vocab(train_data, valid_data, test_data, vectors=pretrained_vectors)

  train_iterator, valid_iterator = BucketIterator.splits(
      (train_data, valid_data),
      sort_key=lambda example: len(example.que_context),
      batch_size=4,
      device=device)
  test_iterator = BucketIterator(test_data, batch_size=1, device=device, shuffle=False)

  return (SRC, TRG, SRC.vocab.vectors, TRG.vocab.vectors,
          train_iterator, valid_iterator, test_iterator,
          train_data, valid_data, test_data)
    


In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Backend available: ", device)

Backend available:  cuda


In [None]:
# Build fields, vocabularies and iterators from the CSV splits.
(SRC, TRG, enc_embeddings_weights, dec_embeddings_weights,
 train_iterator, valid_iterator, test_iterator,
 train_data, valid_data, test_data) = load_dataset(folder_path,device)

# Number of examples per split.
print("\n=========================================")
print("Total Number of Question_Context-Answer Pairs in Train Data: ",len(train_data))
print("Total Number of Question_Context-Answer Pairs in Validation Data: ",len(valid_data))
print("Total Number of Question_Context-Answer Pairs in Test Data: ",len(test_data))
# Number of batches per iterator.
print("\n=========================================")
print("Total Number of Batches in Train Data: ",len(train_iterator))
print("Total Number of Batches in Validation Data: ",len(valid_iterator))
print("Total Number of Batches in Test Data: ",len(test_iterator))
# Most frequent tokens of each vocabulary.
print("\n=========================================")
print("Highest Frequency Words in Que-Context Vocab \n",SRC.vocab.freqs.most_common(10))
print("\nHighest Frequency Words in Answer Vocab \n",TRG.vocab.freqs.most_common(10))


print(f"\n\nSource/Que-Context Vocabulary Size: {len(SRC.vocab)}")
print(f"Target/Answers Vocabulary Size:   {len(TRG.vocab)}")

# Persist both token -> index mappings next to the data.
source_vocab = SRC.vocab.stoi
target_vocab = TRG.vocab.stoi
for vocab_file, token_to_index, done_message in (
    ('/source_vocab.pickle', source_vocab, "Context vocabulary saved successfully...."),
    ('/target_vocab.pickle', target_vocab, "Answer vocabulary saved successfully...."),
):
    with open(folder_path+vocab_file, 'wb') as handle:
        pickle.dump(token_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(done_message)


/content/drive/MyDrive

Total Number of Question_Context-Answer Pairs in Train Data:  914
Total Number of Question_Context-Answer Pairs in Validation Data:  92
Total Number of Question_Context-Answer Pairs in Test Data:  100

Total Number of Batches in Train Data:  229
Total Number of Batches in Validation Data:  23
Total Number of Batches in Test Data:  100

Highest Frequency Words in Que-Context Vocab 
 [('रूप', 3902), ('जाता', 3345), ('तथा', 3052), ('भारत', 2829), ('नाम', 1963), ('भारतीय', 1715), ('समय', 1713), ('उन्होंने', 1687), ('ஆம்', 1602), ('अन्य', 1557)]

Highest Frequency Words in Answer Vocab 
 [('किलोमीटर', 15), ('15', 13), ('मीटर', 12), ('जनवरी', 11), ('अक्टूबर', 11), ('अप्रैल', 10), ('7', 9), ('जून', 9), ('सितम्बर', 8), ('18', 8)]


Source/Que-Context Vocabulary Size: 149317
Target/Answers Vocabulary Size:   1585
Context vocabulary saved successfully....
Answer vocabulary saved successfully....


- **Build model architecture**

In [None]:

def model_obj(SRC, TRG, enc_embeddings_weights, dec_embeddings_weights, device):
  """Build the GRU encoder-decoder (seq2seq) question-answering model.

  Args:
    SRC: Source field; only ``len(SRC.vocab)`` is read (encoder vocabulary size).
    TRG: Target field; only ``len(TRG.vocab)`` is read (decoder vocabulary size).
    enc_embeddings_weights: Pretrained embedding matrix for the encoder, or
      None. It is used only when its shape equals (len(SRC.vocab), 300).
    dec_embeddings_weights: Pretrained embedding matrix for the decoder, or
      None. It is used only when its shape equals (len(TRG.vocab), 300).
    device: Device the model is moved to and on which outputs are allocated.

  Returns:
    A ``Seq2Seq_GRU`` module placed on ``device``.
  """

  def init_embedding(embedding, weights, owner):
    """Copy pretrained vectors into ``embedding`` when they fit; stays trainable."""
    if weights is None:
      return
    if tuple(weights.shape) != tuple(embedding.weight.shape):
      # Keep the notebook running with the random initialisation rather than
      # failing on a vector file of a different dimensionality.
      print(f"{owner}: pretrained embeddings of shape {tuple(weights.shape)} do not match "
            f"{tuple(embedding.weight.shape)}; keeping random initialisation")
      return
    with torch.no_grad():
      embedding.weight.copy_(weights)

  # define encoder model architecture
  class Encoder(nn.Module):
    """Bidirectional GRU encoder that returns one hidden state for the decoder."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, weights):
        super().__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        # embedding layer, initialised from the pretrained vectors when given
        self.embedding = nn.Embedding(input_dim, emb_dim)
        init_embedding(self.embedding, weights, "Encoder")
        # nn.GRU applies dropout only between stacked layers, so pass it only
        # when there is more than one layer (avoids a warning, same behaviour)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers,
                          dropout = dropout if n_layers > 1 else 0.0,
                          bidirectional=True)
        # projects the two concatenated directions back to hid_dim
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        # dropout applied to the embedded input
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # embed the token ids and apply dropout
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.gru(embedded)
        # concatenate the last two final hidden states (the two directions of
        # the top layer) and project them to hid_dim
        hidden = self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        # add the leading layer dimension expected by the decoder GRU
        return hidden.unsqueeze(0)

  # define decoder model architecture
  class Decoder(nn.Module):
    """GRU decoder that predicts one target token per call."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout,weights):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers

        # embedding layer, initialised from the pretrained vectors when given
        self.embedding = nn.Embedding(output_dim, emb_dim)
        init_embedding(self.embedding, weights, "Decoder")
        self.gru = nn.GRU(emb_dim, hid_dim, self.n_layers)
        # maps the GRU output to vocabulary scores
        self.out = nn.Linear(hid_dim, output_dim)
        # log-probabilities over the vocabulary (pairs with nn.NLLLoss)
        self.soft = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # add a leading sequence dimension of length 1
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        # ReLU on the embedded input before the GRU
        embedded = F.relu(embedded)
        output, hidden = self.gru(embedded, hidden)
        prediction = self.out(output.squeeze(0))
        prediction = self.soft(prediction)
        # prediction holds log-probabilities for the next token
        return prediction, hidden

  # sequence-to-sequence wrapper that runs the encoder once and the decoder step by step
  class Seq2Seq_GRU(nn.Module):
    """Encoder-decoder wrapper with scheduled teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, truth_chance = 0.8):
        # trg is indexed as (answer length, batch size)
        max_len, batch_size = trg.shape[0], trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        # row 0 stays zero: decoding starts at step 1 and callers drop row 0
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size, device=self.device)
        hidden = self.encoder(src)
        # first decoder input is the first target row (the <sos> tokens)
        input = trg[0,:]
        for t in range(1, max_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output

            truth_force = random.random() < truth_chance
            # most probable token of this step
            top1 = output.argmax(dim=1)
            # feed the true token with probability truth_chance, otherwise the model's own prediction
            input = trg[t] if truth_force else top1.detach()
        return outputs

  # define hyperparameters
  input_dimension = len(SRC.vocab)
  output_dimension = len(TRG.vocab)
  embedding_dimension = 300
  encoder_hidden_units = 512
  decoder_hidden_units = 512
  num_layers = 1
  dropout_rate = 0.2

  encoder = Encoder(input_dimension, embedding_dimension, encoder_hidden_units, num_layers, dropout_rate, enc_embeddings_weights)

  decoder = Decoder(output_dimension, embedding_dimension, decoder_hidden_units, num_layers, dropout_rate, dec_embeddings_weights)

  model = Seq2Seq_GRU(encoder, decoder, device).to(device)
  return model


In [None]:
# build the model architecture
model= model_obj(SRC, TRG, enc_embeddings_weights, dec_embeddings_weights,device)

# Only the target-side padding index is needed: the loss compares decoder
# outputs with answer tokens, so padded answer positions must be ignored.
PAD_IDX = TRG.vocab.stoi['<pad>']
# define optimizer and loss function that model will use
optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.NLLLoss(ignore_index = PAD_IDX)

- **Model train function**

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: Seq2seq module called as ``model(src, trg)``; must own at least
            one parameter (its device is used for the batches).
        iterator: Sized iterable of batches exposing ``que_context`` and
            ``answer`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Loss taking (flattened predictions, flattened targets).

    Returns:
        The average batch loss as a float, or ``nan`` when ``iterator`` is empty.
    """
    model.train()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    num_batches = len(iterator)
    if num_batches == 0:
        return float("nan")

    epoch_loss = 0.0
    for batch in iterator:
        src = batch.que_context.to(device)
        trg = batch.answer.to(device)
        # reset gradients accumulated by the previous batch
        optimizer.zero_grad()
        output = model(src, trg)
        # drop step 0 (never predicted) and flatten time/batch for the loss
        output = output[1:].reshape(-1, output.shape[-1])
        trg = trg[1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # average over all batches of the epoch
    return epoch_loss / num_batches

- **Model evaluation function**

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: Seq2seq module called as ``model(src, trg, 0)`` (teacher forcing
            disabled); must own at least one parameter (its device is used for
            the batches).
        iterator: Sized iterable of batches exposing ``que_context`` and
            ``answer`` tensors.
        criterion: Loss taking (flattened predictions, flattened targets).

    Returns:
        The average batch loss as a float, or ``nan`` when ``iterator`` is empty.
    """
    model.eval()
    # Use the device the model lives on instead of a notebook-level global.
    device = next(model.parameters()).device
    num_batches = len(iterator)
    if num_batches == 0:
        return float("nan")

    epoch_loss = 0.0
    # no gradients are needed during evaluation
    with torch.no_grad():
        for batch in iterator:
            src = batch.que_context.to(device)
            trg = batch.answer.to(device)
            output = model(src, trg, 0)
            # drop step 0 (never predicted) and flatten time/batch for the loss
            output = output[1:].reshape(-1, output.shape[-1])
            trg = trg[1:].reshape(-1)
            epoch_loss += criterion(output, trg).item()
    # average over all batches of the epoch
    return epoch_loss / num_batches

In [None]:
def save_model(model,epoch, optimizer, train_loss, valid_loss, direc_path):
  """Write a training checkpoint to ``<direc_path>/model_checkpoint.pt``.

  The checkpoint stores the next epoch number (``epoch + 1``), the model and
  optimizer state dicts, and the given train/validation losses.

  Returns:
    Whatever ``torch.save`` returns.
  """
  checkpoint = dict(
      epoch=epoch + 1,
      state_dict=model.state_dict(),
      optimizer=optimizer.state_dict(),
      train_loss=train_loss,
      valid_loss=valid_loss,
  )
  return torch.save(checkpoint, os.path.join(direc_path, 'model_checkpoint.pt'))

In [None]:
def load_checkpoint(model, filename):
    """Restore model and optimizer state from a checkpoint file.

    Relies on two notebook-level globals: ``folder_path`` (base directory that
    ``filename`` is joined onto; an absolute ``filename`` overrides it) and
    ``optimizer`` (the optimizer whose state is restored and returned).

    Args:
        model: Module that receives the saved ``state_dict``.
        filename: Checkpoint file name or path.

    Returns:
        (model, optimizer, start_epoch, train_loss, valid_loss). When no
        checkpoint exists, model and optimizer are returned unchanged with
        ``start_epoch == 0`` and both losses ``None``.
    """
    PATH = os.path.join(folder_path, filename)
    # Defaults used when there is nothing to resume from.
    start_epoch, train_loss, valid_loss = 0, None, None
    if os.path.isfile(PATH):
        print("Loading checkpoint from ----> ", PATH)
        # Map tensors onto the model's device so a checkpoint written on GPU
        # also loads on a CPU-only runtime.
        # NOTE: torch.load unpickles the file - only load trusted checkpoints.
        checkpoint = torch.load(PATH, map_location=next(model.parameters()).device)
        # epoch to resume from
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # losses recorded when the checkpoint was written
        train_loss = checkpoint['train_loss']
        valid_loss = checkpoint['valid_loss']
        print("Loaded checkpoint")
    else:
        print("No checkpoint found at: ", PATH)

    return model, optimizer, start_epoch, train_loss, valid_loss

- **Start model training**
  - **The training loss decreases steadily while the validation loss keeps increasing, which shows that the model is overfitting.**
  - **The main reason for the overfitting is that the dataset is very small.**

In [None]:
max_epochs = 10
epoch=0
best_valid_loss = float('inf')
print('\n\tStart Model Training',"-"*20,"\n")
for epoch in range(epoch,max_epochs):
      # one pass over the training data
      training_loss = train(model, train_iterator, optimizer, criterion)
      # loss on the validation data (no parameter updates)
      validation_loss = evaluate(model, valid_iterator, criterion)
      # Checkpoint only when the validation loss improves, so the file on disk
      # holds the best model seen so far rather than the last (most overfit) epoch.
      if validation_loss < best_valid_loss:
          best_valid_loss = validation_loss
          save_model(model,epoch, optimizer, training_loss, validation_loss, folder_path)
      print("Epoch: ", epoch+1)
      print("Train Loss: ", training_loss)
      print("Validation Loss: ", validation_loss)


	Start Model Training -------------------- 

Epoch:  1
Train Loss:  5.558204601945835
Validation Loss:  5.662248134613037
Epoch:  2
Train Loss:  5.008569371752343
Validation Loss:  6.214366995769998
Epoch:  3
Train Loss:  3.9987595596688283
Validation Loss:  6.171025255452031
Epoch:  4
Train Loss:  2.6103860212725842
Validation Loss:  7.018138118412184
Epoch:  5
Train Loss:  1.3325079815486633
Validation Loss:  7.592483831488567
Epoch:  6
Train Loss:  0.7202417590212093
Validation Loss:  8.140052919802459
Epoch:  7
Train Loss:  0.4613031075584121
Validation Loss:  7.994200934534487
Epoch:  8
Train Loss:  0.2924839135363096
Validation Loss:  8.545173645019531
Epoch:  9
Train Loss:  0.19694720310317637
Validation Loss:  8.462818746981414
Epoch:  10
Train Loss:  0.11346793245202882
Validation Loss:  8.451737341673478


In [None]:
# load model from checkpoint
model_path = folder_path+"/model_checkpoint.pt"

In [None]:
model, optimizer, epoch, train_loss, valid_loss = load_checkpoint(model, model_path)
print("\nloaded parm")

# Look the special-token ids up in the vocabulary instead of hard-coding them.
EOS_IDX = TRG.vocab.stoi['<eos>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']

model.eval()
test_data_loss=0.0
pred_list=[]
ref_list=[]
# no gradients are needed at test time
with torch.no_grad():
  # iterate over test data (the flattening below relies on the test iterator's batch size of 1)
  for i, batch in enumerate(test_iterator):
    # extract input and output batch
    src = batch.que_context.to(device)
    trg = batch.answer.to(device)
    # run the model without teacher forcing
    output = model(src, trg, 0)
    output = output[1:].view(-1, output.shape[-1])
    trg = trg[1:].view(-1)
    # evaluate loss
    test_loss = criterion(output, trg)
    # most probable token id at every decoding step
    predicted_ids = output.argmax(dim=1).tolist()
    # the decoder always runs for the full answer length; everything from the
    # first <eos> onwards is not part of the answer
    if EOS_IDX in predicted_ids:
      predicted_ids = predicted_ids[:predicted_ids.index(EOS_IDX)]
    # map indices back to words, dropping <eos> and padding from the reference
    target = ' '.join(TRG.vocab.itos[o] for o in trg.tolist() if o not in (EOS_IDX, TRG_PAD_IDX))
    prediction = ' '.join(TRG.vocab.itos[o] for o in predicted_ids)
    # collect predictions and reference answers
    pred_list.append(prediction)
    ref_list.append(target)
    # accumulate as a Python float so no tensor is kept on the device
    test_data_loss+=test_loss.item()
  # average loss of all batches
  test_data_loss=test_data_loss/len(test_iterator)
  print("Test Loss: ", test_data_loss)






loaded parm
Test Loss:  tensor(7.9950, device='cuda:0')


In [None]:
# function to evaluate BLEU scores of predicted answers against the original answers
def evaluate_blue_score(actual, prediction):
  """Average sentence-level BLEU-1 to BLEU-4 over all examples, as percentages.

  Args:
    actual: List in which ``actual[i]`` is the list of reference answer
      strings for example ``i``.
    prediction: List of predicted answer strings, aligned with ``actual``.

  Returns:
    A dict with keys ``bleu_1`` .. ``bleu_4``, each mapping to a one-element
    list holding the mean score in percent rounded to two decimals. An empty
    prediction counts as 0 for that example; empty input yields all zeros.
    Returns ``None`` (after printing an error) when the lengths differ.
  """
  # lengths must match so every prediction is scored against its own references
  if len(actual) != len(prediction):
    print("Error: Actual values and predictions are not of same length....")
    return None

  ngram_weights = {
      "bleu_1": (1, 0, 0, 0),
      "bleu_2": (0.5, 0.5, 0, 0),
      "bleu_3": (0.33, 0.33, 0.33, 0),
      "bleu_4": (0.25, 0.25, 0.25, 0.25),
  }
  totals = dict.fromkeys(ngram_weights, 0.0)

  # Empty predictions contribute a score of 0 and are not passed to the scorer.
  scored_pairs = [(refs, hyp) for refs, hyp in zip(actual, prediction) if hyp != ""]
  smoothing = SmoothingFunction().method2 if scored_pairs else None
  for refs, hyp in scored_pairs:
    refs_tokenized = [indic_tokenize.trivial_tokenize(ref) for ref in refs]
    hyp_tokenized = indic_tokenize.trivial_tokenize(hyp)
    # accumulate the score of this example for every n-gram order
    for key, weights in ngram_weights.items():
      totals[key] += sentence_bleu(refs_tokenized, hyp_tokenized,
                                   weights=weights, smoothing_function=smoothing)

  # convert the mean scores to percentages
  count = len(actual)
  return {key: [round(total / count * 100, 2) if count else 0.0]
          for key, total in totals.items()}


In [None]:
# import libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from indicnlp.tokenize import indic_tokenize 
from deep_translator import GoogleTranslator

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Keep only the first 10 characters of every decoded prediction string.
# NOTE(review): this slices characters, not tokens - confirm that is intended.
pred = [decoded[:10] for decoded in pred_list]

In [None]:
# evaluate BLEU scores; every reference is wrapped in its own list because
# evaluate_blue_score tokenizes each element of actual[i] as a separate reference
actual_answers = [[reference] for reference in ref_list]
ans_results = evaluate_blue_score(actual_answers, pred)
pd.DataFrame(ans_results)

Unnamed: 0,bleu_1,bleu_2,bleu_3,bleu_4
0,4.11,4.11,3.98,3.9


In [None]:
# Print every reference answer next to the prediction and its English translation.
for i in range(len(ref_list)):
  print("\n----------------------------------\n")
  print("Actual answers: ",actual_answers[i][0])
  print("Predicted answer: ",pred[i])
  try:
    print("Translated Answer: ",GoogleTranslator(source='auto', target='en').translate(pred[i]))
  except Exception:
    # Translation is best-effort: fall back to the untranslated prediction.
    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
    print("Translated Answer: ",pred[i])



----------------------------------

Actual answers:  उत्तरी
Predicted answer:  हिन्दू दिल
Translated Answer:  hindu heart

----------------------------------

Actual answers:  உலான் பாட்டர்
Predicted answer:  हिन्दू இரு
Translated Answer:  Be a Hindu

----------------------------------

Actual answers:  २७ अक्तूबर १६०५
Predicted answer:  22 जनवरी 1
Translated Answer:  22 Jan 1

----------------------------------

Actual answers:  वेदव्यास
Predicted answer:  महर्षि दिल
Translated Answer:  Maharishi Dil

----------------------------------

Actual answers:  சீஸ்மோகிராப்
Predicted answer:  லூயிஸ்
Translated Answer:  Louis

----------------------------------

Actual answers:  अपसौर
Predicted answer:  १४९६००००० 
Translated Answer:  १४९६०००००

----------------------------------

Actual answers:  epistemology
Predicted answer:  22 இருப்பத
Translated Answer:  22 is absent

----------------------------------

Actual answers:  16579 சதுர கிலோ மீட்டர்
Predicted answer:  அத்தானோடு 
Translated Answ