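**Note: to run this notebook you must upload `testhindistatements.csv` (the test file read by the final cell); the training data `train.csv` is unzipped from `train.zip` on Google Drive.**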

#The pipeline used is :


1.   Importing data
2.   Preprocessing data by tokenizing it and converting it to tensor
3.   Defining the model Architecture
4.   Running SGD on the model
5.   Evaluating on the test data



#Importing all the required libraries

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import math
import time
import unicodedata
import string
import re
import random
import csv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive at /content/drive so the training archive and the saved
# model files referenced later in the notebook are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Getting the training dataset file from gdrive: extract train.zip from the
# mounted drive, overwriting existing files (-o) and discarding unzip's output.
!unzip -o ./drive/MyDrive/train.zip >> /dev/null

In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor


Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1271, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 16.20 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 41.64 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [None]:
import sys

# Local checkout of the Indic NLP library (cloned in the previous cell).
INDIC_NLP_LIB_HOME = "/content/indic_nlp_library"
# Local checkout of the Indic NLP resources (cloned in the previous cell).
INDIC_NLP_RESOURCES = "/content/indic_nlp_resources"

# The library is used straight from its checkout, so the checkout directory
# has to be on the import path before anything is imported from `indicnlp`.
sys.path.append(INDIC_NLP_LIB_HOME)

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

#Importing the nltk library to tokenize english sentences.

In [None]:
# indic_tokenize provides the Hindi tokenizer (trivial_tokenize) used below.
from indicnlp.tokenize import indic_tokenize
# nltk provides the English tokenizer (word_tokenize) used below.
import nltk
# Fetch NLTK's 'popular' data collection without printing progress output.
nltk.download('popular',quiet = True)

True

#Defining the Start of Sentence and End of Sentence token

In [None]:
# Reserved vocabulary indices; they match entries 0 and 1 of the id_to_hindi /
# id_to_english tables built by tokenize().
SOS = 0  # start-of-sentence: the first input fed to the decoder
EOS = 1  # end-of-sentence: appended to every index sequence
# Number of encoder positions the attention layer and the encoder-output buffer are sized for.
# NOTE(review): 598 is presumably the longest tokenized sentence (plus EOS) in train.csv — confirm.
MAX_LENGTH = 598

#Importing data and converting to numpy arrays

In [None]:
# Read every row of train.csv. The encoding is stated explicitly because the
# file contains Devanagari text, and newline='' is what the csv module asks for
# so that line breaks inside quoted fields are parsed correctly.
with open("train.csv", 'r', encoding='utf-8', newline='') as dfile:
    datalines = list(csv.reader(dfile))

# The first row is skipped (presumably the header). Column 1 is used as the
# Hindi sentence and column 2 as the English sentence.
hindi_sents = [row[1] for row in datalines[1:]]           #getting hindi sentences
english_sents = [row[2] for row in datalines[1:]]         #getting english sentences


In [None]:
# Quick sanity check of the container type produced by the loading cell.
type(hindi_sents)

list

#Function to tokenize the data

In [None]:
def tokenize(data, lang):
    """
    Tokenize every sentence of one language and build its vocabulary tables.

    Input:
        data : iterable of sentence strings (one language column of train.csv)
        lang : 'hi'  -> sentences are split with indic_tokenize.trivial_tokenize
               'eng' -> sentences are split with nltk.word_tokenize

    Output:
        dataset : dictionary with four entries, where <name> is 'hindi' for
                  lang='hi' and 'english' for lang='eng':
            '<name>_vocab'     : token -> number of occurrences
            '<name>_to_id'     : token -> integer id (ids start at 2, in order
                                 of first appearance)
            'id_to_<name>'     : integer id -> token, plus 0: "SOS" and 1: "EOS"
            '<name>_token_all' : list with one token list per sentence

    Raises:
        ValueError : if lang is neither 'hi' nor 'eng' (the original silently
                     returned None, which only failed later at the call site)
    """
    names = {'hi': 'hindi', 'eng': 'english'}
    if lang not in names:
        raise ValueError(
            "Unsupported language code {!r}; expected 'hi' or 'eng'".format(lang)
        )
    name = names[lang]

    words_freq = {}
    word_to_id = {}
    id_to_word = {0: "SOS", 1: "EOS"}
    token_all = []

    for sentence in data:
        if lang == 'hi':
            sentence_tokens = indic_tokenize.trivial_tokenize(sentence)
        else:
            sentence_tokens = nltk.word_tokenize(sentence)
        token_all.append(sentence_tokens)

        for token in sentence_tokens:
            if token in words_freq:
                words_freq[token] += 1
            else:
                # Ids 0 and 1 are reserved for SOS/EOS, so the first new token gets 2.
                new_id = len(word_to_id) + 2
                word_to_id[token] = new_id
                id_to_word[new_id] = token
                words_freq[token] = 1

    dataset = {
        name + '_vocab': words_freq,
        name + '_to_id': word_to_id,
        'id_to_' + name: id_to_word,
        name + '_token_all': token_all,
    }
    return dataset



##Calling the tokenize function

In [None]:
# Build the vocabulary tables and per-sentence token lists for each language;
# these two dictionaries are read as globals by the functions defined below.
hindi_data = tokenize(hindi_sents,'hi')
english_data = tokenize(english_sents,'eng')


In [None]:
def get_embedding(hindi_token_sent, english_token_sent):
    """
    Turn one tokenized sentence pair into a pair of index tensors.

    Despite the name, no embedding vectors are produced here: each token is
    replaced by its integer id from the global vocabularies, EOS is appended,
    and the result is returned as a torch.long column of shape (len + 1, 1)
    on `device`.

    Input:
        hindi_token_sent : tokenized hindi sentence (list of tokens)
        english_token_sent : tokenized english sentence (list of tokens)

    Output:
        (hindi_index_tensor, english_index_tensor)

    Note: a token missing from the vocabulary is mapped to index 2, which is
    also the id tokenize() assigns to the first token it sees.
    """
    def _as_index_column(tokens, token_to_id):
        # Look up each token, falling back to index 2 for unseen ones.
        ids = [token_to_id.get(token, 2) for token in tokens]
        ids.append(EOS)  # Appending the EOS token
        return torch.tensor(ids, dtype=torch.long, device=device).view(-1, 1)

    hindi_column = _as_index_column(hindi_token_sent, hindi_data['hindi_to_id'])
    english_column = _as_index_column(english_token_sent, english_data['english_to_id'])
    return (hindi_column, english_column)

#Defining the Encoder with GRU unit, it also defines the embedding of input data used in the Encoder

In [None]:
class EncoderModel(nn.Module):
    """GRU encoder that is fed one source-token index per call."""

    def __init__(self, input_size, hidden_size):
        """
        input_size  : number of rows in the embedding table (source vocabulary size)
        hidden_size : width of the embedding vectors and of the GRU state
        """
        super().__init__()

        self.hidden_size = hidden_size

        # Attribute names are part of the state_dict keys, so they stay as they are.
        self.EMB = nn.Embedding(input_size, hidden_size)
        self.RNN = nn.GRU(hidden_size, hidden_size)

    def forward(self, x, h_t):
        """
        Embed the token index `x`, run one GRU step from state `h_t`, and
        return (output, new_hidden_state).
        """
        # The GRU expects (seq_len, batch, features); a single token becomes (1, 1, hidden_size).
        step_input = self.EMB(x).view(1, 1, -1)
        return self.RNN(step_input, h_t)

    def initHidden(self):
        """Return an all-zero initial hidden state of shape (1, 1, hidden_size) on `device`."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

#Defining the Attention Decoder with GRU units

In [None]:
class AttnDecoderModel(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions."""

    def __init__(self, hidden_size, output_size, dropout_p, max_length):
        """
        hidden_size : width of the embeddings and of the GRU state
        output_size : number of target-vocabulary ids the decoder scores
        dropout_p   : dropout probability applied to the embedded decoder input
        max_length  : number of encoder positions the attention layer scores
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Attribute names are part of the state_dict keys, so they stay as they are.
        self.EM_LOOKUP = nn.Embedding(self.output_size, self.hidden_size)
        self.ATTENTION = nn.Linear(self.hidden_size * 2, self.max_length)
        self.A_COMBINE = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.L_DROPOUT = nn.Dropout(self.dropout_p)
        self.RECURRENT = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """
        Run one decoding step.

        input           : index of the previous target token
        hidden          : current decoder state
        encoder_outputs : one row per encoder position (max_length rows)

        Returns (log_probs, hidden, context):
            log_probs : log-softmax scores over the target vocabulary
            hidden    : updated decoder state
            context   : attention-weighted sum of encoder_outputs
        NOTE(review): the third value is the weighted context, not the softmax
        attention weights themselves — confirm that is what callers expect.
        """
        functional = torch.nn.functional

        embedded = self.L_DROPOUT(self.EM_LOOKUP(input).view(1, 1, -1))

        # One attention weight per encoder position, computed from the
        # embedded input and the current hidden state.
        scores = self.ATTENTION(torch.cat((embedded[0], hidden[0]), dim=1))
        weights = functional.softmax(scores, dim=1)

        # Weighted sum of the encoder outputs.
        context = torch.bmm(weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], context[0]), dim=1)
        gru_input = functional.relu(self.A_COMBINE(combined).unsqueeze(0))

        gru_output, hidden = self.RECURRENT(gru_input, hidden)

        log_probs = functional.log_softmax(self.out(gru_output[0]), dim=1)
        return log_probs, hidden, context

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        state_shape = (1, 1, self.hidden_size)
        return torch.zeros(state_shape, device=device)

#Forward pass through the model with teacher forcing.

In [None]:
teacher_forcing_ratio = 0.5

def forward_pass(input_tensor, target_tensor, encoder, decoder, encoder_opt, decoder_opt, criterion, max_length=MAX_LENGTH):
    """
    Run one training step on a single sentence pair and update both models.

    input_tensor  : source token indices, one per row
    target_tensor : target token indices, one per row
    encoder, decoder         : the two models
    encoder_opt, decoder_opt : their optimizers (zeroed here, stepped at the end)
    criterion     : loss applied to each decoder output and its target index
    max_length    : number of rows in the encoder-output buffer

    With probability `teacher_forcing_ratio` the decoder is fed the true
    target token at every step; otherwise it is fed its own top prediction and
    decoding stops early once it predicts EOS.

    Returns the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_opt.zero_grad()
    decoder_opt.zero_grad()

    target_length = target_tensor.size(0)        #Length of target sentence

    # Positions beyond the sentence length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position, token in enumerate(input_tensor):
        step_output, encoder_hidden = encoder(token, encoder_hidden)
        encoder_outputs[position] = step_output[0, 0]

    # The decoder starts from SOS and from the encoder's final state.
    decoder_input = torch.tensor([[SOS]], device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            # Teacher forcing: feed the target as the next input
            decoder_input = target_tensor[step]
        else:
            # Feed back the model's own best guess, detached from the graph
            _, best_index = decoder_output.topk(1)
            decoder_input = best_index.squeeze().detach()
            if decoder_input.item() == EOS:
                break

    loss.backward()

    encoder_opt.step()
    decoder_opt.step()

    return loss.item() / target_length

#Creating pairs of hindi_tensor and english_tensor for SGD

In [None]:
# Pre-compute the (hindi, english) index-tensor pair for every training sentence.
# NOTE(review): nothing else visible in this notebook reads `pairs` — SGD() rebuilds
# the tensors with convert_to_tensor() on every step — so this cell may be removable; confirm.
pairs = []
hindi_tokens = hindi_data['hindi_token_all']
english_tokens = english_data['english_token_all']
# The two counts printed below are expected to match: sentences are paired by position.
print(len(hindi_tokens))
print(len(english_tokens))
for i in range(len(hindi_tokens)):
  temp = get_embedding(hindi_tokens[i],english_tokens[i])
  pairs.append(temp)

102322
102322


#Stochastic Gradient Descent: iterate through one example at a time, compute the loss, backpropagate, and then update the parameters

In [None]:
def SGD(encoder, decoder, encoder_opt, decoder_opt, num_epochs, print_every=5000, learning_rate=0.01):
    """
    Train the encoder/decoder one sentence pair at a time.

    encoder, decoder = The models to train
    encoder_opt = The optimizer of encoder
    decoder_opt = The optimizer of decoder
    num_epochs = Number of passes through the whole dataset
    print_every = Number of steps between progress reports; each report shows
                  the average loss since the previous one. 0 disables reports.
    learning_rate = Not used; kept so existing calls keep working. The step
                    size is the one the optimizers passed in were created with.

    The training sentences are read from the global hindi_data / english_data
    dictionaries and visited in a fresh random order every epoch.
    """
    start = time.time()

    criterion = nn.NLLLoss()

    num_sents = len(hindi_data['hindi_token_all'])

    # Make sure dropout is active even if the models were last used for inference.
    encoder.train()
    decoder.train()

    for epoch in range(num_epochs):

        order = np.random.permutation(num_sents)   #Taking one random example at a time
        loss_since_report = 0.0

        for step, i in enumerate(order, start=1):

            english_s = convert_to_tensor(english_data['english_token_all'][i], 'en')       #Converting the sentence to tensor
            hindi_s = convert_to_tensor(hindi_data['hindi_token_all'][i], 'hi')

            loss_since_report += forward_pass(
                hindi_s, english_s, encoder, decoder, encoder_opt, decoder_opt, criterion)

            if print_every and step % print_every == 0:
                average_loss = loss_since_report / print_every
                loss_since_report = 0.0
                elapsed = time.time() - start
                print(f'Epoch Number: {epoch + 1}, {step}/{num_sents} processed, '
                      f'average loss: {average_loss:.4f}, elapsed: {elapsed:.0f}s')
    

    

In [None]:
def convert_to_tensor(sent, lang):
    """
    Convert a tokenized sentence to a column tensor of vocabulary indices,
    similar to get_embedding but for a single language.

    sent : list of tokens
    lang : 'hi' for Hindi; 'en' for English ('eng', the code tokenize() uses,
           is accepted as well so the two functions agree)

    Returns a torch.long tensor of shape (len(sent) + 1, 1) on `device`; the
    final entry is EOS.

    Raises ValueError for any other language code (the original returned None
    silently, which only failed later inside the model).
    """
    if lang == 'hi':
        token_to_id = hindi_data['hindi_to_id']
    elif lang in ('en', 'eng'):
        token_to_id = english_data['english_to_id']
    else:
        raise ValueError(
            "Unsupported language code {!r}; expected 'hi', 'en' or 'eng'".format(lang)
        )

    # NOTE: tokens missing from the vocabulary fall back to index 2, which is
    # also the id of the first token seen by tokenize() — there is no dedicated
    # unknown-token id in the saved vocabulary.
    index_sent = [token_to_id.get(word, 2) for word in sent]
    index_sent.append(EOS)

    return torch.tensor(index_sent, dtype=torch.long, device=device).view(-1, 1)

#Function to evaluate on the test cases, similar to forward_pass. It also converts the tensor to sentence for better representation of output

In [None]:
def evaluate(encoder, decoder, sentence, max_length = MAX_LENGTH):
    """
    Translate one Hindi sentence with greedy decoding.

    encoder, decoder : the trained models
    sentence         : raw Hindi sentence (string); it is tokenized here
    max_length       : number of encoder positions kept and maximum number of
                       words generated

    Returns the list of generated English words. If the decoder predicts EOS,
    the list ends with the marker '__<<stop>>__'; if max_length is reached
    first, there is no marker.

    Both models are switched to eval mode for the duration of the call so that
    dropout is disabled and the translation is deterministic; their previous
    train/eval mode is restored afterwards.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()

    try:
        with torch.no_grad():
            input_tensor = convert_to_tensor(
                indic_tokenize.trivial_tokenize(sentence),
                'hi'
            )
            # The encoder-output buffer has max_length rows; a longer input
            # would raise IndexError, so the surplus tokens are dropped.
            input_tensor = input_tensor[:max_length]

            encoder_hidden = encoder.initHidden()
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_tensor.size(0)):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] = encoder_output[0, 0]

            decoder_input = torch.tensor([[SOS]], device=device)  # init of sentence
            decoder_hidden = encoder_hidden

            decoded_words = []

            for _ in range(max_length):
                decoder_output, decoder_hidden, _ = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)

                _, topi = decoder_output.topk(1)
                predicted_id = topi.item()
                if predicted_id == EOS:
                    decoded_words.append('__<<stop>>__')
                    break
                decoded_words.append(english_data['id_to_english'][predicted_id])

                # Greedy decoding: the prediction becomes the next input.
                decoder_input = topi.squeeze().detach()

            return decoded_words
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

#The SGD optimizer for the encoder and decoder

In [None]:
def optimize(hidden_size, hindi_data, english_data):
    """
    Build the encoder/decoder and either train them or restore saved weights.

    hidden_size  : width of the embeddings and GRU states of both models
    hindi_data   : dictionary returned by tokenize(..., 'hi'); sizes the encoder vocabulary
    english_data : dictionary returned by tokenize(..., 'eng'); sizes the decoder vocabulary

    The notebook-level flag `trained` selects the behaviour:
        falsy  -> train from scratch with SGD() for 20 epochs
        truthy -> load the model and optimizer state saved under
                  drive/MyDrive/trained_model/

    Returns (encoder, attention_decoder). The optimizers are local to this
    function and are not returned.
    """
    # Defining the encoder, decoder and their optimizers
    encoder1 = EncoderModel(len(hindi_data['id_to_hindi']), hidden_size).to(device)
    attn_decoder1 = AttnDecoderModel(
        hidden_size,
        len(english_data['id_to_english']),
        dropout_p=0.1,
        max_length=MAX_LENGTH,
    ).to(device)
    encoder_opt = optim.SGD(encoder1.parameters(), lr=0.01)
    decoder_opt = optim.SGD(attn_decoder1.parameters(), lr=0.01)

    if not trained:
        # Running SGD on the above optimizers
        SGD(encoder1, attn_decoder1, encoder_opt, decoder_opt, num_epochs=20)
    else:
        # map_location lets a checkpoint that was saved on a GPU be loaded on
        # whatever `device` is now, including a CPU-only runtime.
        encoder1.load_state_dict(
            torch.load('drive/MyDrive/trained_model/encoder.zip', map_location=device))
        attn_decoder1.load_state_dict(
            torch.load('drive/MyDrive/trained_model/decoder.zip', map_location=device))

        encoder_opt.load_state_dict(
            torch.load('drive/MyDrive/trained_model/enc_optimizer.zip', map_location=device))
        decoder_opt.load_state_dict(
            torch.load('drive/MyDrive/trained_model/dec_optimizer.zip', map_location=device))

    return encoder1, attn_decoder1

In [None]:
!nvidia-smi

Sat May  8 05:13:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#Calling the optimizer to train the model

In [None]:
# Width of the embeddings and GRU states in both the encoder and the decoder.
hidden_size = 512
# Read as a global by optimize(): True loads the saved weights from Google Drive,
# False trains the models from scratch with SGD().
trained = True
encoder1,attn_decoder1 = optimize(hidden_size,hindi_data,english_data)


#Ran for 15 epochs to get the result
#A local copy of the trained model is saved in my google drive

#Ran it again to check if its functioning correctly

Saving the weights in drive

In [None]:
# Set to True to write the current model weights to Google Drive.
save_weights = False

if save_weights:
    torch.save(encoder1.state_dict(), 'drive/MyDrive/_encoder')
    torch.save(attn_decoder1.state_dict(), 'drive/MyDrive/_decoder')

    # optimize() keeps its optimizers local, so these names exist at notebook
    # level only if another cell has defined them. Looking them up instead of
    # referencing them directly avoids a NameError after the models are saved.
    optimizer_targets = {
        'encoder_opt': 'drive/MyDrive/_enc_optimizer',
        'decoder_opt': 'drive/MyDrive/_dec_optimizer',
    }
    for optimizer_name, target_path in optimizer_targets.items():
        optimizer = globals().get(optimizer_name)
        if optimizer is None:
            print(f'Skipping {optimizer_name}: not defined at notebook level')
        else:
            torch.save(optimizer.state_dict(), target_path)

In [None]:
#Evaluate the test dataset
# The encoding is stated explicitly because the file contains Devanagari text;
# newline='' is what the csv module asks for so quoted line breaks parse correctly.
with open("testhindistatements.csv", 'r', encoding='utf-8', newline='') as testfile:
    testlines = list(csv.reader(testfile))  #Reading each line

# The first row is skipped (presumably the header); column 2 is used as the Hindi input.
# NOTE(review): train.csv keeps Hindi in column 1 — confirm the test file's layout.
inputs = [row[2] for row in testlines[1:]]

# The `with` block closes answer.txt even if the run is interrupted, so the
# translations written so far are not lost.
with open("answer.txt", 'w', encoding='utf-8') as outfile:
    for sentence in inputs:
        #Evaluating each line and saving in "answer.txt"
        output_words = evaluate(encoder1, attn_decoder1, sentence, max_length = MAX_LENGTH)

        # Remove only the stop marker. When decoding ends because max_length was
        # reached there is no marker, and every entry is a real word.
        if output_words and output_words[-1] == '__<<stop>>__':
            output_words = output_words[:-1]
        output_sentence = ' '.join(output_words)

        outfile.write(output_sentence + "\n")

KeyboardInterrupt: ignored