## Import libraries and *packages*

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchtext.legacy.data import Field, BucketIterator,TabularDataset
import random
import re
from torchtext import data

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##### INDIC_NLP

In [None]:
# Clone the Indic NLP library and its resources repo, and install Morfessor (shell commands).
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# Path to the local clone of the Indic NLP library (hard-coded to /content)
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
# Path to the local clone of the Indic NLP resources (hard-coded to /content)
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
# Make the cloned library importable; the indicnlp imports below depend on this line.
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
# Point the library at the resources clone, then run its loader before using any component.
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from indicnlp.tokenize import indic_tokenize 
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
from indicnlp.normalize.indic_normalize import BaseNormalizer
from indicnlp.normalize.indic_normalize import DevanagariNormalizer
from indicnlp.morph import unsupervised_morph 
from indicnlp import common

# Morph analyzer for language code 'hi'; it is not referenced again in the visible notebook.
hi_analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
from indicnlp.tokenize import indic_detokenize  

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 15.15 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 41.38 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


##### SPACY

In [None]:
!python3 -m spacy download en
import spacy
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
# Upgrade NLTK to the newest release (unpinned) and fetch the WordNet corpus.
# NOTE(review): the evaluation cell passes plain strings to single_meteor_score; newer NLTK
# releases appear to require token lists there - consider pinning the version. TODO confirm.
!pip install -U nltk
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/8c/1c/c0981ef85165eb739c10f2b24d7729cef066b2bc220fbd1dd0d3c67df39a/nltk-3.6.1-py3-none-any.whl (1.5MB)
[K     |▎                               | 10kB 19.8MB/s eta 0:00:01[K     |▌                               | 20kB 27.5MB/s eta 0:00:01[K     |▊                               | 30kB 22.1MB/s eta 0:00:01[K     |█                               | 40kB 24.9MB/s eta 0:00:01[K     |█▏                              | 51kB 23.8MB/s eta 0:00:01[K     |█▍                              | 61kB 20.7MB/s eta 0:00:01[K     |█▋                              | 71kB 20.5MB/s eta 0:00:01[K     |█▉                              | 81kB 21.4MB/s eta 0:00:01[K     |██                              | 92kB 19.5MB/s eta 0:00:01[K     |██▎                             | 102kB 19.4MB/s eta 0:00:01[K     |██▌                             | 112kB 19.4MB/s eta 0:00:01[K     |██▊                             | 122kB 19.4MB/s e

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Training phase

## Prepare Data for the Model

#### Load dataset

In [None]:
# Load the parallel corpus and keep only the two text columns.
# NOTE: this rebinds `data`, shadowing the `from torchtext import data` import at the top.
data = pd.read_csv("train.csv")
data = data[["hindi","english"]]
# 90/10 train/validation split. The fixed random_state makes the split (and therefore the
# vocabulary and every reported score) reproducible across kernel restarts.
train_data = data.sample(frac=0.9, random_state=42)
# Validation rows are exactly the rows that were not sampled; incomplete pairs are dropped.
validation_data = data.drop(train_data.index).dropna()

In [None]:
# Display the loaded frame (rich notebook output; pandas truncates long frames).
data

Unnamed: 0,hindi,english
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,-हटाओ रिक.,"Fuck them, Rick."
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,The thought reaching the eyes...,The thought reaching the eyes...
...,...,...
102317,हम यहाँ ऊपर की ओर से लड़ रहे हैं.,We're fighting uphill here.
102318,"अकेले एक साल, चलो!","A year alone, come on."
102319,"और जानती हैं, मेरी माँ ने हमें सिखाया...","And you know, my mother taught us ..."
102320,तुमनेमेरी पूरी ज़िंदगी गया .,Since I was a boy.


#### Data Preprocessing

In [None]:
# Ordered table of English contractions -> expansions, applied to lower-cased text with
# successive str.replace calls in dict (insertion) order.
# ORDER MATTERS: whole-word contractions must come before the generic suffix rules, otherwise
# "n't" fires first and turns "can't"/"won't" into "ca not"/"wo not".
en_short_forms_dict = {
    # --- whole-word contractions (must run first) ---
    "can't": "can not",
    "won't": "will not",
    "cannot": "can not",
    "isn't": "is not",
    "aren't": "are not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "hadn't": "had not",
    "haven't": "have not",
    "i'm": "i am",
    " y'all": " you all",
    "'em": "them",
    "fuckin'": "fucking",
    # --- generic suffix rules (run last) ---
    "'ll": " will",
    "'re": " are",
    "'ve": " have",
    "n't": " not",
    "'ii": " will",   # "'ii" is mapped like "'ll" (presumably a common mistyping of it)
}

# Lookup from each Devanagari digit to its ASCII counterpart (key order: 1-9, then 0).
hi_digits = dict(zip("१२३४५६७८९०", "1234567890"))

In [None]:
def check_valid_hindi_word(word):
  """Return True when `word` contains no ASCII letter (either case) and no ♪/♫ symbol.

  An empty string is considered valid. Digits and other symbols are not rejected.
  """
  lowercase = "abcdefghijklmnopqrstuvwxyz♪♫"
  rejected_chars = set(lowercase) | set(lowercase.upper())
  return not any(ch in rejected_chars for ch in word)

def remove_punctuations(sentence):
  """Return `sentence` with every character of the punctuation class below deleted.

  Apostrophes and backslashes are not in the class, so they are left untouched.
  """
  return re.sub(r'[@_!♫♪#$%^&*(.,)<>?/\|}{~:-]', '', sentence)


def hi_tokenizer(sentence):
  """Normalize, clean and tokenize a Hindi sentence; return the list of tokens.

  Steps: DevanagariNormalizer("hi", remove_nuktas=True) normalization, punctuation removal,
  Devanagari digits mapped to ASCII digits, indic_tokenize.trivial_tokenize, and finally every
  token rejected by check_valid_hindi_word is replaced by "<unk>".
  """
  cleaned = DevanagariNormalizer("hi", remove_nuktas=True).normalize(sentence)
  cleaned = remove_punctuations(cleaned)
  for hi_digit, ascii_digit in hi_digits.items():
    cleaned = cleaned.replace(hi_digit, ascii_digit)
  # tokens carrying Latin letters or music symbols are treated as unknown words
  return [
      token if check_valid_hindi_word(token) else "<unk>"
      for token in indic_tokenize.trivial_tokenize(cleaned)
  ]

def en_tokenizer(sentence):
  """Clean and tokenize an English sentence; return the list of token strings.

  Punctuation is removed first, the text is lower-cased, the contractions listed in
  en_short_forms_dict are expanded in dict order, and the spaCy tokenizer splits the result.
  """
  text = remove_punctuations(sentence).lower()
  for short_form, full_form in en_short_forms_dict.items():
    text = text.replace(short_form, full_form)
  return [token.text for token in nlp.tokenizer(text)]


In [None]:
def data_preprocessing(data):
  """Drop, in place, every row of `data` whose Hindi or English side tokenizes to nothing.

  Reads the "hindi" and "english" columns, runs hi_tokenizer / en_tokenizer on each pair and
  flags the row when either token list is empty. A temporary "remove_row" column carries the
  flags and is removed again before returning. Returns None; `data` itself is modified.
  """
  drop_flags = []
  for hi_sentence, en_sentence in zip(data["hindi"], data["english"]):
    hi_token_count = len(hi_tokenizer(hi_sentence))
    en_token_count = len(en_tokenizer(en_sentence))
    # a pair is unusable as soon as one side has no tokens left
    drop_flags.append(hi_token_count == 0 or en_token_count == 0)

  data["remove_row"] = drop_flags
  flagged_index = data.loc[data["remove_row"] == True].index
  data.drop(flagged_index, axis=0, inplace=True)
  data.drop(columns="remove_row", inplace=True)

In [None]:
# Remove empty sentence pairs from the training split (train_data is modified in place),
# then display the result. validation_data is not passed through this step.
data_preprocessing(train_data)
train_data

#### Save train and validation data to CSV files

In [None]:
# Persist both splits without the index column; each file starts with a "hindi,english" header row.
validation_data.to_csv("validation.csv", index=None)
train_data.to_csv("preprocessed_train.csv",index=None)  

#### Create Field and build vocab

In [None]:
# torchtext Fields for the source (Hindi) and target (English) sides. Both use the custom
# tokenizers defined above, share the same special tokens and lower-case their tokens.
hi_field = Field(tokenize = hi_tokenizer, init_token = "<sos>", eos_token = "<eos>",unk_token="<unk>", pad_token="<pad>",lower = True)
en_field = Field(tokenize = en_tokenizer,  init_token = "<sos>", eos_token = "<eos>",unk_token="<unk>", pad_token="<pad>", lower = True)

In [None]:
# Load the preprocessed training data; CSV column order is (hindi, english).
# skip_header=True is required because DataFrame.to_csv wrote a "hindi,english" header row,
# which would otherwise be tokenized and used as a training example.
train_dataset =  TabularDataset(path="preprocessed_train.csv",format="csv", fields=[("hi",hi_field),("en",en_field)], skip_header=True)

In [None]:
# Each Field builds its vocabulary from the training dataset it is attached to.
hi_field.build_vocab(train_dataset)   # Hindi vocabulary: unique Hindi tokens seen in the training data
en_field.build_vocab(train_dataset)   # English vocabulary: unique English tokens seen in the training data

In [None]:
# Vocabulary sizes define the encoder input dimension and the decoder output dimension.
input_size = len(hi_field.vocab)
output_size = len(en_field.vocab)
batch_size = 32  
# Batches are sorted by Hindi sentence length (sort_key) within each batch.
train_iter = BucketIterator(dataset = train_dataset , batch_size=batch_size,sort_within_batch=True, device=device, sort_key= lambda x:len(x.hi))

In [None]:
# Show the Hindi and English vocabulary sizes.
print(input_size,output_size)

## Model

#### Encoder

In [None]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source token ids and returns the LSTM's (hidden, cell) state.

    Args:
        input_size: size of the source (Hindi) vocabulary.
        embedding_size: embedding dimension.
        hidden_size: LSTM hidden dimension.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        # keep the constructor arguments available as plain attributes
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        # sub-modules (attribute names are part of the saved state_dict keys)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_sentence):
        """Encode `input_sentence` (token ids) and return the tuple (hidden, cell)."""
        embedded = self.dropout(self.embedding(input_sentence))
        # the per-step outputs are not needed, only the final states
        _, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


#### Decoder

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: previous token ids + states -> vocabulary scores + new states.

    Args:
        input_size: number of rows of the embedding table (target vocabulary size).
        embedding_size: embedding dimension.
        hidden_size: LSTM hidden dimension.
        output_size: number of scores produced per step by the linear layer.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability, used on the embeddings and between LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers,dropout):
        super().__init__()
        # keep the constructor arguments available as plain attributes
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        # sub-modules (attribute names are part of the saved state_dict keys)
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        `input` holds one token id per batch element; a leading length-1 sequence axis is added
        before the LSTM and removed again from the scores. Returns (scores, (hidden, cell)).
        """
        step_tokens = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_tokens))
        lstm_out, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        scores = self.linear(lstm_out).squeeze(0)
        return scores, (hidden, cell)

#### Seq2Seq

In [None]:
# Sequence-to-sequence wrapper tying an encoder and a decoder together
class Seq2Seq(nn.Module):
    """Encode the source batch once, then decode the target step by step.

    Args:
        encoder: module returning (hidden, cell) for a source batch.
        decoder: module with an `output_size` attribute; called as
            decoder(tokens, hidden, cell) -> (scores, (hidden, cell)).
        device: device on which the score tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,input_sentence, target_output_sentence, teacher_forcing_ratio = 0.5):
        """Return decoder scores of shape (target_len, batch_size, decoder.output_size).

        Row 0 is left as zeros because decoding starts from the first target row (the init
        token). At each later step the next decoder input is the ground-truth token with
        probability `teacher_forcing_ratio`, otherwise the decoder's own arg-max prediction.
        """
        batch_size = input_sentence.shape[1]
        target_len = target_output_sentence.shape[0]
        vocab_size = self.decoder.output_size

        # one score vector per target position and batch element
        step_scores = torch.zeros(target_len, batch_size, vocab_size).to(self.device)

        # the encoder's final states initialise the decoder
        hidden, cell = self.encoder(input_sentence)

        decoder_input = target_output_sentence[0]
        for step in range(1, target_len):
            scores, (hidden, cell) = self.decoder(decoder_input, hidden, cell)
            step_scores[step] = scores

            if random.random() < teacher_forcing_ratio:
                decoder_input = target_output_sentence[step]   # teacher forcing
            else:
                decoder_input = scores.argmax(1)               # feed back own prediction

        return step_scores

## Training

In [None]:
# Upper bound on sequence length; not referenced elsewhere in the visible notebook
# (the translate_sentence calls pass the literal 400 instead).
MAX_LENGTH =400

# hyperparameters
num_epochs = 20
learning_rate = 0.001
hidden_size = 512
embedding_size = 256           # same for both LSTMs (encoder and decoder)
dropout = 0.5
num_layers = 2

In [None]:
def hi_sentence2tensor(sentence):
    """Convert a raw Hindi sentence into a column tensor of vocabulary indices.

    The sentence is tokenized with hi_tokenizer, each token is looked up in the Hindi
    vocabulary, and the <sos>/<eos> indices are placed at the start/end. The result has
    shape (number_of_indices, 1) and lives on `device`.
    """
    token_to_index = hi_field.vocab.stoi
    body_indices = [token_to_index[token] for token in hi_tokenizer(sentence)]
    all_indices = [token_to_index["<sos>"], *body_indices, token_to_index["<eos>"]]
    return torch.tensor(all_indices).unsqueeze(1).to(device)

In [None]:
# Instantiate the encoder, decoder and the wrapping Seq2Seq model on `device`.
# The decoder's embedding table and its output layer are both sized by the English vocabulary.
encoder = Encoder(input_size,embedding_size,hidden_size,num_layers,dropout).to(device)
decoder = Decoder(output_size,embedding_size,hidden_size,output_size, num_layers,dropout).to(device)
model = Seq2Seq(encoder,decoder,device).to(device)

In [None]:
# Parameter initialisation
def init_weights(model):
    """Overwrite every parameter of `model` in place with samples from U(-0.08, 0.08)."""
    for parameter in model.parameters():
        nn.init.uniform_(parameter.data, -0.08, 0.08)
            
# NOTE(review): Module.apply invokes the callback on every sub-module as well as on the model
# itself, and init_weights already walks all nested parameters, so parameters are re-drawn
# several times. The end result is still U(-0.08, 0.08); init_weights(model) would likely suffice.
model.apply(init_weights)

In [None]:
def translate_sentence(model, sentence_tensor, hi_field, en_field, device, max_length=400):     
    """Greedily decode one encoded source sentence into an English string.

    Args:
        model: object exposing `encoder(tensor) -> (hidden, cell)` and
            `decoder(prev_tokens, hidden, cell) -> (scores, (hidden, cell))`.
            The caller is expected to have put it in eval mode.
        sentence_tensor: source token indices, as produced by hi_sentence2tensor.
        hi_field: unused; kept so existing call sites keep working.
        en_field: target field providing `vocab.stoi`, `vocab.itos`, `init_token`, `eos_token`.
        device: device on which the per-step decoder input is created.
        max_length: decoding stops once the index list (including <sos>) reaches this length.

    Returns:
        The predicted sentence with special tokens removed, light detokenization applied and
        no leading/trailing whitespace (removing "<eos>" used to leave stray spaces behind).
    """
    en_vocab = en_field.vocab
    eos_index = en_vocab.stoi[en_field.eos_token]
    predicted_word_indices = [en_vocab.stoi[en_field.init_token]]  # start from <sos>
    predicted_words = []

    with torch.no_grad():
        # encode once; the states are then threaded through every decoder step
        hidden, cell = model.encoder(sentence_tensor)

        while len(predicted_word_indices) < max_length and predicted_word_indices[-1] != eos_index:
            prev_word = torch.tensor([predicted_word_indices[-1]]).to(device)
            output, (hidden, cell) = model.decoder(prev_word, hidden, cell)
            _, best_word = output.data.topk(1)        # greedy choice
            best_index = best_word.item()             # plain int, safe for list indexing
            predicted_word_indices.append(best_index)
            predicted_words.append(en_vocab.itos[best_index])

    predicted_sentence = "".join(word + " " for word in predicted_words)
    translated_sentence = (
        predicted_sentence
        .replace("<sos> ", "")
        .replace("<eos>", "")
        .replace(" 's", "'s")
        .replace("[ ", "[")
        .replace(" ]", "]")
        .replace("( ", "(")
        .replace(" )", ")")
    )
    return translated_sentence.strip()

In [None]:
def train_iterator(model, iterator, criterion,encoder_optimizer,decoder_optimizer):
    """Run one optimisation pass over `iterator` and return the mean batch loss.

    Args:
        model: callable as model(source_batch, target_batch) -> scores of shape
            (target_len, batch_size, vocab_size).
        iterator: iterable of batches exposing `.hi` (source) and `.en` (target) tensors.
        criterion: loss applied to the flattened scores and targets.
        encoder_optimizer, decoder_optimizer: optimizers stepped after each batch.

    Returns:
        A 0-dim tensor holding the average loss over all batches (previously only the loss of
        the last batch was returned, which is a noisy estimate of the epoch loss).

    Raises:
        ValueError: if `iterator` yields no batches.
    """
    running_loss = 0.0
    num_batches = 0

    for batch in iterator:
        # Hindi sentences and their English counterparts
        input_sentence = batch.hi
        target_sentence = batch.en

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        predicted_sentence = model(input_sentence, target_sentence)

        # drop position 0 (the <sos> slot, never predicted) and flatten for the criterion
        predicted_sentence = predicted_sentence[1:].view(-1, predicted_sentence.shape[2])
        target_sentence = target_sentence[1:].view(-1)

        loss = criterion(predicted_sentence,target_sentence)
        loss.backward()

        # clip the gradients to a maximum total norm of 1
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # optimize the parameters according to the propagated loss
        encoder_optimizer.step()
        decoder_optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train_iterator received an iterator with no batches")
    return torch.tensor(running_loss / num_batches)

In [None]:
def train(model, iterator,num_epochs=num_epochs):
    """Train `model` for `num_epochs` passes over `iterator`, checkpointing after every epoch.

    The encoder is optimised with RMSprop and the decoder with Adam, both at the global
    `learning_rate`; padding positions are ignored by the cross-entropy loss. After each epoch
    the weights are written to "lstm_model.pt" and the loss returned by train_iterator is
    printed. The final line reports that loss averaged over the epochs (it used to be divided
    by the batch count of the global `train_iter`, which ignored the `iterator` argument).
    """
    total_loss = 0

    #initialize the optimizers and the criterion (loss function) to be used
    encoder_optimizer = optim.RMSprop(model.encoder.parameters(),lr=learning_rate)          #RMSProp for the encoder
    decoder_optimizer = optim.Adam(model.decoder.parameters(),lr=learning_rate)             #Adam for the decoder
    criterion = nn.CrossEntropyLoss(ignore_index = en_field.vocab.stoi[en_field.pad_token]) #<pad> targets do not contribute

    for epoch in range(num_epochs):
        # make sure dropout is active even if the model was switched to eval mode earlier
        model.train()

        loss = train_iterator(model,iterator,criterion,encoder_optimizer,decoder_optimizer)
        epoch_loss = loss.item()
        total_loss += epoch_loss
        torch.save(model.state_dict(),"lstm_model.pt" )
        print(f'\n\nEpoch: {epoch+1}/{num_epochs}     Loss: {epoch_loss:.4f}')  

    # max(..., 1) keeps the summary printable when num_epochs is 0
    print(f"\n\n Total loss ::: {total_loss/max(num_epochs, 1):.4f}")


In [None]:
# Train for 20 epochs (the literal matches the `num_epochs` hyperparameter defined above).
train(model, train_iter, num_epochs=20)

In [None]:
# Save the weights of the full model and, separately, of the encoder and the decoder.
torch.save(model.state_dict(),"lstm_model.pt" )
torch.save(encoder.state_dict(),"encoder.pt" )
torch.save(decoder.state_dict(),"decoder.pt" )

# Test on Validation set

In [None]:
# Reload the validation split that was written to disk earlier.
validation_data = pd.read_csv("validation.csv")

In [None]:
# Rebuild the wrapper around the existing encoder/decoder objects, restore the saved weights
# and switch to evaluation mode for inference.
model = Seq2Seq(encoder, decoder, device).to(device)
model.load_state_dict(torch.load('lstm_model.pt'))
model.eval()

In [None]:
# Empty frame that will hold each validation sentence and its predicted translation.
validation_predictions = pd.DataFrame(columns=["hindi","english"])

In [None]:
#predict for each hindi sentence in the validation set
# Rows are collected in a list and turned into a DataFrame once: DataFrame.append was removed
# in pandas 2.0 and copying the frame on every iteration is quadratic. Rebuilding the frame
# from scratch also makes this cell safe to re-run.
validation_rows = []
for sentence in validation_data["hindi"]:
  #tokenize and convert it into tensor
  sentence_tensor = hi_sentence2tensor(sentence)

  #get the translated sentence predicted by the trained model
  predicted_sentence = translate_sentence(model,sentence_tensor,hi_field,en_field,device,max_length=400)

  #make certain replacements
  predicted_sentence = predicted_sentence.replace(" 's","'s").replace("[ ","[").replace(" ]","]").replace("( ","(").replace(" )",")")

  validation_rows.append({"hindi":sentence,"english":predicted_sentence})

validation_predictions = pd.DataFrame(validation_rows, columns=["hindi","english"])

In [None]:
#write both predicted sentences and actual output into separate .txt files
# One sentence per line, no trailing newline after the last one. The context managers make
# sure the files are flushed and closed even if a write fails.
with open("hypotheses.txt","w") as file:
   file.write("\n".join(validation_predictions["english"]))

with open("references.txt","w") as file:
   file.write("\n".join(validation_data["english"]))

### Run evaluation.py script to generate bleu score and meteor score

In [None]:
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

# Read one sentence per line. The files are written with "\n" separators and no trailing
# newline, so splitting on "\n" recovers every sentence, including empty predictions.
# (The two files used to be read into the wrong variables, so the model output was scored as
# the reference and vice versa; BLEU and METEOR are not symmetric.)
with open("hypotheses.txt", 'r') as file1:
  hypotheses = file1.read().split("\n")
with open("references.txt", 'r') as file2:
  references = file2.read().split("\n")

if len(hypotheses) != len(references):
  raise ValueError(
      f"hypotheses.txt has {len(hypotheses)} lines but references.txt has {len(references)}; "
      "the files must be aligned line by line"
  )

total_num = len(references)   # always >= 1 because str.split returns at least one item
total_bleu_scores = 0
total_meteor_scores = 0
for reference, hypothesis in zip(references, hypotheses):
  # split() without arguments ignores repeated/trailing whitespace, so no empty tokens are scored
  total_bleu_scores+=sentence_bleu([reference.split()], hypothesis.split())
  # NOTE(review): plain strings match the NLTK release installed above (3.6.1); newer releases
  # appear to expect token lists here - confirm against the installed version.
  total_meteor_scores+=single_meteor_score(reference, hypothesis)

bleu_result = total_bleu_scores/total_num
meteor_result = total_meteor_scores/total_num

print("bleu score: ",bleu_result)
print("meteor score: ",meteor_result)

# Testing

In [None]:
# Load the Hindi sentences to translate for the final submission.
test_data = pd.read_csv("hindistatements.csv")

In [None]:
# Keep only the "hindi" column. NOTE(review): this rebinds test_data from a frame to a single
# column, so re-running this cell without re-running the load cell will likely fail.
test_data = test_data["hindi"]

In [None]:
# Empty frame that will hold each test sentence and its predicted translation.
predictions = pd.DataFrame(columns=["hindi","english"])

In [None]:
# Translate every test sentence. Rows are collected in a list and the DataFrame is built once
# at the end: DataFrame.append was removed in pandas 2.0 and is quadratic inside a loop.
# Rebuilding the frame from scratch also makes this cell safe to re-run.
test_rows = []
for i in range(len(test_data)):
  #get the hindi sentence
  sentence = test_data.iloc[i]

  #tokenize the sentence and convert it into tensor
  sentence_tensor = hi_sentence2tensor(sentence)

  #get the predicted translated sentence
  predicted_sentence = translate_sentence(model,sentence_tensor,hi_field,en_field,device,max_length=400)
  predicted_sentence = predicted_sentence.replace(" 's","'s").replace("[ ","[").replace(" ]","]").replace("( ","(").replace(" )",")")
  print(sentence,"\n",predicted_sentence,"\n\n")

  test_rows.append({"hindi":sentence,"english":predicted_sentence})

predictions = pd.DataFrame(test_rows, columns=["hindi","english"])

In [None]:
#write results into answer
# At most the first 5000 predictions are written, one per line, with no trailing newline.
# Slicing instead of indexing row 4999 keeps the output identical for 5000+ rows and no longer
# raises IndexError when fewer predictions are available.
with open("answer.txt","w") as file:
   file.write("\n".join(predictions["english"].iloc[:5000]))