# Import libraries and packages

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random
import re
import csv

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
device

device(type='cuda')

##### INDIC_NLP

In [None]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"
# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader
loader.load()
from indicnlp.tokenize import indic_tokenize 
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
from indicnlp.normalize.indic_normalize import BaseNormalizer
from indicnlp.normalize.indic_normalize import DevanagariNormalizer
from indicnlp.morph import unsupervised_morph 
from indicnlp import common

hi_analyzer=unsupervised_morph.UnsupervisedMorphAnalyzer('hi')
from indicnlp.tokenize import indic_detokenize  

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1271, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (68/68), done.[K
remote: Total 1271 (delta 50), reused 54 (delta 25), pack-reused 1178[K
Receiving objects: 100% (1271/1271), 9.56 MiB | 14.61 MiB/s, done.
Resolving deltas: 100% (654/654), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 31.21 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading https://files.pythonhosted.org/packages/39/e6/7afea30be2ee4d29ce9de0fa53acbb033163615f849515c0b1956ad074ee/Morfessor-2.0.6-py3-none-any.whl
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


##### SPACY

In [None]:
!python3 -m spacy download en
import spacy
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
!pip install -U nltk
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/5e/37/9532ddd4b1bbb619333d5708aaad9bf1742f051a664c3c6fa6632a105fd8/nltk-3.6.2-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 7.7MB/s 
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.6.2


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Prepare Data for the Model

### Load dataset
Add the data from train.csv into **data** as list of lists. Each element of this list **data** is again a list of sentences, the first sentence is the *hindi sentence*,  and the second sentence is the corresponding *english sentence*.

In [None]:
data = list()
with open('train.csv') as csvfile:
     spamreader = csv.reader(csvfile, delimiter=",")
     for row in spamreader:
         data.append([row[1], row[2]])
data.pop(0)

['hindi', 'english']

In [None]:
#split into train and validation data
train_data = data[:-1]               #The model has been trained on the entire dataset for the test phase
validation_data = data[90000:]

### Data Preprocessing

In [None]:
"""
The sentences in target language (English) contain many contracted verb forms.
So we can replace them with their respective full forms. This will some how reduce the vocabulary size.
For example 'will', 'she'll' and 'she' would be three different words if we use them as they are,
but on replacing the short forms, we will have only two words 'she' and 'will'.
Also we can replace the Devanagiri numerals by their respective Western Arabic numeral form.
"""
en_short_forms_dict ={"'ll":" will",
                      "'re":" are",
                      "i'm":"i am",
                      "'ve":" have",
                      "\'ve" :" have",
                      "\'s":"'s",
                      "\'ll":" will",
                      "\'re":" are",
                      "n\'t":"n't" ,
                      " y'all":" you all",
                       " i\'m":" i am",
                      "'em":"them",
                      "can't":"can not",
                      "won't":"will not",
                      "cannot":"can not",
                       "isn't" :"is not",
                       "aren't":"are not",
                      "wouldn't":"would not",
                      "shouldn't":"should not",
                      "couldn't":"could not",
                      "wasn't":"was not",
                      "weren't":"were not",
                      "hasn't":"has not",
                      "hadn't":"had not",
                      "haven't":"have not",
                      "'ii":" will",
                      "fuckin'":"funcking"   
                     }

hi_digits={ "१":"1",
            "२":"2",
            "३":"3",
            "४":"4",
            "५":"5",
            "६":"6",
            "७":"7",
            "८":"8",
            "९":"9",
            "०":"0"    
}

In [None]:
#the function checks if the hindi word contains any english or special charaters
def check_valid_hindi_word(word):
  alpha_num = "abcdefghijklmnopqrstuvwxyz♪♫"
  for c in word:
    if c in alpha_num or c in alpha_num.upper():
        return False
  return True


#the function is used for removing certain punctuations
def remove_punctuations(sentence):
  regex = re.compile(r'[@_!♫♪#$%^&*(.,)<>?/\|}{~:;-]')
  sentence = regex.sub('',sentence)
  return sentence


#the function is used for tokenizing the hindi sentences
def hi_tokenizer(sentence):
  normalizer = DevanagariNormalizer("hi", remove_nuktas=True)
  sentence = normalizer.normalize(sentence)               #normalizes the sentence
  sentence = remove_punctuations(sentence)                #removes punctuations
  for k in hi_digits:
      if k in sentence:
          sentence = sentence.replace(k,hi_digits[k])     #replaces the Devanagiri digits with its Western Arabic form
  hi_tokens = indic_tokenize.trivial_tokenize(sentence)   #tokenization
  for index,token in enumerate(hi_tokens):  
    if not check_valid_hindi_word(token):   
      hi_tokens[index] = "<unk>"                          #replace with <unk> if the word contains irrelevant characters
  return hi_tokens


#the function is used for tokenizing the english sentences
def en_tokenizer(sentence):
  sentence = remove_punctuations(sentence)                      #removes punctuations
  sentence = sentence.lower()   
  for key in en_short_forms_dict:
    sentence= sentence.replace(key, en_short_forms_dict[key])   #replaces short forms with their full forms
    
  li = indic_tokenize.trivial_tokenize(sentence)                #tokenization

  #indic tokenizes the word "what's" as ['what',"'",'s']. So the following code changes it into ['what',"'s"]
  for i in range(len(li)-1):
     if li[i]=="'" and li[i+1]=="s":
        li[i+1] = "'s"
  while "'" in li:
     li.remove("'")
     
  return li


In [None]:
class Lang:
   def __init__(self):
     self.word2index = {"<unk>":0 , "<sos>":1, "<eos>":2, "<pad>":3}
     self.index2word = {0:"<unk>" , 1:"<sos>", 2:"<eos>", 3:"<pad>"}
     self.vocab_size = len(self.word2index)
     self.word_count = {"<unk>":1 , "<sos>":1, "<eos>":1, "<pad>":1}

   def add_to_vocab(self,token_list):
     for token in token_list:
        if token not in self.word2index:        #add to vocab only if its not already present 
           ind = len(self.word2index)
           self.word2index[token] = ind
           self.index2word[ind] = token          
           self.vocab_size += 1                 #increment the vocab_size
           self.word_count[token] = 1
        else:
           self.word_count[token] += 1   
   

   def tokens2tensor(self,token_list):
      token_indices = list()
      #for each token, append its index as per the built vocabulary.
      #If token is not present in the vocab, append the index of <unk> 
      for token in token_list:
          if token in self.word2index:
              token_indices.append(self.word2index[token])
          else:
              token_indices.append(self.word2index["<unk>"])

      #convert the token_indices into tensor              
      sentence_tensor = torch.tensor(token_indices).unsqueeze(1).to(device)
      
      return sentence_tensor

In [None]:
def data_preprocessing(data):
  remove_rows = list()
  english = Lang()    #object of Lang class for english 
  hindi = Lang()      #object of Lang class for hindi

  for index in range(len(data)):
    hi_sentence = data[index][0]
    en_sentence = data[index][1]    

    #tokenize the sentences and calculate the number of tokens in each sentence
    hi_tokens = hi_tokenizer(hi_sentence)
    no_of_hi_tokens = len(hi_tokens)
    en_tokens = en_tokenizer(en_sentence)
    no_of_en_tokens = len(en_tokens)

    #remove pairs if any of the sentences have with 0 length after removing punctuations or contains all <unk>
    if (no_of_hi_tokens==0 or no_of_en_tokens==0) or hi_tokens.count("<unk>")== no_of_hi_tokens :
        remove_rows.append([hi_sentence,en_sentence])
    else:
        hindi.add_to_vocab(hi_tokens)     #add tokens to hindi vocab 
        english.add_to_vocab(en_tokens)   #add tokens ro english vocab

  
  print(f"No of sentence before :: {len(data)}")
  for row in remove_rows:
    data.remove(row)
  print(f'No of sentence after :: {len(data)}')
  return data, hindi,english

In [None]:
train_data, hindi,english = data_preprocessing(train_data)

No of sentence before :: 102321
No of sentence after :: 101493


In [None]:
print(f"Hindi vocab size : {hindi.vocab_size} \nEnglish vocab size : {english.vocab_size}")

Hindi vocab size : 41722 
English vocab size : 32954


### Create Batches

Each sentence of train_data is first tokenized according to the defined tokenization functions for each of the languages to get a list of tokens. The *init_token* token is then appended at the start and *eos_token*  token is appended at the end of the token lists. A dictionary *temp_dict* stores these token lists for each pair along with the number of tokens. This dictionary is then sorted according to the number of tokens in *hindi* sentences.  The sorted data is then used for creating batches. The advantage of sorting the data is that minimum padding would be required while creating the batches.

In [None]:
input_size = hindi.vocab_size
output_size = english.vocab_size
batch_size = 32  

In [None]:
print(input_size,output_size)

41722 32954


In [None]:
def sort_sentences(data, sort_key):
    data_list = list()
    for pair in data:
        hi_sentence , en_sentence =  pair

        #append <sos> as the first token and <eos> as the last token
        hi_tokens = ['<sos>'] + hi_tokenizer(hi_sentence) + ['<eos>']
        en_tokens = ['<sos>'] + en_tokenizer(en_sentence) + ['<eos>']
        
        temp_dict = {"hi_tokens": hi_tokens, "en_tokens": en_tokens, "hi_len":len(hi_tokens), "en_len":len(en_tokens)}
        data_list.append(temp_dict)

    #add extra sentences with one <unk> token just to make sure size of each batch equals batch_size.
    #Even these sentences should include <sos> at the start and <eos> at the end 
    while len(data_list) % batch_size != 0:
        temp_dict = {"hi_tokens": ["<sos>","<unk>","<eos>"], "en_tokens": ["<sos>","<unk>","<eos>"], "hi_len": 3, "en_len": 3}
        data_list.insert(0,temp_dict)
   
    #sort according to length of hindi sentence
    sorted_data = sorted(data_list, key= lambda x: x["hi_len"])
    return sorted_data

sorted_data = sort_sentences(train_data,"hi_len")

In [None]:
def create_batches(sorted_data,batch_size):     
    batch_list = list()

    #divide into batches
    for i in range(0,len(sorted_data),batch_size):     
        batch = sorted_data[i : i+batch_size]

        #get the maximum length of sentences in the batch 
        hi_max_len = max(batch, key= lambda x: x["hi_len"])["hi_len"]
        en_max_len = max(batch, key= lambda x: x["en_len"])["en_len"]
        
        temp_batch = list()
        for item in batch:
            hi_tokens, en_tokens, hi_len, en_len = item.values()
            
            #add <pad> tokens at the end of sentence (after <eos> token) to make sure 
            #each sentence in the batch has same length
            if hi_len < hi_max_len:
                padding = ["<pad>"]*(hi_max_len-len(hi_tokens))
                item["hi_tokens"] = hi_tokens + padding
                
            if en_len < en_max_len:
                padding = ["<pad>"]*(en_max_len-len(en_tokens))
                item["en_tokens"] = en_tokens + padding

            #convert token list into tensor    
            hi_tensor = hindi.tokens2tensor(item["hi_tokens"])
            en_tensor = english.tokens2tensor(item["en_tokens"])

            #add the resultant tensors of both the languages into the batch
            temp_item = [hi_tensor, en_tensor]
            temp_batch.append(temp_item)
        
        #concat tensors such that the first row contains <sos> token of all sentences, 
        #2nd row contains the 1st word of all the sentences, and so on.
        #In other words, ith column is for ith sentence of the batch
        batch_hi = torch.cat([x[0] for x in temp_batch], dim=1)
        batch_eng = torch.cat([x[1] for x in temp_batch], dim=1)
        
        #add the batch tensors into batch_list
        batch_list.append([batch_hi, batch_eng])
            
    return batch_list
        

In [None]:
batch_list = create_batches(sorted_data, batch_size)

#shuffle the batches
random.shuffle(batch_list)

# Model

#### Encoder

In [None]:
class EncoderGRU(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers, dropout):
        super(EncoderGRU,self).__init__()
        self.hidden_size = hidden_size          
        self.input_size = input_size            #size of hindi vocab, i.e., unique tokens in hindi sentences 
        self.embedding_size = embedding_size    #embedding dimension 
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size,n_layers , dropout=dropout,bidirectional=False)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input_sentence):
        
        embedding = self.embedding(input_sentence)   
        embedding = self.dropout(embedding) 
        output, hidden= self.gru(embedding)          #gru returns 2 values: output and hidden state 
        '''
         embedding.shape = [sentence_length, batch_size, embedding_size]
         input_sentence.shape = [sentence_length,batch_size]
         output.shape = [sentence_length,batch_size, n_directions * hidden_size)
         hidden.shape = [n_directions*n_layers,batch_size,hidden_size]
        '''
        return output,hidden                        


#### Decoder

In [None]:
class DecoderGRU(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers,dropout):
        super(DecoderGRU,self).__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.embedding_size = embedding_size  
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size,n_layers , dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size,output_size)

    def forward(self, input, hidden):
        #input.shape = [batch_size]           
        input = input.unsqueeze(0)                      #added 1 dimension, input.shape = [1,batch_size]  
        embedding = self.embedding(input)               #embedding.shape = [1, batch_size , embedding_size]
        embedding = self.dropout(embedding)     
        output, hidden = self.gru(embedding,hidden)  
        '''
         output.shape = [sentence_length,batch_size, n_directions * hidden_size)
         hidden.shape = [n_directions*n_layers,batch_size,hidden_size]
        '''   
        predicted_output = self.linear(output)         
        predicted_output = predicted_output.squeeze(0)   
        return predicted_output,hidden

#### Seq2Seq

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self,input_size,output_size,embedding_size,hidden_size,n_layers,dropout, device):
        super().__init__()
        self.encoder = EncoderGRU(input_size,embedding_size,hidden_size,n_layers,dropout).to(device)
        self.decoder = DecoderGRU(output_size,embedding_size,hidden_size,output_size, n_layers,dropout).to(device)
        self.device = device
        
    def forward(self,input_sentence, target_output_sentence, teacher_forcing_ratio = 0.5):
        input_sen_len = input_sentence.shape[0]
        batch_size = input_sentence.shape[1]
        output_sen_len = target_output_sentence.shape[0]
        output_vocab_size = self.decoder.output_size
        
        #tensor to store predictions by the decoder
        predicted_word_indexes = torch.zeros(output_sen_len, batch_size, output_vocab_size).to(self.device)

        #pass the input hindi sentence into the encoder 
        output, hidden = self.encoder(input_sentence)

        decoder_input = target_output_sentence[0]  #first input to the decoder is always the init_token, i.e., <sos> token
        
        for i in range(1,output_sen_len):
            #pass the previous word along with the hidden and cell states of encoder into the decoder
            output,hidden = self.decoder(decoder_input, hidden)

            #append the next prediction
            predicted_word_indexes[i] = output

            use_teacher_forcing = random.random() < teacher_forcing_ratio
            best_word = output.argmax(1)
            if use_teacher_forcing:
              decoder_input = target_output_sentence[i]
            else:
              decoder_input = best_word

        return predicted_word_indexes

# Training phase

In [None]:
MAX_LENGTH =400

#hyperparameters
num_epochs = 20
learning_rate = 0.001
hidden_size = 512
embedding_size = 256           #same for both grus (encoder and decoder)
dropout = 0.5
n_layers = 2

In [None]:
#initialize the object of Seq2Seq class
model = Seq2Seq(input_size,output_size,embedding_size,hidden_size,n_layers,dropout, device).to(device)

The parameter initialization has been done as per the paper :  https://arxiv.org/pdf/1409.3215.pdf

In [None]:
#initialize the parameters
def init_weights(model):
    for name, parameter in model.named_parameters():
        nn.init.uniform_(parameter.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): EncoderGRU(
    (embedding): Embedding(41722, 256)
    (gru): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): DecoderGRU(
    (embedding): Embedding(32954, 256)
    (gru): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (linear): Linear(in_features=512, out_features=32954, bias=True)
  )
)

In [None]:
def train_batch_list(model, batch_list, criterion,encoder_optimizer,decoder_optimizer):
    for batch in batch_list:
        #get hindi and their corresponding english sentences from the batch
        input_sentence = batch[0]
        target_sentence = batch[1]
      
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()    

        #pass the hindi and their corresponding english sentences into the model to get the predicted sentence     
        predicted_sentence = model(input_sentence, target_sentence) 

        #adjust the shapes
        predicted_sentence = predicted_sentence[1:].view(-1, predicted_sentence.shape[2])
        target_sentence = target_sentence[1:].view(-1)

        #calculate loss    
        loss = criterion(predicted_sentence,target_sentence)
        
        #backpropagate loss
        loss.backward()

        #clip the gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        #opitimize the parameters according to the propagated loss
        encoder_optimizer.step()
        decoder_optimizer.step()
    return loss 

In [None]:
def train(model, batch_list,num_epochs=num_epochs):
    total_loss = 0

    #initialize the optimizer and the criterion(Loss function) to be used
    encoder_optimizer = optim.Adam(model.encoder.parameters(),lr=learning_rate)       #using Adam optimizer for encoder 
    decoder_optimizer = optim.Adam(model.decoder.parameters(),lr=learning_rate)       #using Adam optimizer for decoder
    criterion = nn.CrossEntropyLoss(ignore_index = english.word2index["<pad>"])       #using CrossEntropyLoss function
    

    for epoch in range(num_epochs):

        #calculate loss for epoch
        loss = train_batch_list(model,batch_list,criterion,encoder_optimizer,decoder_optimizer)
        total_loss += loss.item()

        #save model
        torch.save(model.state_dict(),"gru_model_final.pt")
        
        print(f'\n\nEpoch: {epoch+1}/{num_epochs}     Loss: {loss.item():.4f}')  

    print(f"\n\n Total loss ::: {total_loss/len(batch_list):.4f}")


In [None]:
train(model,batch_list, num_epochs=num_epochs)



Epoch: 1/20     Loss: 4.7025


Epoch: 2/20     Loss: 4.0282


Epoch: 3/20     Loss: 3.9276


Epoch: 4/20     Loss: 3.5618


Epoch: 5/20     Loss: 2.9035


Epoch: 6/20     Loss: 3.1232


Epoch: 7/20     Loss: 3.0042


Epoch: 8/20     Loss: 3.0170


Epoch: 9/20     Loss: 2.8822


Epoch: 10/20     Loss: 2.8076


Epoch: 11/20     Loss: 2.5878


Epoch: 12/20     Loss: 2.6990


Epoch: 13/20     Loss: 2.5750


Epoch: 14/20     Loss: 2.4953


Epoch: 15/20     Loss: 2.3794


Epoch: 16/20     Loss: 2.2575


Epoch: 17/20     Loss: 2.4298


Epoch: 18/20     Loss: 2.5494


Epoch: 19/20     Loss: 2.2929


Epoch: 20/20     Loss: 2.1526


 Total loss ::: 0.0184


In [None]:
#save model
torch.save(model.state_dict(),"gru_model_final.pt" )

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


# Test on Test set

In [None]:
#load the saved model
# model = Seq2Seq(input_size,output_size,embedding_size,hidden_size,n_layers,dropout, device).to(device)
# model.load_state_dict(torch.load("/content/drive/MyDrive/gru_model_final.pt",map_location=torch.device(device)))
model.eval()

Seq2Seq(
  (encoder): EncoderGRU(
    (embedding): Embedding(41722, 256)
    (gru): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): DecoderGRU(
    (embedding): Embedding(32954, 256)
    (gru): GRU(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
    (linear): Linear(in_features=512, out_features=32954, bias=True)
  )
)

In [None]:
def translate_sentence(model, hi_sentence, hindi, english, device, max_length=400):
    #tokenize the hindi sentence
    hi_tokens = hi_tokenizer(hi_sentence)

    #convert it into tensor
    sentence_tensor = hindi.tokens2tensor(hi_tokens)   

    with torch.no_grad():
        #pass the source sentence into the encoder to get the hidden and cell states
        output,hidden = model.encoder(sentence_tensor)

    predicted_word_indices = [english.word2index["<sos>"]]              #index of <sos> in english vocab
    predicted_sentence = ""

    #repeat until the len of predicted sentence is less than max_length or the decoder predicts <eos>
    while len(predicted_word_indices)<max_length and predicted_word_indices[-1]!= english.word2index["<eos>"]:
        prev_word = [predicted_word_indices[-1]]             
        prev_word = torch.tensor(prev_word).to(device)                  #convert into tensor
        best_word = ""
        with torch.no_grad():
            '''
             pass the last predicted word along with the hidden and cell state of the encoder
             into the decoder to get the next predicted word
            '''
            output,hidden = model.decoder(prev_word, hidden)
            _ ,best_word = output.data.topk(1)                          #get the best predicted word index
          
        predicted_word_indices.append(best_word.item())                 #append it to the list of predicted word indices
        predicted_sentence += english.index2word[best_word.item()]+" "  #append the word corresponding to the predicted index

    translated_sentence = predicted_sentence.replace("<sos>","").replace("<eos>","")
    return translated_sentence


In [None]:
test_data = list()
with open('testhindistatements.csv') as csvfile:
     spamreader = csv.reader(csvfile, delimiter=",")
     for row in spamreader:
         test_data.append(row[2])
test_data.pop(0)

'hindi'

In [1]:
# test_data 

In [None]:
predictions = list()

In [2]:
for i in range(len(test_data)):
  #get the hindi sentence
  sentence = test_data[i]

  #get the predicted translated sentence
  predicted_sentence = translate_sentence(model,sentence,hindi,english,device,max_length=400)
  
  #print(sentence,"\n",predicted_sentence,"\n\n")

  #append the results
  predictions.append(predicted_sentence)

In [None]:
#write results into answer
file = open("answer.txt","w")
for x in predictions[:-1]:
   file.write(x)
   file.write("\n")
file.write(predictions[-1])
file.close()