In [20]:
!pip3 install indic-nlp-library

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [21]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
from indicnlp.tokenize import indic_tokenize
import torch.nn.functional as F

print("All imports successful")

All imports successful


In [22]:
data = pd.read_csv("/kaggle/input/language-dataset/Sentence pairs in English-Hindi - 2025-02-11.tsv",
                  sep="\t",header=None,names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [23]:
data.head()

Unnamed: 0,SrcSentID,SrcSent,DstSentID,DstSent
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [24]:
data.drop(labels=[data.columns[0],data.columns[2]],axis=1,inplace=True)

In [25]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [26]:
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

In [27]:
data["SrcSent"] = data["SrcSent"].apply(lambda x: src_sent_tokenizer.tokenize(x))

In [28]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की हो गई है।
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की है।
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,"[▁That, ▁won, ', t, ▁happen, .]",वैसा नहीं होगा।
4,"[▁I, ▁miss, ▁you, .]",मुझें तुम्हारी याद आ रही है।


In [29]:
data["DstSent"] = data["DstSent"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [30]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [31]:
Vs = src_sent_tokenizer.get_vocab()

In [32]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [33]:
hindi_vocab = set()

for tokenized_hindi_sent in data["DstSent"]:
    hindi_vocab.update(tokenized_hindi_sent)

In [34]:
Vd = dict()
for idx, token in enumerate(hindi_vocab):
    Vd[token] = idx + 3
Vd["<PAD>"] = 0
Vd["<SOS>"] = 1
Vd["<EOS>"] = 2

In [35]:
hindi_idx2vocab = dict(zip(Vd.values(), Vd.keys()))
print(hindi_idx2vocab)

{3: 'वयस्क', 4: 'डरी', 5: 'राज्य', 6: 'भरा', 7: 'प्रयोग', 8: 'टिकटें', 9: 'रबड़', 10: 'तम्बू', 11: 'एतराज़', 12: 'बदकिस्मती', 13: 'पुराणों', 14: 'नेहा', 15: 'जिसका', 16: 'लाइका', 17: 'खुले', 18: 'अधिकांश', 19: 'रच', 20: 'रॉबर्ट', 21: 'शाकारी', 22: 'जुर्म', 23: 'स्पैनिश', 24: 'गाऊँगी', 25: 'तरफ', 26: 'प्रोफेसर', 27: 'सूज', 28: 'मिटाइए', 29: 'गिर', 30: 'उड़ने', 31: 'मनोचिकित्सा', 32: '€100', 33: 'नागरिक', 34: 'सेंटर', 35: 'पड़ेगी', 36: 'शुरू', 37: 'लिखीं', 38: 'सभ्यताओं', 39: 'टूट', 40: 'चुपचाप', 41: 'झटके', 42: 'आत्मा', 43: 'ग़ुस्से', 44: 'पुआ', 45: 'लाइए', 46: 'नये', 47: 'वरदान', 48: 'दौलत', 49: 'नौसैनिक', 50: 'इंक़िलाब', 51: 'रेपुबलिका', 52: 'पसंद', 53: 'मूँछ', 54: 'फिराया', 55: 'मरोगे', 56: 'धुलने', 57: 'आगे', 58: 'Tatoeba', 59: 'सीढ़ियाँ', 60: 'हिस्सो', 61: 'कमा', 62: 'चंद', 63: 'बजाने', 64: 'फेंक', 65: 'आख़िरकार', 66: 'झेल', 67: 'छूट', 68: 'मिठाईयाँ', 69: 'झूठी', 70: 'गिलहरी', 71: 'मनमोहक', 72: 'सूखा', 73: 'तोह्फ़ा', 74: 'तारीफ़', 75: 'सुपर', 76: 'परिस्तिथि', 77: 'ग्रामवासी', 78: '

In [36]:
def convert_hindi_tokens_to_ids(hindi_sent):
    return [Vd[token] for token in hindi_sent]

In [37]:
data["DstSent"] = data["DstSent"].apply(lambda x: convert_hindi_tokens_to_ids(x))

In [38]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[1596, 2130, 4593, 6123, 2582, 4761, 4937, 663..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[1596, 2130, 4593, 6123, 2582, 6630, 6342]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[1638, 6378, 4596, 3798, 6512, 6808, 1913, 313..."
3,"[466, 751, 31, 17, 1837, 5]","[3761, 570, 6552, 6342]"
4,"[27, 3041, 25, 5]","[2705, 5685, 1184, 753, 1820, 6630, 6342]"


In [39]:
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids

In [40]:
data["DstSentInput"] = data["DstSent"].apply(lambda x: insert_sos_token_id(x))

In [41]:
def insert_eos_token_id(hindi_sent_token_ids):
    return hindi_sent_token_ids + [2]

In [42]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x: insert_eos_token_id(x))

In [43]:
data.head()

Unnamed: 0,SrcSent,DstSent,DstSentInput,DstSentLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[1596, 2130, 4593, 6123, 2582, 4761, 4937, 663...","[1, 1596, 2130, 4593, 6123, 2582, 4761, 4937, ...","[1596, 2130, 4593, 6123, 2582, 4761, 4937, 663..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[1596, 2130, 4593, 6123, 2582, 6630, 6342]","[1, 1596, 2130, 4593, 6123, 2582, 6630, 6342]","[1596, 2130, 4593, 6123, 2582, 6630, 6342, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[1638, 6378, 4596, 3798, 6512, 6808, 1913, 313...","[1, 1638, 6378, 4596, 3798, 6512, 6808, 1913, ...","[1638, 6378, 4596, 3798, 6512, 6808, 1913, 313..."
3,"[466, 751, 31, 17, 1837, 5]","[3761, 570, 6552, 6342]","[1, 3761, 570, 6552, 6342]","[3761, 570, 6552, 6342, 2]"
4,"[27, 3041, 25, 5]","[2705, 5685, 1184, 753, 1820, 6630, 6342]","[1, 2705, 5685, 1184, 753, 1820, 6630, 6342]","[2705, 5685, 1184, 753, 1820, 6630, 6342, 2]"


In [44]:
data.drop(labels=[data.columns[1]],axis=1,inplace=True)

In [45]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [46]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_label]

In [47]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [48]:
Ns = X_padded.shape[1]
Nd = Y_padded_label.shape[1]

In [49]:
class Encoder(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,word_embedding_dim):
        super(Encoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)

    def forward(self,X_padded_mini_batch):

        first_embedding_layer_out = self.first_embedding_layer(X_padded_mini_batch)
        encoder_output, (final_encoder_output,final_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_output, (final_encoder_output,final_cell_state)

In [50]:
class Decoder(torch.nn.Module):

    def __init__(self,dst_lang_vocab_size,word_embedding_dim):
        super(Decoder,self).__init__()

        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)
        self.prediction_layer = torch.nn.Linear(in_features=word_embedding_dim,out_features=dst_lang_vocab_size)
        #self.prediction_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,Y_padded_input_mini_batch,final_encoder_output,final_cell_state):

        first_embedding_layer_out = self.first_embedding_layer(Y_padded_input_mini_batch)
        decoder_lstm_layer_out, (final_decoder_lstm_layer_out, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out,
                                                                                                         (final_encoder_output,
                                                                                                          final_cell_state))
        prediction = self.prediction_layer(decoder_lstm_layer_out)
        
        return prediction, (final_decoder_lstm_layer_out, final_cell_state)

In [51]:
class Seq2SeqEncDec(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,dst_lang_vocab_size,word_embedding_dim):
        super(Seq2SeqEncDec,self).__init__()

        self.encoder = Encoder(src_lang_vocab_size,word_embedding_dim)
        self.decoder = Decoder(dst_lang_vocab_size,word_embedding_dim)

    def forward(self,X_padded_mini_batch,Y_padded_input_mini_batch):

        encoder_output, (final_encoder_output,final_cell_state) = self.encoder(X_padded_mini_batch)
        y_hat_mini_batch = self.decoder(Y_padded_input_mini_batch,
                                        final_encoder_output,final_cell_state)

        return y_hat_mini_batch

In [52]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [53]:
X_padded_train = X_padded[0:13000]
Y_padded_input_train = Y_padded_input[0:13000]
Y_padded_label_train = Y_padded_label[0:13000]

X_padded_test = X_padded[13000:]
Y_padded_input_test = Y_padded_input[13000:]
Y_padded_label_test = Y_padded_label[13000:]

In [54]:
network = Seq2SeqEncDec(len(Vs),len(Vd),128).to(device)

In [55]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(network.parameters())
num_epochs = 250
mb_size = 65

for epoch in range(num_epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_mb = Y_padded_input_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_padded_label_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_label_mb.reshape(Y_label_mb.shape[0]*Y_label_mb.shape[1],)
        
        X_train_mb, Y_input_mb, Y_label_mb = X_train_mb.to(device), Y_input_mb.to(device), Y_label_mb.to(device)

        y_hat_train_mb = network(X_train_mb,Y_input_mb)
        y_hat_train_mb = y_hat_train_mb[0]
        y_hat_train_mb = y_hat_train_mb.reshape(y_hat_train_mb.shape[0]*y_hat_train_mb.shape[1],
                                                y_hat_train_mb.shape[2])

        loss_fn_value = loss_fn(y_hat_train_mb,Y_label_mb)

        loss_fn_value.backward()
        #torch.nn.utils.clip_grad_norm_(network.parameters(),max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        print("Epoch # {}, Time Step # {}, Loss Value = {}".format(epoch,i,loss_fn_value))

Epoch # 0, Time Step # 0, Loss Value = 8.888349533081055
Epoch # 0, Time Step # 1, Loss Value = 8.840303421020508
Epoch # 0, Time Step # 2, Loss Value = 8.807218551635742
Epoch # 0, Time Step # 3, Loss Value = 8.795755386352539
Epoch # 0, Time Step # 4, Loss Value = 8.775224685668945
Epoch # 0, Time Step # 5, Loss Value = 8.724142074584961
Epoch # 0, Time Step # 6, Loss Value = 8.725821495056152
Epoch # 0, Time Step # 7, Loss Value = 8.6615629196167
Epoch # 0, Time Step # 8, Loss Value = 8.643916130065918
Epoch # 0, Time Step # 9, Loss Value = 8.598362922668457
Epoch # 0, Time Step # 10, Loss Value = 8.512667655944824
Epoch # 0, Time Step # 11, Loss Value = 8.464422225952148
Epoch # 0, Time Step # 12, Loss Value = 8.321881294250488
Epoch # 0, Time Step # 13, Loss Value = 8.346323013305664
Epoch # 0, Time Step # 14, Loss Value = 8.181441307067871
Epoch # 0, Time Step # 15, Loss Value = 8.101869583129883
Epoch # 0, Time Step # 16, Loss Value = 7.936958312988281
Epoch # 0, Time Step # 17,

In [56]:
torch.save(network.state_dict(),"model.pth")
print("model path downloaded")

model path downloaded


In [65]:
def generate_translation(eng_sentence):

    tokenized_eng_sentence = src_sent_tokenizer.tokenize(eng_sentence)
    token_ids = src_sent_tokenizer.convert_tokens_to_ids(tokenized_eng_sentence)
    token_ids_tensor = torch.tensor(token_ids)
    token_ids_tensor = torch.unsqueeze(token_ids_tensor,0)

    if torch.cuda.is_available():
        device = torch.device("cuda")
        token_ids_tensor = token_ids_tensor.to(device)

    encoder_outputs,(final_encoder_output,final_candidate_cell_state) = network.encoder(token_ids_tensor)
    decoder_first_time_step_input = torch.tensor([[1]])

    if torch.cuda.is_available():
        encoder_outputs = encoder_outputs.to(device)
        final_encoder_output = final_encoder_output.to(device)
        final_candidate_cell_state = final_candidate_cell_state.to(device)
        decoder_first_time_step_input = decoder_first_time_step_input.to(device)

    decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(decoder_first_time_step_input,
                                                                          final_encoder_output,
                                                                          final_candidate_cell_state)

    generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
    generated_token_id = torch.unsqueeze(generated_token_id,1)

    hindi_translated_sentence = str()
    hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]

    if torch.cuda.is_available():
        generated_token_id = generated_token_id.to(device)
        hidden_decoder_output = hidden_decoder_output.to(device)
        hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)
        
    for i in range(Nd-1):
        
        decoder_first_time_step_output, (hidden_decoder_output, hidden_decoder_cell_state) = network.decoder(generated_token_id,
                                                                                                hidden_decoder_output,
                                                                                                hidden_decoder_cell_state)
        generated_token_id = torch.argmax(F.softmax(decoder_first_time_step_output[:,0,:],dim=1),1)
        generated_token_id = torch.unsqueeze(generated_token_id,1)

        if torch.cuda.is_available():
            generated_token_id = generated_token_id.to(device)
            hidden_decoder_output = hidden_decoder_output.to(device)
            hidden_decoder_cell_state = hidden_decoder_cell_state.to(device)

        if generated_token_id.item() == Vd["<EOS>"]:
            break

        hindi_translated_sentence += " " + hindi_idx2vocab[generated_token_id.item()]

    return hindi_translated_sentence