In [196]:
!pip3 install indic-nlp-library

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [197]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
from indicnlp.tokenize import indic_tokenize

In [198]:
data = pd.read_csv("/kaggle/input/language-dataset/Sentence pairs in English-Hindi - 2025-02-11.tsv",
                  sep="\t",header=None,names=["SrcSentID","SrcSent","DstSentID","DstSent"])

In [199]:
data.head()

Unnamed: 0,SrcSentID,SrcSent,DstSentID,DstSent
0,1282,Muiriel is 20 now.,485968,म्यूरियल अब बीस साल की हो गई है।
1,1282,Muiriel is 20 now.,2060319,म्यूरियल अब बीस साल की है।
2,1294,Education in this world disappoints me.,485564,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,1302,That won't happen.,2060320,वैसा नहीं होगा।
4,1308,I miss you.,2060321,मुझें तुम्हारी याद आ रही है।


In [200]:
data.drop(labels=[data.columns[0],data.columns[2]],axis=1,inplace=True)

In [201]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,Muiriel is 20 now.,म्यूरियल अब बीस साल की हो गई है।
1,Muiriel is 20 now.,म्यूरियल अब बीस साल की है।
2,Education in this world disappoints me.,मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,That won't happen.,वैसा नहीं होगा।
4,I miss you.,मुझें तुम्हारी याद आ रही है।


In [202]:
src_sent_tokenizer = AutoTokenizer.from_pretrained("google-T5/T5-base")

In [203]:
data["SrcSent"] = data["SrcSent"].apply(lambda x: src_sent_tokenizer.tokenize(x))

In [204]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की हो गई है।
1,"[▁Mu, i, riel, ▁is, ▁20, ▁now, .]",म्यूरियल अब बीस साल की है।
2,"[▁Education, ▁in, ▁this, ▁world, ▁disappoint, ...",मैं इस दुनिया में शिक्षा पर बहुत निराश हूँ।
3,"[▁That, ▁won, ', t, ▁happen, .]",वैसा नहीं होगा।
4,"[▁I, ▁miss, ▁you, .]",मुझें तुम्हारी याद आ रही है।


In [205]:
data["DstSent"] = data["DstSent"].apply(lambda x: indic_tokenize.trivial_tokenize(x,lang="hi"))

In [206]:
data["SrcSent"] = data["SrcSent"].apply(src_sent_tokenizer.convert_tokens_to_ids)

In [207]:
Vs = src_sent_tokenizer.get_vocab()

In [208]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, हो, गई, है, ।]"
1,"[4159, 23, 14018, 19, 460, 230, 5]","[म्यूरियल, अब, बीस, साल, की, है, ।]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[मैं, इस, दुनिया, में, शिक्षा, पर, बहुत, निराश..."
3,"[466, 751, 31, 17, 1837, 5]","[वैसा, नहीं, होगा, ।]"
4,"[27, 3041, 25, 5]","[मुझें, तुम्हारी, याद, आ, रही, है, ।]"


In [209]:
hindi_vocab = set()

for tokenized_hindi_sent in data["DstSent"]:
    hindi_vocab.update(tokenized_hindi_sent)

In [210]:
Vd = dict()
for idx, token in enumerate(hindi_vocab):
    Vd[token] = idx + 3
Vd["<PAD>"] = 0
Vd["<SOS>"] = 1
Vd["<EOS>"] = 2

In [211]:
def convert_hindi_tokens_to_ids(hindi_sent):
    return [Vd[token] for token in hindi_sent]

In [212]:
data["DstSent"] = data["DstSent"].apply(lambda x: convert_hindi_tokens_to_ids(x))

In [213]:
data.head()

Unnamed: 0,SrcSent,DstSent
0,"[4159, 23, 14018, 19, 460, 230, 5]","[2208, 5425, 1788, 1434, 5388, 5105, 6979, 469..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[2208, 5425, 1788, 1434, 5388, 4695, 2623]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[3278, 6441, 1266, 5440, 3715, 5312, 2528, 258..."
3,"[466, 751, 31, 17, 1837, 5]","[3553, 881, 2146, 2623]"
4,"[27, 3041, 25, 5]","[5180, 649, 6342, 2385, 1164, 4695, 2623]"


In [214]:
def insert_sos_token_id(hindi_sent_token_ids):
    return [1] + hindi_sent_token_ids

In [215]:
data["DstSentInput"] = data["DstSent"].apply(lambda x: insert_sos_token_id(x))

In [216]:
def insert_eos_token_id(hindi_sent_token_ids):
    return hindi_sent_token_ids + [2]

In [217]:
data["DstSentLabel"] = data["DstSent"].apply(lambda x: insert_eos_token_id(x))

In [218]:
data.head()

Unnamed: 0,SrcSent,DstSent,DstSentInput,DstSentLabel
0,"[4159, 23, 14018, 19, 460, 230, 5]","[2208, 5425, 1788, 1434, 5388, 5105, 6979, 469...","[1, 2208, 5425, 1788, 1434, 5388, 5105, 6979, ...","[2208, 5425, 1788, 1434, 5388, 5105, 6979, 469..."
1,"[4159, 23, 14018, 19, 460, 230, 5]","[2208, 5425, 1788, 1434, 5388, 4695, 2623]","[1, 2208, 5425, 1788, 1434, 5388, 4695, 2623]","[2208, 5425, 1788, 1434, 5388, 4695, 2623, 2]"
2,"[2855, 16, 48, 296, 26963, 7, 140, 5]","[3278, 6441, 1266, 5440, 3715, 5312, 2528, 258...","[1, 3278, 6441, 1266, 5440, 3715, 5312, 2528, ...","[3278, 6441, 1266, 5440, 3715, 5312, 2528, 258..."
3,"[466, 751, 31, 17, 1837, 5]","[3553, 881, 2146, 2623]","[1, 3553, 881, 2146, 2623]","[3553, 881, 2146, 2623, 2]"
4,"[27, 3041, 25, 5]","[5180, 649, 6342, 2385, 1164, 4695, 2623]","[1, 5180, 649, 6342, 2385, 1164, 4695, 2623]","[5180, 649, 6342, 2385, 1164, 4695, 2623, 2]"


In [219]:
data.drop(labels=[data.columns[1]],axis=1,inplace=True)

In [220]:
X = list(data["SrcSent"])
Y_input = list(data["DstSentInput"])
Y_label = list(data["DstSentLabel"])

In [221]:
X_tensor = [torch.tensor(tokenized_eng_sent_ids) for tokenized_eng_sent_ids in X]
Y_input_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_input]
Y_label_tensor = [torch.tensor(tokenized_hin_sent_ids) for tokenized_hin_sent_ids in Y_label]

In [222]:
X_padded = torch.nn.utils.rnn.pad_sequence(X_tensor,batch_first=True)
Y_padded_input = torch.nn.utils.rnn.pad_sequence(Y_input_tensor,batch_first=True)
Y_padded_label = torch.nn.utils.rnn.pad_sequence(Y_label_tensor,batch_first=True)

In [223]:
Ns = X_padded.shape[1]
Nd = Y_padded_label.shape[1]

In [224]:
class Encoder(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,word_embedding_dim):
        super(Encoder,self).__init__()
        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=src_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)

    def forward(self,X_padded_mini_batch):

        first_embedding_layer_out = self.first_embedding_layer(X_padded_mini_batch)
        encoder_output, (final_encoder_output,final_cell_state) = self.second_lstm_layer(first_embedding_layer_out)

        return encoder_output, (final_encoder_output,final_cell_state)

In [225]:
class Decoder(torch.nn.Module):

    def __init__(self,dst_lang_vocab_size,word_embedding_dim):
        super(Decoder,self).__init__()

        self.first_embedding_layer = torch.nn.Embedding(num_embeddings=dst_lang_vocab_size,
                                                       embedding_dim=word_embedding_dim)
        self.second_lstm_layer = torch.nn.LSTM(input_size=word_embedding_dim,
                                               hidden_size=word_embedding_dim,
                                              batch_first=True)
        self.prediction_layer = torch.nn.Linear(in_features=word_embedding_dim,out_features=dst_lang_vocab_size)
        #self.prediction_layer_activation = torch.nn.Softmax(dim=2)

    def forward(self,Y_padded_input_mini_batch,final_encoder_output,final_cell_state):

        first_embedding_layer_out = self.first_embedding_layer(Y_padded_input_mini_batch)
        decoder_lstm_layer_out, (final_decoder_lstm_layer_out, final_cell_state) = self.second_lstm_layer(first_embedding_layer_out,
                                                                                                         (final_encoder_output,
                                                                                                          final_cell_state))
        prediction = self.prediction_layer(decoder_lstm_layer_out)
        
        return prediction, (final_decoder_lstm_layer_out, final_cell_state)

In [226]:
class Seq2SeqEncDec(torch.nn.Module):

    def __init__(self,src_lang_vocab_size,dst_lang_vocab_size,word_embedding_dim):
        super(Seq2SeqEncDec,self).__init__()

        self.encoder = Encoder(src_lang_vocab_size,word_embedding_dim)
        self.decoder = Decoder(dst_lang_vocab_size,word_embedding_dim)

    def forward(self,X_padded_mini_batch,Y_padded_input_mini_batch):

        encoder_output, (final_encoder_output,final_cell_state) = self.encoder(X_padded_mini_batch)
        y_hat_mini_batch = self.decoder(Y_padded_input_mini_batch,
                                        final_encoder_output,final_cell_state)

        return y_hat_mini_batch

In [227]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [228]:
X_padded_train = X_padded[0:13000]
Y_padded_input_train = Y_padded_input[0:13000]
Y_padded_label_train = Y_padded_label[0:13000]

X_padded_test = X_padded[13000:]
Y_padded_input_test = Y_padded_input[13000:]
Y_padded_label_test = Y_padded_label[13000:]

In [229]:
network = Seq2SeqEncDec(len(Vs),len(Vd),128).to(device)

In [None]:
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(network.parameters())
num_epochs = 250
mb_size = 65

for epoch in range(num_epochs):
    for i in range(X_padded_train.shape[0]//mb_size):

        X_train_mb = X_padded_train[i*mb_size:(i+1)*mb_size]
        Y_input_mb = Y_padded_input_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_padded_label_train[i*mb_size:(i+1)*mb_size]
        Y_label_mb = Y_label_mb.reshape(Y_label_mb.shape[0]*Y_label_mb.shape[1],)
        
        X_train_mb, Y_input_mb, Y_label_mb = X_train_mb.to(device), Y_input_mb.to(device), Y_label_mb.to(device)

        y_hat_train_mb = network(X_train_mb,Y_input_mb)
        y_hat_train_mb = y_hat_train_mb[0]
        y_hat_train_mb = y_hat_train_mb.reshape(y_hat_train_mb.shape[0]*y_hat_train_mb.shape[1],
                                                y_hat_train_mb.shape[2])

        loss_fn_value = loss_fn(y_hat_train_mb,Y_label_mb)

        loss_fn_value.backward()
        #torch.nn.utils.clip_grad_norm_(network.parameters(),max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        print("Epoch # {}, Time Step # {}, Loss Value = {}".format(epoch,i,loss_fn_value))

Epoch # 0, Time Step # 0, Loss Value = 8.861414909362793
Epoch # 0, Time Step # 1, Loss Value = 8.831487655639648
Epoch # 0, Time Step # 2, Loss Value = 8.808573722839355
Epoch # 0, Time Step # 3, Loss Value = 8.779844284057617
Epoch # 0, Time Step # 4, Loss Value = 8.762063980102539
Epoch # 0, Time Step # 5, Loss Value = 8.725783348083496
Epoch # 0, Time Step # 6, Loss Value = 8.695900917053223
Epoch # 0, Time Step # 7, Loss Value = 8.647950172424316
Epoch # 0, Time Step # 8, Loss Value = 8.642840385437012
Epoch # 0, Time Step # 9, Loss Value = 8.589899063110352
Epoch # 0, Time Step # 10, Loss Value = 8.513964653015137
Epoch # 0, Time Step # 11, Loss Value = 8.475321769714355
Epoch # 0, Time Step # 12, Loss Value = 8.330689430236816
Epoch # 0, Time Step # 13, Loss Value = 8.341524124145508
Epoch # 0, Time Step # 14, Loss Value = 8.156599044799805
Epoch # 0, Time Step # 15, Loss Value = 8.084738731384277
Epoch # 0, Time Step # 16, Loss Value = 7.883793830871582
Epoch # 0, Time Step # 1

In [239]:
torch.save(network.state_dict(),"model.pth")
print("model path downloaded")

model path downloaded
