In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od
import os
import pandas as pd
from tqdm import tqdm
import re
import string
# Fetch the Hindi-English parallel corpus from Kaggle with opendatasets.
# The following cell reads the CSV from ./hindi-english-parallel-corpus/,
# presumably the folder this call creates — TODO confirm.
od.download('https://www.kaggle.com/datasets/vaibhavkumar11/hindi-english-parallel-corpus')

In [None]:
import pandas as pd
# Load the raw corpus and keep only rows where both sentences are present.
df = pd.read_csv('hindi-english-parallel-corpus/hindi_english_parallel.csv')
# drop=True: renumber rows without keeping the old index as an extra column.
df = df.dropna().reset_index(drop=True)
MAXLEN=10
MINLEN = 0

# Whitespace-token count of every sentence, computed column-wise instead of
# indexing the frame once per row.
hindi_len = df['hindi'].astype(str).str.split().str.len()
english_len = df['english'].astype(str).str.split().str.len()

# Keep a pair only when BOTH sides have strictly more than MINLEN and strictly
# fewer than MAXLEN tokens.
keep = (
    (hindi_len > MINLEN) & (hindi_len < MAXLEN)
    & (english_len > MINLEN) & (english_len < MAXLEN)
)
hindi = df.loc[keep, 'hindi'].tolist()
english = df.loc[keep, 'english'].tolist()

# The raw frame is large and no longer needed once the two lists exist.
del df, hindi_len, english_len, keep
def clean_sentence(sentence):
    """Normalise one sentence for vocabulary building.

    The value is converted to ``str``, lower-cased, stripped of every
    character in ``string.punctuation`` (ASCII punctuation only; other
    scripts' punctuation is left untouched) and finally trimmed of
    leading/trailing whitespace.

    Args:
        sentence: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned sentence as a ``str`` (possibly empty).
    """
    sentence = str(sentence).lower()
    # Delete punctuation with a translation table. The earlier regex
    # '[' + string.punctuation + ']' contained the sequence '\]', which the
    # regex parser treats as an escaped ']', so backslashes were never removed.
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    # Trim last, so whitespace that was adjacent to removed punctuation
    # (e.g. "hi !") does not survive at the edges.
    return sentence.strip()
# Normalise both sides of every pair.
hindi = list(map(clean_sentence,hindi))
english = list(map(clean_sentence,english))

# Drop pairs where either side has no tokens left after cleaning (e.g. the
# sentence was only punctuation). Such cells are written as empty CSV fields,
# read back as NaN by pandas' default settings, and would later be turned into
# the literal word "nan" when the pairs are stringified.
kept_pairs = [
    (eng_sent, hin_sent)
    for eng_sent, hin_sent in zip(english, hindi)
    if eng_sent.split() and hin_sent.split()
]
english = [eng_sent for eng_sent, hin_sent in kept_pairs]
hindi = [hin_sent for eng_sent, hin_sent in kept_pairs]

cleaned_data = {
    'english':english,
    'hindi':hindi
}
df  = pd.DataFrame.from_dict(cleaned_data)
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs('data', exist_ok=True)
df.to_csv('data/cleaned_english_hindi_corpus.csv')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset , DataLoader
import re
import os
import string
import pandas as pd
from tqdm import tqdm
import time

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Reload the cleaned corpus written by the preprocessing cell.
# keep_default_na=False stops pandas from parsing empty cells or sentences such
# as "null" / "nan" as float NaN, which would otherwise be stringified to the
# word "nan" when the sentence pairs are built.
df = pd.read_csv('data/cleaned_english_hindi_corpus.csv', keep_default_na=False)

In [None]:
# Number of sentence pairs in the cleaned corpus.
len(df)

In [None]:
# Reserved vocabulary ids: start-of-sentence and end-of-sentence markers.
SOS_TOKEN =  0
EOS_TOKEN = 1

class Lang:
    """Vocabulary for one language: word <-> index maps plus word counts.

    Indices 0 and 1 are reserved for the SOS and EOS markers, so real words
    are numbered from 2 upwards in order of first appearance.

    Attributes:
        name: Label of the language (free-form string).
        index2word: Maps an index to its word; the reserved ids map to the
            strings '<SOS>' and '<EOS>' so every value is a ``str``.
        word2index: Maps a word to its index (reserved markers not included).
        word2count: Number of times each word was added.
        n_words: Vocabulary size including the two reserved ids.
    """
    def __init__(self,name) -> None:
        self.name = name
        # Reserved ids map to printable marker strings (previously they mapped
        # to the integer ids themselves, mixing ints into decoded word lists).
        self.index2word = {SOS_TOKEN:'<SOS>',EOS_TOKEN:'<EOS>'}
        self.word2index = {}
        self.word2count = {}
        self.n_words = 2

    def addWord(self,word):
        """Register ``word``: assign a new index on first sight, else bump its count."""
        if word not in self.word2index:
            self.word2index[word]=self.n_words
            self.word2count[word]=1
            self.index2word[self.n_words]=word
            self.n_words+=1
        else:
            # Repeated words were previously left at a count of 1.
            self.word2count[word]+=1

    def addSentense(self,sentence):
        """Add every whitespace-separated token of ``sentence`` to the vocabulary."""
        for word in sentence.split():
            self.addWord(word)

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    addSentence = addSentense

# Empty vocabularies: English is the source (input) side, Hindi the target (output) side.
input_lang = Lang('english')
output_lang = Lang('hindi')

def make_pairs(input_lang,output_lang):
    """Fill both vocabularies from the global ``df`` and collect sentence pairs.

    Reads the 'english' and 'hindi' columns of the module-level ``df``,
    stringifies each cell, registers its words in the matching ``Lang`` and
    records the pair.

    Args:
        input_lang: Vocabulary that receives the English words (mutated in place).
        output_lang: Vocabulary that receives the Hindi words (mutated in place).

    Returns:
        ``(input_lang, output_lang, english_column, hindi_column, pairs)`` where
        the two columns are the original (un-stringified) ``df`` columns and
        ``pairs`` is a list of ``[english_str, hindi_str]`` lists.
    """
    english_col = df['english']
    hindi_col = df['hindi']
    pairs = []
    # Walk both columns in lockstep; `total` keeps the progress bar sized.
    for eng_raw, hin_raw in tqdm(zip(english_col, hindi_col), total=len(df)):
        eng_sent, hin_sent = str(eng_raw), str(hin_raw)
        input_lang.addSentense(eng_sent)
        output_lang.addSentense(hin_sent)
        pairs.append([eng_sent, hin_sent])
    return input_lang, output_lang, english_col, hindi_col, pairs


# Build both vocabularies; `eng` / `hin` are the DataFrame columns and `pairs`
# is the list of [english, hindi] string pairs.
input_lang , output_lang ,eng,hin, pairs = make_pairs(input_lang,output_lang)


In [None]:
# Persist both vocabularies in one file so they can be reloaded together.
# torch.save pickles the Lang objects, so the Lang class must be defined
# wherever 'lang.pth' is loaded.
langs = dict(input_lang=input_lang, output_lang=output_lang)
torch.save(langs, 'lang.pth')

In [None]:
# Vocabulary sizes (English, Hindi), each including the two reserved SOS/EOS ids.
input_lang.n_words , output_lang.n_words

In [None]:
# Fixed number of token slots per sentence (words + EOS, zero-padded on the right).
MAXLEN=10

class LangData(Dataset):
    """Dataset of (english_ids, hindi_ids) tensors, each of width ``MAXLEN``.

    Every sentence is converted to word indices, terminated with ``EOS_TOKEN``
    and right-padded with zeros. Sentences with more than ``MAXLEN - 1`` words
    are truncated so that the EOS marker always fits.

    Args:
        eng: Sequence of English sentences (list, pandas Series, ...).
        hin: Sequence of Hindi sentences, same length as ``eng``.
        input_lang: Vocabulary used for ``eng``; must expose ``word2index``.
        output_lang: Vocabulary used for ``hin``; must expose ``word2index``.

    Raises:
        ValueError: If ``eng`` and ``hin`` differ in length.
        KeyError: If a sentence contains a word missing from its vocabulary.
    """
    def __init__(self,eng,hin,input_lang,output_lang):
        self.eng = eng
        self.hin = hin
        n = len(self.eng)
        if len(self.hin) != n:
            raise ValueError(f'eng has {n} sentences but hin has {len(self.hin)}')
        # int64 so the arrays convert to LongTensors without an extra cast.
        eng_numpy = np.zeros((n,MAXLEN),dtype=np.int64)
        hin_numpy = np.zeros((n,MAXLEN),dtype=np.int64)
        # Iterate by position: label-based `eng[idx]` on a pandas slice only
        # works when the slice happens to start at label 0.
        for idx,(inp,tgt) in enumerate(tqdm(zip(eng,hin),total=n)):
            inp_id = self._encode(input_lang,inp)
            tgt_id = self._encode(output_lang,tgt)
            eng_numpy[idx,:len(inp_id)]=inp_id
            hin_numpy[idx,:len(tgt_id)]=tgt_id
        self.eng_tensor = torch.from_numpy(eng_numpy)
        self.hin_tensor = torch.from_numpy(hin_numpy)

    def _encode(self,lang,sent):
        """Indices of ``sent`` cut to ``MAXLEN - 1`` words, followed by EOS."""
        ids = self.indexFromSentences(lang,sent)[:MAXLEN-1]
        ids.append(EOS_TOKEN)
        return ids

    def indexFromSentences(self,lang , sent):
        """Map each whitespace-separated word of ``sent`` to its index in ``lang``."""
        return  [lang.word2index[word] for word in str(sent).split()]

    def __getitem__(self, index):
        """Return the ``(english_ids, hindi_ids)`` tensor pair at ``index``."""
        return self.eng_tensor[index],self.hin_tensor[index]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.eng)


In [None]:
# Remove the `df` name; `eng` and `hin` (returned by make_pairs) still refer to its columns.
del df

In [None]:
batch_size=64
# Train on only the first `taking` sentence pairs.
taking = 2000
langdataset = LangData(eng[:taking],hin[:taking],input_lang,output_lang)
# Sanity check of the dataset size (this line was previously duplicated).
print(len(langdataset))
dataloader = DataLoader(langdataset,batch_size=batch_size,shuffle=True)

In [None]:
# Peek at a single batch to check tensor contents and shapes, then stop.
for data in dataloader:
    a, b = data  # a: English id batch, b: Hindi id batch
    print(a, b)
    print('shape = [batchsize , maxlength]')
    print(f'input_shape {a.shape} , output_shape {b.shape}')
    break  # only the first batch is needed

In [None]:
class Encoder(nn.Module):
    """Embeds source token ids and runs them through a single-layer GRU.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        hidden_dim: Width of both the embedding and the GRU hidden state.
        p: Dropout probability applied to the embeddings.
    """
    def __init__(self, input_dim ,hidden_dim,p=0.5) -> None:
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_dim,hidden_dim)
        # The attribute is named `lstm` but holds a GRU; the name is part of
        # the state_dict keys, so it stays as it is.
        self.lstm = nn.GRU(
            input_size=hidden_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True,
        )

    def forward(self,x):
        """Encode a batch of token ids.

        Returns:
            ``(output, hidden)`` exactly as produced by the GRU: the per-step
            outputs and the final hidden state.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        return self.lstm(embedded)
class AttentionLayer(nn.Module):
    """Additive attention: score = l3(tanh(l1(encoder_output) + l2(prev_hidden))).

    Args:
        hidden_dim: Feature width of both the encoder outputs and the hidden state.
    """
    def __init__(self, hidden_dim) -> None:
        super().__init__()
        self.l1 = nn.Linear(hidden_dim,hidden_dim)
        self.l2 = nn.Linear(hidden_dim,hidden_dim)
        self.l3 = nn.Linear(hidden_dim,1)

    def forward(self,encoder_output , prev_hidden):
        """Compute the attention-weighted context over ``encoder_output``.

        Args:
            encoder_output: 3-D tensor; scores are normalised over its dim 1.
            prev_hidden: Tensor that must broadcast against ``l1(encoder_output)``
                (the decoder passes its hidden state with batch first).

        Returns:
            ``(context, weights)`` where ``weights`` is the softmax-normalised
            score tensor with its last two dims swapped and ``context`` is
            ``bmm(weights, encoder_output)``.
        """
        projected_keys = self.l1(encoder_output)
        projected_query = self.l2(prev_hidden)
        energy = torch.tanh(projected_keys + projected_query)
        score = self.l3(energy)
        # Normalise over dim 1, then move that axis last so bmm can contract it.
        weights = torch.softmax(score, dim=1).permute(0, 2, 1)
        context = torch.bmm(weights, encoder_output)
        return context, weights
class Decoder(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        output_dim: Size of the target vocabulary.
        hidden_dim: Width of the embedding and of the GRU hidden state.
        p: Dropout probability applied to the token embeddings.
    """
    def __init__(self, output_dim , hidden_dim , p=0.5) -> None:
        super().__init__()
        self.embedding = nn.Embedding(output_dim,hidden_dim)
        self.attention = AttentionLayer(hidden_dim)
        # GRU input is [token embedding ; attention context], hence 2*hidden_dim.
        # The attribute is named `lstm` but holds a GRU; the name is part of
        # the state_dict keys, so it stays as it is.
        self.lstm = nn.GRU(2*hidden_dim,hidden_dim,batch_first = True)
        self.out = nn.Linear(hidden_dim,output_dim)
        self.dropout = nn.Dropout(p)

    def forward(self,encoder_output , encoder_hidden,target=None,teacher_forcing_ratio=0.5):
        """Decode a whole sequence, one token per step.

        Args:
            encoder_output: Encoder outputs, batch first.
            encoder_hidden: Final encoder hidden state; used as the initial
                decoder hidden state.
            target: Optional ``(batch, steps)`` tensor of target ids. When
                given it fixes the number of steps and enables teacher
                forcing; when ``None`` the decoder runs ``MAXLEN`` steps on
                its own predictions.
            teacher_forcing_ratio: Probability that this call feeds the
                ground-truth tokens (the choice is made once per call).

        Returns:
            ``(log_probs, decoder_hidden)`` where ``log_probs`` holds the
            log-softmax scores of every step concatenated along dim 1.
        """
        batch_size = encoder_output.shape[0]
        max_len = target.shape[1] if target is not None else MAXLEN
        # Start every sequence with SOS, created directly on the device of the
        # encoder output instead of relying on the global `device` string.
        decoder_input = torch.full(
            (batch_size, 1), SOS_TOKEN, dtype=torch.long, device=encoder_output.device
        )
        decoder_hidden = encoder_hidden
        # The random draw comes first so it is made even when target is None.
        use_teacher_forcing = torch.rand(1).item() < teacher_forcing_ratio and target is not None

        outputs = []
        for step in range(max_len):
            logits, decoder_hidden, _ = self.model(decoder_input, decoder_hidden, encoder_output)
            outputs.append(logits)
            if use_teacher_forcing:
                # Feed the ground-truth token of this step as the next input.
                decoder_input = target[:, step].unsqueeze(1)
            else:
                # Feed the model's own best guess; detach so no history is kept.
                decoder_input = logits.argmax(-1).detach()

        outputs = torch.cat(outputs,dim=1)
        outputs = F.log_softmax(outputs,dim=2)
        return outputs,decoder_hidden

    def model(self,input,hidden,encoder_outputs):
        """Run one decoding step.

        Returns:
            ``(prediction, hidden, weights)``: vocabulary scores for this step,
            the updated GRU hidden state and the attention weights.
        """
        embedding = self.dropout(self.embedding(input))
        # The attention layer receives the hidden state with the batch axis first.
        attention_hidden = hidden.permute(1,0,2)
        context,weights = self.attention(encoder_outputs,attention_hidden)
        input_gru = torch.cat((embedding,context),dim=2)
        output , hidden = self.lstm(input_gru,hidden)
        prediction = self.out(output)
        return prediction , hidden , weights

In [None]:
# Shared hyper-parameters for both halves of the model.
HIDDEN_DIM = 256
LEARNING_RATE = 0.001

# Encoder is sized by the English vocabulary, decoder by the Hindi vocabulary.
encoder = Encoder(input_dim=input_lang.n_words, hidden_dim=HIDDEN_DIM).to(device)
decoder = Decoder(output_dim=output_lang.n_words, hidden_dim=HIDDEN_DIM).to(device)

# One Adam optimizer per module.
enc_optim = torch.optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = torch.optim.Adam(decoder.parameters(), lr=LEARNING_RATE)

# The decoder emits log-probabilities, which is what NLLLoss expects.
# (The misspelled name is what the training loop refers to.)
criteriion = nn.NLLLoss()

In [None]:
# Resume from an existing checkpoint if there is one; otherwise start a fresh
# checkpoint dict from the current (untrained) state.
if os.path.exists('checkpoint/checkpoint.pth'):
    checkpoint = torch.load('checkpoint/checkpoint.pth',map_location=torch.device(device))
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    enc_optim.load_state_dict(checkpoint['encoder_optimizer'])
    dec_optim.load_state_dict(checkpoint['decoder_optimizer'])
else:
    # exist_ok=True: the directory may already exist without a checkpoint file
    # in it (e.g. this cell was run before any epoch finished); a plain
    # os.mkdir would raise FileExistsError in that case.
    os.makedirs('checkpoint', exist_ok=True)
    checkpoint = {
        'encoder':encoder.state_dict(),
        'decoder':decoder.state_dict(),
        'encoder_optimizer':enc_optim.state_dict(),
        'decoder_optimizer':dec_optim.state_dict()
    }



In [None]:
# NOTE(review): this always raises ZeroDivisionError. Presumably a deliberate
# stop so that "Run All" halts before the training loop below — confirm, and
# consider replacing it with an explicit flag.
1/0

In [None]:
# Train for 30 epochs, saving a checkpoint after every epoch.
for epoch in range(30):
    print(f'Epoch {epoch}')
    st = time.time()
    e_loss = 0
    for data in dataloader:
        # Move the (english, hindi) id batch to the training device.
        input_tensor, target_tensor = (tensor.to(device) for tensor in data)

        encoder_output , encoder_hidden = encoder(input_tensor)
        outputs , _ = decoder(encoder_output,encoder_hidden,target_tensor)

        # Flatten (batch, steps, vocab) -> (batch*steps, vocab) for NLLLoss.
        vocab_size = outputs.size(-1)
        loss = criteriion(outputs.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()

        # Update both modules, then clear their gradients for the next batch.
        for optimizer in (enc_optim, dec_optim):
            optimizer.step()
        for optimizer in (enc_optim, dec_optim):
            optimizer.zero_grad()

        e_loss += loss.item()

    # Refresh the checkpoint dict in place and write it out every epoch.
    checkpoint.update(
        encoder=encoder.state_dict(),
        decoder=decoder.state_dict(),
        encoder_optimizer=enc_optim.state_dict(),
        decoder_optimizer=dec_optim.state_dict(),
    )
    torch.save(checkpoint,'checkpoint/checkpoint.pth')
    print('Checkpoint_saved')

    # Epoch summary: mean batch loss and wall-clock time.
    print('-'*30)
    print(f'loss {e_loss/len(dataloader):.3f}')
    print(f'time taken {(time.time()-st)}')


In [None]:
def indexFromSentence(lang , sent):
    """Return the vocabulary index of every whitespace-separated word in ``sent``.

    ``sent`` is passed through ``str()`` first. A word that is missing from
    ``lang.word2index`` raises ``KeyError``.
    """
    indexes = []
    for word in str(sent).split():
        indexes.append(lang.word2index[word])
    return indexes
def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a ``(1, n_words + 1)`` long tensor ending in EOS.

    The tensor is created on the module-level ``device``.
    """
    token_ids = indexFromSentence(lang, sentence) + [EOS_TOKEN]
    row = torch.tensor(token_ids, dtype=torch.long, device=device)
    # Add a leading batch dimension of size 1.
    return row.view(1, -1)
def anuvadkaro(sent):
    """Translate one English sentence into a list of Hindi words.

    ("anuvad karo" is Hindi for "translate".) Uses the module-level
    ``encoder``, ``decoder``, ``input_lang`` and ``output_lang``.

    Args:
        sent: Source sentence; every word must already be in ``input_lang``
            (no cleaning or lower-casing is applied here).

    Returns:
        The decoded words up to, but not including, the first EOS. If the
        decoder never emits EOS, all decoded words are returned.

    Raises:
        KeyError: If ``sent`` contains a word missing from ``input_lang``.
    """
    # Inference must run with dropout disabled. Remember the current modes so
    # that calling this function in the middle of training does not silently
    # leave the models in eval mode.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            inp = tensorFromSentence(input_lang,sent)
            enc_out , enc_hidden = encoder(inp)
            dec_out , _ = decoder(enc_out , enc_hidden)
            # Best token per step; drop only the batch axis (size 1).
            decoded_ids = dec_out.argmax(-1).squeeze(0)
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    decoded_words = []
    for idx in decoded_ids.tolist():
        if idx == EOS_TOKEN:
            break
        decoded_words.append(output_lang.index2word[idx])
    # Previously '<EOS>' was appended and the last element sliced off, which
    # also removed a real word whenever no EOS had been produced.
    return decoded_words


In [None]:
# Spot-check a few phrases.
# Reference Hindi for the last phrase: बहुत अधिक चयनीय शिशु हैं
for sample in ('highlight duration', 'perform action', 'too many selectable children'):
    print(anuvadkaro(sample))