<a href="https://colab.research.google.com/github/M-Arvind/Hindi-to-English-translation-model/blob/main/Hindi_English_Machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# corpus file read further down is reachable on the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import re
import string

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Dropout
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.regularizers import l1, l2
from tensorflow.train import latest_checkpoint

In [None]:
# Load the Hindi-English parallel corpus CSV from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Hindi_English_Truncated_Corpus.csv', encoding='utf-8')

In [None]:
# Keep only the rows whose `source` column is exactly 'ted'.
is_ted = data['source'].eq('ted')
data = data.loc[is_ted]

In [None]:
# Draw a fixed-seed random subset of 10,000 rows to work with.
data = data.sample(n = 10000, random_state=42)

In [None]:
# Display the sampled frame (pandas truncates the rendering of long frames).
data

Unnamed: 0,source,english_sentence,hindi_sentence
117380,ted,So another thing the robot can do,एक और बात जो रोबोट कर सकते हैं
26491,ted,that makes it really easy for publishers right...,जो प्रकाशकों के लिए इस सामाग्री को बनाना आसान ...
112352,ted,"from that teacher, Mrs. Posten","उसी शिक्षिका का, श्रीमती पोस्टन(Mrs. Posten)"
97358,ted,which no child would play inside the classroom...,जिसे कोई बच्चा कक्षा में या घर पर नहीं खेलेगा.
71183,ted,Do you have any recommendations?”,क्या आपकी नज़र में कोई है?”
...,...,...,...
65655,ted,A friend of mine did that - Richard Bollingbroke.,जिसे मेरे एक मित्र रिचर्ड बोलिंगब्रोक(Richard ...
72438,ted,or this year's floods.,और इस साल आये बाढ़ के बाद प्रकट किया |
11430,ted,struggle to get by.,मामूली चीजो केलिए संघर्ष कर रहे है.
16594,ted,"are giving some people a sense of, “Gosh, well...","कुछ लोगों को यह महसूस करवा रहे हैं ,“ हे भगवान..."


In [None]:
# Ordered (pattern, replacement) rules applied by clean_text.
# Order matters: "won't"/"can't" must be rewritten before the generic "n't" rule.
# All patterns are case-insensitive because clean_text runs before the
# sentences are lower-cased; capture groups keep the original letter case.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"\b(i)'m\b", r"\1 am"),
        (r"\b(he|she|it|that|what|where|how)'s\b", r"\1 is"),
        (r"'ll\b", " will"),
        (r"'ve\b", " have"),
        (r"'re\b", " are"),
        (r"'d\b", " would"),
        (r"\b(w)on't\b", r"\1ill not"),
        (r"\b(c)an't\b", r"\1annot"),
        (r"n't\b", " not"),
        # Dropped-g forms such as "goin'" -> "going". The lookahead stops the
        # rule from firing on possessives ("martin's"), and the lookbehind
        # stops it from firing on a standalone quoted "in'".
        (r"(?<=\w)in'(?!\w)", "ing"),
        (r"'bout\b", "about"),
        (r"'til\b", "until"),
    )
)


def clean_text(text):
    """Expand common English contractions in ``text``.

    The input is first converted with ``str()``, so non-string values
    (numbers, ``None``, NaN) are returned as their string representation.

    Parameters
    ----------
    text : object
        Sentence (or any value) to normalise.

    Returns
    -------
    str
        The text with contractions such as "i'm", "it's", "won't", "don't",
        "they're" and "goin'" written out in full. Text that matches no rule
        is returned unchanged.
    """
    text = str(text)
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
# Run clean_text over every sentence of both language columns.
data['english_sentence'] = data['english_sentence'].map(clean_text)
data['hindi_sentence'] = data['hindi_sentence'].map(clean_text)

In [None]:
# Lower-case each sentence and trim leading/trailing whitespace.
data['english_sentence'] = data['english_sentence'].str.lower().str.strip()
data['hindi_sentence'] = data['hindi_sentence'].str.lower().str.strip()

In [None]:
# Characters deleted from each language column. The Hindi list additionally
# removes Devanagari digits and lower-case Latin letters.
english_filters = '।!“”1234567890"#$%&\'()*+,-./:;=?@[\\]^<>`{|}~' 
hindi_filters = '।!“”२३०८१५७९४६1234567890abcdefghijklmnopqrstuvwxyz"#$%&\'()*+,-./:;=?@[\\]^<>`{|}~' 
# A translation table that maps every filtered character to None deletes it.
data['english_sentence'] = data['english_sentence'].str.translate(str.maketrans('', '', english_filters))
data['hindi_sentence'] = data['hindi_sentence'].str.translate(str.maketrans('', '', hindi_filters))

In [None]:
# Wrap every target sentence in the START_ / _END marker tokens.
data['hindi_sentence'] = 'START_ ' + data['hindi_sentence'] + ' _END'

In [None]:
# Collect the distinct whitespace-separated tokens of each language.
all_eng_words = {
    word
    for sentence in data['english_sentence']
    for word in sentence.split()
}

all_hindi_words = {
    word
    for sentence in data['hindi_sentence']
    for word in sentence.split()
}

In [None]:
# Token count (whitespace split) of every sentence, stored per language.
data['length_eng_sentence'] = data['english_sentence'].str.split().str.len()
data['length_hin_sentence'] = data['hindi_sentence'].str.split().str.len()

In [None]:
# Print the token count of every English sentence longer than 50 tokens.
for x in data.loc[data['length_eng_sentence'] > 50, 'length_eng_sentence']:
  print(x)

In [None]:
# Number of distinct English tokens collected above.
len(all_eng_words)

8238

In [None]:
# Longest source / target sentence in tokens; generate_batch uses these as
# the padded width of its encoder and decoder arrays.
max_length_src = data['length_eng_sentence'].max()
max_length_tar = data['length_hin_sentence'].max()

In [None]:
# Longest target (Hindi) sentence length in tokens, including START_/_END.
max_length_tar

32

In [None]:
# Sorted vocabularies give every token a stable position for indexing.
input_words = sorted(all_eng_words)
target_words = sorted(all_hindi_words)
num_encoder_tokens = len(input_words)
num_decoder_tokens = len(target_words)
(num_encoder_tokens, num_decoder_tokens)

(8238, 9788)

In [None]:
# Reserve one extra id: token indices start at 1, and index 0 is left for
# padding (the Embedding layers are built with mask_zero=True).
# Derived from the vocabularies rather than incremented in place, so
# re-running this cell cannot inflate the counts.
num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_hindi_words) + 1

In [None]:
# Map each token to a 1-based integer id following the sorted vocabulary order.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

In [None]:
# Inverse lookups: integer id -> token (word-level, despite the "char" in the names).
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [None]:
# Shuffle the rows with a fixed seed. An unseeded shuffle here would change
# the row order on every run and make the seeded train/test split that
# follows non-reproducible.
data = shuffle(data, random_state=42)

In [None]:
# English sentences are the model input, Hindi sentences the target.
X = data['english_sentence']
y = data['hindi_sentence']
# Fixed-seed 50/50 split into train and test halves.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)
(X_train.shape, X_test.shape)

((5000,), (5000,))

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 32):
    """Endlessly yield ``([encoder_input, decoder_input], decoder_target)`` batches.

    Parameters
    ----------
    X : sliceable sequence of str
        Source sentences; each is split on whitespace and looked up in
        ``input_token_index``.
    y : sliceable sequence of str
        Target sentences aligned with ``X``; tokens are looked up in
        ``target_token_index``.
    batch_size : int
        Maximum number of sentence pairs per yielded batch.

    Yields
    ------
    tuple
        ``([encoder_input_data, decoder_input_data], decoder_target_data)`` where
        * ``encoder_input_data`` is ``(n, max_length_src)`` token ids, zero padded;
        * ``decoder_input_data`` is ``(n, max_length_tar)`` token ids of the
          target without its last token, zero padded;
        * ``decoder_target_data`` is ``(n, max_length_tar, num_decoder_tokens)``
          one-hot rows of the target without its first token, i.e. the decoder
          input shifted one step ahead.
        ``n`` equals ``batch_size`` except for the final slice of a pass, which
        holds only the remaining samples.

    Raises
    ------
    ValueError
        If ``X`` is empty (the generator would otherwise loop forever
        without yielding).
    KeyError
        If a token is missing from the corresponding index dictionary.
    """
    if len(X) == 0:
        raise ValueError("generate_batch needs at least one sample")
    while True:
        for start in range(0, len(X), batch_size):
            batch_inputs = X[start:start + batch_size]
            batch_targets = y[start:start + batch_size]
            # Size the arrays to the samples actually present so the last,
            # shorter slice is not padded out with all-zero fake samples.
            n_samples = min(len(batch_inputs), len(batch_targets))
            encoder_input_data = np.zeros((n_samples, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((n_samples, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((n_samples, max_length_tar, num_decoder_tokens), dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]
                # Tokenise the target once instead of once per token.
                target_tokens = target_text.split()
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        # Decoder input: every token except the final one.
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # Decoder target: every token except the first,
                        # placed one timestep earlier.
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [None]:
# Dim: number of units of every LSTM layer in the model cell below.
Dim = 130
# latent_dim: output size of both Embedding layers in the model cell below
# (despite the name, it is not the LSTM state size).
latent_dim = 150

In [None]:
# Training configuration. The sample counts are divided by batch_size in the
# fit cell to obtain steps_per_epoch and validation_steps.
train_samples = len(X_train)
test_samples = len(X_test)
batch_size = 32
epochs = 30

In [None]:
# ---- Encoder: embedding followed by two stacked LSTMs ----------------------
encoder_inputs = Input(shape=(None,))
encoder_emd = Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
# With return_state=True this call returns the output sequence together with
# the layer's states, and that whole result is handed to the second LSTM.
# NOTE(review): presumably Keras treats the extra entries as the second
# layer's initial state - confirm against the installed Keras version.
encoder_lstm = LSTM(Dim,return_sequences=True, return_state=True)(encoder_emd)
encoder_lstm2 = LSTM(Dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm2(encoder_lstm)
# Only the final states of the top encoder layer are passed to the decoder.
encoder_states = [state_h, state_c]

# ---- Decoder: embedding + LSTM initialised with the encoder states ---------
decoder_inputs = Input(shape=(None, ))
decoder_emd = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)(decoder_inputs)
decoder_lstm = LSTM(Dim, return_sequences=True, return_state=True)
decoder_output, _, _ = decoder_lstm(decoder_emd, initial_state = encoder_states)

# Per-timestep softmax over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_output)

# The instance is still bound to the name `Model` because the following cells
# call Model.summary / Model.compile / Model.fit_generator / Model.save.
# The class is reached through `tf.keras.Model` so this cell no longer depends
# on `Model` still being the imported class and can be re-run after it has
# already rebound that name.
Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Stop training once the monitored metric has not improved for 10 epochs.
# NOTE(review): 'accuracy' is the training metric; the fit cell also supplies
# validation data, so a validation metric may have been intended - confirm.
callback = EarlyStopping(monitor='accuracy', verbose=1, patience= 10)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
Model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 150)    1235850     input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, None, 130),  146120      embedding_1[0][0]                
______________________________________________________________________________________________

In [None]:
# Compile with Adam and categorical cross-entropy, which matches the one-hot
# decoder targets produced by generate_batch.
# `learning_rate` replaces the deprecated `lr` keyword (see the warning this
# cell used to emit); the value is unchanged.
Model.compile(
    optimizer=Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, decay=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [None]:
# Train from the batch generators. `Model.fit` accepts Python generators
# directly and replaces the deprecated `fit_generator`.
# steps_per_epoch / validation_steps use floor division, so each epoch
# consumes only whole batches from the endlessly cycling generators.
history = Model.fit(
    generate_batch(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=train_samples // batch_size,
    epochs=epochs,
    validation_data=generate_batch(X_test, y_test, batch_size=batch_size),
    validation_steps=test_samples // batch_size,
    callbacks=[callback],  # Keras documents this argument as a list of callbacks
)



Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Save the trained model to Google Drive. The absolute path matches the mount
# point used by drive.mount and the corpus path, so the model lands on Drive
# regardless of the notebook's current working directory.
Model.save('/content/drive/MyDrive/saved_model/my_model')



INFO:tensorflow:Assets written to: drive/MyDrive/saved_model/my_model/assets


INFO:tensorflow:Assets written to: drive/MyDrive/saved_model/my_model/assets
