## Dataset Link:- http://www.manythings.org/anki/

# Unzipping the Dataset

In [1]:
!unzip mar-eng.zip

Archive:  mar-eng.zip
  inflating: mar.txt                 
  inflating: _about.txt              


# Importing Necessary Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
import pickle as pkl
import numpy as np

# Data Exploration & Extraction

In [2]:
# Load the whole tab-separated corpus into memory as one UTF-8 string.
with open('mar.txt', mode='r', encoding='utf8') as corpus_file:
    data = corpus_file.read()

In [3]:
# One list entry per newline-separated line of the raw file.
uncleaned_data_list = data.split('\n')

In [4]:
# Number of newline-separated entries found in the raw file.
len(uncleaned_data_list)

45234

In [5]:
# Split the raw dump into lines and drop blank ones (e.g. the empty string left by a
# trailing newline) instead of slicing to a hard-coded row count.
uncleaned_data_list = [line for line in data.split('\n') if line.strip()]

english_words = []
marathi_words = []

# Kept for backward compatibility with the original cell; not populated here.
cleaned_data_list = []

for line in uncleaned_data_list:
    # Split each line once. The first tab-separated field is the English sentence and
    # the second is the Marathi one; any further field is ignored (presumably
    # attribution text - TODO confirm against _about.txt).
    fields = line.split('\t')
    english_words.append(fields[0])
    marathi_words.append(fields[1])

# Sentence pairs side by side, also persisted as CSV for reuse outside the notebook.
language_data = pd.DataFrame({'English': english_words, 'Marathi': marathi_words})
language_data.to_csv('eng-mar.csv', index=False)

In [6]:
# Preview the first rows of the English/Marathi frame.
language_data.head()

Unnamed: 0,English,Marathi
0,Go.,जा.
1,Run!,पळ!
2,Run!,धाव!
3,Run!,पळा!
4,Run!,धावा!


In [8]:
# Pull each language column out of the DataFrame as an array of sentences.
english_texts = language_data['English'].values
marathi_texts = language_data['Marathi'].values

# Display both lengths side by side.
len(english_texts), len(marathi_texts)

(45233, 45233)

# Data Cleaning

In [12]:
# Lower-case every sentence
english_texts = list(map(str.lower, english_texts))
marathi_texts = list(map(str.lower, marathi_texts))

# Delete single quotes / apostrophes
english_texts = [re.sub("'", '', sentence) for sentence in english_texts]
marathi_texts = [re.sub("'", '', sentence) for sentence in marathi_texts]

# Helper that strips ASCII punctuation from a list of sentences
def remove_punc(text_list):
  """Return a new list with ``string.punctuation`` characters deleted from each sentence.

  Each sentence is split on single spaces, every piece is stripped of
  punctuation, and the pieces are joined back with single spaces.
  """
  strip_table = str.maketrans(dict.fromkeys(string.punctuation))

  def _clean(sentence):
    return ' '.join(piece.translate(strip_table) for piece in sentence.split(' '))

  return [_clean(sentence) for sentence in text_list]

english_texts = remove_punc(english_texts)
marathi_texts = remove_punc(marathi_texts)

# Translation table that deletes the ASCII digits
remove_digits = str.maketrans('', '', digits)

# Remove ASCII digits from every English sentence, token by token
removed_digits_text = [
  ' '.join(token.translate(remove_digits) for token in sent.split(' '))
  for sent in english_texts
]
english_texts = removed_digits_text

# Remove the Devanagari digits, then the zero-width joiner, from the Marathi sentences
marathi_texts = [
  re.sub("[\u200d]", "", re.sub("[२३०८१५७९४६]", "", sent))
  for sent in marathi_texts
]

# Trim leading and trailing whitespace
english_texts = list(map(str.strip, english_texts))
marathi_texts = list(map(str.strip, marathi_texts))

 # Adding < start > & < end > tokens to every Marathi sentence

In [11]:
# Wrap every Marathi sentence in the literal marker words "start" and "end"
marathi_texts = [f"start {sentence} end" for sentence in marathi_texts]
# Show the first sentence pair as a quick sanity check
marathi_texts[0], english_texts[0]

('start जा end', 'go')

# Train Test Split the Dataset

In [13]:
# English sentences are the inputs, Marathi sentences the targets
X, Y = english_texts, marathi_texts
# Hold out 10% of the sentence pairs; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=0,
)

In [15]:
# Sizes of the training and held-out splits.
len(X_train),len(X_test)

(40709, 4524)

## Maximum length of a sentence in train & test data

In [16]:
def Max_length(data):
  """Return the largest count of single-space-separated tokens in any sentence of ``data``."""
  return max(len(sentence.split(' ')) for sentence in data)
#Training data: longest sentence (in space-separated tokens) per language
max_length_english = Max_length(X_train)
max_length_marathi = Max_length(y_train)
#Test data: same measurement on the held-out split
max_length_english_test = Max_length(X_test)
max_length_marathi_test = Max_length(y_test)
# Display the training-set maxima as (Marathi, English)
max_length_marathi, max_length_english

(37, 35)

In [17]:
# Display the held-out-set maxima as (Marathi, English).
max_length_marathi_test,max_length_english_test

(25, 25)

# Tokenizing Data using Keras Tokenizer

In [18]:
# --- English (source) side: fit on the training sentences only ---
englishTokenizer = Tokenizer()
englishTokenizer.fit_on_texts(X_train)
Eword2index = englishTokenizer.word_index
# +1 because id 0 is never assigned to a word and is used as the padding value below
vocab_size_source = 1 + len(Eword2index)

# Convert both splits to id sequences and right-pad them to the training maximum
X_train, X_test = [
    pad_sequences(
        englishTokenizer.texts_to_sequences(split),
        maxlen=max_length_english,
        padding='post',
    )
    for split in (X_train, X_test)
]

# --- Marathi (target) side: same procedure ---
marathiTokenizer = Tokenizer()
marathiTokenizer.fit_on_texts(y_train)
Mword2index = marathiTokenizer.word_index
vocab_size_target = 1 + len(Mword2index)

y_train, y_test = [
    pad_sequences(
        marathiTokenizer.texts_to_sequences(split),
        maxlen=max_length_marathi,
        padding='post',
    )
    for split in (y_train, y_test)
]

vocab_size_source, vocab_size_target

(5643, 13587)

In [19]:
# Inspect the first padded (English, Marathi) id sequences.
X_train[0], y_train[0]

(array([ 86, 122,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32),
 array([  1,  39, 105,   3,   5,   2,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32))

# Saving the Data & Tokenizers

In [30]:
# Persist the padded splits and both tokenizers (with vocab size and word->id map)
for pkl_path, payload in (
    ('English - Marathi/NMT_data.pkl', [X_train, y_train, X_test, y_test]),
    ('English - Marathi/NMT_Etokenizer.pkl', [vocab_size_source, Eword2index, englishTokenizer]),
    ('English - Marathi/NMT_Mtokenizer.pkl', [vocab_size_target, Mword2index, marathiTokenizer]),
):
  with open(pkl_path, 'wb') as f:
    pkl.dump(payload, f)

# Make sure every split is a NumPy array before it is sliced for training
X_train, y_train, X_test, y_test = (
    np.array(split) for split in (X_train, y_train, X_test, y_test)
)

# Loading Data & Tokenizers

In [44]:
# Restore the English and Marathi tokenizers together with their vocabulary size
# and word->id dictionary.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('English - Marathi/NMT_Etokenizer.pkl','rb') as f:
  vocab_size_source,Eword2index,englishTokenizer = pkl.load(f)
with open('English - Marathi/NMT_Mtokenizer.pkl','rb') as f:
  vocab_size_target,Mword2index,marathiTokenizer = pkl.load(f)

In [45]:
# Restore the padded train / test id sequences saved earlier.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('English - Marathi/NMT_data.pkl','rb') as f:
    X_train, y_train, X_test, y_test = pkl.load(f)

# Pay Attention - Importing Attention Layer

In [25]:
from attention import AttentionLayer

In [26]:
# Use the backend that ships with tf.keras: every layer and model in this notebook is
# imported from tensorflow.keras, so clear_session() must reset that same Keras state
# rather than the state of a separately installed standalone `keras` package.
from tensorflow.keras import backend as K
K.clear_session()
# Width of the embeddings and of every LSTM hidden / cell state
latent_dim = 500

# Creating Our Main Neural Network

In [27]:
# Encoder: padded English id sequences -> embeddings -> three stacked LSTMs.
# NOTE: the inference cell later addresses layers of the saved model by position
# (model_loaded.layers[i]), so the order in which layers are created here matters.
encoder_inputs = Input(shape=(max_length_english,)) 
enc_emb = Embedding(vocab_size_source, latent_dim,trainable=True)(encoder_inputs)
#LSTM 1 (full output sequence is passed on; its final states are not used further)
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
#LSTM 2 (same as above)
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
#LSTM 3: its output sequence feeds the attention layer, its final states seed the decoder
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
# Set up the decoder. Sequence length is left open (None).
decoder_inputs = Input(shape=(None,)) 
dec_emb_layer = Embedding(vocab_size_target, latent_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs)
#LSTM using the last encoder LSTM's final states as initial state.
# The two extra return values are this LSTM's final states (the "fwd"/"back" names
# notwithstanding, the layer is a plain LSTM, not a bidirectional one).
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])
#Attention Layer: called on the encoder output sequence and the decoder output sequence
attn_layer = AttentionLayer(name='attention_layer') 
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# Concat attention output and decoder LSTM output along the feature axis
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
#Dense layer: per-time-step softmax over the Marathi vocabulary
decoder_dense = TimeDistributed(Dense(vocab_size_target, activation='softmax')) 
decoder_outputs = decoder_dense(decoder_concat_input)
# Define the model: inputs are [English ids, Marathi decoder ids], output is the softmax sequence
model = Model([encoder_inputs, decoder_inputs], decoder_outputs) 
# Write a diagram of the graph to train_model.png
plot_model(model, to_file='train_model.png', show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [28]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 35)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 35, 500)      2821500     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 35, 500), (N 2002000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

# Compiling & Defining Checkpoints

In [29]:
# Compile with RMSprop. The sparse categorical cross-entropy loss is paired with the
# integer token-id targets passed to fit; accuracy is tracked as an extra metric.
model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

In [30]:
# Early-stopping callback that watches validation loss (lower is better) and logs when it triggers.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [31]:
# Inspect the first five padded target sequences ...
print(y_train[:5])
# ... and every target sequence with its last position removed (the slice fed to the decoder).
print(y_train[:,:-1])

[[   1   39  105    3    5    2    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   1   46 1258 4333 1076    3    2    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   1   25  542  907   48    2    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   1   22  233    5    2    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]
 [   1   25  753   36 3155 5404 4334    2    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0]]
[[   1   39  105 ...    0    0    0]
 [

## Checkpoint for saving the model whenever validation loss improves

In [32]:
# Save the model to english_to_marathi+attention.h5 only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(filepath="english_to_marathi+attention.h5", 
                             monitor='val_loss',
                             verbose=1, 
                             save_best_only=True,
                             mode='min')

## A keyboard interrupt was made during training because the accuracy had already saturated after only 25 epochs

In [33]:
# Teacher forcing: the decoder input is each target sequence without its last position,
# and the training target is the same sequence shifted one step left (the next token),
# with a trailing axis of size 1 for the sparse cross-entropy loss.
decoder_input_train = y_train[:, :-1]
decoder_target_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)[:, 1:]
decoder_input_val = y_test[:, :-1]
decoder_target_val = y_test.reshape(y_test.shape[0], y_test.shape[1], 1)[:, 1:]

history = model.fit(
    [X_train, decoder_input_train],
    decoder_target_train,
    epochs=50,
    # Pass the EarlyStopping callback `es` together with the checkpoint so training
    # halts on its own once val_loss stops improving, instead of by manual interrupt.
    callbacks=[checkpoint, es],
    batch_size=250,
    validation_data=([X_test, decoder_input_val], decoder_target_val),
)

Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.78112, saving model to english_to_marathi+attention.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.78112 to 0.68071, saving model to english_to_marathi+attention.h5
Epoch 3/50

Epoch 00003: val_loss improved from 0.68071 to 0.59326, saving model to english_to_marathi+attention.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.59326 to 0.52446, saving model to english_to_marathi+attention.h5
Epoch 5/50

Epoch 00005: val_loss improved from 0.52446 to 0.47423, saving model to english_to_marathi+attention.h5
Epoch 6/50

Epoch 00006: val_loss improved from 0.47423 to 0.43461, saving model to english_to_marathi+attention.h5
Epoch 7/50

Epoch 00007: val_loss improved from 0.43461 to 0.41350, saving model to english_to_marathi+attention.h5
Epoch 8/50

Epoch 00008: val_loss improved from 0.41350 to 0.38300, saving model to english_to_marathi+attention.h5
Epoch 9/50

Epoch 00009: val_loss improved from 0.38300 to 0.36180, saving m

KeyboardInterrupt: 

# Importing & Loading Saved Model

In [20]:
from tensorflow.keras.models import load_model

In [22]:
from attention import AttentionLayer

In [23]:
# Reload the checkpointed model; the custom AttentionLayer class must be supplied so
# Keras can rebuild that layer from the saved file.
model_loaded = load_model('english_to_marathi+attention.h5',custom_objects={'AttentionLayer': AttentionLayer})

# Creating Inference Model!!

In [25]:
latent_dim=500

# ---- Encoder inference model ----
encoder_inputs = model_loaded.input[0]  # encoder input tensor of the trained model
# layers[6] is addressed by position; it is expected to be the last encoder LSTM, whose
# outputs are (output sequence, final hidden state, final cell state).
encoder_outputs, state_h, state_c = model_loaded.layers[6].output
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])

# Source sequence length is read from the trained model instead of hard-coding 35,
# so the inference graph stays consistent if the training maximum length changes.
encoder_seq_len = int(encoder_inputs.shape[1])

# ---- Decoder inference model ----
# These inputs carry the decoder states from the previous time step ...
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
# ... and the full encoder output sequence that the attention layer reads.
decoder_hidden_state_input = Input(shape=(encoder_seq_len, latent_dim))

# Get the embeddings of the decoder sequence (layers[3] / layers[5] are addressed by
# position: decoder input layer and decoder embedding).
decoder_inputs = model_loaded.layers[3].output
dec_emb_layer = model_loaded.layers[5]
dec_emb2= dec_emb_layer(decoder_inputs)

# To predict the next word, start the decoder LSTM from the previous step's states
decoder_lstm = model_loaded.layers[7]
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])

# Attention and concatenation layers were given explicit names when the training model
# was built, so look them up by name rather than by position.
attn_layer = model_loaded.get_layer('attention_layer')
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
concate = model_loaded.get_layer('concat_layer')
decoder_inf_concat = concate([decoder_outputs2, attn_out_inf])

# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_dense = model_loaded.layers[10]
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model: (token ids, encoder outputs, h, c) -> (token probabilities, new h, new c)
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [26]:
# Print the layer-by-layer summary of the inference decoder.
decoder_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 500)    6793500     input_2[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 500)]        0                                            
____________________________________________________________________________________________

## Defining index2word for loaded tokenizers

In [28]:
# id -> word lookups taken from the fitted tokenizers, used to turn id sequences back into text.
Eindex2word = englishTokenizer.index_word
Mindex2word = marathiTokenizer.index_word

# Decoder function for decoding Numerical output from model

In [34]:
def decode_output(input_seq, max_len=25):
    """Greedily decode one encoded English sentence into Marathi text.

    Args:
        input_seq: Array passed straight to ``encoder_model.predict`` (the callers in
            this notebook pass a single padded id sequence reshaped to ``(1, 35)``).
        max_len: Maximum number of words to generate before giving up. Defaults to
            25, the limit that was previously hard-coded as ``26-1``.

    Returns:
        The predicted words, each preceded by one space (so a non-empty result
        starts with a space); an empty string if nothing was generated.
    """
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Seed the decoder with the id of the 'start' marker word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = Mword2index['start']

    decoded_words = []
    while len(decoded_words) < max_len:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])

        # Id 0 is the padding value and maps to no word: nothing more to decode.
        if sampled_token_index == 0:
            break

        sampled_token = Mindex2word[sampled_token_index]
        # Stop as soon as the 'end' marker is produced. The previous version tested
        # for 'end' inside the "token is not 'end'" branch, so the test could never
        # succeed and a model that kept emitting 'end' would loop forever.
        if sampled_token == 'end':
            break
        decoded_words.append(sampled_token)

        # Feed the sampled token back in as the next decoder input (length-1 sequence).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Carry the decoder states forward to the next step.
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [35]:
def Seq2Marathi(input_seq):
    """Turn a Marathi id sequence into text, skipping id 0 and the 'start'/'end' ids.

    Every kept word is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    kept_words = [
        Mindex2word[token_id] + ' '
        for token_id in input_seq
        if not (token_id == 0
                or token_id == Mword2index['start']
                or token_id == Mword2index['end'])
    ]
    return ''.join(kept_words)

def Seq2English(input_seq):
    """Turn an English id sequence into text, skipping id 0.

    Every kept word is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    return ''.join(Eindex2word[token_id] + ' ' for token_id in input_seq if token_id != 0)

In [36]:
# Show source, reference and predicted translation for the first ten held-out sentences
for sample_idx in range(10):
  source_ids = X_test[sample_idx]
  reference_ids = y_test[sample_idx]
  print("English:-",Seq2English(source_ids))
  print("Original Marathi:",Seq2Marathi(reference_ids))
  print("Predicted Marathi:",decode_output(source_ids.reshape(1,35)))
  print("\n")

English:- tom walked to his office 
Original Marathi: टॉम आपल्या ऑफिसला चालत गेला 
Predicted Marathi:  टॉम त्याच्या ऑफिसला गेला


English:- i want you near me 
Original Marathi: मला तू माझ्याजवळ हवा आहेस 
Predicted Marathi:  मला तुम्ही माझ्याजवळ हव्या आहात


English:- i dont want to go outside 
Original Marathi: मला बाहेर नाही जायचंय 
Predicted Marathi:  मला बाहेर जायचं नाहीये


English:- february th is northern territories day in japan 
Original Marathi: जपानमध्ये फेब्रुवारी हा उत्तर भूप्रदेश दिवस असतो 
Predicted Marathi:  जपानमध्ये हा एकेकाळी ब्रिटिश लोकं एक रोमन होता


English:- you are my prisoner 
Original Marathi: तू माझा कैदी आहेस 
Predicted Marathi:  तुम्ही माझे कैदी आहात


English:- its all my fault 
Original Marathi: ही सगळी माझी चूक आहे 
Predicted Marathi:  ही सगळी माझीच चूक आहे


English:- show me your hands 
Original Marathi: हात दाखव 
Predicted Marathi:  तुझे हात दाखवा


English:- he likes geography and history 
Original Marathi: त्याला भूगोल आणि इतिहास आवडतात 
Predicted 

# Defining a predict function

In [40]:
def remove_punc(text_list):
  """Return a new list with ``string.punctuation`` characters deleted from each sentence.

  Each sentence is split on single spaces, every piece is stripped of
  punctuation, and the pieces are joined back with single spaces.
  """
  strip_table = str.maketrans(dict.fromkeys(string.punctuation))

  def _clean(sentence):
    return ' '.join(piece.translate(strip_table) for piece in sentence.split(' '))

  return [_clean(sentence) for sentence in text_list]

def PredMarathi(statement, max_len=35):
    """Translate one English sentence and print the predicted Marathi sentence.

    The sentence is lower-cased and stripped of apostrophes, punctuation and
    ASCII digits, then encoded with ``englishTokenizer``, right-padded to
    ``max_len`` ids and decoded with ``decode_output``.

    Args:
        statement: Raw English sentence.
        max_len: Length the id sequence is padded to. Defaults to 35, the value
            previously hard-coded here as the longest English training sentence.

    Returns:
        None. The prediction is printed.
    """
    # Lower-case, then drop apostrophes, punctuation, digits and outer whitespace
    text = re.sub("'", '', statement.lower())
    text = remove_punc([text])[0]
    text = text.translate(str.maketrans('', '', digits)).strip()

    # Encode with the fitted English tokenizer and pad on the right
    encoded = englishTokenizer.texts_to_sequences([text])
    encoded = pad_sequences(encoded, maxlen=max_len, padding='post')

    # Use decode_output, the greedy decoder defined in this notebook. The earlier call
    # to `decode_sequence` referred to a name that is never defined here and raised
    # NameError on a fresh kernel.
    print("Predicted Marathi:", decode_output(encoded.reshape(1, max_len)))

# Some Predictions 

In [41]:
# Example: punctuation in the input is removed before translation.
PredMarathi("I am a good boy!!!")

Predicted Marathi:  मी चांगला मुलगा आहे


In [42]:
# Example: a short question.
PredMarathi("How are you?")

Predicted Marathi:  तू कशी आहेस


In [45]:
# Example: a descriptive sentence.
PredMarathi("She is a beautiful girl!!!")

Predicted Marathi:  ती एक सुंदर मुलगी आहे
