In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import string, os
import contractions
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Input, LSTM, Embedding
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras.layers import Concatenate, Dropout
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.callbacks import ReduceLROnPlateau, CSVLogger
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras import backend
from tensorflow.keras.models import load_model

In [2]:
corpus_df = pd.read_csv('eng-hindi.csv').sample(frac=.06, random_state=3, replace=False).reset_index(drop=True)
corpus_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,Working out of his studio at Amir Mahal in Che...,चेन्नै के रोयपेट्टां में अपने स्टुड़ियो में का...
1,Temperature can be observed by inserting an or...,जानवर की गुदा में सामान्य थर्मामीटर डालकर उसका...
2,but for a life jacket.,मगर बस एक लाइफ़-जैकेट ही बची थी।
3,Background and next steps,पृष्ठभूमि तथा अगले कदम
4,Main point: Detailed story of Mahabharat,मुख्य उल्लेख :महाभारत की विस्तृत कथा


In [3]:
def data_processing(sentence, lang):
    # strip any unneccesary spaces
    sentence = sentence.strip()
    # expand word contractions for english
    if lang == 'eng':
        sentence = contractions.fix(sentence)
    sentence = sentence.lower()
    # remove all punctuations from sentence '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    sentence = sentence.translate(str.maketrans("", "", string.punctuation + "‘“”"))
    sentence = " ".join([w for w in sentence.split()]) # de-constructing the sentence
    # append <BOS> and <EOS> token
    # EOS (end of a sentence) 
    # BOS (beginning of a sentence)
    sentence = "<BOS> " + sentence + " <EOS>"
    return sentence

corpus_df.english_sentence = corpus_df.english_sentence.apply(data_processing, lang='eng')
corpus_df.hindi_sentence = corpus_df.hindi_sentence.apply(data_processing, lang='hindi')

In [4]:
corpus_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,<BOS> working out of his studio at amir mahal ...,<BOS> चेन्नै के रोयपेट्टां में अपने स्टुड़ियो ...
1,<BOS> temperature can be observed by inserting...,<BOS> जानवर की गुदा में सामान्य थर्मामीटर डालक...
2,<BOS> but for a life jacket <EOS>,<BOS> मगर बस एक लाइफ़जैकेट ही बची थी। <EOS>
3,<BOS> background and next steps <EOS>,<BOS> पृष्ठभूमि तथा अगले कदम <EOS>
4,<BOS> main point detailed story of mahabharat ...,<BOS> मुख्य उल्लेख महाभारत की विस्तृत कथा <EOS>


In [5]:
eng_index, hindi_index = set(), set()

for idx, (sen, sen1) in enumerate(zip(
              corpus_df['english_sentence'],
              corpus_df['hindi_sentence'])):
    sen = sen.split()
    sen1 = sen1.split()
    if len(sen) > 20:
        eng_index.add(idx)
    if len(sen1) > 20:
        hindi_index.add(idx)
        
indices = list(eng_index.union(hindi_index))
corpus_df.drop(indices, axis=0, inplace=True)

In [6]:
# splitting into train and test set
x, y = corpus_df['english_sentence'].to_numpy(), corpus_df['hindi_sentence'].to_numpy()

eng_train, eng_test, hindi_train, hindi_test = train_test_split(x, y, test_size= .05, random_state=3)

In [7]:
# retrieve english and hindi vocabulary.
eng_vocab = set()
hindi_vocab = set()

MAX_LEN_ENG = 0
MAX_LEN_HIN = 0

for idx, sen in enumerate(eng_train):
    eng_words = sen.split()
    # it returns max sentence length
    MAX_LEN_ENG = max(MAX_LEN_ENG, len(eng_words))
    for e_w in eng_words:
        if e_w not in eng_vocab:
            eng_vocab.add(e_w)

for idx, sen in enumerate(hindi_train):
    hindi_words = sen.split()
    MAX_LEN_HIN = max(MAX_LEN_HIN, len(hindi_words))
    for h_w in hindi_words:
        if h_w not in hindi_vocab:
            hindi_vocab.add(h_w)
            
# Adding unknown word tokens
eng_vocab.add('UNK')
hindi_vocab.add('UNK')
MAX_LEN_ENG += 1
MAX_LEN_HIN += 1

In [8]:
en_vocab_idx = {j : i for i, j in enumerate(eng_vocab, 1)}
idx_vocab_en = dict(map(reversed, en_vocab_idx.items()))

hin_vocab_idx = {j : i for i, j in enumerate(hindi_vocab, 1)}
idx_vocab_hin = dict(map(reversed, hin_vocab_idx.items()))

In [9]:
INPUT_VOCAB = len(eng_vocab) + 1
TARGET_VOCAB = len(hindi_vocab) + 1

print(f'Maximum sentence length(i.e n words) ENGLISH: {MAX_LEN_ENG}')
print(f'Maximum sentence length(i.e n words) HINDI: {MAX_LEN_HIN}')
print(f'Size of the vocabulary (English) : {INPUT_VOCAB}')
print(f'Size of the vocabulary (Hindi) : {TARGET_VOCAB}')

Maximum sentence length(i.e n words) ENGLISH: 21
Maximum sentence length(i.e n words) HINDI: 21
Size of the vocabulary (English) : 8071
Size of the vocabulary (Hindi) : 9315


In [10]:
class DataGenerator(Sequence):
    """
    Generates batches 
    """    
    def __init__(self, eng_train, hindi_train, 
                 MAX_LEN_ENG, 
                 MAX_LEN_HIN,
                 TARGET_VOCAB,
                 batch_size, 
                 shuffle,
                 training=True):        
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.eng_train = eng_train
        self.hindi_train = hindi_train
        self.MAX_LEN_ENG = MAX_LEN_ENG
        self.MAX_LEN_HIN = MAX_LEN_HIN
        self.TARGET_VOCAB = TARGET_VOCAB
        self.on_epoch_end()
        
    def __len__(self):
        return len(self.eng_train) // self.batch_size
        
    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        eng_batch = [self.eng_train[i] for i in indexes]
        hindi_batch = [self.hindi_train[i] for i in indexes]
        input_data, decoder_target = self.__data_generation(eng_batch, hindi_batch)
        return input_data, decoder_target
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.eng_train))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
            
    def __data_generation(self, eng_batch, hindi_batch):
        encoder_input = np.zeros((self.batch_size, self.MAX_LEN_ENG), dtype='float32')  
        decoder_input = np.zeros((self.batch_size, self.MAX_LEN_HIN), dtype='float32')
        decoder_target = np.zeros((self.batch_size, self.MAX_LEN_HIN, TARGET_VOCAB), dtype='float32')
        
        for enum, sen in enumerate(eng_batch):
            sen = sen.split()
            for x, e_word in enumerate(sen):
                try:
                    encoder_input[enum, x] = en_vocab_idx[e_word]
                except KeyError:
                    encoder_input[enum, x] = en_vocab_idx['UNK']
        
        for enum, sen in enumerate(hindi_batch):
            sen = sen.split()
            for x, h_word in enumerate(sen):
                try:
                    decoder_input[enum, x] = hin_vocab_idx[h_word]
                    if x > 0:
                        decoder_target[enum, x - 1, hin_vocab_idx[h_word]] = 1.
                except KeyError:
                    decoder_input[enum, x] = hin_vocab_idx['UNK']
                    if x > 0:
                        decoder_target[enum, x - 1, hin_vocab_idx['UNK']] = 1.
        return [encoder_input, decoder_input], decoder_target

In [12]:
latent_dim = 150

## Encoder- SETUP
encoder_inputs = Input(shape=(MAX_LEN_ENG,))

encoder_emb_layer = Embedding(INPUT_VOCAB, latent_dim, mask_zero=True)(encoder_inputs)
encoder_dropout1 = Dropout(0.4)(encoder_emb_layer)
encoder_lstm_layer1 = Bidirectional(LSTM(latent_dim, return_sequences=True))(encoder_dropout1)
encoder_lstm_layer2 = Bidirectional(LSTM(latent_dim, return_state=True))

encoder_outputs, fstate_h, fstate_c, bstate_h, bstate_c = encoder_lstm_layer2(encoder_lstm_layer1)
state_h = Concatenate()([fstate_h, bstate_h])
state_c = Concatenate()([fstate_c, bstate_c])
encoder_states = [state_h, state_c]

## Decoder-SETUP
decoder_inputs = Input(shape=(MAX_LEN_HIN,))
decoder_emb_layer = Embedding(TARGET_VOCAB, latent_dim, mask_zero=True)
decoder_dropout1 = Dropout(0.4)
decoder_lstm_layer1 = LSTM(latent_dim*2, return_sequences=True)
decoder_lstm_layer2 = LSTM(latent_dim*2, return_sequences=True, return_state=True)

decoder_dropout1 = decoder_dropout1(decoder_emb_layer(decoder_inputs))
decoder_outputs, _, _ = decoder_lstm_layer2(decoder_lstm_layer1(decoder_dropout1, initial_state=encoder_states))
decoder_dense1 = Dense(latent_dim, activation=LeakyReLU(), kernel_initializer='he_uniform')
decoder_dense2 = Dense(TARGET_VOCAB, activation='softmax')
decoder_outputs = decoder_dense2(decoder_dense1(decoder_outputs))

In [13]:
nmt_model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs, name='NMT')
nmt_model.compile(optimizer=RMSprop(0.01), loss='categorical_crossentropy')
nmt_model.summary(line_length=105)

Model: "NMT"
_________________________________________________________________________________________________________
Layer (type)                      Output Shape           Param #      Connected to                       
input_3 (InputLayer)              [(None, 21)]           0                                               
_________________________________________________________________________________________________________
embedding_2 (Embedding)           (None, 21, 150)        1210650      input_3[0][0]                      
_________________________________________________________________________________________________________
dropout_2 (Dropout)               (None, 21, 150)        0            embedding_2[0][0]                  
_________________________________________________________________________________________________________
input_4 (InputLayer)              [(None, 21)]           0                                               
_________________________________

In [14]:
# Custom Callback
class _learning_rate(Callback) :
    def on_epoch_begin(self, epoch, logs=None):
        lr = float(backend.get_value(self.model.optimizer.lr))
        print(f"Learning Rate for epoch: {epoch}: ", lr)        

# callbacks
R_LR = ReduceLROnPlateau(monitor='val_loss', verbose=1, min_delta=1e-04, cooldown=2)
LOG = CSVLogger('training1.log', separator=';', append=True)
MC = ModelCheckpoint('saved/model-best-{epoch:03d}.h5', mode='min')

# Remove any earlier saved models if any.
if os.listdir('saved'):
    os.system('rm -rf ./saved/*')
        
# Default configs
batch_size = 128
nb_epochs = 100

In [15]:
# Data Generators.
train_gen = DataGenerator(eng_train, hindi_train, 
                          MAX_LEN_ENG, 
                          MAX_LEN_HIN,
                          TARGET_VOCAB,
                          batch_size=batch_size,
                          shuffle=True)
dev_gen = DataGenerator(eng_test, hindi_test, 
                        MAX_LEN_ENG, 
                        MAX_LEN_HIN,
                        TARGET_VOCAB,
                        batch_size=batch_size,
                        shuffle=True)

nmt_model.fit_generator(train_gen,
                        steps_per_epoch=INPUT_VOCAB//batch_size,
                        epochs=nb_epochs,
                        validation_data=dev_gen,
                        validation_steps=TARGET_VOCAB//batch_size,
                        verbose=1,
                        shuffle=True,
                        callbacks=[MC, _learning_rate(), R_LR, LOG])

Learning Rate for epoch: 0:  0.009999999776482582
Epoch 1/100
Learning Rate for epoch: 1:  0.009999999776482582
Epoch 2/100
Learning Rate for epoch: 2:  0.009999999776482582
Epoch 3/100
Learning Rate for epoch: 3:  0.009999999776482582
Epoch 4/100
Learning Rate for epoch: 4:  0.009999999776482582
Epoch 5/100
Learning Rate for epoch: 5:  0.009999999776482582
Epoch 6/100
Learning Rate for epoch: 6:  0.009999999776482582
Epoch 7/100
Learning Rate for epoch: 7:  0.009999999776482582
Epoch 8/100
Learning Rate for epoch: 8:  0.009999999776482582
Epoch 9/100
Learning Rate for epoch: 9:  0.009999999776482582
Epoch 10/100
Learning Rate for epoch: 10:  0.009999999776482582
Epoch 11/100
Learning Rate for epoch: 11:  0.009999999776482582
Epoch 12/100
Learning Rate for epoch: 12:  0.009999999776482582
Epoch 13/100
Learning Rate for epoch: 13:  0.009999999776482582
Epoch 14/100
Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Learning Rate for epoch: 14:  0.00099999993

<tensorflow.python.keras.callbacks.History at 0x7f6948a99bd0>

In [142]:
# create an inference encoder model from the tensors we previously declared
inf_encoder_model = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Generate a new set of tensors for our new inference decoder. Note that we are using new tensors, 
inf_decoder_inputs = Input(shape=(None,), name="inf_decoder_inputs")
# We'll need to force feed the two state variables into the decoder each step.
state_input_h = Input(shape=(latent_dim*2,), name="state_input_h")
state_input_c = Input(shape=(latent_dim*2,), name="state_input_c")
decoder_res, decoder_h, decoder_c = decoder_lstm_layer2(decoder_lstm_layer1(
                                            decoder_emb_layer(inf_decoder_inputs), 
                                            initial_state=[state_input_h, state_input_c]))
inf_decoder_output = decoder_dense2(decoder_dense1(decoder_res))
inf_decoder_model = Model([inf_decoder_inputs, state_input_h, state_input_c],
                         [inf_decoder_output, decoder_h, decoder_c])

In [137]:
eg_gen = DataGenerator(eng_train, hindi_train, 
                          MAX_LEN_ENG, 
                          MAX_LEN_HIN,
                          TARGET_VOCAB,
                          batch_size=1,
                          shuffle=True)

input_seq, target_output = eg_gen.__getitem__(index=4)

In [143]:
# encode the input as state vectors (h, c)
d_emb, d_h, d_c = inf_encoder_model.predict(input_seq)

# Populate the first character of target sequence with the start character.
target_seq = np.zeros((1, 1))
target_seq[0, 0] = hin_vocab_idx['<BOS>']

decode = False
decoded_sentence = ''
while not decode:
    output_tokens, h, c = inf_decoder_model.predict([target_seq, d_h, d_c])
    
    # pick word with max probability
    token_idx = np.argmax(output_tokens.flatten())
    char = idx_vocab_hin[token_idx]
    decoded_sentence += ' ' + char
    
    if char == '<EOS>' or len(decoded_sentence) > 50:
        decode = True
        
    target_seq[0, 0] = hin_vocab_idx[char]
    
    encode_state_vectors = [h, c]

In [144]:
decoded_sentence

' ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३०० ३००'