In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import string
import contractions
from tensorflow.keras.utils import Sequence
from tensorflow.keras.layers import Input, LSTM, Embedding
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model

In [2]:
corpus_df = pd.read_csv('eng-hindi.csv')
corpus_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [3]:
def data_processing(sentence, lang):
    # strip any unneccesary spaces
    sentence = sentence.strip()
    # expand word contractions for english
    if lang == 'eng':
        sentence = contractions.fix(sentence)
    sentence = sentence.lower()
    # remove all punctuations from sentence '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    sentence = sentence.translate(str.maketrans("", "", string.punctuation + "‘“”"))
    sentence = " ".join([w for w in sentence.split()]) # de-constructing the sentence
    # append <BOS> and <EOS> token for target column
    # EOS (end of a sentence) 
    # BOS (beginning of a sentence)
    if lang == 'hindi':
        sentence = "<BOS> " + sentence + " <EOS>"
    return sentence

corpus_df.english_sentence = corpus_df.english_sentence.apply(data_processing, lang='eng')
corpus_df.hindi_sentence = corpus_df.hindi_sentence.apply(data_processing, lang='hindi')

In [4]:
corpus_df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,<BOS> राजनीतिज्ञों के पास जो कार्य करना चाहिए ...
1,i would like to tell you about one such child,<BOS> मई आपको ऐसे ही एक बच्चे के बारे में बतान...
2,this percentage is even greater than the perce...,<BOS> यह प्रतिशत भारत में हिन्दुओं प्रतिशत से ...
3,what we really mean is that they are bad at no...,<BOS> हम ये नहीं कहना चाहते कि वो ध्यान नहीं द...
4,the ending portion of these vedas is called up...,<BOS> इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता ...


In [5]:
eng_index, hindi_index = set(), set()

for idx, (sen, sen1) in enumerate(zip(
              corpus_df['english_sentence'],
              corpus_df['hindi_sentence'])):
    sen = sen.split()
    sen1 = sen1.split()
    if len(sen) > 20:
        eng_index.add(idx)
    if len(sen1) > 20:
        hindi_index.add(idx)
indices = list(eng_index.union(hindi_index))

corpus_df.drop(indices, axis=0, inplace=True)

In [6]:
# splitting into train and test set
x, y = corpus_df['english_sentence'].to_numpy(), corpus_df['hindi_sentence'].to_numpy()

eng_train, eng_test, hindi_train, hindi_test = train_test_split(x, y, test_size= .1)

In [7]:
# retrieve english and hindi vocabulary.
eng_vocab = set()
hindi_vocab = set()

MAX_LEN_ENG = 0
MAX_LEN_HIN = 0

for idx, sen in enumerate(eng_train):
    eng_words = sen.split()
    # it returns max sentence length
    MAX_LEN_ENG = max(MAX_LEN_ENG, len(eng_words))
    for e_w in eng_words:
        if e_w not in eng_vocab:
            eng_vocab.add(e_w)

for idx, sen in enumerate(hindi_train):
    hindi_words = sen.split()
    MAX_LEN_HIN = max(MAX_LEN_HIN, len(hindi_words))
    for h_w in hindi_words:
        if h_w not in hindi_vocab:
            hindi_vocab.add(h_w)

In [8]:
en_vocab_idx = {j : i for i, j in enumerate(eng_vocab, 1)}
idx_vocab_en = dict(map(reversed, en_vocab_idx.items()))

hin_vocab_idx = {j : i for i, j in enumerate(hindi_vocab, 1)}
hin_vocab_idx['UNW'] = -1 # add a extra unw (unknown word) if any new words in test set then assign -1 to it
idx_vocab_hin = dict(map(reversed, hin_vocab_idx.items()))

In [9]:
INPUT_VOCAB = len(eng_vocab) + 1
TARGET_VOCAB = len(hindi_vocab) + 1

print(f'Maximum sentence length(i.e n words) ENGLISH: {MAX_LEN_ENG}')
print(f'Maximum sentence length(i.e n words) HINDI: {MAX_LEN_HIN}')
print(f'Size of the vocabulary (English) : {INPUT_VOCAB}')
print(f'Size of the vocabulary (Hindi) : {TARGET_VOCAB}')

Maximum sentence length(i.e n words) ENGLISH: 20
Maximum sentence length(i.e n words) HINDI: 20
Size of the vocabulary (English) : 39352
Size of the vocabulary (Hindi) : 41705


In [10]:
class DataGenerator(Sequence):
    """
    Generates batches 
    """    
    def __init__(self, eng_train, hindi_train, 
                 MAX_LEN_ENG, 
                 MAX_LEN_HIN,
                 TARGET_VOCAB,
                 batch_size, 
                 shuffle):        
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.eng_train = eng_train
        self.hindi_train = hindi_train
        self.MAX_LEN_ENG = MAX_LEN_ENG
        self.MAX_LEN_HIN = MAX_LEN_HIN
        self.TARGET_VOCAB = TARGET_VOCAB
        self.on_epoch_end()
        
    def __len__(self):
        return len(self.eng_train) // self.batch_size
        
    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        eng_batch = [self.eng_train[i] for i in indexes]
        hindi_batch = [self.hindi_train[i] for i in indexes]
        input_data, decoder_target = self.__data_generation(eng_batch, hindi_batch)
        return input_data, decoder_target
    
    def on_epoch_end(self):
        self.indexes = np.arange(len(self.eng_train))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
            
    def __data_generation(self, eng_batch, hindi_batch):
        encoder_input = np.zeros((self.batch_size, self.MAX_LEN_ENG), dtype='float32')  
        decoder_input = np.zeros((self.batch_size, self.MAX_LEN_HIN), dtype='float32')
        decoder_target = np.zeros((self.batch_size, self.MAX_LEN_HIN, TARGET_VOCAB), dtype='float32')
        for enum, (sen1, sen2) in enumerate(zip(eng_batch, hindi_batch)):
            for x, word in enumerate(sen1.split()):
                encoder_input[enum, x] = en_vocab_idx[word]
            for x, word in enumerate(sen2.split()):
                if word not in hin_vocab_idx.keys():
                    decoder_input[enum, x] = hin_vocab_idx['UNW']
                    if x > 0:
                        decoder_target[enum, x-1, hin_vocab_idx['UNW']] = 1.
                else:
                    decoder_input[enum, x] = hin_vocab_idx[word]
                    if x > 0:
                        decoder_target[enum, x-1, hin_vocab_idx[word]] = 1.
        return [encoder_input, decoder_input], decoder_target

In [11]:
latent_dim = 300

# Encoder
encoder_inputs = Input(shape=(None,))
encoder_emb =  Embedding(INPUT_VOCAB, latent_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs  = Input(shape=(None,))
dec_emb_layer = Embedding(TARGET_VOCAB, latent_dim, mask_zero=True)
decoder_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
decoder_dense = Dense(TARGET_VOCAB, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    11805600    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    12511500    input_2[0][0]                    
______________________________________________________________________________________________

In [14]:
batch_size = 16
nb_epochs = 5

train_gen = DataGenerator(eng_train, hindi_train, 
                          MAX_LEN_ENG, 
                          MAX_LEN_HIN,
                          TARGET_VOCAB,
                          batch_size=batch_size,
                          shuffle=True)
dev_gen = DataGenerator(eng_test, hindi_test, 
                        MAX_LEN_ENG, 
                        MAX_LEN_HIN,
                        TARGET_VOCAB,
                        batch_size=batch_size,
                        shuffle=True)

model.fit_generator(train_gen,
                    steps_per_epoch=INPUT_VOCAB//batch_size,
                    epochs=nb_epochs, 
                    max_queue_size=5,
                    validation_data=dev_gen,
                    validation_steps=TARGET_VOCAB//batch_size,
                    verbose=1,
                    shuffle=True,
                    use_multiprocessing=True,
                    workers=10)

Epoch 1/5
  88/2459 [>.............................] - ETA: 19:29 - loss: 3.9187

UnknownError: CUDNN_STATUS_BAD_PARAM
in tensorflow/stream_executor/cuda/cuda_dnn.cc(1424): 'cudnnSetRNNDataDescriptor( data_desc.get(), data_type, layout, max_seq_length, batch_size, data_size, seq_lengths_array, (void*)&padding_fill)' [Op:CudnnRNNV3]