In [89]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle as pkl
from sklearn.model_selection import train_test_split
import nltk
import re
import string
from string import digits

In [2]:
# !unzip archive.zip

Archive:  archive.zip
replace Hindi_English_Truncated_Corpus.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

## **Encoder-Decoder NMT**

In [99]:
corpus = pd.read_csv("Hindi_English_Truncated_Corpus.csv")
corpus.drop(columns = ['source'], inplace=True)
corpus.dropna(inplace=True)
corpus

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...
127602,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
127603,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127604,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
127605,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [100]:
# here, we are randomly picking 30k rows from the dataset for the sake of training on our system
corpus = corpus.sample(n=15000,random_state=42)
corpus.shape

(15000, 2)

### Preprocessing the data for encoder-decoder input

In [101]:
# lowercasing all the english/hindi(if any) characters
corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: str(x).lower())
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: str(x).lower())
corpus.head()

Unnamed: 0,english_sentence,hindi_sentence
3556,he declares the result and reports it to the e...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...
25899,was a little uncomfortable for them.,थोडा कठिन था।
90924,"a multi-purpose auditorium , a branch of the s...","बहुउद्देशीय सभागार , भारतीय स्टेट बैंक की शाखा..."
78213,no fees is to be paid for filing the appeal to...,अधिकरण में अपील करने के लिए कोई फीस नहीं देनी ...
96955,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति (kaun banega crorepa...


In [102]:
# Removing quotes from sentences
corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: re.sub("'",'',x))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: re.sub("'",'',x))

In [103]:
spc = set(string.punctuation) # set of all special characters
# now, we'll remove the special characters from all the sentences

corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in spc))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in spc))

In [104]:
# Removing all the numbers from text
remove_digits = str.maketrans('', '', digits)
corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: x.translate(remove_digits))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: re.sub("[0123456789]","",x))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: re.sub("[0123456789]","",x))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: re.sub("[०१२३४५६७८९१०]","",x))

In [105]:
# removing the extra spaces from both hindi and english corpus
corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: x.strip())
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: x.strip())

corpus['english_sentence'] = corpus['english_sentence'].apply(lambda x: re.sub(' +',' ',x))
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: re.sub(' +',' ',x))

In [106]:
# Adding the start and end tokems to target sequence so that the decoder knows when to start and stop
corpus['hindi_sentence'] = corpus['hindi_sentence'].apply(lambda x: 'START_ '+ x + ' _END')

In [107]:
corpus.head()

Unnamed: 0,english_sentence,hindi_sentence
3556,he declares the result and reports it to the e...,START_ वही परिणाम की घोषणा करता है और निर्वाचन...
25899,was a little uncomfortable for them,START_ थोडा कठिन था। _END
90924,a multipurpose auditorium a branch of the stat...,START_ बहुउद्देशीय सभागार भारतीय स्टेट बैंक की...
78213,no fees is to be paid for filing the appeal to...,START_ अधिकरण में अपील करने के लिए कोई फीस नही...
96955,headind kaun banega crorepati,START_ शीर्षक कौन बनेगा करोड़पति kaun banega c...


## Creating the Vocabulary

In [108]:
# creating hindi and english vocabulary
eng_words = set()
for eng in corpus['english_sentence']:
    for word in eng.split():
        if word not in eng_words:
            eng_words.add(word)
            
hin_words = set()
for hin in corpus['hindi_sentence']:
    for word in hin.split():
        if word not in hin_words:
            hin_words.add(word)
            
print('Words in English Vocabulary: ',len(eng_words))
print('Words in Hindi Vocabulary: ', len(hin_words))

Words in English Vocabulary:  22998
Words in Hindi Vocabulary:  27767


In [109]:
# checking the max_length for source and target sentences
corpus['len_english_sentences'] = corpus['english_sentence'].apply(lambda x: len(x.split(" ")))
corpus['len_hindi_sentences'] = corpus['hindi_sentence'].apply(lambda x: len(x.split(" ")))

corpus.head()

Unnamed: 0,english_sentence,hindi_sentence,len_english_sentences,len_hindi_sentences
3556,he declares the result and reports it to the e...,START_ वही परिणाम की घोषणा करता है और निर्वाचन...,19,22
25899,was a little uncomfortable for them,START_ थोडा कठिन था। _END,6,5
90924,a multipurpose auditorium a branch of the stat...,START_ बहुउद्देशीय सभागार भारतीय स्टेट बैंक की...,32,31
78213,no fees is to be paid for filing the appeal to...,START_ अधिकरण में अपील करने के लिए कोई फीस नही...,13,13
96955,headind kaun banega crorepati,START_ शीर्षक कौन बनेगा करोड़पति kaun banega c...,4,9


In [110]:
corpus = corpus[corpus.len_english_sentences<=30]
corpus = corpus[corpus.len_hindi_sentences<=30]
corpus.shape

(12709, 4)

In [111]:
max_length_source = max(corpus['len_english_sentences'])
max_length_target = max(corpus['len_hindi_sentences'])

print("English: ",max_length_source)
print("Hindi: ",max_length_target)

English:  30
Hindi:  30


In [112]:
# Now we'll convert the given word into an integer index and vice versa

source_words = sorted(list(eng_words))
target_words = sorted(list(hin_words))
num_encoder_tokens = len(eng_words)
num_decoder_tokens = len(hin_words)

In [113]:
# this has been done for zero padding
num_encoder_tokens+=1
num_decoder_tokens+=1

In [114]:
# Assigning an index to each token present in the vocabulary
source_token_index = dict([(word,i+1) for i,word in enumerate(source_words)])
target_token_index = dict([(word,i+1) for i,word in enumerate(target_words)])

In [115]:
# reversing the word index format to index word format to generate outputs
reverse_source_char_index = dict((i,word) for word,i in source_token_index.items())
reverse_target_char_index = dict((i,word) for word,i in target_token_index.items())

## Preparing the data for training

In order to proceed with this session I want you to run and understand the piece of code that is written here:
```python
txt = 'we are learning encoder-decoder architecture'
for t,word in enumerate(txt.split()):
    print(t," ",word)
print(len(txt.split())-1)
source_token_index['my']
```

In [116]:
source, target = corpus['english_sentence'], corpus['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(source,target,test_size = 0.1)

In [117]:
# this particular block here is taking the batch and is performing one-hot-encoding
# this function will generate the data in batches for testing and training
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    while True:
        for j in range(0,len(X),batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_source),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_target),dtype='float32')
            decoder_target_data = np.zeros((batch_size,max_length_target,num_decoder_tokens),dtype='float32')
            for i, (input_text,target_text) in enumerate(zip(X[j:j+batch_size],y[j:j+batch_size])):
                for t,word in enumerate(input_text.split()):
                    encoder_input_data[i,t] = source_token_index[word] # encoder input sequence
                for t,word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        # this means that the sentence is not completed hence it will be put back as the decoder input
                        decoder_input_data[i,t] = target_token_index[word] # decoder input sequence
                    if t>0:
                        # this will run only after the first index or the first decoder output hence t>0
                        # decoder target sequence (one hot encoded)
                        # does not include StarSeq token
                        # offset by one timestep
                        # and here we are feeding the output of the previous time stamp
                        decoder_target_data[i,t-1,target_token_index[word]] = 1.
            yield([encoder_input_data,decoder_input_data],decoder_target_data)

## Encoder-Decoder Architecture

### Encoder

In [118]:
latent_dim = 64

# this None below indicates that the sequences can have arbitrary length
encoder_inputs = Input(shape=(None,))
encoder_embeddings = Embedding(num_encoder_tokens,latent_dim,mask_zero=True)(encoder_inputs)
# mask zero is basically used for padding the sequences with zero so that all are of similar length

encoder_lstm = LSTM(latent_dim,return_state=True)
encoder_outputs, state_h,state_c = encoder_lstm(encoder_embeddings)
# we discard the encoder_outputs and only keep the states because that's what we need here for the encoder part

encoder_states = [state_h,state_c] # output states

### Decoder

In [119]:
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens,latent_dim,mask_zero=True)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# decoder must return the full output sequence, and internal states.
# we don't use the return states in training model but we'll use them in inference.
decoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)

# here we'll also add a dense layer to get the outputs
decoder_dense = Dense(num_decoder_tokens,activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs) # here the output is fed back into the decoder layer

# hence we'll define a model that will turn the encoder_input_data and decoder_input_data into decoder_target_data
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)

In [120]:
# now that our model has been created we'll train it using 100 epochs
model.compile(optimizer='adam',loss='categorical_crossentropy')
model.summary()

Model: "functional_19"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, None, 64)     1471936     input_13[0][0]                   
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, None, 64)     1777152     input_14[0][0]                   
______________________________________________________________________________________

In [121]:
train_sent = len(X_train)
val_sent = len(X_test)
batch_size = 128
epochs = 30

In [122]:
# training the model, this model is giving us the resource exhausted error, hence, we'll take it to google colab for now
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size=batch_size),
                   steps_per_epoch=train_sent//batch_size,
                   epochs=epochs,
                   validation_data=generate_batch(X_test,y_test,batch_size=batch_size),
                   validation_steps=val_sent//batch_size)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff3383adf98>

In [123]:
model.save_weights('weights.h5')

In [129]:
## now we'll encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs,encoder_states)

# Decoder setup, below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]

# get the embeddings of the decoder sequence
decoder_embeddings_2 = decoder_embedding_layer(decoder_inputs)

# To preduct the next word in the sequence, set the initial states to the states from previous time step
decoder_outputs_2,state_h_2,state_c_2 = decoder_lstm(decoder_embeddings_2, initial_state=decoder_states_inputs)
decoder_states_2 = [state_h_2,state_c_2]
decoder_outputs_2 = decoder_dense(decoder_outputs_2)
# here, in the above code we've used the dense softmax layer to generate probability distance over the target vocabulary

# Final decoder model
decoder_model = Model([decoder_inputs]+decoder_states_inputs,
                      [decoder_outputs_2]+ decoder_states_2)

In [130]:
def decoder_sequence(input_seq):
  # encode the input as state vectors,
  states_value = encoder_model.predict(input_seq)
  # generate empty target sequence of length 1
  target_seq = np.zeros((1,1))

  # add the start character as the first character of the target sequence
  target_seq[0,0] = target_token_index['START_']

  # sampling loop for a batch of sequences
  stop_condition = False
  decoded_sentence = ''
  while not stop_condition:
    output_tokens, h, c = decoder_model.predict([target_seq]+states_value)

    # sample a token
    sampled_token_index = np.argmax(output_tokens[0,-1,:])
    sampled_char = reverse_target_char_index[sampled_token_index]
    # here we've generated the first word of the sequence using the decoder
    decoded_sentence += ' '+sampled_char

    # exit codition: either hit max_length or find the stop character
    if (sampled_char == '_END' or len(decoded_sentence)>50):
      stop_condition = True

    # update the target sequence for the next time step
    target_seq = np.zeros((1,1))
    target_seq[0,0] = sampled_token_index # for the next time step

    # update the states 
    states_value = [h,c]

  return decoded_sentence

In [131]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [139]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
print(input_seq)
decoded_sent = decoder_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sent[:-4])

[[22218. 18371.  9322. 10931.  2006.  4981.     0.     0.     0.     0.
      0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
      0.     0.     0.     0.     0.     0.     0.     0.     0.     0.]]
Input English sentence: what should i know before deciding
Actual Hindi Translation:  फैसला लेने से पहले मुझे क्या जानना जरुरी है 
Predicted Hindi Translation:  क्योंकि आप के बारे में क्या 
