Installing the sacrebleu library, which is used to compute sentence-level BLEU scores

In [None]:
!pip install sacrebleu

Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |██████                          | 10kB 15.4MB/s eta 0:00:01[K     |████████████                    | 20kB 20.5MB/s eta 0:00:01[K     |██████████████████              | 30kB 11.0MB/s eta 0:00:01[K     |████████████████████████        | 40kB 8.6MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 5.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.6MB/s 
[?25hCollecting portalocker==2.0.0
  Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.0.0 sacrebleu-1.5.1


Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import string
import re
import math
import os
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Bidirectional, Concatenate
from tensorflow.keras.optimizers import RMSprop
from sacrebleu import sentence_bleu

In [None]:
# Read the tab-separated sentence pairs. The file has no header row, so the three
# columns are named here; the third one is stored under the name "path".
df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/Machine Translation/hin.txt",
    sep='\t',
    header=None,
    names=["english_sentence", "hindi_sentence", "path"],
)
df.head()

Unnamed: 0,english_sentence,hindi_sentence,path
0,Wow!,वाह!,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
1,Help!,बचाओ!,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
2,Jump.,उछलो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
3,Jump.,कूदो.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
4,Jump.,छलांग.,CC-BY 2.0 (France) Attribution: tatoeba.org #6...


In [None]:
# Only the two sentence columns are used from here on, so the third column is dropped.
# errors='ignore' keeps the cell re-runnable: without it a second execution raises
# KeyError because 'path' has already been removed.
df = df.drop(columns=['path'], errors='ignore')
df.shape

(2774, 2)

In [None]:
# Count missing values per column.
df.isna().sum()

english_sentence    0
hindi_sentence      0
dtype: int64

***Text Pre-Processing***

In [None]:
# string.punctuation only covers ASCII. The Devanagari danda (U+0964) and double danda
# (U+0965) are added so Hindi sentence terminators do not stay attached to the last word.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + '\u0964\u0965')


def _clean_sentence(text):
    """Normalise one sentence for vocabulary building.

    Steps: lower-case, remove punctuation, remove digit runs, collapse repeated
    spaces and strip the ends. Whitespace is normalised last because removing
    punctuation or digits can leave double or trailing spaces behind.

    Args:
        text: raw sentence string.

    Returns:
        The cleaned sentence string.
    """
    text = text.lower().translate(_PUNCT_TABLE)
    text = re.sub(r'\d+', '', text)
    return re.sub(" +", " ", text).strip()


for column in ('english_sentence', 'hindi_sentence'):
    df[column] = df[column].apply(_clean_sentence)

In [None]:
# Wrap every Hindi sentence in start / end markers.
# Running this cell again wraps the sentences a second time.
start = '<s> '
end = ' </s>'
df['hindi_sentence'] = df['hindi_sentence'].map(lambda sentence: f"{start}{sentence}{end}")

In [None]:
# Preview the first 20 cleaned sentence pairs.
df.head(n=20)

Unnamed: 0,english_sentence,hindi_sentence
0,wow,<s> वाह </s>
1,help,<s> बचाओ </s>
2,jump,<s> उछलो </s>
3,jump,<s> कूदो </s>
4,jump,<s> छलांग </s>
5,hello,<s> नमस्ते। </s>
6,hello,<s> नमस्कार। </s>
7,cheers,<s> वाहवाह </s>
8,cheers,<s> चियर्स </s>
9,got it,<s> समझे कि नहीं </s>


Splitting the sentences into words and counting how often each word occurs in the corpus

In [None]:
# Word -> occurrence count for the English sentences (whitespace-separated words).
english_vocab = {}
for sentence in df.english_sentence:
    for word in sentence.split():
        english_vocab[word] = english_vocab.get(word, 0) + 1

# Same for the Hindi sentences; the <s> and </s> markers are counted like any other word.
hindi_vocab = {}
for sentence in df.hindi_sentence:
    for word in sentence.split():
        hindi_vocab[word] = hindi_vocab.get(word, 0) + 1

In [None]:
# Vocabulary sizes: number of distinct words on each side.
num_encoder_tokens = len(english_vocab)
num_decoder_token = len(hindi_vocab)

# Sentence lengths are measured with split(), the same tokenisation used when the
# vocabularies and the index arrays are built. split(' ') would count an empty
# string for every repeated space and inflate the maximum.
length = [len(sentence.split()) for sentence in df.english_sentence]
max_input_length = max(length)
print('max_input_length: ', max_input_length)

length = [len(sentence.split()) for sentence in df.hindi_sentence]
max_output_length = max(length)
print('max_output_length: ', max_output_length)

max_input_length:  22
max_output_length:  27


In [None]:
# Alphabetically ordered word lists; a word's position becomes its integer id.
input_words = sorted(english_vocab)
target_words = sorted(hindi_vocab)

In [None]:
# word -> integer id, numbered from 0 in sorted order.
# NOTE(review): id 0 is a real word here, and the index arrays are zero-initialised,
# so padding positions carry the same id as the first word - confirm this is intended.
input_token_index = {word: idx for idx, word in enumerate(input_words)}
target_token_index = {word: idx for idx, word in enumerate(target_words)}

Converting each sentence into an array of word indices **(Mapping)**

In [None]:
# One row per sentence pair, zero-filled up to the maximum sentence length.
num_samples = len(df)
encoder_input_data = np.zeros((num_samples, max_input_length), dtype='float32')
decoder_input_data = np.zeros((num_samples, max_output_length), dtype='float32')
# One-hot targets: (sentence, time step, target word id). This is by far the largest
# array, so it is allocated as float32 like the others; np.zeros would otherwise
# default to float64 and take twice the memory.
decoder_target_data = np.zeros(
    (num_samples, max_output_length, num_decoder_token), dtype='float32'
)

In [None]:
# Fill the index arrays sentence by sentence.
sentence_pairs = zip(df.english_sentence, df.hindi_sentence)
for row, (source_sentence, target_sentence) in enumerate(sentence_pairs):
    # Encoder input: ids of the English words, in order.
    for pos, token in enumerate(source_sentence.split()):
        encoder_input_data[row, pos] = input_token_index[token]

    # Decoder input: ids of the Hindi words (including <s> and </s>).
    # Decoder target: the same sequence shifted one step to the left, one-hot encoded,
    # so the target at step t is the decoder input at step t + 1.
    for pos, token in enumerate(target_sentence.split()):
        token_id = target_token_index[token]
        decoder_input_data[row, pos] = token_id
        if pos >= 1:
            decoder_target_data[row, pos - 1, token_id] = 1

In [None]:
# Model sizes shared by the encoder and decoder cells below.
units = 1024          # decoder LSTM size; the encoder uses units // 2 per direction
embedding_dim = 256   # output size of both Embedding layers

**Encoder**

In [None]:
# Encoder: word ids -> embeddings -> bidirectional LSTM.
encoder_inputs = Input(shape=(max_input_length,))
enc_emb = Embedding(num_encoder_tokens, embedding_dim)(encoder_inputs)

# Each direction gets units // 2 cells, so the concatenated forward/backward
# states below have size `units` and can initialise the decoder LSTM.
encoder_lstm = Bidirectional(
    LSTM(
        units=units // 2,
        return_state=True,
        return_sequences=True,
        recurrent_initializer='glorot_uniform',
    )
)
(
    encoder_outputs,
    forward_state_h,
    forward_state_c,
    backward_state_h,
    backward_state_c,
) = encoder_lstm(enc_emb)

# Merge the two directions into a single hidden state and a single cell state.
final_state_h = Concatenate()([forward_state_h, backward_state_h])
final_state_c = Concatenate()([forward_state_c, backward_state_c])
encoder_states = [final_state_h, final_state_c]

**Attention Model**

In [None]:
# Use the public tf.keras API, like the rest of the notebook, instead of the private
# tensorflow.python.keras package.
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K


class AttentionLayer(Layer):
    """
    Bahdanau (additive) attention (https://arxiv.org/pdf/1409.0473.pdf).

    Three trainable weights are created in ``build``: W_a, U_a and V_a.
    The layer is called on ``[encoder_output_sequence, decoder_output_sequence]`` and
    returns ``(context_vectors, attention_weights)``, one entry per decoder time step.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weights from the [encoder, decoder] input shapes.

        input_shape[0] is the encoder sequence shape and input_shape[1] the decoder
        sequence shape; index 2 of each is read as the hidden size.
        """
        assert isinstance(input_shape, (list, tuple))
        en_hidden = input_shape[0][2]
        de_hidden = input_shape[1][2]

        # Create the trainable weight variables for this layer.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((en_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((de_hidden, en_hidden)),
                                   initializer='uniform',
                                   trainable=True)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((en_hidden, 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        inputs: [encoder_output_sequence, decoder_output_sequence]

        Returns a pair (c_outputs, e_outputs):
          c_outputs - context vectors, (batch_size, de_seq_len, en_hidden)
          e_outputs - attention weights, (batch_size, de_seq_len, en_seq_len)
        Set verbose=True to print intermediate tensor shapes.
        """
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """ Step function for computing energy for a single decoder state """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # Some parameters required for shaping tensors
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]

            # Computing S.Wa where S=[s0, s1, ..., si]
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size, en_seq_len, latent_dim
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            # Computing hj.Ua
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua); the addition broadcasts over the encoder time axis
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(inputs, states):
            """ Step function for computing ci using ei """
            # Weighted sum of the encoder outputs over the encoder time axis.
            # <= batch_size, en_hidden
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_initial_state(inputs, hidden_size):
            # We are not using initial states, but need to pass something to the K.rnn function
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_initial_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, en_hidden)
        fake_state_e = create_initial_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, en_seq_len)

        # Computing energy outputs
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Computing context vectors
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer: [context vectors, attention weights] """
        return [
            # The context vector is a weighted sum of encoder outputs, so its last
            # dimension is the encoder hidden size, not the decoder's.
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

**Decoder**

In [None]:
# Training-time decoder: Hindi word ids -> embeddings -> LSTM seeded with the encoder states.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_token, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(
    units,
    return_sequences=True,
    return_state=True,
    recurrent_initializer='glorot_uniform',
)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Attention over the encoder output sequence for every decoder step.
attn_layer = AttentionLayer()
attention_result, attention_weights = attn_layer([encoder_outputs, decoder_outputs])

# Each decoder output is concatenated with its context vector before the softmax projection.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')(
    [decoder_outputs, attention_result]
)

decoder_dense = Dense(num_decoder_token, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 22, 256)      599808      input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 22, 1024), ( 3149824     embedding[0][0]                  
______________________________________________________________________________________________

Training our model

In [None]:
# Train on the full data set: English ids and Hindi ids in, shifted one-hot Hindi targets out.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=25,
    batch_size=100,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x7fd6ae3418d0>

In [None]:
# Inference encoder: maps an English id sequence to the final [hidden, cell] states.
# NOTE(review): only the states are returned here, while the inference decoder also
# takes the encoder output sequence as an input - confirm how that is supplied.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)
encoder_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 22)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 22, 256)      599808      input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   [(None, 22, 1024), ( 3149824     embedding[0][0]                  
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 1024)         0           bidirectional[0][1]              
                                                                 bidirectional[0][3]        

In [None]:
# Inference decoder: reuses the trained embedding, LSTM, attention and dense layers,
# with the LSTM states and the encoder output sequence supplied as explicit inputs.
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Placeholder for the encoder output sequence. Its time dimension is the encoder
# sequence length (max_input_length), not the vocabulary size: the encoder is built
# on Input(shape=(max_input_length,)) and returns (batch, max_input_length, units).
decoder_hidden_state_input = Input(shape=(max_input_length, units))

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)

# Attend over the supplied encoder outputs, then concatenate as in the training graph.
attention_result_inf, attention_weights_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_concatenate_input_inf = Concatenate(axis=-1, name='concat_layer')([decoder_outputs2, attention_result_inf])

decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_concatenate_input_inf)

# Inputs:  [target ids, encoder outputs, previous h, previous c]
# Outputs: [word probabilities, new h, new c]
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + decoder_states2)
decoder_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    760064      input_2[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1024)]       0                                            
____________________________________________________________________________________________