In [None]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, LSTM
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
train = pd.read_csv('train_gen_2.csv')
test = pd.read_csv('test_gen_2.csv')

train.head()

Unnamed: 0,input,target
0,Ya. Next week coming.,Ya. Next week coming.
1,Yeah wana save n stinge... We shall eat smting...,"Yes, I want to save and stinge. We shall eat s..."
2,"Dunno how come cannot go online leh, tt fuji...",I don't know how come I cannot go online. That...
3,Hey come online? We discuss eng with regina,Can you come online? We shall discuss Eng with...
4,Ü all go then i go lor... Free one wat...,All go then I go. It is free.


In [None]:
# Keep the characters of string.printable whose code point lies strictly
# between 31 and 126, i.e. space (32) through '}' (125). Note that '~' (126)
# and the control/whitespace characters other than ' ' are left out.
required_chars = [char for char in string.printable if 31 < ord(char) < 126]

print(len(required_chars))
print(required_chars)

94
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', ' ']


In [None]:
# Map every required character to a 1-based index (in list order).
# Index 0 is left unused because the padding helper fills sequences with 0.
vocabulary = {char: position for position, char in enumerate(required_chars, start=1)}

In [None]:
# Use \t as Start of Sentence and \n as End of Sentence.
# Their indices are placed directly after the regular characters instead of
# being hard-coded, so they cannot collide with a character index if
# required_chars ever changes (with the current 94 characters: \n -> 95, \t -> 96).
vocabulary['\n'] = len(required_chars) + 1
vocabulary['\t'] = len(required_chars) + 2

In [None]:
# Characters that were found in train and test set and replaced with the normal english characters.
replacements = {'£':'', 'É': 'E', 'Ñ': 'N', 'Ü': 'U', 'à': 'a', 'ä': 'a', 'å': 'a', 'è': 'e', 'é': 'e', 'ì': 'i', 'ñ': 'n', 'ò': 'o', 'ö': 'o', 'ø': 'o', 'ù': 'u', 'ü': 'u',  '“': '"',  '”': '"',   '，': ',',   '？': '?' }

In [None]:
# Apply every character substitution to both frames; each key of
# `replacements` is used as a regex pattern by DataFrame.replace.
for pattern in replacements:
  substitute = replacements[pattern]
  train = train.replace(to_replace=pattern, value=substitute, regex=True)
  test = test.replace(to_replace=pattern, value=substitute, regex=True)

In [None]:
# Build the two decoder columns from `target` for both splits:
#   target_ip = '\t' + target   (decoder input, starts with the start marker)
#   target_op = target + '\n'   (decoder output, ends with the end marker)
for frame in (train, test):
  target_text = frame['target'].astype(str)
  frame['target_ip'] = '\t' + target_text
  frame['target_op'] = target_text + '\n'

In [None]:
train = train.drop(['target'], axis=1)
test = test.drop(['target'], axis=1)

In [None]:
train.head()

Unnamed: 0,input,target_ip,target_op
0,Ya. Next week coming.,\tYa. Next week coming.,Ya. Next week coming.\n
1,Yeah wana save n stinge... We shall eat smting...,"\tYes, I want to save and stinge. We shall eat...","Yes, I want to save and stinge. We shall eat s..."
2,"Dunno how come cannot go online leh, tt fuji...",\tI don't know how come I cannot go online. Th...,I don't know how come I cannot go online. That...
3,Hey come online? We discuss eng with regina,\tCan you come online? We shall discuss Eng wi...,Can you come online? We shall discuss Eng with...
4,U all go then i go lor... Free one wat...,\tAll go then I go. It is free.,All go then I go. It is free.\n


In [None]:
# Appends '\n' to the first row's target_ip and target_op values.
# NOTE(review): this is chained indexing -- `train.iloc[0]` is evaluated first
# and the assignment targets that intermediate row object. pandas documents
# chained assignment as unreliable, so this most likely leaves `train`
# unchanged; confirm, and use `train.loc[train.index[0], <column>] = ...` if an
# actual update is intended.
train.iloc[0]['target_ip']= str(train.iloc[0]['target_ip'])+'\n'
train.iloc[0]['target_op']= str(train.iloc[0]['target_op'])+'\n'

In [None]:
# Compute the maximum length among all input sentences; it is used as the padding length.
max_length_encoder = train['input'].map(len).max()

print(max_length_encoder)

185


In [None]:
# Use a padded encoder length of at least 170, but never less than the longest
# training input measured in the previous cell. A fixed 170 is shorter than the
# observed maximum (185), which would make pad_sequences cut characters off
# the longer sentences.
max_length_encoder = int(max(170, max_length_encoder))

In [None]:
max_length_decoder = max( train['target_ip'].map(len).max(), train['target_op'].map(len).max())
print(max_length_decoder)

200


In [None]:
# Tokenizer for the raw input and target output
tokenizer_raw_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

tokenizer_target_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

In [None]:
tokenizer_raw_ip.fit_on_texts(train['input'].values)
tokenizer_target_ip.fit_on_texts(train['target_ip'].values)

In [None]:
# Replace the fitted character index with the fixed vocabulary defined above.
# Each tokenizer gets its own copy so that changing one mapping cannot silently
# change the other, and index_word is rebuilt so the reverse lookup
# (index -> character) agrees with the new word_index instead of the fitted one.
tokenizer_target_ip.word_index = dict(vocabulary)
tokenizer_raw_ip.word_index = dict(vocabulary)
for _tok in (tokenizer_target_ip, tokenizer_raw_ip):
  _tok.index_word = {index: char for char, index in _tok.word_index.items()}

In [None]:
# Number of entries in each tokenizer's character index (padding index 0 not included).
target_vocab_size = len(tokenizer_target_ip.word_index)
print(target_vocab_size)
input_vocab_size = len(tokenizer_raw_ip.word_index)
print(input_vocab_size)

96
96


In [None]:
class Encoder(tf.keras.Model):
    '''
    Encoder model: embeds an integer-encoded input sequence and runs it through an LSTM.

    Calling the model returns the per-timestep LSTM outputs together with the
    LSTM's final hidden state and final cell state.
    '''

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        '''
        inp_vocab_size -- input_dim of the embedding layer
        embedding_size -- output_dim of the embedding layer
        lstm_size      -- number of LSTM units
        input_length   -- input_length passed to the embedding layer
        '''
        super().__init__()
        self.inp_vocab_size = inp_vocab_size
        self.embedding_size = embedding_size
        self.lstm_size = lstm_size
        self.input_length = input_length

        # Placeholders; overwritten with the most recent results by call().
        self.encoder_output = 0
        self.hidden_state = 0
        self.cell_state = 0

        # Embedding layer; mask_zero=True treats index 0 (padding) as masked.
        self.encoder_embedding_layer = Embedding(
            input_dim=self.inp_vocab_size,
            output_dim=self.embedding_size,
            input_length=self.input_length,
            mask_zero=True,
            name="encoder_embedding_layer",
        )

        # LSTM that returns the full output sequence as well as its final states.
        self.encoder_lstm_layer = LSTM(
            self.lstm_size,
            return_sequences=True,
            return_state=True,
            name="Encoder_LSTM",
        )

    def call(self, input_sequence):
        '''
        Embed `input_sequence` and feed the embeddings to the encoder LSTM.

        returns -- (encoder_output, final hidden state, final cell state);
                   the same three values are also stored on the instance.
        '''
        embedded = self.encoder_embedding_layer(input_sequence)
        lstm_result = self.encoder_lstm_layer(embedded)
        self.encoder_output, self.hidden_state, self.cell_state = lstm_result

        return self.encoder_output, self.hidden_state, self.cell_state


In [None]:
class Attention(tf.keras.layers.Layer):
  '''
    Attention layer: scores every encoder output against the current decoder
    hidden state and returns the resulting context vector and attention weights.

    scoring_function -- one of 'dot', 'general' or 'concat'
    att_units        -- units of the Dense layers used by 'general' and 'concat'.
                        For 'general' the projected encoder output is multiplied
                        with the decoder state, so att_units has to equal the
                        decoder state size.

    Raises ValueError for any other scoring_function (previously such a value
    was accepted and only failed later inside call()).
  '''

  SCORING_FUNCTIONS = ('dot', 'general', 'concat')

  def __init__(self,scoring_function, att_units):
    if scoring_function not in self.SCORING_FUNCTIONS:
      raise ValueError(
          "scoring_function must be one of %s, got %r"
          % (self.SCORING_FUNCTIONS, scoring_function))

    super().__init__()
    self.scoring_function = scoring_function
    self.att_units = att_units

    if scoring_function == 'dot':
      # The dot score has no trainable parameters.
      self.dot_products = []

    elif scoring_function == 'general':
      # score = (W_a . encoder_output) . decoder_hidden_state
      self.W_a = tf.keras.layers.Dense(self.att_units)
      self.general = []

    else:  # 'concat'
      # score = V . tanh(W1 . decoder_hidden_state + W2 . encoder_output)
      self.W1 = tf.keras.layers.Dense(self.att_units)
      self.W2 = tf.keras.layers.Dense(self.att_units)
      self.V = tf.keras.layers.Dense(1)

  def call(self,decoder_hidden_state,encoder_output):
    '''
      Compute attention over the encoder outputs for one decoder step.

      Assumes decoder_hidden_state is (batch, units) and encoder_output is
      (batch, timesteps, units), as produced by the LSTM layers in this notebook.

      returns -- context_vector (weighted sum of encoder_output over the
                 timestep axis) and attention_weights (softmax of the scores
                 over the timestep axis, with a trailing axis of size 1).
    '''
    if self.scoring_function == 'dot':
      # (batch, timesteps, units) x (batch, units, 1) -> (batch, timesteps, 1)
      query = tf.expand_dims(decoder_hidden_state, axis=2)
      score = tf.matmul(encoder_output, query)

    elif self.scoring_function == 'general':
      query = tf.expand_dims(decoder_hidden_state, axis=2)
      score = tf.matmul(self.W_a(encoder_output), query)

    else:  # 'concat'
      # Broadcast the decoder state over the timestep axis before adding.
      query = tf.expand_dims(decoder_hidden_state, 1)
      score = self.V(tf.nn.tanh(self.W1(query) + self.W2(encoder_output)))

    # Normalise over the timestep axis, then take the weighted sum of the
    # encoder outputs. Plain tensor ops are used so that no new Keras layer
    # objects are created on every decoding step.
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)

    return context_vector, attention_weights
    
    

In [None]:
class One_Step_Decoder(tf.keras.Model):
  '''
    Decodes a single timestep: embeds the current token, attends over the
    encoder outputs, runs one LSTM step and projects to vocabulary logits.
  '''

  def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
    super().__init__()
    self.tar_vocab_size = tar_vocab_size
    self.embedding_dim = embedding_dim
    self.input_length = input_length
    self.dec_units = dec_units
    self.score_fun = score_fun
    self.att_units = att_units

    self.decoder_output = 0
    self.decoder_final_state_h = 0
    self.decoder_final_state_c = 0

    self.decoder_embedding_layer = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim, input_length=self.input_length, mask_zero=True, name="decoder_embedding_layer")

    self.decoder_lstm_layer = LSTM(self.dec_units, return_state=True, return_sequences=True, name="onestep_Decoder")

    self.dense_layer = Dense(tar_vocab_size)

    # The attention layer is created once here so that its weights are tracked
    # by the model and trained. Building it inside call() produced a freshly
    # initialised, untracked layer on every decoding step.
    # NOTE: this adds variables that checkpoints written by the old code do not contain.
    self.attention = Attention(self.score_fun, self.att_units)

  def call(self,input_to_decoder, encoder_output, state_h,state_c):
    '''
        One step decoder mechanism, step by step:
      A. Pass input_to_decoder to the embedding layer -> (batch_size, 1, embedding_dim)
      B. Using encoder_output and the decoder hidden state, compute the context vector.
      C. Concatenate the context vector with the step A output.
      D. Pass the step C output to the LSTM, starting from (state_h, state_c),
         and get the decoder output and the new hidden and cell states.
      E. Pass the decoder output to the dense layer (vocab size).
      F. Return the step E output, the states from step D, and the attention
         weights and context vector from step B.

      state_h / state_c must have dec_units features because they are used as
      the LSTM's initial state.
    '''
    embedded = self.decoder_embedding_layer(input_to_decoder)
    context_vector, attention_weights = self.attention(state_h, encoder_output)

    # Drop the length-1 time axis so the embedding can be concatenated with the context.
    embedded = embedded[:,0,:]
    concat_input = tf.concat([context_vector, embedded], 1)
    concat_input = tf.expand_dims(concat_input, axis=1)

    # Continue from the previous decoder state; without initial_state the LSTM
    # would restart from zeros at every step and ignore state_h / state_c.
    decoder_output, decoder_h, decoder_c = self.decoder_lstm_layer(concat_input, initial_state=[state_h, state_c])

    output = self.dense_layer(decoder_output)
    output = output[:,0,:]
    return output, decoder_h, decoder_c, attention_weights, context_vector


In [None]:
# Ask TensorFlow to execute eagerly, including functions that would otherwise
# be traced by tf.function.
# NOTE(review): presumably required because Decoder.call unrolls the sequence
# with a Python for-loop -- confirm; training emits a warning about this
# setting (see the fit output further down).
tf.compat.v1.enable_eager_execution()
tf.config.run_functions_eagerly(True)

In [None]:
class Decoder(tf.keras.Model):
    '''
    Runs One_Step_Decoder over every timestep of the decoder input
    and returns the stacked per-step outputs.
    '''

    def __init__(self,out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
        super().__init__()
        self.out_vocab_size = out_vocab_size
        self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.dec_units = dec_units
        self.score_fun = score_fun
        self.att_units = att_units

        self.one_step_decoder = One_Step_Decoder(self.out_vocab_size, self.embedding_dim, self.input_length, self.dec_units, self.score_fun, self.att_units)

    def call(self, input_to_decoder, encoder_output,decoder_hidden_state,decoder_cell_state ):
        '''
        input_to_decoder     -- decoder input tokens; axis 1 is the timestep axis
        encoder_output       -- encoder outputs, passed unchanged to every step
        decoder_hidden_state -- initial decoder hidden state (encoder final hidden state)
        decoder_cell_state   -- initial decoder cell state (encoder final cell state)

        returns -- one_step_decoder outputs stacked along axis 1:
                   (batch, timesteps, vocab)
        '''
        # Take the number of steps from the input instead of a hard-coded 200;
        # fall back to the configured length when the static shape is unknown.
        timesteps = input_to_decoder.shape[1]
        if timesteps is None:
            timesteps = self.input_length

        outputs = tf.TensorArray(tf.float32, size=timesteps, name='output_array')

        for timestep in range(timesteps):
            # Feed the states returned by this step into the next one.
            # Previously every step was called with the encoder's final states.
            output, decoder_hidden_state, decoder_cell_state, _, _ = self.one_step_decoder(
                input_to_decoder[:,timestep:timestep+1], encoder_output, decoder_hidden_state, decoder_cell_state)

            outputs = outputs.write(timestep, output)

        # stack() is (timesteps, batch, vocab); swap to (batch, timesteps, vocab).
        outputs = tf.transpose(outputs.stack(),[1,0,2])

        return outputs
        
    

In [None]:
class encoder_decoder(tf.keras.Model):
  '''
    End-to-end model: an Encoder followed by an attention Decoder.
    The decoder is started from the encoder's final hidden and cell states.
  '''

  def __init__(self, enc_vocab_size, enc_embedding_size, enc_lstm_size, enc_input_length, dec_vocab_size, dec_embedding_dim, dec_input_length, dec_units , score_fun , att_units):
    super().__init__()

    # Encoder hyper-parameters
    self.enc_vocab_size = enc_vocab_size
    self.enc_embedding_size = enc_embedding_size
    self.enc_lstm_size = enc_lstm_size
    self.enc_input_length = enc_input_length

    # Decoder hyper-parameters
    self.dec_vocab_size = dec_vocab_size
    self.dec_embedding_dim = dec_embedding_dim
    self.dec_input_length = dec_input_length
    self.dec_units = dec_units
    self.score_fun = score_fun
    self.att_units = att_units

    # Sub-models; the encoder is created before the decoder.
    self.encoder = Encoder(
        self.enc_vocab_size,
        self.enc_embedding_size,
        self.enc_lstm_size,
        self.enc_input_length,
    )
    self.decoder = Decoder(
        self.dec_vocab_size,
        self.dec_embedding_dim,
        self.dec_input_length,
        self.dec_units,
        self.score_fun,
        self.att_units,
    )

  def call(self, data):
    '''
      data -- pair whose first element is the encoder input and whose second
              element is the decoder input.
      returns -- the decoder output for the whole decoder input sequence.
    '''
    encoder_tokens, decoder_tokens = data[0], data[1]

    # Encode, then hand the encoder's final states to the decoder as its initial states.
    encoder_output, state_h, state_c = self.encoder(encoder_tokens)
    return self.decoder(decoder_tokens, encoder_output, state_h, state_c)




In [None]:
# Do once normal loss function works
# Per-position cross entropy on logits; reduction='none' keeps one loss value
# per target position so that padded positions can be masked out below.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def custom_lossfunction(real, pred):
  '''
  Sparse categorical cross entropy that does not consider padded positions.

  real -- integer target ids; 0 marks padding
  pred -- logits with one trailing class axis

  returns -- scalar: sum of the losses at non-padding positions divided by the
             number of non-padding positions (0 if there are none).

  Refer https://www.tensorflow.org/tutorials/text/nmt_with_attention#define_the_optimizer_and_the_loss_function
  '''
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  # Average over real tokens only. tf.reduce_mean would also divide by the
  # padded positions, making the value depend on how much padding a batch has.
  return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


In [None]:
# Creating a data pipeline
class Dataset:
    '''
    Indexable view over a frame with `input`, `target_ip` and `target_op`
    columns that returns tokenized, post-padded integer sequences per row.
    '''

    def __init__(self, data, tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder,max_length_decoder):
        self.encoder_inps = data['input'].values
        self.decoder_inps = data['target_ip'].values
        self.decoder_outs = data['target_op'].values
        self.tokenizer_target_ip = tokenizer_target_ip
        self.tokenizer_raw_ip = tokenizer_raw_ip
        self.max_length_encoder = max_length_encoder
        self.max_length_decoder = max_length_decoder

    def _encode(self, tokenizer, text, maxlen):
        '''Tokenize one text (wrapped in a list, as the tokenizer expects) and post-pad it to `maxlen`.'''
        token_ids = tokenizer.texts_to_sequences([text])
        return pad_sequences(token_ids, maxlen=maxlen, dtype='int32', padding='post')

    def __getitem__(self, i):
        '''
        Return (encoder_seq, decoder_inp_seq, decoder_out_seq) for row `i`.
        The three arrays are also kept on the instance, as before.
        '''
        self.encoder_seq = self._encode(
            self.tokenizer_raw_ip, self.encoder_inps[i], self.max_length_encoder)
        self.decoder_inp_seq = self._encode(
            self.tokenizer_target_ip, self.decoder_inps[i], self.max_length_decoder)
        self.decoder_out_seq = self._encode(
            self.tokenizer_target_ip, self.decoder_outs[i], self.max_length_decoder)
        return self.encoder_seq, self.decoder_inp_seq, self.decoder_out_seq

    def __len__(self):
        # Number of rows; required by the loader / model.fit.
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):
    '''
    Keras Sequence that groups Dataset items into batches of the form
    ([encoder_input, decoder_input], decoder_output).
    '''

    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        # Order in which dataset rows are served; reshuffled after every epoch.
        self.indexes = np.arange(len(self.dataset.encoder_inps))

    def __getitem__(self, i):
        '''Return batch number `i` according to the current (possibly shuffled) order.'''
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # Look rows up through self.indexes; iterating range(start, stop)
        # directly ignored the permutation made in on_epoch_end, so the data
        # was never actually shuffled.
        data = [self.dataset[j] for j in self.indexes[start:stop]]

        # Each sample is (1, length); stack on axis 1 and drop axis 0 -> (batch, length).
        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        # ([encoder_input, decoder_input], decoder_output), all already converted to sequences
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):
        # Number of full batches (a trailing partial batch is dropped); required by model.fit.
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [None]:
# Wrap both splits with the shared tokenizers and padding lengths.
train_dataset = Dataset(
    train, tokenizer_raw_ip, tokenizer_target_ip,
    max_length_encoder, max_length_decoder)
test_dataset = Dataset(
    test, tokenizer_raw_ip, tokenizer_target_ip,
    max_length_encoder, max_length_decoder)

train_dataloader = Dataloder(train_dataset, batch_size=64)
test_dataloader = Dataloder(test_dataset, batch_size=20)

# Sanity check: shapes of encoder input, decoder input and decoder output of the first batch.
(first_encoder_batch, first_decoder_batch), first_target_batch = train_dataloader[0]
print(first_encoder_batch.shape, first_decoder_batch.shape, first_target_batch.shape)

(64, 185) (64, 200) (64, 200)


In [None]:
train_dataloader[0][1].shape

(64, 200)

In [None]:
# Reduce the learning rate when the validation loss stops improving.
# factor=0.99 multiplies the current rate by 0.99 on each reduction
# (0.01 -> 0.0099 in the training log further down); verbose=1 logs every reduction.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.99, verbose=1)

In [None]:
checkpoint_filepath = 'model_1'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [None]:
# Vocabulary sizes get +1 because index 0 is reserved for padding.
# NOTE: despite the variable name, this model uses the 'general' score function.
model_dot = encoder_decoder(
    enc_vocab_size=input_vocab_size + 1,
    enc_embedding_size=20,
    enc_lstm_size=100,
    enc_input_length=max_length_encoder,
    dec_vocab_size=target_vocab_size + 1,
    dec_embedding_dim=20,
    dec_input_length=max_length_decoder,
    dec_units=100,
    score_fun='general',
    att_units=100,
)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, clipnorm=5.0)

model_dot.compile(optimizer=optimizer,loss=custom_lossfunction)

In [None]:
train_steps=train.shape[0]//64
valid_steps=test.shape[0]//20

print(train_steps, valid_steps)

116 1


In [None]:
# Restore weights stored under the "model_1" checkpoint prefix (the same path
# the ModelCheckpoint callback writes to) and report the loss on the test loader.
# NOTE(review): in notebook order this cell runs before the training cell that
# writes "model_1"; confirm the checkpoint exists before a fresh Run All.
model_dot.load_weights("model_1")

model_dot.evaluate(test_dataloader)

In [None]:
# Fresh optimizer for (continued) training.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, clipnorm=5.0)

model_dot.compile(optimizer=optimizer,loss=custom_lossfunction)

In [None]:
model_dot.fit(train_dataloader, steps_per_epoch=train_steps, epochs=260, validation_data=test_dataloader, validation_steps=valid_steps, callbacks=[reduce_lr, model_checkpoint_callback])
model_dot.summary()

  "Even though the tf.config.experimental_run_functions_eagerly "


Epoch 1/260
Epoch 2/260
Epoch 3/260
Epoch 4/260
Epoch 5/260
Epoch 6/260
Epoch 7/260
Epoch 8/260
Epoch 9/260
Epoch 10/260
Epoch 11/260
Epoch 12/260
Epoch 13/260
Epoch 14/260
Epoch 15/260
Epoch 16/260
Epoch 17/260
Epoch 18/260
Epoch 19/260
Epoch 20/260
Epoch 21/260
Epoch 22/260
Epoch 23/260
Epoch 24/260
Epoch 25/260
Epoch 26/260
Epoch 27/260
Epoch 28/260
Epoch 29/260
Epoch 30/260
Epoch 31/260
Epoch 32/260
Epoch 33/260
Epoch 34/260
Epoch 35/260
Epoch 36/260
Epoch 37/260
Epoch 38/260
Epoch 39/260
Epoch 40/260
Epoch 41/260
Epoch 42/260
Epoch 43/260
Epoch 44/260
Epoch 45/260
Epoch 46/260

Epoch 00046: ReduceLROnPlateau reducing learning rate to 0.009899999778717757.
Epoch 47/260
  5/116 [>.............................] - ETA: 5:53 - loss: 0.7317

KeyboardInterrupt: ignored

In [None]:
model_dot.load_weights("model_1")

model_dot.evaluate(test_dataloader)

  "Even though the tf.config.experimental_run_functions_eagerly "




0.5720122456550598

In [None]:
start_index = tokenizer_target_ip.word_index['\t']
end_index = tokenizer_target_ip.word_index['\n']
DECODER_SEQ_LEN = 200
max_len = 200


def predict(input_sentence):

  '''
  Greedily decode `input_sentence` with model_dot and return the predicted text.

  A. Convert the input sentence into integers using the input tokenizer and pad it.
  B. Pass the sequence to the encoder to get the encoder outputs and the last
     hidden and cell state.
  C. Start the decoder with the start marker (\t) and the encoder's final states.
  D. Until DECODER_SEQ_LEN steps are done or the end marker (\n) is predicted:
       run one decoder step, take the arg-max index, map it back to a character
       and feed that index in as the next decoder input.
  E. Print and return the predicted sentence. The end marker itself is not
     part of the returned string.
  '''
  encoder_seq = tokenizer_raw_ip.texts_to_sequences([input_sentence])
  encoder_seq = pad_sequences(encoder_seq, maxlen=max_length_encoder, dtype='int32', padding='post')

  # Address the sub-models by name rather than by position in model.layers,
  # which depends on the order in which they happened to be created.
  enc_output, state_h, state_c = model_dot.encoder(encoder_seq)
  one_step_decoder = model_dot.decoder.one_step_decoder

  # Build the index -> character lookup once instead of scanning word_index at every step.
  index_to_char = {index: char for char, index in tokenizer_target_ip.word_index.items()}

  dec_input = tf.expand_dims([start_index], 0)
  output_chars = []

  for _ in range(DECODER_SEQ_LEN):
      output, state_h, state_c, attention_weights, context_vector = one_step_decoder(dec_input, enc_output, state_h, state_c)

      output_index = int(np.argmax(output))

      # Stop on the end marker before it is added to the sentence.
      if output_index == end_index:
        break

      # Indices without a character (e.g. padding index 0) add nothing to the text.
      char = index_to_char.get(output_index)
      if char is not None:
        output_chars.append(char)

      dec_input = np.reshape(output_index, (1, 1))

  predicted_sentence = ''.join(output_chars)

  print('Input_sentence:', input_sentence)
  print('Predicted_sentence:',predicted_sentence)

  return predicted_sentence

In [None]:
# Collect the model's prediction (string) and the whitespace-split reference
# for every test row, printing each triple along the way.
predicted_sentences = []
actual_sentences = []

for _, test_row in test.iterrows():
    prediction = predict(test_row['input'])
    predicted_sentences.append(prediction)
    actual_sentences.append(test_row['target_op'].split())

    print('Input Sentence:', test_row['input'])
    print('Predicted Sentence:', ''.join(prediction))
    print('Original English sentence:', test_row['target_op'])
    print('*' * 30)

Input_sentence: Mmm thats better now i got a roast down me! i'd b better if i had a few drinks down me 2! Good indian?
Predicted_sentence: Not's athat a t's that'm at's inow m ng t I at t not's d mbengomed t'm atham That's bus t's t ino am ame boto atow athat's I bumert d tha gom at's g t's Tham fot's m fott'mbe t'mid t athat's I t t's a
Input Sentence: Mmm thats better now i got a roast down me! i'd b better if i had a few drinks down me 2! Good indian?
Predicted Sentence: Not's athat a t's that'm at's inow m ng t I at t not's d mbengomed t'm atham That's bus t's t ino am ame boto atow athat's I bumert d tha gom at's g t's Tham fot's m fott'mbe t'mid t athat's I t t's a
Original English sentence: That's better now, I got a roast down me! I'd be better if I had a few drinks down me too! Good Indian?

******************************
Input_sentence: Watch wat?
Predicted_sentence: Wh wat watchat wat watche tchat watche watche wat watchat wat wat wat watchat wat wat wat wat wat t wat wat wa

# Attention with OHE


In [None]:
train = pd.read_csv('train_2.csv')
test = pd.read_csv('test_2.csv')

train.head()

Unnamed: 0,input,target
0,Ya. Next week coming.,Ya. Next week coming.
1,Yeah wana save n stinge... We shall eat smting...,"Yes, I want to save and stinge. We shall eat s..."
2,"Dunno how come cannot go online leh, tt fuji...",I don't know how come I cannot go online. That...
3,Hey come online? We discuss eng with regina,Can you come online? We shall discuss Eng with...
4,Ü all go then i go lor... Free one wat...,All go then I go. It is free.


In [None]:
# Keep the characters of string.printable whose code point lies strictly
# between 31 and 126, i.e. space (32) through '}' (125). Note that '~' (126)
# and the control/whitespace characters other than ' ' are left out.
required_chars = [char for char in string.printable if 31 < ord(char) < 126]

print(len(required_chars))
print(required_chars)

94
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', ' ']


In [None]:
# Map every required character to a 1-based index (in list order).
# Index 0 is left unused because the padding helper fills sequences with 0.
vocabulary = {char: position for position, char in enumerate(required_chars, start=1)}

In [None]:
# Use \t as Start of Sentence and \n as End of Sentence.
# Their indices are placed directly after the regular characters instead of
# being hard-coded, so they cannot collide with a character index if
# required_chars ever changes (with the current 94 characters: \n -> 95, \t -> 96).
vocabulary['\n'] = len(required_chars) + 1
vocabulary['\t'] = len(required_chars) + 2

In [None]:
# Characters that were found in train and test set and replaced with the normal english characters.
replacements = {'£':'', 'É': 'E', 'Ñ': 'N', 'Ü': 'U', 'à': 'a', 'ä': 'a', 'å': 'a', 'è': 'e', 'é': 'e', 'ì': 'i', 'ñ': 'n', 'ò': 'o', 'ö': 'o', 'ø': 'o', 'ù': 'u', 'ü': 'u',  '“': '"',  '”': '"',   '，': ',',   '？': '?' }

In [None]:
# Apply every character substitution to both frames; each key of
# `replacements` is used as a regex pattern by DataFrame.replace.
for pattern in replacements:
  substitute = replacements[pattern]
  train = train.replace(to_replace=pattern, value=substitute, regex=True)
  test = test.replace(to_replace=pattern, value=substitute, regex=True)

In [None]:
# Build the two decoder columns from `target` for both splits:
#   target_ip = '\t' + target   (decoder input, starts with the start marker)
#   target_op = target + '\n'   (decoder output, ends with the end marker)
for frame in (train, test):
  target_text = frame['target'].astype(str)
  frame['target_ip'] = '\t' + target_text
  frame['target_op'] = target_text + '\n'

In [None]:
train = train.drop(['target'], axis=1)
test = test.drop(['target'], axis=1)

In [None]:
train.head()

Unnamed: 0,input,target_ip,target_op
0,Ya. Next week coming.,\tYa. Next week coming.,Ya. Next week coming.\n
1,Yeah wana save n stinge... We shall eat smting...,"\tYes, I want to save and stinge. We shall eat...","Yes, I want to save and stinge. We shall eat s..."
2,"Dunno how come cannot go online leh, tt fuji...",\tI don't know how come I cannot go online. Th...,I don't know how come I cannot go online. That...
3,Hey come online? We discuss eng with regina,\tCan you come online? We shall discuss Eng wi...,Can you come online? We shall discuss Eng with...
4,U all go then i go lor... Free one wat...,\tAll go then I go. It is free.,All go then I go. It is free.\n


In [None]:
# Appends '\n' to the first row's target_ip and target_op values.
# NOTE(review): this is chained indexing -- `train.iloc[0]` is evaluated first
# and the assignment targets that intermediate row object. pandas documents
# chained assignment as unreliable, so this most likely leaves `train`
# unchanged; confirm, and use `train.loc[train.index[0], <column>] = ...` if an
# actual update is intended.
train.iloc[0]['target_ip']= str(train.iloc[0]['target_ip'])+'\n'
train.iloc[0]['target_op']= str(train.iloc[0]['target_op'])+'\n'

In [None]:
# Compute the maximum length among all input sentences; it is used as the padding length.
max_length_encoder = train['input'].map(len).max()

print(max_length_encoder)

161


In [None]:
max_length_encoder = 170

In [None]:
max_length_decoder = max( train['target_ip'].map(len).max(), train['target_op'].map(len).max())
print(max_length_decoder)

200


In [None]:
# Tokenizer for the raw input and target output
tokenizer_raw_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

tokenizer_target_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

In [None]:
tokenizer_raw_ip.fit_on_texts(train['input'].values)
tokenizer_target_ip.fit_on_texts(train['target_ip'].values)

In [None]:
# Replace the fitted character index with the fixed vocabulary defined above.
# Each tokenizer gets its own copy so that changing one mapping cannot silently
# change the other, and index_word is rebuilt so the reverse lookup
# (index -> character) agrees with the new word_index instead of the fitted one.
tokenizer_target_ip.word_index = dict(vocabulary)
tokenizer_raw_ip.word_index = dict(vocabulary)
for _tok in (tokenizer_target_ip, tokenizer_raw_ip):
  _tok.index_word = {index: char for char, index in _tok.word_index.items()}

In [None]:
# Number of entries in each tokenizer's character index (padding index 0 not included).
target_vocab_size = len(tokenizer_target_ip.word_index)
print(target_vocab_size)
input_vocab_size = len(tokenizer_raw_ip.word_index)
print(input_vocab_size)

96
96


In [None]:
class Encoder(tf.keras.Model):
    '''
    Encoder model for the one-hot variant: the input sequence is fed straight
    into the LSTM, without going through the embedding layer.

    Calling the model returns the per-timestep LSTM outputs together with the
    LSTM's final hidden state and final cell state.
    '''

    def __init__(self, inp_vocab_size, embedding_size, lstm_size, input_length):
        super().__init__()
        self.inp_vocab_size = inp_vocab_size
        self.embedding_size = embedding_size
        self.lstm_size = lstm_size
        self.input_length = input_length

        # Placeholders; overwritten with the most recent results by call().
        self.encoder_output = 0
        self.hidden_state = 0
        self.cell_state = 0

        # Embedding layer is still constructed but is not used by call().
        self.encoder_embedding_layer = Embedding(
            input_dim=self.inp_vocab_size,
            output_dim=self.embedding_size,
            input_length=self.input_length,
            mask_zero=True,
            name="encoder_embedding_layer",
        )

        # LSTM that returns the full output sequence as well as its final states.
        self.encoder_lstm_layer = LSTM(
            self.lstm_size,
            return_sequences=True,
            return_state=True,
            name="Encoder_LSTM",
        )

    def call(self, input_sequence):
        '''
        Run `input_sequence` through the encoder LSTM directly.

        returns -- (encoder_output, final hidden state, final cell state);
                   the same three values are also stored on the instance.
        '''
        lstm_result = self.encoder_lstm_layer(input_sequence)
        self.encoder_output, self.hidden_state, self.cell_state = lstm_result

        return self.encoder_output, self.hidden_state, self.cell_state


In [None]:
class Attention(tf.keras.layers.Layer):
  '''
    Attention layer: scores every encoder output against the current decoder
    hidden state and returns the resulting context vector and attention weights.

    scoring_function -- one of 'dot', 'general' or 'concat'
    att_units        -- units of the Dense layers used by 'general' and 'concat'.
                        For 'general' the projected encoder output is multiplied
                        with the decoder state, so att_units has to equal the
                        decoder state size.

    Raises ValueError for any other scoring_function (previously such a value
    was accepted and only failed later inside call()).
  '''

  SCORING_FUNCTIONS = ('dot', 'general', 'concat')

  def __init__(self,scoring_function, att_units):
    if scoring_function not in self.SCORING_FUNCTIONS:
      raise ValueError(
          "scoring_function must be one of %s, got %r"
          % (self.SCORING_FUNCTIONS, scoring_function))

    super().__init__()
    self.scoring_function = scoring_function
    self.att_units = att_units

    if scoring_function == 'dot':
      # The dot score has no trainable parameters.
      self.dot_products = []

    elif scoring_function == 'general':
      # score = (W_a . encoder_output) . decoder_hidden_state
      self.W_a = tf.keras.layers.Dense(self.att_units)
      self.general = []

    else:  # 'concat'
      # score = V . tanh(W1 . decoder_hidden_state + W2 . encoder_output)
      self.W1 = tf.keras.layers.Dense(self.att_units)
      self.W2 = tf.keras.layers.Dense(self.att_units)
      self.V = tf.keras.layers.Dense(1)

  def call(self,decoder_hidden_state,encoder_output):
    '''
      Compute attention over the encoder outputs for one decoder step.

      Assumes decoder_hidden_state is (batch, units) and encoder_output is
      (batch, timesteps, units), as produced by the LSTM layers in this notebook.

      returns -- context_vector (weighted sum of encoder_output over the
                 timestep axis) and attention_weights (softmax of the scores
                 over the timestep axis, with a trailing axis of size 1).
    '''
    if self.scoring_function == 'dot':
      # (batch, timesteps, units) x (batch, units, 1) -> (batch, timesteps, 1)
      query = tf.expand_dims(decoder_hidden_state, axis=2)
      score = tf.matmul(encoder_output, query)

    elif self.scoring_function == 'general':
      query = tf.expand_dims(decoder_hidden_state, axis=2)
      score = tf.matmul(self.W_a(encoder_output), query)

    else:  # 'concat'
      # Broadcast the decoder state over the timestep axis before adding.
      query = tf.expand_dims(decoder_hidden_state, 1)
      score = self.V(tf.nn.tanh(self.W1(query) + self.W2(encoder_output)))

    # Normalise over the timestep axis, then take the weighted sum of the
    # encoder outputs. Plain tensor ops are used so that no new Keras layer
    # objects are created on every decoding step.
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)

    return context_vector, attention_weights
    
    

In [None]:
class One_Step_Decoder(tf.keras.Model):
  '''
  Decoder for a single time step: attention, then LSTM, then a projection to vocabulary logits.

  The embedding layer is intentionally not used: the decoder input is sliced
  as a rank-3 tensor in call(), i.e. it is expected to be already vectorised
  (the data pipeline in this notebook feeds one-hot rows).
  '''
  def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
    super().__init__()
    self.tar_vocab_size = tar_vocab_size
    self.embedding_dim = embedding_dim   # kept for interface compatibility; unused while the embedding is disabled
    self.input_length = input_length
    self.dec_units = dec_units
    self.score_fun = score_fun
    self.att_units = att_units

    self.decoder_output = 0
    self.decoder_final_state_h = 0
    self.decoder_final_state_c = 0

    # The attention layer is created once here so that its weights (W_a, or
    # W1/W2/V) are tracked by the model and trained. Building it inside call()
    # would create a new, randomly initialised layer at every time step.
    self.attention = Attention(self.score_fun, self.att_units)

    self.decoder_lstm_layer = LSTM(self.dec_units, return_state=True, return_sequences=True, name="onestep_Decoder")

    # No activation: the layer emits raw logits over the target vocabulary.
    self.dense_layer = Dense(tar_vocab_size)


  def call(self,input_to_decoder, encoder_output, state_h,state_c):
    '''
        One step decoder mechanism, step by step:
      A. Take the already vectorised decoder input for this step (batch, 1, features).
      B. Using the encoder_output and the decoder hidden state, compute the context vector.
      C. Concat the context vector with the step A input.
      D. Pass the step C output to the LSTM, starting from (state_h, state_c),
         and get the decoder output and the new hidden and cell states.
      E. Pass the decoder output to the dense layer (vocab size) to get the logits.
      F. Return the logits, the states from step D, and the attention weights
         and context vector from step B.

      Returns:
        (output, decoder_h, decoder_c, attention_weights, context_vector)
    '''
    context_vector, attention_weights = self.attention(state_h, encoder_output)

    # Drop the length-1 time axis so the input can be concatenated with the context.
    step_input = input_to_decoder[:,0,:]
    concat_input = tf.concat([context_vector, step_input], 1)
    concat_input = tf.expand_dims(concat_input, axis=1)

    # Continue from the previous step's states. Without initial_state the LSTM
    # would restart from zeros at every step and carry no memory.
    decoder_output, decoder_h, decoder_c = self.decoder_lstm_layer(concat_input, initial_state=[state_h, state_c])

    output = self.dense_layer(decoder_output)
    output = output[:,0,:]
    return output, decoder_h, decoder_c, attention_weights, context_vector


In [None]:
# Switch TensorFlow to eager execution.
# NOTE(review): eager mode is already the default in TF 2.x, so this compat.v1
# call is presumably redundant -- confirm for the TF version in use.
tf.compat.v1.enable_eager_execution()
# Run tf.function-wrapped code eagerly as well.
# NOTE(review): presumably needed because Decoder.call uses a Python-level loop
# over time steps; this trades training speed for debuggability -- confirm.
tf.config.run_functions_eagerly(True)

In [None]:
class Decoder(tf.keras.Model):
    '''
    Runs One_Step_Decoder over every time step of the decoder input
    (teacher forcing) and stacks the per-step outputs.
    '''
    def __init__(self,out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units):
      # Store the configuration and create the one-step decoder that is reused at every time step.
      super().__init__()
      self.out_vocab_size = out_vocab_size
      self.embedding_dim = embedding_dim
      self.input_length = input_length
      self.dec_units = dec_units
      self.score_fun = score_fun
      self.att_units = att_units

      self.one_step_decoder = One_Step_Decoder(self.out_vocab_size, self.embedding_dim, self.input_length, self.dec_units, self.score_fun, self.att_units)

    def call(self, input_to_decoder, encoder_output,decoder_hidden_state,decoder_cell_state ):
        '''
        Decode the whole target sequence.

        Args:
          input_to_decoder: decoder inputs; axis 1 is iterated as the time axis.
          encoder_output: encoder outputs handed to the attention at each step.
          decoder_hidden_state, decoder_cell_state: initial decoder states.

        Returns:
          The per-step outputs of the one-step decoder stacked along axis 1,
          i.e. (batch, time, vocab) when each step output is (batch, vocab).
        '''
        # Take the number of steps from the input itself; fall back to the
        # configured length when the time dimension is not statically known.
        num_steps = input_to_decoder.shape[1]
        if num_steps is None:
            num_steps = self.input_length

        # One slot per time step for the one-step decoder outputs.
        outputs = tf.TensorArray(tf.float32, size=num_steps, name='output_array')

        for timestep in range(num_steps):
            # Feed the states produced by the previous step back in, so the
            # decoder (and the attention query) evolves along the sequence.
            output, decoder_hidden_state, decoder_cell_state, attention_weights, context_vector = self.one_step_decoder(
                input_to_decoder[:,timestep:timestep+1], encoder_output, decoder_hidden_state, decoder_cell_state)

            outputs = outputs.write(timestep, output)

        # stack() yields (time, batch, vocab); move the batch axis first.
        outputs = tf.transpose(outputs.stack(),[1,0,2])

        return outputs
        
    

In [None]:
class encoder_decoder(tf.keras.Model):
  '''
  End-to-end model: an Encoder followed by an attention Decoder.
  '''
  def __init__(self, enc_vocab_size, enc_embedding_size, enc_lstm_size, enc_input_length, dec_vocab_size, dec_embedding_dim, dec_input_length, dec_units , score_fun , att_units):
    super().__init__()

    # Encoder configuration
    self.enc_vocab_size, self.enc_embedding_size = enc_vocab_size, enc_embedding_size
    self.enc_lstm_size, self.enc_input_length = enc_lstm_size, enc_input_length

    # Decoder configuration
    self.dec_vocab_size, self.dec_embedding_dim = dec_vocab_size, dec_embedding_dim
    self.dec_input_length, self.dec_units = dec_input_length, dec_units
    self.score_fun, self.att_units = score_fun, att_units

    # Sub-models built from the configuration above
    self.encoder = Encoder(
        self.enc_vocab_size,
        self.enc_embedding_size,
        self.enc_lstm_size,
        self.enc_input_length,
    )
    self.decoder = Decoder(
        self.dec_vocab_size,
        self.dec_embedding_dim,
        self.dec_input_length,
        self.dec_units,
        self.score_fun,
        self.att_units,
    )

  def call(self, data):
    '''
    Run the encoder on data[0] and the decoder on data[1].

    The encoder's final hidden and cell states are used as the decoder's
    initial states. Returns the decoder output unchanged.
    '''
    encoder_input = data[0]
    decoder_input = data[1]

    encoder_output, final_h, final_c = self.encoder(encoder_input)

    return self.decoder(decoder_input, encoder_output, final_h, final_c)




In [None]:
# Do once normal loss function works
# Per-token sparse cross-entropy on logits; reduction is left to custom_lossfunction.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def custom_lossfunction(real, pred):
  """Sparse cross-entropy on logits with the loss at padded positions zeroed.

  Positions where `real` equals 0 (the padding id) contribute zero loss.
  The result is the mean over all positions, padded ones included.
  Refer https://www.tensorflow.org/tutorials/text/nmt_with_attention#define_the_optimizer_and_the_loss_function

  `real` holds integer class ids and `pred` holds the matching logits.
  """
  per_token_loss = loss_object(real, pred)

  # 1.0 where the target is a real token, 0.0 where it is padding.
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

  return tf.reduce_mean(per_token_loss * keep)


In [None]:
# Creating a data pipeline
class Dataset:
    """Index-addressable view over a dataframe that yields one-hot encoded samples.

    Args:
        data: frame with 'input', 'target_ip' and 'target_op' text columns.
        tokenizer_raw_ip: tokenizer used for the 'input' column.
        tokenizer_target_ip: tokenizer used for both target columns.
        max_length_encoder: padded length of the encoder sequence.
        max_length_decoder: padded length of the decoder sequences.
    """
    def __init__(self, data, tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder,max_length_decoder):
        self.encoder_inps = data['input'].values
        self.decoder_inps = data['target_ip'].values
        self.decoder_outs = data['target_op'].values
        self.tokenizer_target_ip = tokenizer_target_ip
        self.tokenizer_raw_ip = tokenizer_raw_ip
        self.max_length_encoder = max_length_encoder
        self.max_length_decoder = max_length_decoder

    @staticmethod
    def _one_hot(text, tokenizer, max_length):
        """Tokenize one text, post-pad it to max_length and one-hot encode it."""
        seq = tokenizer.texts_to_sequences([text])  # need to pass list of values
        seq = pad_sequences(seq, maxlen=max_length, dtype='int32', padding='post')
        # +1 because index 0 is reserved for padding.
        return tf.keras.utils.to_categorical(seq, num_classes=len(tokenizer.word_index.keys())+1)

    def __getitem__(self, i):
        """Return (encoder_input, decoder_input, decoder_output) for sample i.

        Each element keeps a leading axis of size 1. The tokenizers stored on
        the instance are used, not same-named globals, so the result depends
        only on how this Dataset was constructed.
        """
        encoder_seq = self._one_hot(self.encoder_inps[i], self.tokenizer_raw_ip, self.max_length_encoder)
        decoder_inp_seq = self._one_hot(self.decoder_inps[i], self.tokenizer_target_ip, self.max_length_decoder)
        decoder_out_seq = self._one_hot(self.decoder_outs[i], self.tokenizer_target_ip, self.max_length_decoder)

        return encoder_seq, decoder_inp_seq, decoder_out_seq

    def __len__(self): # your model.fit_gen requires this function
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):
    """Keras Sequence that groups Dataset samples into batches.

    Args:
        dataset: object exposing `encoder_inps` and integer indexing that
            returns a tuple of arrays with a leading axis of size 1.
        batch_size: number of samples per batch; a trailing partial batch is dropped.
    """
    def __init__(self, dataset, batch_size=1):
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        # Order in which samples are visited; reshuffled after every epoch.
        self.indexes = np.arange(len(self.dataset.encoder_inps))


    def __getitem__(self, i):
        """Return batch i as ([encoder_input, decoder_input], decoder_output)."""
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        # Go through self.indexes so the shuffle done in on_epoch_end actually
        # changes which samples land in which batch.
        data = [self.dataset[self.indexes[j]] for j in range(start, stop)]

        # Every sample array has a leading axis of size 1, so concatenating
        # along it gives one array per field with the batch on axis 0.
        batch = [np.concatenate(samples, axis=0) for samples in zip(*data)]
        # Data is shaped like ([encoder_input, decoder_input], decoder_output); all are already encoded.
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):  # your model.fit_gen requires this function
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        """Reshuffle the sample order for the next epoch."""
        self.indexes = np.random.permutation(self.indexes)

In [None]:
# Both splits share the same tokenizers and padded lengths.
dataset_args = (tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder, max_length_decoder)
train_dataset = Dataset(train, *dataset_args)
test_dataset = Dataset(test, *dataset_args)

train_dataloader = Dataloder(train_dataset, batch_size=64)
test_dataloader = Dataloder(test_dataset, batch_size=20)

# Build the first training batch once and show the shape of each part.
(first_enc_inp, first_dec_inp), first_dec_out = train_dataloader[0]
print(first_enc_inp.shape, first_dec_inp.shape, first_dec_out.shape)

(64, 161, 97) (64, 200, 97) (64, 200, 97)


In [None]:
# Sanity check: shape of the decoder-target array in the first training batch.
train_dataloader[0][1].shape

(64, 200, 97)

In [None]:
# Reduce learning rate based on the validation loss
# NOTE(review): factor=0.99 lowers the rate by only 1% per plateau (the training
# log shows 0.01 -> 0.0099); confirm this is intended rather than e.g. 0.1.
# All other arguments are left at their Keras defaults.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.99, verbose=1)

In [None]:
# Path prefix under which the best weights are written.
checkpoint_filepath = 'model_1'

# Keep only the weights of the epoch with the lowest validation loss.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [None]:
# NOTE: despite its name, this model is configured with the 'general' score function.
model_dot = encoder_decoder(enc_vocab_size = input_vocab_size+1, enc_embedding_size = 20, enc_lstm_size = 100, enc_input_length = max_length_encoder, \
                        dec_vocab_size = target_vocab_size+1, dec_embedding_dim = 20, dec_input_length = max_length_decoder, dec_units = 100, score_fun = 'general', att_units=100)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# The decoder's final Dense layer has no activation, so the model emits raw
# logits. The string loss 'categorical_crossentropy' expects probabilities, so
# the loss object is built explicitly with from_logits=True (targets are one-hot).
model_dot.compile(optimizer=optimizer, loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))

In [None]:
# Whole batches per epoch; the divisors match the dataloader batch sizes (64 train, 20 test).
train_steps, valid_steps = len(train) // 64, len(test) // 20

print(train_steps, valid_steps)

30 1


In [None]:
# Train with LR reduction on plateau and best-weights checkpointing, then print the layer summary.
fit_callbacks = [reduce_lr, model_checkpoint_callback]
model_dot.fit(
    train_dataloader,
    steps_per_epoch=train_steps,
    epochs=260,
    validation_data=test_dataloader,
    validation_steps=valid_steps,
    callbacks=fit_callbacks,
)
model_dot.summary()

  "Even though the tf.config.experimental_run_functions_eagerly "


Epoch 1/260
Epoch 2/260
Epoch 3/260
Epoch 4/260
Epoch 5/260
Epoch 6/260
Epoch 7/260
Epoch 8/260
Epoch 9/260
Epoch 10/260
Epoch 11/260
Epoch 12/260

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.009899999778717757.
Epoch 13/260
Epoch 14/260
Epoch 15/260
Epoch 16/260
Epoch 17/260
Epoch 18/260
Epoch 19/260

KeyboardInterrupt: ignored

In [None]:
# Restore the weights saved by the ModelCheckpoint callback (same 'model_1'
# path prefix, best validation loss seen during training).
model_dot.load_weights("model_1")

# Loss over the test batches, computed with the loss the model was compiled with.
model_dot.evaluate(test_dataloader)

  "Even though the tf.config.experimental_run_functions_eagerly "




0.5720122456550598

In [None]:
start_index = tokenizer_target_ip.word_index['\t']
end_index = tokenizer_target_ip.word_index['\n']
DECODER_SEQ_LEN = 200
max_len = 200


def predict(input_sentence):

  '''
  Greedy-decode the corrected sentence for one raw input sentence.

  A. Convert the input sentence to integers with the input tokenizer, pad it,
     and one-hot encode it exactly as the training pipeline does.
  B. Pass it to the encoder to get the encoder outputs and the final hidden
     and cell states.
  C. Start decoding from the start token ('\\t'), using the encoder final
     states as the initial decoder states.
  D. Until DECODER_SEQ_LEN steps are done or the end token ('\\n') is
     predicted: run the one-step decoder, take the arg-max token, append its
     text to the result and feed it back, one-hot encoded, as the next input.
  E. Print and return the predicted sentence (attention plotting is not implemented).

  Returns:
    The predicted sentence as a string. When the end token is predicted, its
    character is included at the end, as before.
  '''
  enc_vocab_size = len(tokenizer_raw_ip.word_index) + 1
  dec_vocab_size = len(tokenizer_target_ip.word_index) + 1

  encoder_seq = tokenizer_raw_ip.texts_to_sequences([input_sentence])
  encoder_seq = pad_sequences(encoder_seq, maxlen=max_length_encoder, dtype='int32', padding='post')
  # The model is trained on one-hot rows (see Dataset.__getitem__), so the same
  # encoding has to be applied at inference time.
  encoder_seq = tf.one_hot(encoder_seq, depth=enc_vocab_size)

  # Address the sub-models by attribute rather than by position in model.layers.
  enc_output, state_h, state_c = model_dot.encoder(encoder_seq)
  one_step_decoder = model_dot.decoder.one_step_decoder

  # (1, 1, vocab): one example, one time step. The one-step decoder slices its
  # input as a rank-3 tensor.
  dec_input = tf.one_hot([[start_index]], depth=dec_vocab_size)

  output_word = []

  for _ in range(DECODER_SEQ_LEN):

      output, state_h, state_c, attention_weights, context_vector = one_step_decoder(dec_input, enc_output, state_h, state_c)

      output_word_index = int(np.argmax(output))

      # index_word is the tokenizer's reverse lookup. Index 0 (padding) has no
      # entry and contributes nothing.
      predicted_token = tokenizer_target_ip.index_word.get(output_word_index)
      if predicted_token is not None:
          output_word.append(predicted_token)

      if output_word_index == end_index:
        break

      dec_input = tf.one_hot([[output_word_index]], depth=dec_vocab_size)


  predicted_sentence = ''.join(output_word)

  print('Input_sentence:', input_sentence)
  print('Predicted_sentence:',predicted_sentence)


  return predicted_sentence

In [None]:
# Predict every test sentence, collecting the predictions and the tokenised references.
predicted_sentences, actual_sentences = [], []

for i, row in test.iterrows():
    source_text, reference_text = row['input'], row['target_op']

    output = predict(source_text)
    sentence = ''.join(output)
    english_out = reference_text.split()

    predicted_sentences.append(output)
    actual_sentences.append(english_out)

    print('Input Sentence:', source_text)
    print('Predicted Sentence:', sentence)
    print('Original English sentence:', reference_text)
    print('*'*30)

Input_sentence: Mmm thats better now i got a roast down me! i'd b better if i had a few drinks down me 2! Good indian?
Predicted_sentence: Not's athat a t's that'm at's inow m ng t I at t not's d mbengomed t'm atham That's bus t's t ino am ame boto atow athat's I bumert d tha gom at's g t's Tham fot's m fott'mbe t'mid t athat's I t t's a
Input Sentence: Mmm thats better now i got a roast down me! i'd b better if i had a few drinks down me 2! Good indian?
Predicted Sentence: Not's athat a t's that'm at's inow m ng t I at t not's d mbengomed t'm atham That's bus t's t ino am ame boto atow athat's I bumert d tha gom at's g t's Tham fot's m fott'mbe t'mid t athat's I t t's a
Original English sentence: That's better now, I got a roast down me! I'd be better if I had a few drinks down me too! Good Indian?

******************************
Input_sentence: Watch wat?
Predicted_sentence: Wh wat watchat wat watche tchat watche watche wat watchat wat wat wat watchat wat wat wat wat wat t wat wat wa