In [None]:
import string
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Sequence-length limits: predict() pads encoder inputs to max_length_encoder
# and decodes at most max_length_decoder steps per word.
max_length_encoder = 40
max_length_decoder = 41

# Vocabulary sizes handed to Model1 (which adds 1 to each when sizing layers).
input_vocab_size = 96
target_vocab_size = 96

# Containers populated by the vocabulary-building cell.
required_chars = list()
vocabulary = {}

In [None]:
# Build the character vocabulary. Every character of string.printable whose
# code point lies strictly between 31 and 126 receives an id starting at 1, in
# the order string.printable yields them. Both containers are rebuilt from
# scratch (rather than appended to) so re-running this cell cannot duplicate
# characters or shift ids.
required_chars = [char for char in string.printable if 31 < ord(char) < 126]
vocabulary = {char: index for index, char in enumerate(required_chars, start=1)}

# Two control characters take the remaining ids; predict() seeds the decoder
# with '\t' and stops decoding a word when '\n' is produced.
vocabulary['\n'] = 95
vocabulary['\t'] = 96

# Character-level tokenizers that keep case and apply no filtering.
tokenizer_raw_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

tokenizer_target_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

# Both tokenizers share the fixed vocabulary built above instead of being
# fitted on text.
tokenizer_target_ip.word_index = vocabulary
tokenizer_raw_ip.word_index = vocabulary

In [None]:
# Encoder class: two stacked LSTM layers (an embedding layer is created but not applied).
class Encoder(tf.keras.layers.Layer):
    """Two-layer LSTM encoder.

    ``build`` creates an embedding layer and two LSTM layers, but ``call``
    feeds its input straight into the LSTMs; the embedding is not applied.

    Args:
        vocab_size: ``input_dim`` of the (unused) embedding layer.
        embedding_dim: ``output_dim`` of the (unused) embedding layer.
        input_length: ``input_length`` of the (unused) embedding layer.
        enc_units: number of units in each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, input_length, enc_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.enc_units = enc_units
        # Placeholders; overwritten with the latest outputs on every call().
        self.lstm_output = 0
        self.lstm_state_h = 0
        self.lstm_state_c = 0

    def build(self, input_shape):
        """Create the sub-layers (``input_shape`` itself is not used)."""
        # Kept so the layer's attribute structure stays as it was; call() does not use it.
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, input_length=self.input_length, mask_zero=True, name="Embedding_Layer_Encoder")
        self.lstm_1 = LSTM(self.enc_units, recurrent_dropout=0.2, return_state=True, return_sequences=True, name="Encoder_LSTM_1")
        self.lstm_2 = LSTM(self.enc_units, recurrent_dropout=0.2, return_state=True, return_sequences=True, name="Encoder_LSTM_2")

    def call(self, input_sentances, training=None):
        """Encode a batch of sequences.

        Args:
            input_sentances: input passed directly to the first LSTM layer.
            training: whether dropout should be active. Defaults to ``None``
                so Keras resolves it from the calling context (training during
                ``fit``, inference otherwise). A default of ``True`` would keep
                recurrent dropout switched on whenever the layer is called
                directly without the argument, making inference stochastic.

        Returns:
            Tuple ``(sequence_output, state_h, state_c)`` of the second LSTM.
        """
        # input_embedded = self.embedding(input_sentances)
        first_layer_output, _, _ = self.lstm_1(input_sentances, training=training)
        self.lstm_output, self.lstm_state_h, self.lstm_state_c = self.lstm_2(first_layer_output, training=training)

        return self.lstm_output, self.lstm_state_h, self.lstm_state_c

    def get_states(self):
        """Return ``(state_h, state_c)`` stored by the most recent ``call``."""
        return self.lstm_state_h, self.lstm_state_c
    
# Decoder class: a single LSTM layer initialised with externally supplied states.
class Decoder(tf.keras.layers.Layer):
    """Single-layer LSTM decoder.

    An embedding layer is constructed alongside the LSTM, but ``call`` passes
    its input directly to the LSTM without applying the embedding.

    Args:
        vocab_size: ``input_dim`` of the (unused) embedding layer.
        embedding_dim: ``output_dim`` of the (unused) embedding layer.
        input_length: ``input_length`` of the (unused) embedding layer.
        dec_units: number of units in the LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, input_length, dec_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.input_length = input_length
        # Created for completeness; call() does not route data through it.
        self.embedding = Embedding(
            input_dim=self.vocab_size,
            output_dim=self.embedding_dim,
            input_length=self.input_length,
            mask_zero=True,
            name="Embedding_Layer_Decoder",
        )
        self.lstm = LSTM(
            self.dec_units,
            dropout=0.2,
            return_sequences=True,
            return_state=True,
            name="Decoder_LSTM",
        )

    def call(self, target_sentences, state_h, state_c):
        """Run the LSTM over ``target_sentences`` starting from the given states.

        Returns only the per-step sequence output; the final states are dropped.
        """
        # target_embedded = self.embedding(target_sentences)
        starting_states = [state_h, state_c]
        sequence_output, _final_h, _final_c = self.lstm(target_sentences, initial_state=starting_states)
        return sequence_output

In [None]:
# Model 1 - Encoder (two stacked LSTMs) + Decoder (one LSTM) + softmax output layer
class Model1(Model):
    """Encoder/decoder sequence model with a softmax layer on the decoder output.

    Args:
        encoder_inputs_length: forwarded to the encoder as ``input_length``.
        decoder_inputs_length: forwarded to the decoder as ``input_length``.
        output_vocab_size: the dense layer emits ``output_vocab_size + 1`` classes.

    The encoder/decoder ``vocab_size`` values are read from the module-level
    ``input_vocab_size`` and ``target_vocab_size`` settings.
    """

    def __init__(self, encoder_inputs_length, decoder_inputs_length, output_vocab_size):
        super().__init__()  # https://stackoverflow.com/a/27134600/4084039
        # Creation order matters: predict() indexes model.layers by position
        # (0 = encoder, 1 = decoder, 2 = dense).
        self.encoder = Encoder(
            vocab_size=input_vocab_size + 1,
            embedding_dim=30,
            input_length=encoder_inputs_length,
            enc_units=512,
        )
        self.decoder = Decoder(
            vocab_size=target_vocab_size + 1,
            embedding_dim=30,
            input_length=decoder_inputs_length,
            dec_units=512,
        )
        self.dense = Dense(output_vocab_size + 1, activation='softmax')

    def call(self, data):
        """Map ``data = (encoder_input, decoder_input, ...)`` to per-step class probabilities."""
        source_batch, target_batch = data[0], data[1]
        _sequence, final_h, final_c = self.encoder(source_batch)
        decoded = self.decoder(target_batch, final_h, final_c)
        return self.dense(decoded)

In [None]:
def predict(input_sentence):
    """Correct every whitespace-separated word of ``input_sentence`` with the model.

    Each word is one-hot encoded character by character, passed through the
    encoder, and then decoded greedily one character at a time: decoding starts
    from the tab token and stops at the newline token or after
    ``max_length_decoder`` steps, whichever comes first.

    Relies on the module-level ``model``, ``tokenizer_raw_ip``,
    ``tokenizer_target_ip``, ``max_length_encoder`` and ``max_length_decoder``.

    Args:
        input_sentence: text to correct. Runs of whitespace are treated as a
            single separator, so empty input yields an empty string.

    Returns:
        The corrected words joined by single spaces (no trailing space).
    """
    # Positional access mirrors the sub-layer creation order in Model1.
    encoder_layer = model.layers[0]
    decoder_layer = model.layers[1]
    output_layer = model.layers[2]

    target_index = tokenizer_target_ip.word_index
    start_index = target_index['\t']
    end_index = target_index['\n']
    # One-hot depth is vocabulary size + 1 because index 0 is the padding value.
    encoder_depth = len(tokenizer_raw_ip.word_index) + 1
    decoder_depth = len(target_index) + 1
    # Built once so each decoding step is a dictionary lookup instead of a scan.
    index_to_char = {index: char for char, index in target_index.items()}

    corrected_words = []
    for word in input_sentence.split():
        encoder_seq = tokenizer_raw_ip.texts_to_sequences([word])
        encoder_seq = pad_sequences(encoder_seq, maxlen=max_length_encoder, dtype='int32', padding='post')
        encoder_seq = tf.keras.utils.to_categorical(encoder_seq, num_classes=encoder_depth)

        # training=False keeps the encoder's recurrent dropout off, so the
        # same input always produces the same correction.
        _, state_h, state_c = encoder_layer(encoder_seq, training=False)
        decoder_state = [state_h, state_c]

        next_index = start_index
        output_chars = []
        for _ in range(max_length_decoder):
            # Feed the previously predicted character back in as a one-hot vector.
            dec_input = np.zeros((1, 1, decoder_depth))
            dec_input[0, 0, next_index] = 1.

            lstm_out, state_h, state_c = decoder_layer.lstm(dec_input, initial_state=decoder_state, training=False)
            decoder_state = [state_h, state_c]

            next_index = int(np.argmax(output_layer(lstm_out)))
            if next_index == end_index:
                break
            # Index 0 (padding) has no character and is skipped.
            if next_index in index_to_char:
                output_chars.append(index_to_char[next_index])

        corrected_words.append("".join(output_chars))

    # Join explicitly with spaces: relying on each word's end token as the
    # separator glued words together whenever decoding hit the length limit.
    return " ".join(corrected_words)


In [None]:
# Rebuild the architecture and restore the weights saved under "model_2".
model = Model1(
    encoder_inputs_length=max_length_encoder,
    decoder_inputs_length=max_length_decoder,
    output_vocab_size=target_vocab_size,
)
# `learning_rate` is the supported keyword; the `lr` alias is deprecated and
# rejected by newer Keras optimizers.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
model.load_weights("model_2")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6f6b14ce90>

In [None]:
def final_fun(input_sentence):
    """Run ``predict`` on ``input_sentence``, print both strings, and return the result."""
    corrected = predict(input_sentence)
    banner = "*" * 30
    print(banner)
    print("Input: ", input_sentence)
    print("Output: ", corrected)
    return corrected

In [None]:
# Time one end-to-end correction of a hard-coded sample sentence.
# input_sentence = input("Enter your string: ")
input_sentence = "wo aer yuo"
start_time = time.time()
output = final_fun(input_sentence)
print("Function 1 has taken %s seconds to execute" % (time.time() - start_time))

******************************
Input:  wo aer yuo
Output:  wou art you 
Function 1 has taken 1.5915215015411377 seconds to execute


# Inference

From the output seen during training, several observations about the model's predictions are as follows:

- The model does better than the previous models because it is trained on pairs of corrupted words and target words rather than on whole input sentences, which are longer.
- The model achieves an accuracy of around 99%, but note that the sequences contain many padding positions, which makes the reported accuracy look higher than it really is.


- Looking at the sentences, we see that when letters are swapped or replaced with another letter, the model is able to correct them to a very good extent.
- The model, however, does not perform as well when it encounters missing letters in a word or extra letters added to a word.
- Each sentence had a maximum of 3 errors introduced; the model is able to correct 2 errors, depending on the sentence and the vocabulary.

- With a larger, non-repeating, non-augmented dataset, the model will perform much better and cover a wider variety of errors and mistakes.

