In [1]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, LSTM
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp "/content/drive/MyDrive/Novel_dataset_errors_train_3.csv" "/content/"
!cp "/content/drive/MyDrive/Novel_dataset_errors_test_3.csv" "/content/"

In [4]:
train = pd.read_csv('Novel_dataset_errors_train_3.csv')
test = pd.read_csv('Novel_dataset_errors_test_3.csv')

train.head()

Unnamed: 0,input,target
0,conteted fellows satiVsfied witrh their positi...,contented fellows satisfied with their positio...
1,at hte raPe of twentyseven miles a qday that t...,at the rate of twentyseven miles a day that th...
2,GLwUCESTER,GLOUCESTER
3,yas already full ou peNple There were people n...,was already full of people There were people n...
4,feet,feet


In [5]:
input_words = dict()

for i, row in tqdm(train.iterrows()):
    input_sentence = row['input']
    target_sentence = row['target']
    input_sentence_split = input_sentence.split(" ")
    target_sentence_split = target_sentence.split(" ")

    for i in range(len(input_sentence_split)):
        if input_sentence_split[i] not in input_words:
            input_words[input_sentence_split[i]] = target_sentence_split[i]

382072it [00:31, 12112.54it/s]


In [6]:
input_words_test = dict()

for i, row in tqdm(test.iterrows()):
    input_sentence = row['input']
    target_sentence = row['target']
    input_sentence_split = input_sentence.split(" ")
    target_sentence_split = target_sentence.split(" ")

    for i in range(len(input_sentence_split)):
        if input_sentence_split[i] not in input_words_test:
            input_words_test[input_sentence_split[i]] = target_sentence_split[i]

965it [00:00, 11931.30it/s]


In [7]:
input_words_list = []
target_words_list = []
input_words_list_test = []
target_words_list_test = []

for k,v in input_words.items():
  input_words_list.append(k)
  target_words_list.append(v)

for k,v in input_words_test.items():
  input_words_list_test.append(k)
  target_words_list_test.append(v)

In [8]:
train_words = pd.DataFrame({
    'input':input_words_list,
    'target':target_words_list
})

test_words = pd.DataFrame({
    'input':input_words_list_test,
    'target':target_words_list_test
})

In [None]:
train_words

Unnamed: 0,input,target
0,conteted,contented
1,fellows,fellows
2,satiVsfied,satisfied
3,witrh,with
4,their,their
...,...,...
260508,tuYgged,tugged
260509,mighV,might
260510,inOrigues,intrigues
260511,mmking,making


In [9]:
train = train_words
test = test_words

In [11]:
required_chars = []
for char in string.printable:
  if ord(char) > 31 and ord(char) < 126:
    required_chars.append(char)


print(len(required_chars))
print(required_chars)

94
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', ' ']


In [12]:
# Create a dictionary of chars and index value from 1. 0 is reserved for padding by the tokenizer.
vocabulary = dict()
for i in range(len(required_chars)):
  vocabulary[required_chars[i]] = i+1

In [13]:
# Use \t as Start of Sentence and \n as End of Sentence
vocabulary['\n'] = 95
vocabulary['\t'] = 96

In [14]:
# Adding the \t and \n as part of start and end of sentence
train['target_ip'] = '\t' + train['target'].astype(str)
train['target_op'] =  train['target'].astype(str) + '\n'

test['target_ip'] = '\t' + test['target'].astype(str)
test['target_op'] =  test['target'].astype(str) + '\n'

In [15]:
train = train.drop(['target'], axis=1)
test = test.drop(['target'], axis=1)

In [16]:
train.head()

Unnamed: 0,input,target_ip,target_op
0,conteted,\tcontented,contented\n
1,fellows,\tfellows,fellows\n
2,satiVsfied,\tsatisfied,satisfied\n
3,witrh,\twith,with\n
4,their,\ttheir,their\n


In [17]:
train.iloc[0]['target_ip']= str(train.iloc[0]['target_ip'])+'\n'
train.iloc[0]['target_op']= str(train.iloc[0]['target_op'])+'\n'

In [18]:
# Calculating the maximum length of among all the sentences which will be useful for padding.
max_length_encoder = train['input'].map(len).max()

print(max_length_encoder)

40


In [19]:
max_length_decoder = max( train['target_ip'].map(len).max(), train['target_op'].map(len).max())
print(max_length_decoder)

41


In [20]:
# Tokenizer for the raw input and target output
tokenizer_raw_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

tokenizer_target_ip = Tokenizer(
    char_level=True,
    lower=False,
    filters=None
)

In [21]:
tokenizer_raw_ip.fit_on_texts(train['input'].values)
tokenizer_target_ip.fit_on_texts(train['target_ip'].values)

In [22]:
# Replacing the vocabulary of the trained index to a vocabulary mentioned in the research paper
tokenizer_target_ip.word_index = vocabulary
tokenizer_raw_ip.word_index = vocabulary

In [23]:
target_vocab_size=len(tokenizer_target_ip.word_index.keys())
print(target_vocab_size)
input_vocab_size=len(tokenizer_raw_ip.word_index.keys())
print(input_vocab_size)

96
96


In [24]:
# Encoder class with Embedding layer and LSTM layer.
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, input_length, enc_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.input_length = input_length
        self.enc_units= enc_units
        self.lstm_output = 0
        self.lstm_state_h=0
        self.lstm_state_c=0
        
    def build(self, input_shape):
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, input_length=self.input_length, mask_zero=True, name="Embedding_Layer_Encoder")
        self.lstm_1 = LSTM(self.enc_units, recurrent_dropout=0.2, return_state=True, return_sequences=True, name="Encoder_LSTM_1")
        self.lstm_2 = LSTM(self.enc_units, recurrent_dropout=0.2, return_state=True, return_sequences=True, name="Encoder_LSTM_2")
        
    def call(self, input_sentances, training=True):
        # input_embedded = self.embedding(input_sentances)
        self.lstm_output,_,_ = self.lstm_1(input_sentances)
        self.lstm_output, self.lstm_state_h, self.lstm_state_c = self.lstm_2(self.lstm_output)

        return self.lstm_output, self.lstm_state_h,self.lstm_state_c

    def get_states(self):
        return self.lstm_state_h,self.lstm_state_c
    
# Decoder class with embedding and LSTM layer.    
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, input_length, dec_units):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dec_units = dec_units
        self.input_length = input_length
        # we are using embedding_matrix and not training the embedding layer
        self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim, input_length=self.input_length, mask_zero=True, name="Embedding_Layer_Decoder",)
        self.lstm = LSTM(self.dec_units,  dropout=0.2, return_sequences=True, return_state=True, name="Decoder_LSTM")
    
    def call(self, target_sentences, state_h, state_c):
        # target_embedded           = self.embedding(target_sentences)
        lstm_output, _,_        = self.lstm(target_sentences, initial_state=[state_h, state_c])
        return lstm_output

In [25]:
# Creating a data pipeline
class Dataset:
    def __init__(self, data, tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder,max_length_decoder):
        self.encoder_inps = data['input'].values
        self.decoder_inps = data['target_ip'].values
        self.decoder_outs = data['target_op'].values
        self.tokenizer_target_ip = tokenizer_target_ip
        self.tokenizer_raw_ip = tokenizer_raw_ip
        self.max_length_encoder = max_length_encoder
        self.max_length_decoder = max_length_decoder

    def __getitem__(self, i):
        self.encoder_seq = self.tokenizer_raw_ip.texts_to_sequences([self.encoder_inps[i]]) # need to pass list of values
        self.decoder_inp_seq = self.tokenizer_target_ip.texts_to_sequences([self.decoder_inps[i]])
        self.decoder_out_seq = self.tokenizer_target_ip.texts_to_sequences([self.decoder_outs[i]])

        self.encoder_seq = pad_sequences(self.encoder_seq, maxlen=self.max_length_encoder, dtype='int32', padding='post')
        self.decoder_inp_seq = pad_sequences(self.decoder_inp_seq, maxlen=self.max_length_decoder, dtype='int32', padding='post')
        self.decoder_out_seq = pad_sequences(self.decoder_out_seq, maxlen=self.max_length_decoder, dtype='int32', padding='post')

        self.encoder_seq = tf.keras.utils.to_categorical(self.encoder_seq, num_classes=len(tokenizer_raw_ip.word_index.keys())+1)
        self.decoder_inp_seq = tf.keras.utils.to_categorical(self.decoder_inp_seq, num_classes=len(tokenizer_target_ip.word_index.keys())+1)
        self.decoder_out_seq = tf.keras.utils.to_categorical(self.decoder_out_seq, num_classes=len(tokenizer_target_ip.word_index.keys())+1)

        return self.encoder_seq, self.decoder_inp_seq, self.decoder_out_seq

    def __len__(self): # your model.fit_gen requires this function
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):    
    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))


    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data = []
        for j in range(start, stop):
            data.append(self.dataset[j])
            
        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        # we are creating data like ([italian, english_inp], english_out) these are already converted into seq
        return tuple([[batch[0],batch[1]],batch[2]])

    def __len__(self):  # your model.fit_gen requires this function
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [26]:
train_dataset = Dataset(train, tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder, max_length_decoder)
test_dataset  = Dataset(test, tokenizer_raw_ip, tokenizer_target_ip, max_length_encoder, max_length_decoder)

In [27]:
train_dataloader = Dataloder(train_dataset, batch_size=1024)
test_dataloader = Dataloder(test_dataset, batch_size=128)

print(train_dataloader[1][0][0].shape, train_dataloader[0][0][1].shape, train_dataloader[0][1].shape)

(1024, 40, 97) (1024, 41, 97) (1024, 41, 97)


In [28]:
# Model 1 - 1 layer LSTM model for each encoder and decoder
class Model1(Model):
    def __init__(self, encoder_inputs_length,decoder_inputs_length, output_vocab_size):
        super().__init__() # https://stackoverflow.com/a/27134600/4084039
        self.encoder = Encoder(vocab_size=input_vocab_size+1, embedding_dim=30, input_length=encoder_inputs_length, enc_units=512)
        self.decoder = Decoder(vocab_size=target_vocab_size+1, embedding_dim=30, input_length=decoder_inputs_length, dec_units=512)
        self.dense   = Dense(output_vocab_size+1, activation='softmax')
        
    def call(self, data):
        input,output = data[0], data[1]
        encoder_output, encoder_h, encoder_c = self.encoder(input)
        decoder_output                       = self.decoder(output, encoder_h, encoder_c)
        output                               = self.dense(decoder_output)
        return output        

In [29]:
# Reduce learning rate based on the validation loss
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.99, verbose=1, mode='min', min_lr=0.0001)

In [30]:
model  = Model1(encoder_inputs_length=max_length_encoder,decoder_inputs_length=max_length_decoder,output_vocab_size=target_vocab_size)

In [31]:
checkpoint_filepath = '/content/drive/MyDrive/my_model_spell/model_1'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)

In [32]:
# Using Adam and Gradient clipping to prevent gradient explosion as mentioned in the research paper
optimizer = tf.keras.optimizers.Adam(lr=0.001)
model.compile(optimizer=optimizer,loss='categorical_crossentropy', metrics=['accuracy'])
train_steps=train.shape[0]//1024
valid_steps=test.shape[0]//128

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=200,  validation_data=test_dataloader, validation_steps=valid_steps, callbacks=[reduce_lr, model_checkpoint_callback])
model.summary()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200

KeyboardInterrupt: ignored

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=200,  validation_data=test_dataloader, validation_steps=valid_steps, callbacks=[reduce_lr, model_checkpoint_callback])
model.summary()

Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
 17/254 [=>............................] - ETA: 2:25 - loss: 0.0869 - accuracy: 0.9748

KeyboardInterrupt: ignored

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=50,  validation_data=test_dataloader, validation_steps=valid_steps, callbacks=[reduce_lr, model_checkpoint_callback])
model.summary()

Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50

In [33]:
!cp '/content/drive/MyDrive/my_model_spell/model_2/checkpoint' "/content/"
!cp '/content/drive/MyDrive/my_model_spell/model_2/model_2.data-00000-of-00001' "/content/"
!cp '/content/drive/MyDrive/my_model_spell/model_2/model_2.index' "/content/"

In [34]:
model.load_weights("model_2")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f68d015c750>

In [35]:
model.evaluate(test_dataloader)



[0.023236896842718124, 0.9938250184059143]

In [36]:
print("The validation loss of the model 1 is:", 0.0232)
print("The perplexity of the model 1 is:", 2**(0.0232))

The validation loss of the model 1 is: 0.0232
The perplexity of the model 1 is: 1.0162110099886057


#-------------------------------------------------------

In [37]:
start_index = tokenizer_target_ip.word_index['\t']
end_index = tokenizer_target_ip.word_index['\n']
DECODER_SEQ_LEN = max_length_decoder
max_len = max_length_decoder


def predict(input_sentence):
  word_list = []
  split_sentence = input_sentence.split(" ")
  for word in split_sentence:

      encoder_seq = tokenizer_raw_ip.texts_to_sequences([word])

      encoder_seq = pad_sequences(encoder_seq, maxlen=max_length_encoder, dtype='int32', padding='post')

      encoder_seq = tf.keras.utils.to_categorical(encoder_seq, num_classes=len(tokenizer_raw_ip.word_index.keys())+1)

      enc_output, enc_state_h, enc_state_c = model.layers[0](encoder_seq)

      dec_input = np.zeros((1, 1, len(tokenizer_raw_ip.word_index.keys())+1))

      dec_input[0, 0, tokenizer_target_ip.word_index['\t']] = 1.

      input_state = [enc_state_h, enc_state_c]

      output_word = []

      for i in range(DECODER_SEQ_LEN):
          # cur_emb = model.layers[1].embedding(dec_input)

          predicted_out, state_h, state_c = model.layers[1].lstm(dec_input, input_state)

          dense_layer_out = model.layers[2](predicted_out)

          input_state = [state_h, state_c]
      
          output_word_index = np.argmax(dense_layer_out)

          # print(output_word_index)

          for key, value in tokenizer_target_ip.word_index.items():

            if output_word_index == value:
                output_word.append(key)

          dec_input = np.reshape(output_word_index, (1, 1))

          dec_input = np.zeros((1, 1, len(tokenizer_raw_ip.word_index.keys())+1))

          dec_input[0, 0, output_word_index] = 1.


          if output_word_index == tokenizer_target_ip.word_index['\n']:
            break

      word = "".join(output_word)
      word_list.append(word)
      # print(word_list)
  sentence = ''.join(word_list)
  sentence = sentence.replace("\n", " ")
  return sentence


In [38]:
predicted_sentences = []
actual_sentences = []
count = 0

test_sentences = pd.read_csv('Novel_dataset_errors_test_3.csv')

for i, row in test_sentences.iterrows():
    output = predict(row['input'])
    predicted_sentences.append(output)
    english_out = row['target'].split()
    actual_sentences.append(english_out)
    sentence = ''.join(output)
    print('Input Sentence:',row['input'])
    print('Predicted Sentence:',sentence)
    print('Original English sentence:', row['target'])
    print('*'*30)
    count += 1
    if count == 40:
      break

Input Sentence: yM hate to Marcius hwere I find him were it
Predicted Sentence: My hate to Marcius where I find him were it 
Original English sentence: My hate to Marcius where I find him were it
******************************
Input Sentence: Petay Hhad Trepared many fine things he meant to say to the gentlemanin
Predicted Sentence: Petya had reparted many fine things he meant to say to the gentlemaning 
Original English sentence: Petya had prepared many fine things he meant to say to the gentlemanin
******************************
Input Sentence: abGruptly
Predicted Sentence: abroutely 
Original English sentence: abruptly
******************************
Input Sentence: mattedr acd Uto that I reply You are my best friend as you know but
Predicted Sentence: matterd and to that I reply You are my best friend as you know but 
Original English sentence: matter and to that I reply You are my best friend as you know but
******************************
Input Sentence: But thee sworcs were drwan 

In [None]:
from nltk.translate.bleu_score import sentence_bleu
avg_score = 0
for i in range(len(actual_sentences)):
    score = sentence_bleu([actual_sentences[i]], predicted_sentences[i].split())
    avg_score += score

In [44]:
print("Avg BLEU Score of Encoder Decoder Model:", (avg_score/(len(actual_sentences))))

Avg BLEU Score of Encoder Decoder Model: 0.6783860821709433


### Models Tried

- Basic Encoder Decoder Models - 1 layer, 2 layer
- Encoder Decoder with Luong Attention - 1 layer, 2 layer
- Transformer

### Dataset Tried

- Dataset of SMS
- SMS + Augmented SMS Data
- SMS + Augmented Novel Data
- Pure Novel Dataset
- Pure Novel Dataset + Augmentation with SMS language
- Pure Novel Dataset + Augmentation with typo mistakes, letter addition/deletion, letter exchanges


# Inference

From the output seen in the training, several anaylsis of the model prediction are as follows:

- The model is able to do better compared to previous models since we are training the model on corrputed words and target words rather than input sentences which are longer. 
- The model achieves a good accuracy of around ~99% but note that there are several paddings due to which the accuracy is shown higher.


- Looking at the sentences, we see that errors where the letters are exchanged or letters are replaced with another letter the model is able to correct them to a very good extent
- The model however does not perform that well when it encouters missing letters in a word or addition of letters in the word.
- Each sentence had maximum of 3 errors introduced in them, the is able to correct 2 errors depending on the sentence and the vocabulary.

- With larger non repeating, non augmented dataset the model will perform much better covering wide variety of errors and mistakes.