# Notes for the user
*Creator: Michael McAleer*
*SN: R00143621*

This notebook assumes that it is being run on Google CoLab. To run locally, change the path of the `root_dir` to your current working directory, the rest of the required paths are determined from there. You can find `root_dir` on line 6 of the section 'Data Path Configuration'.

All required files are included with this submission, it should work as-is out of the box when the `root_dir` has been set to current working directory.

The main corpus used in this notebook is `ukenglish.txt` which is a list of
approx. 80,000 unique english words, it can be found [here](http://www.gwicks.net/dictionaries.htm).

Although the notebook uses a good corpus in terms of data quality, this notebook has the ability to read from a directory and ingest all text based documents within, combining them into a single corpus before processing and cleaning.

An attempt was made to save the model mapping and associated weights, but an issue was encountered where a loaded model performed as if it had never been trained. A related GitHub issue can be found [here](https://github.com/keras-team/keras/issues/4875).

# Import Libraries & Install Packages

In [1]:
from __future__ import print_function

import os
import re
import string
import time

import numpy as np
import tensorflow as tf

from copy import deepcopy
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from nltk.tokenize import wordpunct_tokenize
from numpy.random import choice
from numpy.random import rand
from numpy.random import randint

# To run pycontractions in CoLab JDK needs to be downgraded to version 8
print('Downgrading open-jdk to version 8 for pycontractions install...')
!apt-get purge openjdk* -qq > /dev/null
!apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
!pip install --quiet pycontractions
print('pycontractions install complete')
from pycontractions import Contractions

# Ignore TensorFlow warnings
tf.logging.set_verbosity(tf.logging.ERROR)

Using TensorFlow backend.


Downgrading open-jdk to version 8 for pycontractions install...
pycontractions install complete


# Data Path Configuration

In [2]:
# Mount Google Drive so the notebook can read the corpus files stored there
from google.colab import drive
drive.mount('/content/drive')

# Path to the working directory - must be set. When running outside CoLab,
# point this at the directory holding the `corpora` and `models` folders
root_dir = '/content/drive/My Drive/Colab Notebooks/NLP/Assignment1'

# Every other path is derived from root_dir:
#   corpus_dir     -> <root_dir>/corpora
#   training_dir   -> <root_dir>/corpora/training
#   validation_dir -> <root_dir>/corpora/validation
#   model_dir      -> <root_dir>/models
#   data_path      -> <root_dir>/corpora/training/ukenglish.txt
corpus_dir = '{root_dir}/corpora'.format(root_dir=root_dir)
training_dir = '{corpus_root}/training'.format(corpus_root=corpus_dir)
validation_dir = '{corpus_root}/validation'.format(corpus_root=corpus_dir)
model_dir = '{root_dir}/models'.format(root_dir=root_dir)
data_path = '{train}/ukenglish.txt'.format(train=training_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# OCR Baseline System

In [0]:
# The provided OCR system cannot read whole sentences. A lot of time was spent
# trying to make it do so, but this was ultimately left out as it is not a
# defined task in the assignment. Instead, a dummy sentence containing spelling
# errors and contractions stands in for the OCR output. The OCR system is
# included with this assignment to show that it works for single word
# prediction and that an attempt was made to get it working.
ocr_output_string = ("We're goinng to the zoo and I don't thnk I'll be "
                     "home for dnner")

# Task 1 - De-Contraction of String using PyContractions

In [4]:
# Load semantic vector model
py_cont = Contractions(api_key="glove-wiki-gigaword-50")
# Prevent loading on first expand_texts call
py_cont.load_models()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [5]:
# Expand contractions in our OCR output sentence. expand_texts takes a list of
# texts, so the single sentence is wrapped in a list and the result is
# materialised with list()
decont_ocr_output = list(py_cont.expand_texts([ocr_output_string]))
# Only one sentence was fed in, so extract that string from the result list
decont_ocr_output = decont_ocr_output[0]
# Print the de-contracted sentence alongside the original for comparison
print('OCR output sentence: {s}'.format(s=ocr_output_string))
print('Decontracted sentence: {s}'.format(s=decont_ocr_output))

OCR output sentence: We're goinng to the zoo and I don't thnk I'll be home for dnner
Decontracted sentence: we are goinng to the zoo and I do not thnk I will be home for dnner


# Task 2 - Tokenise De-Contracted String with NLTK

In [6]:
# Using NLTK tokenise the de-contracted OCR output sentence, this is more
# efficient than just splitting on whitespace as it has awareness for
# punctuation and hyphenated words
ocr_tokens = wordpunct_tokenize(decont_ocr_output)
print('Tokenised de-contracted sentence: {s}'.format(s=ocr_tokens))

Tokenised de-contracted sentence: ['we', 'are', 'goinng', 'to', 'the', 'zoo', 'and', 'I', 'do', 'not', 'thnk', 'I', 'will', 'be', 'home', 'for', 'dnner']


# Task 3 - Detecting Errors

## Task 3.1 - Check if OCR output tokens are english words

In [0]:
# CHARS: list of upper- and lower-case ASCII letters plus the space character;
# used as the pool of random characters when noise is added to tokens.
# REMOVE_CHARS: regex character class matching the punctuation, digit and
# control characters stripped from every corpus token. Inside the class, `,-.`
# is a range covering ',', '-' and '.'. The apostrophe is not listed, so it is
# kept in tokens.
CHARS = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
REMOVE_CHARS = r'[#$%"\+@<=>!&,-.?:;()*\[\]^_`{|}~/\d\t\n\r\x0b\x0c“”]'

In [0]:
def load_corpus_data(corpus_directory):
    """Given a corpus directory, load all files within and return as one text.

    Files are read in sorted file-name order so the concatenated text is the
    same on every run (``os.listdir`` returns entries in arbitrary order).
    Entries that are not regular files, such as sub-directories, are skipped
    instead of crashing the ``open`` call.

    :param corpus_directory: path to directory -- str
    :return: concatenated corpora, empty string for an empty directory -- str
    """
    texts = list()
    for book in sorted(os.listdir(corpus_directory)):
        book_path = os.path.join(corpus_directory, book)
        # Only regular files can be read as a corpus document
        if not os.path.isfile(book_path):
            continue
        with open(book_path, 'r', encoding='utf8') as open_corpus:
            texts.append(open_corpus.read())
    # Join once at the end rather than growing a string inside the loop
    return ''.join(texts)


def tokenise_corpus(in_text):
    """Split a text into tokens and strip unwanted characters from each one.

    The text is split on hyphens, newlines and spaces. Every character matched
    by the REMOVE_CHARS regex pattern is then removed from each piece. Pieces
    that end up empty are kept in the result.

    :param in_text: input text -- str
    :return: tokens -- list
    """
    raw_pieces = re.split("[-\n ]", in_text)
    stripped_tokens = list()
    for piece in raw_pieces:
        stripped_tokens.append(re.sub(REMOVE_CHARS, '', piece))
    return stripped_tokens


def clean_text(in_tokens):
    """Filter, normalise and de-duplicate a list of corpus tokens.

    A token is kept only if it consists of ASCII characters and its first
    character is a letter. Kept tokens are lower-cased and duplicates are
    dropped. The order of the returned list is the iteration order of a set.

    :param in_tokens: input tokens -- list
    :return: cleaned tokens -- list
    """
    lowered_words = list()
    for candidate in in_tokens:
        # Skip tokens containing non-ASCII characters
        if not _is_english_chars(candidate):
            continue
        # Skip tokens that do not start with an alphabetic character
        if not re.match(r'[a-zA-Z]', candidate):
            continue
        # Normalise to lower case
        lowered_words.append(candidate.lower())
    # De-duplicate, then drop any empty entries
    return [word for word in set(lowered_words) if word]


def _is_english_chars(in_string):
    """Check if an input string consists of non-english characters.
    
    :param in_string: input string -- str
    :return: bool
    """
    try:
        in_string.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True


def _is_english_word(in_word, corpus):
    """Perform set lookup on corpus to check if word exists. Calculates in
    O(1) time.
    
    :param in_word: input word -- str
    :param corpus: english words corpus -- set
    :return: bool
    """
    return in_word.lower() in corpus

In [9]:
# Start corpus processing timer
p_start = time.time()
# Load and concatenate every file in the training directory
corpus_text = load_corpus_data(training_dir)
# Tokenise corpus
corpus_tokens = tokenise_corpus(corpus_text)
# Clean corpus tokens (ASCII only, lower-cased, de-duplicated)
corpus_tokens = clean_text(corpus_tokens)
# Collect the sorted unique characters of the corpus; joining the tokens with
# a space means the space character is part of this list too
corpus_unique_chars = sorted(set(' '.join(corpus_tokens)))
# Length of the longest corpus token plus 2
# NOTE(review): the +2 presumably leaves room for the SOS marker and at least
# one EOS padding character added during transform - confirm
corpus_max_len = max([len(token) for token in corpus_tokens]) + 2
# Create corpus set for hash lookup
corpus_lookup_set = set(corpus_tokens)

print('Total corpus processing time: {t}'.format(t=time.time() - p_start))
print('Corpus size: {s}'.format(s=len(corpus_tokens)))
print('Corpus unique character count: {c}'.format(c=len(corpus_unique_chars)))
# Prints the padded maximum length (longest token + 2), not the raw length
print('Corpus largest word: {size}'.format(size=corpus_max_len))

Total corpus processing time: 0.2998542785644531
Corpus size: 82036
Corpus unique character count: [' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Corpus largest word: 23


In [10]:
# Collect every OCR token that has no entry in the corpus lookup set; these
# are treated as the misspelt words that need correcting
incorrect_words = [
    ocr_word for ocr_word in ocr_tokens
    if not _is_english_word(ocr_word, corpus_lookup_set)]

print('Incorrect words: {w}'.format(w=incorrect_words))

Incorrect words: ['goinng', 'thnk', 'dnner']


# LSTM Char-to-Char Sequence Learner 

In [0]:
# Reset TensorFlow default graph
tf.reset_default_graph()

In [0]:
# LSTM configuration options
# Start of sequence marker, prepended to every decoder input token
SOS = '$'
# End of sequence marker, also used as the padding character that brings
# tokens up to the maximum token length
EOS = '*'
# Number of samples per gradient update passed to model.fit
BATCH_SIZE = 1024
# Number of training epochs passed to model.fit
N_EPOCHS = 70
# Number of units in the encoder and decoder LSTM layers
HIDDEN_DIM = 256

## Prepare the training data

In [0]:
def _add_noise_to_token(tkn, noise_rate):
    """Add noise to token to simulate spelling mistakes.

    :param tkn: input token -- str
    :param noise_rate: noise rate -- float
    :return: noise sampled token -- str
    """
    # Do not run transform unless token is 3 or more chars in length
    if len(tkn) < 3:
        return tkn
    # Use only lower-case ASCII characters for noise
    chars = list(string.ascii_lowercase)
    # There are four ways in which noise can be generated, divide error rate
    # to provide equal chance for each change possible
    ran = rand()
    prob = noise_rate / 4.0
    # Replace a character with a random character
    if ran < prob:
        rci = randint(len(tkn))
        token = tkn[:rci] + choice(CHARS) + tkn[rci + 1:]
    # Delete a character
    elif prob < ran < prob * 2:
        rci = randint(len(tkn))
        tkn = tkn[:rci] + tkn[rci + 1:]
    # Add a random character
    elif prob * 2 < ran < prob * 3:
        rci = randint(len(tkn))
        tkn = tkn[:rci] + np.random.choice(CHARS) + tkn[rci:]
    # Transpose 2 characters
    elif prob * 3 < ran < prob * 4:
        rci = randint(len(tkn) - 1)
        token = tkn[:rci] + tkn[rci + 1] + tkn[rci] + tkn[rci + 2:]
    return tkn


def encode_token(in_token, max_length):
    """Right-pad a token with the EOS marker up to the corpus max token length
    and return it wrapped in a single-element list.

    A token that is already max_length or longer is returned without padding.

    :param in_token: input token -- str
    :param max_length: corpus max token length -- int
    :return: encoded token -- list
    """
    eos_padding = EOS * (max_length - len(in_token))
    return [in_token + eos_padding]


def transform(in_tokens, max_length, noise_rate=0.8, shuffle=True):
    """Transform corpus tokens into encoder inputs and decoder targets.  All
    tokens are padded to the length of the largest corpus token.

    For every input token three strings of length ``max_length`` are built:
    a noisy copy padded with EOS (encoder input), the clean token prefixed
    with SOS and padded with EOS (decoder input), and the decoder input
    shifted left by one character (decoder target).

    The caller's token list is never modified; shuffling is done on a copy.

    :param in_tokens: input tokens -- list
    :param max_length: corpus token max length -- int
    :param noise_rate: probability that noise is added to a token -- float
    :param shuffle: shuffle the tokens before transforming -- bool
    :return: encoder tokens, decoder tokens, target tokens
        -- tuple(list, list, list)
    :raises AssertionError: if a token is too long to fit in max_length
    """
    # Initialise encoded, decoded, and target token lists
    encoder_out_tokens, decoder_out_tokens = list(), list()
    target_out_tokens = list()
    # Work on a copy so the in-place shuffle below cannot reorder the list
    # owned by the caller
    tokens = list(in_tokens)
    # If set to True, the corpus tokens will be shuffled from their sorted in
    # alphabetical state, this will help with the model validation split on the
    # training data
    if shuffle:
        print('Shuffling data...')
        np.random.shuffle(tokens)
    # For each token in the corpus...
    for token in tokens:
        # Add noise to the token for the encoder
        encoder_token = _add_noise_to_token(token, noise_rate=noise_rate)
        # Pad encoded token with EOS marker (*) equal to corpus token max
        # length
        encoder_token += EOS * (max_length - len(encoder_token))
        encoder_out_tokens.append(encoder_token)
        # Add SOS marker for decoded token ($) and pad with EOS marker (*)
        decoder_token = SOS + token
        decoder_token += EOS * (max_length - len(decoder_token))
        decoder_out_tokens.append(decoder_token)
        # Remove the SOS token from the decoder to create target token, then
        # pad the space that frees up
        target_token = decoder_token[1:]
        target_token += EOS * (max_length - len(target_token))
        target_out_tokens.append(target_token)
        # Assert all three token variants are equal in length
        assert (len(encoder_token) == len(decoder_token) == len(target_token))
    return encoder_out_tokens, decoder_out_tokens, target_out_tokens

In [27]:
# Transform corpus tokens into encoder, decoder, and target tokens
encoder_tokens, decoder_tokens, target_tokens = transform(corpus_tokens,
                                                          corpus_max_len)

# Generate encoder and decoder unique character sets. The tokens are joined
# with a space first, so the space character is always part of both sets
input_chars = sorted(set(' '.join(encoder_tokens)))
target_chars = sorted(set(' '.join(decoder_tokens)))
# Vocabulary sizes; these set the width of the one-hot vectors
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(target_chars)
# Calculate the max length of encoder and decoder
max_encoder_len = max([len(txt) for txt in encoder_tokens])
max_decoder_len = max([len(txt) for txt in decoder_tokens])
# Generate dictionary mapping for efficient character to index
# encoding/decoding lookups
input_char_to_index = dict(
    [(char, i) for i, char in enumerate(input_chars)])
target_char_to_index = dict(
    [(char, i) for i, char in enumerate(target_chars)])
# Generate the reverse dictionary mapping for efficient index to character
# encoding/decoding lookups
input_index_to_char = dict(
    (i, char) for char, i in input_char_to_index.items())
target_index_to_char = dict(
    (i, char) for char, i in target_char_to_index.items())

print('Number of unique encoder characters:', num_encoder_tokens)
print('Number of unique decoder characters:', num_decoder_tokens)
print('Max token length for encoder:', max_encoder_len)
print('Max token length for decoder:', max_decoder_len)

Shuffling data...
Number of unique encoder characters: 55
Number of unique decoder characters: 30
Max token length for encoder: 23
Max token length for decoder: 23


In [0]:
# One hot encode the encoder, decoder, and target tokens

# Generate numpy arrays of zeros with dimensions
# number_of_tokens x max_token_length x number_of_unique_characters
encoder_one_hot = np.zeros(
    (len(encoder_tokens), max_encoder_len, num_encoder_tokens),
    dtype='float32')
decoder_one_hot = np.zeros(
    (len(encoder_tokens), max_decoder_len, num_decoder_tokens),
    dtype='float32')
target_one_hot = np.zeros(
    (len(encoder_tokens), max_decoder_len, num_decoder_tokens),
    dtype='float32')

# For each token in the encoder and decoder vectors, iterate over each row of
# the zero-d numpy array and convert the corresponding character index to 1.
# This operation has been zipped into one for loop to prevent two iterative
# loops of full corpus length.
for i, (input_text, target_text) in enumerate(
        zip(encoder_tokens, decoder_tokens)):
    # One-hot encode the encoder tokens
    for t, char in enumerate(input_text):
        encoder_one_hot[i, t, input_char_to_index[char]] = 1.
    # Mark any timesteps after the last character as a space; this slice is
    # empty when the token already spans the full encoder length
    encoder_one_hot[i, t + 1:, input_char_to_index[' ']] = 1.
    # One hot encode the decoder tokens
    for t, char in enumerate(target_text):
        decoder_one_hot[i, t, target_char_to_index[char]] = 1.
        if t > 0:
            # The target is the decoder input shifted one timestep to the
            # left, so it does not include the SOS marker at position 0
            target_one_hot[i, t - 1, target_char_to_index[char]] = 1.
    # Mark any timesteps after the last character as a space; for the target
    # the slice starts at t because of the one-step shift
    decoder_one_hot[i, t + 1:, target_char_to_index[' ']] = 1.
    target_one_hot[i, t:, target_char_to_index[' ']] = 1.

## Build & Fit LSTM Model on Training Data

In [29]:
# Define encoder input shape: variable-length sequences of one-hot vectors
# of width num_encoder_tokens
encoder_inputs = Input(shape=(None, num_encoder_tokens))
# Add encoder LSTM layer
encoder = LSTM(HIDDEN_DIM, return_state=True)
# Process input sequence, only the hidden and memory states need to be
# retained; the encoder output itself is discarded
__, hidden_state, memory_state = encoder(encoder_inputs)
# Store the encoder states for later in the model
encoder_states = [hidden_state, memory_state]

# Define decoder input shape
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# Add decoder LSTM layer, sequences are returned and states are returned for
# use in inference
decoder_lstm = LSTM(HIDDEN_DIM, return_sequences=True, return_state=True)
# Define the decoder using the encoder states as the initial state; the
# returned states are not needed during training
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Add dense layer of softmax activation neurons equal to the number of decoder
# tokens
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that takes the encoder and decoder inputs and produces the
# decoder outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])
# Fit the model with a validation split of 10%
model.fit([encoder_one_hot, decoder_one_hot], target_one_hot,
          batch_size=BATCH_SIZE, epochs=N_EPOCHS, validation_split=0.1)

Train on 73832 samples, validate on 8204 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x7f34c6e35da0>

In [0]:
# With the model trained on the corpus, separate encoder and decoder models
# can be defined for character level spelling sequence prediction on unseen
# data. Both reuse the layers trained above

# Extract the encoder using the inputs and states defined previously
encoder_model = Model(encoder_inputs, encoder_states)
# Define decoder hidden and memory state inputs
decoder_state_input_hidden = Input(shape=(HIDDEN_DIM,))
decoder_state_input_memory = Input(shape=(HIDDEN_DIM,))
# Combine decoder input states
decoder_states_inputs = [decoder_state_input_hidden,
                         decoder_state_input_memory]
# Run the trained decoder LSTM with externally supplied initial states
decoder_outputs, hidden_state, memory_state = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
# Set decoder states
decoder_states = [hidden_state, memory_state]
# Apply the same trained dense output layer to the decoder outputs
decoder_outputs = decoder_dense(decoder_outputs)
# The decoder model takes the decoder input plus the two states and returns
# the character probabilities plus the updated states
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

## Decode Model Predictions

In [0]:
def decode_sequence(input_seq, e_model, d_model, d_chars,
                    tgt_index_char_map, max_len=None):
    """Decode an input sequence to get a predicted output sequence.

    Decoding is greedy: at every step the most probable character is taken
    and fed back in as the next decoder input. The returned string includes
    the EOS marker if one was predicted.

    :param input_seq: encoded input sequence -- np.array
    :param e_model: encoder model -- keras model
    :param d_model: decoder model -- keras model
    :param d_chars: number of decoder characters -- int
    :param tgt_index_char_map: target index to character map -- dict
    :param max_len: decoding stops once the decoded string is longer than
        this, defaults to the notebook-level max_decoder_len -- int
    :return: decoded string -- str
    :raises KeyError: if the SOS marker is not in tgt_index_char_map
    """
    if max_len is None:
        max_len = max_decoder_len
    # Find the SOS index from the map that was passed in, so the function
    # stays consistent with its own arguments instead of a notebook global
    sos_index = {char: idx for idx, char in tgt_index_char_map.items()}[SOS]
    # Initialise decoded string response
    decoded_string = ''
    # Get encoded sequence state after going through encoder, used in decoding
    states_value = e_model.predict(input_seq)
    # Generate empty target sequence of dimension 1 x number of decoder chars
    # and set its only character to the start of sequence marker
    target_seq = np.zeros((1, 1, d_chars))
    target_seq[0, 0, sos_index] = 1.
    while True:
        # Predict the next character probabilities from the decoder using the
        # current states (the encoder states on the first step)
        output_tokens, hidden, memory = d_model.predict(
            [target_seq] + states_value)
        # From the character probabilities, take the character index with the
        # highest probability
        predicted_token_index = np.argmax(output_tokens[0, -1, :])
        # Get the corresponding character from the target index to char map
        predicted_char = tgt_index_char_map[predicted_token_index]
        # Add the character to the decoded string
        decoded_string += predicted_char
        # If the predicted character was the end of string marker or the
        # decoder has hit the maximum decoder token length, stop the loop
        if predicted_char == EOS or len(decoded_string) > max_len:
            break
        # Update the target sequence with the predicted character
        target_seq = np.zeros((1, 1, d_chars))
        target_seq[0, 0, predicted_token_index] = 1.
        # Update states
        states_value = [hidden, memory]

    return decoded_string


def predict_spelling(token, e_model, d_model, c_max_len, e_max_len, e_chars,
                     d_chars, input_c_to_i, tgt_i_to_c):
    """Predict the correct spelling of an input string using LSTM model.

    Characters that are not in the encoder vocabulary are dropped before
    encoding, and the padded token is clipped to the encoder length, so
    unexpected input yields a best-effort prediction rather than a KeyError
    or IndexError.

    :param token: input token for error correction -- str
    :param e_model: encoder model -- keras model
    :param d_model: decoder model -- keras model
    :param c_max_len: corpus max length -- int
    :param e_max_len: encoder max length -- int
    :param e_chars: number of encoder characters -- int
    :param d_chars: number of decoder characters -- int
    :param input_c_to_i: input character to index map -- dict
    :param tgt_i_to_c: target index to character map -- dict
    :return: predicted correct spelling -- str
    """
    # Keep only the characters the encoder has an index for
    known_token = ''.join(char for char in token if char in input_c_to_i)
    # Pad with EOS markers, then clip so no position falls outside the array
    padded_token = encode_token(known_token, c_max_len)[0][:e_max_len]
    # Create an empty numpy array of zeros of same dimensions as training data
    encoder_incorrect_data = np.zeros((1, e_max_len, e_chars), dtype='float32')
    # One hot encode the input string
    for position, char in enumerate(padded_token):
        encoder_incorrect_data[0, position, input_c_to_i[char]] = 1.
    # Predict the spelling of the input string
    decoded_sentence = decode_sequence(
        encoder_incorrect_data, e_model, d_model, d_chars, tgt_i_to_c)
    # Remove any EOS markers in the string after decoding
    return decoded_sentence.replace(EOS, '')

In [0]:
def run_spell_checker(incorrect_word_list, e_model, d_model, c_set, c_max_len,
                      e_max_len, e_chars, d_chars, input_c_to_i, tgt_i_to_c,
                      predict_cnt=5):
    """Run the LSTM char-to-char spell checker on a list of incorrectly spelled
    words.

    Each word is predicted once as-is and then ``predict_cnt`` more times with
    random noise added. Only predictions found in the corpus set are kept.

    :param incorrect_word_list: incorrect words -- list
    :param e_model: encoder model -- keras model
    :param d_model: decoder model -- keras model
    :param c_set: clean corpus tokens -- set
    :param c_max_len: corpus max length -- int
    :param e_max_len: encoder max length -- int
    :param e_chars: number of encoder characters -- int
    :param d_chars: number of decoder characters -- int
    :param input_c_to_i: input character to index map -- dict
    :param tgt_i_to_c: target index to character map -- dict
    :param predict_cnt: amount of additional predictions to try -- int
    :return: english word possibilities, a set per incorrect word -- dict
    """
    # Create response dict to hold predicted possibilities of correct spelling
    response = dict()
    # For each of the incorrectly spelled words from the OCR output
    for word in incorrect_word_list:
        candidates = list()
        # Predict the spelling of the incorrect word
        spelling = predict_spelling(
            word, e_model, d_model, c_max_len, e_max_len, e_chars, d_chars,
            input_c_to_i, tgt_i_to_c)
        # Only keep the prediction if it is a known corpus word
        if spelling in c_set:
            candidates.append(spelling)
        # For completeness, predict a range of additional possibilities by
        # adding noise to the incorrect word and predicting again. The padding
        # length comes from the c_max_len argument rather than a notebook
        # global; only the noisy encoder tokens are needed here
        noisy_tokens, _, _ = transform(
            [word] * predict_cnt, c_max_len, shuffle=False)
        for token in noisy_tokens:
            alternative_spelling = predict_spelling(
                token, e_model, d_model, c_max_len, e_max_len, e_chars,
                d_chars, input_c_to_i, tgt_i_to_c)
            # If the noisy alternative prediction is in the corpus keep it
            if alternative_spelling in c_set:
                candidates.append(alternative_spelling)
        # After all predictions are complete, filter out any repeats and
        # predictions which may match the incorrect spelling
        response[word] = set(c for c in candidates if c != word)
    return response

In [67]:
# Run the spell checker over the misspelt OCR words. predict_cnt=50 asks for
# 50 extra predictions per word, each made on a freshly noised copy of it
predicted_spellings = run_spell_checker(
    incorrect_words, encoder_model, decoder_model, corpus_lookup_set,
    corpus_max_len, max_encoder_len, num_encoder_tokens, num_decoder_tokens,
    input_char_to_index, target_index_to_char, predict_cnt=50)
print(predicted_spellings)

{'goinng': {'going'}, 'thnk': {'tank'}, 'dnner': {'dinner', 'den'}}


# Task 3.2 Calculate Levenshtein Distance

In [0]:
def calculate_levenshtein_distance(seq_a, seq_b):
    """Calculate levenshtein distance between two input strings.

    The distance is the minimum number of single character insertions,
    deletions and substitutions needed to turn seq_a into seq_b. It is read
    from a numpy float matrix, so the value returned is a float.

    :param seq_a: input string -- str
    :param seq_b: input string -- str
    :return: distance -- float
    """
    rows, cols = len(seq_a) + 1, len(seq_b) + 1
    # dist[r, c] holds the distance between the first r characters of seq_a
    # and the first c characters of seq_b
    dist = np.zeros((rows, cols))
    # Turning a prefix into the empty string costs one edit per character
    dist[:, 0] = np.arange(rows)
    dist[0, :] = np.arange(cols)
    for r, char_a in enumerate(seq_a, start=1):
        for c, char_b in enumerate(seq_b, start=1):
            # Matching characters can be carried over for free, otherwise a
            # substitution costs one edit
            substitution_cost = 0 if char_a == char_b else 1
            dist[r, c] = min(dist[r - 1, c] + 1,
                             dist[r - 1, c - 1] + substitution_cost,
                             dist[r, c - 1] + 1)
    # The bottom-right cell is the distance between the two full strings
    return dist[rows - 1, cols - 1]


def calculate_shortest_distances(predictions):
    """Score every predicted spelling against its incorrect word and pick the
    closest one.

    Words with no predictions get an empty list in the all-distances dict and
    no entry in the shortest-distance dict. On a tie the first prediction
    encountered wins.

    :param predictions: model predictions -- dict
    :return: short distance info, all distance info -- tuple(dict, dict)
    """
    short_distance, all_distance = dict(), dict()
    for misspelt, candidates in predictions.items():
        # Pair every candidate with its distance from the misspelt word
        scored = [
            (candidate, calculate_levenshtein_distance(misspelt, candidate))
            for candidate in candidates]
        all_distance[misspelt] = scored
        # min() returns the first of several equally close candidates
        if scored:
            short_distance[misspelt] = min(scored, key=lambda pair: pair[1])

    return short_distance, all_distance

In [72]:
# Score every predicted spelling and report the closest one for each misspelt
# word; each reported value is a (predicted word, distance) tuple
short_distances, all_distances = calculate_shortest_distances(
    predicted_spellings)
print('Shortest distance predictions:')
for k, v in short_distances.items():
    print('--"{k}" shortest distance word: {v}'.format(k=k, v=v))

Shortest distance predictions:
--"goinng" shortest distance word: ('going', 1.0)
--"thnk" shortest distance word: ('tank', 1.0)
--"dnner" shortest distance word: ('dinner', 1.0)


# Apply Change to OCR Prediction and Re-Contract

In [0]:
def apply_spelling_change(closest_distances, in_tokens):
    """Apply the closest distance token to the original OCR output sentence
    and re-contract.

    Every token in ``in_tokens`` that has an entry in ``closest_distances`` is
    swapped for its correction; all other tokens are kept. The input list is
    not modified.

    :param closest_distances: shortest distance info, incorrect word mapped
        to a (correction, distance) tuple -- dict
    :param in_tokens: OCR output tokens -- list
    :return: modified re-contracted string, as a single-element list -- list
    """
    # Substitute based on the tokens that were passed in, not on whatever
    # token list happens to exist at notebook level
    response_tokens = [
        closest_distances[token][0] if token in closest_distances else token
        for token in in_tokens]
    correct_output = ' '.join(response_tokens)
    return list((py_cont.contract_texts([correct_output])))

In [74]:
# Apply the closest corrections to the OCR tokens and re-contract. The result
# is a list holding one sentence string, not a list of tokens
corrected_ocr_tokens = apply_spelling_change(short_distances, ocr_tokens)
print('Original OCR Output: {o}'.format(o=ocr_tokens))
print('Error module corrected output: {o}'.format(o=corrected_ocr_tokens))

Original OCR Output: ['we', 'are', 'goinng', 'to', 'the', 'zoo', 'and', 'I', 'do', 'not', 'thnk', 'I', 'will', 'be', 'home', 'for', 'dnner']
Error module corrected output: ["we're going to the zoo and I don't tank I'll be home for dinner"]
