Things left to do:
- Improve word tokenization precision.
- Try to parallelize the process.
- Finish Tokenizer class.
- Try to make the process more efficient: 
    - Only calculate distance between misspelled word and a subset of the dictionary
    - Use NumPy for the Levenshtein distance formula


Guillem Amat, ga98\
Sebastián Soriano Pérez, ss1072

# Project 1: Basic Spelling Correction

Build a spelling corrector from scratch in Python. It should include:

1. tokenization
2. edit distance-based non-word spelling correction
3. de-tokenization

## Importing Data

In [1]:
'''Importing packages'''
import re
import string
#from nltk.corpus import words
import numpy as np
import time

In [2]:
'''Loading test text file'''
# Sets file path (relative to the working directory)
text_path = 'austen-sense-corrupted.txt'

# Opens file and stores its full contents in corrupted_text.
# The with-statement closes the file on exit, so no explicit close() is needed.
with open(text_path, 'r', encoding = 'utf-8') as file:
    corrupted_text = file.read()

In [3]:
'''Loading English dictionary'''
# Sets file path (relative to the working directory)
dictionary_path = 'dict.txt'

# Opens file; the with-statement closes it on exit, so no explicit close() is needed
with open(dictionary_path, 'r', encoding = 'utf-8') as file:
    dictionary_string = file.read()

# Creates list of words in the dictionary file (one word per line).
# Empty entries (e.g. produced by a trailing newline at the end of the file) are
# dropped so the empty string is never offered as a correction candidate.
dictionary = [word for word in dictionary_string.split('\n') if word]

## Tokenizer

**Defining Functions**

In [4]:
#Improve the regex when time allows
def tokenize(text: str) -> list:
    '''
    Returns the list of word tokens found in text, in order of appearance.
    A token is a run of ASCII letters, optionally followed by an apostrophe
    and/or a run of lowercase letters (e.g. "dog's", "isn't").
    The input is converted with str() first, so non-string values are accepted.
    '''
    word_pattern = re.compile(r"[A-Za-z]+(?:'?[a-z]*)")
    return word_pattern.findall(str(text))

In [5]:
def wrong_words(tokens: list, dictionary: list) -> set:
    '''
    Returns the set of lowercased tokens that are not found in the dictionary.
    A token counts as known when either its lowercased form or its original
    form appears in dictionary.
    '''
    # Membership tests on a list scan the whole dictionary for every token;
    # hashing the words once makes each lookup constant-time.
    if isinstance(dictionary, (set, frozenset)):
        known_words = dictionary
    else:
        known_words = set(dictionary)

    return {word.lower()
            for word in tokens
            if word.lower() not in known_words and word not in known_words}

## Levenshtein Distance

**Defining Functions**

In [6]:
def get_distance(A: str, B: str) -> int:
    '''
    Returns the Levenshtein edit distance between strings A and B. Output is an integer.
    Insertions, deletions and substitutions each cost 1; matching characters cost 0.
    '''
    # Identical strings need no edits
    if A == B:
        return 0

    # The distance is symmetric, so the shorter string is kept along the row
    # to minimise the size of the two rows held in memory.
    if len(A) < len(B):
        A, B = B, A

    # previous[j] holds the distance between the first i - 1 characters of A and
    # the first j characters of B. Only the previous row is needed to fill the
    # current one, so the full matrix is never stored.
    previous = list(range(len(B) + 1))

    for i, char_a in enumerate(A, start=1):
        # Column 0: distance between the first i characters of A and an empty string
        current = [i]
        for j, char_b in enumerate(B, start=1):
            ins = current[j - 1] + 1                              # insertion adds 1
            dlt = previous[j] + 1                                 # deletion adds 1
            sub = previous[j - 1] + (0 if char_a == char_b else 1)  # match adds 0, mismatch adds 1
            current.append(min(ins, dlt, sub))
        previous = current

    # Last cell of the last row is the distance between the full strings
    return previous[-1]

In [7]:
def correct_spelling(W: str, D: list) -> str:
    '''
    Returns the correct spelling of a word (a string).
    Uses get_distance() to compute distance between string W and each word in a dictionary D (list of strings).
    Returns the word in the dictionary D with the minimum distance to W (first appearance).
    If D is empty there is nothing to correct against, so W is returned unchanged.
    '''
    min_distance = float('inf')
    min_word     = W  # fallback when D has no candidates

    # Loops through each word d in D to compute get_distance(W, d)
    for d in D:
        distance = get_distance(W, d)

        # Strict comparison keeps the first word on ties
        if distance < min_distance:
            min_distance = distance
            min_word     = d

            # An exact match cannot be improved on, so the search can stop
            if distance == 0:
                break

    return min_word

In [8]:
def correct_spelling_2(W, D):
    '''
    Returns the correct spelling of a word (a string).
    Uses get_distance() to compute distance between string W and each word in a dictionary D (list of strings).
    Returns the word in the dictionary D with the minimum distance to W (first appearance).
    If D is empty there is nothing to correct against, so W is returned unchanged.
    '''
    # min() raises on an empty sequence, so the no-candidate case is handled first
    if len(D) == 0:
        return W

    # Computes the distance between W and each word d in D
    distances = [get_distance(W, d) for d in D]

    # list.index() returns the first position holding the minimum distance
    return D[distances.index(min(distances))]

In [9]:
def correct_list(L, D):
    '''
    Returns a list of strings corrected with correct_spelling(),
    after checking if the word exists in the dictionary D.
    Words found in D are kept as they are; the output has the same length and order as L.
    '''
    # Hashes the dictionary once so each membership test is constant-time
    # instead of a scan over the whole list
    known_words = set(D)

    # Remembers corrections so a misspelling that appears several times in L
    # only goes through the (slow) dictionary search once
    corrections = {}

    correct_L = []
    for word in L:
        if word in known_words:
            correct_L.append(word)
            continue

        if word not in corrections:
            corrections[word] = correct_spelling(word, D)
        correct_L.append(corrections[word])

    return correct_L

**Testing Performance**

In [10]:
'''Testing get_distance with a few use cases'''
# Identical strings need no edits
assert get_distance('level', 'level') == 0
# 'level' -> 'leaven': one insertion ('a') plus one substitution ('l' -> 'n')
assert get_distance('level', 'leaven') == 2

In [11]:
'''Testing correct_spelling() function'''
# Measures time it takes to run.
# perf_counter() is a monotonic, high-resolution clock meant for timing intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
print(correct_spelling('adsad', dictionary))
t1 = time.perf_counter()
print('time elapsed:', t1 - t0)

adad
time elapsed: 9.39098596572876


In [12]:
'''Testing correct_spelling_2() function'''
# Measures time it takes to run.
# perf_counter() is a monotonic, high-resolution clock meant for timing intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
print(correct_spelling_2('adsad', dictionary))
t1 = time.perf_counter()
print('time elapsed:', t1 - t0)

adad
time elapsed: 9.340983152389526


In [14]:
'''Testing correct_list()'''
# Creates sample list of words; some are misspelled and some are already correct
misspelled_words_test = ['estres', 'think', 'panicok', 'neturral', 'probability', 'millom', 'he', 'ittt']

# Measures time it takes to run.
# perf_counter() is a monotonic, high-resolution clock meant for timing intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
t0 = time.perf_counter()
print(correct_list(misspelled_words_test, dictionary))
t1 = time.perf_counter()
print('time_elapsed:', t1 - t0)

['estre', 'think', 'panic', 'demurral', 'probability', 'billon', 'he', 'Atta']
time_elapsed: 57.1741669178009


## Untokenize

**Defining Functions**

In [15]:
def correct_text(text, misspelled_words, dictionary):
    '''
    Receives text (a string), a collection of misspelled words found in that text, and a dictionary (list of strings).
    Returns the corrected text (a string), where the misspelled words are replaced by the words
    corrected by correct_list().
    Only whole-word occurrences are replaced, and matching is case-sensitive.
    '''
    # Materialises the words once so the correction pass and the replacement pass
    # see them in the same order (also allows sets and one-shot iterables)
    misspelled_words = list(misspelled_words)

    # Creates list of corrected words from misspelled_words
    corrected_words = correct_list(misspelled_words, dictionary)

    # NOTE(review): wrong_words() in this file lowercases its output, so capitalised
    # occurrences in text will not match here - confirm whether that is intended.

    # Loops through the list of misspelled words to replace each appearance in text with its corrected counterpart
    corrected_text = text
    for misspelled_word, corrected_word in zip(misspelled_words, corrected_words):
        # re.escape() makes the word match literally even if it contains regex metacharacters
        pattern = r'\b%s\b' % re.escape(misspelled_word)

        # A function replacement inserts the corrected word verbatim, with no
        # backslash or group-reference expansion
        corrected_text = re.sub(pattern,
                                lambda _match, word=corrected_word: word,
                                corrected_text)

    return corrected_text

## Spelling Correction Pipeline

In [16]:
'''Implements previously defined functions to correct the spelling on sample text'''
# Creates shorter sample text (the first 2500 characters) due to time constraints
sample_text = corrupted_text[:2500]

# Tokenizes sample text
list_of_words = tokenize(sample_text)

# Generates the unique misspelled words found in sample text.
# wrong_words() returns them lowercased, as a set.
misspelled_words = wrong_words(list_of_words, dictionary)

# Corrects the misspelled words in the sample text
corrected_text = correct_text(sample_text, misspelled_words, dictionary)

In [17]:
'''Compares original sample text with corrected sample text'''
# Prints the uncorrected sample first, followed by the corrected version
print(sample_text)
print(corrected_text)

[Sense and Sensibility by Jane Austen 1811]

CHAPOTER 1


The family of Dashwood had long been settled i Sussex.
Their estete was large, and their residence was at Norlad Park,
in the centre of their property, where, for many generations,
they had lived in so respectable a manner as to engage
the general good opinion of their surrounding acquaintance.
The late owner of thfs estat was a single man, who lived
to a very advanced age, and who for many years of hijs life,
had a constant companion nd housekeeper in his sister.
But her death, which happened ten ryears beore his own,
produced a great alteration in his home; fuor gto supply
her lodss, he invited and eceivepd into his house the family
of his nephew Mr. Henry Dashwood, the legal inheritkr
of the Norland estate, and te lperqson to wsom he intended
to bequeath it.  In the society o his nephew and niece,
and theoir childrn, the old Gentaeman's das were
comfortably spent.  His attacsment to them all increased.
The consmant attention 

<br>

## Resources

**Potential list of steps to speed up the process**

- Use a dictionary compiled from the book.
- Check if a word is correctly spelled. Run through the whole dictionary to see if it exists.
- Investigate whether Spark is a possibility.
- Assume that the first two letters are correct.
- Filter for misspelled words.
- Ignore any word that starts with a capital letter.

**Resources**

- *Tokenization:* https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence, https://medium.com/analytics-vidhya/tokenization-building-a-tokenizer-and-a-sentencizer-c19a00393c19
- *Levenshtein Distance:* https://www.python-course.eu/levenshtein_distance.php, https://stackabuse.com/levenshtein-distance-and-text-similarity-in-python/, https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python,
https://blog.paperspace.com/implementing-levenshtein-distance-word-autocomplete-autocorrect/

<br>

## Appendix

**A) Advanced Tokenizer**

Given more time, we would have liked to implement an *Advanced Tokenizer* class. The class would establish a hierarchical structure composed of paragraphs, sentences and words (tokens). It would allow the user to identify tokens with higher precision, perform modifications on them and, more importantly, revert to the text in its original form. The code and the tests below show our work on the class.

In [None]:
class tokenizer:
    '''
    Work-in-progress tokenizer that moves between text, paragraphs,
    sentences and tokens.

    sentencize() / _desentencize() work on self.text and produce one list of
    sentences per paragraph. tokenize() / detokenize() work on a flat list of
    sentence strings in self.sentences (re.split() is applied to each element).
    '''
    def __init__(self, text = None, sentences = None):
        # text is stored as a string unless it was omitted
        self.text = None if text is None else str(text)
        # A fresh list is created when no sentences are given
        self.sentences = [] if sentences is None else sentences
        self.paragraphs = []
        self.tokens = []

    def sentencize(self):
        '''
        Splits self.text into paragraphs and each paragraph into sentences.
        Returns a list with one list of sentences per paragraph.
        '''
        # Paragraphs are separated by two or more consecutive newlines;
        # remaining single newlines inside a paragraph become spaces
        raw_paragraphs = re.split(r'(?:\n){2,}', self.text)
        self.paragraphs = [chunk.replace('\n', ' ') for chunk in raw_paragraphs]

        # A sentence boundary is whitespace that follows '.' or '?', unless one
        # of the negative lookbehinds rules it out
        boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z][a-z]{2}\.)(?<=\.|\?)\s'
        self.sentences = [re.split(boundary, chunk) for chunk in self.paragraphs]

        return self.sentences

    def _desentencize(self):
        '''
        Joins the sentences of each paragraph with spaces and the paragraphs
        with single newlines. Returns a one-element list holding the text.
        '''
        joined = []
        for sentence_group in self.sentences:
            joined.append(' '.join(sentence_group))

        self._paragraphs = joined
        self._text = ['\n'.join(joined)]
        return self._text

    def tokenize(self):
        '''
        Splits every sentence in self.sentences on single whitespace characters.
        Returns one list of tokens per sentence.
        '''
        split_sentences = []
        for sentence in self.sentences:
            split_sentences.append(re.split(r'\s', sentence))

        self.tokens = split_sentences
        return self.tokens

    def detokenize(self):
        '''
        Joins each list of tokens in self.tokens back into a sentence.
        Tokens that start with an apostrophe or are found in string.punctuation
        are attached to the previous token; every other token gets a leading space.
        '''
        rebuilt = []
        for token_group in self.tokens:
            pieces = []
            for token in token_group:
                attaches_to_previous = token.startswith("'") or token in string.punctuation
                pieces.append(token if attaches_to_previous else " " + token)
            # strip() removes the space added in front of the first token
            rebuilt.append("".join(pieces).strip())

        self.sentences = rebuilt
        return self.sentences

<br>

**Tests: Class**

In [None]:
# Sample objects to test

In [None]:
# Sample text with three paragraphs; each '\n' escape followed by a literal
# line break yields the blank line that separates two paragraphs
text = '''I worked on my NLP Assignment today. This is another sentence.\n
I tested many methods and models.\n
I think I might have succeeded.
'''

In [None]:
# Flat list of sentence strings used below to test tokenize() and detokenize()
list_of_sentences =  ['Hello my friend, how are you?', 'How is it going?', 'It is 12.30pm', 'Morning']

<br>

In [290]:
# Moving from text to sentences and paragraphs and back

In [None]:
# Builds a tokenizer from raw text (no sentences supplied)
sentences = tokenizer(text = text)

In [None]:
# Splits the text into paragraphs and each paragraph into sentences (a list of lists)
sentences.sentencize()

In [None]:
# _desentencize() returns a one-element list; index 0 is the rejoined text
print(sentences._desentencize()[0])

<br>

In [291]:
# Moving from sentences to words and back

In [None]:
# Builds a tokenizer directly from a flat list of sentences (no text supplied)
words = tokenizer(sentences = list_of_sentences)

In [None]:
# Splits each sentence on single whitespace characters into a list of tokens
words.tokenize()

In [None]:
# Rejoins each list of tokens into a sentence string
words.detokenize()