## Spell Checker
*by Mohammad Akbar*

In order to check spelling we need a dictionary.<br/>
For this program we will be using the dictionary `words.words()` from the `nltk` (natural language tool kit) module.

In [1]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import words as words

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akbar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Now we import the regex package `re`.

In [2]:
import re

We will use `sortedcontainers` to improve performance.

In [3]:
from sortedcontainers import SortedSet,SortedList

Unfortunately, `wordnet` does **NOT** include:<br/> `determiners`, `prepositions`, `pronouns`, `conjunctions`, `particles`, `auxiliary verbs`.<br/>
Lets add these to our dictionary manually

In [4]:
ACCEPTED = SortedSet([])
notACCEPTED = SortedSet([])
CUSTOMDICT = SortedSet([])
ALLWORDS = []
import os

def genCustom():
    filenms = [name for name in os.listdir("./hardcode") if name.endswith(".txt")]
    for filenm in filenms:
        with open("./hardcode/"+filenm,'r') as file:
            for line in file:
                word = "".join(line.split())
                if not wn.synsets(word,'asrnv'):
                    ACCEPTED.add(word.lower())
    f = open("./hardcode/custom_dict.txt", "w")
    for word in ACCEPTED:
        f.write(word+"\n")
    f.close()

def readCustom():
    with open("./hardcode/custom_dict.txt",'r') as file:
        for line in file:
            word = "".join(line.split())
            CUSTOMDICT.add(word.lower())

def lookUp(word):
    if word in CUSTOMDICT or wn.synsets(word,'asrnv'):
        return True
    return False

readCustom()
print(len(CUSTOMDICT))

154


Time to start parsing our file!

In [5]:
pattern = re.compile(r"([\w\-\\']*[a-zA-Z]+[\w\-\']*)") # regex for words with atleast 1 a-zA-Z
with open("mobydick.txt") as file:                         # open input file
    for count , line in enumerate(file):                      # foreach line
        for match in re.finditer(pattern, line):                 # foreach word in line
            word = line[match.start():match.end()].lower()          # words found in line, forced lowercase
            if word in ACCEPTED or word in notACCEPTED:             # if word already memoized
                continue                                               # go to next word
            if lookUp(word):                                        # if word in wordnet, 'asrnv' means nouns,verbs,... 
                ACCEPTED.add(word)                                     # memoize as ACCEPTED
            else:                                                   # if word NOT in wordnet
                notACCEPTED.add(word)                                  # memoize as notACCEPTED

Great! We have our file parsed. However, there are some false negatives in `notACCEPTED`.<br/>
Lets account for words ending with `'s` or `s'`

In [6]:
def goodApostrophe(word):
    word_no_apst = re.sub("\'s$|s\'$",'',word)
    if word == word_no_apst:
        return False
    elif word_no_apst in ACCEPTED or lookUp(word):
        return True
    else:
        return False

In [7]:
APOSTROPHES = SortedSet([])
for word in notACCEPTED:
    if goodApostrophe(word):
        APOSTROPHES.add(word)

ACCEPT = ACCEPTED.union(APOSTROPHES)
notACCEPTED = notACCEPTED.difference(APOSTROPHES)

In [8]:
from IPython.display import display, Markdown, Latex
display(Markdown("**"
                 + format(len(APOSTROPHES), ',d')
                 + "** words found in dictionary, when `'s` or `s'` was removed"
                ))

**155** words found in dictionary, when `'s` or `s'` was removed

We've go as far as we can with dictionaries, but there are still more words to recognize.<br/>
Lets include compound words next `compound words` example: *gallant-cross-tree*

In [9]:
COMPOUNDWORDS = SortedSet([])
pattern_compound = re.compile(r"([^\-\s]+)")
for word in notACCEPTED:
    accept_compound = True
    roots = re.findall(pattern_compound, word)
    for r , root in enumerate(roots):
        if root in ACCEPTED or lookUp(root) or goodApostrophe(root):
            continue
        else:
            accept_compound = False
            break
    if word.startswith('-') or word.endswith('-'):
        accept_compound = False
    if accept_compound:
        COMPOUNDWORDS.add(word)
print(str(len(COMPOUNDWORDS)) + " compound words found!")

637 compound words found!


In [10]:
ACCEPT = ACCEPTED.union(COMPOUNDWORDS)
notACCEPTED = notACCEPTED.difference(COMPOUNDWORDS)

In [11]:
from IPython.display import display, Markdown, Latex
display(Markdown( "**" 
      + format(len(ACCEPTED), ',d')
      + "** (*correctly spelled*) + **"
      + format(len(notACCEPTED), ',d')
      + "** (*NOT in dictionary*) = **" 
      + format(len(ACCEPTED)+len(notACCEPTED), ',d')
      + "** (*total unique words*)<br/>**"
      + '{0:.2%}'.format(float(len(ACCEPTED))/float(len(ACCEPTED)+len(notACCEPTED))) 
      + "** *correctly spelled*"))

**11,188** (*correctly spelled*) + **1,748** (*NOT in dictionary*) = **12,936** (*total unique words*)<br/>**86.49%** *correctly spelled*

Lets at what we have so far

In [12]:
print(str(len(notACCEPTED))+ " words not 'yet' accepted")

1748 words not 'yet' accepted


In [13]:
from spellchecker import SpellChecker

In [126]:
def closestQuerty():
    import numpy as np
    from tabulate import tabulate
    quertyr = [['1','2','3','4','5','6','7','8','9','0','-'],
               ['Q','W','E','R','T','Y','U','I','O','P',''],
               ['A','S','D','F','G','H','J','K','L',';',''],
               ['Z','X','C','V','B','N','M',',','.','' ,'']]
    
    querty = ['1','2','3','4','5','6','7','8','9','0','-'
              ,'Q','W','E','R','T','Y','U','I','O','P'
              ,'A','S','D','F','G','H','J','K','L',';','\''
              ,'Z','X','C','V','B','N','M',',','.']
    n = len(querty)
    adjMat = np.full((n, n), np.inf)
    for i in range(n):
        adjMat[i][i]=0
    for i , row in enumerate(quertyr):
        for j , letter in enumerate(row):
            if letter is '':
                continue
            current_letter_idx = querty.index(letter)
            unvisited = querty.copy().remove(letter)
            visited = [letter]
            if(i>0):
                up_letter = quertyr[i-1][j]
                if up_letter != '':
                    up_letter_idx = querty.index(up_letter)
                    adjMat[current_letter_idx][up_letter_idx] = 1
                    adjMat[up_letter_idx][current_letter_idx] = 1
            if(i<3):
                up_letter = quertyr[i+1][j]
                if up_letter != '':
                    up_letter_idx = querty.index(up_letter)
                    adjMat[current_letter_idx][up_letter_idx] = 1
                    adjMat[up_letter_idx][current_letter_idx] = 1
            if(j>0):
                up_letter = quertyr[i][j-1]
                if up_letter != '':
                    up_letter_idx = querty.index(up_letter)
                    adjMat[current_letter_idx][up_letter_idx] = 1
                    adjMat[up_letter_idx][current_letter_idx] = 1
            if(j<10):
                up_letter = quertyr[i][j+1]
                if up_letter != '':
                    up_letter_idx = querty.index(up_letter)
                    adjMat[current_letter_idx][up_letter_idx] = 1
                    adjMat[up_letter_idx][current_letter_idx] = 1
    from scipy.sparse import csr_matrix
    from scipy.sparse.csgraph import floyd_warshall
    dist_matrix, predecessors = floyd_warshall(csgraph=adjMat, directed=False, return_predecessors=True)
    return dist_matrix
quertydist = closestQuerty()

1 0
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
0 9
- 10
Q 11
W 12
E 13
R 14
T 15
Y 16
U 17
I 18
O 19
P 20
A 21
S 22
D 23
F 24
G 25
H 26
J 27
K 28
L 29
; 30
Z 32
X 33
C 34
V 35
B 36
N 37
M 38
, 39
. 40
[[ 0.  1.  2. ...  9. 10. 11.]
 [ 1.  0.  1. ...  8.  9. 10.]
 [ 2.  1.  0. ...  7.  8.  9.]
 ...
 [ 9.  8.  7. ...  0.  1.  2.]
 [10.  9.  8. ...  1.  0.  1.]
 [11. 10.  9. ...  2.  1.  0.]]


In [222]:
def getQueryQuess(c,i):
    import numpy as np
    querty = ['1','2','3','4','5','6','7','8','9','0','-'
              ,'Q','W','E','R','T','Y','U','I','O','P'
              ,'A','S','D','F','G','H','J','K','L',';','\''
              ,'Z','X','C','V','B','N','M',',','.']
    index = querty.index(c)
    idxs = np.argwhere(quertydist[index][:]==i)
    guesses = []
    for idx in idxs:
        guesses.append(querty[idx[0]])
    return guesses

import operator
nltk.download('punkt')
words = []
with open("mobydick.txt") as file:                         # open input file
    for line in file:
        words += nltk.tokenize.word_tokenize(line.lower())

fdist1 = nltk.FreqDist(words)
fqdict = dict((word, freq) for word, freq in fdist1.items())
nltk.download('brown')
from nltk.corpus import brown
freqs = nltk.FreqDist(w.lower() for w in brown.words())


nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
allWordExceptStopDist = nltk.FreqDist(w.lower() for w in brown.words() if w not in stopwords) 
mostCommon= allWordDist.most_common(10).keys()

def closest(word):
    print(word, end =" : ")
    alt_words = []
    for i, c  in enumerate(word.upper()):
        for j in range(15):
            guesses = getQueryQuess(c,j)
            for guess in guesses:
                alt_word = word[:i] + guess + word[i+1:]
                if lookUp(alt_word.lower()):
                    alt_words.append(alt_word.lower())
                    if alt_word.lower() in freqs:
                        print(freqs[alt_word.lower()])
                    else:
                        print(0)
    return alt_words
        
closest("Whas")

SyntaxError: invalid syntax (<ipython-input-222-95fd543b5990>, line 44)

In [193]:
nltk.download('punkt')
words = []
with open("mobydick.txt") as file:                         # open input file
    for line in file:
        words += nltk.tokenize.word_tokenize(line.lower())

fdist1 = nltk.FreqDist(words)

filtered_word_freq = dict((word, freq) for word, freq in fdist1.items() if word.isdigit())

print(filtered_word_freq)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akbar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


