In [5]:
import re
import os

In [6]:
from collections import Counter

In [7]:
def words(document):
    """Tokenize ``document`` into lower-case words.

    The text is lower-cased first, then every maximal run of ``\\w``
    characters (letters, digits, underscore) becomes one token.
    Returns the tokens as a list, in order of appearance.
    """
    lowered = document.lower()
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(lowered)

In [9]:
# Directory the notebook runs from, with Windows back-slashes normalised.
path = os.getcwd().replace("\\", "/")
# Read the corpus inside a context manager so the file handle is closed
# as soon as the text has been read, rather than whenever the garbage
# collector happens to reclaim it.
with open(path + '/big.txt') as corpus_file:
    # Frequency table: token -> number of occurrences in big.txt.
    all_words = Counter(words(corpus_file.read()))

In [10]:
# Sanity check: how many times the token 'chair' occurs in the corpus counts.
all_words['chair']

135

In [21]:
# The ten most frequent tokens in the corpus, as (token, count) pairs.
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [22]:
def edits_one(word):
    """Return the set of strings exactly one simple edit away from ``word``.

    A simple edit is one of:
      * deleting one character,
      * inserting one lower-case ASCII letter at any position,
      * replacing one character with a lower-case ASCII letter,
      * swapping two adjacent characters.

    The result may contain ``word`` itself (e.g. replacing a letter with
    the same letter) and, for a one-character word, the empty string.
    """
    alphabets = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    # Walk over every cut point, including the one after the last character.
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # Insertion is possible at every cut point, even the final one.
        for letter in alphabets:
            candidates.add(head + letter + tail)

        # The remaining edits all consume at least one character of the tail.
        if not tail:
            continue

        # Deletion: drop the first character of the tail.
        candidates.add(head + tail[1:])

        # Replacement: substitute the first character of the tail.
        for letter in alphabets:
            candidates.add(head + letter + tail[1:])

        # Transposition: swap the first two characters of the tail.
        if len(tail) > 1:
            candidates.add(head + tail[1] + tail[0] + tail[2:])
    return candidates

In [23]:
def edits_two(words):
    """Return the set of strings that are two simple edits away.

    ``words`` is a single word (a string), despite the plural name; the
    parameter name is kept as-is so the signature does not change.

    Every one-edit variant produced by ``edits_one`` is itself expanded
    with ``edits_one`` and all results are merged into one set.
    """
    # The body previously read an undefined name ``word`` instead of the
    # parameter, which raised NameError (or silently picked up an unrelated
    # global left over in the kernel). Use the parameter itself.
    return {
        second
        for first in edits_one(words)
        for second in edits_one(first)
    }

In [24]:
def known(words):
    """Return, as a set, the members of ``words`` that appear in ``all_words``.

    ``words`` may be any iterable of strings; duplicates collapse because
    the result is a set.
    """
    return {candidate for candidate in words if candidate in all_words}

In [25]:
def possible_corrections(word):
    """Generate candidate spelling corrections for ``word``.

    Candidates are tried in order of increasing edit distance and the
    first non-empty group wins:
      1. ``word`` itself, if it is a known word;
      2. known words one edit away;
      3. known words two edits away;
      4. otherwise the one-element list ``[word]``.

    Cases 1-3 return a set; the fallback returns a list.
    """
    exact = known([word])
    if exact:
        return exact

    # Only build the one-edit set when the word itself is unknown.
    near = known(edits_one(word))
    if near:
        return near

    # The two-edit expansion is the most expensive step, so it comes last.
    far = known(edits_two(word))
    if far:
        return far

    return [word]

In [26]:
def prob(word, N=sum(all_words.values())):
    """Relative frequency of ``word`` in the corpus.

    Computed as the number of occurrences of ``word`` in ``all_words``
    divided by ``N``. The default for ``N`` is the total token count,
    evaluated once when this function is defined; re-run this cell if
    ``all_words`` is rebuilt.
    """
    occurrences = all_words[word]
    return occurrences / N

In [28]:
# Demo on the misspelling "monney": number of distinct one-edit candidates,
# followed by the subset of those candidates that occur in the corpus.
print(len(set(edits_one("monney"))))
print(known(edits_one("monney")))

336
{'money', 'monkey'}


In [29]:
# "monney" is not a known word, so the known one-edit candidates are returned.
print(possible_corrections("monney"))

{'money', 'monkey'}


In [30]:
# Compare the corpus frequencies of the two candidates for "monney".
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [37]:
def spell_check(word):
    """Return a message with the most probable spelling correction for ``word``.

    The candidate from ``possible_corrections(word)`` with the highest
    ``prob`` is chosen. If that candidate is ``word`` itself the result is
    "Correct Spelling."; otherwise it is "Did you mean <candidate>?".

    Note: when no known word is found within two edits, the only candidate
    is ``word`` itself, so such a word is also reported as correct.
    """
    candidates = possible_corrections(word)
    best_guess = max(candidates, key = prob)
    if best_guess == word:
        return "Correct Spelling."
    return "Did you mean " + best_guess + "?"

In [47]:
# A word that is already known needs no correction.
print(spell_check("assume"))

Correct Spelling.
