In [119]:
import re
from collections import Counter
from pathlib import Path

In [120]:
# function to tokenise words
def words(document):
    "Convert text to lower case and tokenise the document"
    return re.findall(r'\w+', document.lower())

In [121]:
folder = Path("C:/Upgrad Projects/NLP-M1")

In [122]:
# create a frequency table of all the words of the document
all_words = Counter(words(open(folder/'big.txt').read()))

In [123]:
# check frequency of a random word, say, 'chair'
all_words['chair']

135

In [124]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [125]:
def edits_one(word):
    "Create all edits that are one edit away from `word`."
    alphabets    = 'abcdefghijklmnopqrstuvwxyz'
#   print([i for i in range(len(word) + 1)])
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
#     print('splits')
#     print(splits)

#     dels = []
#     for left, right in splits:
#         print('left = {}, right = {}'.format(left, right))
#         if right:
#               dels.append(left + right[1:])
#               print(dels)  
    
    deletes    = [left + right[1:]                       for left, right in splits if right]
#     print('deletes')
#     print(deletes)
#     print('dels')
#     print(dels)
              

    inserts    = [left + c + right                       for left, right in splits for c in alphabets]
#     print('inserts')
#     print(inserts)
    replaces   = [left + c + right[1:]                   for left, right in splits if right for c in alphabets]
#     print('--- Inside Replaces ---')
#     print(replaces)
#     print('--- Inside Transposes ---')
    trans = []
#     for left, right in splits:
#         print('outside left = {}, right = {}'.format(left, right))
#         if len(right)>1:
#             print('inside left = {}, right = {}'.format(left, right))
#             trans.append(left + right[1] + right[0] + right[2:])
#             print(trans)
#     print('trans')
#     print(trans)        
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1]
#     print('transposes')
#     print(transposes)
    return set(deletes + inserts + replaces + transposes)

In [126]:
def edits_two(word):
    "Create all edits that are two edits away from `word`."
    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))

In [127]:
def known(words):
    "The subset of `words` that appear in the `all_words`."
    return set(word for word in words if word in all_words)

In [128]:
def possible_corrections(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])

In [129]:
sum(all_words.values())

1115585

In [130]:
def prob(word, N=sum(all_words.values())): 
    "Probability of `word`: Number of appearances of 'word' / total number of tokens"
    return all_words[word] / N

In [131]:
print(len(set(edits_one("monney"))))
print(edits_one("monney"))

336
{'myonney', 'nmonney', 'menney', 'monned', 'mojnney', 'mkonney', 'monnefy', 'dmonney', 'monngey', 'money', 'mohnney', 'monneyd', 'monneyx', 'conney', 'mgonney', 'monvey', 'monnedy', 'jonney', 'monnay', 'monntey', 'monnei', 'montey', 'gonney', 'monnqey', 'monneyb', 'mgnney', 'monwney', 'monneyh', 'mohney', 'monnehy', 'mlonney', 'monhey', 'monhney', 'monneya', 'monqney', 'mooney', 'vmonney', 'mogney', 'moanney', 'monnep', 'monnny', 'monnec', 'monnyy', 'mondney', 'monnkey', 'aonney', 'monnee', 'monnen', 'omonney', 'monnyey', 'monnepy', 'monneyi', 'mqonney', 'hmonney', 'mmonney', 'lmonney', 'mobney', 'monniy', 'kmonney', 'monneyz', 'xonney', 'monnfey', 'monnzy', 'monnuy', 'mdonney', 'monnea', 'monsney', 'ionney', 'monndy', 'monrney', 'mcnney', 'monneny', 'monnek', 'maonney', 'mvnney', 'mponney', 'mwonney', 'monkney', 'msonney', 'monneyr', 'ronney', 'mdnney', 'monnley', 'mnoney', 'monnecy', 'mongney', 'mwnney', 'momnney', 'moniey', 'mofney', 'monneu', 'monneyc', 'umonney', 'moncey', 'mo

In [132]:
print(known(edits_one("monney")))

{'monkey', 'money'}


In [133]:
# Let's look at words that are two edits away
print(len(set(edits_two("monney"))))
print(known(edits_two("monney")))

51013
{'convey', 'bonny', 'morley', 'donne', 'motley', 'monkey', 'money', 'monkeys', 'honey', 'manned', 'moines', 'bonne', 'manner', 'donkey', 'bonnet', 'monday', 'moaned', 'olney', 'monger', 'donned', 'tonne'}


In [134]:
# Let's look at possible corrections of a word
print(possible_corrections("monney"))

{'monkey', 'money'}


In [135]:
# Let's look at probability of a word
print(prob("money"))
print(prob("monkey"))

0.0002922233626303688
5.378344097491451e-06


In [136]:
def spell_check(word):
    "Print the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    else:
        return "Correct spelling."

In [137]:
# test spell check
print(spell_check("monney"))

Did you mean money?


In [138]:
['monney']

['monney']

In [139]:
print({'monkey', 'money'} or {'monkey', 'money'} or ['monney'])

{'monkey', 'money'}


In [140]:
print({'monkey', 'money'} or ['monney'])

{'monkey', 'money'}


In [141]:
print({} or ['monney'])

['monney']


In [142]:
# test spell check
print(spell_check("wealttthiiii"))

Correct spelling.


In [143]:
print(possible_corrections("wealttthiiii"))

['wealttthiiii']


In [144]:
all_words['wealttthiiii']

0

In [145]:
print(len(edits_one('emfasize')))

442


In [146]:
type(edits_two("emfasize"))


generator

In [147]:
print(len(set(edits_two("emfasize"))))

90902


In [148]:
print(possible_corrections("emfasize"))

{'emphasize'}


In [150]:
import math

In [152]:
0.5/0.75

0.6666666666666666

In [151]:
math.log10(0.5/0.75)

-0.17609125905568127

In [154]:
math.log10(0.667)

-0.17587416608345102