In [1]:
import re
from collections import Counter

In [2]:
# function to tokenise words
def words(document):
    "Convert text to lower case and tokenise the document"
    return re.findall(r'\w+', document.lower())

In [3]:
# create a frequency table of all the words of the document
all_words = Counter(words(open('big.txt').read()))

In [9]:
print(len(all_words))
# check frequency of a random word, say, 'chair'
all_words['chair']

32198


135

In [10]:
# look at top 10 frequent words
all_words.most_common(10)

[('the', 79809),
 ('of', 40024),
 ('and', 38312),
 ('to', 28765),
 ('in', 22023),
 ('a', 21124),
 ('that', 12512),
 ('he', 12401),
 ('was', 11410),
 ('it', 10681)]

In [11]:
def edits_one(word):
    "Create all edits that are one edit away from `word`."
    alphabets    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])                   for i in range(len(word) + 1)]
    deletes    = [left + right[1:]                       for left, right in splits if right]
    inserts    = [left + c + right                       for left, right in splits for c in alphabets]
    replaces   = [left + c + right[1:]                   for left, right in splits if right for c in alphabets]
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1]
    return set(deletes + inserts + replaces + transposes)

In [12]:
def edits_two(word):
    "Create all edits that are two edits away from `word`."
    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))

In [13]:
def known(words):
    "The subset of `words` that appear in the `all_words`."
    return set(word for word in words if word in all_words)

In [14]:
def possible_corrections(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])

In [15]:
def prob(word, N=sum(all_words.values())): 
    "Probability of `word`: Number of appearances of 'word' / total number of tokens"
    return all_words[word] / N

In [25]:
print(len(set(edits_one("emfasize"))))
print(edits_one("emfasize"))

442
{'emfaysize', 'emfaszize', 'emfiasize', 'pemfasize', 'emfmsize', 'emfasqze', 'emfasizo', 'emfasizb', 'emfasizx', 'emfasizeb', 'emfmasize', 'vmfasize', 'emfasizes', 'emfaswze', 'emfarize', 'emfasdze', 'emzfasize', 'emnasize', 'emkasize', 'emsfasize', 'emfasiwze', 'emfasiie', 'edmfasize', 'iemfasize', 'emfgasize', 'emfasiez', 'eumfasize', 'emfaaize', 'emfasiyze', 'aemfasize', 'emfeasize', 'emfaseze', 'emfasgze', 'emfahize', 'egfasize', 'emfasaze', 'emfasikze', 'emfaszie', 'mefasize', 'emfasiuze', 'emjfasize', 'emfasizie', 'emfasije', 'emfahsize', 'ezfasize', 'emfuasize', 'etfasize', 'xemfasize', 'emfusize', 'emfqasize', 'emfasnze', 'rmfasize', 'emfatsize', 'emfcsize', 'emlfasize', 'emfasizi', 'emfasigze', 'emfaside', 'emfafsize', 'emfasqize', 'emfasizem', 'emfansize', 'emfasuze', 'emcasize', 'emfasizze', 'emfasizue', 'cmfasize', 'yemfasize', 'emfasizf', 'emfapize', 'emfasizr', 'lmfasize', 'emfasizel', 'ecmfasize', 'emfaszze', 'emfasmize', 'emfasizce', 'wemfasize', 'exmfasize', 'emfas

In [17]:
print(known(edits_one("monney")))

{'money', 'monkey'}


In [18]:
# Let's look at words that are two edits away
print(len(set(edits_two("monney"))))
print(known(edits_one("monney")))

51013
{'money', 'monkey'}


In [26]:
# Let's look at possible corrections of a word
print(possible_corrections("emfasize"))

{'emphasize'}


In [27]:
# Let's look at probability of a word
print(prob("money"))
print(prob("emfasize"))

0.0002922233626303688
0.0


In [21]:
def spell_check(word):
    "Print the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    if correct_word != word:
        return "Did you mean " + correct_word + "?"
    else:
        return "Correct spelling."

In [22]:
# test spell check
print(spell_check("monney"))

Did you mean money?


In [54]:
from spell_corrector import rectify
correct = rectify("stidy")
print(correct)

study


In [29]:
text = "The Nobel Prize is a set of five annual international awards bestowed in several categories by Swedish and Norwegian institutions in recognition of academic, cultural, or scientific advances. In the 19th century, the Nobel family who were known for their innovations to the oil industry in Azerbaijan was the leading representative of foreign capital in Baku. The Nobel Prize was funded by personal fortune of Alfred Nobel. The Board of the Nobel Foundation decided that after this addition, it would allow no further new prize."

In [34]:
import re

In [44]:
Counter(re.findall(r'\w+', text.lower()))

Counter({'the': 8,
         'nobel': 5,
         'prize': 3,
         'is': 1,
         'a': 1,
         'set': 1,
         'of': 5,
         'five': 1,
         'annual': 1,
         'international': 1,
         'awards': 1,
         'bestowed': 1,
         'in': 5,
         'several': 1,
         'categories': 1,
         'by': 2,
         'swedish': 1,
         'and': 1,
         'norwegian': 1,
         'institutions': 1,
         'recognition': 1,
         'academic': 1,
         'cultural': 1,
         'or': 1,
         'scientific': 1,
         'advances': 1,
         '19th': 1,
         'century': 1,
         'family': 1,
         'who': 1,
         'were': 1,
         'known': 1,
         'for': 1,
         'their': 1,
         'innovations': 1,
         'to': 1,
         'oil': 1,
         'industry': 1,
         'azerbaijan': 1,
         'was': 2,
         'leading': 1,
         'representative': 1,
         'foreign': 1,
         'capital': 1,
         'baku': 1,
         '

In [45]:
3/4

0.75

In [46]:
import math

In [49]:
math.log10(.5/(.75))

-0.17609125905568127

In [61]:
from sklearn import datasets
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [56]:
dat = datasets.load_boston()


In [62]:
dd = pd.DataFrame(dat.data,columns=dat.feature_names)

In [63]:
dd.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [66]:
dd.describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677083,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [67]:
dd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
