In [1]:
import string
import re
import numpy as np
from collections import Counter

# Reading all words and for this we need vocabulary dictionary

In [2]:
def read_corpus(filename):
    with open(filename,'r',encoding='utf-8') as file:
        lines=file.readlines()
        
        words=[]
        for word in lines:
            words += re.findall(r'\w+',word.lower())
    return words

corpus=read_corpus(r'big.txt')

In [3]:
len(corpus)

222663

# Create our vocabulary for unique words

In [4]:
vocab=set(corpus)
len(vocab)

17647

# To see how many times words appear in our corpus

In [5]:
word_count=Counter(corpus)
word_count['hi']

0

# Calculate Word Probability

In [6]:
total_word_counts=float(sum(word_count.values()))

In [7]:
word_prob={word:word_count[word]/total_word_counts for word in word_count.keys()}

In [8]:
word_prob

{'the': 0.06603252448767868,
 'project': 0.0004086893646452262,
 'gutenberg': 0.0004221626404027611,
 'ebook': 4.4910919191783095e-05,
 'of': 0.030278941719100165,
 'moby': 0.0004041982727260479,
 'dick': 0.0004041982727260479,
 'or': 0.003579400259585113,
 'whale': 0.005524043060589321,
 'by': 0.005488114325235894,
 'herman': 1.796436767671324e-05,
 'melville': 1.796436767671324e-05,
 'this': 0.006462681271697588,
 'is': 0.007863901950481221,
 'for': 0.007383355115129142,
 'use': 0.0002200635040397372,
 'anyone': 2.694655151506986e-05,
 'anywhere': 7.185747070685296e-05,
 'at': 0.005995607712103043,
 'no': 0.002667708599991916,
 'cost': 1.796436767671324e-05,
 'and': 0.029268446037285047,
 'with': 0.00794474160502643,
 'almost': 0.000884745108078127,
 'restrictions': 8.98218383835662e-06,
 'whatsoever': 3.143764343424817e-05,
 'you': 0.004302466058572821,
 'may': 0.001145228439390469,
 'copy': 8.533074646438789e-05,
 'it': 0.011380426923197837,
 'give': 0.0004041982727260479,
 'away':

# Autocorrect Operation

# 1 Split Operation

In [9]:
def split(word):
    return [(word[:i],word[i:]) for i in range(len(word)+1)]

In [10]:
split('why')

[('', 'why'), ('w', 'hy'), ('wh', 'y'), ('why', '')]

# 2 Delete Operation

In [11]:
def delete(word):
    return [left+right[1:] for left,right in split(word) if right]

In [12]:
print(delete('why'))

['hy', 'wy', 'wh']


# 3 Swap Operation

In [13]:
def swap(word):
    return [ left + right[1]+right[0] for left,right in split(word) if len(right)>1]

In [14]:
swap('why')

['hw', 'wyh']

# Replace Operation

In [15]:
def replace(word):
    return [left + center + right[1:] for left,right in split(word) if right for center in string.ascii_lowercase]

In [16]:
replace('why')

['ahy',
 'bhy',
 'chy',
 'dhy',
 'ehy',
 'fhy',
 'ghy',
 'hhy',
 'ihy',
 'jhy',
 'khy',
 'lhy',
 'mhy',
 'nhy',
 'ohy',
 'phy',
 'qhy',
 'rhy',
 'shy',
 'thy',
 'uhy',
 'vhy',
 'why',
 'xhy',
 'yhy',
 'zhy',
 'way',
 'wby',
 'wcy',
 'wdy',
 'wey',
 'wfy',
 'wgy',
 'why',
 'wiy',
 'wjy',
 'wky',
 'wly',
 'wmy',
 'wny',
 'woy',
 'wpy',
 'wqy',
 'wry',
 'wsy',
 'wty',
 'wuy',
 'wvy',
 'wwy',
 'wxy',
 'wyy',
 'wzy',
 'wha',
 'whb',
 'whc',
 'whd',
 'whe',
 'whf',
 'whg',
 'whh',
 'whi',
 'whj',
 'whk',
 'whl',
 'whm',
 'whn',
 'who',
 'whp',
 'whq',
 'whr',
 'whs',
 'wht',
 'whu',
 'whv',
 'whw',
 'whx',
 'why',
 'whz']

# Insert Operations

In [17]:
def insert(word):
    return [left+center+ right[1:] for left,right in split(word) for center in string.ascii_lowercase]

In [18]:
insert('why')

['ahy',
 'bhy',
 'chy',
 'dhy',
 'ehy',
 'fhy',
 'ghy',
 'hhy',
 'ihy',
 'jhy',
 'khy',
 'lhy',
 'mhy',
 'nhy',
 'ohy',
 'phy',
 'qhy',
 'rhy',
 'shy',
 'thy',
 'uhy',
 'vhy',
 'why',
 'xhy',
 'yhy',
 'zhy',
 'way',
 'wby',
 'wcy',
 'wdy',
 'wey',
 'wfy',
 'wgy',
 'why',
 'wiy',
 'wjy',
 'wky',
 'wly',
 'wmy',
 'wny',
 'woy',
 'wpy',
 'wqy',
 'wry',
 'wsy',
 'wty',
 'wuy',
 'wvy',
 'wwy',
 'wxy',
 'wyy',
 'wzy',
 'wha',
 'whb',
 'whc',
 'whd',
 'whe',
 'whf',
 'whg',
 'whh',
 'whi',
 'whj',
 'whk',
 'whl',
 'whm',
 'whn',
 'who',
 'whp',
 'whq',
 'whr',
 'whs',
 'wht',
 'whu',
 'whv',
 'whw',
 'whx',
 'why',
 'whz',
 'whya',
 'whyb',
 'whyc',
 'whyd',
 'whye',
 'whyf',
 'whyg',
 'whyh',
 'whyi',
 'whyj',
 'whyk',
 'whyl',
 'whym',
 'whyn',
 'whyo',
 'whyp',
 'whyq',
 'whyr',
 'whys',
 'whyt',
 'whyu',
 'whyv',
 'whyw',
 'whyx',
 'whyy',
 'whyz']

# Find minimum distance

In [19]:
def level_one_edits(word):
    return set(delete(word) + swap(word)+replace(word)+insert(word))

In [20]:
level_one_edits('sahi')

{'aahi',
 'ahi',
 'as',
 'bahi',
 'cahi',
 'dahi',
 'eahi',
 'fahi',
 'gahi',
 'hahi',
 'iahi',
 'jahi',
 'kahi',
 'lahi',
 'mahi',
 'nahi',
 'oahi',
 'pahi',
 'qahi',
 'rahi',
 'saai',
 'sabi',
 'saci',
 'sadi',
 'saei',
 'safi',
 'sagi',
 'sah',
 'saha',
 'sahb',
 'sahc',
 'sahd',
 'sahe',
 'sahf',
 'sahg',
 'sahh',
 'sahi',
 'sahia',
 'sahib',
 'sahic',
 'sahid',
 'sahie',
 'sahif',
 'sahig',
 'sahih',
 'sahii',
 'sahij',
 'sahik',
 'sahil',
 'sahim',
 'sahin',
 'sahio',
 'sahip',
 'sahiq',
 'sahir',
 'sahis',
 'sahit',
 'sahiu',
 'sahiv',
 'sahiw',
 'sahix',
 'sahiy',
 'sahiz',
 'sahj',
 'sahk',
 'sahl',
 'sahm',
 'sahn',
 'saho',
 'sahp',
 'sahq',
 'sahr',
 'sahs',
 'saht',
 'sahu',
 'sahv',
 'sahw',
 'sahx',
 'sahy',
 'sahz',
 'sai',
 'saih',
 'saii',
 'saji',
 'saki',
 'sali',
 'sami',
 'sani',
 'saoi',
 'sapi',
 'saqi',
 'sari',
 'sasi',
 'sati',
 'saui',
 'savi',
 'sawi',
 'saxi',
 'sayi',
 'sazi',
 'sbhi',
 'schi',
 'sdhi',
 'sehi',
 'sfhi',
 'sghi',
 'sha',
 'shhi',
 'shi',


# Get all cands that we want to find out

In [21]:
def level_two_edits(word):
    return [e2 for e1 in level_one_edits(word) for e2 in level_one_edits(e1)]

In [22]:
level_two_edits('Sahil')

['Sahip',
 'Sehip',
 'pahip',
 'oahip',
 'Sfhip',
 'Sayip',
 'Sahif',
 'Sahipq',
 'Sahipv',
 'Sahik',
 'Sahih',
 'hahip',
 'sahip',
 'gahip',
 'Sahipp',
 'Sakip',
 'eahip',
 'Saxip',
 'Sphip',
 'Sahup',
 'Sahxp',
 'Sahit',
 'Shhip',
 'Sahhp',
 'Sahpp',
 'Sahgp',
 'Sahii',
 'Sahipi',
 'Smhip',
 'fahip',
 'Slhip',
 'Sahjp',
 'Saqip',
 'Sahiph',
 'Sahipt',
 'Sahipa',
 'iahip',
 'Sxhip',
 'Sahipo',
 'Ship',
 'Sasip',
 'Sazip',
 'Sthip',
 'Sahzp',
 'Sapip',
 'Sqhip',
 'Sahips',
 'Samip',
 'Sha',
 'Sahap',
 'Sacip',
 'Sahi',
 'Sahyp',
 'Sahid',
 'Sahipu',
 'ahip',
 'Sahdp',
 'Sahim',
 'Sahipb',
 'dahip',
 'Sahil',
 'Sahmp',
 'Sahipx',
 'Sahij',
 'jahip',
 'Snhip',
 'yahip',
 'Sahipm',
 'Saih',
 'Sanip',
 'Sahipd',
 'Sahiv',
 'Sahiu',
 'Sahnp',
 'nahip',
 'Sahrp',
 'Sahipw',
 'Sahsp',
 'Sihip',
 'Sahiz',
 'Sahipk',
 'Saiip',
 'rahip',
 'Sahipe',
 'Sahio',
 'Sabip',
 'Sahcp',
 'tahip',
 'Sahipz',
 'qahip',
 'Szhip',
 'Sahir',
 'bahip',
 'Sahp',
 'aahip',
 'Svhip',
 'Sahfp',
 'Suhip',
 'Sahtp',

# Autocorrect Search Bar

In [22]:
def correct_spelling_word(word,vocab,word_prob):
    if word in vocab:
        print(f"{word} is already correctly spelled")
        return
    #getting all suggestions
    suggestions=level_one_edits(word) or level_two_edits(word) or [word]
    best_guess=[w for w in suggestions if w in vocab]
    return [(w,word_prob[w]) for w in best_guess if w in word_prob]

In [23]:
search_word="cor"
guess=correct_spelling_word(search_word,vocab,word_prob)
print(guess)

[('core', 4.49109191917831e-06), ('cow', 4.49109191917831e-06), ('cod', 5.838419494931803e-05), ('cord', 4.041982727260479e-05), ('gor', 1.347327575753493e-05), ('car', 4.49109191917831e-06), ('co', 4.49109191917831e-06), ('cork', 1.796436767671324e-05), ('corn', 2.2455459595891548e-05), ('cob', 4.49109191917831e-06), ('for', 0.007383355115129142), ('nor', 0.0007006103393918164), ('con', 4.49109191917831e-06), ('or', 0.003579400259585113)]


In [58]:
import pickle

In [62]:
pickle.dump(corpus,open('corpus.pkl','wb'))

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tf=TfidfVectorizer()

In [27]:
corpus_vec=tf.fit_transform(corpus)

{'gallows',
 'an',
 'trysail',
 'ensigns',
 'operated',
 'pagoda',
 'clock',
 'cranes',
 'sperma',
 'consternation',
 '_thar_',
 'turkeys',
 'colouring',
 'alien',
 'wary',
 'loudly',
 'tack',
 'oxen',
 'blanks',
 'disobedience',
 'cleanse',
 'establishing',
 'score',
 'corpusants',
 'referred',
 'debel',
 'peasant',
 '_teenth_',
 'nation',
 'crater',
 'usurper',
 'evincing',
 'dusting',
 'antics',
 'cinnamon',
 'uncheered',
 '78',
 'steel',
 'tempest',
 'lad',
 'liquor',
 'unsolicited',
 'disturbing',
 'leviathanism',
 'heraldic',
 'palpable',
 'poring',
 'angelo',
 'howlings',
 'skimming',
 'fuzzing',
 'vividness',
 'centres',
 'grandissimus',
 'slacken',
 'agrees',
 'ignorance',
 'rioting',
 'evaporates',
 'revolved',
 'beats',
 'acts',
 'slipperiness',
 'outstretching',
 'preble_',
 'lins',
 'corals',
 'confoundedly',
 'spurred',
 'locked',
 'apostolic',
 'whitenesses',
 'unsuccessful',
 'matsmai',
 'signification',
 'jesty',
 'rides',
 'interlude',
 'machine',
 'colonnades',
 'exp