In [25]:
import re
from tqdm import tqdm
import pandas as pd

## Finding the Unique words

In [8]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+', line)
    
print(len(words))
vocab = set(words)
print(len(vocab))

1115585
38160


## Finding the probability distribution

In [9]:
word_probability = {}

for word in tqdm(vocab):
    word_probability[word] = float(words.count(word)/len(words))

100%|████████████████████████████████████████████████████████████████████████████| 38160/38160 [15:22<00:00, 41.38it/s]


In [10]:
len(word_probability)

38160

In [11]:
word_probability['the']

0.06479291134247951

## Text Preprocessing

### Splitting

In [14]:
def split(word):
    
    parts = []
    
    for i in range(len(word) + 1):
        parts += [(word[:i], word[i:])]
    
    return parts

split('mahfooz')

[('', 'mahfooz'),
 ('m', 'ahfooz'),
 ('ma', 'hfooz'),
 ('mah', 'fooz'),
 ('mahf', 'ooz'),
 ('mahfo', 'oz'),
 ('mahfoo', 'z'),
 ('mahfooz', '')]

### Delete

In [16]:
def delete(word):
    output = []
    for l, r in split(word):
        output.append(l + r[1:])
    return output

delete('remamber')

['emamber',
 'rmamber',
 'reamber',
 'remmber',
 'remaber',
 'remamer',
 'remambr',
 'remambe',
 'remamber']

### Swap

In [18]:
def swap(word):
    
    output = []
    for l,r in split(word):
        if (len(r) > 1):
            output.append(l + r[1] + r[0] + r[2:])
    return output

swap('hppay')

['phpay', 'hppay', 'hpapy', 'hppya']

### Replace

In [19]:
def replace(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []
    
    for l,r in split(word):
        for char in characters:
            output.append(l + char + r[1:])
    return output

len(replace('lave'))

130

### Insert

In [21]:
def insert(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []
    
    for l,r in split(word):
        for char in characters:
            output.append(l + char + r)
            
    return output

len(insert('lve'))

104

# Finding the predictions

### Combinig the possible words

In [45]:
def edit(word):   
    return list(set(insert(word) + delete(word) + swap(word) + replace(word)))

## Predicting the words

In [46]:
def spell_check_edit_1(word, count = 5):
    
    output = []
    suggested_words = edit(word)
    
    for wrd in suggested_words:        
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
            
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

In [55]:
spell_check_edit_1('Loave')

['Leave', 'Love']

## Advanced method of predicting words

In [56]:
def spell_check_edit_2(word, count = 5):
    
    output = []
    suggested_words = edit(word)       # Level one Edit
    
    for e1 in edit(word):
        suggested_words += edit(e1)    # Second Level Edit 
    
    suggested_words = list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)
        
spell_check_edit_2('Loave')

['have', 'love', 'gave', 'leave', 'save']