In [92]:
import re
from tqdm import tqdm
import pandas as pd

## 1. Finding unique words

In [93]:
with open('big.txt','r') as fd:
    lines=fd.readlines()
    words=[]
    for line in lines:
        words+=re.findall('\w+',line.lower())
        
print(len(words))
vocab=list(set(words))
print(len(vocab))

1115585
32198


## 2. Finding the Probabilty Distribution

In [3]:
# words.count('Book') #print how many times a word appears in the dataset
# words.count('the')/len(words) #to find probability of each word to appear

In [4]:
word_probability={}
for word in tqdm(vocab):
    word_probability[word]=float(words.count(word)/len(words))

100%|████████████████████████████████████████████████████████████████████████████| 32198/32198 [10:10<00:00, 52.71it/s]


In [6]:
list(word_probability.items())[:10]

[('besprinkled', 1.7927813658304835e-06),
 ('mobbing', 8.963906829152417e-07),
 ('bedford', 1.7927813658304835e-06),
 ('inheritance', 1.4342250926643868e-05),
 ('lamentations', 8.963906829152417e-07),
 ('emancipators', 8.963906829152417e-07),
 ('droop', 2.6891720487457255e-06),
 ('ugliness', 8.963906829152417e-07),
 ('ear', 4.123397141410112e-05),
 ('_main', 2.6891720487457255e-06)]

In [7]:
word_probability['the']

0.07154004401278254

## 3. Text preprocessing

### Splitting

In [94]:
def split(word):
    
    parts=[]
    for i in range(len(word)+1):
        parts+=[(word[:i],word[i:])]
    return parts

###  3.1 Delete
'loave' -> 'love'

In [95]:
def delete(word):
    
    output=[]
    for l,r in split(word):
        output.append(l+r[1:])
    return output

delete('loave')

['oave', 'lave', 'love', 'loae', 'loav', 'loave']

### 3.2 Swap
'lvoe' -> 'love'

In [96]:
def swap(word):
    
    output=[]
    for l,r in split(word):
        if(len(r)>1):
            output.append(l+r[1]+r[0]+r[2:])
    return output

swap('lvoe')

['vloe', 'love', 'lveo']

### 3.3 Replace
'lave' -> 'love'

In [97]:
def replace(word):
    
    characters='abcdefghijklmnopqrstuvwxyz'
    output=[]
    
    for l,r in split(word):
        for char in characters:
            output.append(l+char+r[1:])
    return output

len(replace('lave'))

130

### 3.4 Insert
'lve' -> 'love'

In [98]:
def insert(word):
    
    characters='abcdefghijklmnopqrstuvwxyz'
    output=[]
    
    for l,r in split(word):
        for char in characters:
            output.append(l+char+r)
    return output

len(insert('lve'))

104

## 4. Finding the prediction(Level 1)

### 4.1 Combining Possible words

In [99]:
def edit(word):
    return list(set(insert(word)+delete(word)+swap(word)+replace(word)))

### 4.2 predicting the word

In [85]:
def spell_check_edit_1(word,count=5):
    
    suggested_words=edit(word)
    output=[]
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd,word_probability[wrd]])
            
    return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by='prob',ascending=False).head(count)['word'].values)

In [101]:
spell_check_edit_1('famili')

['family']

## 5. Finding the Prediction(Level 2)

### 5.1 Combining Possible Words

In [102]:
def spell_check_edit_2(word,count=5):

    output=[]
    suggested_words=edit(word)       #level one edit
    
    for e1 in edit(word):
        suggested_words +=edit(e1)   #level two edit
        
    suggested_words=list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd,word_probability[wrd]])
    return list(pd.DataFrame(output,columns=['word','prob']).sort_values(by='prob',ascending=False).head(count)['word'].values)

spell_check_edit_2('famili')

['vasili', 'family', 'familiar', 'families', 'fail']