In [1]:
import re
from tqdm import tqdm
import pandas as pd

In [2]:
with open('big.txt', 'r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+', line)

len(words)

1115585

In [3]:
with open('big.txt', 'r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += line.split(' ')

len(words)

1164968

In [4]:
print('There are', 1164968-1115585 , 'words not starting from a character.')

There are 49383 words not starting from a character.


## 1. <u>Finding unique words </u>

In [5]:
with open('big.txt', 'r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+', line.lower())

print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


In [6]:
with open('big.txt','r') as fd:
    lines = fd.readlines()
    words = []
    for line in lines:
        words += re.findall('\w+',line.lower())
        
print(len(words))
vocab = list(set(words))
print(len(vocab))

1115585
32198


## 2. <u>Finding the probability distribution</u>

In [7]:
words.count('the') / len(words)

0.07154004401278254

In [8]:
word_probability = {}

for word in tqdm(vocab):
    word_probability[word] = float(words.count(word)/len(words))


100%|████████████████████████████████████████████████████████████████████████████| 32198/32198 [11:07<00:00, 48.23it/s]


 ## 3. <u>Text PreProcessing</u>

### Splitting

In [9]:
def split(word):  
    parts = []
    for i in range(len(word) + 1):
        parts += [(word[ : i], word[i : ])]
    return parts

split('harsh')

[('', 'harsh'),
 ('h', 'arsh'),
 ('ha', 'rsh'),
 ('har', 'sh'),
 ('hars', 'h'),
 ('harsh', '')]

### 3.1 Delete

loave -> love

In [10]:
def delete(word):
    output = []
    for l, r in split(word):
        output.append(l + r[1:])
    return output

delete('haarsh')

['aarsh', 'harsh', 'harsh', 'haash', 'haarh', 'haars', 'haarsh']

### 3.2 Swap

'lvoe' -> 'love'

In [11]:
def swap(word):
    output = []
    for l,r in split(word):
        if len(r) > 1:
            output.append(l + r[1] + r[0] + r[2:])
    return output

swap('hrash')

['rhash', 'harsh', 'hrsah', 'hrahs']

### 3.3 Replace

In [12]:
def replace(word):
    
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []    

    for l,r in split(word):
        for char in characters:
            output.append(l + char +  r[1:])
    return output

len(replace('lave'))

130

### 3.4 Insert

In [13]:
def insert(word):
    characters = 'abcdefghijklmnopqrstuvwxyz'
    output = []

    for l,r in split(word):
        for char in characters:
            output.append(l + char + r)
    return output

len(insert('lve'))

104

## 4. <u>Finding the Prediction (Level - 1)</u>

### 4.1 Combining Possible Words

In [14]:
def edit(word):
    return list(set(insert(word) + delete(word) + swap(word) + replace(word)))

### 4.2 Predicting the Word

In [15]:
suggested_words = edit('loave')

In [27]:
def spell_check_edit_1(word, count = 5):
    output = []
    suggested_words = edit(word)
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)

spell_check_edit_1('famili')

['family']

## 5. Finding the Prediction (Level - 2)

### 5.1) Combining Possible Words

In [24]:
def spell_check_edit_2(word, count = 5):
    
    output = []
    suggested_words = edit(word)       # Level one Edit
    
    for e1 in edit(word):
        suggested_words += edit(e1)    # Second Level Edit 
    
    suggested_words = list(set(suggested_words))
    
    for wrd in suggested_words:
        if wrd in word_probability.keys():
            output.append([wrd, word_probability[wrd]])
    return list(pd.DataFrame(output, columns = ['word','prob']).sort_values(by = 'prob', ascending = False).head(count)['word'].values)
        
spell_check_edit_2('fameli')

['family', 'namely', 'fame', 'amelie', 'camelia']