# Edit distance

In [1]:
import pandas as pd
import numpy as np
import textdistance
import re
from collections import Counter
words = []
with open('../data/autocorrect/book.txt', 'r') as f:
    file_name_data = f.read()
    file_name_data=file_name_data.lower()
    words = re.findall('\w+',file_name_data)

V = set(words)
print(f"Top ten words in the text are:{words[0:10]}")
print(f"Total Unique words are {len(V)}.")

Top ten words in the text are:['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale']
Total Unique words are 17647.


In [2]:
word_freq = {}  
word_freq = Counter(words)
print(word_freq.most_common()[0:10])

[('the', 14703), ('of', 6742), ('and', 6517), ('a', 4799), ('to', 4707), ('in', 4238), ('that', 3081), ('it', 2534), ('his', 2530), ('i', 2120)]


In [3]:
probs = {}     
Total = sum(word_freq.values())    
for k in word_freq.keys():
    probs[k] = word_freq[k]/Total

In [4]:
def my_autocorrect(input_word):
    input_word = input_word.lower()
    if input_word in V:
            return('Your word seems to be correct')
    else:
        sim = [1-(textdistance.Jaccard(qval=2).distance(v,input_word)) for v in word_freq.keys()]
        df = pd.DataFrame.from_dict(probs, orient='index').reset_index()
        df = df.rename(columns={'index':'Word', 0:'Prob'})
        df['Similarity'] = sim
        output = df.sort_values(['Similarity', 'Prob'], ascending=False).head()
        return(output)

In [5]:
my_autocorrect("ramm")

Unnamed: 0,Word,Prob,Similarity
271,ram,3.6e-05,0.666667
11220,rammed,9e-06,0.6
451,grammar,9e-06,0.5
3469,crammed,4e-06,0.5
13761,ramming,4e-06,0.5


# Spacy contextualSpellCheck

In [1]:
import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')
contextualSpellCheck.add_to_pipe(nlp)

def contextual_spellcheck(input_str: str):
    doc = nlp(input_str)

    return doc._.performed_spellCheck, doc._.suggestions_spellCheck, doc._.score_spellCheck, doc._.outcome_spellCheck

In [2]:
list(nlp.vocab.strings)

['\x01',
 '\x02',
 '\x03',
 '\x04',
 '\x05',
 '\x06',
 '\x07',
 '\x08',
 '\t',
 '\t\t',
 '\t\x0b',
 '\t\x0c',
 '\t\r',
 '\t\x1c',
 '\t\x1d',
 '\t\x1e',
 '\t\x1f',
 '\t ',
 '\t\x85',
 '\t\x85\u1680',
 '\t\xa0',
 '\t\u1680',
 '\t\u2001',
 '\t\u2005',
 '\t\u2006',
 '\t\u2007',
 '\t\u2008',
 '\t\u2009',
 '\t\u200a',
 '\t\u2028',
 '\t\u205f',
 '\t\u3000',
 '\n',
 '\n\t',
 '\n\x0b',
 '\n\r',
 '\n\r\t',
 '\n\x1c',
 '\n ',
 '\n\u1680',
 '\n\u2001',
 '\n\u2002',
 '\n\u2003',
 '\n\u2004',
 '\n\u2005',
 '\n\u2006',
 '\n\u2007',
 '\n\u2008',
 '\n\u2009',
 '\n\u200a',
 '\n\u2029',
 '\n\u202f',
 '\n\u205f',
 '\n\u3000',
 '\x0b',
 '\x0b\n',
 '\x0b\x0b',
 '\x0b\x0c',
 '\x0b\r',
 '\x0b ',
 '\x0b\x85',
 '\x0b\xa0',
 '\x0b\u1680',
 '\x0b\u2001',
 '\x0b\u2002',
 '\x0b\u2003',
 '\x0b\u2005',
 '\x0b\u2005\u2000',
 '\x0b\u2006',
 '\x0b\u2007',
 '\x0b\u2008',
 '\x0b\u200a',
 '\x0b\u2028',
 '\x0b\u2029',
 '\x0b\u202f',
 '\x0b\u205f\u2007',
 '\x0b\u3000',
 '\x0c',
 '\x0c\t',
 '\x0c\n',
 '\x0c\r',
 '\x0c\x1d',
 

In [3]:
from collections import Counter

def build_vocab(texts, max_vocab=10000, min_freq=3):
    nlp_ = spacy.blank("en") # just the tokenizer
    wc = Counter()
    for doc in nlp_.pipe(texts):
        for word in doc:
            wc[word.lower_] += 1

    word2id = {}
    id2word = {}
    for word, count in wc.most_common():
        if count < min_freq: break
        if len(word2id) >= max_vocab: break
        wid = len(word2id)
        word2id[word] = wid
        id2word[wid] = word
    return word2id, id2word

In [4]:
import pandas as pd
df_train = pd.read_csv("../data/train.csv")
df_train = df_train.sample(frac = 1)
train_data = df_train['text'].to_numpy()

vocab_size = 5000 # params
oov_tok = '<OOV>'

# tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# tokenizer.fit_on_texts(train_data)

# train_sequences = tokenizer.texts_to_sequences(train_data)

# max_length = 50 # params
# trunc_type = 'post'
# padding_type = 'post'

In [6]:
word2id, id2word = build_vocab(train_data)