In [0]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

Import Unigramm frequency, make dictionary, extract important data

In [0]:
freqdata = pd.read_csv('words.csv',header = None, sep = ' ')
freqdata = freqdata.drop(columns = [0, 3])
freqdata = freqdata.rename(columns = {1: 'Freq', 2: 'Word'})
words = freqdata['Word'].tolist()
freqwords = freqdata['Freq'].tolist()
freqdicti = defaultdict(float)
max_length = len(max(words, key = len))
for i in range(len(words)):
    freqdicti[words[i]] = freqwords[i]

Import datasets

In [14]:
annotated_texts = pd.read_csv('annotated_texts.csv')
annotated_texts.head()

Unnamed: 0,document_id,sentence_id,original_text,corrected_text,quote,correction,tags,annotated,checked
0,1,2,Одной из самых главных экологических проблем н...,Одной из самых главных экологических проблем н...,окружющей,окружающей,Ortho,1,1
1,1,10,Такие условия легко способствуют отравлению св...,Такие условия легко способствуют отравлению св...,свинцем,свинцом,Infl,1,1
2,1,11,Этот металл может сохраняться в организме чело...,Этот металл может сохраняться в организме чело...,воздействую,воздействуя,Syntax,1,1
3,1,14,Основной путь попадания свинца в организм - эт...,Основной путь попадания свинца в организм - ...,это,,"Syntax, Insert, Transfer",1,1
4,1,15,"Кроме того, содержание свинца в выращиваемых п...","Кроме того, содержание свинца в выращиваемых п...",свинцем,свинцом,Infl,1,1


Damerau-Levenstein Distance

In [0]:
def dam_lev_distance(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)

    for i in range(-1,lenstr1+1):
        d[(i,-1)] = i+1
    for j in range(-1,lenstr2+1):
        d[(-1,j)] = j+1


    for i in range(lenstr1):
        for j in range(lenstr2):
            if s1[i] == s2[j]:
                cost = 0
            else:
                cost = 1
            d[(i,j)] = min(
                           d[(i-1,j)] + 1, # deletion
                           d[(i,j-1)] + 1, # insertion
                           d[(i-1,j-1)] + cost, # substitution
                          )
            
            if i and j and s1[i]==s2[j-1] and s1[i-1] == s2[j]:
                d[(i,j)] = min (d[(i,j)], d[i-2,j-2] + cost) # transposition

    return d[lenstr1-1,lenstr2-1]

Symmetric Delete Spelling by steps

1st step: function for list of possible deletions for word


In [0]:
def del_list(word, max_ed_dist):
    deletes = []
    upd_word = [word]
    for l in range(max_ed_dist):
        utw = []
        for word in upd_word:
            for i in range(len(word)):
                without_i = word[:i] + word[i + 1:]
                if without_i not in deletes:
                    deletes.append(without_i)
                if without_i not in utw:
                    utw.append(without_i)
        upd_word = utw
    return deletes

Deletes for all unigrams

In [0]:
def delition_vocabulary(words, max_ed_dist):
    deldictionary = defaultdict(list)
    for i in tqdm(range(len(words))):
        deldictionary[words[i]].append(words[i])
    for i in tqdm(range(len(words))):
        deletes = del_list(words[i], max_ed_dist)
        for delete in deletes:
            deldictionary[delete].append(words[i])
    return deldictionary

Create dictionary for (deletes | words)

Can create new with new maximum edition distance, or download pickle file with prepared dictionary

In [10]:
deldict = delition_vocabulary(words, 3)

100%|██████████| 69307/69307 [00:00<00:00, 617227.98it/s]
100%|██████████| 69307/69307 [01:14<00:00, 935.29it/s]


Save this big dicti in pickle format

In [0]:
import pickle
with open('deldicti.pickle', 'wb') as f:
    pickle.dump(deldict, f)

Load delitions dictionary 

In [0]:
import pickle
with open('deldicti.pickle', 'rb') as f:
    deldict = pickle.load(f)

Make suggestions

In [0]:
def suggestions_maker(word, max_ed_dist, max_length, deldict):
    sug_dict = defaultdict(list)
    if len(word) - max_length > max_ed_dist:
        return []
    deletes = del_list(word, max_ed_dist)
    deletes.append(word)
    allsugs = []
    for delition in deletes:
        suggs = deldict[delition]
        for sugg in suggs:
            if sugg not in allsugs:
                allsugs.append(sugg)
    for sug in allsugs:
        dist = dam_lev_distance(sug, word)
        sug_dict[sug].append(dist)
        freq = freqdicti[sug] * -1
        sug_dict[sug].append(freq)
    return sug_dict

Correcting spelling for one word

In [0]:
def choose_best_suggestion (word, max_ed_dist, max_length, deldict):
    try:
        suggs = suggestions_maker(word, max_ed_dist, max_length, deldict).items()
        sorted_suggs = sorted(suggs, key=lambda item : (item[1][0], item[1][1]))
        return sorted_suggs[0][0]
    except:
        return word

Preprocessing of dataset


In [0]:
misspelling = annotated_texts['quote'].tolist()
rightanswers = annotated_texts['correction'].tolist()

Make corrections


In [21]:
corrected_data = []
for i in tqdm(range(len(misspelling))):
    iitem = str(misspelling[i]).split()
    correcteditem = []
    for word in iitem:
        correctedword = choose_best_suggestion (word, 3, max_length, deldict)
        correcteditem.append(correctedword)
    corrected_data.append(correcteditem)


100%|██████████| 61595/61595 [55:00<00:00, 18.66it/s]


In [23]:
len(corrected_data)

61595

Look at accuracy

In [29]:
i = 0
for i in range(len(corrected_data)):
    if corrected_data[i] == rightanswers[i]:
      i += 1
correctly_guessed = i/len(corrected_data) * 100
print("Percent of properly corrected examples: ", correctly_guessed, "%")

Percent of properly corrected examples:  99.99837649159834 %
