1. Реализуйте алгоритм Symspell. Он похож на алгоритм Норвига, но проще и быстрее. Там к словам в словаре применяется только одна операция - удаление символа (1-n). Чтобы найти исправление из слова тоже удаляются символы и сравниваются с теми, что хранятся в словаре. Оцените качество полученного алгоритма теми же тремя метриками.

In [80]:
import os, re
from string import punctuation
import numpy as np
import json
from collections import Counter
from pprint import pprint
from nltk import sent_tokenize
punctuation += "«»—…“”"
punct = set(punctuation)
import gzip
import csv

from sklearn.metrics import classification_report, accuracy_score

In [81]:
bad = open('sents_with_mistakes.txt', encoding='utf8').read().splitlines()
true = open('correct_sents.txt', encoding='utf8').read().splitlines()

In [82]:
def align_words(sent_1, sent_2):
    tokens_1 = sent_1.lower().split()
    tokens_2 = sent_2.lower().split()
    
    tokens_1 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_1 if (set(token)-punct)]
    tokens_2 = [re.sub('(^\W+|\W+$)', '', token) for token in tokens_2 if (set(token)-punct)]
    
    return list(zip(tokens_1, tokens_2))

In [83]:
mistakes = []
total = 0
for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    for pair in word_pairs:
        if pair[0] != pair[1]:
            mistakes.append(pair)
        total += 1

In [84]:
corpus = open('corpus_5000.txt', 'w', encoding='utf-8')
with gzip.open('lenta-ru-news.csv.gz', 'rt', encoding='utf-8') as archive:
    reader = csv.reader(archive, delimiter=',', quotechar='"')
    for i, line in enumerate(reader):
        if i < 5000:
            corpus.write(line[2].replace('\xa0', ' ') + '\n')

In [85]:
def normalize(text):
    
    normalized_text = [(word.strip(punctuation)) for word \
                                                            in text.lower().split()]
    normalized_text = [word for word in normalized_text if word]
    return normalized_text

In [86]:
corpus = []
for text in open('corpus_5000.txt', encoding='utf-8').read().splitlines():
    sents = sent_tokenize(text)
    norm_sents = [normalize(sent) for sent in sents]
    corpus += norm_sents

In [87]:
vocab = set()

for sent in corpus:
    vocab.update(sent)


In [88]:
def predict_mistaken(word, vocab):

    if word in vocab:
        return 0
    else:
        return 1

In [89]:
y_true = []
y_pred = []

for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    for pair in word_pairs:
        if pair[0] == pair[1]:
            y_true.append(0)
        else:
            y_true.append(1)
        
        y_pred.append(predict_mistaken(pair[1], vocab))

In [90]:
print(classification_report(y_true, y_pred, ))

             precision    recall  f1-score   support

          0       0.98      0.86      0.92      8707
          1       0.49      0.91      0.64      1303

avg / total       0.92      0.86      0.88     10010



In [91]:
WORDS = Counter()
for sent in corpus:
    WORDS.update(sent)

In [92]:
N = sum(WORDS.values())
def P(word, N=N): 
    "Вычисляем вероятность слова"
    return WORDS[word] / N

In [93]:
def known(words): 
    "Выбираем слова, которые есть в корпусе"
    return set(w for w in words if w in WORDS)

def edits1(word):
    "Создаем кандидатов, которые отличаются на одну букву"
    letters    = 'йцукенгшщзхъфывапролджэячсмитьбюё'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    return set(deletes)

In [94]:
forms = {}
for word in WORDS.keys():
    forms_keys = edits1(word)
    for form_key in forms_keys:
        forms[form_key] = [word]

In [95]:
def known2(words):
    "Выбираем слова, которые есть в словаре форм"
    return set(w for w in words if w in forms)

In [97]:
def candidates(word): 
    "Генерируем кандидатов на исправление"
    return (known([word]) or known2([word]) or known(edits1(word)) or [word])

def correction(word): 
    "Находим наиболее вероятное похожее слово"
    return max(candidates(word), key=P)

In [98]:
correct = 0
total = 0

total_mistaken = 0
mistaken_fixed = 0

total_correct = 0
correct_broken = 0

cashed = {}
for i in range(len(true)):
    word_pairs = align_words(true[i], bad[i])
    for pair in word_pairs:
        predicted = cashed.get(pair[1], correction(pair[1]))
        cashed[pair[0]] = predicted
        if predicted == pair[0]:
            correct += 1
        total += 1
        
        if pair[0] == pair[1]:
            total_correct += 1
            if pair[0] !=  predicted:
                correct_broken += 1
        else:
            total_mistaken += 1
            if pair[0] == predicted:
                mistaken_fixed += 1
        
    if not i % 100:
        print(i)

0
100
200
300
400
500
600
700
800
900


In [99]:
print(correct/total)
print(mistaken_fixed/total_mistaken)
print(correct_broken/total_correct)

0.6941058941058941
0.11742133537989255
0.2195934305731021
