# Prehistoric method: dictionary-based edit-distance correction

In [2]:
# !wget https://norvig.com/big.txt; mv big.txt train_texts_en.txt

In [3]:
import re
from collections import Counter

In [4]:
def words(text):
    '''Split `text` into a list of lowercase word tokens.

    The text is lowercased first; every maximal run of regex word characters
    is then returned in order of appearance.
    '''
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\w+', lowered)]

# Count how often each lowercase word occurs in the training corpus.
# The context manager closes the file handle as soon as the text is read,
# instead of leaving it open until garbage collection.
with open('train_texts_en.txt') as corpus_file:
    words_occurences = Counter(words(corpus_file.read()))

In [6]:
def language_model(word, total_occurences=sum(words_occurences.values())):
    '''Unigram probability of `word`: its corpus count over the total count.

    `total_occurences` defaults to the sum of all counts in
    `words_occurences`, computed once when this function is defined.
    Words that never occurred get probability 0.
    '''
    occurences_of_word = words_occurences[word]
    return occurences_of_word / total_occurences

In [9]:
def limit_candidates(words):
    '''Keep only the entries of `words` that are keys of `words_occurences`.

    Returns a set, so duplicates in `words` are collapsed.
    '''
    return {candidate for candidate in words if candidate in words_occurences}


In [14]:
def edits1(word):
    '''Set of strings obtained from `word` by a single simple edit.

    At every split position the following are generated: deleting the next
    character, swapping the two characters around the split, replacing the
    next character with each lowercase ASCII letter, and inserting each
    lowercase ASCII letter. The result may contain `word` itself (for
    example when a letter is replaced by the same letter).
    '''
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    candidates = set()
    for position in range(len(word) + 1):
        head, tail = word[:position], word[position:]
        rest = tail[1:]
        # Deletion of the character right after the split.
        candidates.add(head + rest)
        # Transposition of the two characters adjacent to the split.
        if head and tail:
            candidates.add(head[:-1] + tail[0] + head[-1] + rest)
        for letter in alphabet:
            # Replacement of the character right after the split.
            candidates.add(head + letter + rest)
            # Insertion at the split.
            candidates.add(head + letter + tail)
    return candidates

def edits2(word):
    '''Set of strings obtained by applying `edits1` to every result of `edits1(word)`.'''
    return {
        twice_edited
        for once_edited in edits1(word)
        for twice_edited in edits1(once_edited)
    }

def generate_candidates(word):
    '''Known words near `word` that could be its intended spelling.

    Returns the dictionary words found in `edits2(word)`; only when that set
    is empty are the dictionary words from `edits1(word)` returned instead.
    '''
    two_edit_matches = limit_candidates(edits2(word))
    if two_edit_matches:
        return two_edit_matches
    return limit_candidates(edits1(word))


In [16]:
# Sanity check of candidate generation on a known misspelling:
# the result must be a plain set and match the expected dictionary words.
word = "mestak"
assert type(generate_candidates(word)) is set
assert generate_candidates(word) == {
    'meatal', 'mental', 'mesial',
    'metal', 'mistake', 'vestas',
}

In [17]:
def correct_word(word):
    '''Most probable spelling correction for word.

    A word that already occurs in `words_occurences` is returned unchanged,
    so a correctly spelled word is never replaced by a more frequent
    neighbour. Otherwise the candidate with the highest `language_model`
    probability is returned. If no candidate exists at all, `word` itself is
    returned instead of letting `max()` raise `ValueError` on an empty set.
    '''
    if word in words_occurences:
        return word
    return max(generate_candidates(word), key=language_model, default=word)


In [26]:
# Spell-correct the misspelled sample word; the cell displays the chosen correction.
word = "mestak"
correct_word(word)

'mistake'

# Recent methods

In [27]:
import time
import torch
import pandas as pd

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# Misspelled Russian inputs used to compare the spelling correctors below.
samples = [
    "пквкая чейчс погаа",
    "как проайти ло музея",
    "как папасть на сайт однакласники",
    "кто еапичл картмну зевденая очь",
    "прийдя в МГТУ я был удивлен никого необноружив там...",
    "Думю ешцъа лет череа 10 ретроспективно просматривотьэ то будкетцц мне невероя тна ин те р но",
    "crjkmrj ctqxfc dhtvtyb",  # Russian text typed with a Latin (QWERTY) keyboard layout
]

In [29]:
# Per-model outputs: model name -> {sample: corrected text}.
results = {}
# Per-model timings: model name -> (cpu seconds, gpu seconds or None).
used_time = {}

## DeepPavlov

In [30]:
# !pip install -q deeppavlov
# !python -m deeppavlov install levenshtein_corrector_ru

In [31]:
from deeppavlov import build_model, configs

# Build the DeepPavlov pipeline from the 'levenshtein_corrector_ru' config.
# NOTE(review): download=False presumably requires the model files to be
# installed beforehand (see the commented install cell) — confirm.
model = build_model('levenshtein_corrector_ru', download=False)

In [32]:
# Only one timing is recorded for DeepPavlov; the GPU slot stays empty.
gpu_time = None

# Correct the samples one at a time and measure the total wall-clock time.
t_start = time.perf_counter()
results["deep_pavlov"] = dict(
    (sample, model([sample])[0]) for sample in tqdm(samples)
)
cpu_time = time.perf_counter() - t_start

used_time["deep_pavlov"] = (cpu_time, gpu_time)

  0%|                                                                                                                  | 0/7 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 21.62it/s]


## Sage

In [33]:
# ! git clone https://github.com/ai-forever/sage.git
# cd sage
# ! pip install .
# ! pip install -r requirements.txt

In [38]:
# GPU index the benchmark was originally run on.
preferred_gpu_index = 2
# Use the preferred GPU when it exists; otherwise fall back to the first CUDA
# device so the notebook does not fail with an invalid device ordinal on
# machines that expose fewer GPUs.
gpu_index = preferred_gpu_index if torch.cuda.device_count() > preferred_gpu_index else 0
device = torch.device(f"cuda:{gpu_index}")

# Checkpoint names loaded with `from_pretrained` in the benchmark loop.
model_names = [
    'ai-forever/RuM2M100-1.2B',
    'ai-forever/RuM2M100-418M',
    'ai-forever/FRED-T5-large-spell',
    'ai-forever/T5-large-spell',
]

In [39]:
def generate_fixed_from_samples(model, tokenizer, samples, device='cpu', prefix=''):
    '''Run a seq2seq spelling model over `samples` and time the generation.

    Args:
        model: model exposing `.to(device)` and `.generate(...)`.
        tokenizer: tokenizer matching `model`; it is called on the whole
            batch and its `batch_decode` turns generated ids back into text.
        samples: texts to correct.
        device: device the model and the input tensors are moved to.
        prefix: string prepended to every sample before tokenization.

    Returns:
        A pair `(fixed, elapsed)`: `fixed` maps each original sample to the
        decoded model output, and `elapsed` is the time in seconds spent on
        tokenization, generation and decoding (moving the model to `device`
        is not included). Empty input yields `({}, 0.0)`.
    '''
    prefixed_samples = [prefix + s for s in samples]
    if not prefixed_samples:
        # Nothing to correct; skip tokenizing an empty batch.
        return {}, 0.0

    model = model.to(device)
    t_start = time.perf_counter()

    tokens = tokenizer(prefixed_samples, padding=True, return_tensors='pt')
    # The batch is padded, so the attention mask must accompany the input ids;
    # without it the padding tokens are treated as real input.
    # Sampling is enabled, so outputs vary between runs unless a seed is set.
    output = model.generate(
        tokens['input_ids'].to(device),
        attention_mask=tokens['attention_mask'].to(device),
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
    )
    decoded = tokenizer.batch_decode(output.cpu(), skip_special_tokens=True)

    all_time = time.perf_counter() - t_start
    return dict(zip(samples, decoded)), all_time


In [40]:
for model_name in tqdm(model_names):
    # Load the tokenizer and weights for this checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Only the FRED-T5 checkpoint is given an instruction prefix.
    if model_name == "ai-forever/FRED-T5-large-spell":
        prefix = "Исправь: "
    else:
        prefix = ""

    # The CPU run is used for timing only; its outputs are discarded.
    _, time_for_gen_cpu = generate_fixed_from_samples(
        model, tokenizer, samples, device='cpu', prefix=prefix
    )
    # The run on `device` provides both the reported outputs and the second timing.
    fixed_samples, time_for_gen_gpu = generate_fixed_from_samples(
        model, tokenizer, samples, device=device, prefix=prefix
    )

    results[model_name] = fixed_samples
    used_time[model_name] = (time_for_gen_cpu, time_for_gen_gpu)

  0%|                                                                                                                  | 0/4 [00:00<?, ?it/s]

 50%|█████████████████████████████████████████████████████                                                     | 2/4 [00:34<00:32, 16.27s/it]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:55<00:00, 13.88s/it]


## Results

In [41]:
# Turn the total timings (seconds for the whole sample list) into the average
# number of milliseconds per sample, one row per model.
df_used_time = (
    pd.DataFrame(used_time, index=["cpu: ms", "gpu: ms"])
    .T
    .div(len(samples))
    .mul(1000)
)
df_used_time

Unnamed: 0,cpu: ms,gpu: ms
deep_pavlov,47.143819,
ai-forever/RuM2M100-1.2B,1904.489029,187.972296
ai-forever/RuM2M100-418M,1295.493372,110.247526
ai-forever/FRED-T5-large-spell,748.755808,143.176129
ai-forever/T5-large-spell,815.866127,101.259522


In [42]:
# Requests per second: the per-sample milliseconds are converted back to
# seconds and inverted.
df_rps = 1.0 / df_used_time.div(1000)
df_rps.columns = ["cpu: rps", "gpu: rps"]
df_rps

Unnamed: 0,cpu: rps,gpu: rps
deep_pavlov,21.211688,
ai-forever/RuM2M100-1.2B,0.525075,5.319933
ai-forever/RuM2M100-418M,0.771907,9.070498
ai-forever/FRED-T5-large-spell,1.335549,6.984405
ai-forever/T5-large-spell,1.225691,9.875614


In [43]:
from collections import defaultdict

# Regroup the outputs by sample: sample -> {model name: corrected text}.
to_print = defaultdict(dict)
for model_name in results:
    for sample in samples:
        to_print[sample][model_name] = results[model_name][sample]

# Build the report from parts and join once at the end.
report_parts = []
for sample in to_print:
    report_parts.append(f"SAMPLE: {sample}\n\n")
    for model_name, fixed in to_print[sample].items():
        # The short 'deep_pavlov' label gets two extra tabs to line up with the longer names.
        extra_tabs = '\t' * 2 if model_name == 'deep_pavlov' else ''
        report_parts.append(f"{model_name}{extra_tabs}\tOUTPUT: {fixed}\n")
    report_parts.append('#' * 150 + '\n')
str_result = "".join(report_parts)


print(str_result)

SAMPLE: пквкая чейчс погаа

deep_pavlov			OUTPUT: пквкая чейчс погас
ai-forever/RuM2M100-1.2B	OUTPUT: Дышать какая сейчас погода
ai-forever/RuM2M100-418M	OUTPUT: .. Pokвская, чей час погана.
ai-forever/FRED-T5-large-spell	OUTPUT: Лепковская сейчас пока что.Репковская сейчас пока что:1. Лупковская
ai-forever/T5-large-spell	OUTPUT: kobrвoka kobrвa
######################################################################################################################################################
SAMPLE: как проайти ло музея

deep_pavlov			OUTPUT: как пройти до музея
ai-forever/RuM2M100-1.2B	OUTPUT: apart как пройти до музея
ai-forever/RuM2M100-418M	OUTPUT: Как пройти по Музея
ai-forever/FRED-T5-large-spell	OUTPUT: как пройти до музея. как пройти до музея. как пройти до музея. как пройти до музея
ai-forever/T5-large-spell	OUTPUT: кака как podabaти ло ryryе
######################################################################################################################################