#### Install required libs

In [1]:
# !pip install Levenshtein sklearn
# !pip install pyspellchecker symspellpy pyenchant textblob language-tool-python

In [2]:
import pandas as pd
import numpy as np

#### Preprocessing
- Combining the datasets
- Formatting
- Add correct words to the dataset

In [3]:
from src import *

parse_birkbeck('data/holbrook-missp.dat.txt', True)

Unnamed: 0,misspelled_word,correct_word
0,Consulatoin,?
1,Doc_chm,?
2,Half-hose,?
3,Hurbured,?
4,anetes,?
...,...,...
1786,you,yours
1787,yourns,yours
1788,your_self,yourself
1789,your_sleve,yourself


In [4]:
df = pd.concat([parse_birkbeck('data/aspell.dat.txt'),
               parse_birkbeck('data/holbrook-missp.dat.txt', counted=True),
               parse_birkbeck('data/missp.dat.txt'),
               parse_birkbeck('data/wikipedia.dat.txt')], axis=0)
df = df.reset_index(drop=True)
df.shape

(40910, 2)

In [5]:
df['misspelled_word'] = df['misspelled_word'].str.replace('_', ' ')
df['correct_word'] = df['correct_word'].str.replace('_', ' ')
df = df[df['correct_word'] != '?']
df['correct_word'] = df['correct_word'].str.lower()
t = df['correct_word'].unique()
correct_words = pd.DataFrame((t, t)).T
correct_words.columns = ['misspelled_word', 'correct_word']
df = pd.concat([df, correct_words], axis=0)
df = df.reset_index(drop=True)
df.shape

(48652, 2)

### Spellchecker wrappers

In [6]:
import language_tool_python
from symspellpy import SymSpell, Verbosity
from spellchecker import SpellChecker
import enchant

symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
symspell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

tool = language_tool_python.LanguageTool('en-US')

pyspell = SpellChecker()

d = enchant.Dict("en_US")

In [7]:
from textblob import TextBlob
import time


def spell_check_pyspellchecker(misspelled_word):
    start_time = time.time()
    correction = pyspell.correction(misspelled_word)
    elapsed_time = time.time() - start_time
    return correction, elapsed_time

def spell_check_symspellpy(misspelled_word):
    start_time = time.time()
    suggestions = symspell.lookup(misspelled_word, Verbosity.CLOSEST, max_edit_distance=2)
    elapsed_time = time.time() - start_time
    return suggestions[0].term if suggestions else misspelled_word, elapsed_time

def spell_check_pyenchant(misspelled_word):
    start_time = time.time()
    if d.check(misspelled_word):
        return misspelled_word, time.time() - start_time
    else:
        suggestions = d.suggest(misspelled_word)
        elapsed_time = time.time() - start_time
        return suggestions[0] if suggestions else misspelled_word, elapsed_time

def spell_check_textblob(misspelled_word):
    start_time = time.time()  # Start timing
    correction = str(TextBlob(misspelled_word).correct())
    elapsed_time = time.time() - start_time
    return correction, elapsed_time

def spell_check_languagetool(misspelled_word):
    start_time = time.time()
    matches = tool.check(misspelled_word)
    corrected = language_tool_python.utils.correct(misspelled_word, matches)
    elapsed_time = time.time() - start_time
    return corrected, elapsed_time

# Test the spell checkers
misspelled_word = "mispeled"
print(f"Original word: {misspelled_word}")
pyspell_correction, pyspell_time = spell_check_pyspellchecker(misspelled_word)
print(f"PySpellChecker: {pyspell_correction} (Time: {pyspell_time:.6f} seconds)")

symspell_correction, symspell_time = spell_check_symspellpy(misspelled_word)
print(f"SymSpellPy: {symspell_correction} (Time: {symspell_time:.6f} seconds)")

pyenchant_correction, pyenchant_time = spell_check_pyenchant(misspelled_word)
print(f"PyEnchant: {pyenchant_correction} (Time: {pyenchant_time:.6f} seconds)")

textblob_correction, textblob_time = spell_check_textblob(misspelled_word)
print(f"TextBlob: {textblob_correction} (Time: {textblob_time:.6f} seconds)")

languagetool_correction, languagetool_time = spell_check_languagetool(misspelled_word)
print(f"LanguageTool: {languagetool_correction} (Time: {languagetool_time:.6f} seconds)")


Original word: mispeled
PySpellChecker: misled (Time: 0.267127 seconds)
SymSpellPy: misled (Time: 0.000000 seconds)
PyEnchant: misspelled (Time: 0.029000 seconds)
TextBlob: misled (Time: 0.195351 seconds)
LanguageTool: misled (Time: 2.562655 seconds)


## Benchmark

In [8]:
results = {
    'spell_checker': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1': [],
    'avg_levenshtein': [],
    'avg_time': []
}

def benchmark_spell_checkers(df):
    results = []
    spell_checkers = {
        'PySpellChecker': spell_check_pyspellchecker,
        'SymSpellPy': spell_check_symspellpy,
        'PyEnchant': spell_check_pyenchant,
        'TextBlob': spell_check_textblob,
        'LanguageTool': spell_check_languagetool
    }

    for name, spell_checker in spell_checkers.items():
        spell_check_results = df['misspelled_word'].apply(spell_checker)

        original_word = df['misspelled_word'].reset_index(drop=True)
        predicted_word, correction_time = zip(*spell_check_results)
        predicted_word = pd.Series(predicted_word, name='predicted_word').reset_index(drop=True).fillna("bruh")
        correction_time = pd.Series(correction_time, name='correction_time').reset_index(drop=True).fillna(0)

        correct_word = df['correct_word'].reset_index(drop=True)

        metrics = {
            'spell_checker': name,
            'accuracy': accuracy(correct_word, predicted_word),
            'precision': precision(correct_word, predicted_word, original_word),
            'recall': recall(correct_word, predicted_word, original_word),
            'avg_levenshtein': avg_levenshtein(correct_word, predicted_word),
            'avg_time': avg_time(correction_time)
        }

        metrics['f1'] = f1(metrics['precision'], metrics['recall'])

        results.append(metrics)
        print(f"Done evaluating {name}")

    results_df = pd.DataFrame(results)
    return results_df


In [10]:
benchmarks = benchmark_spell_checkers(df)
benchmarks

Done evaluating PySpellChecker
Done evaluating SymSpellPy
Done evaluating PyEnchant
Done evaluating TextBlob
Done evaluating LanguageTool


Unnamed: 0,spell_checker,accuracy,precision,recall,avg_levenshtein,avg_time,f1
0,PySpellChecker,0.450793,0.400328,0.358359,2.45624,0.135717,0.378183
1,SymSpellPy,0.454041,0.474363,0.355447,1.838465,5.8e-05,0.406385
2,PyEnchant,0.429972,0.373253,0.33193,2.052495,0.024769,0.351381
3,TextBlob,0.411412,0.43517,0.321383,1.916735,0.086484,0.36972
4,LanguageTool,0.457597,0.421233,0.366827,1.901484,0.024247,0.392152


In [11]:
benchmarks.to_csv('results.csv', index=False)