In [4]:
import re
import string
from collections import Counter
import numpy as np

In [5]:
class SpellChecker(object):

  def __init__(self, corpus_file_path):
    with open(corpus_file_path, "r" ,encoding="utf-8") as file:
      lines = file.readlines()
      words = []
      for line in lines:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: self.word_counts[word] / total_words for word in self.vocabs}

  def _level_one_edits(self, word):
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l,r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r)>1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters] 

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def check(self, word):
    candidates = self._level_one_edits(word) or self._level_two_edits(word) or [word]
    valid_candidates = [w for w in candidates if w in self.vocabs]
    return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)

In [6]:
checker = SpellChecker("./english3.txt")

In [7]:
checker.check("haha")

[('hara', 5.141784713474047e-06),
 ('aha', 5.141784713474047e-06),
 ('taha', 5.141784713474047e-06),
 ('haka', 5.141784713474047e-06),
 ('hahs', 5.141784713474047e-06),
 ('hatha', 5.141784713474047e-06),
 ('hah', 5.141784713474047e-06)]

In [9]:
checker.check("chevkmate")

[('checkmate', 5.141784713474047e-06)]

In [10]:
checker.check("hgde")

[('hide', 5.141784713474047e-06),
 ('hade', 5.141784713474047e-06),
 ('hyde', 5.141784713474047e-06)]