In [1]:
import re 
import numpy as np 
from collections import Counter 
import string

In [2]:
def read_corpus(filename):
    """Return every word token found in a text file, lower-cased, in file order.

    Each line is lower-cased and scanned for maximal runs of regex word
    characters, so punctuation and whitespace act as separators.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of lower-cased tokens (duplicates kept).
    """
    tokens = []
    with open(filename, 'r') as corpus:
        # Stream the file one line at a time instead of loading all lines first.
        for raw_line in corpus:
            tokens.extend(re.findall(r'\w+', raw_line.lower()))
    return tokens

In [3]:
# Tokenise the corpus file into a flat list of lower-cased words.
words = read_corpus('corpus.txt')
print(f"Total number of words in the corpus is {len(words)}")

Total number of words in the corpus is 929396


In [4]:
# De-duplicate the tokens; this set is later passed to correct_spelling as its vocabulary.
vocab = set(words)
print(f"Total number of unique words in the corpus is {len(vocab)}")

Total number of unique words in the corpus is 23902


In [5]:
# Occurrence count for each distinct word; spot-check a single entry.
word_count = Counter(words)
print(word_count['journey'])

29


In [6]:
# Convert raw counts into relative frequencies: each word's share of all tokens.
total_word_count = float(sum(word_count.values()))
word_probas = {word: count / total_word_count for word, count in word_count.items()}

In [7]:
# Relative frequency of 'journey': its count divided by the total token count.
print(word_probas['journey'])

3.120306091267877e-05


In [8]:
def split(word):
  """Enumerate every way to cut ``word`` into a (prefix, suffix) pair.

  Cut points run from 0 to len(word) inclusive, so the first pair has an
  empty prefix and the last pair has an empty suffix.
  """
  pairs = []
  for cut in range(len(word) + 1):
    pairs.append((word[:cut], word[cut:]))
  return pairs

In [9]:
# Every (prefix, suffix) cut of the word, including the two cuts with an empty side.
print(split("journey"))

[('', 'journey'), ('j', 'ourney'), ('jo', 'urney'), ('jou', 'rney'), ('jour', 'ney'), ('journ', 'ey'), ('journe', 'y'), ('journey', '')]


In [10]:
def delete(word):
  """List the strings obtained by removing exactly one character from ``word``.

  Results are ordered by the position of the removed character.
  """
  shortened = []
  for head, tail in split(word):
    if not tail:
      continue  # the final cut has an empty suffix: nothing to remove
    shortened.append(head + tail[1:])
  return shortened

In [11]:
# Each result drops exactly one character of the word.
print(delete("journey"))

['ourney', 'jurney', 'jorney', 'jouney', 'jourey', 'journy', 'journe']


In [12]:
def swap(word):
  """List the strings obtained by transposing one pair of adjacent characters.

  Results are ordered by the position of the swapped pair.
  """
  transposed = []
  for head, tail in split(word):
    if len(tail) < 2:
      continue  # need two characters after the cut to exchange them
    transposed.append(head + tail[1] + tail[0] + tail[2:])
  return transposed

In [13]:
# Each result transposes one adjacent pair of characters.
print(swap("journey"))

['ojurney', 'juorney', 'joruney', 'jounrey', 'joureny', 'journye']


In [14]:
# The alphabet that replace() and insert() draw their candidate letters from.
string.ascii_lowercase

'abcdefghijklmnopqrstuvwxyz'

In [15]:
def replace(word):
  """List the strings obtained by overwriting one character with each of a-z.

  Every position yields 26 results in alphabet order. Because a letter may be
  replaced by itself, a lower-case ASCII word appears in its own result list.
  """
  alphabet = string.ascii_lowercase
  substituted = []
  for head, tail in split(word):
    if tail:
      rest = tail[1:]
      for letter in alphabet:
        substituted.append(head + letter + rest)
  return substituted

In [16]:
# One-letter substitutions; the original word reappears whenever a letter is replaced by itself.
print(replace("journey"))

['aourney', 'bourney', 'courney', 'dourney', 'eourney', 'fourney', 'gourney', 'hourney', 'iourney', 'journey', 'kourney', 'lourney', 'mourney', 'nourney', 'oourney', 'pourney', 'qourney', 'rourney', 'sourney', 'tourney', 'uourney', 'vourney', 'wourney', 'xourney', 'yourney', 'zourney', 'jaurney', 'jburney', 'jcurney', 'jdurney', 'jeurney', 'jfurney', 'jgurney', 'jhurney', 'jiurney', 'jjurney', 'jkurney', 'jlurney', 'jmurney', 'jnurney', 'journey', 'jpurney', 'jqurney', 'jrurney', 'jsurney', 'jturney', 'juurney', 'jvurney', 'jwurney', 'jxurney', 'jyurney', 'jzurney', 'joarney', 'jobrney', 'jocrney', 'jodrney', 'joerney', 'jofrney', 'jogrney', 'johrney', 'joirney', 'jojrney', 'jokrney', 'jolrney', 'jomrney', 'jonrney', 'joorney', 'joprney', 'joqrney', 'jorrney', 'josrney', 'jotrney', 'journey', 'jovrney', 'jowrney', 'joxrney', 'joyrney', 'jozrney', 'jouaney', 'joubney', 'joucney', 'joudney', 'joueney', 'joufney', 'jougney', 'jouhney', 'jouiney', 'joujney', 'joukney', 'joulney', 'joumney'

In [17]:
def insert(word):
  """List the strings obtained by inserting one of a-z at every position.

  Positions include before the first and after the last character, so a word
  of length n yields 26 * (n + 1) results (duplicates are not removed).
  """
  alphabet = string.ascii_lowercase
  lengthened = []
  for head, tail in split(word):
    for letter in alphabet:
      lengthened.append(head + letter + tail)
  return lengthened

In [18]:
# One-letter insertions at every position, including before the first and after the last character.
print(insert("journey"))

['ajourney', 'bjourney', 'cjourney', 'djourney', 'ejourney', 'fjourney', 'gjourney', 'hjourney', 'ijourney', 'jjourney', 'kjourney', 'ljourney', 'mjourney', 'njourney', 'ojourney', 'pjourney', 'qjourney', 'rjourney', 'sjourney', 'tjourney', 'ujourney', 'vjourney', 'wjourney', 'xjourney', 'yjourney', 'zjourney', 'jaourney', 'jbourney', 'jcourney', 'jdourney', 'jeourney', 'jfourney', 'jgourney', 'jhourney', 'jiourney', 'jjourney', 'jkourney', 'jlourney', 'jmourney', 'jnourney', 'joourney', 'jpourney', 'jqourney', 'jrourney', 'jsourney', 'jtourney', 'juourney', 'jvourney', 'jwourney', 'jxourney', 'jyourney', 'jzourney', 'joaurney', 'joburney', 'jocurney', 'jodurney', 'joeurney', 'jofurney', 'jogurney', 'johurney', 'joiurney', 'jojurney', 'jokurney', 'jolurney', 'jomurney', 'jonurney', 'joourney', 'jopurney', 'joqurney', 'jorurney', 'josurney', 'joturney', 'jouurney', 'jovurney', 'jowurney', 'joxurney', 'joyurney', 'jozurney', 'jouarney', 'joubrney', 'joucrney', 'joudrney', 'jouerney', 'jo

In [19]:
def edit1(word):
  """Return the set of strings one edit away from ``word``.

  An edit is a single deletion, adjacent transposition, substitution or
  insertion; the results of all four generators are merged and de-duplicated.
  """
  candidates = set(delete(word))
  for generate in (swap, replace, insert):
    candidates.update(generate(word))
  return candidates

In [20]:
# Union of the delete, swap, replace and insert candidates, de-duplicated into a set.
print(edit1("journey"))

{'joruney', 'joumney', 'jwurney', 'juorney', 'journeyc', 'dourney', 'uourney', 'jgourney', 'juourney', 'jofurney', 'xourney', 'journes', 'journey', 'jsourney', 'jourtney', 'cjourney', 'jpourney', 'jaurney', 'jolrney', 'journef', 'jouxrney', 'jojrney', 'jovrney', 'jourhney', 'pourney', 'journqy', 'journiey', 'jourtey', 'mourney', 'journry', 'zjourney', 'aourney', 'journen', 'joubney', 'journex', 'journiy', 'journeyn', 'zourney', 'jkurney', 'journesy', 'joarney', 'journgey', 'journev', 'jtourney', 'journeh', 'jxurney', 'jouyrney', 'joturney', 'jqurney', 'joubrney', 'jouirney', 'eourney', 'joumrney', 'journed', 'journeky', 'jocurney', 'journhy', 'jjourney', 'jouriey', 'jgurney', 'vjourney', 'jourrney', 'journedy', 'jouiney', 'journegy', 'journeyx', 'rjourney', 'jourqey', 'ojourney', 'journjey', 'joufney', 'hjourney', 'jouraey', 'journy', 'journqey', 'journdey', 'jourxney', 'jokurney', 'iourney', 'joujrney', 'journet', 'journvey', 'jrurney', 'jfourney', 'journxey', 'jzurney', 'joupney', 'j

In [21]:
def edit2(word):
  """Return the set of strings reachable by applying ``edit1`` twice.

  Each one-edit variant of ``word`` is expanded with ``edit1`` again and all
  results are merged into a single set.
  """
  reachable = set()
  for once_edited in edit1(word):
    reachable.update(edit1(once_edited))
  return reachable

In [22]:
# Everything reachable by applying edit1 to each of the word's edit1 results.
print(edit2("journey"))

{'joudrnky', 'jojrneyo', 'ojurley', 'jopulney', 'jolrjey', 'joulrfey', 'jiyourney', 'jourkeq', 'joxrneym', 'journleyx', 'journwpey', 'jofurneny', 'jfournqy', 'johney', 'jjouurney', 'jouireney', 'jourdxy', 'jdourneym', 'journueyu', 'jotdney', 'oodurney', 'jowrnfy', 'jouknxy', 'xourneyu', 'sournxey', 'joucrxey', 'jovurndey', 'jrournhey', 'joucrngy', 'kjoucrney', 'journeoo', 'jourineby', 'fjosrney', 'jiourtey', 'jomkurney', 'jounrnen', 'oonrney', 'sjoruney', 'journoyu', 'jousprney', 'jouinecy', 'gjournely', 'ljouxrney', 'jourqneyy', 'jpurneyv', 'joournky', 'jtournej', 'jiurneyb', 'jourvnxey', 'jourreyq', 'yourneq', 'jzoumrney', 'joffrney', 'jogurneyk', 'jourfneb', 'jourhney', 'jourfneo', 'ojouraney', 'ffourney', 'tjomurney', 'xjoeurney', 'jnouarney', 'jofrnei', 'jouereney', 'fjourngy', 'tnjourney', 'jdournea', 'jourynuy', 'jowurnpey', 'jwurnegy', 'jmusney', 'ojoeurney', 'juoreny', 'joufneys', 'jourlnegy', 'joquriey', 'geurney', 'jmprney', 'jonxney', 'yjiurney', 'jodney', 'cjoucney', 'joub

In [23]:
def correct_spelling(word, vocabulary, word_probabilities):
  """Suggest vocabulary words that are close to a (possibly misspelt) word.

  Candidates one edit away are preferred; candidates two edits away are only
  considered when no one-edit candidate is in the vocabulary.

  Args:
    word: The word to check.
    vocabulary: Container of known words (supports ``in``).
    word_probabilities: Mapping from each known word to its probability.

  Returns:
    None (after printing a message) when ``word`` is already in the
    vocabulary; otherwise a list of ``(candidate, probability)`` tuples,
    which is empty when no known word lies within two edits.
  """
  if word in vocabulary:
    print(f"{word} is already correctly spelt")
    return

  # edit1() is never empty (there are always insertions), so the candidate
  # sets must be filtered by vocabulary *before* deciding whether to fall
  # back to two-edit candidates; otherwise the fallback is unreachable.
  best_guesses = [w for w in edit1(word) if w in vocabulary]
  if not best_guesses:
    best_guesses = [w for w in edit2(word) if w in vocabulary]
  return [(w, word_probabilities[w]) for w in best_guesses]

In [24]:
# Demo: look up suggestions for a misspelling and pick the most probable one.
word = "famile"
corrections = correct_spelling(word, vocab, word_probas)

# correct_spelling returns None for a known word, so guard before indexing.
if corrections:
  probs = np.array([probability for _candidate, probability in corrections])
  best_ix = np.argmax(probs)  # index of the highest-probability suggestion
  correct = corrections[best_ix][0]
  print(corrections)
  print(f"{correct} is suggested for {word}")

[('family', 8.607740941428627e-06), ('famine', 1.291161141214294e-05), ('facile', 1.0759676176785783e-06)]
famine is suggested for famile


In [25]:
class SpellChecker(object):
  """Corpus-based spelling suggester built on one- and two-edit candidates.

  The corpus is tokenised into lower-cased words; each distinct word's
  probability is its share of all tokens. ``check`` proposes known words that
  are one edit away from the input, falling back to two edits when none exist.
  """

  def __init__(self, corpus_file_path):
    """Read the corpus file and build the vocabulary, counts and probabilities.

    Args:
      corpus_file_path: Path of the text file used as the word corpus.
    """
    words = []
    with open(corpus_file_path, "r") as file:
      # Stream line by line rather than materialising every line first.
      for line in file:
        words += re.findall(r'\w+', line.lower())

    self.vocabs = set(words)
    self.word_counts = Counter(words)
    total_words = float(sum(self.word_counts.values()))
    self.word_probas = {word: count / total_words for word, count in self.word_counts.items()}

  def _level_one_edits(self, word):
    """Return the set of strings one delete/swap/replace/insert away from ``word``."""
    letters = string.ascii_lowercase
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
    replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
    inserts = [l + c + r for l, r in splits for c in letters]

    return set(deletes + swaps + replaces + inserts)

  def _level_two_edits(self, word):
    """Return the set of strings reachable by two successive level-one edits."""
    return set(e2 for e1 in self._level_one_edits(word) for e2 in self._level_one_edits(e1))

  def _known(self, candidates):
    """Keep only the candidates that occur in the corpus vocabulary."""
    return [w for w in candidates if w in self.vocabs]

  def check(self, word):
    """Return known words near ``word`` as (word, probability), most probable first.

    Level-one candidates are used when any of them is in the vocabulary;
    otherwise level-two candidates are tried. The result is an empty list
    when no known word lies within two edits.
    """
    # The raw level-one set is never empty (insertions always exist), so the
    # fallback decision has to be made on the *filtered* candidates.
    valid_candidates = self._known(self._level_one_edits(word))
    if not valid_candidates:
      valid_candidates = self._known(self._level_two_edits(word))
    return sorted([(c, self.word_probas[c]) for c in valid_candidates], key=lambda tup: tup[1], reverse=True)


In [26]:
# Build the checker from the same corpus file read earlier in the notebook.
checker = SpellChecker("corpus.txt")

In [27]:
# Known candidates for the misspelling, sorted by corpus probability (highest first).
checker.check("sentense")

[('sentence', 4.1962737089464553e-05)]