In this short notebook, we show how easy it is to build a typing-error (typo) corrector based on similarity measured with the Levenshtein distance.
This code can be used, for example, as a step in text data preprocessing.

In [1]:
# installing the jellyfish library to compute levenshtein distance
! pip install jellyfish

Collecting jellyfish
  Downloading jellyfish-0.8.8.tar.gz (134 kB)
[?25l[K     |██▍                             | 10 kB 24.5 MB/s eta 0:00:01[K     |████▉                           | 20 kB 8.5 MB/s eta 0:00:01[K     |███████▎                        | 30 kB 5.5 MB/s eta 0:00:01[K     |█████████▊                      | 40 kB 5.4 MB/s eta 0:00:01[K     |████████████▏                   | 51 kB 2.7 MB/s eta 0:00:01[K     |██████████████▋                 | 61 kB 2.9 MB/s eta 0:00:01[K     |█████████████████               | 71 kB 3.0 MB/s eta 0:00:01[K     |███████████████████▌            | 81 kB 3.4 MB/s eta 0:00:01[K     |██████████████████████          | 92 kB 3.6 MB/s eta 0:00:01[K     |████████████████████████▍       | 102 kB 2.8 MB/s eta 0:00:01[K     |██████████████████████████▉     | 112 kB 2.8 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 122 kB 2.8 MB/s eta 0:00:01[K     |███████████████████████████████▊| 133 kB 2.8 MB/s eta 0:00:01[K     |██

In [3]:
# reference vocabulary: a list of known words (here company names, which could
# for example be a list of customers or providers)
customers_list = [
  'apple',
  'facebook',
  'microsoft',
  'google',
  'amazon',
  'ibm',
]

In [7]:
import jellyfish

In [54]:
# now let's define a function that computes the Levenshtein distance between a word and every element of a list, and then returns the closest word in that list
def word_corrector(word, words_list):
  """Return the entry of ``words_list`` that is closest to ``word``.

  Closeness is the Levenshtein (edit) distance computed by
  ``jellyfish.levenshtein_distance``. When several candidates are equally
  close, the one that appears first in ``words_list`` is returned.

  Args:
    word: the (possibly misspelled) string to correct.
    words_list: iterable of reference strings to choose from.

  Returns:
    The candidate with the smallest distance to ``word``.

  Raises:
    ValueError: if ``words_list`` contains no candidate.
  """
  # materialize once so that any iterable is accepted and emptiness can be
  # checked before calling min()
  candidates = list(words_list)
  if not candidates:
    raise ValueError(
        "words_list must contain at least one word to correct {!r} against".format(word))

  # min() returns the first of several equally close candidates
  return min(candidates,
             key=lambda candidate: jellyfish.levenshtein_distance(word, candidate))

In [55]:
# example 1: a single wrong letter ('u' typed instead of 'a')
word_corrector(word='fucebook', words_list=customers_list)

'facebook'

In [13]:
# example 2: a wrong first letter and a missing 'p'
word_corrector(word='uple', words_list=customers_list)

'apple'

In [56]:
# we could also have returned the whole list of words, sorted from the closest to the farthest
def word_corrector_v2(word, words_list):
  """Rank the entries of ``words_list`` from closest to farthest from ``word``.

  The distance used is ``jellyfish.levenshtein_distance``. Repeated entries
  appear only once in the result, and entries at the same distance keep the
  order in which they first appear in ``words_list``.

  Args:
    word: the (possibly misspelled) string to compare against.
    words_list: iterable of reference strings.

  Returns:
    A new list of the distinct reference strings, sorted by increasing distance.
  """
  # map every candidate to its Levenshtein distance from word
  distance_by_candidate = {}
  for candidate in words_list:
    distance_by_candidate[candidate] = jellyfish.levenshtein_distance(word, candidate)

  # list.sort is stable, so equally distant candidates keep their first-seen order
  ranking = list(distance_by_candidate)
  ranking.sort(key=distance_by_candidate.get)
  return ranking

In [58]:
# rank every known name by its distance to the misspelled 'abm'
word_corrector_v2(word='abm', words_list=customers_list)

['ibm', 'apple', 'amazon', 'facebook', 'google', 'microsoft']

In [62]:
# or we could set a threshold on the Levenshtein distance and return only the words whose distance is strictly below that threshold. This can be useful when the list contains several quite similar words
def word_corrector_v3(word, words_list, threshold):
  """Return the entries of ``words_list`` closer to ``word`` than ``threshold``.

  The distance used is ``jellyfish.levenshtein_distance``. Only candidates
  whose distance is strictly smaller than ``threshold`` are kept; they are
  returned sorted from the closest to the farthest. Repeated entries appear
  only once, and entries at the same distance keep the order in which they
  first appear in ``words_list``.

  Args:
    word: the (possibly misspelled) string to compare against.
    words_list: iterable of reference strings.
    threshold: exclusive upper bound on the accepted distance.

  Returns:
    A list of the matching reference strings, sorted by increasing distance
    (empty when no candidate is close enough).
  """
  # keep only the candidates under the threshold; each distance is computed
  # once and reused for both the filter and the sort key
  dict_distances = {}
  for candidate in words_list:
    distance = jellyfish.levenshtein_distance(word, candidate)
    if distance < threshold:
      dict_distances[candidate] = distance

  return sorted(dict_distances, key=dict_distances.get)

In [67]:
# keep only the known names whose distance to 'ggle' is below 4
word_corrector_v3(word='ggle', words_list=customers_list, threshold=4)

['google', 'apple']