In [2]:
import re
import numpy as np
import pandas as pd

from collections import Counter


# Preprocess the data

In [5]:
# Switch the working directory to the project folder so that the relative
# data path used later in the notebook resolves.
# NOTE(review): absolute path, presumably a mounted Google Drive on Colab — adjust when running elsewhere.
%cd /content/drive/MyDrive/Colab Notebooks/nlp/apps/autocorrect

/content/drive/MyDrive/Colab Notebooks/nlp/apps/autocorrect


In [14]:
# Location of the corpus text file, relative to the current working directory
path = 'data/sherlock_novels.txt'

In [12]:
def preprocessing(filename, encoding="utf-8"):
    """
    Takes a txt file and returns the words in the file
    Args:
        filename: path to the file
        encoding: text encoding used to decode the file (default: "utf-8")
    return:
        words: list containing the lowercased words in the file,
               in order of appearance
    """
    # Decode with an explicit encoding instead of the platform's locale
    # default, so the same file yields the same words on every machine.
    with open(filename, encoding=encoding) as f:
        text = f.read()

    # \w: matches any word character (letters, digits, underscore)
    # +: one or more occurrences
    # Lowercasing first makes the tokens case-insensitive.
    words = re.findall(r'\w+', text.lower())
    return words

#sherlock_words = preprocessing('data/sherlock_novels.txt')
#sherlock_words[:10]
#len(sherlock_words)


In [15]:
def get_count(words):
    """
    Tallies how many times each word occurs in a corpus
    Args:
        words: list of words
    return:
        word_count: Counter (a dict subclass) mapping each word
                    to its number of occurrences
    """
    return Counter(words)

# Read the corpus file and split it into a list of lowercase words
sherlock = preprocessing(path)
# Count the occurrences of every word
word_count = get_count(sherlock)
# Number of times 'sherlock' appears (shown as the cell output)
word_count['sherlock']


406

# Calculate the probability of each word

Given the `word_count` dictionary, compute the probability of each word, i.e. the chance that it is picked when a word is drawn at random from the corpus.

In [16]:
def probabilities(word_count):
    """
    Converts raw word counts into relative frequencies
    Args:
        word_count: dict mapping each word to its count
    returns:
        probs: dict mapping each word to its count divided by
               the sum of all counts
    """
    corpus_size = sum(word_count.values())
    return {token: freq / corpus_size for token, freq in word_count.items()}

# Turn the raw word counts into relative frequencies
probs = probabilities(word_count)
# Relative frequency of 'sherlock' in the corpus (shown as the cell output)
probs['sherlock']

0.0006077908002029962

# Add the edit functions

- delete_letter
- switch_letter
- replace_letter
- insert_letter

In [22]:
def delete_letter(word):
    """
    Builds every string obtainable by removing exactly one
    character from the word
    Args:
        word: str
    returns:
        del_list: list with one entry per character position, ordered
                  by the position of the removed character
    """
    # Dropping position idx means joining what comes before it with what
    # comes after it. For 'sherlock' this gives:
    # ['herlock', 'serlock', 'shrlock', 'shelock', 'sherock', 'sherlck', 'sherlok', 'sherloc']
    return [word[:idx] + word[idx + 1:] for idx in range(len(word))]

def switch_letter(word):
    """
    Builds every string obtainable by swapping one pair of
    adjacent characters in the word
    Args:
        word: str
    returns:
        switches: list with one entry per adjacent pair, ordered by the
                  position of the pair's first character
    """
    switches = []
    # The last character has no right-hand neighbour, so stop one short.
    for idx in range(len(word) - 1):
        head = word[:idx]
        tail = word[idx + 2:]
        # For 'sherlock' this produces:
        # ['hserlock', 'sehrlock', 'shrelock', 'shelrock', 'sherolck', 'sherlcok', 'sherlokc']
        switches.append(head + word[idx + 1] + word[idx] + tail)

    return switches
