# Spell Correction

## Preprocessing Textual Data

In [224]:
import neattext as nt
import pandas as pd
import neattext.functions as nfx

### Extracting texts only

In [207]:
data = pd.read_csv("data/articles.csv")
data.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [209]:
data = data["text"].values

In [212]:
type(data[0])

str

In [216]:
texts = "".join(data)

In [221]:
texts[:100]

'Oh, how the headlines blared:\nChatbots were The Next Big Thing.\nOur hopes were sky high. Bright-eyed'

In [231]:
check = texts.split()
print(check[:10])

['Oh,', 'how', 'the', 'headlines', 'blared:', 'Chatbots', 'were', 'The', 'Next', 'Big']


In [240]:
cleaned_text = ""

In [241]:
for i in check:
    cleaned_text += nfx.clean_text(i, stopwords=False, puncts=True, urls=True, numbers=True, special_char=True,
                                  phone_num=True, non_ascii=True, multiple_whitespaces=False, contractions=True,
                                  currency_symbols=True) + " "

In [242]:
print(cleaned_text[-100:])

how how much you enjoyed this story reader writer and a programmer sharing concepts ideas and codes 


In [243]:
with open("data/processed.txt", "w") as file:
    file.write(cleaned_text)
file.close()

## Extracting Words

In [244]:
path = "data/processed.txt"

In [59]:
class WordExtractor:
    """
    Read txt file and extract words with frequency.
    """

    def __init__(self):
        self.words = dict()

    def extract_words(self, filename):
        """
        extract words from filename
        """
        with open(filename, "r") as file:
            for line in file.readlines():
                words = line.split()
                for word in words:
                    self.words[word] = 1 if word not in self.words else self.words[word] + 1


In [60]:
texts = WordExtractor()
texts.extract_words(path)

In [61]:
words = texts.words

In [62]:
len(words)

286225

## Modeling Spell checker

In [294]:
class SpellingCheck:
    """
    Check if the spelling is correct or incorrect
    If incorrect with it will correct it.
    """

    def __init__(self, filename=None):
        """
        Pass the text file path for text extraction.
        """
        self.words_frequency = dict()
        self.filename = filename
        self.total_words = 0
        self.letters = "abcdefghijklmnopqrstuvwxyz"

    def extract_words(self, filename=None):
        """
        extract words from filename
        """
        if not filename and not self.filename:
            return "Please Enter text file path"
        if filename:
            self.filename = filename
        with open(self.filename, "r") as text_file:
            for line in text_file.readlines():
                line_words = line.split()
                for word in line_words:
                    if word not in self.words_frequency:
                        self.words_frequency[word] = [1]
                    else:
                        self.words_frequency[word][0] += 1
                    self.total_words += 1
        self.__calculate_probability__()

    def __calculate_probability__(self):
        """
        Calculate probability when picking equally.
        """
        for key, val in self.words_frequency.items():
            if len(val) == 1:
                self.words_frequency[key].append(val[0] / self.total_words)
            else:
                self.words_frequency[key][1] = val[0] / self.total_words  # incase used twice


    def __missing__(self, word):
        """
        Generate random words considering there's a missing letter in between.
        """
        lists = []
        for i in range(len(word)):
            lists.append(word[:i] + word[i + 1:])
        return lists

    def __swapped__(self, word):
        """
        Generate text swapping adjacent.
        """
        lists = []
        for i in range(len(word) - 1):
            lists.append(word[:i] + word[i + 1] + word[i] + word[i + 2:])
        return lists

    def __miss_typed__(self, word):
        """
        Generate word for missing a letter.
        """
        lists = []
        for i in range(len(word) + 1):
            for j in self.letters:
                lists.append(word[:i] + j + word[i + 1:])
        return lists

    def __extra_typed__(self, word):
        """
        Generate word with an extra letter.
        """
        lists = []
        for i in range(len(word) + 1):
            for j in self.letters:
                lists.append(word[:i] + j + word[i:])
        return lists

    def __level_one_edits__(self, word):
        """
        Generates sets of all the possible mistakes.
        """
        return set(
            self.__miss_typed__(word) + self.__missing__(word) + self.__extra_typed__(word) + self.__swapped__(word))

    def __level_two_edits__(self, word):
        """
        two level mistakes.
        """
        return set(j for i in self.__level_one_edits__(word) for j in self.__level_one_edits__(i))

    def possibility(self, word):
        if word in self.words_frequency:
            return word

        pos = []
        can_be = self.__level_two_edits__(word).union(self.__level_one_edits__(word))
        for i in can_be:
            if i in self.words_frequency:
                pos.append((i, self.words_frequency[i][1]))
        return pos

    def edit_cost(self, word, predicted):
        """
        Calculate editing cost for changing the letters
        """
        n, m = len(word), len(predicted)
        dp = [[1000] * (m + 1) for j in range(n + 1)]
        for i in range(0, n + 1):
            for j in range(0, m + 1):
                if not i and not j:
                    dp[i][j] = 0
                elif not i:
                    dp[i][j] = j
                elif not j:
                    dp[i][j] = i
                else:
                    if word[i - 1] == predicted[j - 1]:
                        dp[i][j] = dp[i - 1][j - 1]
                    else:
                        dp[i][j] = min(dp[i - 1][j] + 3, dp[i - 1][j - 1] + 1,
                                           dp[i][j - 1] + 2)
        return dp[n][m]

    def check_spell(self, word):
        """
        Checks if the word is incorrect
        """
        if word in self.words_frequency:
            return word
        pos = self.possibility(word)
        corrected = ""
        min_cost = 1000
        prob = 0.0

        for i in pos:
            current_cost = self.edit_cost(word, i[0])
            print(i)
            if current_cost < min_cost:
                corrected = i[0]
                prob = i[1]
            elif current_cost == min_cost and prob < i[1]:
                prob = i[1]
                corrected = i[0]
        return corrected

In [286]:
test = SpellingCheck()
test.extract_words(path)

In [293]:
test.check_spell("lovvv")

('love', 0.0001789985842839243)
('loves', 4.881779571379754e-06)
('lover', 1.789985842839243e-05)
('loved', 8.136299285632923e-06)


'loved'

# END