# Context-sensitive Spelling Correction

### Student: Ivan Golov
### Email: i.golov@innopolis.university
### Group: AI-01

# Norvig's solution evaluation

I started working on the Assignment by evaluating the Norvig's solution and hightlighting the main drawbacks of it.

### Obtain the train language corpus using nltk.corpus import gutenberg, reuters, brown

In [None]:
import nltk
nltk.download('brown')
nltk.download('reuters')
nltk.download('gutenberg')
from nltk.corpus import gutenberg, reuters, brown

# Function to generate large corpus text
def generate_large_corpus():
    large_corpus_text = ""
    # List of all Gutenberg file IDs
    file_ids = gutenberg.fileids()

    # Generate the large corpus by combining all texts
    large_corpus_text = "\n".join(gutenberg.raw(file_id) for file_id in file_ids)

    # Add Reuters and Brown corpora to the large corpus
    reuters_text = " ".join(reuters.words())
    brown_text = " ".join(brown.words())

    large_corpus_text += f"\n{reuters_text}\n{brown_text}"

    return large_corpus_text

# Save the large corpus to a text file
def save_large_corpus(file_path, corpus_text):
    with open(file_path, "w") as file:
        file.write(corpus_text)

# Generate and save the large corpus
large_corpus_text = generate_large_corpus()
save_large_corpus("data/train/language_corpus.txt", large_corpus_text)
print("Large corpus generated and saved to 'data/train/language_corpus.txt'")

[nltk_data] Downloading package brown to /Users/ivangolov/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     /Users/ivangolov/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/ivangolov/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


### Implement the Norvig model

In [110]:
import re
from collections import Counter

In [111]:
class NorvigSpellingCorrector_v1:
    def __init__(self, corpus):
        self.WORDS = Counter(self.words(corpus))
        self.N = sum(self.WORDS.values())

    def words(self, text):
        return re.findall(r'\w+', text.lower())

    def P(self, word):
        "Probability of `word`."
        return self.WORDS[word] / self.N

    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.P)

    def candidates(self, word):
        "Generate possible spelling corrections for word."
        return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])

    def known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.WORDS)

    def edits1(self, word):
        "All edits that are one edit away from `word`."
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

### Train the model

In [112]:
model = NorvigSpellingCorrector_v1(large_corpus_text)

### Test set preparation

In [8]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import re

In [9]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ivangolov/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
test_data = ""
with open('data/test/test.txt') as f:
    test_data = f.read()

In [11]:
# Tokenize the document into sentences
sentences = sent_tokenize(test_data.lower())

In [18]:
import re

# Initialize lists to store the data for the DataFrame
original_sentences = []
misspelled_sentences = []
correct_words = []
misspelled_words = []

# Define a regex pattern to find the <ERR> tags
pattern = re.compile(r'<err targ=(.*?)>(.*?)</err>')

# Define a preprocessing function
def preprocess_sentence(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove punctuation
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Remove extra whitespaces
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    return sentence

# Process each sentence
for sentence in sentences:
    matches = pattern.findall(sentence)
    if matches:
        original_sentence = sentence
        misspelled_sentence = sentence
        correct_word_list = []
        misspelled_word_list = []
        
        for match in matches:
            correct_word = match[0]
            misspelled_word = match[1]
            misspelled_word_mew = match[1].replace(' ', '')
            correct_word_list.append(correct_word)
            misspelled_word_list.append(misspelled_word_mew)
            misspelled_sentence = misspelled_sentence.replace(f'<err targ={correct_word}>{misspelled_word}</err>', misspelled_word_mew)
            original_sentence = original_sentence.replace(f'<err targ={correct_word}>{misspelled_word}</err>', correct_word)
        
        # Apply preprocessing
        original_sentence = preprocess_sentence(original_sentence)
        misspelled_sentence = preprocess_sentence(misspelled_sentence)
        
        original_sentences.append(original_sentence)
        misspelled_sentences.append(misspelled_sentence)
        correct_words.append(correct_word_list)
        misspelled_words.append(misspelled_word_list)

In [19]:
# Construct the pandas DataFrame
df = pd.DataFrame({
    'Original Sentence': original_sentences,
    'Misspelled Sentence': misspelled_sentences,
    'Correct Words': correct_words,
    'Misspelled Words': misspelled_words
})

# Display the DataFrame
print(df.head())

# Save the DataFrame to a CSV file
df.to_csv('data/test/test_data_processed.csv', index=False)

                                   Original Sentence  \
0  1 nigel thrush page 48 i have four in my famil...   
1                          my sister goes to tonbury   
2                          my mum goes out sometimes   
3  i go to bridgebrook i go out sometimes on tues...   
4  on thursday nights i go bellringing on saturda...   

                                 Misspelled Sentence      Correct Words  \
0  1 nigel thrush page 48 i have four in my famil...           [sister]   
1                             my siter go to tonbury     [sister, goes]   
2                          my mum goes out sometimes        [sometimes]   
3  i go to bridgebrook i go out sometimes on tues...  [sometimes, club]   
4  on thursday nights i go bellringing on saturda...      [bellringing]   

    Misspelled Words  
0            [siter]  
1        [siter, go]  
2        [sometimes]  
3  [sometimes, clob]  
4      [bellringing]  


In [20]:
df.head(10)

Unnamed: 0,Original Sentence,Misspelled Sentence,Correct Words,Misspelled Words
0,1 nigel thrush page 48 i have four in my famil...,1 nigel thrush page 48 i have four in my famil...,[sister],[siter]
1,my sister goes to tonbury,my siter go to tonbury,"[sister, goes]","[siter, go]"
2,my mum goes out sometimes,my mum goes out sometimes,[sometimes],[sometimes]
3,i go to bridgebrook i go out sometimes on tues...,i go to bridgebrook i go out sometimes on tues...,"[sometimes, club]","[sometimes, clob]"
4,on thursday nights i go bellringing on saturda...,on thursday nights i go bellringing on saturda...,[bellringing],[bellringing]
5,i go to bed at 10 o clock i watch tv at 5 o cl...,i go to bed at 10 o clock i wakh tv at 5 o clo...,[watch],[wakh]
6,the house is white it has stone up the front i...,the house is white it has stone up the frount ...,"[front, second]","[frount, sexeon]"
7,on monday i sometimes go down the farm in the ...,on monday i sometimes go down the farm in the ...,[watch],[wach]
8,we have got anglia like to watch cowboys,we have got anglia like to wach cowboys,"[watch, cowboys]","[wach, cowboys]"
9,on tuesday i get off the bus and sometimes in ...,on tuesday i get off the bus and sometimes in ...,"[sometimes, club]","[sometimes, colbe]"


### Metrics computation

In [113]:
import pandas as pd
import numpy as np
import math
from nltk.metrics import edit_distance as Levenshtein
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import ast

# Define the functions to calculate WER and CER
def calculate_wer(reference, corrected):
    # Calculate Word Error Rate (WER)
    reference_words = reference.split()
    corrected_words = corrected.split()

    S = Levenshtein(reference_words, corrected_words)
    I = max(0, len(corrected_words) - len(reference_words))
    D = max(0, len(reference_words) - len(corrected_words))

    N = max(len(reference_words), len(corrected_words))

    wer = (S + I + D) / N

    return wer

def calculate_cer(reference, corrected):
    # Calculate Character Error Rate (CER)
    S = Levenshtein(reference, corrected)
    I = max(0, len(corrected) - len(reference))
    D = max(0, len(reference) - len(corrected))

    N = max(len(reference), len(corrected))

    cer = (S + I + D) / N

    return cer

# Define the function to calculate accuracy
def calculate_accuracy(df):
    correct_predictions = 0
    total_predictions = 0

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        correct_words = ast.literal_eval(row['Correct Words'])
        corrected_words = row['Corrected Words']
        for correct_word, corrected_word in zip(correct_words, corrected_words):
            if correct_word == corrected_word:
                correct_predictions += 1
            total_predictions += 1

    accuracy = correct_predictions / total_predictions
    return accuracy

def calculate_perplexity_norvig(sentence, model):
    words = model.words(sentence)
    log_prob = 0

    for word in words:
        prob = model.P(word)
        if prob > 0:
            log_prob += np.log2(prob)  
        else:
            log_prob += np.log2(1 / model.N) 

    HC = -log_prob / len(words)  # Cross-entropy
    perpl = math.pow(2, HC)  # Perplexity

    return HC, perpl

def correct_sentence(sentence, model):
    return ' '.join([model.correction(word) for word in model.words(sentence)])

def correct_words(words, model):
    return [model.correction(word.strip()) for word in eval(words)]

def compute_stats(df, model, tag):
    WER = []
    CER = []
    accuracy = 0
    perplexities = []
    HCs = []

    if tag == "Norvig":
        
        print("Compute the corrected sentence and words")
        with ThreadPoolExecutor() as executor:
            df['Corrected Sentence'] = list(tqdm(executor.map(lambda sentence: correct_sentence(sentence, model), df['Misspelled Sentence']), total=len(df)))
            df['Corrected Words'] = list(tqdm(executor.map(lambda words: correct_words(words, model), df['Misspelled Words']), total=len(df)))
        
        print("Compute the WER, CER and Perplexity")
        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            reference = row['Original Sentence']
            corrected = row['Corrected Sentence']
            wer = calculate_wer(reference, corrected)
            cer = calculate_cer(reference, corrected)
            WER.append(wer)
            CER.append(cer)
            HC, perplexity = calculate_perplexity_norvig(corrected, model)
            perplexities.append(perplexity)
            HCs.append(HC)
            
        
        print("Compute the accuracy")
        accuracy = calculate_accuracy(df)
            
    print('Average Word Error Rate (WER):', np.mean(WER))
    print('Average Character Error Rate (CER):', np.mean(CER))
    print(f"Accuracy: {accuracy}")
    print('Average Perplexity:', np.mean(perplexities))
    print('Average Cross-Entropy:', np.mean(HCs))
    
    return df

In [114]:
import pandas as pd
test_df = pd.read_csv('data/test/test_data_processed.csv')

### Collect statistics

In [115]:
updated = compute_stats(test_df.copy(), model, "Norvig")

Compute the corrected sentence and words


100%|██████████| 666/666 [00:36<00:00, 18.09it/s] 
100%|██████████| 666/666 [00:23<00:00, 28.19it/s] 


Compute the WER, CER and Perplexity


100%|██████████| 666/666 [00:28<00:00, 22.97it/s]


Compute the accuracy


100%|██████████| 666/666 [00:00<00:00, 10956.08it/s]

Average Word Error Rate (WER): 0.12568611368448648
Average Character Error Rate (CER): 0.06691358406401805
Accuracy: 0.2058484032320123
Average Perplexity: 5198.027378935151
Average Cross-Entropy: 10.281997326281457





In [116]:
updated.head(10)

Unnamed: 0,Original Sentence,Misspelled Sentence,Correct Words,Misspelled Words,Corrected Sentence,Corrected Words
0,1 nigel thrush page 48 i have four in my famil...,1 nigel thrush page 48 i have four in my famil...,['sister'],['siter'],1 nigel thrush page 48 i have four in my famil...,[sister]
1,my sister goes to tonbury,my siter go to tonbury,"['sister', 'goes']","['siter', 'go']",my sister go to tilbury,"[sister, go]"
2,my mum goes out sometimes,my mum goes out sometimes,['sometimes'],['sometimes'],my mum goes out sometimes,[sometimes]
3,i go to bridgebrook i go out sometimes on tues...,i go to bridgebrook i go out sometimes on tues...,"['sometimes', 'club']","['sometimes', 'clob']",i go to bridgebrook i go out sometimes on tues...,"[sometimes, club]"
4,on thursday nights i go bellringing on saturda...,on thursday nights i go bellringing on saturda...,['bellringing'],['bellringing'],on thursday nights i go bellringing on saturda...,[bellringing]
5,i go to bed at 10 o clock i watch tv at 5 o cl...,i go to bed at 10 o clock i wakh tv at 5 o clo...,['watch'],['wakh'],i go to bed at 10 o clock i wash tv at 5 o clo...,[wash]
6,the house is white it has stone up the front i...,the house is white it has stone up the frount ...,"['front', 'second']","['frount', 'sexeon']",the house is white it has stone up the front i...,"[front, sexton]"
7,on monday i sometimes go down the farm in the ...,on monday i sometimes go down the farm in the ...,['watch'],['wach'],on monday i sometimes go down the farm in the ...,[each]
8,we have got anglia like to watch cowboys,we have got anglia like to wach cowboys,"['watch', 'cowboys']","['wach', 'cowboys']",we have got anglia like to each cowboys,"[each, cowboys]"
9,on tuesday i get off the bus and sometimes in ...,on tuesday i get off the bus and sometimes in ...,"['sometimes', 'club']","['sometimes', 'colbe']",on tuesday i get off the bus and sometimes in ...,"[sometimes, cole]"


### The Norvig spelling corrector model v1
* Average Word Error Rate (WER): 0.12577995752833032
* Average Character Error Rate (CER): 0.06695512816430119
* Accuracy: 0.2058484032320123
* Average Perplexity: 5198.027378935151
* Average Cross-Entropy: 10.281997326281457

bla bla bla

# My solution

### **Improvement №1** (add advanced text predprocessing, exclude rare words and add <UNK> token):

In [118]:
class NorvigSpellingCorrector_v2:
    def __init__(self, corpus):
        self.WORDS = self.preprocess_corpus(corpus)
        self.N = sum(self.WORDS.values())
        
    def preprocess_corpus(self, corpus):
        corpus = corpus.lower()

        corpus = re.sub(r'[^a-z0-9\s]', '', corpus)

        tokens = self.words(corpus)  

        word_counts = Counter(tokens)
        
        threshold = 10
        vocab = {word for word, count in word_counts.items() if count >= threshold}

        tokens = [token if token in vocab else "<UNK>" for token in tokens]

        word_counts = Counter(tokens)
    
        return word_counts 

    def words(self, text):
        return re.findall(r'\w+', text.lower())

    def P(self, word):
        "Probability of `word`."
        return self.WORDS[word] / self.N

    def correction(self, word):
        "Most probable spelling correction for word."
        return max(self.candidates(word), key=self.P)

    def candidates(self, word):
        "Generate possible spelling corrections for word."
        return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])

    def known(self, words):
        "The subset of `words` that appear in the dictionary of WORDS."
        return set(w for w in words if w in self.WORDS)

    def edits1(self, word):
        "All edits that are one edit away from `word`."
        letters    = 'abcdefghijklmnopqrstuvwxyz'
        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
        inserts    = [L + c + R               for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        "All edits that are two edits away from `word`."
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

### Load data and train model

In [119]:
large_corpus_text = ""
with open('data/train/language_corpus.txt') as f:
    large_corpus_text = f.read()

In [120]:
model = NorvigSpellingCorrector_v2(large_corpus_text)

### Compute metrics

In [121]:
updated = compute_stats(test_df.copy(), model, "Norvig")

Compute the corrected sentence and words


100%|██████████| 666/666 [00:52<00:00, 12.70it/s]
100%|██████████| 666/666 [00:31<00:00, 21.31it/s]


Compute the WER, CER and Perplexity


100%|██████████| 666/666 [00:29<00:00, 22.96it/s]


Compute the accuracy


100%|██████████| 666/666 [00:00<00:00, 9620.36it/s]

Average Word Error Rate (WER): 0.12965260361783013
Average Character Error Rate (CER): 0.06938579384714089
Accuracy: 0.22008464794151597
Average Perplexity: 4751.986770267538
Average Cross-Entropy: 10.065830773255565





In [122]:
updated.head(10)

Unnamed: 0,Original Sentence,Misspelled Sentence,Correct Words,Misspelled Words,Corrected Sentence,Corrected Words
0,1 nigel thrush page 48 i have four in my famil...,1 nigel thrush page 48 i have four in my famil...,['sister'],['siter'],1 nigel thrust page 48 i have four in my famil...,[sister]
1,my sister goes to tonbury,my siter go to tonbury,"['sister', 'goes']","['siter', 'go']",my sister go to tonbury,"[sister, go]"
2,my mum goes out sometimes,my mum goes out sometimes,['sometimes'],['sometimes'],my sum goes out sometimes,[sometimes]
3,i go to bridgebrook i go out sometimes on tues...,i go to bridgebrook i go out sometimes on tues...,"['sometimes', 'club']","['sometimes', 'clob']",i go to bridgebrook i go out sometimes on tues...,"[sometimes, club]"
4,on thursday nights i go bellringing on saturda...,on thursday nights i go bellringing on saturda...,['bellringing'],['bellringing'],on thursday nights i go bellringing on saturda...,[bellringing]
5,i go to bed at 10 o clock i watch tv at 5 o cl...,i go to bed at 10 o clock i wakh tv at 5 o clo...,['watch'],['wakh'],i go to bed at 10 o clock i wash tv at 5 o clo...,[wash]
6,the house is white it has stone up the front i...,the house is white it has stone up the frount ...,"['front', 'second']","['frount', 'sexeon']",the house is white it has stone up the front i...,"[front, seen]"
7,on monday i sometimes go down the farm in the ...,on monday i sometimes go down the farm in the ...,['watch'],['wach'],on monday i sometimes go down the farm in the ...,[each]
8,we have got anglia like to watch cowboys,we have got anglia like to wach cowboys,"['watch', 'cowboys']","['wach', 'cowboys']",we have got angle like to each cowboy,"[each, cowboy]"
9,on tuesday i get off the bus and sometimes in ...,on tuesday i get off the bus and sometimes in ...,"['sometimes', 'club']","['sometimes', 'colbe']",on tuesday i get off the bus and sometimes in ...,"[sometimes, cole]"


### The Norvig spelling corrector model v2
* Average Word Error Rate (WER): 0.12965260361783013
* Average Character Error Rate (CER): 0.06938579384714089
* Accuracy: 0.22008464794151597
* Average Perplexity: 4751.986770267538
* Average Cross-Entropy: 10.065830773255565

bla bla bla

### **Improvement №2** (Add the notion of context using N-gram models):

### Load and predprocess the data

In [123]:
large_corpus_text = ""
with open('data/train/language_corpus.txt') as f:
    large_corpus_text = f.read()

In [142]:
def preprocess_corpus(corpus):
        corpus = corpus.lower()

        corpus = re.sub(r'[^a-z0-9\s]', '', corpus)

        tokens = re.findall(r'\w+', corpus) 

        word_counts = Counter(tokens)
        
        threshold = 10
        vocab = {word for word, count in word_counts.items() if count >= threshold}

        tokens = [token if token in vocab else "<UNK>" for token in tokens]


        return tokens, vocab

corpus_words, vocab = preprocess_corpus(large_corpus_text)

In [145]:
def preprocess_sentences(sentences, vocab):
    preprocessed_sentences = []
    for sentence in tqdm(sentences, total=len(sentences)):
        sentence = sentence.lower()
        sentence = re.sub(r'[^a-z0-9\s]', '', sentence)
        tokens = re.findall(r'\w+', sentence)
        preprocessed_sentence = [token if token in vocab else "<UNK>" for token in tokens]
        preprocessed_sentences.append(preprocessed_sentence)
    return preprocessed_sentences

In [151]:
corpus_sentences = nltk.sent_tokenize(large_corpus_text)

In [153]:
corpus_sentences_predprocessed = preprocess_sentences(corpus_sentences, vocab)

100%|██████████| 248174/248174 [00:07<00:00, 33215.54it/s]


### Compute the N-gram stats

In [126]:
from collections import defaultdict
from tqdm import tqdm

def build_ngram_counts(words, max_order):
    """
    Build n-gram counts for orders 1 through max_order.
    For unigrams, keys are one-element tuples.
    """
    ngram_counts = {i: defaultdict(int) for i in range(1, max_order + 1)}

    for i in tqdm(range(len(words)), desc="Building n-gram counts"):
        for order in range(1, max_order + 1):
            if i + order <= len(words):
                gram = tuple(words[i:i + order])
                ngram_counts[order][gram] += 1
    
    return ngram_counts

ngram_counts = build_ngram_counts(corpus_words, 3)

Building n-gram counts: 100%|██████████| 4624690/4624690 [00:14<00:00, 313352.48it/s]


### Bigram probability model

In [137]:
# Compute the bigram probabilities
def compute_bigram_probabilities(w1, w2, ngram_counts, alpha):
    """
    Computes smoothed bigram probability:
      P(w2|w1) = (C(w1, w2) + alpha) / (C(w1) + alpha * |V|)
    """
    bigram_count = ngram_counts[2][(w1, w2)]
    unigram_count = ngram_counts[1][(w1,)]
    
    V = len(ngram_counts[1])
    
    bigram_probability = (bigram_count + alpha) / (unigram_count + alpha * V)
    
    return bigram_probability

In [168]:
def compute_perplexity_bigram(sentences, ngram_counts, alpha):
    """
    Compute the perplexity of the validation set using the bigram model.
    """
    total_log_prob = 0
    total_words = 0

    for sentence in tqdm(sentences, desc="Computing Perplexity"):
        for i in range(1, len(sentence)):
            w1 = sentence[i - 1]
            w2 = sentence[i]
            bigram_prob = compute_bigram_probabilities(w1, w2, ngram_counts, alpha)
            log_prob = np.log2(bigram_prob)
            total_log_prob += log_prob
            total_words += 1

    HC = -total_log_prob / total_words
    perplexity = math.pow(2, HC)

    return HC, perplexity

### Trigram probability model

In [138]:
# Compute the trigram probabilities
def compute_trigram_probabilities(w1, w2, w3, ngram_counts, alpha):
    """
    Computes smoothed trigram probability:
      P(w3|w1, w2) = (C(w1, w2, w3) + alpha) / (C(w1, w2) + alpha * |V|)
    """
    trigram_count = ngram_counts[3][(w1, w2, w3)]
    bigram_count = ngram_counts[2][(w1, w2)]
    
    V = len(ngram_counts[1])
    
    trigram_probability = (trigram_count + alpha) / (bigram_count + alpha * V)
    
    return trigram_probability

In [169]:
def compute_perplexity_trigram(sentences, ngram_counts, alpha):
    """
    Compute the perplexity of the validation set using the trigram model.
    """
    total_log_prob = 0
    total_words = 0

    for sentence in tqdm(sentences, desc="Computing Perplexity"):
        for i in range(2, len(sentence)):
            w1 = sentence[i - 2]
            w2 = sentence[i - 1]
            w3 = sentence[i]
            trigram_prob = compute_trigram_probabilities(w1, w2, w3, ngram_counts, alpha)
            log_prob = np.log2(trigram_prob)
            total_log_prob += log_prob
            total_words += 1

    HC = -total_log_prob / total_words
    perplexity = math.pow(2, HC)

    return HC, perplexity

### Interpolated bi-gram and tri-gram model

In [139]:
def compute_interpolated_prob(w1, w2, w3, ngram_counts, alpha, lamda):
    """
    Computes the interpolated probability:
      P(w3|w1,w2) = lam * P_trigram(w3|w1,w2) + (1 - lam) * P_bigram(w3|w2)
    where the bigram probability is computed as:
      P(w3|w2) = (C(w2, w3) + alpha) / (C(w2) + alpha * |V|)
    """
    # Trigram probability
    p_trigram = compute_trigram_probabilities(w1, w2, w3, ngram_counts, alpha)
    
    # Bigram probability
    p_bigram = compute_bigram_probabilities(w2, w3, ngram_counts, alpha)
    
    # Interpolated probability
    interpolated_prob = lamda * p_trigram + (1 - lamda) * p_bigram
    
    return interpolated_prob

In [170]:
def compute_perplexity_interpolated(sentences, ngram_counts, alpha, lamda):
    """
    Compute the perplexity of the validation set using the interpolated model.
    """
    total_log_prob = 0
    total_words = 0

    for sentence in tqdm(sentences, desc="Computing Perplexity"):
        for i in range(2, len(sentence)):
            w1 = sentence[i - 2]
            w2 = sentence[i - 1]
            w3 = sentence[i]
            interpolated_prob = compute_interpolated_prob(w1, w2, w3, ngram_counts, alpha, lamda)
            log_prob = np.log2(interpolated_prob)
            total_log_prob += log_prob
            total_words += 1

    HC = -total_log_prob / total_words
    perplexity = math.pow(2, HC)

    return HC, perplexity

### Tuning of hyperparameters

In [171]:
# Tune the hyperparameters for the bigram LM
validation_set = corpus_sentences[:10000]
alpha_values = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 1.0]
lambda_values = [0.1, 0.2, 0.3, 0.5, 0.7, 0.9]

best_bigram_params=None
best_bigram_ce = float('inf')
print("Tuning hyperparameters for the bigram LM:")
for alpha in (alpha_values):
    HC, perpl = compute_perplexity_bigram(validation_set, ngram_counts, alpha)
    print(f"Alpha: {alpha}, Cross-Entropy: {HC}, Perplexity: {perpl}")
    if HC < best_bigram_ce:
        best_bigram_ce = HC
        best_bigram_params = (alpha, HC, perpl)
        
# Tune the hyperparameters for the trigram LM
best_trigram_params=None
best_trigram_ce = float('inf')
print("Tuning hyperparameters for the trigram LM:")
for alpha in (alpha_values):
    HC, perpl = compute_perplexity_trigram(validation_set, ngram_counts, alpha)
    print(f"Alpha: {alpha}, Cross-Entropy: {HC}, Perplexity: {perpl}")
    if HC < best_trigram_ce:
        best_trigram_ce = HC
        best_trigram_params = (alpha, HC, perpl)
        
# Tune the hyperparameters for the interpolated LM
best_interpolated_params=None
best_interpolated_ce = float('inf')
print("Tuning hyperparameters for the interpolated LM:")
for alpha in (alpha_values):
    for lamda in lambda_values:
        HC, perpl = compute_perplexity_interpolated(validation_set, ngram_counts, alpha, lamda)
        print(f"Alpha: {alpha}, Lambda: {lamda}, Cross-Entropy: {HC}, Perplexity: {perpl}")
        if HC < best_interpolated_ce:
            best_interpolated_ce = HC
            best_interpolated_params = (alpha, lamda, HC, perpl)


Tuning hyperparameters for the bigram LM:


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5044.64it/s]


Alpha: 0.0001, Cross-Entropy: 15.571766524590013, Perplexity: 48704.46861441161


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5151.46it/s]


Alpha: 0.001, Cross-Entropy: 14.330228241264058, Perplexity: 20598.16559884154


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5164.93it/s]


Alpha: 0.01, Cross-Entropy: 13.369777434082627, Perplexity: 10585.321255172768


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5141.30it/s]


Alpha: 0.1, Cross-Entropy: 13.225124444839032, Perplexity: 9575.449149327107


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4936.56it/s]


Alpha: 0.2, Cross-Entropy: 13.332784876705192, Perplexity: 10317.350239208281


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5026.52it/s]


Alpha: 0.3, Cross-Entropy: 13.413407114210472, Perplexity: 10910.32996801804


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5213.24it/s]


Alpha: 0.5, Cross-Entropy: 13.525765811997951, Perplexity: 11794.002723707026


Computing Perplexity: 100%|██████████| 10000/10000 [00:01<00:00, 5114.70it/s]


Alpha: 1.0, Cross-Entropy: 13.684862012267859, Perplexity: 13169.03500911066
Tuning hyperparameters for the trigram LM:


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4065.17it/s]


Alpha: 0.0001, Cross-Entropy: 14.620505569248602, Perplexity: 25188.98849920805


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4396.27it/s]


Alpha: 0.001, Cross-Entropy: 14.206140808074235, Perplexity: 18900.55277083627


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4244.91it/s]


Alpha: 0.01, Cross-Entropy: 14.159908223480418, Perplexity: 18304.46800107179


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4483.69it/s]


Alpha: 0.1, Cross-Entropy: 14.215964158776348, Perplexity: 19029.686298875287


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4257.95it/s]


Alpha: 0.2, Cross-Entropy: 14.236179841975247, Perplexity: 19298.215691654135


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4436.92it/s]


Alpha: 0.3, Cross-Entropy: 14.247777104562108, Perplexity: 19453.971710877144


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4391.10it/s]


Alpha: 0.5, Cross-Entropy: 14.261719897904568, Perplexity: 19642.89427053955


Computing Perplexity: 100%|██████████| 10000/10000 [00:02<00:00, 4448.30it/s]


Alpha: 1.0, Cross-Entropy: 14.278660100343172, Perplexity: 19874.90164292532
Tuning hyperparameters for the interpolated LM:


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3003.33it/s]


Alpha: 0.0001, Lambda: 0.1, Cross-Entropy: 13.74841975635381, Perplexity: 13762.16434002628


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3085.07it/s]


Alpha: 0.0001, Lambda: 0.2, Cross-Entropy: 13.421845623129277, Perplexity: 10974.332889969628


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3076.62it/s]


Alpha: 0.0001, Lambda: 0.3, Cross-Entropy: 13.251782886276027, Perplexity: 9754.031298146649


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3071.61it/s]


Alpha: 0.0001, Lambda: 0.5, Cross-Entropy: 13.104681697956275, Perplexity: 8808.506410006923


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3106.43it/s]


Alpha: 0.0001, Lambda: 0.7, Cross-Entropy: 13.126402758739374, Perplexity: 8942.129716459292


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3051.85it/s]


Alpha: 0.0001, Lambda: 0.9, Cross-Entropy: 13.420482816570608, Perplexity: 10963.971149767824


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2952.60it/s]


Alpha: 0.001, Lambda: 0.1, Cross-Entropy: 13.388771662904029, Perplexity: 10725.606896728907


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3092.87it/s]


Alpha: 0.001, Lambda: 0.2, Cross-Entropy: 13.133015829305382, Perplexity: 8983.213017329448


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3108.10it/s]


Alpha: 0.001, Lambda: 0.3, Cross-Entropy: 12.990919827534746, Perplexity: 8140.602319926089


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3092.85it/s]


Alpha: 0.001, Lambda: 0.5, Cross-Entropy: 12.865048054124225, Perplexity: 7460.455769840212


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3108.35it/s]


Alpha: 0.001, Lambda: 0.7, Cross-Entropy: 12.889988452461294, Perplexity: 7590.5486469191455


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2993.29it/s]


Alpha: 0.001, Lambda: 0.9, Cross-Entropy: 13.170092411617906, Perplexity: 9217.069485687693


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3098.93it/s]


Alpha: 0.01, Lambda: 0.1, Cross-Entropy: 13.054219315058122, Perplexity: 8505.72982676796


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2903.07it/s]


Alpha: 0.01, Lambda: 0.2, Cross-Entropy: 12.95266768545199, Perplexity: 7927.595768650709


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2895.11it/s]


Alpha: 0.01, Lambda: 0.3, Cross-Entropy: 12.898792610216251, Perplexity: 7637.012184923165


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3069.01it/s]


Alpha: 0.01, Lambda: 0.5, Cross-Entropy: 12.874492605207696, Perplexity: 7509.455587400954


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2707.68it/s]


Alpha: 0.01, Lambda: 0.7, Cross-Entropy: 12.956232928853806, Perplexity: 7947.210974771716


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3135.24it/s]


Alpha: 0.01, Lambda: 0.9, Cross-Entropy: 13.264261707383362, Perplexity: 9838.766285362026


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3074.05it/s]


Alpha: 0.1, Lambda: 0.1, Cross-Entropy: 13.171630429860954, Perplexity: 9226.900794201996


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3035.82it/s]


Alpha: 0.1, Lambda: 0.2, Cross-Entropy: 13.165096248539044, Perplexity: 9185.205276259956


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3094.58it/s]


Alpha: 0.1, Lambda: 0.3, Cross-Entropy: 13.176780061465879, Perplexity: 9259.894629423752


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3134.30it/s]


Alpha: 0.1, Lambda: 0.5, Cross-Entropy: 13.243434492534057, Perplexity: 9697.750975674371


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2921.69it/s]


Alpha: 0.1, Lambda: 0.7, Cross-Entropy: 13.381695482825537, Perplexity: 10673.128376878805


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2810.90it/s]


Alpha: 0.1, Lambda: 0.9, Cross-Entropy: 13.688399665414488, Perplexity: 13201.366612424476


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3093.42it/s]


Alpha: 0.2, Lambda: 0.1, Cross-Entropy: 13.313993041498236, Perplexity: 10183.832968208546


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3032.68it/s]


Alpha: 0.2, Lambda: 0.2, Cross-Entropy: 13.322878081900287, Perplexity: 10246.745064096416


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2722.21it/s]


Alpha: 0.2, Lambda: 0.3, Cross-Entropy: 13.344439653685741, Perplexity: 10401.036276166316


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3096.49it/s]


Alpha: 0.2, Lambda: 0.5, Cross-Entropy: 13.422026880278523, Perplexity: 10975.711768527679


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2755.93it/s]


Alpha: 0.2, Lambda: 0.7, Cross-Entropy: 13.560104693680929, Perplexity: 12078.089880640742


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3124.12it/s]


Alpha: 0.2, Lambda: 0.9, Cross-Entropy: 13.836544304785765, Perplexity: 14629.007820328075


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3059.63it/s]


Alpha: 0.3, Lambda: 0.1, Cross-Entropy: 13.408380719477208, Perplexity: 10872.38417705044


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3082.23it/s]


Alpha: 0.3, Lambda: 0.2, Cross-Entropy: 13.423816716442913, Perplexity: 10989.336904925754


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3075.38it/s]


Alpha: 0.3, Lambda: 0.3, Cross-Entropy: 13.449282648171627, Perplexity: 11185.03885910331


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2978.48it/s]


Alpha: 0.3, Lambda: 0.5, Cross-Entropy: 13.529699897031351, Perplexity: 11826.207679021836


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3116.57it/s]


Alpha: 0.3, Lambda: 0.7, Cross-Entropy: 13.66336565997831, Perplexity: 12974.269218489815


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3077.34it/s]


Alpha: 0.3, Lambda: 0.9, Cross-Entropy: 13.915883410354882, Perplexity: 15456.04319086476


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3131.74it/s]


Alpha: 0.5, Lambda: 0.1, Cross-Entropy: 13.532514455118235, Perplexity: 11849.301983139223


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3095.41it/s]


Alpha: 0.5, Lambda: 0.2, Cross-Entropy: 13.553571800189744, Perplexity: 12023.520833735003


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3046.43it/s]


Alpha: 0.5, Lambda: 0.3, Cross-Entropy: 13.581963683365942, Perplexity: 12262.48442318979


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3097.51it/s]


Alpha: 0.5, Lambda: 0.5, Cross-Entropy: 13.662285258617828, Perplexity: 12964.55672183208


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3107.96it/s]


Alpha: 0.5, Lambda: 0.7, Cross-Entropy: 13.786314718599488, Perplexity: 14128.44157281574


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3093.16it/s]


Alpha: 0.5, Lambda: 0.9, Cross-Entropy: 14.004445186616572, Perplexity: 16434.559717363


Computing Perplexity: 100%|██████████| 10000/10000 [00:04<00:00, 2428.89it/s]


Alpha: 1.0, Lambda: 0.1, Cross-Entropy: 13.699871746504796, Perplexity: 13306.760257644217


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2692.94it/s]


Alpha: 1.0, Lambda: 0.2, Cross-Entropy: 13.724104298991048, Perplexity: 13532.15792823409


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 3016.67it/s]


Alpha: 1.0, Lambda: 0.3, Cross-Entropy: 13.753030375680103, Perplexity: 13806.216338878241


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2884.19it/s]


Alpha: 1.0, Lambda: 0.5, Cross-Entropy: 13.82730146463481, Perplexity: 14535.584492750848


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2976.21it/s]


Alpha: 1.0, Lambda: 0.7, Cross-Entropy: 13.932713511762609, Perplexity: 15637.40513465185


Computing Perplexity: 100%|██████████| 10000/10000 [00:03<00:00, 2822.45it/s]

Alpha: 1.0, Lambda: 0.9, Cross-Entropy: 14.101701637133313, Perplexity: 17580.66031082511





In [172]:
# Print best paraneters
print("Best hyperparameters for the bigram LM:")
print(f"Alpha: {best_bigram_params[0]}, Cross-Entropy: {best_bigram_params[1]}, Perplexity: {best_bigram_params[2]}")
print("Best hyperparameters for the trigram LM:")
print(f"Alpha: {best_trigram_params[0]}, Cross-Entropy: {best_trigram_params[1]}, Perplexity: {best_trigram_params[2]}")
print("Best hyperparameters for the interpolated LM:")
print(f"Alpha: {best_interpolated_params[0]}, Lambda: {best_interpolated_params[1]}, Cross-Entropy: {best_interpolated_params[2]}, Perplexity: {best_interpolated_params[3]}")


Best hyperparameters for the bigram LM:
Alpha: 0.1, Cross-Entropy: 13.225124444839032, Perplexity: 9575.449149327107
Best hyperparameters for the trigram LM:
Alpha: 0.01, Cross-Entropy: 14.159908223480418, Perplexity: 18304.46800107179
Best hyperparameters for the interpolated LM:
Alpha: 0.001, Lambda: 0.5, Cross-Entropy: 12.865048054124225, Perplexity: 7460.455769840212


In [173]:
# save the best hyperparameters
best_hyperparameters = {
    "bigram": best_bigram_params,
    "trigram": best_trigram_params,
    "interpolated": best_interpolated_params
}

import json

with open('data/train/best_hyperparameters.json', 'w') as f:
    json.dump(best_hyperparameters, f)

# Solution №2 N-gram model based on the Google Books n-gram API

In [50]:
import re
import requests
import urllib

class ContextualSpellingCorrector:
    def __init__(self, vocabulary):
        self.vocabulary = set(vocabulary)

    def run_query(self, query, start_year=2010, end_year=2019, corpus=26, smoothing=3):
        """Fetches frequency data from the Google Books Ngram API."""
        query = urllib.parse.quote(query)
        url = f'https://books.google.com/ngrams/json?content={query}&year_start={start_year}&year_end={end_year}&corpus={corpus}&smoothing={smoothing}'
        response = requests.get(url)
        
        try:
            output = response.json()
        except:
            return {}

        if not output:
            return {}

        return {entry['ngram']: sum(entry['timeseries']) / len(entry['timeseries']) for entry in output}

    def average_frequency(self, phrase):
        """Gets the average frequency of a word or n-gram phrase from Google Ngrams."""
        data = self.run_query(phrase)
        return sum(data.values()) / len(data) if data else 0

    def words(self, text):
        """Tokenizes and lowercases the input text."""
        return re.findall(r'\w+', text.lower())

    def correction_with_context(self, word, context_window):
        """Finds the best spelling correction by considering context-based n-gram probabilities."""
        candidates = self.candidates(word)
        
        # Generate n-grams with the surrounding context
        context_phrases = {candidate: self.form_context_phrases(candidate, context_window) for candidate in candidates}

        # Get frequencies for each candidate within its context
        context_frequencies = {
            candidate: sum(self.average_frequency(phrase) for phrase in phrases)
            for candidate, phrases in context_phrases.items()
        }

        return max(context_frequencies, key=context_frequencies.get)  # Return the best correction

    def form_context_phrases(self, candidate, context_window):
        """Forms bigram and trigram phrases including the candidate."""
        left_context, right_context = context_window
        phrases = []

        if left_context:
            phrases.append(f"{left_context} {candidate}")
        if right_context:
            phrases.append(f"{candidate} {right_context}")
        if left_context and right_context:
            phrases.append(f"{left_context} {candidate} {right_context}")

        return phrases

    def candidates(self, word):
        """Generates possible spelling corrections based on known words."""
        return (self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or [word])

    def known(self, words):
        """Filters words that exist in the vocabulary."""
        return set(w for w in words if w in self.vocabulary)

    def edits1(self, word):
        """Generates possible single-edit variations of a word."""
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """Generates possible double-edit variations of a word."""
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

# Load corpus and build vocabulary
with open("data/train/large_corpus.txt") as f:
    train_corpus = f.read()

vocabulary = set(re.findall(r'\w+', train_corpus.lower()))
vocabulary.update(["<START>", "<END>"])  # Add special tokens

# Initialize corrector with updated vocabulary
corrector = ContextualSpellingCorrector(vocabulary)

In [None]:
# Initialize corrector
corrector = ContextualSpellingCorrector(vocabulary)

# Example usage with context-aware spelling correction
sentence = ["this", "is", "a", "speling", "error"]
corrected_list = []

for i, word in enumerate(sentence):
    left_context = sentence[i - 1] if i > 0 else "<START>"
    right_context = sentence[i + 1] if i < len(sentence) - 1 else "<END>"
    
    corrected_word = corrector.correction_with_context(word, (left_context, right_context))
    corrected_list.append(corrected_word)

corrected_sentence = " ".join(corrected_list)
print(f"Corrected sentence: {corrected_sentence}")

In [None]:
# Process each sentence
corrected_sentences = []
corrected_words_list = []

for index, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    misspelled_sentence = word_tokenize(row["Misspelled Sentence"])  # Tokenize sentence
    corrected_words = []  # Store corrected words
    
    corrected_sentence = []
    for i, word in enumerate(misspelled_sentence):
        left_context = misspelled_sentence[i - 1] if i > 0 else "<START>"
        right_context = misspelled_sentence[i + 1] if i < len(misspelled_sentence) - 1 else "<END>"

        corrected_word = corrector.correction_with_context(word, (left_context, right_context))
        corrected_sentence.append(corrected_word)
        
        # If the word was corrected, add it to the corrected words list
        if corrected_word != word:
            corrected_words.append((word, corrected_word))

    # Store results
    corrected_sentences.append(" ".join(corrected_sentence))
    corrected_words_list.append(corrected_words)
    break

# Add results to DataFrame
print(corrected_sentences)
print(corrected_words_list)