## Spell Checker with NLTK

In [7]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jeffe\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [2]:
from nltk.corpus import words
from spellchecker import SpellChecker

# Load word list dataset
word_list = set(words.words())
spell = SpellChecker()

# Example captured letters
captured_text = ['W', 'O', 'R', 'C', 'D',]  # Simulate a misrecognized gesture

captured_word = ''.join(captured_text)  

# Check if it's a valid word
if captured_word.lower() in word_list:
    print(f"'{captured_word}' is a valid word!")
else:
    print(f"'{captured_word}' is not a valid word!")

    # Suggest correction
    corrected_word = spell.correction(captured_word)
    print(f"Suggested correction: {corrected_word}")


'WORCD' is not a valid word!
Suggested correction: world


# Joining Words in a sentence


In [3]:
# Load word list dataset
word_list = set(words.words())
spell = SpellChecker()

# Example captured letters
captured_sequences = ['W', 'O', 'R', 'C', 'D', ' ', 'I', 'S', ' ', 'G', 'R', 'E', 'B', 'T']  # Simulate a misrecognized gesture with spaces

# Helper function to check if a word is valid
def is_valid_word(word):
    return word.lower() in word_list

# Process the captured sequence
sentence = ""
current_word = ""

for letter in captured_sequences:
    if letter == ' ':
        # Check and correct the current word, then reset
        if current_word:
            if not is_valid_word(current_word):
                corrected_word = spell.correction(current_word.lower())
                sentence += corrected_word.lower() + " "  # Convert to lowercase and add space
                print(f"'{current_word}' is not a valid word. Corrected to '{corrected_word}'.")
            else:
                sentence += current_word.lower() + " "  # Add space after the valid word
            current_word = ""
    else:
        current_word += letter

# Check and correct the last word (if any)
if current_word:
    if not is_valid_word(current_word):
        corrected_word = spell.correction(current_word.lower())
        sentence += corrected_word.lower()  # Convert to lowercase
        print(f"'{current_word}' is not a valid word. Corrected to '{corrected_word}'.")
    else:
        sentence += current_word.lower()  # Convert to lowercase

print("\nFinal Sentence:", sentence.strip())



'WORCD' is not a valid word. Corrected to 'world'.
'GREBT' is not a valid word. Corrected to 'great'.

Final Sentence: world is great


### Correcting with context


In [4]:
from spellchecker import SpellChecker
from transformers import pipeline

# Initialize spell checker and fill-mask pipeline
spell = SpellChecker()
fill_mask = pipeline("fill-mask", model="bert-base-uncased")






BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [5]:
# Step 1: Initial spell correction
input_sentence = "Many issues come from a poor diet. For example, sweet can xffect your health."
words = input_sentence.split()

corrected_words = []
for word in words:
    if word.lower():
        # Spell correction step
        corrected = spell.correction(word)
        corrected_words.append(corrected if corrected else word)  # If no correction, keep original
    else:
        corrected_words.append(word)

print(corrected_words)

['Many', 'issues', 'come', 'from', 'a', 'poor', 'diet', 'For', 'example', 'sweet', 'can', 'effect', 'your', 'health']


In [6]:
## The mispealled word "xffect" was corrected to "effect" - "effect" is more common than "affect"

In [7]:
# Step 2: Contextual prediction on the sentence
context_sentence = " ".join(corrected_words).replace("effect", "[MASK]").replace("affect", "[MASK]")
predictions = fill_mask(context_sentence)

# Filter predictions for "affect" or "effect"
candidates = [pred["token_str"] for pred in predictions if pred["token_str"] in ["affect", "effect"]]

# Final sentence adjustment
if candidates:
    corrected_sentence = context_sentence.replace("[MASK]", candidates[0])
else:
    corrected_sentence = " ".join(corrected_words)  # If no relevant candidates, use spell-check output

print("Top candidates:", candidates)
print("Corrected Sentence:", corrected_sentence)

Top candidates: ['affect']
Corrected Sentence: Many issues come from a poor diet For example sweet can affect your health


In [8]:
## Manually replace "effect" for [MASK]

## Using BERT

In [18]:
from spellchecker import SpellChecker
from transformers import pipeline
import requests

# Initialize spell checker and fill-mask pipeline
spell = SpellChecker()
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Function to get confusing words from Datamuse API
#def get_confusing_words(word):
  #  url = f"https://api.datamuse.com/words?rel_hom={word}"
  #  response = requests.get(url)
  #  confusing_words = [item['word'] for item in response.json()]
  #  return confusing_words if confusing_words else [word]

# Step 1: Build Confusing Words Dictionary for Input Sentence
#def build_confusing_word_dict(text):
  #  words = set(text.split())  # Extract unique words
  #  confusing_word_dict = {}

  #  for word in words:
  #      confusing_word_dict[word] = get_confusing_words(word)

  #  return confusing_word_dict

# Step 2: Initial spell correction with Confusing Words Dictionary
input_sentence = "Many issues come from a poor diet. For example, sweet can xffect your health."
confusing_word_dict = build_confusing_word_dict(input_sentence)

corrected_words = []
for word in input_sentence.split():
    if word.lower() in confusing_word_dict:
        # Use spell correction and check against confusing words
        corrected = spell.correction(word)
        # If the corrected word is in confusing word dictionary, add to list
        if corrected and corrected in confusing_word_dict[word.lower()]:
            corrected_words.append(corrected)
        else:
            corrected_words.append(word)  # Keep original if no correction
    else:
        corrected_words.append(word)

# Step 3: Contextual prediction on the corrected sentence
context_sentence = " ".join(corrected_words).replace("xffect", "[MASK]")  # replace xffect with [MASK]

# Check if there's a [MASK] token in context_sentence before proceeding
if "[MASK]" in context_sentence:
    predictions = fill_mask(context_sentence)

    # Filter predictions for "affect" or "effect"
    candidates = [pred["token_str"] for pred in predictions if pred["token_str"] ]
                  

    # Final sentence adjustment
    if candidates:
        corrected_sentence = context_sentence.replace("[MASK]", candidates[0])
    else:
        corrected_sentence = " ".join(corrected_words)  # If no relevant candidates, use spell-check output
else:
    corrected_sentence = " ".join(corrected_words)  # If no [MASK] token, skip fill-mask

print("Top candidates:", candidates if 'candidates' in locals() else "No relevant candidates found")
print("Corrected Sentence:", corrected_sentence)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Top candidates: ['help', 'improve', 'affect', 'hurt', 'harm']
Corrected Sentence: Many issues come from a poor diet. For example, sweet can help your health.


In [12]:
## BERT is changing effect for help. It does not make sense.

In [19]:
# Step 2: Initial spell correction with Confusing Words Dictionary
input_sentence = "Too much sugar may xffect your health."
confusing_word_dict = build_confusing_word_dict(input_sentence)

corrected_words = []
for word in input_sentence.split():
    if word.lower() in confusing_word_dict:
        # Use spell correction and check against confusing words
        corrected = spell.correction(word)
        # If the corrected word is in confusing word dictionary, add to list
        if corrected and corrected in confusing_word_dict[word.lower()]:
            corrected_words.append(corrected)
        else:
            corrected_words.append(word)  # Keep original if no correction
    else:
        corrected_words.append(word)

# Step 3: Contextual prediction on the corrected sentence
context_sentence = " ".join(corrected_words).replace("xffect", "[MASK]")  # replace xffect with [MASK]

# Check if there's a [MASK] token in context_sentence before proceeding
if "[MASK]" in context_sentence:
    predictions = fill_mask(context_sentence)

    # Filter predictions for "affect" or "effect"
    candidates = [pred["token_str"] for pred in predictions if pred["token_str"] ]
                  

    # Final sentence adjustment
    if candidates:
        corrected_sentence = context_sentence.replace("[MASK]", candidates[0])
    else:
        corrected_sentence = " ".join(corrected_words)  # If no relevant candidates, use spell-check output
else:
    corrected_sentence = " ".join(corrected_words)  # If no [MASK] token, skip fill-mask

print("Top candidates:", candidates if 'candidates' in locals() else "No relevant candidates found")
print("Corrected Sentence:", corrected_sentence)

Top candidates: ['ruin', 'hurt', 'harm', 'affect', 'damage']
Corrected Sentence: Too much sugar may ruin your health.


In [None]:
## Changing the text may change completely the top candidates.

In [None]:
## Next steps: figure out what is happening behind to get these Top Candidates Results. 

# First example
# Top candidates: ['help', 'improve', 'affect', 'hurt', 'harm']

# Second example
# Top candidates: ['ruin', 'hurt', 'harm', 'affect', 'damage']

## Datamuse API to get homophones

In [21]:
# Fetch homophones from Datamuse API

import requests

# Fetch homophones from Datamuse API
def get_homophones(word):
    response = requests.get(f"https://api.datamuse.com/words?rel_hom={word}")
    homophones = [entry['word'] for entry in response.json()]
    return homophones

# Example usage
word = "effect"
homophones = get_homophones(word)
print(f"Commonly confused words for '{word}':", homophones)


word2 = "desert"
homophones = get_homophones(word2)
print(f"Commonly confused words for '{word2}':", homophones)

word3 = "base"
homophones = get_homophones(word3)
print(f"Commonly confused words for '{word3}':", homophones)

Commonly confused words for 'effect': ['affect']
Commonly confused words for 'desert': ['dessert']
Commonly confused words for 'base': ['bass']


In [24]:
import requests
from transformers import pipeline
from spellchecker import SpellChecker

# Initialize spell checker and fill-mask pipeline
spell = SpellChecker()
fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# Function to get homophones from Datamuse API
def get_homophones(word):
    response = requests.get(f"https://api.datamuse.com/words?rel_hom={word}")
    homophones = [entry['word'] for entry in response.json()]
    return homophones

# Function to check and correct words
def correct_sentence(sentence):
    words = sentence.split()
    corrected_sentence = []

    for word in words:
        # Check spelling
        if word.lower() in word_list:  # Assuming word_list is defined with valid words
            corrected_sentence.append(word)
        else:
            # Suggest correction
            corrected_word = spell.correction(word)
            corrected_sentence.append(corrected_word)

            # Get homophones
            homophones = get_homophones(corrected_word)
            if homophones:
                print(f"Potentially confusing words for '{corrected_word}': {homophones}")

                # Replace word with a mask for BERT
                masked_sentence = sentence.replace(word, "[MASK]")
                
                # Apply BERT to predict the best word
                predictions = fill_mask(masked_sentence)

                # Replace the mask with the best prediction
                if predictions:
                    best_candidate = predictions[0]['token_str']
                    corrected_sentence[-1] = best_candidate  # Replace last word with prediction

    return ' '.join(corrected_sentence)

# Example usage
input_sentence = "Many issues come from a poor diet. For example, sweet can xffect your health."
corrected = correct_sentence(input_sentence)
print("Corrected Sentence:", corrected)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Potentially confusing words for 'diet': ['die it', 'dye it']
Potentially confusing words for 'effect': ['affect']
Corrected Sentence: Many issues come from a poor . For example sweet can help your health


In [25]:
## Same results as before without Datamuse API.