In [None]:
import re
import collections
from itertools import product
from typing import List, Tuple, Dict

In [None]:
import re
import collections
from typing import List, Tuple, Dict

class SpellChecker:
    """Context-aware spelling corrector driven by a bigram language model.

    The model is built once from a raw text corpus. Words that are not in
    the corpus vocabulary are replaced by the in-vocabulary word at edit
    distance 1 that is most probable given the preceding word.
    """

    def __init__(self, corpus: str):
        """Build the vocabulary and n-gram counts from ``corpus``."""
        self.vocabulary       = set()
        self.bigramFrequency  = {}
        self.unigramFrequency = {}
        self._buildLanguageModel(corpus)

    def _buildLanguageModel(self, corpus: str):
        """Populate vocabulary, unigram counts and bigram counts.

        The corpus is lower-cased and split into ``\\w+`` tokens; bigrams are
        formed from consecutive tokens across the whole corpus.
        """
        words                 = re.findall(r'\b\w+\b', corpus.lower())
        self.vocabulary       = set(words)
        self.unigramFrequency = collections.Counter(words)
        self.bigramFrequency  = collections.Counter(zip(words, words[1:]))

    def _edits1(self, word: str) -> set:
        """Return every string one edit away from ``word``.

        An edit is a single-character deletion, adjacent transposition,
        replacement, or insertion, using the lowercase ASCII alphabet.
        """
        letters     = 'abcdefghijklmnopqrstuvwxyz'
        splits      = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes     = [L + R[1:] for L, R in splits if R]
        transposes  = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces    = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts     = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def _bigramProbability(self, candidate: str, previous: str) -> float:
        """Return the add-one smoothed estimate of P(candidate | previous).

        Computed as (count(previous, candidate) + 1) / (count(previous) + V),
        where V is the number of distinct words seen in the corpus.
        """
        return (self.bigramFrequency.get((previous, candidate), 0) + 1) / \
               (self.unigramFrequency.get(previous, 0) + len(self.unigramFrequency))

    def _suggestCorrection(self, word: str, previous: str) -> str:
        """Return the best in-vocabulary word one edit away from ``word``.

        Candidates are ranked by bigram probability given ``previous``; ties
        are broken by unigram frequency and then alphabetically, so the
        result does not depend on set iteration order. If no candidate is in
        the vocabulary, ``word`` is returned unchanged.
        """
        candidates = [w for w in self._edits1(word) if w in self.vocabulary]
        if not candidates:
            return word

        def score(w: str):
            # When `previous` was never seen (e.g. the '<s>' start marker),
            # every candidate gets the same smoothed bigram probability, so
            # the unigram count decides.
            return (self._bigramProbability(w, previous),
                    self.unigramFrequency.get(w, 0))

        # max() keeps the first maximal element; sorting first makes that the
        # alphabetically smallest one on a full tie.
        return max(sorted(candidates), key=score)

    def correct(self, text: str) -> str:
        """Return ``text`` lower-cased, tokenized and spell-corrected.

        Tokens already in the vocabulary are kept. Other tokens are replaced
        using the previous *output* word as context, so a corrected
        predecessor still gives useful bigram evidence. Punctuation is
        dropped and the tokens are joined with single spaces.
        """
        tokens = re.findall(r'\b\w+\b', text.lower())
        corrected = []
        for word in tokens:
            if word in self.vocabulary:
                corrected.append(word)
            else:
                # Use the corrected predecessor, not the raw (possibly
                # misspelled) token, as the bigram context.
                previous = corrected[-1] if corrected else '<s>'
                corrected.append(self._suggestCorrection(word, previous))
        return ' '.join(corrected)


# Demo: build the spell checker from the corpus file, then correct a sample
# sentence that contains several one-edit misspellings.
inputText  = "The qick bron foxe jump over the lazi dog."
with open('/content/corpus.txt', mode='r', encoding='utf-8') as f:
    corpus = f.read()

corrector  = SpellChecker(corpus)
correctedText  = corrector.correct(inputText)
print("Corrected Text:", correctedText)


['the', 'qick', 'bron', 'foxe', 'jump', 'over', 'the', 'lazi', 'dog']
Corrected Text: the quick brown fox jumps over the lazy dog


In [None]:
import re
import math
from collections import Counter, defaultdict
from typing import List, Tuple

class SentimentClassifier:
    """Multinomial Naive Bayes text classifier with add-k smoothing."""

    def __init__(self, k: float = 1.0):
        """Create an untrained classifier.

        Args:
            k: Smoothing constant added to every word count.
        """
        self.k = k
        self.vocabulary = set()
        self.classWordFrequency = defaultdict(Counter)  # Word counts per class
        self.classdocFrequency = defaultdict(int)       # Document count per class
        self.classTotalWords = defaultdict(int)      # Total words per class
        self.totalDocs = 0                            # Total number of documents

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case ``text`` and split it into ``\\w+`` tokens."""
        return re.findall(r'\b\w+\b', text.lower())

    def train(self, data: List[Tuple[str, str]]):
        """Accumulate counts from ``(text, label)`` pairs.

        May be called repeatedly; counts add up across calls.
        """
        for text, label in data:
            words = self._tokenize(text)
            self.vocabulary.update(words)
            self.classWordFrequency[label].update(words)
            self.classdocFrequency[label] += 1
            self.classTotalWords[label] += len(words)
            self.totalDocs += 1

    def _wordProbability(self, word: str, label: str) -> float:
        """Return the add-k smoothed estimate of P(word | label).

        Computed as (count(word, label) + k) / (words_in_label + k * V),
        where V is the vocabulary size over all classes.
        """
        # Use .get() so that querying an unseen label does not insert empty
        # entries into the defaultdicts, which would add a phantom class.
        classCounts   = self.classWordFrequency.get(label)
        wordFrequency = classCounts[word] if classCounts is not None else 0
        totalWords    = self.classTotalWords.get(label, 0)
        vocabSize     = len(self.vocabulary)
        return (wordFrequency + self.k) / (totalWords + self.k * vocabSize)

    def _classProbability(self, label: str) -> float:
        """Return the prior P(label): the share of training docs with it."""
        return self.classdocFrequency.get(label, 0) / self.totalDocs

    def predict(self, text: str) -> str:
        """Return the label with the highest posterior log-probability.

        Raises:
            ValueError: If the classifier has not been trained yet.
        """
        if not self.classWordFrequency:
            raise ValueError("predict() called before train(): no classes learned")
        words = self._tokenize(text)
        classProb = {}
        for label in self.classWordFrequency:
            # Sum in log space to avoid underflow from multiplying many
            # small probabilities.
            logProb = math.log(self._classProbability(label))
            for word in words:
                logProb += math.log(self._wordProbability(word, label))
            classProb[label] = logProb
        return max(classProb, key=classProb.get)

    def evaluate(self, test: List[Tuple[str, str]]) -> float:
        """Return the fraction of ``(text, label)`` pairs predicted correctly.

        An empty ``test`` list yields 0.0 rather than a ZeroDivisionError.
        """
        if not test:
            return 0.0
        correct = sum(1 for text, trueLabel in test
                      if self.predict(text) == trueLabel)
        return correct / len(test)

# Tiny hand-labelled training set of (text, label) pairs.
train = [
    ("I love this movie, it was fantastic and amazing", "positive"),
    ("The film was dull and boring", "negative"),
    ("What a wonderful experience, I enjoyed every moment", "positive"),
    ("This was the worst movie I've ever seen", "negative"),
    ("I felt so happy and thrilled", "positive"),
    ("Absolutely terrible, would not recommend it", "negative"),
]

# Held-out examples used to measure accuracy.
test = [
    ("I loved the film", "positive"),
    ("It was boring and I didn’t like it", "negative"),
    ("What an amazing movie", "positive"),
    ("This was not a good experience", "negative"),
]

# Compare several smoothing constants on the same train/test split.
kValues = [0.25, 0.75, 1]
for k in kValues:
    classifier = SentimentClassifier(k=k)
    classifier.train(train)
    # Compute the accuracy before reporting it. Printing it first raised a
    # NameError on the first iteration and afterwards showed the value left
    # over from the previous k.
    accuracy = classifier.evaluate(test)
    print(f"Accuracy with k = {k}: {accuracy:.2f}")
    for text, trueLabel in test:
        print(f"Text: {text}, Predicted Label: {classifier.predict(text)}, True Label: {trueLabel}")
    print(' ')


Accuracy with k = 0.25: 0.50
Text: I loved the film, Predicted Label: negative, True Label: positive
Text: It was boring and I didn’t like it, Predicted Label: negative, True Label: negative
Text: What an amazing movie, Predicted Label: positive, True Label: positive
Text: This was not a good experience, Predicted Label: positive, True Label: negative
 
Accuracy with k = 0.75: 0.50
Text: I loved the film, Predicted Label: negative, True Label: positive
Text: It was boring and I didn’t like it, Predicted Label: negative, True Label: negative
Text: What an amazing movie, Predicted Label: positive, True Label: positive
Text: This was not a good experience, Predicted Label: positive, True Label: negative
 
Accuracy with k = 1: 0.50
Text: I loved the film, Predicted Label: negative, True Label: positive
Text: It was boring and I didn’t like it, Predicted Label: negative, True Label: negative
Text: What an amazing movie, Predicted Label: positive, True Label: positive
Text: This was not a go