In [1]:
from collections import defaultdict
import math

In [2]:
def preprocess(text):
    """Lower-case ``text`` and split it into alphanumeric tokens without stopwords.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a single space before splitting, so punctuation acts as a token
    separator (``"rock-and-roll"`` yields ``rock`` / ``and`` / ``roll``
    before stopword removal).

    Args:
        text: The raw string to tokenise.

    Returns:
        A list of lower-cased tokens, in their original order, with the
        small built-in stopword set filtered out.
    """
    ignored = {'is', 'the', 'a', 'and', 'in', 'on', 'to', 'of'}

    # Blank out punctuation/symbols character by character.
    cleaned = []
    for ch in text.lower():
        keep = ch.isalnum() or ch.isspace()
        cleaned.append(ch if keep else ' ')

    words = ''.join(cleaned).split()
    return [word for word in words if word not in ignored]

In [3]:
def create_vocab(dataset):
    """Collect the distinct tokens that occur anywhere in ``dataset``.

    Args:
        dataset: An iterable of raw text strings; each one is tokenised
            with ``preprocess``.

    Returns:
        A list of unique tokens in ascending (sorted) order.
    """
    vocab = set()
    for text in dataset:
        vocab.update(preprocess(text))
    # Sort instead of returning list(set): set iteration order for strings
    # changes between interpreter runs (hash randomisation), which would make
    # the column index of every token differ from one kernel restart to the next.
    return sorted(vocab)

def text_to_vector(text, vocab):
    """Convert ``text`` into a bag-of-words count vector over ``vocab``.

    Args:
        text: The raw string to vectorise; tokenised with ``preprocess``.
        vocab: Sequence of vocabulary tokens. Position ``i`` of the result
            counts occurrences of ``vocab[i]``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that are not in
        ``vocab`` are ignored. If ``vocab`` contains a token more than once,
        its count is accumulated at the first occurrence only.
    """
    # Build the token -> column lookup once, rather than scanning the list
    # for every token. setdefault keeps the first index of a duplicated entry.
    column_of = {}
    for column, word in enumerate(vocab):
        column_of.setdefault(word, column)

    vector = [0] * len(vocab)
    for token in preprocess(text):
        column = column_of.get(token)
        if column is not None:
            vector[column] += 1
    return vector

In [4]:
class NaiveBayes:
    """Multinomial Naive Bayes text classifier with add-one smoothing.

    Attributes are populated by ``train``:
        class_probs: maps each class label to its prior probability.
        word_probs: maps each class label to a dict of
            ``token -> smoothed P(token | class)`` over the training vocabulary.
    """

    def __init__(self):
        # Empty until train() replaces them with plain dicts.
        self.class_probs = defaultdict(float)
        self.word_probs = defaultdict(lambda: defaultdict(float))

    def train(self, texts, labels):
        """Estimate class priors and per-class token likelihoods.

        Args:
            texts: Iterable of raw documents; each is tokenised with
                ``preprocess``.
            labels: Iterable of class labels paired positionally with
                ``texts``. If the two differ in length, only the pairs
                produced by ``zip`` are used.

        Any previously learned parameters are overwritten.
        """
        class_counts = defaultdict(int)
        word_counts = defaultdict(lambda: defaultdict(int))
        vocab = set()

        for text, label in zip(texts, labels):
            class_counts[label] += 1
            for token in preprocess(text):
                vocab.add(token)
                word_counts[label][token] += 1

        # Count the documents actually consumed by zip() so the priors sum
        # to 1 even when texts and labels have different lengths.
        total_docs = sum(class_counts.values())
        vocab_size = len(vocab)

        self.class_probs = {
            cls: count / total_docs for cls, count in class_counts.items()
        }

        word_probs = {}
        for cls in class_counts:
            counts = word_counts[cls]
            # The smoothing denominator is the same for every word of a
            # class, so compute it once per class.
            denominator = sum(counts.values()) + vocab_size
            word_probs[cls] = {
                word: (counts.get(word, 0) + 1) / denominator for word in vocab
            }
        self.word_probs = word_probs

    def predict(self, text):
        """Return the most probable class label for ``text``.

        Scores are sums of log-probabilities. Tokens that were not seen
        during training are skipped. Ties go to the class encountered
        first during training.

        Raises:
            ValueError: if the model has no classes (``train`` was not
                called, or was called with no data).
        """
        if not self.class_probs:
            raise ValueError(
                "NaiveBayes.predict() requires a trained model; call train() with data first"
            )

        tokens = preprocess(text)
        class_scores = {}

        for cls, prior in self.class_probs.items():
            likelihoods = self.word_probs[cls]
            log_prob = math.log(prior)
            for token in tokens:
                if token in likelihoods:
                    log_prob += math.log(likelihoods[token])
            class_scores[cls] = log_prob

        return max(class_scores, key=class_scores.get)

In [5]:
def evaluate(predictions, labels):
    """Compute classification accuracy.

    Args:
        predictions: Iterable of predicted labels.
        labels: Sequence of true labels, paired positionally with
            ``predictions``.

    Returns:
        The fraction of ``labels`` whose paired prediction is equal to it,
        as a float. Returns ``0.0`` when ``labels`` is empty. Pairs are
        formed with ``zip``, so labels without a matching prediction count
        as incorrect.
    """
    total = len(labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for p, l in zip(predictions, labels) if p == l)
    return correct / total

In [6]:
# Tiny labelled corpus used to exercise the pipeline end to end.
dataset = [
    "I love this product",
    "This is the worst movie",
    "Amazing!",
    "Terrible experience",
]
labels = [
    "positive",
    "negative",
    "positive",
    "negative",
]

# Bag-of-words view of the corpus. These vectors are not passed to the model
# below, which is trained on the raw strings.
vocab = create_vocab(dataset)
vectors = [text_to_vector(document, vocab) for document in dataset]

# Fit the classifier on the raw texts and classify one unseen sentence.
model = NaiveBayes()
model.train(dataset, labels)

test_text = "I hate this"
print(f"Sentiment: {model.predict(test_text)}")

Sentiment: positive
