# Naive Bayes from Scratch

## Dataset

In [None]:
# Toy labelled corpus: each entry is a (message text, label) pair, where the
# label is either "spam" or "not_spam". There are four examples per class.
dataset = [
    ("Buy cheap meds now", "spam"),
    ("Limited offer only today", "spam"),
    ("Meeting at 10 am", "not_spam"),
    ("Let's have lunch", "not_spam"),
    ("Cheap pills available", "spam"),
    ("Call me tonight", "not_spam"),
    ("Win cash prizes now", "spam"),
    ("Are we still meeting?", "not_spam")
]


## Naive Bayes

In [None]:
import re
from collections import defaultdict
import math

def tokenize(text):
    """Split *text* into lowercase word tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore). Everything else, including apostrophes, acts as a
    separator, so "Let's" yields the two tokens "let" and "s".
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

class NaiveBayes:
    """Multinomial Naive Bayes text classifier with Laplace (add-one) smoothing.

    Attributes:
        word_counts: label -> word -> occurrences of that word in the
            training texts carrying that label.
        class_counts: label -> number of training documents with that label.
        vocab: every distinct token seen during training.
        total_docs: number of training documents seen so far.
    """

    def __init__(self):
        self.word_counts = defaultdict(lambda: defaultdict(int))
        self.class_counts = defaultdict(int)
        self.vocab = set()
        # Defined up front so the attribute exists before train() is called.
        self.total_docs = 0

    def train(self, dataset):
        """Accumulate counts from an iterable of (text, label) pairs.

        May be called more than once; counts from successive calls add up.
        """
        for text, label in dataset:
            self.class_counts[label] += 1
            for word in tokenize(text):
                self.word_counts[label][word] += 1
                self.vocab.add(word)
        self.total_docs = sum(self.class_counts.values())

    def predict(self, text):
        """Return the label with the highest log-posterior score for *text*.

        Each label's score is log(prior) plus, for every token in *text*,
        log((count + 1) / (words_in_label + vocab_size)). When scores tie,
        the label that was seen first during training wins.

        Raises:
            ValueError: if the model has not been trained on any document.
        """
        if not self.class_counts:
            raise ValueError(
                "NaiveBayes.predict() called before train(): no classes known"
            )

        words = tokenize(text)
        vocab_size = len(self.vocab)
        scores = {}

        for label, doc_count in self.class_counts.items():
            log_prob = math.log(doc_count / self.total_docs)
            # Read-only lookups: indexing the defaultdicts directly would
            # insert a zero-count entry for every unseen word (or label),
            # silently mutating the trained model on each prediction.
            label_counts = self.word_counts.get(label, {})
            denominator = sum(label_counts.values()) + vocab_size

            for word in words:
                smoothed = label_counts.get(word, 0) + 1  # Laplace smoothing
                log_prob += math.log(smoothed / denominator)

            scores[label] = log_prob

        return max(scores, key=scores.get)


In [None]:
# Fit the classifier on the toy corpus defined in the dataset cell.
model = NaiveBayes()
model.train(dataset)

# Messages that do not appear in the training data, used as a quick sanity check.
test_samples = [
    "cheap cash offer",
    "let's meet at night",
    "win big prizes",
    "are you free today"
]

# Print each sample next to the label the model assigns to it.
for sample in test_samples:
    result = model.predict(sample)
    print(f"'{sample}' → {result}")


'cheap cash offer' → spam
'let's meet at night' → not_spam
'win big prizes' → spam
'are you free today' → spam


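Note that "are you free today" is classified as spam, which is clearly wrong, lol. The only words in it that the model has seen are "are" (once, in a not_spam message) and "today" (once, in a spam message), so the two classes score almost identically and the result is essentially arbitrary. Next time we should use a bigger dataset :).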