In [None]:
from collections import defaultdict
import math

class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier for "spam" / "ham" emails.

    Training emails are bags of words given as ``{word: count}`` dicts.
    Word likelihoods use add-one (Laplace) smoothing over the vocabulary
    seen during training, and scoring is done in log space.
    """

    # Fixed label order. It also decides ties in ``predict``: the first
    # label with the maximal score wins.
    LABELS = ("spam", "ham")

    # Add-one (Laplace) smoothing constant.
    _ALPHA = 1

    def __init__(self):
        # Per-label word frequencies accumulated by ``train``.
        self.word_counts = {label: defaultdict(int) for label in self.LABELS}
        # Number of training emails seen for each label.
        self.class_counts = {label: 0 for label in self.LABELS}
        # Every distinct word seen in any training email.
        self.vocab = set()

    def train(self, dataset):
        """Accumulate counts from an iterable of ``(email, label)`` pairs.

        ``email`` must be a ``{word: count}`` mapping and ``label`` one of
        ``LABELS``; an unknown label raises ``KeyError``. Calling ``train``
        again adds to the counts already collected.
        """
        for email, label in dataset:
            self.class_counts[label] += 1
            for word, count in email.items():
                self.word_counts[label][word] += count
                self.vocab.add(word)

    def calculate_prior(self):
        """Return ``{label: P(label)}`` estimated from the training emails.

        Raises:
            ValueError: if no training email has been seen yet, because the
                priors are undefined in that case.
        """
        total_emails = sum(self.class_counts.values())
        if total_emails == 0:
            raise ValueError(
                "cannot compute priors: the classifier has not been trained"
            )
        return {
            label: self.class_counts[label] / total_emails
            for label in self.LABELS
        }

    def _smoothed_denominator(self, label):
        """Return the smoothed total word count for ``label``."""
        return (
            sum(self.word_counts[label].values())
            + self._ALPHA * len(self.vocab)
        )

    def calculate_likelihood(self, word, label):
        """Return the Laplace-smoothed ``P(word | label)``.

        Raises ``ZeroDivisionError`` if the classifier has seen no words at
        all (empty vocabulary and empty counts).
        """
        # ``.get`` rather than indexing: indexing a defaultdict would insert
        # a zero entry and silently mutate the model on a read-only query.
        numerator = self.word_counts[label].get(word, 0) + self._ALPHA
        return numerator / self._smoothed_denominator(label)

    def predict(self, email):
        """Return the most probable label for ``email``.

        ``email`` may be a ``{word: count}`` dict, in which case each word
        contributes ``count`` times its log-likelihood, or any other
        iterable of words, where each occurrence counts once. Words that
        were never seen in training are ignored.

        Raises:
            ValueError: if the classifier has not been trained.
        """
        priors = self.calculate_prior()
        # A label that never occurred in training has prior 0; log(0) is
        # undefined, so give it a score of -inf instead of crashing.
        scores = {
            label: math.log(prior) if prior > 0 else float("-inf")
            for label, prior in priors.items()
        }

        if isinstance(email, dict):
            occurrences = email.items()
        else:
            occurrences = ((word, 1) for word in email)

        # The denominators do not depend on the word, so compute them once
        # per prediction rather than once per (word, label) pair.
        denominators = {
            label: self._smoothed_denominator(label) for label in self.LABELS
        }

        for word, count in occurrences:
            if word not in self.vocab:
                continue
            for label in self.LABELS:
                numerator = self.word_counts[label].get(word, 0) + self._ALPHA
                scores[label] += count * math.log(
                    numerator / denominators[label]
                )

        return max(scores, key=scores.get)

    def evaluate(self, test_data):
        """Return the fraction of ``(email, label)`` pairs predicted correctly.

        ``test_data`` may be any iterable, including a generator. An empty
        iterable yields ``0.0`` instead of raising ``ZeroDivisionError``.
        """
        correct = 0
        total = 0
        for email, label in test_data:
            total += 1
            if self.predict(email) == label:
                correct += 1
        return correct / total if total else 0.0

# Sample corpus as (label, message text) pairs; every word occurs once per message
labelled_messages = [
    ("spam", "free money now"),
    ("spam", "limited offer money"),
    ("ham", "hello how are you"),
    ("spam", "congratulations you win"),
    ("ham", "meet me for lunch"),
    ("spam", "free offer money"),
    ("ham", "hello dear friend"),
    ("spam", "money limited free"),
    ("ham", "how are you today"),
]

# Turn each message into the ({word: 1, ...}, label) shape the classifier expects
dataset = [
    (dict.fromkeys(text.split(), 1), label)
    for label, text in labelled_messages
]

# Hold-out split: the first 4 emails train the model, the remaining 5 test it
split_at = 4
train_data, test_data = dataset[:split_at], dataset[split_at:]

# Fit the model on the training portion
nb_classifier = NaiveBayesClassifier()
nb_classifier.train(train_data)

# Report accuracy on the held-out emails
accuracy = nb_classifier.evaluate(test_data)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Classify a previously unseen email
new_email = dict.fromkeys(("free", "money", "lunch"), 1)
prediction = nb_classifier.predict(new_email)
print(f"The email '{new_email}' is classified as: {prediction}")

# I found that accuracy can easily be increased by raising the training portion of the dataset above 50%.

Model Accuracy: 60.00%
The email '{'free': 1, 'money': 1, 'lunch': 1}' is classified as: spam
