In [1]:
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

In [2]:
def load_emails_from_dir(directory, label):
    emails = []
    labels = []

    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            with open(path, "r", encoding="latin1") as f:
                emails.append(f.read())
                labels.append(label)

    return emails, labels


def clean_email(text,
                lowercase=True,
                remove_punctuation=True,
                replace_urls=True,
                replace_numbers=True):

    if lowercase:
        text = text.lower()

    if replace_urls:
        text = re.sub(r"http\S+|www\S+", " URL ", text)

    if replace_numbers:
        text = re.sub(r"\d+", " NUMBER ", text)

    if remove_punctuation:
        text = re.sub(r"[^\w\s]", " ", text)

    return text

In [3]:
spam_emails, spam_labels = load_emails_from_dir("spam", 1)
ham_emails, ham_labels = load_emails_from_dir("ham", 0)

X = spam_emails + ham_emails
y = spam_labels + ham_labels

print("Total emails:", len(X))
print("Spam:", len(spam_emails))
print("Ham:", len(ham_emails))

FileNotFoundError: [Errno 2] No such file or directory: 'spam'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
vectorizer = CountVectorizer(
    preprocessor=clean_email,
    binary=True,
    stop_words="english"
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print("Feature matrix shape:", X_train_vec.shape)



In [None]:
nb_clf = MultinomialNB()
lr_clf = LogisticRegression(max_iter=1000)
svm_clf = LinearSVC()

nb_clf.fit(X_train_vec, y_train)
lr_clf.fit(X_train_vec, y_train)
svm_clf.fit(X_train_vec, y_train)

In [None]:
print("\nNaive Bayes")
print(classification_report(y_test, nb_clf.predict(X_test_vec)))

print("\nLogistic Regression")
print(classification_report(y_test, lr_clf.predict(X_test_vec)))

print("\nLinear SVM")
print(classification_report(y_test, svm_clf.predict(X_test_vec)))

## Summary of Findings

We built a spam classifier using the Apache SpamAssassin dataset.
Emails were converted into binary bag-of-words feature vectors.

- Naive Bayes achieved high recall, detecting most spam messages.
- Logistic Regression provided balanced precision and recall.
- Linear SVM achieved the highest precision, reducing false positives.

Overall, Logistic Regression and Naive Bayes performed best depending on
whether recall or precision is prioritized.