In [17]:

import os
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [18]:

def load_emails_from_folder(folder_name):
    texts = []
    for fname in sorted(os.listdir(folder_name)):
        fpath = os.path.join(folder_name, fname)
        if os.path.isfile(fpath):
            with open(fpath, "r", encoding="latin-1", errors="ignore") as f:
                texts.append(f.read())
    return texts

ham_texts = load_emails_from_folder("easy_ham")
spam_texts = load_emails_from_folder("spam")

X = ham_texts + spam_texts
y = [0] * len(ham_texts) + [1] * len(spam_texts)  # 0 = ham, 1 = spam

print("Ham emails:", len(ham_texts))
print("Spam emails:", len(spam_texts))
print("Total emails:", len(X))



Ham emails: 2501
Spam emails: 501
Total emails: 3002


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 2401
Test size: 601


In [20]:
def basic_email_cleaner(text, strip_headers=True, lowercase=True, replace_urls=True, replace_numbers=True):
    if strip_headers:
        parts = text.split("\n\n", 1)  # headers end at first blank line (usually)
        if len(parts) == 2:
            text = parts[1]

    if lowercase:
        text = text.lower()

    if replace_urls:
        text = re.sub(r"(http|https)://\S+|www\.\S+", " URL ", text)

    if replace_numbers:
        text = re.sub(r"\d+", " NUMBER ", text)

    # remove some punctuation (keep letters/numbers/space)
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [21]:
vectorizer = TfidfVectorizer(
    preprocessor=lambda t: basic_email_cleaner(
        t,
        strip_headers=True,
        lowercase=True,
        replace_urls=True,
        replace_numbers=True
    ),
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2
)

In [22]:
def evaluate_model(model, name):
    y_pred = model.predict(X_test)

    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall   :", recall_score(y_test, y_pred))
    print("F1       :", f1_score(y_test, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

In [23]:
#  Naive Bayes
nb_model = Pipeline([
    ("tfidf", vectorizer),
    ("clf", MultinomialNB())
])
nb_model.fit(X_train, y_train)
evaluate_model(nb_model, "MultinomialNB")

#  Logistic Regression
lr_model = Pipeline([
    ("tfidf", vectorizer),
    ("clf", LogisticRegression(max_iter=2000, random_state=42))
])
lr_model.fit(X_train, y_train)
evaluate_model(lr_model, "LogisticRegression")

#  SGD
sgd_model = Pipeline([
    ("tfidf", vectorizer),
    ("clf", SGDClassifier(loss="log_loss", random_state=42))
])
sgd_model.fit(X_train, y_train)
evaluate_model(sgd_model, "SGDClassifier (hinge)")

Accuracy : 0.9434276206322796
Precision: 1.0
Recall   : 0.66
F1       : 0.7951807228915663
Confusion matrix:
 [[501   0]
 [ 34  66]]


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


Accuracy : 0.9600665557404326
Precision: 1.0
Recall   : 0.76
F1       : 0.8636363636363636
Confusion matrix:
 [[501   0]
 [ 24  76]]
Accuracy : 0.9850249584026622
Precision: 1.0
Recall   : 0.91
F1       : 0.9528795811518325
Confusion matrix:
 [[501   0]
 [  9  91]]
