# Spam Email Classification: Phase 1 Baseline

This notebook demonstrates the baseline spam classification pipeline using logistic regression. We'll walk through:

1. Loading and preprocessing the SMS spam dataset
2. Feature extraction using TF-IDF
3. Model training and evaluation
4. Result analysis and visualization

First, let's set up our environment and imports.

In [None]:
import json
import sys
from pathlib import Path

# Make the project's `src` package importable from this notebook.
# The project root is taken to be the parent of the current working directory,
# so this assumes the kernel was started in a folder directly under the root
# (e.g. a notebooks/ directory) — TODO confirm for other launch locations.
project_root = Path.cwd().parent
if sys.path.count(str(project_root)) == 0:
    # Only append once so re-running the cell does not grow sys.path
    sys.path.append(str(project_root))

# Core dependencies
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Our package
from src.spam_classifier.data import SpamDataset
from src.spam_classifier.features import TextFeaturizer
from src.spam_classifier.model import SpamClassifier

# Single source of truth for the random seed so later cells can reuse it
SEED = 42

# Seed NumPy's legacy global RNG.
# NOTE(review): this only affects code that draws from np.random's global
# state; whether SpamDataset / SpamClassifier rely on it (rather than their
# own random_state) is not visible from this notebook — confirm.
np.random.seed(SEED)

## Load and preprocess data

Let's load the SMS spam dataset and apply our preprocessing pipeline:

In [None]:
# Build the dataset wrapper and pull out the train / validation / test frames
dataset = SpamDataset()
train_df, val_df, test_df = dataset.load_split()

# Pair each split with the label used in the printed report
split_frames = {"Train": train_df, "Val": val_df, "Test": test_df}

# Report how many examples landed in each split
print("Dataset splits:")
for split_name, split_df in split_frames.items():
    print(f"{split_name}: {len(split_df)} examples")

# Report the relative frequency of each label value per split
print("\nClass distribution:")
for split_name, split_df in split_frames.items():
    print(f"{split_name}:", split_df["label"].value_counts(normalize=True))

## Feature extraction

Now we'll convert the text messages to TF-IDF features:

In [None]:
# Fit the featurizer on the training text only, then reuse the fitted
# featurizer to transform the validation and test text
featurizer = TextFeaturizer(max_features=10000)

X_train = featurizer.fit_transform(train_df["text"])
X_val, X_test = (featurizer.transform(frame["text"]) for frame in (val_df, test_df))

# Report the shape of each feature matrix
print("Feature matrices:")
for split_name, matrix in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"{split_name}: {matrix.shape}")

# Show a small sample of the vocabulary: the ten entries with the smallest
# mapped values. This is NOT a ranking by importance or weight.
# (presumably vocabulary_ maps term -> column index, as in scikit-learn's
# vectorizers — confirm against TextFeaturizer)
vocab = featurizer.vocabulary_
top_features = sorted(vocab.items(), key=lambda item: item[1])[:10]
print("\nSample features:", [word for word, _ in top_features])

## Train and evaluate model

Let's train our logistic regression model and evaluate it on all splits:

In [None]:
# Fit the classifier on the training features and labels.
# class_weight="balanced" is forwarded to SpamClassifier (presumably to
# offset the ham/spam imbalance — confirm in src.spam_classifier.model).
classifier = SpamClassifier(class_weight="balanced")
classifier.fit(X_train, train_df["label"])

# Score every split with the same fitted model, keyed by split name
eval_inputs = (
    ("train", X_train, train_df),
    ("val", X_val, val_df),
    ("test", X_test, test_df),
)
results = {
    name: classifier.evaluate(features, frame["label"])
    for name, features, frame in eval_inputs
}

# Print each metric to three decimals, split by split. The confusion matrix
# entry is skipped here (it is plotted as a heatmap in a later cell).
for split_name, split_metrics in results.items():
    print(f"\n{split_name.upper()} metrics:")
    for metric_name, metric_value in split_metrics.items():
        if metric_name == "confusion_matrix":
            continue
        print(f"{metric_name}: {metric_value:.3f}")

## Visualize confusion matrix

Let's plot the confusion matrix for the validation set, then save the metrics for all splits to `results/phase1/metrics.json`:

In [None]:
# Plot confusion matrix for validation set
cm = results["val"]["confusion_matrix"]

# Use an explicit figure/axes pair instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # cells are annotated as integer counts
    cmap="Blues",
    # NOTE(review): tick labels assume row/column 0 is ham and 1 is spam —
    # confirm against the label order used by SpamClassifier.evaluate
    xticklabels=["Ham", "Spam"],
    yticklabels=["Ham", "Spam"],
    ax=ax,
)
ax.set_title("Confusion Matrix (Validation Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

# Save results (path is relative to the current working directory)
results_dir = Path("results") / "phase1"
results_dir.mkdir(parents=True, exist_ok=True)
metrics_path = results_dir / "metrics.json"

# Convert numpy values to Python native types BEFORE opening the file, so a
# conversion error cannot leave a truncated/empty metrics.json behind.
# Arrays become nested lists; every other metric value is coerced to float.
clean_results = {
    split: {
        k: v.tolist() if isinstance(v, np.ndarray) else float(v)
        for k, v in metrics.items()
    }
    for split, metrics in results.items()
}

with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump(clean_results, f, indent=2)

print(f"\nResults saved to {metrics_path}")