# Email Spam Detection Pipeline

This notebook demonstrates how a **single Python package** (`src/`) can
handle:

* configuration via YAML
* data loading & merging
* pipeline construction + hyper‑parameter search
* evaluation and model persistence

The core script is `run.py` – the notebook simply calls the same helper
functions, but with added visualisation.  The code structure follows:


```text
src
├── __init__.py
├── config.py          # load_experiment_config & helpers
├── data_loader.py     # download_and_merge / split
├── model_builder.py   # build_pipeline + build_grid_search
└── trainer.py         # fit_and_evaluate + save_model
```


The experiments live in `experiments/<run_XX>/` – each run has its own folder with a config file and results.

In [7]:
# ---- Imports --------------------------------------------------------------
import json
from pathlib import Path
from glob import glob
from typing import Dict, Any, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Pandas display settings for the printed summary tables: do not truncate
# cell text or long sequences, and keep wide frames on one line instead of
# wrapping their columns.
for option_name, option_value in {
    "display.max_colwidth": None,
    "display.max_seq_items": None,
    "display.expand_frame_repr": False,
}.items():
    pd.set_option(option_name, option_value)

In [15]:
from src.utils import load_json

# Columns of the summary table, in display order. Fixing them up front keeps
# the frame's schema stable even when no experiment provides a given metric
# (or when no experiment is found at all), so downstream cells can always
# reference these columns.
SUMMARY_COLUMNS: List[str] = [
    "experiment",
    "train_total_sec",
    "train_per_sample_sec",
    "n_train_samples",
    "model_size_mb",
    "auc",
    "per_sample_ms",
    "accuracy",
    "weighted_f1",
    "macro_f1",
]


def classifier_from_name(experiment: str) -> str:
    """Return the classifier tag encoded in an experiment folder name.

    The tag is the second ``_``-separated token (``"TFIDF_NB"`` -> ``"NB"``).
    Names without an underscore are returned unchanged instead of raising
    ``IndexError``.
    """
    parts = experiment.split("_")
    return parts[1] if len(parts) > 1 else experiment


def summarize_experiment(final_dir: Path) -> Dict[str, Any]:
    """Collect the metrics stored under one ``<experiment>/final`` directory.

    Reads ``train_metrics.json`` (falling back to ``training_metrics.json``)
    and ``inference_metrics.json``. Metrics that are absent are left out of
    the row, or set to ``None``, so they show up as NaN in the summary rather
    than as a made-up 0 / 0.5 that could be mistaken for a measured value.

    NOTE(review): assumes ``load_json`` returns a falsy value (e.g. ``None``)
    for a missing file rather than raising -- confirm against ``src.utils``.

    Args:
        final_dir: Path to the ``final`` directory of a single experiment.

    Returns:
        A flat dict with the experiment name plus whichever metrics were found.
    """
    row: Dict[str, Any] = {"experiment": final_dir.parent.name}

    training_metrics = (
        load_json(final_dir / "train_metrics.json")
        or load_json(final_dir / "training_metrics.json")
    )
    if training_metrics:
        row["train_total_sec"] = training_metrics.get("total_training_time_sec")
        row["train_per_sample_sec"] = training_metrics.get("time_per_sample_sec")
        row["n_train_samples"] = training_metrics.get("num_samples")
        row["model_size_mb"] = training_metrics.get("model_file_size_mb")

    inference_metrics = load_json(final_dir / "inference_metrics.json")
    if inference_metrics:
        row["auc"] = inference_metrics.get("auc")
        row["per_sample_ms"] = inference_metrics.get("per_sample_median_ms")

        # `or {}` also covers keys that are present but null in the JSON.
        report = inference_metrics.get("classification_report") or {}
        row["accuracy"] = report.get("accuracy")
        row["weighted_f1"] = (report.get("weighted avg") or {}).get("f1-score")
        row["macro_f1"] = (report.get("macro avg") or {}).get("f1-score")

    return row


# Sort case-insensitively by experiment name so the row order is reproducible
# and does not depend on the filesystem's directory listing order.
final_dirs = sorted(
    Path("experiments/").glob("*/final"),
    key=lambda final_dir: final_dir.parent.name.lower(),
)
rows: List[Dict[str, Any]] = [summarize_experiment(final_dir) for final_dir in final_dirs]

results_df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS)
results_df["clf"] = results_df["experiment"].apply(classifier_from_name)
print(results_df)

         experiment  train_total_sec  train_per_sample_sec  n_train_samples  model_size_mb       auc  per_sample_ms  accuracy  weighted_f1  macro_f1     clf
0  minisbert_LogReg        69.386425              0.008197             8465      87.163617  0.827918       21.71770     0.764     0.765298  0.750080  LogReg
1      SBERT_LogReg       773.808745              0.091413             8465     418.211255  0.838811      157.73990     0.764     0.765191  0.749829  LogReg
2         sbert_SVC        77.352576              0.009138             8465      87.164929       NaN       21.24165     0.759     0.760379  0.744913     SVC
3          TFIDF_NB         1.926714              0.000228             8465       5.247895  0.923971        1.25480     0.856     0.858300  0.852119      NB
4         TFIDF_SVC         3.876536              0.000458             8465      19.200761       NaN        0.55225     0.796     0.793175  0.775824     SVC


In [25]:
import plotly.express as px

# Speed/quality trade-off: one point per experiment, x = median per-sample
# inference time, y = weighted F1, coloured by classifier.
fig = px.scatter(
    results_df,
    x='per_sample_ms',
    y='weighted_f1',
    color='clf',
    hover_data=['experiment'],  # show the experiment name on hover
    title='Inference latency vs weighted F1',
    # Keys must match the plotted column names, otherwise plotly ignores them.
    labels={
        'per_sample_ms': 'Median inference time per sample (ms)',
        'weighted_f1': 'Weighted F1',
        'clf': 'Classifier',
    },
)
# Latency spans orders of magnitude across models, so use a log x-axis.
# The y-axis stays linear: F1 is bounded in [0, 1] and a log axis cannot
# display a score of 0.
fig.update_xaxes(type="log")
fig.update_layout(width=800, height=600)
fig.show()