# Email Spam Detection Pipeline

This notebook demonstrates how a **single Python package** (`src/`) can
handle:

* configuration via YAML
* data loading & merging
* pipeline construction + hyper‑parameter search
* evaluation and model persistence

The core script is `run.py` – the notebook simply calls the same helper
functions, but with added visualisation.  The code structure follows:


```
src
├── __init__.py
├── config.py         # load_experiment_config & helpers
├── data_loader.py    # download_and_merge / split
├── model_builder.py  # build_pipeline + build_grid_search
└── trainer.py        # fit_and_evaluate + save_model
```


The experiments live in `experiments/<run_XX>/` – each run has its own folder with a config file and results.

In [29]:
# ---- Imports --------------------------------------------------------------
import json
from pathlib import Path
from glob import glob
from typing import Dict, Any, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Notebook-wide pandas display settings, applied in one pass.
_display_options = {
    "display.max_colwidth": None,        # do not truncate long cell contents
    "display.max_seq_items": None,       # show every element of a list/tuple
    "display.expand_frame_repr": False,  # do not wrap wide frames across several blocks
}
for _option_name, _option_value in _display_options.items():
    pd.set_option(_option_name, _option_value)

In [42]:
from src.utils import load_json

# Build one summary row per experiment that has a `final/` sub-directory, then
# derive the classifier name and the validation/train ratios for each run.

# Fixed column layout: a run that lacks training or inference metrics gets NaN
# in those cells instead of triggering a KeyError when the ratios are computed.
summary_columns = [
    "experiment",
    "train_sps",
    "model_size_mb",
    "train_accuracy",
    "train_weighted_f1",
    "per_sample_ms",
    "val_accuracy",
    "val_weighted_f1",
]

final_dirs = Path("experiments/").glob("*/final")
rows: List[Dict[str, Any]] = []
for final_dir in final_dirs:
    name = final_dir.parent.name
    # Accept either file name for the training metrics.
    # NOTE(review): relies on load_json returning a falsy value when the file
    # is absent - confirm against src.utils.
    training_metrics = load_json(final_dir / "train_metrics.json") or load_json(final_dir / "training_metrics.json")
    inference_metrics = load_json(final_dir / "inference_metrics.json")

    row: Dict[str, Any] = {"experiment": name}

    if training_metrics:
        row["train_sps"] = training_metrics.get("time_per_sample_sec")
        row["model_size_mb"] = training_metrics.get("model_file_size_mb")

        # `or {}` keeps a missing/null report from raising AttributeError below.
        report = training_metrics.get("overfit_classification_report") or {}
        row["train_accuracy"] = report.get("accuracy", np.nan)
        row["train_weighted_f1"] = report.get("weighted avg", {}).get("f1-score", np.nan)

    if inference_metrics:
        row["per_sample_ms"] = inference_metrics.get("per_sample_median_ms", np.nan)

        report = inference_metrics.get("classification_report") or {}
        row["val_accuracy"] = report.get("accuracy", np.nan)
        row["val_weighted_f1"] = report.get("weighted avg", {}).get("f1-score", np.nan)

    rows.append(row)


results_df = pd.DataFrame(rows, columns=summary_columns)
# Classifier tag is the second "_"-separated token of the experiment name;
# names without an underscore yield NaN rather than an IndexError.
results_df["clf"] = results_df["experiment"].str.split("_").str[1]
# Despite their names these two columns are validation/train ratios
# (1.0 means validation matches the training-set score), not raw scores.
results_df["f1_score"] = results_df["val_weighted_f1"] / results_df["train_weighted_f1"]
results_df["acc_score"] = results_df["val_accuracy"] / results_df["train_accuracy"]
results_df.style.background_gradient(cmap="viridis").set_properties(**{'font-size': '10px'})

Unnamed: 0,experiment,train_sps,model_size_mb,train_accuracy,train_weighted_f1,per_sample_ms,val_accuracy,val_weighted_f1,clf,f1_score,acc_score
0,minisbert_LogReg,0.008305,87.163617,0.991022,0.990979,21.7177,0.764,0.765298,LogReg,0.772264,0.770921
1,SBERT_LogReg,0.09188,418.211255,0.999291,0.999291,157.7399,0.764,0.765191,LogReg,0.765734,0.764542
2,sbert_SVC,0.009074,87.164929,0.991258,0.991219,21.24165,0.759,0.760379,SVC,0.767115,0.765694
3,TFIDF_NB,0.000227,5.247895,0.997637,0.99763,1.2548,0.856,0.8583,NB,0.860339,0.858027
4,TFIDF_SVC,0.00049,19.200761,1.0,1.0,0.55225,0.796,0.793175,SVC,0.793175,0.796


In [40]:
import plotly.express as px

# Scatter of the `per_sample_ms` column (log-scaled x axis) against the
# `f1_score` column of `results_df`, one point per experiment, coloured by
# classifier.
fig = px.scatter(
    results_df,
    x="per_sample_ms",
    y="f1_score",
    color="clf",
    hover_data=["experiment"],  # show the experiment name on hover
    # Human-readable axis/legend text so the figure stands on its own.
    labels={
        "per_sample_ms": "Per-sample inference time (ms, log scale)",
        "f1_score": "Validation / train weighted-F1 ratio",
        "clf": "Classifier",
    },
    title="Inference latency vs. validation/train weighted-F1 ratio",
)
fig.update_xaxes(type="log")
fig.update_layout(width=800, height=600)
fig.show()