In [1]:
# Aggregation Notebook: Convert per-run JSON results into CSV summaries

import os, glob, json
import pandas as pd
import numpy as np

# Root folder that holds the per-run results; CSV summaries go in a "csv" subfolder.
OUT_DIR = "./outputs"   # change if needed
CSV_DIR = os.path.join(OUT_DIR, "csv")
os.makedirs(CSV_DIR, exist_ok=True)

# Collect every per-run results file in a stable (sorted) order.
results_pattern = os.path.join(OUT_DIR, "*_results.json")
json_files = glob.glob(results_pattern)
json_files.sort()
print(f"Found {len(json_files)} results files.")

# Stop early: nothing downstream can work without at least one results file.
if not json_files:
    raise FileNotFoundError("No *_results.json files found in OUT_DIR. Run training notebook first.")


Found 12 results files.


In [2]:
# Load all JSON files
# Each results file is parsed into one object; `runs` follows the order of json_files.
runs = []
for fp in json_files:
    # Pin the encoding: without it open() falls back to the platform default
    # (e.g. cp1252 on Windows), which can mis-decode non-ASCII JSON text.
    with open(fp, "r", encoding="utf-8") as f:
        runs.append(json.load(f))
print(f"Loaded {len(runs)} runs.")


Loaded 12 runs.


In [3]:
# Build Baseline Summary (one row per dataset–model–seed)
# Config keys that may be absent from a run; they fall back to None.
optional_cfg_keys = ("input_size", "lr", "weight_decay", "momentum")

baseline_rows = []
for run in runs:
    cfg, base = run["config"], run["baseline"]
    record = {
        "dataset": cfg["dataset"],
        "model": cfg["model_name"],
        "seed": cfg.get("seed"),
        "epochs": cfg["epochs"],
        "batch_size": cfg["batch_size"],
    }
    for key in optional_cfg_keys:
        record[key] = cfg.get(key)
    record["test_acc"] = base["test_acc"]
    record["test_loss"] = base["test_loss"]
    record["checkpoint_path"] = run.get("checkpoint_path")
    baseline_rows.append(record)

# Sort so rows for the same dataset/model sit next to each other.
df_baseline = pd.DataFrame(baseline_rows)
df_baseline = df_baseline.sort_values(["dataset", "model", "seed"])
df_baseline.head()


Unnamed: 0,dataset,model,seed,epochs,batch_size,input_size,lr,weight_decay,momentum,test_acc,test_loss,checkpoint_path
3,cifar10,efficientnetv2_s,,50,64,224,0.05,0.0001,0.9,0.5985,1.907442,./outputs\cifar10_efficientnetv2_s_e50_bs64.pth
4,cifar10,mobilenetv3_small,,50,64,224,0.05,0.0001,0.9,0.6515,0.989959,./outputs\cifar10_mobilenetv3_small_e50_bs64.pth
5,cifar10,shufflenetv2_0_5,,50,64,224,0.05,0.0001,0.9,0.7211,0.862438,./outputs\cifar10_shufflenetv2_0_5_e50_bs64.pth
0,cifar100,efficientnetv2_s,,50,64,224,0.05,0.0001,0.9,0.3746,5.72305,./outputs\cifar100_efficientnetv2_s_e50_bs64.pth
1,cifar100,mobilenetv3_small,,50,64,224,0.05,0.0001,0.9,0.4609,2.234729,./outputs\cifar100_mobilenetv3_small_e50_bs64.pth


In [4]:
# Build Uncertainty Metrics Summary (one row per dataset–model–seed–method)
# Metrics copied from each method's entry; a missing metric becomes NaN.
metric_names = ("AUROC_error", "ECE", "ARC_area", "AvUC")

metric_rows = []
for run in runs:
    cfg = run["config"]
    per_method = run["uncertainty_metrics"]
    for method, vals in per_method.items():
        record = {
            "dataset": cfg["dataset"],
            "model": cfg["model_name"],
            "seed": cfg.get("seed"),
            "method": method,
        }
        for name in metric_names:
            record[name] = vals.get(name, np.nan)
        metric_rows.append(record)

df_uncert = pd.DataFrame(metric_rows)
df_uncert = df_uncert.sort_values(["dataset", "model", "seed", "method"])
df_uncert.head()


Unnamed: 0,dataset,model,seed,method,AUROC_error,ECE,ARC_area,AvUC
9,cifar10,efficientnetv2_s,,entropy,0.732709,0.020445,0.748996,0.411446
10,cifar10,efficientnetv2_s,,gradient,0.430106,0.020445,0.540664,0.399671
11,cifar10,efficientnetv2_s,,hybrid,0.74558,0.020445,0.78357,0.405559
12,cifar10,mobilenetv3_small,,entropy,0.775363,0.024108,0.834443,0.367732
13,cifar10,mobilenetv3_small,,gradient,0.410395,0.024108,0.584244,0.35733


In [5]:
# Build Percentile Rejection Table (one row per dataset–model–seed–method–reject_percent)
# Fields every rejection entry must carry; kept_count is optional and defaults to None.
required_fields = (
    "method",
    "reject_percent",
    "rejection_rate",
    "accuracy_before_rejection",
    "accuracy_after_rejection",
)

rej_rows = []
for run in runs:
    cfg = run["config"]
    for entry in run["percentile_rejection"]:
        record = {
            "dataset": cfg["dataset"],
            "model": cfg["model_name"],
            "seed": cfg.get("seed"),
        }
        for field in required_fields:
            record[field] = entry[field]
        record["kept_count"] = entry.get("kept_count")
        rej_rows.append(record)

df_reject = pd.DataFrame(rej_rows)
df_reject = df_reject.sort_values(["dataset", "model", "seed", "method", "reject_percent"])
df_reject.head()


Unnamed: 0,dataset,model,seed,method,reject_percent,rejection_rate,accuracy_before_rejection,accuracy_after_rejection,kept_count
45,cifar10,efficientnetv2_s,,entropy,10,0.1,0.5985,0.631222,9000
46,cifar10,efficientnetv2_s,,entropy,20,0.2,0.5985,0.662625,8000
47,cifar10,efficientnetv2_s,,entropy,30,0.3,0.5985,0.698571,7000
48,cifar10,efficientnetv2_s,,entropy,40,0.4,0.5985,0.732,6000
49,cifar10,efficientnetv2_s,,entropy,50,0.5,0.5985,0.7676,5000


In [6]:
# Pivot uncertainty metrics (no seed column): one row per dataset–model,
# one column per metric/method combination.
metric_cols = ["AUROC_error", "ECE", "ARC_area", "AvUC"]

# Average any repeated (dataset, model, method) rows (e.g. several seeds) instead of
# silently keeping only the first one. With a single row per group the mean is
# that row's value, so single-run results are unchanged.
df_pivot = (
    df_uncert
    .groupby(["dataset", "model", "method"], as_index=False)[metric_cols]
    .mean()
    .pivot(index=["dataset", "model"],
           columns="method",
           values=metric_cols
        )
    .reset_index()
)

# Flatten columns: (metric, method) -> "metric_method"; the key columns carry an
# empty second level and keep their plain name. The parentheses make the tuple
# check guard the indexing, rather than being evaluated only after col[1].
df_pivot.columns = [
    (col[0] if col[1] == "" else f"{col[0]}_{col[1]}")
    if isinstance(col, tuple) else col
    for col in df_pivot.columns
]

print("Pivot columns after flattening:")
print(df_pivot.columns)

# Merge (without seed): every baseline row receives the metrics of its dataset/model.
df_wide = df_baseline.merge(
    df_pivot,
    on=["dataset","model"],
    how="left"
)

print("Final wide shape:", df_wide.shape)
df_wide.head()

Pivot columns after flattening:
Index(['dataset', 'model', 'AUROC_error_entropy', 'AUROC_error_gradient',
       'AUROC_error_hybrid', 'ECE_entropy', 'ECE_gradient', 'ECE_hybrid',
       'ARC_area_entropy', 'ARC_area_gradient', 'ARC_area_hybrid',
       'AvUC_entropy', 'AvUC_gradient', 'AvUC_hybrid'],
      dtype='str')
Final wide shape: (12, 24)


Unnamed: 0,dataset,model,seed,epochs,batch_size,input_size,lr,weight_decay,momentum,test_acc,...,AUROC_error_hybrid,ECE_entropy,ECE_gradient,ECE_hybrid,ARC_area_entropy,ARC_area_gradient,ARC_area_hybrid,AvUC_entropy,AvUC_gradient,AvUC_hybrid
0,cifar10,efficientnetv2_s,,50,64,224,0.05,0.0001,0.9,0.5985,...,0.74558,0.020445,0.020445,0.020445,0.748996,0.540664,0.78357,0.411446,0.399671,0.405559
1,cifar10,mobilenetv3_small,,50,64,224,0.05,0.0001,0.9,0.6515,...,0.776217,0.024108,0.024108,0.024108,0.834443,0.584244,0.836922,0.367732,0.35733,0.362531
2,cifar10,shufflenetv2_0_5,,50,64,224,0.05,0.0001,0.9,0.7211,...,0.805084,0.067836,0.067836,0.067836,0.887238,0.680867,0.891293,0.302412,0.306917,0.304664
3,cifar100,efficientnetv2_s,,50,64,224,0.05,0.0001,0.9,0.3746,...,0.787144,0.20472,0.20472,0.20472,0.582827,0.355537,0.612676,0.413415,0.621995,0.517705
4,cifar100,mobilenetv3_small,,50,64,224,0.05,0.0001,0.9,0.4609,...,0.77506,0.139864,0.139864,0.139864,0.707658,0.37941,0.67949,0.382528,0.542433,0.462481


In [7]:
# Save CSV files
# Destination paths, all inside CSV_DIR.
baseline_csv = os.path.join(CSV_DIR, "baseline_summary.csv")
uncert_csv = os.path.join(CSV_DIR, "uncertainty_metrics.csv")
reject_csv = os.path.join(CSV_DIR, "percentile_rejection.csv")
wide_csv = os.path.join(CSV_DIR, "thesis_wide_summary.csv")

# Pair each frame with its destination so writing and reporting share one list.
csv_exports = [
    (df_baseline, baseline_csv),
    (df_uncert, uncert_csv),
    (df_reject, reject_csv),
    (df_wide, wide_csv),
]

# The row index is not written to the files.
for frame, path in csv_exports:
    frame.to_csv(path, index=False)

print("Saved:")
for _, path in csv_exports:
    print(" -", path)

Saved:
 - ./outputs\csv\baseline_summary.csv
 - ./outputs\csv\uncertainty_metrics.csv
 - ./outputs\csv\percentile_rejection.csv
 - ./outputs\csv\thesis_wide_summary.csv


In [8]:
# Plots
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Where to save figures (read by save_fig). The folder is created here, up front,
# and exist_ok=True keeps re-running this cell harmless.
FIG_DIR = "./outputs/figures"
os.makedirs(FIG_DIR, exist_ok=True)

def save_fig(fig, filename_base):
    """Write `fig` to FIG_DIR as both PNG and PDF, then close it.

    Args:
        fig: Figure to save; tight_layout() is applied before saving.
        filename_base: File name without extension.

    Returns:
        Tuple (png_path, pdf_path) of the two files written.
    """
    png_path = os.path.join(FIG_DIR, f"{filename_base}.png")
    pdf_path = os.path.join(FIG_DIR, f"{filename_base}.pdf")
    shared_opts = {"bbox_inches": "tight"}

    fig.tight_layout()
    # dpi is passed for the PNG only; the PDF call uses savefig's default.
    fig.savefig(png_path, dpi=300, **shared_opts)
    fig.savefig(pdf_path, **shared_opts)

    # Close the figure once both files are written.
    plt.close(fig)
    return png_path, pdf_path

def _clean_method_labels(s):
    # Optional: make method names prettier
    return str(s).replace("entropy", "Entropy").replace("gradient", "Gradient").replace("hybrid", "Hybrid")

In [9]:
# Reload the saved summaries from disk, replacing the in-memory frames.
# NOTE(review): these paths are literal; if OUT_DIR is changed they need updating too.
df_baseline = pd.read_csv("./outputs/csv/baseline_summary.csv")
df_uncert = pd.read_csv("./outputs/csv/uncertainty_metrics.csv")
df_reject = pd.read_csv("./outputs/csv/percentile_rejection.csv")

# Normalize naming: prettify the method labels in both per-method tables.
for frame in (df_uncert, df_reject):
    frame["method"] = frame["method"].map(_clean_method_labels)

df_baseline.head(), df_uncert.head(), df_reject.head()

(    dataset              model  seed  epochs  batch_size  input_size    lr  \
 0   cifar10   efficientnetv2_s   NaN      50          64         224  0.05   
 1   cifar10  mobilenetv3_small   NaN      50          64         224  0.05   
 2   cifar10   shufflenetv2_0_5   NaN      50          64         224  0.05   
 3  cifar100   efficientnetv2_s   NaN      50          64         224  0.05   
 4  cifar100  mobilenetv3_small   NaN      50          64         224  0.05   
 
    weight_decay  momentum  test_acc  test_loss  \
 0        0.0001       0.9    0.5985   1.907442   
 1        0.0001       0.9    0.6515   0.989959   
 2        0.0001       0.9    0.7211   0.862438   
 3        0.0001       0.9    0.3746   5.723050   
 4        0.0001       0.9    0.4609   2.234729   
 
                                      checkpoint_path  
 0    ./outputs\cifar10_efficientnetv2_s_e50_bs64.pth  
 1   ./outputs\cifar10_mobilenetv3_small_e50_bs64.pth  
 2    ./outputs\cifar10_shufflenetv2_0_5_e50_bs6

In [10]:
def plot_avuc_bars(df_uncert):
    """Save one AvUC bar chart (PNG + PDF) per dataset/model pair.

    Rows sharing a method within a pair are averaged before plotting. If the
    frame has no "AvUC" column, a notice is printed and nothing is drawn.
    """
    metric = "AvUC"
    if metric not in df_uncert.columns:
        print("AvUC column not found in data. Skipping AvUC plots.")
        return

    pair_groups = df_uncert.groupby(["dataset", "model"], sort=False)
    for (dataset, model), group in pair_groups:
        per_method = group.groupby("method", as_index=False)[metric].mean()

        fig, ax = plt.subplots()
        ax.bar(per_method["method"], per_method[metric])
        ax.set_ylabel("AvUC (Lower is Better)")
        ax.set_title(f"AvUC by Method — {dataset} / {model}")
        ax.grid(axis="y", alpha=0.3)

        save_fig(fig, f"AvUC_{dataset}_{model}")

plot_avuc_bars(df_uncert)
print(f"Saved AvUC plots to: {FIG_DIR}")

Saved AvUC plots to: ./outputs/figures


In [11]:
def plot_auroc_bars(df_uncert):
    """Save one AUROC bar chart (PNG + PDF) per dataset/model pair.

    Reads the dataset, model, method and AUROC_error columns. Rows sharing a
    method within a pair are averaged, so a single run plots its own value.
    """
    pair_groups = df_uncert.groupby(["dataset", "model"], sort=False)
    for (dataset, model), group in pair_groups:
        per_method = group.groupby("method", as_index=False)["AUROC_error"].mean()

        fig, ax = plt.subplots()
        ax.bar(per_method["method"], per_method["AUROC_error"])
        # Fixed 0-1 axis so charts for different pairs share one scale.
        ax.set(
            ylim=(0.0, 1.0),
            ylabel="AUROC (Error Detection)",
            title=f"AUROC by Method — {dataset} / {model}",
        )
        ax.grid(axis="y", alpha=0.3)

        save_fig(fig, f"AUROC_{dataset}_{model}")

plot_auroc_bars(df_uncert)
print(f"Saved AUROC plots to: {FIG_DIR}")

Saved AUROC plots to: ./outputs/figures


In [12]:
def plot_ece_bars(df_uncert):
    """Save one ECE bar chart (PNG + PDF) per dataset/model pair.

    Rows sharing a method within a pair are averaged before plotting; the
    y-axis is left to autoscale.
    """
    for pair, rows in df_uncert.groupby(["dataset", "model"], sort=False):
        dataset, model = pair
        ece_by_method = rows.groupby("method", as_index=False)["ECE"].mean()

        fig, ax = plt.subplots()
        ax.bar(ece_by_method["method"], ece_by_method["ECE"])
        ax.set(
            ylabel="ECE (Lower is Better)",
            title=f"ECE by Method — {dataset} / {model}",
        )
        ax.grid(axis="y", alpha=0.3)

        save_fig(fig, f"ECE_{dataset}_{model}")

plot_ece_bars(df_uncert)
print(f"Saved ECE plots to: {FIG_DIR}")

Saved ECE plots to: ./outputs/figures


In [13]:
def plot_arc_area_bars(df_uncert):
    """Save one ARC-area bar chart (PNG + PDF) per dataset/model pair.

    Rows sharing a method within a pair are averaged before plotting; the
    y-axis is fixed to the 0-1 range.
    """
    column = "ARC_area"
    for (dataset, model), subset in df_uncert.groupby(["dataset", "model"], sort=False):
        averaged = subset.groupby("method", as_index=False)[column].mean()

        fig, ax = plt.subplots()
        ax.bar(averaged["method"], averaged[column])
        ax.set_ylim(0.0, 1.0)
        ax.set_ylabel("ARC Area (Higher is Better)")
        ax.set_title(f"ARC Area by Method — {dataset} / {model}")
        ax.grid(axis="y", alpha=0.3)

        save_fig(fig, f"ARC_area_{dataset}_{model}")

plot_arc_area_bars(df_uncert)
print(f"Saved ARC-area plots to: {FIG_DIR}")

Saved ARC-area plots to: ./outputs/figures


In [14]:
def plot_rejection_curves(df_reject):
    """Save one accuracy-vs-rejection figure (PNG + PDF) per dataset/model pair.

    Expects columns: dataset, model, method, reject_percent,
    accuracy_after_rejection, accuracy_before_rejection.

    Each method is drawn as one line. Rows that share a reject_percent within a
    method (e.g. several seeds) are averaged first, in line with the per-method
    averaging of the bar-chart cells, so every line has exactly one point per
    percentile instead of doubling back on itself. With a single run per pair
    the plotted values are the run's own values.
    """
    for (dataset, model), g in df_reject.groupby(["dataset", "model"], sort=False):
        fig, ax = plt.subplots()

        # Plot one line per method
        for method, gm in g.groupby("method", sort=False):
            # groupby sorts its keys, so reject_percent comes out ascending.
            curve = gm.groupby("reject_percent", as_index=False)["accuracy_after_rejection"].mean()
            ax.plot(curve["reject_percent"], curve["accuracy_after_rejection"], marker="o", label=method)

        # Baseline line (accuracy before rejection), averaged over the pair's rows
        # rather than taken from whichever row happens to come first.
        base_acc = float(g["accuracy_before_rejection"].mean())
        ax.axhline(base_acc, linestyle="--", linewidth=1.0, label="Baseline (No Rejection)")

        ax.set_xlabel("Rejected Most-Uncertain Samples (%)")
        ax.set_ylabel("Accuracy on Kept Samples")
        ax.set_title(f"Accuracy vs Rejection — {dataset} / {model}")
        ax.grid(alpha=0.3)
        ax.legend()

        save_fig(fig, f"RejectionCurve_{dataset}_{model}")

plot_rejection_curves(df_reject)
print(f"Saved rejection curves to: {FIG_DIR}")

Saved rejection curves to: ./outputs/figures
