# 04. Model Evaluation - GreenSpace CNN

Comprehensive evaluation of the trained multitask CNN:

## Evaluation Components
1. **Per-task metrics**: Regression (MAE, R²), Binary (F1, AUC), Categorical (Accuracy)
2. **Model interpretability**: Feature importance, activation maps
3. **Error analysis**: Failure cases, confusion matrices
4. **Spatial analysis**: Geographic patterns in predictions
5. **Comparison**: Different architectures and baselines



In [1]:
# Setup: imports, paths, and dataframes
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

# Split manifests are produced by 02_data_preprocessing.ipynb; this notebook only reads them.
splits_dir = Path('../data/processed/splits')
train_csv, val_csv, test_csv = (
    splits_dir / f'{split}.csv' for split in ('train', 'val', 'test')
)

# Fail early (with a pointer to the producing notebook) if any manifest is absent.
for manifest in (train_csv, val_csv, test_csv):
    assert manifest.exists(), f"Missing split manifest: {manifest} (run 02 first)"

train_df, val_df, test_df = (
    pd.read_csv(manifest) for manifest in (train_csv, val_csv, test_csv)
)

print('Loaded splits:', {"train": len(train_df), "val": len(val_df), "test": len(test_df)})

# Binary targets live in the *_p columns (stored as probabilities);
# the column list is taken from the train manifest.
binary_cols = [col for col in train_df.columns if col.endswith('_p')]
assert binary_cols, 'No *_p binary prob cols found in split manifests'

# Every split must carry the three class-target columns plus the image path.
class_cols = ['shade_class', 'score_class', 'veg_class']
required_cols = class_cols + ['image_path']
for split_name, frame in (('train', train_df), ('val', val_df), ('test', test_df)):
    for col in required_cols:
        assert col in frame.columns, f"Missing {col} in {split_name}.csv"

print('Binary prob cols:', binary_cols)
print('Class cols       :', class_cols)


  if not hasattr(np, "object"):


Loaded splits: {'train': 1896, 'val': 632, 'test': 632}
Binary prob cols: ['sports_field_p', 'multipurpose_open_area_p', 'children_s_playground_p', 'water_feature_p', 'gardens_p', 'walking_paths_p', 'built_structures_p', 'parking_lots_p']
Class cols       : ['shade_class', 'score_class', 'veg_class']


In [2]:
# Build datasets (no augmentation)
# NOTE(review): IMG_SIZE is not referenced anywhere else in the visible notebook --
# decode_image does not resize. Presumably the images on disk already have this size; confirm.
IMG_SIZE = (512, 512)
# Batch size passed to .batch() in make_ds.
BATCH_SIZE = 8

# Number of classes per categorical head; make_ds clips integer labels to [0, NUM_* - 1].
NUM_SHADE = 2
NUM_SCORE = 5
NUM_VEG = 5

def decode_image(path):
    """Read a JPEG file and return it as a 3-channel float32 tensor scaled by 1/255.

    No resizing or augmentation is applied here.
    """
    raw_bytes = tf.io.read_file(path)
    rgb = tf.io.decode_jpeg(raw_bytes, channels=3)
    return tf.cast(rgb, tf.float32) / 255.0

def make_ds(df):
    """Build a batched, prefetched tf.data pipeline of (image, labels) from a split manifest.

    Rows are consumed in manifest order (no shuffling is applied). Labels are emitted
    as a dict keyed by head name: 'bin_head', 'shade_head', 'score_head', 'veg_head'.
    Label conventions follow the original note "match 03 training".
    """
    def _class_ids(column, fill_value, offset, num_classes):
        # Missing -> fill_value, shift to 0-based, then clip defensively into the valid range.
        ids = df[column].fillna(fill_value).astype(np.int32).values - offset
        return np.clip(ids, 0, num_classes - 1)

    targets = {
        'bin_head': df[binary_cols].fillna(0.0).astype(np.float32).values,
        # shade is used as stored (offset 0)
        'shade_head': _class_ids('shade_class', 0, 0, NUM_SHADE),
        # score/veg are stored as 1..5 in the manifest; training uses 0..4
        'score_head': _class_ids('score_class', 1, 1, NUM_SCORE),
        'veg_head': _class_ids('veg_class', 1, 1, NUM_VEG),
    }

    image_paths = df['image_path'].astype(str).tolist()
    images = tf.data.Dataset.from_tensor_slices(image_paths).map(
        decode_image, num_parallel_calls=tf.data.AUTOTUNE
    )
    labels = tf.data.Dataset.from_tensor_slices(targets)

    paired = tf.data.Dataset.zip((images, labels))
    return paired.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# One pipeline per split, all built by make_ds with identical settings.
train_ds, val_ds, test_ds = (make_ds(split_df) for split_df in (train_df, val_df, test_df))

split_sizes = {"train": len(train_df), "val": len(val_df), "test": len(test_df)}
print('Datasets ready:', split_sizes)


Datasets ready: {'train': 1896, 'val': 632, 'test': 632}


In [3]:
# Load a trained model
# Preferred layout: models/runs/<RUN_TAG>/final_<RUN_TAG>.keras
# Fallback: ../models (legacy flat layout).

runs_root = Path('../models/runs')

# A RUN_DIR that is already defined (manual override) wins; otherwise take the
# most recently modified folder under runs_root.
RUN_DIR = globals().get('RUN_DIR', None)
if RUN_DIR is None:
    run_dirs = []
    if runs_root.exists():
        run_dirs = [entry for entry in runs_root.iterdir() if entry.is_dir()]
        run_dirs.sort(key=lambda entry: entry.stat().st_mtime, reverse=True)
    RUN_DIR = run_dirs[0] if run_dirs else runs_root / 'REPLACE_WITH_RUN_TAG'

RUN_DIR = Path(RUN_DIR)
print('Using RUN_DIR =', RUN_DIR)


def _find_models(folder, patterns):
    """Return the sorted matches of the first glob pattern that matches anything in `folder`."""
    for pattern in patterns:
        found = sorted(folder.glob(pattern))
        if found:
            return found
    return []


# 1) Preferred: run-scoped directory (final* first, then best*)
candidates = _find_models(RUN_DIR, ('final*.keras', 'best*.keras')) if RUN_DIR.exists() else []

# 2) Fallback: legacy flat ../models directory
if not candidates:
    models_dir = Path('../models')
    candidates = _find_models(models_dir, ('final_*.keras', 'best*.keras'))

assert candidates, (
    f"No model .keras found. Checked RUN_DIR={RUN_DIR} and ../models. "
    f"(Expected e.g. final*.keras or best*.keras)"
)

# Matches are sorted by path, so the last entry is the lexicographically greatest one.
model_path = candidates[-1]
model = tf.keras.models.load_model(str(model_path))
print('Loaded model from', model_path)


Using RUN_DIR = ../models/runs/20260204_203132
Loaded model from ../models/runs/20260204_203132/final_20260204_203132.keras


## Label Loss Monitoring

In [4]:
# Monitoring: per-head losses + metrics (train / val / test)
# One number per head and split makes it easy to discuss which head improves across runs.

# Re-compile so that evaluate(..., return_dict=True) reports named per-head losses/metrics.
# The optimizer does not matter for evaluation; the loss/metric choices below are meant
# to mirror training. No loss_weights are passed here.
_CLASS_HEADS = ('shade_head', 'score_head', 'veg_head')

losses = {'bin_head': 'binary_crossentropy'}
losses.update({head: 'sparse_categorical_crossentropy' for head in _CLASS_HEADS})

metrics = {'bin_head': ['binary_accuracy']}
metrics.update({head: ['sparse_categorical_accuracy'] for head in _CLASS_HEADS})

model.compile(optimizer=tf.keras.optimizers.Adam(), loss=losses, metrics=metrics)

# Infer run tag from the run folder (models/runs/<RUN_TAG>/...) or filename (final_<RUN_TAG>.keras)
def infer_run_tag(path):
    """Return the run tag encoded in a model path, or None if none can be determined.

    Resolution order:
      1. ``.../runs/<RUN_TAG>/...`` -- the directory component that follows the LAST
         ``runs`` component. Only directory components are considered, so a model file
         sitting directly inside ``runs/`` is never mistaken for a tag, and an unrelated
         ``runs`` directory earlier in an absolute path is ignored.
      2. ``final_<RUN_TAG>.keras`` -- the tag embedded in the file name.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the model file.

    Returns
    -------
    str or None
        The run tag, or None when neither rule applies (or the embedded tag is empty).
    """
    path = Path(path)
    dir_parts = path.parts[:-1]  # drop the file name itself

    # Scan right-to-left so the innermost 'runs' directory wins.
    for idx in range(len(dir_parts) - 2, -1, -1):
        if dir_parts[idx] == 'runs':
            return dir_parts[idx + 1]

    prefix, suffix = 'final_', '.keras'
    name = path.name
    if name.startswith(prefix) and name.endswith(suffix):
        return name[len(prefix):-len(suffix)] or None
    return None


# Best-effort: the run tag is only used to name output files, and later cells fall back
# to a timestamp when it is None. Report the reason instead of silently ignoring it.
try:
    run_tag = infer_run_tag(model_path)
except (NameError, TypeError) as exc:  # model_path missing or not path-like
    run_tag = None
    print('Could not infer run tag:', exc)
print('Model run tag:', run_tag)

def eval_split(split_name, ds):
    """Evaluate the global `model` on `ds` and return its metrics dict tagged with the split name.

    The dict returned by `model.evaluate(..., return_dict=True)` gets an extra
    'split' key holding `split_name`.
    """
    results = model.evaluate(ds, verbose=0, return_dict=True)
    results.update(split=split_name)
    return results

split_datasets = (('train', train_ds), ('val', val_ds), ('test', test_ds))
rows = [eval_split(split_name, split_ds) for split_name, split_ds in split_datasets]
mon = pd.DataFrame(rows).set_index('split')

# Report columns, in display order: total loss, per-head losses, per-head accuracies.
wanted = [
    'loss',
    'bin_head_loss', 'shade_head_loss', 'score_head_loss', 'veg_head_loss',
    'bin_head_binary_accuracy',
    'shade_head_sparse_categorical_accuracy',
    'score_head_sparse_categorical_accuracy',
    'veg_head_sparse_categorical_accuracy',
]

# Keep only the keys evaluate() actually returned (names can vary with how the model was saved/loaded).
keep = [col for col in wanted if col in mon.columns]

display(mon[keep].round(4))
print('Note: per-head losses are cross-entropy terms (not directly comparable across heads).')
print('Best practice: compare each head across runs + compare train vs val for that head (over/underfitting).')


Model run tag: 20260204_203132


Unnamed: 0_level_0,loss,bin_head_loss,shade_head_loss,score_head_loss,veg_head_loss,bin_head_binary_accuracy,shade_head_sparse_categorical_accuracy,score_head_sparse_categorical_accuracy,veg_head_sparse_categorical_accuracy
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train,2.6311,0.3384,0.4969,0.9838,0.812,0.8501,0.7632,0.5765,0.6688
val,3.3585,0.3513,0.5446,1.3893,1.0733,0.8453,0.7421,0.4193,0.5585
test,3.5315,0.3445,0.6356,1.421,1.1304,0.8443,0.7104,0.4241,0.5633


Note: per-head losses are cross-entropy terms (not directly comparable across heads).
Best practice: compare each head across runs + compare train vs val for that head (over/underfitting).


### Save Loss Monitoring Artifact

In [5]:
# Save monitoring table
from datetime import datetime

out_dir = Path('../monitoring_output').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Name the file after the run; fall back to a timestamp when no run tag is known.
tag = run_tag if run_tag else datetime.now().strftime('%Y%m%d_%H%M%S')
out_path = out_dir / f"loss_monitor_{tag}.csv"

# Only the compact report columns are written.
compact_table = mon[keep]
compact_table.to_csv(out_path)
print('Saved monitoring table to', out_path)


Saved monitoring table to /Users/starsrain/2025_codeProject/GreenSpace_CNN/monitoring_output/loss_monitor_20260204_203132.csv


## Threshold Tuning

In [6]:
# Threshold tuning (binary heads): tune on VAL to maximize F1, then apply to TEST
# This does NOT require retraining.

from sklearn.metrics import precision_recall_curve
from datetime import datetime

# 1) Predict on validation; only the binary head's output is needed in this cell.
pred_bin_val, _pred_shade_val, _pred_score_val, _pred_veg_val = model.predict(val_ds, verbose=0)

# Label names are the *_p column names without the '_p' suffix, in the same order
# as the probability columns.
bin_names = [col[:-len('_p')] for col in binary_cols]
hard_bin_names_val = [label for label in bin_names if label in val_df.columns]

if hard_bin_names_val:
    # Hard 0/1 columns exist: use them as ground truth and pick the matching prediction columns.
    label_names = hard_bin_names_val
    y_bin_val_true = val_df[label_names].fillna(0).astype(int).values
    _col_idx = [bin_names.index(label) for label in label_names]
    pred_bin_val_aligned = pred_bin_val[:, _col_idx]
else:
    # Fallback: if hard 0/1 columns are not present, create pseudo-hard labels from *_p
    label_names = bin_names
    _soft_labels = val_df[binary_cols].fillna(0.0).astype(np.float32).values
    y_bin_val_true = (_soft_labels >= 0.5).astype(int)
    pred_bin_val_aligned = pred_bin_val

print('Tuning thresholds on VAL for labels:', label_names)


def tune_thresholds_f1(y_true_mat, y_prob_mat, label_names, min_pos=1):
    """Tune one decision threshold per label so that F1 is maximal on the given data.

    Candidate thresholds come from ``precision_recall_curve``. Among candidates with
    the same maximal F1, the smallest threshold is chosen (recall-friendly).

    Parameters
    ----------
    y_true_mat : array-like, shape (n_samples, n_labels)
        Ground-truth labels; each column is cast to int.
    y_prob_mat : array-like, shape (n_samples, n_labels)
        Predicted scores, column-aligned with ``y_true_mat``.
    label_names : sequence of str
        One name per column; determines how many columns are processed.
    min_pos : int, default 1
        Labels with fewer positives than this are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per label with the columns
        ``label, best_threshold, best_f1, best_precision, best_recall, pos_rate, n_pos, n, note``.
        Skipped labels carry NaN metrics and a ``note`` of 'single-class',
        'too-few-positives' or 'no-thresholds'. The columns are always present,
        even when ``label_names`` is empty, so callers can sort/select unconditionally.
    """
    columns = [
        'label', 'best_threshold', 'best_f1', 'best_precision', 'best_recall',
        'pos_rate', 'n_pos', 'n', 'note',
    ]

    def _row(name, pos_rate, n_pos, n, note='',
             threshold=np.nan, f1=np.nan, precision=np.nan, recall=np.nan):
        # Single place that defines the row schema (metrics default to "undefined").
        return {
            'label': name,
            'best_threshold': threshold,
            'best_f1': f1,
            'best_precision': precision,
            'best_recall': recall,
            'pos_rate': pos_rate,
            'n_pos': n_pos,
            'n': n,
            'note': note,
        }

    # Accept nested lists / DataFrames as well as ndarrays.
    y_true_mat = np.asarray(y_true_mat)
    y_prob_mat = np.asarray(y_prob_mat)

    rows = []
    for i, name in enumerate(label_names):
        y_true = y_true_mat[:, i].astype(int)
        y_prob = y_prob_mat[:, i].astype(float)

        n = int(y_true.size)
        n_pos = int(y_true.sum())
        pos_rate = float(y_true.mean()) if n else float('nan')

        # F1 tuning is meaningless without both classes (or with too few positives).
        single_class = np.unique(y_true).size < 2
        if single_class or n_pos < min_pos:
            note = 'single-class' if single_class else 'too-few-positives'
            rows.append(_row(name, pos_rate, n_pos, n, note=note))
            continue

        precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
        # thresholds has length = len(precision)-1
        if thresholds.size == 0:
            rows.append(_row(name, pos_rate, n_pos, n, note='no-thresholds'))
            continue

        # Drop the final (precision=1, recall=0) point, which has no threshold.
        p = precision[:-1]
        r = recall[:-1]
        f1 = (2 * p * r) / (p + r + 1e-12)  # epsilon avoids 0/0 when p == r == 0

        # argmax returns the first maximum; thresholds are increasing, so this is
        # the smallest threshold among ties.
        best_idx = int(np.argmax(f1))

        rows.append(_row(
            name, pos_rate, n_pos, n,
            threshold=float(thresholds[best_idx]),
            f1=float(f1[best_idx]),
            precision=float(p[best_idx]),
            recall=float(r[best_idx]),
        ))

    return pd.DataFrame(rows, columns=columns)


thresh_df = tune_thresholds_f1(y_bin_val_true, pred_bin_val_aligned, label_names)

# label -> tuned threshold, for reuse in later cells (labels without a finite threshold are left out)
best_thresholds = {}
for record in thresh_df.to_dict('records'):
    threshold = record['best_threshold']
    if np.isfinite(threshold):
        best_thresholds[record['label']] = float(threshold)

print('\nTop thresholds by best_f1 (VAL):')
_ranked = thresh_df.sort_values('best_f1', ascending=False).reset_index(drop=True)
display(_ranked)

# Unweighted means over the labels for which a threshold could be tuned.
_defined = thresh_df['best_f1'].notna()
if not _defined.any():
    print('overall (VAL): NA (no definable labels)')
else:
    _tuned_rows = thresh_df.loc[_defined]
    _mean_f1 = float(_tuned_rows['best_f1'].mean())
    _mean_p = float(_tuned_rows['best_precision'].mean())
    _mean_r = float(_tuned_rows['best_recall'].mean())
    print(
        "overall (VAL) over definable labels: "
        f"F1={_mean_f1:.3f} | "
        f"P={_mean_p:.3f} | "
        f"R={_mean_r:.3f}"
    )

# Optional: save thresholds to monitoring_output
out_dir = Path('../monitoring_output').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Use the run tag when one is defined (and not None); otherwise a timestamp.
_tag = globals().get('run_tag')
_tag = datetime.now().strftime('%Y%m%d_%H%M%S') if _tag is None else _tag

out_path = out_dir / f"thresholds_{_tag}.csv"
thresh_df.to_csv(out_path, index=False)
print('Saved thresholds to', out_path)


Tuning thresholds on VAL for labels: ['sports_field', 'multipurpose_open_area', 'children_s_playground', 'water_feature', 'gardens', 'walking_paths', 'built_structures', 'parking_lots']

Top thresholds by best_f1 (VAL):


Unnamed: 0,label,best_threshold,best_f1,best_precision,best_recall,pos_rate,n_pos,n,note
0,multipurpose_open_area,0.272524,0.935743,0.915521,0.956879,0.77057,487,632,
1,walking_paths,0.332493,0.907937,0.871951,0.94702,0.716772,453,632,
2,built_structures,0.450735,0.8,0.798354,0.801653,0.382911,242,632,
3,parking_lots,0.451296,0.712766,0.653659,0.783626,0.27057,171,632,
4,sports_field,0.431477,0.712251,0.706215,0.718391,0.275316,174,632,
5,children_s_playground,0.217674,0.42623,0.393939,0.464286,0.132911,84,632,
6,water_feature,0.250731,0.415663,0.312217,0.621622,0.175633,111,632,
7,gardens,0.083024,0.16309,0.093137,0.655172,0.045886,29,632,


overall (VAL) over definable labels: F1=0.634 | P=0.593 | R=0.744
Saved thresholds to /Users/starsrain/2025_codeProject/GreenSpace_CNN/monitoring_output/thresholds_20260204_203132.csv


In [7]:
# Predict on test
# NOTE(review): the unpacking assumes predict() returns the four head outputs in the
# order bin, shade, score, veg -- confirm against the model definition.
pred_bin, pred_shade, pred_score, pred_veg = model.predict(test_ds, verbose=0)

# Ground truth
# For binaries, prefer hard 0/1 columns if present (e.g. sports_field), else threshold *_p at 0.5
bin_names = [c[:-2] for c in binary_cols]
hard_bin_names = [c for c in bin_names if c in test_df.columns]

if hard_bin_names:
    y_bin_true = test_df[hard_bin_names].fillna(0).astype(int).values
    # align pred_bin columns to hard_bin_names order
    pred_bin_aligned = np.stack([pred_bin[:, bin_names.index(n)] for n in hard_bin_names], axis=1)
else:
    y_bin_true = (test_df[binary_cols].fillna(0.0).astype(np.float32).values >= 0.5).astype(int)
    pred_bin_aligned = pred_bin
    hard_bin_names = bin_names

# Class targets: apply exactly the conventions make_ds uses for the labels fed to
# model.evaluate (fill missing, shift score/veg from 1..5 to 0..4, clip into the valid
# class range). Without the clip, an out-of-range manifest value would make the
# accuracies below disagree with the monitoring table computed on the same split.
y_shade_true = np.clip(test_df['shade_class'].fillna(0).astype(int).values, 0, NUM_SHADE - 1)
y_score_true = np.clip(test_df['score_class'].fillna(1).astype(int).values - 1, 0, NUM_SCORE - 1)
y_veg_true   = np.clip(test_df['veg_class'].fillna(1).astype(int).values - 1, 0, NUM_VEG - 1)

# Metrics
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)

print('--- Binary (threshold=0.5) ---')
# Single pass: print the per-label metrics and collect F1 for the macro average
# (previously the same metrics were computed twice in two identical loops).
f1_list_05 = []
for i, name in enumerate(hard_bin_names):
    y_prob = pred_bin_aligned[:, i]
    y_hat = (y_prob >= 0.5).astype(int)
    y_true = y_bin_true[:, i]
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
    f1_list_05.append(float(f1))
    print(f"{name:24s} P={p:.2f} R={r:.2f} F1={f1:.2f}")

# Overall F1 for threshold=0.5 (binary heads): unweighted mean of the per-label F1
# values printed above.
if f1_list_05:
    print(f"overall F1 (threshold=0.5) = {float(np.mean(f1_list_05)):.3f}")
else:
    print('overall F1 (threshold=0.5) = NA (no labels)')

print('--- Binary (AUC) ---')
# Threshold-free ranking metrics per label; both are undefined when only one class
# is present in the test labels, so such labels are reported as NA and left out of the means.
roc_list, ap_list = [], []
for col_idx, label in enumerate(hard_bin_names):
    truth = y_bin_true[:, col_idx]
    scores = pred_bin_aligned[:, col_idx]

    if np.unique(truth).size >= 2:
        roc = float(roc_auc_score(truth, scores))
        ap = float(average_precision_score(truth, scores))
        roc_list.append(roc)
        ap_list.append(ap)
        print(f"{label:24s} ROC_AUC={roc:.3f} PR_AUC={ap:.3f}")
    else:
        print(f"{label:24s} ROC_AUC=NA PR_AUC=NA (single-class)")

if not roc_list:
    print("overall ROC_AUC=NA overall PR_AUC=NA (no definable labels)")
else:
    mean_roc = float(np.mean(roc_list))
    mean_ap = float(np.mean(ap_list))
    print(f"overall ROC_AUC={mean_roc:.3f} overall PR_AUC={mean_ap:.3f}")

print('--- Binary (tuned thresholds from val; optimize F1) ---')
# Thresholds come from the tuning cell; they must be a non-empty dict to be usable.
_tuned = globals().get('best_thresholds')
if isinstance(_tuned, dict) and _tuned:
    f1_list = []
    for col_idx, label in enumerate(hard_bin_names):
        thr = _tuned.get(label)
        usable = thr is not None and np.isfinite(thr)
        if not usable:
            print(f"{label:24s} thr=NA (not tuned / single-class on val)")
            continue

        truth = y_bin_true[:, col_idx]
        predicted = (pred_bin_aligned[:, col_idx] >= thr).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(truth, predicted, average='binary', zero_division=0)
        f1_list.append(float(f1))
        print(f"{label:24s} thr={thr:.3f} P={p:.2f} R={r:.2f} F1={f1:.2f}")

    # Mean over the labels that actually had a tuned threshold.
    if not f1_list:
        print('overall F1=NA (no definable labels)')
    else:
        print(f"overall F1={float(np.mean(f1_list)):.3f} (over tuned/defined labels)")
else:
    print('No tuned thresholds found. Run the threshold tuning cell above.')

print('--- Shade / Score / Veg ---')
# Top-1 accuracy: predicted class = index of the largest output per row.
shade_acc = accuracy_score(y_shade_true, np.argmax(pred_shade, axis=1))
score_acc = accuracy_score(y_score_true, np.argmax(pred_score, axis=1))
veg_acc   = accuracy_score(y_veg_true,   np.argmax(pred_veg, axis=1))
for _head_label, _acc in (('Shade', shade_acc), ('Score', score_acc), ('Veg  ', veg_acc)):
    print(f"{_head_label} accuracy: {_acc:.3f}")

# Expected-value MAE for score/veg, reported on the original 1..5 scale:
# expected value = sum over classes of (class value * predicted output for that class).
classes_1to5 = np.arange(1, 6, dtype=np.float32)


def _expected_class_value(probs):
    """Output-weighted mean class value per row, on the 1..5 scale."""
    return np.sum(probs * classes_1to5, axis=1)


score_expected = _expected_class_value(pred_score)
veg_expected   = _expected_class_value(pred_veg)

# Targets are 0-based here; shift back to 1..5 before comparing.
score_true_1to5 = (y_score_true + 1).astype(np.float32)
veg_true_1to5   = (y_veg_true + 1).astype(np.float32)

mae_score = float(np.abs(score_expected - score_true_1to5).mean())
mae_veg   = float(np.abs(veg_expected - veg_true_1to5).mean())
print(f"Score MAE (expected value): {mae_score:.3f}")
print(f"Veg   MAE (expected value): {mae_veg:.3f}")


--- Binary (threshold=0.5) ---
sports_field             P=0.74 R=0.69 F1=0.71
multipurpose_open_area   P=0.95 R=0.85 F1=0.89
children_s_playground    P=0.33 R=0.05 F1=0.08
water_feature            P=0.56 R=0.04 F1=0.07
gardens                  P=0.00 R=0.00 F1=0.00
walking_paths            P=0.90 R=0.90 F1=0.90
built_structures         P=0.84 R=0.71 F1=0.77
parking_lots             P=0.68 R=0.72 F1=0.70
overall F1 (threshold=0.5) = 0.516
--- Binary (AUC) ---
sports_field             ROC_AUC=0.899 PR_AUC=0.813
multipurpose_open_area   ROC_AUC=0.919 PR_AUC=0.972
children_s_playground    ROC_AUC=0.713 PR_AUC=0.225
water_feature            ROC_AUC=0.648 PR_AUC=0.328
gardens                  ROC_AUC=0.671 PR_AUC=0.066
walking_paths            ROC_AUC=0.911 PR_AUC=0.965
built_structures         ROC_AUC=0.911 PR_AUC=0.876
parking_lots             ROC_AUC=0.895 PR_AUC=0.755
overall ROC_AUC=0.821 overall PR_AUC=0.625
--- Binary (tuned thresholds from val; optimize F1) ---
sports_field          

In [None]:
# Save evaluation artifacts (per run) to report_outputs/
# - binary_metrics_<run_tag>.csv : per-label metrics (thr=0.5 + tuned if available + AUC)
# - eval_summary_<run_tag>.json  : compact overall summary + provenance

from pathlib import Path
from datetime import datetime
import json
import numpy as np
import pandas as pd

out_dir = Path('../report_outputs').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# File-name tag: the run tag when set and non-empty, otherwise a timestamp.
tag = globals().get('run_tag', None)
if not tag:
    tag = datetime.now().strftime('%Y%m%d_%H%M%S')

# --- Binary per-label table ---
# This cell consumes results of the evaluation cell; stop with a clear message if they are absent.
for _required in ('hard_bin_names', 'pred_bin_aligned', 'y_bin_true'):
    assert _required in globals(), f'Expected {_required} from the evaluation cell.'

def _binary_prf(truth, predicted):
    """Precision, recall and F1 of the positive class as plain floats (0 when undefined)."""
    prec, rec, f1, _support = precision_recall_fscore_support(
        truth, predicted, average='binary', zero_division=0
    )
    return float(prec), float(rec), float(f1)


# Tuned thresholds are optional; an absent or non-dict value means "none available".
_tuned_lookup = globals().get('best_thresholds')
if not isinstance(_tuned_lookup, dict):
    _tuned_lookup = {}

rows = []
for col_idx, label in enumerate(hard_bin_names):
    truth = y_bin_true[:, col_idx]
    scores = pred_bin_aligned[:, col_idx]

    # threshold=0.5
    p05, r05, f105 = _binary_prf(truth, (scores >= 0.5).astype(int))

    # AUCs (left as None when only one class is present)
    has_both_classes = np.unique(truth).size >= 2
    roc = float(roc_auc_score(truth, scores)) if has_both_classes else None
    ap = float(average_precision_score(truth, scores)) if has_both_classes else None

    # tuned threshold (if one exists for this label and is finite)
    thr = _tuned_lookup.get(label)
    if thr is not None and np.isfinite(thr):
        thr = float(thr)
        pt, rt, f1t = _binary_prf(truth, (scores >= thr).astype(int))
    else:
        thr = None
        pt = rt = f1t = None

    rows.append({
        'label': label,
        'support_pos_test': int(np.sum(truth == 1)),
        'support_neg_test': int(np.sum(truth == 0)),
        'P@0.5': p05,
        'R@0.5': r05,
        'F1@0.5': f105,
        'ROC_AUC': roc,
        'PR_AUC': ap,
        'tuned_thr': thr,
        'P@tuned': pt,
        'R@tuned': rt,
        'F1@tuned': f1t,
    })

bin_df = pd.DataFrame(rows)

# Overall (mean over labels where defined)
def _mean_defined(frame, column):
    """Mean of `column` over the rows where it holds a number, else None.

    Returns None when the column is absent or has no numeric entry. A column whose
    entries are all None is stored by pandas with object dtype, which np.nanmean
    cannot average; coercing to numeric first handles that case. Returning None
    rather than NaN also keeps the JSON summary strictly valid (NaN is not valid JSON).
    """
    if column not in frame.columns:
        return None
    values = pd.to_numeric(frame[column], errors='coerce').dropna()
    return float(values.mean()) if len(values) else None


def _fmt_metric(value):
    """Three-decimal rendering of a metric, or 'NA' when it is undefined."""
    return f"{value:.3f}" if value is not None else 'NA'


overall = {
    'overall_F1@0.5': _mean_defined(bin_df, 'F1@0.5'),
    'overall_ROC_AUC': _mean_defined(bin_df, 'ROC_AUC'),
    'overall_PR_AUC': _mean_defined(bin_df, 'PR_AUC'),
    'overall_F1@tuned': _mean_defined(bin_df, 'F1@tuned'),
}

# --- Showcase overall metrics first ---
print('--- Overall metrics ---')
print(f"Binary overall F1@0.5 : {_fmt_metric(overall['overall_F1@0.5'])}")
print(f"Binary overall ROC_AUC: {_fmt_metric(overall['overall_ROC_AUC'])}")
print(f"Binary overall PR_AUC : {_fmt_metric(overall['overall_PR_AUC'])}")
print(f"Binary overall F1@tuned: {_fmt_metric(overall['overall_F1@tuned'])}")

# --- Save a compact JSON summary (provenance + multiclass + overall) ---
def _float_global(name):
    """Notebook global `name` converted to float, or None when it is not defined."""
    return float(globals()[name]) if name in globals() else None


_n_test = int(len(y_shade_true)) if 'y_shade_true' in globals() else None

summary = {
    # provenance
    'run_tag': tag,
    'saved_at': datetime.now().isoformat(timespec='seconds'),
    'model_path': str(globals().get('model_path', '')),
    'run_dir': str(globals().get('RUN_DIR', '')),
    'n_test': _n_test,
    # binary heads
    'binary_overall': overall,
    # categorical heads (None when the evaluation cell has not produced the value)
    'shade_accuracy': _float_global('shade_acc'),
    'score_accuracy': _float_global('score_acc'),
    'veg_accuracy': _float_global('veg_acc'),
    'score_mae_expected_value': _float_global('mae_score'),
    'veg_mae_expected_value': _float_global('mae_veg'),
}

summary_path = out_dir / f"eval_summary_{tag}.json"
_summary_json = json.dumps(summary, indent=2)
summary_path.write_text(_summary_json)
print('Saved:', summary_path)

# --- Then save label-level metrics ---
bin_out_path = out_dir / f"binary_metrics_{tag}.csv"
bin_df.to_csv(bin_out_path, index=False)
print('Saved:', bin_out_path)


--- Overall metrics ---
Binary overall F1@0.5 : 0.516
Binary overall ROC_AUC: 0.821
Binary overall PR_AUC : 0.625
Binary overall F1@tuned: 0.608
Saved: /Users/starsrain/2025_codeProject/GreenSpace_CNN/report_outputs/eval_summary_20260204_203132.json
Saved: /Users/starsrain/2025_codeProject/GreenSpace_CNN/report_outputs/binary_metrics_20260204_203132.csv
