# 04. Model Evaluation - GreenSpace CNN

Comprehensive evaluation of the trained multitask CNN:

## Evaluation Components
1. **Per-task metrics**: Regression (MAE, R²), Binary (F1, AUC), Categorical (Accuracy)
2. **Model interpretability**: Feature importance, activation maps
3. **Error analysis**: Failure cases, confusion matrices
4. **Spatial analysis**: Geographic patterns in predictions
5. **Comparison**: Different architectures and baselines



In [1]:
# Setup: imports, paths, and dataframes
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

# Split manifests are produced by 02_data_preprocessing.ipynb; this notebook only reads them.
splits_dir = Path('../data/processed/splits')
train_csv, val_csv, test_csv = (
    splits_dir / filename for filename in ('train.csv', 'val.csv', 'test.csv')
)

# Fail fast, before reading anything, if any manifest is missing.
for p in (train_csv, val_csv, test_csv):
    assert p.exists(), f"Missing split manifest: {p} (run 02 first)"

train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv, val_csv, test_csv)
)
split_frames = {'train': train_df, 'val': val_df, 'test': test_df}

print('Loaded splits:', {name: len(frame) for name, frame in split_frames.items()})

# Binary labels are stored as probabilities; their columns are recognised by the '_p' suffix.
binary_cols = [col for col in train_df.columns if col.endswith('_p')]
assert binary_cols, 'No *_p binary prob cols found in split manifests'

# Every split must carry the integer class targets and the image path column.
class_cols = ['shade_class', 'score_class', 'veg_class']
for df_name, df in split_frames.items():
    for c in class_cols + ['image_path']:
        assert c in df.columns, f"Missing {c} in {df_name}.csv"

print('Binary prob cols:', binary_cols)
print('Class cols       :', class_cols)


  if not hasattr(np, "object"):


Loaded splits: {'train': 1896, 'val': 632, 'test': 632}
Binary prob cols: ['sports_field_p', 'multipurpose_open_area_p', 'children_s_playground_p', 'water_feature_p', 'gardens_p', 'walking_paths_p', 'built_structures_p', 'parking_lots_p']
Class cols       : ['shade_class', 'score_class', 'veg_class']


In [2]:
# Build datasets (no augmentation)
# NOTE(review): IMG_SIZE is not referenced by the code visible in this notebook
# (decode_image does not resize) -- confirm the images on disk already have this size.
IMG_SIZE = (512, 512)
# Batch size applied by make_ds when batching each split.
BATCH_SIZE = 8

# Number of classes per categorical head; make_ds clips labels into [0, NUM_* - 1].
NUM_SHADE = 2
NUM_SCORE = 5
NUM_VEG = 5

def decode_image(path):
    """Read a JPEG file and return it as a float32 RGB tensor scaled by 1/255.

    Args:
        path: File path of the image (string or scalar string tensor).

    Returns:
        A 3-channel float32 tensor. The image is decoded as-is; no resizing
        is performed here.
    """
    raw_bytes = tf.io.read_file(path)
    pixels_u8 = tf.io.decode_jpeg(raw_bytes, channels=3)
    return tf.cast(pixels_u8, tf.float32) / 255.0

def make_ds(df):
    """Build a batched, prefetched dataset of (image, labels) pairs from a split manifest.

    Args:
        df: DataFrame with an 'image_path' column, the columns listed in
            `binary_cols`, and 'shade_class', 'score_class', 'veg_class'.

    Returns:
        A tf.data.Dataset yielding (image_batch, label_dict) where label_dict has
        the keys 'bin_head', 'shade_head', 'score_head' and 'veg_head'. Rows keep
        the manifest order; no shuffling or augmentation is applied.
    """

    def class_targets(column, fill_value, shift, num_classes):
        # Missing labels take `fill_value`, values are shifted down by `shift`,
        # then clipped defensively into the valid range [0, num_classes - 1].
        raw = df[column].fillna(fill_value).astype(np.int32).values - shift
        return np.clip(raw, 0, num_classes - 1)

    image_paths = df['image_path'].astype(str).tolist()
    image_ds = tf.data.Dataset.from_tensor_slices(image_paths).map(
        decode_image, num_parallel_calls=tf.data.AUTOTUNE
    )

    # Labels are prepared the same way for every split (the notebook states this matches 03 training).
    targets = {
        'bin_head': df[binary_cols].fillna(0.0).astype(np.float32).values,
        'shade_head': class_targets('shade_class', 0, 0, NUM_SHADE),
        # score/veg are stored as 1..5 in the manifest; the heads use 0..4
        'score_head': class_targets('score_class', 1, 1, NUM_SCORE),
        'veg_head': class_targets('veg_class', 1, 1, NUM_VEG),
    }
    label_ds = tf.data.Dataset.from_tensor_slices(targets)

    paired = tf.data.Dataset.zip((image_ds, label_ds))
    return paired.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# One evaluation dataset per split, built by make_ds in manifest order.
train_ds, val_ds, test_ds = (
    make_ds(frame) for frame in (train_df, val_df, test_df)
)

split_sizes = {"train": len(train_df), "val": len(val_df), "test": len(test_df)}
print('Datasets ready:', split_sizes)


Datasets ready: {'train': 1896, 'val': 632, 'test': 632}


In [7]:
# Load a trained model
# Preferred layout: models/runs/<RUN_TAG>/ -- point RUN_DIR at the run to evaluate.
# Fallback: the legacy flat layout directly under ../models.

RUN_DIR = Path('../models/runs/20260129_074737')  # <-- change run tag here when needed
LEGACY_MODELS_DIR = Path('../models')


def _find_checkpoints(directory):
    """Return the sorted final*.keras files in `directory`, else the sorted best*.keras files.

    Returns an empty list when the directory does not exist or contains neither kind.
    The same patterns are used for every directory so both layouts behave alike and
    agree with the patterns named in the assertion message below.
    """
    if not directory.is_dir():
        return []
    return sorted(directory.glob('final*.keras')) or sorted(directory.glob('best*.keras'))


# 1) Preferred: run-scoped directory
candidates = _find_checkpoints(RUN_DIR)

# 2) Fallback: legacy flat ../models directory
if not candidates:
    candidates = _find_checkpoints(LEGACY_MODELS_DIR)
    if candidates:
        # Make the fallback visible: the evaluated checkpoint may belong to a different run
        # than the one RUN_DIR points at.
        print(f"WARNING: no checkpoint found in RUN_DIR={RUN_DIR}; falling back to {LEGACY_MODELS_DIR}")

assert candidates, (
    f"No model .keras found. Checked RUN_DIR={RUN_DIR} and ../models. "
    f"(Expected e.g. final*.keras or best*.keras)"
)

# Several matches: take the lexicographically last one.
model_path = candidates[-1]
model = tf.keras.models.load_model(str(model_path))
print('Loaded model from', model_path)


Loaded model from ../models/runs/20260129_074737/final_20260129_074737.keras


In [4]:
# Monitoring: per-head losses + metrics (train / val / test)
# This is the cleanest way to discuss "which head is improving" across runs.

# Re-compile with explicit per-head losses and metrics (intended to mirror training)
# so that evaluate(..., return_dict=True) reports them under predictable keys.
# No training happens in this notebook, so the optimizer choice has no effect on the numbers.
class_heads = ('shade_head', 'score_head', 'veg_head')

losses = {'bin_head': 'binary_crossentropy'}
losses.update({head: 'sparse_categorical_crossentropy' for head in class_heads})

metrics = {'bin_head': ['binary_accuracy']}
metrics.update({head: ['sparse_categorical_accuracy'] for head in class_heads})

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=losses,
    metrics=metrics,
)

# Infer the run tag from the run folder (models/runs/<RUN_TAG>/<file>) or, failing that,
# from the checkpoint filename (final_<RUN_TAG>.keras or best_<RUN_TAG>.keras).
# run_tag stays None when neither applies; the save cell then uses a timestamp instead.
run_tag = None
try:
    p = Path(model_path)

    # Folder layout: the component right after 'runs' is the tag, but only when it is a
    # directory component. Requiring it not to be the last part keeps a checkpoint stored
    # directly under runs/ from having its filename mistaken for the tag.
    if 'runs' in p.parts:
        runs_idx = p.parts.index('runs')
        if runs_idx + 1 < len(p.parts) - 1:
            run_tag = p.parts[runs_idx + 1]

    # Filename fallback: strip a known prefix from the stem. Both prefixes the loader
    # accepts are handled, so a best_<RUN_TAG>.keras checkpoint keeps its tag too.
    if run_tag is None and p.suffix == '.keras':
        for prefix in ('final_', 'best_'):
            if p.stem.startswith(prefix):
                run_tag = p.stem[len(prefix):] or None
                break
except (TypeError, ValueError) as exc:
    # Best effort only: the tag just names the monitoring file, so report and carry on.
    print(f'Could not infer run tag from {model_path!r}: {exc}')
print('Model run tag:', run_tag)

def eval_split(split_name, ds):
    """Evaluate the notebook-level `model` on one dataset and tag the result.

    Args:
        split_name: Label stored under the 'split' key (e.g. 'train', 'val', 'test').
        ds: Dataset passed to `model.evaluate`.

    Returns:
        The dict returned by `model.evaluate(..., return_dict=True)` with an
        additional 'split' entry holding `split_name`.
    """
    results = model.evaluate(ds, verbose=0, return_dict=True)
    return {**results, 'split': split_name}

# One evaluation pass per split; each row is the metrics dict tagged with its split name.
split_datasets = (('train', train_ds), ('val', val_ds), ('test', test_ds))
rows = [eval_split(split_name, split_ds) for split_name, split_ds in split_datasets]
mon = pd.DataFrame(rows).set_index('split')

# Columns worth reporting: total loss, per-head losses, per-head accuracies.
report_columns = [
    'loss',
    'bin_head_loss',
    'shade_head_loss',
    'score_head_loss',
    'veg_head_loss',
    'bin_head_binary_accuracy',
    'shade_head_sparse_categorical_accuracy',
    'score_head_sparse_categorical_accuracy',
    'veg_head_sparse_categorical_accuracy',
]

# Only keep the columns evaluate() actually returned; the rest are dropped quietly.
keep = [column for column in report_columns if column in mon.columns]

display(mon.loc[:, keep].round(4))
print('Note: per-head losses are cross-entropy terms (not directly comparable across heads).')
print('Best practice: compare each head across runs + compare train vs val for that head (over/underfitting).')


Model run tag: 20260129_115813


Unnamed: 0_level_0,loss,bin_head_loss,shade_head_loss,score_head_loss,veg_head_loss,bin_head_binary_accuracy,shade_head_sparse_categorical_accuracy,score_head_sparse_categorical_accuracy,veg_head_sparse_categorical_accuracy
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train,3.1724,0.4253,0.6368,1.1671,0.9432,0.7973,0.7104,0.491,0.6181
val,3.7419,0.4265,0.6507,1.4751,1.1896,0.8022,0.7294,0.3687,0.4905
test,3.869,0.4307,0.7346,1.4713,1.2323,0.7909,0.6962,0.4066,0.4778


Note: per-head losses are cross-entropy terms (not directly comparable across heads).
Best practice: compare each head across runs + compare train vs val for that head (over/underfitting).


In [5]:
# Save monitoring table
from datetime import datetime

# Destination folder for monitoring CSVs (created on demand).
out_dir = Path('../monitoring_output').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Name the file after the inferred run tag; without one, fall back to the current timestamp.
tag = run_tag if run_tag else datetime.now().strftime('%Y%m%d_%H%M%S')
out_path = out_dir.joinpath(f"loss_monitor_{tag}.csv")

# Only the compact report columns are written.
mon.loc[:, keep].to_csv(out_path)
print('Saved monitoring table to', out_path)


Saved monitoring table to /Users/starsrain/2025_codeProject/GreenSpace_CNN/monitoring_output/loss_monitor_20260129_115813.csv


In [9]:
# Predict on test
# Heads are matched by output name rather than by position, so a model whose outputs
# come back as a dict, or in a different order, cannot silently swap heads.
head_order = ('bin_head', 'shade_head', 'score_head', 'veg_head')
raw_preds = model.predict(test_ds, verbose=0)
if isinstance(raw_preds, dict):
    preds_by_head = raw_preds
else:
    output_names = list(getattr(model, 'output_names', None) or [])
    if set(head_order) <= set(output_names):
        preds_by_head = dict(zip(output_names, raw_preds))
    else:
        # Output names unavailable or unexpected: keep the positional convention, but say so.
        print(f'WARNING: model output names {output_names} do not match {list(head_order)}; '
              'assuming positional order bin/shade/score/veg')
        preds_by_head = dict(zip(head_order, raw_preds))

pred_bin, pred_shade, pred_score, pred_veg = (
    np.asarray(preds_by_head[head]) for head in head_order
)
# Predictions are compared row by row with test_df, so the counts must agree.
assert len(pred_bin) == len(test_df), (
    f"Prediction count {len(pred_bin)} does not match test manifest rows {len(test_df)}"
)

# Ground truth
# For binaries, prefer hard 0/1 columns if present (e.g. sports_field), else threshold *_p at 0.5
bin_names = [c[:-2] for c in binary_cols]
hard_bin_names = [c for c in bin_names if c in test_df.columns]

if hard_bin_names:
    y_bin_true = test_df[hard_bin_names].fillna(0).astype(int).values
    # align pred_bin columns to hard_bin_names order
    pred_bin_aligned = np.stack([pred_bin[:, bin_names.index(n)] for n in hard_bin_names], axis=1)
else:
    y_bin_true = (test_df[binary_cols].fillna(0.0).astype(np.float32).values >= 0.5).astype(int)
    pred_bin_aligned = pred_bin
    hard_bin_names = bin_names

# Class targets are prepared exactly as make_ds prepares them (same fill values, same
# 1..5 -> 0..4 shift, same clipping), so these metrics score the same labels that
# model.evaluate saw.
y_shade_true = np.clip(test_df['shade_class'].fillna(0).astype(int).values, 0, NUM_SHADE - 1)
# stored as 1..5; convert to 0..4 to match training targets
y_score_true = np.clip(test_df['score_class'].fillna(1).astype(int).values - 1, 0, NUM_SCORE - 1)
y_veg_true   = np.clip(test_df['veg_class'].fillna(1).astype(int).values - 1, 0, NUM_VEG - 1)

# Metrics
from sklearn.metrics import (
    precision_recall_fscore_support,
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)

# Precision / recall / F1 per binary label after thresholding the probability at 0.5.
print('--- Binary (threshold=0.5) ---')
for name, y_true, y_prob in zip(hard_bin_names, y_bin_true.T, pred_bin_aligned.T):
    y_hat = (y_prob >= 0.5).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(
        y_true, y_hat, average='binary', zero_division=0
    )
    print(f"{name:24s} P={p:.2f} R={r:.2f} F1={f1:.2f}")

# Threshold-free ranking metrics; labels with a single class in the ground truth
# are reported as NA and left out of the macro averages.
print('--- Binary (AUC) ---')
roc_list, ap_list = [], []
for name, y_true, y_prob in zip(hard_bin_names, y_bin_true.T, pred_bin_aligned.T):
    if np.unique(y_true).size >= 2:
        roc = float(roc_auc_score(y_true, y_prob))
        ap = float(average_precision_score(y_true, y_prob))
        roc_list.append(roc)
        ap_list.append(ap)
        print(f"{name:24s} ROC_AUC={roc:.3f} PR_AUC={ap:.3f}")
    else:
        print(f"{name:24s} ROC_AUC=NA PR_AUC=NA (single-class)")

if not roc_list:
    print("macro ROC_AUC=NA macro PR_AUC=NA (no definable labels)")
else:
    print(f"macro ROC_AUC={float(np.mean(roc_list)):.3f} macro PR_AUC={float(np.mean(ap_list)):.3f}")

# Top-1 accuracy per categorical head (argmax over the class axis).
print('--- Shade / Score / Veg ---')
shade_acc, score_acc, veg_acc = (
    accuracy_score(truth, probs.argmax(axis=1))
    for truth, probs in (
        (y_shade_true, pred_shade),
        (y_score_true, pred_score),
        (y_veg_true, pred_veg),
    )
)
print(f"Shade accuracy: {shade_acc:.3f}")
print(f"Score accuracy: {score_acc:.3f}")
print(f"Veg   accuracy: {veg_acc:.3f}")

# Expected-value MAE for score/veg: weight the class values 1..5 by the predicted
# per-class outputs and compare against the true label on the same 1..5 scale.
classes_1to5 = np.arange(1, 6, dtype=np.float32)
score_expected = np.sum(pred_score * classes_1to5, axis=1)
veg_expected   = np.sum(pred_veg * classes_1to5, axis=1)

score_true_1to5 = np.asarray(y_score_true + 1, dtype=np.float32)
veg_true_1to5   = np.asarray(y_veg_true + 1, dtype=np.float32)

mae_score = float(np.abs(score_expected - score_true_1to5).mean())
mae_veg   = float(np.abs(veg_expected - veg_true_1to5).mean())
print(f"Score MAE (expected value): {mae_score:.3f}")
print(f"Veg   MAE (expected value): {mae_veg:.3f}")


--- Binary (threshold=0.5) ---
sports_field             P=0.93 R=0.37 F1=0.53
multipurpose_open_area   P=0.98 R=0.67 F1=0.79
children_s_playground    P=0.00 R=0.00 F1=0.00
water_feature            P=0.00 R=0.00 F1=0.00
gardens                  P=0.00 R=0.00 F1=0.00
walking_paths            P=0.95 R=0.73 F1=0.82
built_structures         P=0.90 R=0.56 F1=0.69
parking_lots             P=0.85 R=0.33 F1=0.48
--- Binary (AUC) ---
sports_field             ROC_AUC=0.923 PR_AUC=0.833
multipurpose_open_area   ROC_AUC=0.924 PR_AUC=0.971
children_s_playground    ROC_AUC=0.762 PR_AUC=0.228
water_feature            ROC_AUC=0.648 PR_AUC=0.322
gardens                  ROC_AUC=0.739 PR_AUC=0.130
walking_paths            ROC_AUC=0.897 PR_AUC=0.955
built_structures         ROC_AUC=0.882 PR_AUC=0.855
parking_lots             ROC_AUC=0.890 PR_AUC=0.753
macro ROC_AUC=0.833 macro PR_AUC=0.631
--- Shade / Score / Veg ---
Shade accuracy: 0.710
Score accuracy: 0.415
Veg   accuracy: 0.434
Score MAE (expected val