# 04. Model Evaluation - GreenSpace CNN

Comprehensive evaluation of the trained multitask CNN:

## Evaluation Components
1. **Per-task metrics**: Regression (MAE, R²), Binary (F1, AUC), Categorical (Accuracy)
2. **Model interpretability**: Feature importance, activation maps
3. **Error analysis**: Failure cases, confusion matrices
4. **Spatial analysis**: Geographic patterns in predictions
5. **Comparison**: Different architectures and baselines



In [1]:
# Setup: imports, paths, and dataframes
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

# Paths
train_csv = Path('../data/processed/splits/train.csv')
val_csv   = Path('../data/processed/splits/val.csv')
test_csv  = Path('../data/processed/splits/test.csv')

assert val_csv.exists(), 'Missing val.csv split manifest. Run 02 first.'

val_df   = pd.read_csv(val_csv)
print('Loaded val split:', len(val_df))

# Identify label columns (match training)
binary_cols = [c for c in val_df.columns if c.endswith('_p') and not c.startswith(('shade_p_', 'score_p_'))]
shade_cols  = [c for c in val_df.columns if c.startswith('shade_p_')]
score_cols  = [c for c in val_df.columns if c.startswith('score_p_')]

print('Binary labels:', binary_cols)
print('Shade cols   :', shade_cols)
print('Score cols   :', score_cols)



Loaded val split: 10
Binary labels: ['sports_field_p', 'multipurpose_open_area_p', 'childrens_playground_p', 'water_feature_p', 'gardens_p', 'walking_paths_p', 'built_structures_p']
Shade cols   : ['shade_p_none', 'shade_p_some', 'shade_p_abundant']
Score cols   : ['score_p_1', 'score_p_2', 'score_p_3', 'score_p_4', 'score_p_5']


In [2]:
# Build val dataset (no augmentation)
IMG_SIZE = (512, 512)
BATCH_SIZE = 8

def decode_image(path):
    img = tf.io.read_file(path)
    img = tf.io.decode_jpeg(img, channels=3)
    img = tf.cast(img, tf.float32) / 255.0
    return img

paths = val_df['image_path'].astype(str).tolist()

ds_paths = tf.data.Dataset.from_tensor_slices(paths)
ds_imgs = ds_paths.map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)

# labels
y_bin = val_df[binary_cols].astype(np.float32).values
shade_cols = [c for c in val_df.columns if c.startswith('shade_p_')]
score_cols = [c for c in val_df.columns if c.startswith('score_p_')]
y_shade = val_df[shade_cols].astype(np.float32).values
y_score = val_df[score_cols].astype(np.float32).values

ds_labels = tf.data.Dataset.from_tensor_slices({
    'bin_head': y_bin,
    'shade_head': y_shade,
    'score_head': y_score,
})
val_ds = tf.data.Dataset.zip((ds_imgs, ds_labels)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
print('Validation dataset ready:', len(paths))



Validation dataset ready: 10


2025-10-12 22:26:17.102473: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-10-12 22:26:17.102661: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-10-12 22:26:17.102668: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-10-12 22:26:17.103011: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-12 22:26:17.103039: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Load best model
model_path = Path('../models/best.keras')
assert model_path.exists(), 'Missing best.keras. Train (03) first.'
model = tf.keras.models.load_model(str(model_path))
print('Loaded model from', model_path)



Loaded model from ../models/best.keras


In [4]:
# Predict on validation
pred_bin, pred_shade, pred_score = model.predict(val_ds, verbose=0)

# Build ground-truth arrays from val_df
bin_names = [c[:-2] for c in binary_cols]
bin_names = [c for c in bin_names if c in val_df.columns]
y_bin_true = val_df[bin_names].astype(int).values

y_shade_true = val_df['shade_class'].astype(int).values if 'shade_class' in val_df.columns else None
y_score_true = val_df['score_class'].astype(int).values if 'score_class' in val_df.columns else None

# Calibration: pick threshold per binary label by maximizing F1 on val
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

thresholds = {}
metrics_bin = {}
ths = np.linspace(0.05, 0.95, 19)
for i, name in enumerate(bin_names):
    best_f1, best_t = -1.0, 0.5
    y_prob = pred_bin[:, i]
    y_true = y_bin_true[:, i]
    for t in ths:
        y_hat = (y_prob >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    thresholds[name] = float(best_t)
    # report at chosen threshold
    y_hat = (y_prob >= best_t).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
    metrics_bin[name] = {'precision': float(p), 'recall': float(r), 'f1': float(f1)}

# Shade/Score metrics on val (argmax)
metrics_val = {'binary': metrics_bin, 'shade': {}, 'score': {}}
if y_shade_true is not None:
    shade_pred_class = pred_shade.argmax(axis=1)
    from sklearn.metrics import accuracy_score
    acc_shade = accuracy_score(y_shade_true, shade_pred_class)
    metrics_val['shade']['accuracy'] = float(acc_shade)

if y_score_true is not None:
    score_pred_class = pred_score.argmax(axis=1) + 1
    acc_score = accuracy_score(y_score_true, score_pred_class)
    # expected score MAE
    classes = np.arange(1, pred_score.shape[1] + 1, dtype=np.float32)
    score_expected = (pred_score * classes).sum(axis=1)
    y_true = y_score_true.astype(np.float32)
    valid = (~np.isnan(score_expected)) & (~np.isnan(y_true))
    mae_score = float(np.mean(np.abs(score_expected[valid] - y_true[valid]))) if valid.sum() > 0 else float('nan')
    metrics_val['score']['accuracy'] = float(acc_score)
    metrics_val['score']['mae_expected'] = float(mae_score)

# Save thresholds
thr_path = Path('../data/processed/thresholds.json')
thr_path.parent.mkdir(parents=True, exist_ok=True)
with open(thr_path, 'w') as f:
    json.dump({'thresholds': thresholds}, f, indent=2)

print('Calibrated thresholds saved to', thr_path)
print('Binary metrics (val):')
for k, v in metrics_bin.items():
    print(f"  {k}: P={v['precision']:.2f} R={v['recall']:.2f} F1={v['f1']:.2f} @t={thresholds[k]:.2f}")
if 'accuracy' in metrics_val.get('shade', {}):
    print(f"Shade val accuracy: {metrics_val['shade']['accuracy']:.2f}")
if 'accuracy' in metrics_val.get('score', {}):
    print(f"Score val accuracy: {metrics_val['score']['accuracy']:.2f}; MAE(exp): {metrics_val['score']['mae_expected']:.2f}")



2025-10-12 22:26:25.941941: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Calibrated thresholds saved to ../data/processed/thresholds.json
Binary metrics (val):
  sports_field: P=0.00 R=0.00 F1=0.00 @t=0.05
  multipurpose_open_area: P=0.00 R=0.00 F1=0.00 @t=0.05
  childrens_playground: P=0.00 R=0.00 F1=0.00 @t=0.05
  water_feature: P=0.00 R=0.00 F1=0.00 @t=0.05
  gardens: P=0.00 R=0.00 F1=0.00 @t=0.05
  walking_paths: P=0.00 R=0.00 F1=0.00 @t=0.05
  built_structures: P=0.00 R=0.00 F1=0.00 @t=0.05
Shade val accuracy: 0.60
Score val accuracy: 0.10; MAE(exp): nan
