# 04. Model Evaluation - GreenSpace CNN

Comprehensive evaluation of the trained multitask CNN:

## Evaluation Components
1. **Per-task metrics**: Regression (MAE, R²), Binary (F1, AUC), Categorical (Accuracy)
2. **Model interpretability**: Feature importance, activation maps
3. **Error analysis**: Failure cases, confusion matrices
4. **Spatial analysis**: Geographic patterns in predictions
5. **Comparison**: Different architectures and baselines



In [1]:
# Setup: imports, paths, and dataframes
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

# --- Test split manifest ------------------------------------------------------
# Evaluation runs on the held-out TEST split (matches the training manifests).
test_csv = Path('../data/processed/splits/test.csv')
assert test_csv.exists(), 'Missing test.csv split manifest. Run 02 first.'

test_df = pd.read_csv(test_csv)
print('Loaded test split:', len(test_df))

# Every column whose name ends in "_p" is treated as a binary label stored as
# a probability.
binary_cols = [col for col in test_df.columns if col.endswith('_p')]

# The three categorical targets must be present as columns; stop at the first
# one that is missing, with a message naming it.
for required_col, missing_msg in (
    ('shade_class', 'Missing shade_class in split manifest'),
    ('score_class', 'Missing score_class in split manifest'),
    ('veg_class', 'Missing veg_class in split manifest'),
):
    assert required_col in test_df.columns, missing_msg

print('Binary prob cols:', binary_cols)
print('Class cols       :', ['shade_class', 'score_class', 'veg_class'])




Loaded test split: 156
Binary prob cols: ['sports_field_p', 'multipurpose_open_area_p', 'children_s_playground_p', 'water_feature_p', 'gardens_p', 'walking_paths_p', 'built_structures_p', 'parking_lots_p']
Class cols       : ['shade_class', 'score_class', 'veg_class']


In [2]:
# Build test dataset (no augmentation)
# Input geometry and batch size for the evaluation pipeline.
IMG_SIZE, BATCH_SIZE = (512, 512), 8

# Number of classes per categorical head; used further down in this cell to
# clip label indices into the range 0..NUM_* - 1.
NUM_SHADE, NUM_SCORE, NUM_VEG = 2, 5, 5

def decode_image(path):
    """Read one JPEG file and return it as a float32 RGB tensor in [0, 1].

    Args:
        path: Scalar string (or string tensor) holding the image file path.

    Returns:
        A float32 tensor of shape (IMG_SIZE[0], IMG_SIZE[1], 3).

    NOTE(review): the resize assumes the training notebook fed the model
    images at IMG_SIZE with the same /255 scaling -- confirm against 03.
    """
    raw_bytes = tf.io.read_file(path)
    img = tf.io.decode_jpeg(raw_bytes, channels=3)
    img = tf.cast(img, tf.float32) / 255.0
    # Give every element the same spatial size. Without this, IMG_SIZE is never
    # applied and .batch() fails as soon as two files differ in resolution.
    # Images already at IMG_SIZE should pass through numerically unchanged.
    img = tf.image.resize(img, IMG_SIZE)
    return img

# Image stream: manifest paths -> decoded tensors.
paths = test_df['image_path'].astype(str).tolist()
ds_paths = tf.data.Dataset.from_tensor_slices(paths)
ds_imgs = ds_paths.map(decode_image, num_parallel_calls=tf.data.AUTOTUNE)


def _class_targets(column, fill_value, offset, num_classes):
    """Return int32 class indices for `column`, clipped to 0..num_classes - 1.

    Missing values are replaced by `fill_value` before the cast; `offset` is
    then subtracted to move manifest values onto the 0-based index scale.
    """
    raw = test_df[column].fillna(fill_value).astype(np.int32).values
    return np.clip(raw - offset, 0, num_classes - 1)


# labels (match 03 training)
# Binary heads keep the soft probabilities; a missing value becomes 0.0.
y_bin = test_df[binary_cols].fillna(0.0).astype(np.float32).values

# shade is used as stored; score/veg are stored as 1..5 in the manifest while
# training uses 0..4, hence the offset of 1. Clipping is purely defensive.
y_shade = _class_targets('shade_class', 0, 0, NUM_SHADE)
y_score = _class_targets('score_class', 1, 1, NUM_SCORE)
y_veg = _class_targets('veg_class', 1, 1, NUM_VEG)

# One label dict per example, keyed by head name.
label_arrays = {
    'bin_head': y_bin,
    'shade_head': y_shade,
    'score_head': y_score,
    'veg_head': y_veg,
}
ds_labels = tf.data.Dataset.from_tensor_slices(label_arrays)

# Pair images with labels, then batch and prefetch.
paired_ds = tf.data.Dataset.zip((ds_imgs, ds_labels))
test_ds = paired_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
print('Test dataset ready:', len(paths))


Test dataset ready: 156


In [3]:
# Load a trained model
# Checkpoint choice: the final_*.keras file that sorts last by path (the latest
# one when the suffix is a zero-padded timestamp); if there is none, best.keras.
models_dir = Path('../models')
finals = sorted(models_dir.glob('final_*.keras'))
model_path = finals[-1] if finals else (models_dir / 'best.keras')

# Stop with a clear message before handing a bad path to Keras.
assert model_path.exists(), f"Missing model file: {model_path}"
model = tf.keras.models.load_model(str(model_path))
print('Loaded model from', model_path)


Loaded model from ../models/final_20260103_173019.keras


In [4]:
# Predict on test
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# --- Predictions, bound to heads by name --------------------------------------
# Plain positional unpacking would silently mis-assign heads if the saved
# model's output order differs from the order assumed here (the score and veg
# heads are both 5-wide, so shapes alone would not reveal a swap). Bind by
# output name when names are available; otherwise keep the positional order.
# NOTE(review): head names are taken from the label dict keys used when the
# test dataset is built -- confirm they equal the model's output layer names.
head_names = ('bin_head', 'shade_head', 'score_head', 'veg_head')
raw_preds = model.predict(test_ds, verbose=0)

if isinstance(raw_preds, dict):
    preds_by_head = dict(raw_preds)
else:
    output_names = list(getattr(model, 'output_names', None) or [])
    preds_by_head = dict(zip(output_names, raw_preds))

if all(h in preds_by_head for h in head_names):
    pred_bin, pred_shade, pred_score, pred_veg = (preds_by_head[h] for h in head_names)
elif isinstance(raw_preds, dict):
    # A dict cannot be unpacked positionally (that would yield its keys).
    raise KeyError(f"Model outputs {sorted(raw_preds)} do not include the heads {list(head_names)}")
else:
    # Output names unavailable or different: fall back to positional order.
    pred_bin, pred_shade, pred_score, pred_veg = raw_preds

assert len(pred_bin) == len(test_df), 'Prediction count does not match the test manifest'

# --- Ground truth -------------------------------------------------------------
# Binary tasks: for EACH task prefer the hard 0/1 column (e.g. sports_field) if
# the manifest has it, else threshold the *_p probability at 0.5. Deciding per
# task means a manifest with only some hard columns still evaluates every task
# instead of silently dropping the ones without a hard column.
bin_names = [c[:-2] for c in binary_cols]
truth_columns = []
for task_name, prob_col in zip(bin_names, binary_cols):
    if task_name in test_df.columns:
        truth_columns.append(test_df[task_name].fillna(0).astype(int).values)
    else:
        soft = test_df[prob_col].fillna(0.0).astype(np.float32).values
        truth_columns.append((soft >= 0.5).astype(int))

if truth_columns:
    y_bin_true = np.stack(truth_columns, axis=1)
else:
    y_bin_true = np.zeros((len(test_df), 0), dtype=int)

# Names kept for compatibility; truth columns are already in prediction order.
pred_bin_aligned = pred_bin
hard_bin_names = bin_names

# Class targets: same fill / shift / clip as the labels built for the test
# dataset, so accuracy and MAE are measured on the label definition the model
# was given. score/veg are stored as 1..5 and shifted to 0..4.
y_shade_true = np.clip(
    test_df['shade_class'].fillna(0).astype(int).values, 0, pred_shade.shape[1] - 1)
y_score_true = np.clip(
    test_df['score_class'].fillna(1).astype(int).values - 1, 0, pred_score.shape[1] - 1)
y_veg_true = np.clip(
    test_df['veg_class'].fillna(1).astype(int).values - 1, 0, pred_veg.shape[1] - 1)

# --- Metrics ------------------------------------------------------------------
print('--- Binary (threshold=0.5) ---')
for i, name in enumerate(hard_bin_names):
    y_prob = pred_bin_aligned[:, i]
    y_hat = (y_prob >= 0.5).astype(int)
    y_true = y_bin_true[:, i]
    # zero_division=0: report 0 instead of warning when a task has no
    # predicted (or no true) positives.
    p, r, f1, _ = precision_recall_fscore_support(y_true, y_hat, average='binary', zero_division=0)
    print(f"{name:24s} P={p:.2f} R={r:.2f} F1={f1:.2f}")

print('--- Shade / Score / Veg ---')
shade_acc = accuracy_score(y_shade_true, pred_shade.argmax(axis=1))
score_acc = accuracy_score(y_score_true, pred_score.argmax(axis=1))
veg_acc   = accuracy_score(y_veg_true,   pred_veg.argmax(axis=1))
print(f"Shade accuracy: {shade_acc:.3f}")
print(f"Score accuracy: {score_acc:.3f}")
print(f"Veg   accuracy: {veg_acc:.3f}")

# Expected-value MAE for score/veg on the original 1-based scale. Class values
# are derived from each head's output width rather than hard-coded as 1..5.
score_values = np.arange(1, pred_score.shape[1] + 1, dtype=np.float32)
veg_values   = np.arange(1, pred_veg.shape[1] + 1, dtype=np.float32)
classes_1to5 = score_values  # original variable name, kept in case later cells use it

score_expected = (pred_score * score_values).sum(axis=1)
veg_expected   = (pred_veg   * veg_values).sum(axis=1)

score_true_1to5 = (y_score_true + 1).astype(np.float32)
veg_true_1to5   = (y_veg_true + 1).astype(np.float32)

mae_score = float(np.mean(np.abs(score_expected - score_true_1to5)))
mae_veg   = float(np.mean(np.abs(veg_expected   - veg_true_1to5)))
print(f"Score MAE (expected value): {mae_score:.3f}")
print(f"Veg   MAE (expected value): {mae_veg:.3f}")


--- Binary (threshold=0.5) ---
sports_field             P=0.67 R=0.04 F1=0.08
multipurpose_open_area   P=1.00 R=0.02 F1=0.03
children_s_playground    P=0.20 R=0.30 F1=0.24
water_feature            P=0.00 R=0.00 F1=0.00
gardens                  P=0.04 R=0.67 F1=0.07
walking_paths            P=0.00 R=0.00 F1=0.00
built_structures         P=0.00 R=0.00 F1=0.00
parking_lots             P=0.00 R=0.00 F1=0.00
--- Shade / Score / Veg ---
Shade accuracy: 0.628
Score accuracy: 0.199
Veg   accuracy: 0.122
Score MAE (expected value): 1.297
Veg   MAE (expected value): 0.831
