In [1]:
# quick sanity-check in a notebook or REPL
import numpy as np
# Load the training images and report the smallest and largest pixel value.
Xt = np.load("Xtrain.npy")
pixel_lo, pixel_hi = Xt.min(), Xt.max()
print(pixel_lo, pixel_hi)     # expected output: 0.0 1.0

0.0 1.0


In [2]:
import os, numpy as np, tensorflow as tf

In [3]:
MODEL_PATH        = "best_mlp.h5"
RAW_DIR_TRAIN     = os.path.abspath(os.path.join("..", "..", "..",
                                                 "archive", "seg_train", "seg_train"))
# adjust if your .npy files live elsewhere
Xtest_path, ytest_path = "Xtest.npy", "ytest.npy"
# Number of leading test samples to probe; kept in one place so the message
# and both slices below can never disagree.
N_SAMPLES         = 500

# 1 ── LOAD DATA & MODEL ───────────────────────────────────────────────────
print(f"Loading model and a *small* sample ({N_SAMPLES} imgs)…")
model   = tf.keras.models.load_model(MODEL_PATH)

# Memory-map the files so only the first N_SAMPLES rows are actually read;
# np.array(...) then copies that slice into an ordinary in-memory array.
# (A plain np.load(path)[:N] reads the whole file before slicing.)
Xtest   = np.array(np.load(Xtest_path, mmap_mode="r")[:N_SAMPLES])
ytest   = np.array(np.load(ytest_path, mmap_mode="r")[:N_SAMPLES])

# Images and labels are compared element-wise later, so they must line up.
if len(Xtest) != len(ytest):
    raise ValueError(
        f"Xtest has {len(Xtest)} rows but ytest has {len(ytest)}; "
        "the two files do not describe the same samples"
    )

# NOTE(review): this takes the *first* rows of the test set. If the arrays
# were saved in class order the sample may cover only a few classes –
# confirm the test set was shuffled before trusting accuracy figures.

# Sanity: make sure 0-1 scaling is consistent
print("pixel range  :", Xtest.min(), "→", Xtest.max())  # should be 0.0 … 1.0

Loading model and a *small* sample (500 imgs)…


2025-06-07 18:15:24.190122: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2025-06-07 18:15:24.190176: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-06-07 18:15:24.190190: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-06-07 18:15:24.190224: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-06-07 18:15:24.190243: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


pixel range  : 0.0 → 1.0


In [4]:
# 2 ── PROBE SOFT-MAX OUTPUTS (over-confidence test) ──────────────────────
# Run the model on the sampled test images and keep the per-class outputs.
probs  = model.predict(Xtest, verbose=0)

# Show the first output row and its sum.
first_row = probs[0]
print("\nSample probability vector (rounded to 6 dp):")
print(np.round(first_row, 6), "sum =", first_row.sum())

# Share of samples whose largest per-class output exceeds 0.999.
top_prob = probs.max(axis=1)
is_high_conf = top_prob > 0.999
high_conf_pct = is_high_conf.mean() * 100
print(f"{high_conf_pct:.1f}% of samples have soft-max max-prob > 0.999")

# A large percentage here means the model itself is over-confident.
# (The class-order check follows in the next cells.)
# -------------------------------------------------------------------------

2025-06-07 18:15:56.269094: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.



Sample probability vector (rounded to 6 dp):
[0.006854 0.       0.       0.985302 0.005243 0.002601] sum = 0.99999994
81.4% of samples have soft-max max-prob > 0.999


In [5]:
# 3 ── RECREATE THE CLASS LIST EXACTLY LIKE FLASK DOES ────────────────────
# Class names are the sub-directory names of RAW_DIR_TRAIN, sorted
# alphabetically; plain files in that folder are ignored.
with os.scandir(RAW_DIR_TRAIN) as dir_entries:
    subdir_names = [entry.name for entry in dir_entries if entry.is_dir()]
classes_flask = sorted(subdir_names)
print("\nFlask list:", classes_flask)


Flask list: ['buildings', 'forest', 'glacier', 'mountain', 'sea', 'street']


In [6]:
# 4 ── LOAD PREPROCESS() LIST, IF AVAILABLE ───────────────────────────────
# classes_saved is the class list stored in classes.npy, or None when that
# file does not exist (later cells branch on None).
if os.path.exists("classes.npy"):
    # .tolist() converts NumPy scalars to native Python objects, so the list
    # prints cleanly (no np.str_(...) wrappers on NumPy 2) and compares as
    # ordinary strings against the directory-derived list.
    classes_saved = np.load("classes.npy").tolist()
    print("Saved list:", classes_saved)
else:
    classes_saved = None
    print("WARNING: classes.npy not found – you may already have a mismatch!")



In [8]:
# 5 ── COMPARE THE TWO LISTS ───────────────────────────────────────────────
# Reports whether the saved class list and the Flask-style list are equal,
# and if not, which positions differ. Skipped when no saved list exists.
if classes_saved is not None:
    same_order = classes_saved == classes_flask
    print("\nClass lists identical? –>", same_order)
    if not same_order:
        n_saved, n_flask = len(classes_saved), len(classes_flask)
        n_common = min(n_saved, n_flask)
        # Positions present in both lists whose names differ.
        diffs = [i for i in range(n_common)
                 if classes_saved[i] != classes_flask[i]]
        # Positions that exist in only one list are mismatches too; a plain
        # zip() would drop them and could report an empty diff even though
        # the lists are not equal.
        diffs.extend(range(n_common, max(n_saved, n_flask)))
        if n_saved != n_flask:
            print(f"Length mismatch: saved list has {n_saved} classes, "
                  f"Flask list has {n_flask}")
        print("First few mismatching indices:", diffs[:5])

In [9]:
# 6 ── EVALUATE ACCURACY UNDER *BOTH* ORDERS ───────────────────────────────
# Predicted class index per sample = position of the largest output.
y_pred = np.argmax(probs, axis=1)

# Accuracy if we decode exactly like preprocess() (ground truth).
# ytest is compared directly against the predicted indices; this assumes
# ytest already holds integer labels in the saved order – TODO confirm.
# (No branch on classes_saved is needed: the value is the same either way.)
acc_saved = (y_pred == ytest).mean()

# Accuracy if we (wrongly) decode with Flask order.
# The true labels are *remapped* to integers 0..(k-1) in Flask order by
# going index → class name (saved list) → index (Flask list).
flask_index = {cls: i for i, cls in enumerate(classes_flask)}
if classes_saved is None:
    # Without a saved list there is nothing to remap against.
    acc_flask = acc_saved
else:
    true_names = np.array(classes_saved)[ytest].tolist()
    # Fail loudly, naming the offending classes, rather than letting a
    # dict.get() miss inject None into the label array.
    unknown = sorted(set(true_names) - set(flask_index))
    if unknown:
        raise KeyError(f"classes in saved list but not in Flask list: {unknown}")
    ytest_flask = np.array([flask_index[name] for name in true_names], dtype=int)
    acc_flask = (y_pred == ytest_flask).mean()

print(f"\nAccuracy vs true labels (saved order):  {acc_saved*100:.2f}%")
print(f"Accuracy if Flask order is assumed:      {acc_flask*100:.2f}%")


Accuracy vs true labels (saved order):  44.20%
Accuracy if Flask order is assumed:      44.20%
