# GeoMatchAI Model Analysis & Visualization

This notebook analyzes model performance and creates beautiful visualizations.

**Prerequisites:** Run `uv run examples/usertest_comprehensive.py` first; it generates the CSV result files that this notebook loads from `output/csv`.

## 0. Install Dependencies

Run this cell first to install required packages.

In [None]:
# Bootstrap cell: make sure every third-party package this notebook uses is importable.
import subprocess
import sys

packages = ["pandas", "numpy", "matplotlib", "seaborn", "adjustText"]

for package in packages:
    try:
        __import__(package)
    except ImportError:
        # Not importable: install it with the pip belonging to the running interpreter.
        print(f"Installing {package}...")
        pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
        subprocess.check_call(pip_command)
        print(f"✓ {package} installed")
    else:
        print(f"✓ {package} already installed")

print("\nAll dependencies ready!")

## 1. Setup & Imports

In [None]:
import os
import sys
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Style settings: seaborn dark-grid theme, "husl" palette, and default figure/font sizes
# applied to every figure created after this cell.
plt.style.use("seaborn-v0_8-darkgrid")
sns.set_palette("husl")
plt.rcParams.update(
    {
        "figure.figsize": (12, 8),
        "font.size": 10,
        "axes.titlesize": 14,
        "axes.labelsize": 12,
    }
)

In [None]:
# Paths
# PROJECT_ROOT is the parent of the current working directory; figures and the text
# report are written to PROJECT_ROOT / "img", which is created here if missing.
# NOTE(review): OUTPUT_DIR is relative to the working directory, not to PROJECT_ROOT —
# confirm this matches where the test script writes its CSV files.
PROJECT_ROOT = Path.cwd().parent
OUTPUT_DIR = Path("output/csv")
IMG_DIR = PROJECT_ROOT.joinpath("img")
IMG_DIR.mkdir(exist_ok=True)

print(
    f"Project root: {PROJECT_ROOT}",
    f"Output dir: {OUTPUT_DIR}",
    f"Image dir: {IMG_DIR}",
    sep="\n",
)

## 2. Load Test Results

In [None]:
# Load the four result tables from OUTPUT_DIR. If any file is missing, tell the user
# which script to run and re-raise so the notebook stops here.
try:
    df_summary = pd.read_csv(OUTPUT_DIR / "results_summary.csv")
    df_by_image = pd.read_csv(OUTPUT_DIR / "results_by_image.csv")
    df_by_model = pd.read_csv(OUTPUT_DIR / "results_by_model.csv")
    df_by_landmark = pd.read_csv(OUTPUT_DIR / "results_by_landmark.csv")
except FileNotFoundError as missing:
    print(
        f"❌ Error: {missing}",
        "\nPlease run the comprehensive test first:",
        "  uv run examples/usertest_comprehensive.py",
        sep="\n",
    )
    raise
else:
    print("✓ All CSV files loaded successfully!")

In [None]:
# Row count of each loaded table, with thousands separators.
print(
    "Dataset sizes:",
    f"  - Summary: {len(df_summary):,} rows",
    f"  - By Image: {len(df_by_image):,} rows",
    f"  - By Model: {len(df_by_model):,} rows",
    f"  - By Landmark: {len(df_by_landmark):,} rows",
    sep="\n",
)

In [None]:
# Preview the first five rows of the per-test summary table (rich display as cell output).
df_summary.head(n=5)

## 3. Overall Statistics

In [None]:
# Headline numbers over every row of the summary table.
# `overall_accuracy` is reused later when the text report is assembled.
total_tests = df_summary.shape[0]
total_correct = df_summary["is_correct"].sum()
overall_accuracy = (total_correct / total_tests) * 100

print("=" * 60, "OVERALL STATISTICS", "=" * 60, sep="\n")
print(
    f"Total tests: {total_tests:,}",
    f"Correct predictions: {total_correct:,}",
    f"Overall accuracy: {overall_accuracy:.2f}%",
    sep="\n",
)

In [None]:
# Number of distinct models, landmarks and images present in the summary table.
print(
    f"Unique models tested: {df_summary['model_name'].nunique()}",
    f"Unique landmarks: {df_summary['landmark_name'].nunique()}",
    f"Unique images: {df_summary['image_name'].nunique()}",
    sep="\n",
)

In [None]:
# Accuracy split by whether preprocessing was enabled for the test.
print("Accuracy by preprocessing:")
for prep in [True, False]:
    subset = df_summary[df_summary["preprocessing"] == prep]
    label = "WITH" if prep else "WITHOUT"
    if subset.empty:
        # No rows for this setting: report that instead of dividing 0 by 0.
        print(f"  {label} preprocessing: n/a (no tests)")
        continue
    acc = (subset["is_correct"].sum() / len(subset)) * 100
    print(f"  {label} preprocessing: {acc:.2f}%")

## 4. Model Accuracy Rankings

In [None]:
# Side-by-side horizontal bar charts of the 15 most accurate models:
# left panel with preprocessing, right panel without. Saved as a PNG in IMG_DIR.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

top_models_with = df_by_model[df_by_model["preprocessing"] == True].nlargest(15, "accuracy")
top_models_without = df_by_model[df_by_model["preprocessing"] == False].nlargest(15, "accuracy")

# (axes, data, extra bar styling, title) for each panel; the left panel keeps the
# default palette color, the right one is drawn in coral.
panels = [
    (ax1, top_models_with, {}, "Top 15 Models (WITH Preprocessing)"),
    (ax2, top_models_without, {"color": "coral"}, "Top 15 Models (WITHOUT Preprocessing)"),
]

for panel_ax, top_models, bar_style, panel_title in panels:
    accuracy_pct = top_models["accuracy"] * 100
    bar_rows = range(len(top_models))

    panel_ax.barh(bar_rows, accuracy_pct, **bar_style)
    panel_ax.set_yticks(bar_rows)
    panel_ax.set_yticklabels(top_models["model_name"], fontsize=9)
    panel_ax.set_xlabel("Accuracy (%)")
    panel_ax.set_title(panel_title, fontweight="bold")
    panel_ax.grid(axis="x", alpha=0.3)
    # Row 0 is the best model; flip the axis so it is drawn at the top.
    panel_ax.invert_yaxis()

    # Value label just to the right of each bar's end.
    for row_idx, pct in enumerate(accuracy_pct):
        panel_ax.text(pct + 1, row_idx, f"{pct:.1f}%", va="center", fontsize=8)

plt.tight_layout()
plt.savefig(IMG_DIR / "model_accuracy_rankings.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'model_accuracy_rankings.png'}")
plt.show()

## 5. Discrimination Gap Analysis

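The **discrimination gap** measures how well a model separates related images from unrelated ones. In the chart below, the Δ label on each row is the average related score minus the average unrelated score, so a larger gap means better separation.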

In [None]:
# Overlaid horizontal bars of average related vs. unrelated similarity (in percent)
# for the 20 preprocessed models with the largest discrimination gap.
fig, ax = plt.subplots(figsize=(14, 10))

df_disc = df_by_model[df_by_model["preprocessing"] == True].nlargest(20, "discrimination_gap")

y_pos = np.arange(len(df_disc))
related_scores = df_disc["avg_related_score"].values * 100
unrelated_scores = df_disc["avg_unrelated_score"].values * 100

# Both series share the same rows; the unrelated bars are drawn on top of the related ones.
for scores, legend_label, bar_color in (
    (related_scores, "Related Images", "green"),
    (unrelated_scores, "Unrelated Images", "red"),
):
    ax.barh(y_pos, scores, label=legend_label, alpha=0.8, color=bar_color)

ax.set_yticks(y_pos)
ax.set_yticklabels(df_disc["model_name"], fontsize=9)
ax.set_xlabel("Similarity Score (%)")
ax.set_title("Top 20 Models by Discrimination Gap (WITH Preprocessing)", fontweight="bold")
ax.legend(loc="lower right")
ax.grid(axis="x", alpha=0.3)
ax.invert_yaxis()

# Annotate each row with related minus unrelated, placed past the longer of the two bars.
for row_idx, (rel, unrel) in enumerate(zip(related_scores, unrelated_scores)):
    label_x = max(rel, unrel) + 2
    gap = rel - unrel
    ax.text(label_x, row_idx, f"Δ{gap:.1f}%", va="center", fontsize=8, fontweight="bold")

plt.tight_layout()
plt.savefig(IMG_DIR / "discrimination_gap.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'discrimination_gap.png'}")
plt.show()

## 6. Speed vs Accuracy Trade-off

In [None]:
# Scatter of inference time (seconds * 1000, shown as ms) against accuracy (%) for the
# preprocessed runs, colored by discrimination gap, with every point labeled by model name.
fig, ax = plt.subplots(figsize=(16, 10))

df_perf = df_by_model[df_by_model["preprocessing"] == True].copy()

scatter = ax.scatter(
    df_perf["avg_inference_time_s"] * 1000,
    df_perf["accuracy"] * 100,
    s=200,
    alpha=0.7,
    c=df_perf["discrimination_gap"],
    cmap="viridis",
    edgecolors="black",
    linewidth=0.5,
)

# adjustText is optional. The import is guarded so that a missing package triggers the
# fixed-offset fallback below instead of aborting the cell with ImportError.
try:
    from adjustText import adjust_text
except ImportError:
    adjust_text = None

# One text label per model, initially placed exactly on its point.
texts = [
    ax.text(
        row["avg_inference_time_s"] * 1000,
        row["accuracy"] * 100,
        row["model_name"],
        fontsize=7,
        alpha=0.9,
    )
    for _, row in df_perf.iterrows()
]

labels_adjusted = False
if adjust_text is not None:
    try:
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color="gray", alpha=0.5))
        labels_adjusted = True
    except Exception as exc:
        # Best effort: a label-layout failure must not abort the figure, but it should
        # be visible. (Exception, not bare except, so Ctrl-C still interrupts.)
        print(f"⚠ adjust_text failed ({exc}); using fixed label offsets")

if not labels_adjusted:
    # Fallback: nudge every label slightly up and to the right of its point.
    for text in texts:
        x, y = text.get_position()
        text.set_position((x + 2, y + 0.5))

ax.set_xlabel("Inference Time (ms)")
ax.set_ylabel("Accuracy (%)")
ax.set_title("Speed vs Accuracy Trade-off (WITH Preprocessing)", fontweight="bold")
ax.grid(True, alpha=0.3)

cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label("Discrimination Gap")

plt.tight_layout()
plt.savefig(IMG_DIR / "speed_vs_accuracy.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'speed_vs_accuracy.png'}")
plt.show()

## 7. Landmark Difficulty Analysis

In [None]:
# Two-panel landmark view for preprocessed, related images:
# per-landmark accuracy (left) and the spread of similarity scores per landmark (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Related landmarks accuracy, sorted ascending so the most accurate landmark is drawn at the top.
related_landmarks = df_by_landmark[
    (df_by_landmark["is_related"] == True) & (df_by_landmark["preprocessing"] == True)
].sort_values("accuracy", ascending=True)

if len(related_landmarks) > 0:
    ax1.barh(
        range(len(related_landmarks)), related_landmarks["accuracy"] * 100, color="forestgreen"
    )
    ax1.set_yticks(range(len(related_landmarks)))
    ax1.set_yticklabels(related_landmarks["landmark_name"])
    ax1.set_xlabel("Accuracy (%)")
    ax1.set_title("Landmark Recognition Accuracy\n(Related Images)", fontweight="bold")
    ax1.grid(axis="x", alpha=0.3)

    for i, v in enumerate(related_landmarks["accuracy"] * 100):
        ax1.text(v + 1, i, f"{v:.1f}%", va="center", fontsize=9)

# Score distribution by landmark: one list of similarity scores per landmark name.
landmark_scores = (
    df_summary[(df_summary["preprocessing"] == True) & (df_summary["is_related"] == True)]
    .groupby("landmark_name")["similarity_score"]
    .apply(list)
)

if len(landmark_scores) > 0:
    # A plain list of lists is passed so each landmark becomes one box.
    bp = ax2.boxplot(landmark_scores.tolist(), vert=False, patch_artist=True)
    # Fix the tick positions first (boxes sit at 1..N), then label them: matplotlib
    # requires tick labels to be set only after the positions are fixed.
    ax2.set_yticks(range(1, len(landmark_scores) + 1))
    ax2.set_yticklabels(landmark_scores.index)
    for patch in bp["boxes"]:
        patch.set_facecolor("lightblue")
        patch.set_alpha(0.7)
    ax2.set_xlabel("Similarity Score")
    ax2.set_title("Score Distribution by Landmark", fontweight="bold")
    ax2.grid(axis="x", alpha=0.3)

plt.tight_layout()
plt.savefig(IMG_DIR / "landmark_analysis.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'landmark_analysis.png'}")
plt.show()

## 8. Model Performance Heatmaps

In [None]:
# Names of the four most accurate models among the preprocessed runs
# (used to pick the panels of the heatmap figure).
top_4_models = (
    df_by_model.loc[df_by_model["preprocessing"] == True]
    .nlargest(4, "accuracy")["model_name"]
    .values
)
print(f"Top 4 models: {list(top_4_models)}")

In [None]:
# 2x2 grid of heatmaps, one per model in `top_4_models`: mean similarity score per
# landmark, split by whether the image was related. Saved as a PNG in IMG_DIR.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, model_name in enumerate(top_4_models):
    model_data = df_summary[
        (df_summary["model_name"] == model_name)
        & (df_summary["preprocessing"] == True)
        & (df_summary["landmark_name"] != "unrelated")  # Exclude 'unrelated' as a landmark
    ]

    # One row per landmark, one column per `is_related` value, cells hold the mean score.
    pivot = model_data.pivot_table(
        index="landmark_name", columns="is_related", values="similarity_score", aggfunc="mean"
    )

    # Rename whichever boolean columns are present; a mapping also covers the case
    # where only the unrelated column exists.
    pivot = pivot.rename(columns={False: "Unrelated", True: "Related"})

    # Shorten model name if too long
    short_name = model_name if len(model_name) < 30 else model_name[:27] + "..."
    axes[idx].set_title(f"{short_name}", fontsize=11, fontweight="bold")

    if pivot.empty:
        # Nothing to draw for this model: show a placeholder instead of failing in seaborn.
        axes[idx].text(
            0.5, 0.5, "No data", ha="center", va="center", transform=axes[idx].transAxes
        )
        axes[idx].set_xticks([])
        axes[idx].set_yticks([])
        continue

    # Same fixed color scale (0.3-0.9, centered on 0.65) on every panel.
    sns.heatmap(
        pivot,
        annot=True,
        fmt=".2f",
        cmap="RdYlGn",
        center=0.65,
        vmin=0.3,
        vmax=0.9,
        ax=axes[idx],
        cbar_kws={"label": "Score", "shrink": 0.8},
        annot_kws={"fontsize": 10, "fontweight": "bold"},
        linewidths=0.5,
        linecolor="white",
    )

    axes[idx].set_xlabel("")
    axes[idx].set_ylabel("")
    axes[idx].tick_params(axis="both", labelsize=9)

# Hide panels that have no model (fewer than four models available).
for unused_ax in axes[len(top_4_models):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig(IMG_DIR / "model_heatmaps.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'model_heatmaps.png'}")
plt.show()

## 9. Preprocessing Impact Analysis

In [None]:
# Calculate preprocessing delta for each model: accuracy with preprocessing minus
# accuracy without, in percentage points. Models lacking either variant are skipped.
preprocessing_impact = []

for model_name in df_by_model["model_name"].unique():
    model_rows = df_by_model[df_by_model["model_name"] == model_name]
    with_prep = model_rows[model_rows["preprocessing"] == True]["accuracy"].values
    without_prep = model_rows[model_rows["preprocessing"] == False]["accuracy"].values

    if len(with_prep) > 0 and len(without_prep) > 0:
        # If a model has several rows per variant, the first one is used.
        delta = (with_prep[0] - without_prep[0]) * 100
        preprocessing_impact.append(
            {
                "model": model_name,
                "delta": delta,
                "with": with_prep[0] * 100,
                "without": without_prep[0] * 100,
            }
        )

# Explicit columns keep the frame well-formed (and sortable) even when no model
# has both variants; otherwise sort_values("delta") would raise KeyError.
df_impact = pd.DataFrame(
    preprocessing_impact, columns=["model", "delta", "with", "without"]
).sort_values("delta", ascending=False)
df_impact.head(10)

In [None]:
# Two panels: per-model accuracy change from preprocessing (first 15 rows of df_impact)
# and the accuracy histograms of all models with vs. without preprocessing.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

# Impact by model
top_15 = df_impact.head(15)
bar_rows = range(len(top_15))
# Strictly positive deltas are green; zero or negative are red.
colors = ["green" if delta > 0 else "red" for delta in top_15["delta"]]
ax1.barh(bar_rows, top_15["delta"], color=colors, alpha=0.7)
ax1.set_yticks(bar_rows)
ax1.set_yticklabels(top_15["model"], fontsize=9)
ax1.set_xlabel("Accuracy Change (%)")
ax1.set_title("Preprocessing Impact (Positive = Better WITH)", fontweight="bold")
ax1.axvline(x=0, color="black", linestyle="--", linewidth=1)
ax1.grid(axis="x", alpha=0.3)
ax1.invert_yaxis()

# Signed value label just outside each bar, on the side the bar points to.
for row_idx, delta in enumerate(top_15["delta"]):
    if delta > 0:
        label_x, align = delta + 0.2, "left"
    else:
        label_x, align = delta - 0.2, "right"
    ax1.text(label_x, row_idx, f"{delta:+.1f}%", va="center", ha=align, fontsize=8)

# Distribution comparison
is_preprocessed = df_by_model["preprocessing"] == True
is_raw = df_by_model["preprocessing"] == False
with_prep_acc = df_by_model[is_preprocessed]["accuracy"] * 100
without_prep_acc = df_by_model[is_raw]["accuracy"] * 100

ax2.hist(
    [with_prep_acc, without_prep_acc],
    bins=20,
    label=["WITH Preprocessing", "WITHOUT Preprocessing"],
    alpha=0.7,
    color=["green", "red"],
)
ax2.set_xlabel("Accuracy (%)")
ax2.set_ylabel("Number of Models")
ax2.set_title("Accuracy Distribution", fontweight="bold")
ax2.legend()
ax2.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.savefig(IMG_DIR / "preprocessing_impact.png", dpi=300, bbox_inches="tight")
print(f"✓ Saved: {IMG_DIR / 'preprocessing_impact.png'}")
plt.show()

## 10. Export Text Report

In [None]:
# Start the text report: title banner with a timestamp, then the overall statistics.
# `lines` is extended by the following cells and joined into the final report.
lines = [
    "=" * 80,
    "GeoMatchAI Model Analysis Report",
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 80,
    "",
    "OVERALL STATISTICS",
    "-" * 80,
    f"Total tests: {len(df_summary):,}",
    f"Overall accuracy: {overall_accuracy:.2f}%",
    f"Models tested: {df_summary['model_name'].nunique()}",
    f"Landmarks tested: {df_summary['landmark_name'].nunique()}",
    "",
]

In [None]:
# Report section: the ten most accurate models with preprocessing, one numbered line each.
top_10_w = df_by_model[df_by_model["preprocessing"] == True].nlargest(10, "accuracy")
lines.extend(
    [
        "TOP 10 MODELS (WITH PREPROCESSING)",
        "-" * 80,
        *(
            f"{i:2d}. {row['model_name']:<35} Acc: {row['accuracy'] * 100:5.2f}%  Gap: {row['discrimination_gap']:.3f}"
            for i, (_, row) in enumerate(top_10_w.iterrows(), 1)
        ),
        "",
    ]
)

In [None]:
# Report section: the ten most accurate models without preprocessing, one numbered line each.
top_10_wo = df_by_model[df_by_model["preprocessing"] == False].nlargest(10, "accuracy")
lines.extend(
    [
        "TOP 10 MODELS (WITHOUT PREPROCESSING)",
        "-" * 80,
        *(
            f"{i:2d}. {row['model_name']:<35} Acc: {row['accuracy'] * 100:5.2f}%  Gap: {row['discrimination_gap']:.3f}"
            for i, (_, row) in enumerate(top_10_wo.iterrows(), 1)
        ),
        "",
    ]
)

In [None]:
# Final report section: list the expected figures, then write the report to
# IMG_DIR / "analysis_report.txt" and echo it in the cell output.
lines.append("IMAGES GENERATED")
lines.append("-" * 80)
for img in [
    "model_accuracy_rankings.png",
    "discrimination_gap.png",
    "speed_vs_accuracy.png",
    "landmark_analysis.png",
    "model_heatmaps.png",
    "preprocessing_impact.png",
    "architecture_comparison.png",
    "summary_dashboard.png",
]:
    # Only mark a figure as generated if the file actually exists in IMG_DIR.
    if (IMG_DIR / img).exists():
        lines.append(f"  ✓ {img}")
    else:
        lines.append(f"  ✗ {img} (not found)")
lines.append("")
lines.append("=" * 80)

report = "\n".join(lines)
# Explicit UTF-8: the report contains non-ASCII marks that the platform default
# encoding (e.g. cp1252 on Windows) cannot always encode.
(IMG_DIR / "analysis_report.txt").write_text(report, encoding="utf-8")
print(report)
print(f"\n✓ Saved: {IMG_DIR / 'analysis_report.txt'}")