# Phase 1: Distance Threshold Analysis

Analyzes experiment results to determine an optimal cosine distance cutoff for filtering
retrieved memories. The goal is to increase **precision** (reduce false positives) while
maintaining acceptable **recall** (not losing ground truth memories).

All metrics are **macro-averaged**: computed per experiment, then averaged across experiments.
This gives each test case equal weight. Within each experiment, results are deduplicated by
memory ID using the *best* (minimum) distance across all queries.

**Approach:**
1. Collect best-distance data per unique memory per experiment
2. Visualize distance distributions (ground truth vs non-ground-truth)
3. Sweep thresholds and compute precision/recall/F1 (macro-averaged)
4. Identify the optimal cutoff and its tradeoffs

In [None]:
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

from memory_retrieval.infra.runs import get_latest_run, get_run, list_runs

# Locate the project root: the closest directory, starting at the current
# working directory and moving upward, that contains a pyproject.toml.
start_dir = Path.cwd()
for PROJECT_ROOT in (start_dir, *start_dir.parents):
    if (PROJECT_ROOT / "pyproject.toml").exists():
        break
else:
    # Reached the filesystem root without finding the marker file.
    raise RuntimeError("Could not find project root (pyproject.toml)")
# Later cells use paths relative to the project root (e.g. the figures dir).
os.chdir(PROJECT_ROOT)

# Run selection: the latest "phase1" run is used by default.
# To see available runs: print(list_runs("phase1"))
# To select specific run: RUN_DIR = get_run("phase1", "run_20260208_143022")
# NOTE(review): get_latest_run is a project helper; it is assumed to return a
# Path-like run directory (it is joined with "/" and `.name` is read below).
RUN_DIR = get_latest_run("phase1")
RESULTS_DIR = RUN_DIR / "results"

# Experiment result files, in sorted (deterministic) order.
result_files = sorted(RESULTS_DIR.glob("results_*.json"))

print(f"Project root: {PROJECT_ROOT}")
print(f"Using run: {RUN_DIR.name}")
print(f"Found {len(result_files)} result files:")
for result_path in result_files:
    print(f"  {result_path.name}")

In [None]:
# Build per-experiment deduped data: best (minimum) distance for each memory across all queries

# Load every result file. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's default locale encoding.
experiments = []
for result_file in result_files:
    with open(result_file, encoding="utf-8") as fh:
        experiments.append(json.load(fh))

# Precompute per-experiment: deduplicated best distances + GT IDs (reused by all cells below).
# Each entry has:
#   "test_case_id"     - the experiment's "test_case_id" ("unknown" if absent)
#   "ground_truth_ids" - set of IDs from experiment["ground_truth"]["memory_ids"]
#   "best_distances"   - {memory_id: minimum distance seen in any query's results}
experiments_deduped = []
for experiment in experiments:
    test_case_id = experiment.get("test_case_id", "unknown")
    ground_truth_ids = set(experiment.get("ground_truth", {}).get("memory_ids", []))
    best_distances = {}
    for query_result in experiment.get("queries", []):
        for result in query_result.get("results", []):
            memory_id = result["id"]
            distance = result["distance"]
            # Keep only the smallest distance observed for this memory.
            if memory_id not in best_distances or distance < best_distances[memory_id]:
                best_distances[memory_id] = distance
    experiments_deduped.append({
        "test_case_id": test_case_id,
        "ground_truth_ids": ground_truth_ids,
        "best_distances": best_distances,
    })

# Collect per-observation distances (one entry per experiment×memory pair) — used for histograms.
# NOTE(review): the comments and printed text below state that GT sets are
# disjoint across experiments; that is a property of the dataset and is not
# verified by this code.
ground_truth_observations = []      # distances of retrieved memories that are GT for their experiment
non_ground_truth_observations = []  # distances of retrieved memories that are not GT for their experiment
ground_truth_memory_details = []    # (memory_id, best_distance, test_case_id)

for experiment_data in experiments_deduped:
    for memory_id, distance in experiment_data["best_distances"].items():
        if memory_id in experiment_data["ground_truth_ids"]:
            ground_truth_observations.append(distance)
            ground_truth_memory_details.append((memory_id, distance, experiment_data["test_case_id"]))
        else:
            non_ground_truth_observations.append(distance)

ground_truth_obs = np.array(ground_truth_observations)
non_ground_truth_obs = np.array(non_ground_truth_observations)

# Totals: GT assignments summed over experiments, and distinct retrieved memory IDs.
total_ground_truth_memories = sum(len(experiment_data["ground_truth_ids"]) for experiment_data in experiments_deduped)
all_unique_ids = set()
for experiment_data in experiments_deduped:
    all_unique_ids.update(experiment_data["best_distances"].keys())
num_unique_memories = len(all_unique_ids)


def _range_text(observations):
    """Format the min/max of an observation array; tolerate an empty array."""
    if observations.size == 0:
        return "n/a (no observations)"
    return f"[{observations.min():.4f}, {observations.max():.4f}]"


def _center_text(observations):
    """Format the mean/median of an observation array; tolerate an empty array."""
    if observations.size == 0:
        return "mean: n/a, median: n/a"
    return f"mean: {observations.mean():.4f}, median: {np.median(observations):.4f}"


print(f"Database: {num_unique_memories} unique memories")
print(f"Experiments: {len(experiments_deduped)} test cases, GT sets are disjoint (each memory GT in exactly 1)")
print(f"  Total GT assignments: {total_ground_truth_memories} (= {num_unique_memories} unique memories)")
print()
print(f"Observations (experiment × memory pairs, used for histograms):")
print(f"  GT observations:     {len(ground_truth_obs)} (unique — each memory is GT once)")
print(f"  Non-GT observations: {len(non_ground_truth_obs)} (~{num_unique_memories} memories × ~{len(experiments_deduped)-1} experiments each)")
print()
# The helpers keep these lines from raising on zero-size arrays (e.g. when no
# result files were found or no GT memory was retrieved).
print(f"GT distance range:     {_range_text(ground_truth_obs)}")
print(f"Non-GT distance range: {_range_text(non_ground_truth_obs)}")
print(f"GT {_center_text(ground_truth_obs)}")
print(f"Non-GT {_center_text(non_ground_truth_obs)}")

print(f"\nPer GT memory details (sorted by distance):")
for memory_id, distance, test_case_id in sorted(ground_truth_memory_details, key=lambda x: x[1]):
    print(f"  {distance:.4f}  {memory_id}  ({test_case_id})")

In [None]:
# Density histogram and threshold sweep (macro-averaged with MRR)
# Exported as separate figures for blog post

FIGURES_DIR = Path("notebooks/phase1/figures")
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Shared bin edges so the GT and non-GT histograms are directly comparable.
bins = np.arange(0.1, 1.15, 0.025)

# --- Figure 1: Normalized density histogram ---
ground_truth_median = np.median(ground_truth_obs)
non_ground_truth_median = np.median(non_ground_truth_obs)

fig1, ax1 = plt.subplots(figsize=(10, 5))
# density=True normalizes each histogram separately, so the two groups can be
# compared even though their observation counts differ.
ax1.hist(ground_truth_obs, bins=bins, alpha=0.6, density=True,
         label="GT", color="#2ecc71", edgecolor="white", linewidth=0.5)
ax1.hist(non_ground_truth_obs, bins=bins, alpha=0.6, density=True,
         label="Non-GT", color="#e74c3c", edgecolor="white", linewidth=0.5)
ax1.axvline(ground_truth_median, color="#27ae60", linestyle="--", linewidth=1.5,
            label=f"GT median: {ground_truth_median:.3f}")
ax1.axvline(non_ground_truth_median, color="#c0392b", linestyle="--", linewidth=1.5,
            label=f"Non-GT median: {non_ground_truth_median:.3f}")
ax1.set_xlabel("Best Cosine Distance")
ax1.set_ylabel("Density")
ax1.set_title("Distance Distribution (normalized)")
ax1.legend(fontsize=8)
fig1.tight_layout()
fig1.savefig(FIGURES_DIR / "distance_distribution_histogram.png", dpi=200, bbox_inches="tight")
plt.show()
print(f"Saved: {FIGURES_DIR / 'distance_distribution_histogram.png'}")

# --- Compute threshold sweep data ---
# Thresholds 0.10, 0.11, ..., 1.14. They are built from an integer range and
# divided by 100 so every value is exactly the float of its two-decimal
# literal. A float-step np.arange accumulates rounding error, which makes the
# `distance <= threshold` test (and the array length) rounding-sensitive and
# lets the sweep disagree with thresholds written as plain literals.
experiment_thresholds = np.arange(10, 115) / 100
experiment_precisions = []
experiment_recalls = []
experiment_f1_scores = []
experiment_mrr_scores = []

for threshold in experiment_thresholds:
    precisions, recalls, f1_scores, reciprocal_ranks = [], [], [], []
    for experiment_data in experiments_deduped:
        experiment_gt_ids = experiment_data["ground_truth_ids"]

        # Filter once: accepted memories, closest first. The accepted ID set
        # and the MRR ranking are both derived from this list.
        accepted_sorted = sorted(
            [(memory_id, distance) for memory_id, distance in experiment_data["best_distances"].items() if distance <= threshold],
            key=lambda x: x[1]
        )
        accepted = {memory_id for memory_id, _ in accepted_sorted}
        ground_truth_accepted = len(accepted & experiment_gt_ids)
        num_accepted = len(accepted)
        ground_truth_count = len(experiment_gt_ids)

        # Undefined ratios (nothing accepted / no GT / P+R == 0) count as 0.
        precision = ground_truth_accepted / num_accepted if num_accepted > 0 else 0.0
        recall = ground_truth_accepted / ground_truth_count if ground_truth_count > 0 else 0.0
        f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1_score)

        # Reciprocal rank of the first GT memory among the accepted results;
        # stays 0 when no GT memory is accepted.
        reciprocal_rank = 0.0
        for rank, (memory_id, _) in enumerate(accepted_sorted, 1):
            if memory_id in experiment_gt_ids:
                reciprocal_rank = 1.0 / rank
                break
        reciprocal_ranks.append(reciprocal_rank)

    # Macro-average: every experiment contributes equally at this threshold.
    experiment_precisions.append(np.mean(precisions))
    experiment_recalls.append(np.mean(recalls))
    experiment_f1_scores.append(np.mean(f1_scores))
    experiment_mrr_scores.append(np.mean(reciprocal_ranks))

experiment_precisions = np.array(experiment_precisions)
experiment_recalls = np.array(experiment_recalls)
experiment_f1_scores = np.array(experiment_f1_scores)
experiment_mrr_scores = np.array(experiment_mrr_scores)

# np.argmax returns the first maximum, i.e. the lowest threshold on ties.
experiment_best_f1_index = np.argmax(experiment_f1_scores)
experiment_best_threshold = experiment_thresholds[experiment_best_f1_index]

# --- Figure 2: P/R/F1/MRR vs threshold ---
fig2, ax2 = plt.subplots(figsize=(10, 5))
ax2.plot(experiment_thresholds, experiment_precisions, label="Precision", color="#3498db", linewidth=2)
ax2.plot(experiment_thresholds, experiment_recalls, label="Recall", color="#2ecc71", linewidth=2)
ax2.plot(experiment_thresholds, experiment_f1_scores, label="F1", color="#9b59b6", linewidth=2)
ax2.plot(experiment_thresholds, experiment_mrr_scores, label="MRR", color="#e67e22", linewidth=2)
ax2.axvline(experiment_best_threshold, color="#e74c3c", linestyle="--", linewidth=1.5,
            label=f"Best F1 @ {experiment_best_threshold:.2f}")
ax2.set_xlabel("Distance Threshold")
ax2.set_ylabel("Score")
ax2.set_title("P/R/F1/MRR vs Threshold")
ax2.legend(fontsize=8)
ax2.set_ylim(0, 1.05)
ax2.grid(True, alpha=0.3)
fig2.tight_layout()
fig2.savefig(FIGURES_DIR / "threshold_sweep_metrics.png", dpi=200, bbox_inches="tight")
plt.show()
print(f"Saved: {FIGURES_DIR / 'threshold_sweep_metrics.png'}")

print(f"\nOptimal F1 threshold: {experiment_best_threshold:.2f}")
print(f"  F1:        {experiment_f1_scores[experiment_best_f1_index]:.3f}")
print(f"  Precision: {experiment_precisions[experiment_best_f1_index]:.3f}")
print(f"  Recall:    {experiment_recalls[experiment_best_f1_index]:.3f}")
print(f"  MRR:       {experiment_mrr_scores[experiment_best_f1_index]:.3f}")
print(f"  (all metrics macro-averaged: per-experiment, then mean)")

In [None]:
# Threshold table (macro-averaged per experiment, with MRR)
# One row per threshold: the per-experiment P/R/F1/MRR are averaged, and the
# average number of accepted / GT-kept / GT-lost memories is reported.
print(f"Threshold table (macro-averaged: per-experiment P/R/F1/MRR, then mean):")
print(f"Database has {num_unique_memories} unique memories, {len(experiments_deduped)} test cases")
print()
print(f"{'Threshold':>10} {'Precision':>10} {'Recall':>8} {'F1':>8} {'MRR':>8} {'Avg Accepted':>13} {'Avg GT Kept':>12} {'Avg GT Lost':>12}")
print("-" * 95)

# The average GT-set size does not depend on the threshold.
avg_ground_truth_count = np.mean([len(experiment_data["ground_truth_ids"]) for experiment_data in experiments_deduped])

for threshold in (0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00, 1.05):
    precisions = []
    recalls = []
    f1_scores = []
    reciprocal_ranks = []
    num_accepted_list = []
    ground_truth_kept_list = []

    for experiment_data in experiments_deduped:
        wanted_ids = experiment_data["ground_truth_ids"]

        # Accepted memories at this threshold, closest first.
        ranked = sorted(
            ((memory_id, distance)
             for memory_id, distance in experiment_data["best_distances"].items()
             if distance <= threshold),
            key=lambda pair: pair[1],
        )
        accepted_ids = {memory_id for memory_id, _ in ranked}
        hits = len(accepted_ids & wanted_ids)

        # Undefined ratios are reported as 0.0.
        precision = hits / len(accepted_ids) if accepted_ids else 0.0
        recall = hits / len(wanted_ids) if wanted_ids else 0.0
        if (precision + recall) > 0:
            f1_score = 2 * precision * recall / (precision + recall)
        else:
            f1_score = 0.0

        # Reciprocal rank of the first GT memory in the accepted ranking
        # (0.0 when none is accepted).
        reciprocal_rank = 0.0
        for rank, (memory_id, _) in enumerate(ranked, 1):
            if memory_id in wanted_ids:
                reciprocal_rank = 1.0 / rank
                break

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1_score)
        reciprocal_ranks.append(reciprocal_rank)
        num_accepted_list.append(len(accepted_ids))
        ground_truth_kept_list.append(hits)

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_f1_score = np.mean(f1_scores)
    avg_mrr = np.mean(reciprocal_ranks)
    avg_num_accepted = np.mean(num_accepted_list)
    avg_ground_truth_kept = np.mean(ground_truth_kept_list)
    avg_ground_truth_lost = avg_ground_truth_count - avg_ground_truth_kept

    # Flag the row that lies within 0.015 of the best-F1 threshold, if any.
    marker = " <--" if abs(threshold - experiment_best_threshold) < 0.015 else ""
    print(f"{threshold:>10.2f} {avg_precision:>10.3f} {avg_recall:>8.3f} {avg_f1_score:>8.3f} {avg_mrr:>8.3f} {avg_num_accepted:>13.1f} {avg_ground_truth_kept:>12.1f} {avg_ground_truth_lost:>12.1f}{marker}")

In [None]:
# Per-experiment impact of applying the best threshold (with MRR)
# Prints one row per experiment, lists every GT memory the threshold drops,
# and ends with an AVERAGE row (recall, MRR and accepted count only).
print(f"Impact of threshold={experiment_best_threshold:.2f} per experiment:\n")
print(f"{'Test Case':<25} {'Recall':>7} {'Prec':>7} {'F1':>7} {'MRR':>7} {'GT':>4} {'Kept':>5} {'Lost':>5} {'Accepted':>9}")
print("-" * 85)

# Per-experiment values, collected so the AVERAGE row reuses exactly what the
# rows above it show instead of re-filtering every experiment.
per_experiment_recalls = []
per_experiment_mrrs = []
per_experiment_num_accepted = []
for experiment_data in experiments_deduped:
    # Apply threshold and sort by distance
    accepted_sorted = sorted(
        [(memory_id, distance) for memory_id, distance in experiment_data["best_distances"].items() if distance <= experiment_best_threshold],
        key=lambda x: x[1]
    )
    accepted_ids = {memory_id for memory_id, _ in accepted_sorted}
    ground_truth_accepted = accepted_ids & experiment_data["ground_truth_ids"]
    ground_truth_lost = experiment_data["ground_truth_ids"] - accepted_ids

    # Undefined ratios (empty GT set / nothing accepted / P+R == 0) count as 0.
    recall = len(ground_truth_accepted) / len(experiment_data["ground_truth_ids"]) if experiment_data["ground_truth_ids"] else 0.0
    precision = len(ground_truth_accepted) / len(accepted_ids) if accepted_ids else 0.0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    # MRR: reciprocal rank of first GT hit in accepted results (sorted by distance)
    reciprocal_rank = 0.0
    for rank, (memory_id, _) in enumerate(accepted_sorted, 1):
        if memory_id in experiment_data["ground_truth_ids"]:
            reciprocal_rank = 1.0 / rank
            break

    per_experiment_recalls.append(recall)
    per_experiment_mrrs.append(reciprocal_rank)
    per_experiment_num_accepted.append(len(accepted_ids))

    print(f"{experiment_data['test_case_id']:<25} {recall:>7.1%} {precision:>7.1%} {f1_score:>7.3f} {reciprocal_rank:>7.3f} {len(experiment_data['ground_truth_ids']):>4} {len(ground_truth_accepted):>5} {len(ground_truth_lost):>5} {len(accepted_ids):>9}")

    # A lost GT memory was either retrieved above the threshold (its best
    # distance is shown) or never retrieved by any query.
    if ground_truth_lost:
        for memory_id in sorted(ground_truth_lost):
            distance = experiment_data["best_distances"].get(memory_id)
            distance_str = f"{distance:.4f}" if distance is not None else "NOT RETRIEVED"
            print(f"  {'':25} Lost: {memory_id} (best d={distance_str})")

print("-" * 85)
# Using the guarded per-row recall here means an experiment with an empty GT
# set contributes 0 to the average instead of raising ZeroDivisionError.
avg_num_accepted = np.mean(per_experiment_num_accepted)
print(f"{'AVERAGE':<25} {np.mean(per_experiment_recalls):>7.1%} {'':>7} {'':>7} {np.mean(per_experiment_mrrs):>7.3f} {'':>4} {'':>5} {'':>5} {avg_num_accepted:>9.1f}")

In [None]:
# Scatter plot: deduplicated — one point per unique memory per experiment (best distance)
from matplotlib.lines import Line2D

GROUND_TRUTH_COLOR = "#2ecc71"
NON_GROUND_TRUTH_COLOR = "#e74c3c"
THRESHOLD_COLOR = "#3498db"

fig, ax = plt.subplots(figsize=(14, 6))

# Tick labels: test case IDs without the "tc_" text, cut to 30 characters.
experiment_names = [experiment_data["test_case_id"].replace("tc_", "")[:30] for experiment_data in experiments_deduped]
x_positions = []
colors = []
distances_to_plot = []

for experiment_index, experiment_data in enumerate(experiments_deduped):
    for memory_id, distance in experiment_data["best_distances"].items():
        # Random horizontal jitter keeps points of one experiment from overlapping.
        x_positions.append(experiment_index + np.random.uniform(-0.3, 0.3))
        distances_to_plot.append(distance)
        colors.append(GROUND_TRUTH_COLOR if memory_id in experiment_data["ground_truth_ids"] else NON_GROUND_TRUTH_COLOR)

ax.scatter(x_positions, distances_to_plot, c=colors, alpha=0.5, s=25, edgecolors="white", linewidth=0.3)
ax.axhline(experiment_best_threshold, color=THRESHOLD_COLOR, linestyle="--", linewidth=2,
           label=f"Optimal threshold: {experiment_best_threshold:.2f}")

# The scatter is a single artist with per-point colors, so the legend entries
# are built by hand.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=GROUND_TRUTH_COLOR, markersize=8, label='Ground Truth'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor=NON_GROUND_TRUTH_COLOR, markersize=8, label='Non-GT'),
    Line2D([0], [0], color=THRESHOLD_COLOR, linestyle='--', linewidth=2, label=f'Threshold: {experiment_best_threshold:.2f}'),
]
ax.legend(handles=legend_elements, loc="upper left")

ax.set_xticks(range(len(experiment_names)))
ax.set_xticklabels(experiment_names, rotation=30, ha="right", fontsize=8)
ax.set_ylabel("Cosine Distance")
ax.set_title("Unique Memories by Experiment — best distance (below threshold = accepted)")
ax.grid(True, alpha=0.3, axis="y")

plt.tight_layout()
plt.show()

# Print counts
# Raw rows = every entry of every query's "results" list, before deduplication.
total_raw = sum(
    len(query_result.get("results", []))
    for experiment in experiments
    for query_result in experiment.get("queries", [])
)
num_ground_truth_plotted = colors.count(GROUND_TRUTH_COLOR)
num_non_ground_truth_plotted = colors.count(NON_GROUND_TRUTH_COLOR)
num_experiments = len(experiments_deduped)


def _per_experiment_average(total):
    """Return total / number of experiments as a true average (0.0 if there are none)."""
    return total / num_experiments if num_experiments else 0.0


# True division: floor division would truncate these averages
# (e.g. 2.7 GT memories per experiment would be reported as 2).
print(f"\nRaw result rows (before dedup): {total_raw}")
print(f"Unique memories plotted:        {len(distances_to_plot)} ({_per_experiment_average(len(distances_to_plot)):.1f} avg/experiment)")
print(f"  GT:     {num_ground_truth_plotted} ({_per_experiment_average(num_ground_truth_plotted):.1f} avg/experiment)")
print(f"  Non-GT: {num_non_ground_truth_plotted} ({_per_experiment_average(num_non_ground_truth_plotted):.1f} avg/experiment)")

In [None]:
# Analyze the current confidence buckets from db.py vs data-driven buckets
# Uses per-observation data (GT observations are unique; non-GT repeated across experiments)
# Each bucket is (name, inclusive lower bound, exclusive upper bound).
current_buckets = [
    ("high",     0.0,  0.5),
    ("medium",   0.5,  0.8),
    ("low",      0.8,  1.2),
    ("very_low", 1.2,  2.0),
]

print(f"Current confidence buckets (db.py) vs actual data:")
print(f"NOTE: GT observations are unique (n={len(ground_truth_obs)}), non-GT are repeated across experiments (n={len(non_ground_truth_obs)} from {num_unique_memories} memories)")
print()
print(f"{'Bucket':<10} {'Range':<15} {'GT Obs':>10} {'Non-GT Obs':>12} {'GT %':>8} {'Obs Precision':>14}")
print("-" * 75)

for bucket_name, range_low, range_high in current_buckets:
    # Boolean masks selecting the observations inside [range_low, range_high).
    ground_truth_mask = (ground_truth_obs >= range_low) & (ground_truth_obs < range_high)
    non_ground_truth_mask = (non_ground_truth_obs >= range_low) & (non_ground_truth_obs < range_high)
    ground_truth_in_bucket = int(ground_truth_mask.sum())
    non_ground_truth_in_bucket = int(non_ground_truth_mask.sum())
    total_in_bucket = ground_truth_in_bucket + non_ground_truth_in_bucket

    # Share of all GT observations that land in this bucket, in percent.
    if len(ground_truth_obs) > 0:
        ground_truth_percentage = ground_truth_in_bucket / len(ground_truth_obs) * 100
    else:
        ground_truth_percentage = 0

    # Fraction of this bucket's observations that are GT.
    if total_in_bucket > 0:
        observation_precision = ground_truth_in_bucket / total_in_bucket
    else:
        observation_precision = 0

    print(f"{bucket_name:<10} [{range_low:.1f}, {range_high:.1f}){'':<5} {ground_truth_in_bucket:>10} {non_ground_truth_in_bucket:>12} {ground_truth_percentage:>7.1f}% {observation_precision:>14.1%}")

print(f"\nRecommendation: Update confidence buckets based on data distribution.")
print(f"The optimal F1 threshold ({experiment_best_threshold:.2f}) should be the cutoff between 'accepted' and 'rejected'.")

In [None]:
# Final summary and recommendation (macro-averaged, with MRR)


def _mean_accepted_counts(cutoff):
    """Return (avg accepted, avg GT accepted) per experiment for distance <= cutoff."""
    accepted_counts = []
    ground_truth_counts = []
    for deduped in experiments_deduped:
        kept_ids = {memory_id for memory_id, distance in deduped["best_distances"].items() if distance <= cutoff}
        accepted_counts.append(len(kept_ids))
        ground_truth_counts.append(len(kept_ids & deduped["ground_truth_ids"]))
    return np.mean(accepted_counts), np.mean(ground_truth_counts)


banner = "=" * 70
print(banner)
print("THRESHOLD ANALYSIS SUMMARY")
print(banner)

print(f"\nDatabase: {num_unique_memories} unique memories")
print(f"Test cases: {len(experiments_deduped)} (GT sets are disjoint, {total_ground_truth_memories} total GT assignments)")
avg_retrieved = np.mean([len(deduped["best_distances"]) for deduped in experiments_deduped])
print(f"Avg memories retrieved per experiment: {avg_retrieved:.1f} / {num_unique_memories}")

best_index = experiment_best_f1_index
print(f"\nOptimal F1 threshold: {experiment_best_threshold:.2f}")
print(f"  P={experiment_precisions[best_index]:.1%}, R={experiment_recalls[best_index]:.1%}, "
      f"F1={experiment_f1_scores[best_index]:.3f}, MRR={experiment_mrr_scores[best_index]:.3f}")
print(f"  (all metrics macro-averaged: per-experiment, then mean)")

# Retrieval counts at optimal threshold
avg_num_accepted, avg_ground_truth_kept = _mean_accepted_counts(experiment_best_threshold)
avg_ground_truth_count = np.mean([len(deduped["ground_truth_ids"]) for deduped in experiments_deduped])
print(f"\n--- Memories retrieved at threshold={experiment_best_threshold:.2f} (avg per experiment) ---")
print(f"  {avg_num_accepted:.1f} / {avg_retrieved:.1f} retrieved memories accepted")
print(f"    GT kept:  {avg_ground_truth_kept:.1f} / {avg_ground_truth_count:.1f}")
print(f"    Non-GT:   {avg_num_accepted - avg_ground_truth_kept:.1f}")

# Data-driven tradeoff points
print(f"\n--- Key Tradeoff Points (data-driven) ---")
tradeoff_points = [("Best F1", experiment_best_threshold)]

# Lowest swept threshold whose macro-avg recall reaches 90%
recall_90_candidates = experiment_thresholds[experiment_recalls >= 0.9]
if recall_90_candidates.size > 0:
    threshold_recall_90 = recall_90_candidates[0]
    tradeoff_points.append(("Recall >= 90%", threshold_recall_90))

# Lowest swept threshold whose macro-avg recall reaches 95%
recall_95_candidates = experiment_thresholds[experiment_recalls >= 0.95]
if recall_95_candidates.size > 0:
    threshold_recall_95 = recall_95_candidates[0]
    tradeoff_points.append(("Recall >= 95%", threshold_recall_95))

# Highest swept threshold whose macro-avg precision is still at least 90%
precision_90_candidates = experiment_thresholds[experiment_precisions >= 0.9]
if precision_90_candidates.size > 0:
    threshold_precision_90 = precision_90_candidates[-1]
    tradeoff_points.append(("Precision >= 90%", threshold_precision_90))

# Ascending threshold order for readability
tradeoff_points.sort(key=lambda point: point[1])

for label, threshold in tradeoff_points:
    # Position of this threshold in the sweep arrays (nearest grid value).
    index = np.argmin(np.abs(experiment_thresholds - threshold))
    avg_accepted_at_threshold, avg_ground_truth_at_threshold = _mean_accepted_counts(threshold)
    print(f"  t={threshold:.2f} ({label:>17}): P={experiment_precisions[index]:.1%}, R={experiment_recalls[index]:.1%}, "
          f"F1={experiment_f1_scores[index]:.3f}, MRR={experiment_mrr_scores[index]:.3f}, "
          f"avg accepted={avg_accepted_at_threshold:.1f} ({avg_ground_truth_at_threshold:.1f} GT + {avg_accepted_at_threshold-avg_ground_truth_at_threshold:.1f} non-GT)")

print(f"\n--- Recommendation ---")
print(f"The GT and non-GT distributions overlap significantly (GT median={np.median(ground_truth_obs):.3f}, "
      f"non-GT median={np.median(non_ground_truth_obs):.3f}).")
print(f"A single distance threshold cannot cleanly separate them.")
print()
print(f"Practical options:")
print(f"  1. Use threshold={experiment_best_threshold:.2f} as DEFAULT_DISTANCE_THRESHOLD for best F1")
print(f"  2. Use a higher threshold to preserve recall, accepting lower precision")
print(f"  3. Combine threshold with additional signals (RRF scoring, re-ranking)")
print(f"     to improve separation beyond what distance alone provides")
print(banner)