In [None]:
# Cell 1 — Setup & Imports
import os
from pathlib import Path

from memory_retrieval.experiments.query_generation import (
    QueryGenerationConfig,
    generate_all_queries,
)
from memory_retrieval.experiments.runner import (
    ExperimentConfig,
    run_all_experiments,
)
from memory_retrieval.experiments.test_cases import build_test_cases
from memory_retrieval.infra.io import load_json
from memory_retrieval.memories.extractor import ExtractionConfig, SituationFormat, extract_memories
from memory_retrieval.search.vector import VectorBackend

# Locate the project root: starting at the current working directory, move up
# one directory at a time until a directory containing pyproject.toml is found.
PROJECT_ROOT = Path.cwd()
while True:
    if (PROJECT_ROOT / "pyproject.toml").exists():
        break
    parent_dir = PROJECT_ROOT.parent
    if parent_dir == PROJECT_ROOT:
        # A directory that is its own parent is the filesystem root: nothing left to try.
        raise RuntimeError("Could not find project root (pyproject.toml)")
    PROJECT_ROOT = parent_dir

# Make the project root the working directory so the relative paths used by
# later cells (e.g. "data/review_data") resolve against it.
os.chdir(PROJECT_ROOT)

# Report whether the OpenRouter API key is present in the environment
if os.environ.get("OPENROUTER_API_KEY"):
    print("OPENROUTER_API_KEY is set.")
else:
    print("WARNING: OPENROUTER_API_KEY is not set. Memory building and query generation will fail.")

print(f"Project root: {PROJECT_ROOT}")
print("Imports OK.")

In [None]:
# Cell 2 — Configuration
from memory_retrieval.infra.runs import create_run, get_latest_run, update_run_status

PROMPT_VERSION = "2.0.0"  # semantic versioning
MODEL_MEMORIES = "anthropic/claude-haiku-4.5"  # LLM for memory extraction
MODEL_EXPERIMENT = "anthropic/claude-sonnet-4.5"  # LLM for query generation

RAW_DATA_DIR = "data/review_data"

# Run selection: use latest run or select a specific one
# To see available runs: print(list_runs("phase1"))
# To select specific run: RUN_DIR = get_run("phase1", "run_20260208_143022")
# NOTE(review): list_runs / get_run are not imported in this notebook — presumably
# they live in memory_retrieval.infra.runs; import them before using the lines above.
RUN_DIR = get_latest_run("phase1")

# Derived paths (automatic from run directory).
# The "Build Memories" cells check `RUN_DIR is None` and then create a run and
# re-derive these paths, so this cell must not dereference RUN_DIR in that case.
if RUN_DIR is not None:
    MEMORIES_DIR = str(RUN_DIR / "memories")
    DB_PATH = str(RUN_DIR / "memories" / "memories.db")
    TEST_CASES_DIR = str(RUN_DIR / "test_cases")
    QUERIES_DIR = str(RUN_DIR / "queries")
    RESULTS_DIR = str(RUN_DIR / "results")
else:
    # No run yet: leave the paths unset until a "Build Memories" cell creates one.
    MEMORIES_DIR = DB_PATH = TEST_CASES_DIR = QUERIES_DIR = RESULTS_DIR = None

# Initialize backends
vector_backend = VectorBackend()

run_label = (
    RUN_DIR.name
    if RUN_DIR is not None
    else "<none yet - run a Build Memories cell to create one>"
)

print("Configuration:")
print(f"  Using run: {run_label}")
print(f"  Prompt version: {PROMPT_VERSION}")
print(f"  Model (memories): {MODEL_MEMORIES}")
print(f"  Model (experiment): {MODEL_EXPERIMENT}")
print(f"  Raw data dir: {RAW_DATA_DIR}")
print(f"  Memories dir: {MEMORIES_DIR}")
print(f"  DB path: {DB_PATH}")
print(f"  Test cases dir: {TEST_CASES_DIR}")
print(f"  Queries dir: {QUERIES_DIR}")
print(f"  Results dir: {RESULTS_DIR}")

## Step 1 — Build Memories

Extracts structured memories from raw code review data via an LLM.
Each memory contains a **situation description** (25-60 words describing the code pattern/issue) and an **actionable lesson** (imperative guidance, max 160 chars).

Requires `OPENROUTER_API_KEY`.

In [None]:
# Cell 4 — Build Memories: Single File
# Creates a new run if none exists

if RUN_DIR is None:
    # No run selected: create one and point every derived path at it.
    run_id, RUN_DIR = create_run("phase1")
    memories_root = RUN_DIR / "memories"
    MEMORIES_DIR = str(memories_root)
    DB_PATH = str(memories_root / "memories.db")
    TEST_CASES_DIR = str(RUN_DIR / "test_cases")
    QUERIES_DIR = str(RUN_DIR / "queries")
    RESULTS_DIR = str(RUN_DIR / "results")
    print(f"Created new run: {run_id}")

# List the raw JSON inputs in sorted order so the printed indices are stable
raw_data_path = Path(RAW_DATA_DIR)
raw_files = sorted(raw_data_path.glob("*.json"))

print(f"Found {len(raw_files)} raw data files:")
for file_index, raw_file in enumerate(raw_files):
    print(f"  [{file_index}] {raw_file.name}")

if not raw_files:
    print("No raw data files found.")
else:
    # Process the first file (change index to pick a different one)
    target_file = raw_files[0]
    print(f"\nProcessing: {target_file.name}")
    extraction_config = ExtractionConfig(
        situation_format=SituationFormat.SINGLE,
        prompts_dir="data/prompts/phase1",
        prompt_version=PROMPT_VERSION,
        model=MODEL_MEMORIES,
    )
    output_path = extract_memories(
        raw_path=str(target_file),
        out_dir=MEMORIES_DIR,
        config=extraction_config,
    )
    print(f"Output saved to: {output_path}")

In [None]:
# Cell 5 — Build Memories: All Files
# Creates a new run if none exists

if RUN_DIR is None:
    # No run selected: create one and point every derived path at it.
    run_id, RUN_DIR = create_run("phase1")
    memories_root = RUN_DIR / "memories"
    MEMORIES_DIR = str(memories_root)
    DB_PATH = str(memories_root / "memories.db")
    TEST_CASES_DIR = str(RUN_DIR / "test_cases")
    QUERIES_DIR = str(RUN_DIR / "queries")
    RESULTS_DIR = str(RUN_DIR / "results")
    print(f"Created new run: {run_id}")

raw_data_path = Path(RAW_DATA_DIR)
raw_files = sorted(raw_data_path.glob("*.json"))

print(f"Processing all {len(raw_files)} raw data files...\n")

# One extraction configuration is shared by every file
extraction_config = ExtractionConfig(
    situation_format=SituationFormat.SINGLE,
    prompts_dir="data/prompts/phase1",
    prompt_version=PROMPT_VERSION,
    model=MODEL_MEMORIES,
)

# One record per input file; "status" is "ok" on success, otherwise the error text.
results = []
for raw_file in raw_files:
    print(f"Processing: {raw_file.name}")
    try:
        output_path = extract_memories(
            raw_path=str(raw_file),
            out_dir=MEMORIES_DIR,
            config=extraction_config,
        )
    except Exception as error:
        # Keep going: a failure on one file is recorded and must not stop the batch.
        results.append({"file": raw_file.name, "output": None, "status": str(error)})
        print(f"  ERROR: {error}")
    else:
        results.append({"file": raw_file.name, "output": output_path, "status": "ok"})

success_count = len([entry for entry in results if entry["status"] == "ok"])
print(f"\nSummary: {success_count}/{len(results)} files processed successfully.")

# Update run status
build_status = {
    "count": success_count,
    "failed": len(results) - success_count,
    "prompt_version": PROMPT_VERSION,
}
update_run_status(RUN_DIR, "build_memories", build_status)

## Step 2 — Create Database

Builds a SQLite database with **sqlite-vec** for vector similarity search.
Loads all accepted memories from JSONL files and indexes their situation descriptions as 1024-dimensional embeddings (via Ollama `mxbai-embed-large`).

Requires Ollama running locally with the `mxbai-embed-large` model.

In [None]:
# Cell 7 — Rebuild Database
# Ask the vector backend to rebuild the database at DB_PATH from MEMORIES_DIR.
run_name = RUN_DIR.name
print(f"Rebuilding database for run: {run_name}...")
vector_backend.rebuild_database(memories_dir=MEMORIES_DIR, db_path=DB_PATH)

# Read back the memory count the backend reports for the rebuilt database
count = vector_backend.get_memory_count(DB_PATH)
print(f"Database rebuilt. Total memories indexed: {count}")

# Record the count under the "db" stage of the run status
db_status = {"memory_count": count}
update_run_status(RUN_DIR, "db", db_status)

In [None]:
# Cell 8 — Verify Database: Sample Search

# Smoke test: run one hand-written query and print the five closest matches.
sample_query = "error handling in async functions"
print(f'Sample search: "{sample_query}"\n')

results = vector_backend.search(db_path=DB_PATH, query=sample_query, limit=5)

if not results:
    print("No results found. Check that the database is populated and Ollama is running.")
else:
    # Number the hits from 1 in the order the backend returned them
    for position, result in enumerate(results, start=1):
        print(f"--- Result {position} (distance: {result.raw_score:.4f}) ---")
        print(f"  ID: {result.id}")
        print(f"  Situation: {result.situation}")
        print(f"  Lesson: {result.lesson}")
        print()

## Step 3 — Create Test Cases

Matches raw PR data to extracted memories to build **ground truth** test cases.
Each test case contains the filtered diff, PR context, and the set of memory IDs that should be retrieved.
PRs with no matching memories are skipped.

In [None]:
# Cell 10 — Build Test Cases
run_name = RUN_DIR.name
print(f"Building test cases for run: {run_name}...\n")
build_test_cases(
    raw_dir=RAW_DATA_DIR,
    memories_dir=MEMORIES_DIR,
    output_dir=TEST_CASES_DIR,
)

# Summarise what was written: one line per generated test case file
test_case_files = sorted(Path(TEST_CASES_DIR).glob("*.json"))
print(f"\nGenerated {len(test_case_files)} test cases:")
for test_case_file in test_case_files:
    test_case = load_json(str(test_case_file))
    # Use the stored count when present; otherwise count the listed ground-truth ids.
    listed_id_count = len(test_case.get("ground_truth_memory_ids", []))
    ground_truth_count = test_case.get("ground_truth_count", listed_id_count)
    print(f"  {test_case_file.name} — {ground_truth_count} ground truth memories")

# Update run status
test_case_status = {"count": len(test_case_files)}
update_run_status(RUN_DIR, "test_cases", test_case_status)

## Step 4 — Generate Queries

Generates search queries from each test case's PR context and diff via an LLM.
Queries are saved as separate JSON files in the `queries/` directory so they can be
reused across multiple experiment runs without re-calling the API.

Requires `OPENROUTER_API_KEY`.

In [None]:
# Cell 12 — Generate Queries for All Test Cases
run_name = RUN_DIR.name
print(f"Generating queries for run: {run_name}...\n")

query_config = QueryGenerationConfig(
    prompts_dir="data/prompts/phase1",
    prompt_version=PROMPT_VERSION,
    model=MODEL_EXPERIMENT,
)
all_query_data = generate_all_queries(
    test_cases_dir=TEST_CASES_DIR,
    queries_dir=QUERIES_DIR,
    config=query_config,
    db_path=DB_PATH,
    search_backend=vector_backend,
)

# Only entries that carry a "queries" key are counted as successful
successful_queries = [entry for entry in all_query_data if "queries" in entry]
total_queries = sum(len(entry["queries"]) for entry in successful_queries)
print(
    f"\nGenerated queries for {len(successful_queries)} test cases ({total_queries} total queries)"
)

# Update run status
query_status = {
    "count": len(successful_queries),
    "total_queries": total_queries,
    "model": MODEL_EXPERIMENT,
    "prompt_version": PROMPT_VERSION,
}
update_run_status(RUN_DIR, "query_generation", query_status)

## Step 5 — Run Experiments

For each test case, the experiment:
1. Loads pre-generated search queries from the `queries/` directory
2. Runs vector similarity search against the database
3. Computes **recall**, **precision**, and **F1** against the ground truth

Requires Ollama with `mxbai-embed-large`. Does NOT require `OPENROUTER_API_KEY`.

In [None]:
# Cell 13 — Run All Experiments
print(f"Running all experiments for run: {RUN_DIR.name}...\n")

config = ExperimentConfig(
    search_backend=vector_backend,
    search_limit=20,
    distance_threshold=1.1,
)
all_results = run_all_experiments(
    test_cases_dir=TEST_CASES_DIR,
    queries_dir=QUERIES_DIR,
    db_path=DB_PATH,
    results_dir=RESULTS_DIR,
    config=config,
)

# A result without a "metrics" entry is treated as a failed experiment. Failures
# are shown as such and kept out of the averages, so they are not silently
# reported (and averaged in) as 0.000 scores.
successful = [result for result in all_results if "metrics" in result]
failed_count = len(all_results) - len(successful)

print(f"\nCompleted {len(all_results)} experiments.\n")
print(f"{'Test Case':<40} {'Recall':>8} {'Precision':>10} {'F1':>8}")
print("-" * 70)
for result in all_results:
    test_case_name = result.get("test_case_id", "?")[:40]
    if "metrics" not in result:
        print(f"{test_case_name:<40} {'FAILED':>8}")
        continue
    metrics = result["metrics"]
    print(
        f"{test_case_name:<40} {metrics.get('recall', 0):>8.3f} {metrics.get('precision', 0):>10.3f} {metrics.get('f1', 0):>8.3f}"
    )

if successful:
    metrics_list = [result["metrics"] for result in successful]
    avg_recall = sum(entry.get("recall", 0) for entry in metrics_list) / len(metrics_list)
    avg_precision = sum(entry.get("precision", 0) for entry in metrics_list) / len(metrics_list)
    avg_f1_score = sum(entry.get("f1", 0) for entry in metrics_list) / len(metrics_list)
    print("-" * 70)
    print(f"{'AVERAGE':<40} {avg_recall:>8.3f} {avg_precision:>10.3f} {avg_f1_score:>8.3f}")
    if failed_count:
        print(
            f"(averaged over {len(successful)} successful experiments; {failed_count} failed and excluded)"
        )

# Update run status
update_run_status(
    RUN_DIR,
    "experiment",
    {
        "count": len(successful),
        "failed": failed_count,
    },
)

In [None]:
# Cell 14 — Results Analysis: Top@K & Distance Threshold Sweep (with MRR)
import matplotlib.pyplot as plt
import numpy as np

# === Configuration ===
DISTANCE_THRESHOLDS = [0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1.00, 1.10]
MAX_K = 20

FIGURES_DIR = RUN_DIR / "figures"
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

results_path = Path(RESULTS_DIR)
result_files = sorted(results_path.glob("*.json"))

if not result_files:
    print("No result files found. Run experiments first.")
else:
    all_data = [load_json(str(file_path)) for file_path in result_files]
    num_test_cases = len(all_data)

    # --- Helper: deduplicate results per test case (best distance per memory) ---
    def get_best_distances(data):
        """Return (memory_id, distance) pairs for one test case, closest first.

        Results from every query of the test case are pooled; when a memory
        appears more than once, only its smallest distance is kept.
        """
        best_by_memory_id = {}
        for query_result in data.get("queries", []):
            for result in query_result.get("results", []):
                memory_id = result["id"]
                distance = result["distance"]
                if memory_id not in best_by_memory_id or distance < best_by_memory_id[memory_id]:
                    best_by_memory_id[memory_id] = distance
        return sorted(best_by_memory_id.items(), key=lambda x: x[1])

    # --- Helper: compute reciprocal rank for a ranked list ---
    def reciprocal_rank(sorted_memory_ids, ground_truth_ids):
        """Return 1/rank of the first GT hit, or 0 if none found."""
        for rank, memory_id in enumerate(sorted_memory_ids, 1):
            if memory_id in ground_truth_ids:
                return 1.0 / rank
        return 0.0

    # --- Helper: score one ranked list of unique memory ids ---
    def score_retrieval(ranked_ids, ground_truth_ids):
        """Return (precision, recall, f1, reciprocal rank) for one retrieved list.

        Precision is taken over the ids actually retrieved, so a list shorter
        than the requested cut-off is not penalised for the missing slots.
        Every ratio falls back to 0.0 when its denominator would be zero.
        """
        hits = len(set(ranked_ids) & ground_truth_ids)
        precision = hits / len(ranked_ids) if ranked_ids else 0.0
        recall = hits / len(ground_truth_ids) if ground_truth_ids else 0.0
        f1_score = (
            2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        )
        return precision, recall, f1_score, reciprocal_rank(ranked_ids, ground_truth_ids)

    # --- Helper: draw the metric curves shared by both figures ---
    def plot_metric_curves(ax, x_values, curves):
        """Plot one line on `ax` per (label, values, color, marker, markersize) tuple."""
        for label, values, color, marker, markersize in curves:
            ax.plot(
                x_values,
                values,
                label=label,
                color=color,
                linewidth=2,
                marker=marker,
                markersize=markersize,
            )

    # Rank each test case once; both analyses below reuse these rankings
    # instead of re-deduplicating the results for every k / threshold.
    ranked_cases = [
        (
            set(case_data.get("ground_truth", {}).get("memory_ids", [])),
            get_best_distances(case_data),
        )
        for case_data in all_data
    ]

    # =========================================================================
    # Top@K Analysis (per-test-case: pool all queries, deduplicate, take top k)
    # =========================================================================
    k_values = list(range(1, MAX_K + 1))
    all_precisions_by_k = {k: [] for k in k_values}
    all_recalls_by_k = {k: [] for k in k_values}
    all_f1_scores_by_k = {k: [] for k in k_values}
    all_reciprocal_ranks_by_k = {k: [] for k in k_values}  # reciprocal ranks for MRR@k

    for ground_truth_ids, sorted_memories in ranked_cases:
        for k in k_values:
            top_k_ids = [memory_id for memory_id, _ in sorted_memories[:k]]
            precision, recall, f1_score, reciprocal_rank_value = score_retrieval(
                top_k_ids, ground_truth_ids
            )

            all_precisions_by_k[k].append(precision)
            all_recalls_by_k[k].append(recall)
            all_f1_scores_by_k[k].append(f1_score)
            all_reciprocal_ranks_by_k[k].append(reciprocal_rank_value)

    avg_precisions = [np.mean(all_precisions_by_k[k]) for k in k_values]
    avg_recalls = [np.mean(all_recalls_by_k[k]) for k in k_values]
    avg_f1_scores = [np.mean(all_f1_scores_by_k[k]) for k in k_values]
    avg_mrr_scores = [np.mean(all_reciprocal_ranks_by_k[k]) for k in k_values]

    # Top@K table
    print(f"Top@K Results (averaged over {num_test_cases} test cases)\n")
    print(f"{'K':>4} {'Precision':>10} {'Recall':>8} {'F1':>8} {'MRR':>8}")
    print("-" * 42)
    for k in [1, 2, 3, 4, 5, 10, 15, 20]:
        if k <= MAX_K:
            index = k - 1
            print(
                f"{k:>4} {avg_precisions[index]:>10.3f} {avg_recalls[index]:>8.3f} {avg_f1_scores[index]:>8.3f} {avg_mrr_scores[index]:>8.3f}"
            )

    # Top@K plot — single axis
    fig, ax = plt.subplots(figsize=(10, 5))
    plot_metric_curves(
        ax,
        k_values,
        [
            ("Precision", avg_precisions, "#3498db", "o", 4),
            ("Recall", avg_recalls, "#2ecc71", "s", 4),
            ("F1", avg_f1_scores, "#9b59b6", "^", 4),
            ("MRR@K", avg_mrr_scores, "#e67e22", "D", 5),
        ],
    )
    ax.set_xlabel("K (top results kept per test case)")
    ax.set_ylabel("Score")
    ax.set_title(f"Precision / Recall / F1 / MRR vs Top@K (avg over {num_test_cases} test cases)")
    ax.set_xticks(k_values)
    ax.set_ylim(0, 1.05)
    ax.legend()
    ax.grid(True, alpha=0.3)
    for k in [1, 5, 10]:
        # Same guard as the table above: skip cut-offs beyond MAX_K so a smaller
        # MAX_K does not index past the end of avg_mrr_scores.
        if k > MAX_K:
            continue
        index = k - 1
        ax.annotate(
            f"MRR={avg_mrr_scores[index]:.3f}",
            (k, avg_mrr_scores[index]),
            textcoords="offset points",
            xytext=(8, 8),
            fontsize=9,
            arrowprops=dict(arrowstyle="->", color="#e67e22", lw=0.8),
        )
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "topk_metrics.png", dpi=200, bbox_inches="tight")
    plt.show()
    print(f"Saved: {FIGURES_DIR / 'topk_metrics.png'}")

    # =========================================================================
    # Distance Threshold Sweep (no top@k limit, uses all deduplicated results)
    # =========================================================================
    print(f"\nDistance Threshold Results (averaged over {num_test_cases} test cases)\n")
    print(f"{'Threshold':>10} {'Precision':>10} {'Recall':>8} {'F1':>8} {'MRR':>8}")
    print("-" * 48)

    sweep_precisions, sweep_recalls, sweep_f1_scores, sweep_mrr_scores = [], [], [], []
    for threshold in DISTANCE_THRESHOLDS:
        threshold_precisions = []
        threshold_recalls = []
        threshold_f1_scores = []
        threshold_reciprocal_ranks = []
        for ground_truth_ids, sorted_memories in ranked_cases:
            # Keep every deduplicated memory at or under the threshold, closest first
            accepted_ids = [
                memory_id for memory_id, distance in sorted_memories if distance <= threshold
            ]
            precision, recall, f1_score, reciprocal_rank_value = score_retrieval(
                accepted_ids, ground_truth_ids
            )

            threshold_precisions.append(precision)
            threshold_recalls.append(recall)
            threshold_f1_scores.append(f1_score)
            threshold_reciprocal_ranks.append(reciprocal_rank_value)

        sweep_precisions.append(np.mean(threshold_precisions))
        sweep_recalls.append(np.mean(threshold_recalls))
        sweep_f1_scores.append(np.mean(threshold_f1_scores))
        sweep_mrr_scores.append(np.mean(threshold_reciprocal_ranks))
        print(
            f"{threshold:>10.2f} {sweep_precisions[-1]:>10.3f} {sweep_recalls[-1]:>8.3f} {sweep_f1_scores[-1]:>8.3f} {sweep_mrr_scores[-1]:>8.3f}"
        )

    # Threshold sweep plot — single axis
    fig, ax = plt.subplots(figsize=(10, 5))
    plot_metric_curves(
        ax,
        DISTANCE_THRESHOLDS,
        [
            ("Precision", sweep_precisions, "#3498db", "o", 5),
            ("Recall", sweep_recalls, "#2ecc71", "s", 5),
            ("F1", sweep_f1_scores, "#9b59b6", "^", 5),
            ("MRR", sweep_mrr_scores, "#e67e22", "D", 5),
        ],
    )
    ax.set_xlabel("Cosine Distance Threshold")
    ax.set_ylabel("Score")
    ax.set_title(
        f"Precision / Recall / F1 / MRR vs Distance Threshold (avg over {num_test_cases} test cases)"
    )
    ax.set_xticks(DISTANCE_THRESHOLDS)
    ax.set_ylim(0, 1.05)
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    # Persist this figure next to the Top@K one so both analyses are kept with the run
    fig.savefig(FIGURES_DIR / "threshold_sweep_metrics.png", dpi=200, bbox_inches="tight")
    plt.show()
    print(f"Saved: {FIGURES_DIR / 'threshold_sweep_metrics.png'}")