In [None]:
# Cell 1 — Setup & Imports
import os
import sys
from pathlib import Path

# Locate the project root and make it the working directory.
#
# `__file__` is not defined inside a notebook kernel, so the root is found by
# walking upward from the current working directory until a directory that
# contains `scripts/phase1` (the package imported below) is found. Taking a
# fixed number of `.parent` hops from the cwd is not safe here: this cell also
# changes the cwd, so a fixed hop would climb one level higher on every re-run.
def find_project_root(start: Path, marker: Path = Path("scripts") / "phase1") -> Path:
    """Return the nearest directory at or above `start` that contains `marker`.

    Args:
        start: Directory to begin the upward search from.
        marker: Relative path that must exist as a directory inside the root.

    Returns:
        The first matching ancestor (including `start` itself). When nothing
        matches, `start.parent` is returned, which is the location this cell
        used before (notebook directory one level below the project root).
    """
    start = start.resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd())
os.chdir(PROJECT_ROOT)

# Add scripts/ to sys.path so bare imports like `from phase1.db import ...` work
scripts_path = str(PROJECT_ROOT / "scripts")
if scripts_path not in sys.path:
    sys.path.insert(0, scripts_path)

from phase1.build_memories import build_memories
from phase1.db import (
    rebuild_database,
    search_memories,
    get_memory_count,
    get_random_sample_memories,
    DEFAULT_DB_PATH,
)
from phase1.experiment import (
    run_experiment,
    run_all_experiments,
    DEFAULT_TEST_CASES_DIR,
    DEFAULT_RESULTS_DIR,
)
from common.test_cases import build_test_cases
from common.io import load_json

# Report whether the OpenRouter credential is present in the environment
api_key_present = bool(os.environ.get("OPENROUTER_API_KEY"))
print(
    "OPENROUTER_API_KEY is set."
    if api_key_present
    else "WARNING: OPENROUTER_API_KEY is not set. Memory building and experiments will fail."
)

# Closing banner: show the resolved root and confirm the import section completed
print(f"Project root: {PROJECT_ROOT}")
print("Imports OK.")

In [None]:
# Cell 2 — Configuration
# Tunable settings for the whole notebook; the cells below read these names.

# LLM settings
PROMPT_VERSION = "v2"  # "v1" or "v2"
MODEL_MEMORIES = "anthropic/claude-haiku-4.5"  # LLM for memory extraction
MODEL_EXPERIMENT = "anthropic/claude-sonnet-4.5"  # LLM for query generation

# Data locations, given as relative paths
RAW_DATA_DIR = "data/review_data"
MEMORIES_DIR = "data/phase1/memories"
DB_PATH = "data/phase1/memories/memories.db"
TEST_CASES_DIR = "data/phase1/test_cases"
RESULTS_DIR = "data/phase1/results"

# Echo every setting so the active configuration is visible in the cell output
print("Configuration:")
for label, value in (
    ("Prompt version", PROMPT_VERSION),
    ("Model (memories)", MODEL_MEMORIES),
    ("Model (experiment)", MODEL_EXPERIMENT),
    ("Raw data dir", RAW_DATA_DIR),
    ("Memories dir", MEMORIES_DIR),
    ("DB path", DB_PATH),
    ("Test cases dir", TEST_CASES_DIR),
    ("Results dir", RESULTS_DIR),
):
    print(f"  {label}: {value}")

## Step 1 — Build Memories

Extracts structured memories from raw code review data via LLM.
Each memory contains a **situation description** (25-60 words describing the code pattern/issue) and an **actionable lesson** (imperative guidance, max 160 chars).

Requires the `OPENROUTER_API_KEY` environment variable to be set.

In [None]:
# Cell 4 — Build Memories: Single File
# Lists every raw JSON file, then runs memory extraction on just one of them.
raw_data_path = Path(RAW_DATA_DIR)
raw_files = sorted(raw_data_path.glob("*.json"))

print(f"Found {len(raw_files)} raw data files:")
for index, raw_file in enumerate(raw_files):
    print(f"  [{index}] {raw_file.name}")

if not raw_files:
    print("No raw data files found.")
else:
    # Index 0 selects the first file; change it to pick another entry from the list above
    target_file = raw_files[0]
    print(f"\nProcessing: {target_file.name}")
    output_path = build_memories(
        raw_path=str(target_file),
        out_dir=MEMORIES_DIR,
        model=MODEL_MEMORIES,
        prompt_version=PROMPT_VERSION,
    )
    print(f"Output saved to: {output_path}")

In [None]:
# Cell 5 — Build Memories: All Files
# Runs memory extraction over every raw JSON file, recording one outcome per
# file so that a failure on one file does not abort the rest of the batch.
raw_data_path = Path(RAW_DATA_DIR)
raw_files = sorted(raw_data_path.glob("*.json"))

print(f"Processing all {len(raw_files)} raw data files...\n")

results = []
for raw_file in raw_files:
    print(f"Processing: {raw_file.name}")
    try:
        output_path = build_memories(
            raw_path=str(raw_file),
            out_dir=MEMORIES_DIR,
            model=MODEL_MEMORIES,
            prompt_version=PROMPT_VERSION,
        )
    except Exception as err:
        # Keep going: the error text becomes this file's status
        results.append({"file": raw_file.name, "output": None, "status": str(err)})
        print(f"  ERROR: {err}")
    else:
        results.append({"file": raw_file.name, "output": output_path, "status": "ok"})

ok_count = len([entry for entry in results if entry["status"] == "ok"])
print(f"\nSummary: {ok_count}/{len(results)} files processed successfully.")

## Step 2 — Create Database

Builds a SQLite database with **sqlite-vec** for vector similarity search.
Loads all accepted memories from JSONL files and indexes their situation descriptions as 1024-dimensional embeddings (via Ollama `mxbai-embed-large`).

Requires Ollama running locally with the `mxbai-embed-large` model.

In [None]:
# Cell 7 — Rebuild Database
# Rebuild the database from the memories directory, then print the value
# reported by get_memory_count for the same database file.
print("Rebuilding database...")
rebuild_database(
    memories_dir=MEMORIES_DIR,
    db_path=DB_PATH,
)

count = get_memory_count(DB_PATH)
print(f"Database rebuilt. Total memories indexed: {count}")

In [None]:
# Cell 8 — Verify Database: Sample Search
# Smoke-test the database: run one fixed query and print whatever
# search_memories returns for it.
sample_query = "error handling in async functions"
print(f"Sample search: \"{sample_query}\"\n")

results = search_memories(db_path=DB_PATH, query=sample_query, limit=5)

if results:
    for i, r in enumerate(results, start=1):
        # Apply the float format only to a numeric distance. The "N/A"
        # placeholder is a string, and formatting it with ".4f" raises
        # ValueError, which would abort the cell on the first result that
        # lacks a distance.
        distance = r.get("distance")
        distance_text = f"{distance:.4f}" if isinstance(distance, (int, float)) else "N/A"
        print(f"--- Result {i} (distance: {distance_text}) ---")
        print(f"  ID: {r.get('id', 'N/A')}")
        print(f"  Situation: {r.get('situation_description', 'N/A')}")
        print(f"  Lesson: {r.get('lesson', 'N/A')}")
        print()
else:
    print("No results found. Check that the database is populated and Ollama is running.")

## Step 3 — Create Test Cases

Matches raw PR data to extracted memories to build **ground truth** test cases.
Each test case contains the filtered diff, PR context, and the set of memory IDs that should be retrieved.
PRs with no matching memories are skipped.

In [None]:
# Cell 10 — Build Test Cases
# Generate the test-case files, then list each one with its ground-truth size.
print("Building test cases...\n")
build_test_cases(raw_dir=RAW_DATA_DIR, memories_dir=MEMORIES_DIR, output_dir=TEST_CASES_DIR)

test_case_files = sorted(Path(TEST_CASES_DIR).glob("*.json"))
print(f"\nGenerated {len(test_case_files)} test cases:")
for case_file in test_case_files:
    tc = load_json(str(case_file))
    # Use the stored count when present; otherwise count the listed ground-truth IDs
    listed_ids = tc.get("ground_truth_memory_ids", [])
    gt_count = tc.get("ground_truth_count", len(listed_ids))
    print(f"  {case_file.name} — {gt_count} ground truth memories")

## Step 4 — Run Experiments

For each test case, the experiment:
1. Generates search queries from the PR context and diff via LLM
2. Runs vector similarity search against the database
3. Computes **recall**, **precision**, and **F1** against the ground truth

Requires both the `OPENROUTER_API_KEY` environment variable and Ollama running locally with the `mxbai-embed-large` model.

In [None]:
# Cell 12 — Run Single Experiment
# Runs the experiment on the first test-case file and prints its metrics.
test_case_files = sorted(Path(TEST_CASES_DIR).glob("*.json"))

if not test_case_files:
    print("No test cases found. Run Step 3 first.")
else:
    target = test_case_files[0]
    print(f"Running experiment on: {target.name}\n")

    result = run_experiment(
        test_case_path=str(target),
        db_path=DB_PATH,
        model=MODEL_EXPERIMENT,
        results_dir=RESULTS_DIR,
        prompt_version=PROMPT_VERSION,
    )

    # Headline metrics; each label carries its own padding so the values line up
    for label, metric in (
        ("Recall:    ", "recall"),
        ("Precision: ", "precision"),
        ("F1:        ", "f1"),
    ):
        print(f"{label}{result.get(metric, 'N/A')}")

    # Volume figures: queries produced, memories retrieved, expected memories
    query_total = len(result.get("queries", []))
    retrieved_total = len(result.get("retrieved_memory_ids", []))
    print(f"\nQueries generated: {query_total}")
    print(f"Unique memories retrieved: {retrieved_total}")
    print(f"Ground truth count: {result.get('ground_truth_count', 'N/A')}")

In [None]:
# Cell 13 — Run All Experiments
# Runs every test case, then prints a per-case metrics table with an average row.
print("Running all experiments...\n")

all_results = run_all_experiments(
    test_cases_dir=TEST_CASES_DIR,
    db_path=DB_PATH,
    model=MODEL_EXPERIMENT,
    results_dir=RESULTS_DIR,
    prompt_version=PROMPT_VERSION,
)

rule = "-" * 70

print(f"\nCompleted {len(all_results)} experiments.\n")
print(f"{'Test Case':<40} {'Recall':>8} {'Precision':>10} {'F1':>8}")
print(rule)
for r in all_results:
    # Label the row by test case id, falling back to its path; cut to the column width
    name = r.get("test_case_id", r.get("test_case_path", "?"))[:40]
    print(f"{name:<40} {r.get('recall', 0):>8.3f} {r.get('precision', 0):>10.3f} {r.get('f1', 0):>8.3f}")

if all_results:
    # Arithmetic mean of each metric; a missing metric counts as 0
    experiment_total = len(all_results)
    avg_recall, avg_precision, avg_f1 = (
        sum(r.get(metric, 0) for r in all_results) / experiment_total
        for metric in ("recall", "precision", "f1")
    )
    print(rule)
    print(f"{'AVERAGE':<40} {avg_recall:>8.3f} {avg_precision:>10.3f} {avg_f1:>8.3f}")

In [None]:
# Cell 14 — Results Summary
# Reads every saved result file and prints aggregate metrics, query-length
# statistics, and the best and worst experiment by recall.
results_path = Path(RESULTS_DIR)
result_files = sorted(results_path.glob("*.json"))

if not result_files:
    print("No result files found. Run experiments first.")
else:
    all_data = [load_json(str(result_file)) for result_file in result_files]
    experiment_total = len(all_data)

    # Aggregate metrics (a missing metric counts as 0)
    recalls = [d.get("recall", 0) for d in all_data]
    precisions = [d.get("precision", 0) for d in all_data]
    f1s = [d.get("f1", 0) for d in all_data]

    print(f"Results from {experiment_total} experiments:\n")
    print(f"  Avg Recall:    {sum(recalls) / len(recalls):.3f}")
    print(f"  Avg Precision: {sum(precisions) / len(precisions):.3f}")
    print(f"  Avg F1:        {sum(f1s) / len(f1s):.3f}")
    print(f"  Min Recall:    {min(recalls):.3f}")
    print(f"  Max Recall:    {max(recalls):.3f}")

    # Per-query analysis: a query is either a plain string or a dict whose
    # "query" key holds the text; entries of any other type are ignored
    all_queries = [
        q if isinstance(q, str) else q.get("query", "")
        for d in all_data
        for q in d.get("queries", [])
        if isinstance(q, (str, dict))
    ]

    if all_queries:
        word_counts = [len(text.split()) for text in all_queries]
        print(f"\nQuery analysis ({len(all_queries)} total queries):")
        print(f"  Avg words per query: {sum(word_counts) / len(word_counts):.1f}")
        print(f"  Min words: {min(word_counts)}")
        print(f"  Max words: {max(word_counts)}")

    # Best / worst experiments by recall: highest first, so the ends of the
    # sorted list are the two extremes
    sorted_by_recall = sorted(all_data, key=lambda d: d.get("recall", 0), reverse=True)
    best, worst = sorted_by_recall[0], sorted_by_recall[-1]

    print("\nBest experiment by recall:")
    print(f"  {best.get('test_case_id', '?')} — recall: {best.get('recall', 0):.3f}")

    print("Worst experiment by recall:")
    print(f"  {worst.get('test_case_id', '?')} — recall: {worst.get('recall', 0):.3f}")