Neverdecel · Neverdecel · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/coderag/eval/__init__.py b/coderag/eval/__init__.py
@@ -21,17 +21,25 @@
 from __future__ import annotations
 
 from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset
-from coderag.eval.harness import EvalResult, compare_modes, evaluate
+from coderag.eval.harness import (
+    EvalResult,
+    aggregate_by_mode,
+    compare_modes,
+    evaluate,
+    mean_results,
+)
 from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
 
 __all__ = [
     "EvalCase",
     "EvalResult",
+    "aggregate_by_mode",
     "build_from_git",
     "compare_modes",
     "evaluate",
     "hit_at_k",
     "load_dataset",
+    "mean_results",
     "mrr",
     "ndcg_at_k",
     "recall_at_k",

diff --git a/coderag/eval/datasets/multirepo.example.json b/coderag/eval/datasets/multirepo.example.json
@@ -0,0 +1,14 @@
+[
+  {
+    "name": "coderag",
+    "watched_dir": ".",
+    "store_dir": ".coderag-eval",
+    "dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"
+  },
+  {
+    "name": "pydantic",
+    "watched_dir": "/tmp/pydantic",
+    "store_dir": "/tmp/pydantic/.coderag-eval",
+    "dataset": "/tmp/pydantic-sym.jsonl"
+  }
+]
diff --git a/coderag/eval/harness.py b/coderag/eval/harness.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+from collections import OrderedDict
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple
 
@@ -162,6 +163,44 @@ def compare_modes(
     return results
 
 
+def mean_results(results: Sequence[EvalResult], *, label: str = "mean") -> EvalResult:
+    """Macro-average several results (same ks/level) into one.
+
+    Each input is weighted equally — so when averaging one mode across repos, a large repo
+    doesn't dominate. That's the right lens for *generalization*: a config should win on the
+    average of repos, not just the biggest one. ``n`` carries the total case count.
+
+    Args:
+        results: Results to average; all must share the first result's ``ks`` and ``level``.
+        label: Label given to the averaged row.
+
+    Returns:
+        A new ``EvalResult`` whose recall/hit/ndcg (per k) and mrr are the unweighted means
+        of the inputs, and whose ``n`` is the sum of the inputs' ``n``.
+
+    Raises:
+        ValueError: If ``results`` is empty, or the inputs disagree on ``ks`` or ``level``.
+    """
+    if not results:
+        raise ValueError("mean_results: no results to average")
+    first = results[0]
+    ks = first.ks
+    level = first.level
+    # Averaging is only meaningful over like-for-like rows. Fail loudly here rather than
+    # hitting an opaque KeyError on a missing k, or silently mixing rows of different levels.
+    for r in results:
+        if tuple(r.ks) != tuple(ks) or r.level != level:
+            raise ValueError(
+                f"mean_results: cannot average {r.label!r} "
+                f"(ks={tuple(r.ks)}, level={r.level!r}) with {first.label!r} "
+                f"(ks={tuple(ks)}, level={level!r})"
+            )
+    m = len(results)
+    return EvalResult(
+        label=label,
+        level=level,
+        n=sum(r.n for r in results),
+        ks=ks,
+        recall={k: sum(r.recall[k] for r in results) / m for k in ks},
+        hit={k: sum(r.hit[k] for r in results) / m for k in ks},
+        ndcg={k: sum(r.ndcg[k] for r in results) / m for k in ks},
+        mrr=sum(r.mrr for r in results) / m,
+    )
+
+
+def aggregate_by_mode(
+    per_repo: "OrderedDict[str, Sequence[EvalResult]]", *, prefix: str = "mean"
+) -> List[EvalResult]:
+    """Collapse per-repo result lists into one macro-averaged row per mode.
+
+    Results are bucketed by their ``label`` in the order labels are first encountered while
+    walking ``per_repo``; each bucket is averaged with ``mean_results`` and labelled
+    ``<prefix>:<mode>`` (e.g. ``mean:hybrid``, ``mean:adaptive``).
+    """
+    by_mode: "OrderedDict[str, List[EvalResult]]" = OrderedDict()
+    for repo_results in per_repo.values():
+        for result in repo_results:
+            if result.label not in by_mode:
+                by_mode[result.label] = []
+            by_mode[result.label].append(result)
+    averaged: List[EvalResult] = []
+    for mode, members in by_mode.items():
+        averaged.append(mean_results(members, label=f"{prefix}:{mode}"))
+    return averaged
+
+
 def format_table(results: Sequence[EvalResult]) -> str:
     """Render results as a compact fixed-width table for the CLI."""
     if not results:

diff --git a/docs/eval.md b/docs/eval.md
@@ -249,9 +249,61 @@ pydantic, symbol level (172 cases, 22 071-chunk corpus)
 ⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442
 vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated
 gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the
-*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's
-a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid
-remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.
+*Multi-repo evaluation* section below). So adaptive stays **off by default** — it's a **safe
+opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid remains the
+default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.
+
+## Multi-repo evaluation (judging generalization)
+
+Single-repo tuning overfits — the [external-repo validation](research/external-validation.md)
+showed levers that won on CodeRAG reversing on `pydantic`. So a config should only be promoted
+to a **default** once it wins on the *average of several repos*. `scripts/bench_multirepo.py`
+runs the eval across a manifest of repos and prints each repo's table plus a **macro-averaged
+aggregate** (each repo weighted equally, so a big repo can't dominate):
+
+```bash
+python scripts/bench_multirepo.py --manifest repos.json --level symbol --adaptive --rerank
+```
+
+```json
+[
+  {"name": "coderag",  "watched_dir": ".",            "dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"},
+  {"name": "pydantic", "watched_dir": "/tmp/pydantic", "store_dir": "/tmp/pyd_store", "dataset": "/tmp/pyd_sym.jsonl"}
+]
+```
+
+Each entry reuses a prepared index + dataset (indexing is the slow part); pass `--index` to
+(incrementally) build each index first, and `--build` to mine a dataset from git history when
+an entry's dataset file is missing. The aggregate rows are labelled `mean:<mode>`. See
+`coderag/eval/datasets/multirepo.example.json` for a sample manifest. Programmatic API:
+`from coderag.eval import aggregate_by_mode, mean_results`.
+
+This is the gate for promoting **adaptive fusion** to default-on: it should be ≥ hybrid on the
+aggregate *and* on every individual repo before the default flips.
+
+### Result: adaptive fusion does **not** earn default-on
+
+Run across four repos (627 git-mined symbol-level cases, `bge-small`, with the embedded-identifier
+classifier):
+
+```
+                 coderag  flask  requests  click  | AGGREGATE (macro-avg, MRR)
+  dense           0.423   0.297   0.354    0.351  |  0.356
+  bm25            0.500   0.371   0.371    0.401  |  0.411
+  hybrid          0.564   0.363   0.415    0.427  |  0.442   ← best
+  adaptive        0.487   0.357   0.415    0.431  |  0.423
+  (cases)          13      219     126      269   |  627
+```
+
+**Hybrid 1:1 wins the aggregate (0.442) and is best or within 0.01 MRR of the best on every
+repo (bm25 edges it on flask, adaptive on click); adaptive does not clear the bar.** On the
+three well-powered repos adaptive ≈ hybrid (a wash), and it trails on the
+small/noisy coderag set. The large curated-CodeRAG adaptive win reported above was an artifact of
+unusually dense-friendly, clean natural-language queries; on realistic git-mined commit queries
+**dense is consistently the weakest modality**, so "lean dense for NL" stops paying off.
+
+**Decisions this locks in:** keep `adaptive_fusion` **off by default** (it's a safe opt-in after
+the classifier fix — no catastrophic regression — but not an aggregate win); keep **1:1 hybrid as
+the default**. The harness did its job: a single-repo "win" was correctly blocked from becoming a
+default. (Reranking across repos and a code-aware reranker remain the open levers.)
 
 ## Dataset format
 

diff --git a/scripts/bench_multirepo.py b/scripts/bench_multirepo.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+"""Run the retrieval eval across several repos and report a generalization view.
+
+Single-repo tuning overfits (see docs/research/external-validation.md), so a config should
+only be promoted to a default once it wins on the *average of repos*. This driver scores each
+repo with the eval harness, prints a per-repo table, then a macro-averaged aggregate (each
+repo weighted equally) so you can see at a glance which mode generalizes — and whether any
+repo regresses.
+
+It is manifest-driven so indexing (the slow part) is done once and reused:
+
+    [
+      {"name": "coderag",  "watched_dir": ".",          "dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"},
+      {"name": "pydantic", "watched_dir": "/tmp/pydantic", "store_dir": "/tmp/pyd_store", "dataset": "/tmp/pyd_sym.jsonl"}
+    ]
+
+    python scripts/bench_multirepo.py --manifest repos.json --level symbol --adaptive
+
+Each entry needs a ``dataset`` (or pass ``--build`` to mine a symbol dataset from the repo's
+git history). ``store_dir`` defaults to ``<watched_dir>/.coderag-eval``. Pass ``--index`` to
+(incrementally) build each index first; otherwise an existing index is assumed.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from collections import OrderedDict
+from pathlib import Path
+from typing import List
+
+from coderag.api import CodeRAG
+from coderag.config import Config
+from coderag.eval import build_from_git, compare_modes, load_dataset, save_dataset
+from coderag.eval.harness import EvalResult, aggregate_by_mode, format_table
+
+
+def _dataset_for(cr: CodeRAG, entry: dict, level: str, build: bool) -> Path:
+    """Return the dataset path for a manifest entry, mining one from git if allowed.
+
+    If the entry names a ``dataset`` file that exists, that path is returned untouched.
+    Otherwise, unless ``build`` is set, the process exits with a hint to pass ``--build``.
+    When building, cases are mined from ``cr.config.watched_dir`` and only those whose
+    relevant files all appear in ``cr.store.all_file_paths()`` are saved.
+    """
+    configured = entry.get("dataset")
+    if configured:
+        existing = Path(configured)
+        if existing.exists():
+            return existing
+    if not build:
+        raise SystemExit(
+            f"{entry['name']}: no dataset at {configured!r} (pass --build to mine one)."
+        )
+    # Write to the configured path when one was given, else to a per-repo default name.
+    target = Path(configured) if configured else Path(f"{entry['name']}-eval.jsonl")
+    mined = build_from_git(
+        cr.config.watched_dir, max_cases=200, symbols=(level == "symbol")
+    )
+    known_paths = set(cr.store.all_file_paths())
+    kept = []
+    for case in mined:
+        files = case.relevant_files
+        # Drop cases with no relevant files or with files the index does not know about.
+        if files and all(path in known_paths for path in files):
+            kept.append(case)
+    save_dataset(kept, target)
+    print(f"    mined {len(kept)} case(s) -> {target}")
+    return target
+
+
+def main() -> int:
+    """CLI entry point: score every repo in the manifest, then print the aggregate.
+
+    For each manifest entry this opens a ``CodeRAG`` over the entry's ``watched_dir`` /
+    ``store_dir``, optionally indexes it (``--index``), resolves or mines its dataset, runs
+    ``compare_modes`` and prints the per-repo table. When more than one repo was scored, a
+    macro-averaged table (one ``mean:<mode>`` row per mode) is printed at the end.
+
+    Returns:
+        ``0`` on success.
+
+    Raises:
+        SystemExit: Propagated from ``_dataset_for`` when an entry has no dataset and
+            ``--build`` was not given.
+    """
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--manifest", required=True, help="JSON list of repo entries.")
+    ap.add_argument("--model", default="BAAI/bge-small-en-v1.5")
+    ap.add_argument("--level", choices=("file", "symbol"), default="symbol")
+    ap.add_argument("--ks", default="1,5,10")
+    ap.add_argument("--adaptive", action="store_true")
+    ap.add_argument("--rerank", action="store_true")
+    ap.add_argument("--build", action="store_true", help="Mine a dataset if missing.")
+    ap.add_argument(
+        "--index", action="store_true", help="(Incrementally) index each repo first."
+    )
+    args = ap.parse_args()
+
+    # Read the manifest as UTF-8 explicitly so it does not depend on the platform locale.
+    entries = json.loads(Path(args.manifest).read_text(encoding="utf-8"))
+    ks = tuple(int(k) for k in args.ks.split(","))
+    per_repo: "OrderedDict[str, List[EvalResult]]" = OrderedDict()
+
+    for entry in entries:
+        name = entry["name"]
+        watched = Path(entry["watched_dir"]).expanduser().resolve()
+        # Expand "~" in store_dir too, consistent with how watched_dir is handled.
+        store = Path(entry.get("store_dir") or watched / ".coderag-eval").expanduser()
+        print(f"\n=== {name} === ({watched})")
+        cfg = Config.from_env(
+            provider="fastembed", model=args.model, watched_dir=watched, store_dir=store
+        )
+        cr = CodeRAG(cfg)
+        # Always release the instance, even if indexing, dataset resolution (which may
+        # raise SystemExit) or evaluation fails part-way through.
+        try:
+            if args.index:
+                stats = cr.index()
+                print(
+                    f"    indexed {stats.total_files} files / {stats.total_chunks} chunks"
+                )
+            ds = _dataset_for(cr, entry, args.level, args.build)
+            cases = load_dataset(ds)
+
+            reranker = None
+            if args.rerank:
+                from coderag.retrieval.rerank import get_reranker
+
+                reranker = get_reranker(cfg.with_overrides(rerank=True))
+            results = compare_modes(
+                cr,
+                cases,
+                ks=ks,
+                level=args.level,
+                adaptive=args.adaptive,
+                reranker=reranker,
+            )
+            per_repo[name] = results
+            print(f"\n  {name}: {len(cases)} case(s)\n")
+            print("  " + format_table(results).replace("\n", "\n  "))
+        finally:
+            cr.close()
+
+    # A single repo has nothing to average against, so the aggregate is skipped.
+    if len(per_repo) > 1:
+        print("\n=== AGGREGATE (macro-avg across repos, each weighted equally) ===\n")
+        print(format_table(aggregate_by_mode(per_repo)))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_eval.py b/tests/test_eval.py
@@ -17,7 +17,13 @@
     load_dataset,
     save_dataset,
 )
-from coderag.eval.harness import best_label, format_table
+from coderag.eval.harness import (
+    EvalResult,
+    aggregate_by_mode,
+    best_label,
+    format_table,
+    mean_results,
+)
 from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
 from tests.conftest import write
 
@@ -190,6 +196,49 @@ def git(*args: str) -> None:
     assert cases[0].source == "git"
 
 
+def _mk(label: str, mrr: float, n: int = 10) -> EvalResult:
+    """Build a symbol-level ``EvalResult`` whose every metric equals ``mrr``."""
+    cutoffs = (1, 5)
+    # One independent {k: mrr} dict per per-k metric.
+    per_k = {
+        metric: dict.fromkeys(cutoffs, mrr) for metric in ("recall", "hit", "ndcg")
+    }
+    return EvalResult(
+        label=label,
+        level="symbol",
+        n=n,
+        ks=cutoffs,
+        mrr=mrr,
+        **per_k,
+    )
+
+
+def test_mean_results_macro_averages():
+    """Two repos of very different size contribute equally to the averaged metrics."""
+    import pytest
+
+    out = mean_results(
+        [_mk("hybrid", 0.6, n=5), _mk("hybrid", 0.4, n=50)], label="mean"
+    )
+    assert out.label == "mean"
+    # Compare floats with a tolerance: exact equality would tie the test to the summation
+    # order used inside mean_results. A case-weighted mean (~0.418) is still rejected.
+    assert out.mrr == pytest.approx(0.5)  # equal weight per repo, not per case
+    assert out.n == 55  # total cases carried through
+    assert out.recall[1] == pytest.approx(0.5)
+
+
+def test_aggregate_by_mode_groups_across_repos():
+    """Results are grouped by mode label across repos and averaged per mode."""
+    import pytest
+
+    per_repo = {
+        "repoA": [_mk("hybrid", 0.6), _mk("adaptive", 0.7)],
+        "repoB": [_mk("hybrid", 0.4), _mk("adaptive", 0.5)],
+    }
+    agg = aggregate_by_mode(per_repo)
+    by_label = {r.label: r.mrr for r in agg}
+    # (0.7 + 0.5) / 2 is not guaranteed to equal the literal 0.6 bit-for-bit, so compare
+    # the averaged values with a tolerance instead of exact float equality.
+    assert by_label == pytest.approx({"mean:hybrid": 0.5, "mean:adaptive": 0.6})
+    # First-seen mode order preserved.
+    assert [r.label for r in agg] == ["mean:hybrid", "mean:adaptive"]
+
+
+def test_mean_results_empty_raises():
+    """Averaging zero results is rejected with ``ValueError``."""
+    from pytest import raises
+
+    no_results = []
+    with raises(ValueError):
+        mean_results(no_results)
+
+
 def test_extensions_for_uses_canonical_map():
     from coderag.chunking.languages import extensions_for