Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion coderag/eval/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,25 @@
from __future__ import annotations

from coderag.eval.dataset import EvalCase, build_from_git, load_dataset, save_dataset
from coderag.eval.harness import EvalResult, compare_modes, evaluate
from coderag.eval.harness import (
EvalResult,
aggregate_by_mode,
compare_modes,
evaluate,
mean_results,
)
from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k

__all__ = [
"EvalCase",
"EvalResult",
"aggregate_by_mode",
"build_from_git",
"compare_modes",
"evaluate",
"hit_at_k",
"load_dataset",
"mean_results",
"mrr",
"ndcg_at_k",
"recall_at_k",
Expand Down
14 changes: 14 additions & 0 deletions coderag/eval/datasets/multirepo.example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[
{
"name": "coderag",
"watched_dir": ".",
"store_dir": ".coderag-eval",
"dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"
},
{
"name": "pydantic",
"watched_dir": "/tmp/pydantic",
"store_dir": "/tmp/pydantic/.coderag-eval",
"dataset": "/tmp/pydantic-sym.jsonl"
}
]
39 changes: 39 additions & 0 deletions coderag/eval/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from __future__ import annotations

from collections import OrderedDict
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Sequence, Tuple

Expand Down Expand Up @@ -162,6 +163,44 @@ def compare_modes(
return results


def mean_results(results: Sequence[EvalResult], *, label: str = "mean") -> EvalResult:
    """Macro-average several results (same ks/level) into one.

    Each input is weighted equally — so when averaging one mode across repos, a large repo
    doesn't dominate. That's the right lens for *generalization*: a config should win on the
    average of repos, not just the biggest one. ``n`` carries the total case count.

    Args:
        results: Results to average. The first one supplies ``ks`` and ``level``.
        label: Label assigned to the averaged result.

    Returns:
        A new ``EvalResult`` whose ``recall``/``hit``/``ndcg`` (per cutoff) and ``mrr`` are
        the unweighted means over ``results`` and whose ``n`` is the sum of the inputs' ``n``.

    Raises:
        ValueError: If ``results`` is empty, if the inputs mix levels, or if an input has no
            metric for one of the cutoffs in ``ks``.
    """
    if not results:
        raise ValueError("mean_results: no results to average")
    first = results[0]
    ks = first.ks
    level = first.level
    for r in results:
        # Averaging across levels would silently produce a meaningless number, so refuse.
        if r.level != level:
            raise ValueError(
                f"mean_results: cannot average level {r.level!r} with {level!r}"
            )
        # Report a missing cutoff explicitly instead of a bare KeyError from the sums below.
        missing = [
            k for k in ks if k not in r.recall or k not in r.hit or k not in r.ndcg
        ]
        if missing:
            raise ValueError(
                f"mean_results: result {r.label!r} is missing metrics for k={missing}"
            )
    m = len(results)
    return EvalResult(
        label=label,
        level=level,
        n=sum(r.n for r in results),
        ks=ks,
        recall={k: sum(r.recall[k] for r in results) / m for k in ks},
        hit={k: sum(r.hit[k] for r in results) / m for k in ks},
        ndcg={k: sum(r.ndcg[k] for r in results) / m for k in ks},
        mrr=sum(r.mrr for r in results) / m,
    )


def aggregate_by_mode(
    per_repo: "OrderedDict[str, Sequence[EvalResult]]", *, prefix: str = "mean"
) -> List[EvalResult]:
    """Bucket every repo's results by mode label, then macro-average each bucket.

    Args:
        per_repo: Mapping of repo name to that repo's results (one per mode). Only the
            values are read; they are visited in the mapping's iteration order.
        prefix: Text placed before each mode in the output label (``<prefix>:<mode>``).

    Returns:
        One averaged row per distinct mode label, in first-seen order, e.g.
        ``mean:hybrid``, ``mean:adaptive`` — the aggregate view used to decide whether a
        config generalizes.
    """
    by_mode: "OrderedDict[str, List[EvalResult]]" = OrderedDict()
    for repo_results in per_repo.values():
        for result in repo_results:
            mode = result.label
            # First sighting of a mode opens its bucket, which fixes the output order.
            if mode not in by_mode:
                by_mode[mode] = []
            by_mode[mode].append(result)

    averaged: List[EvalResult] = []
    for mode, members in by_mode.items():
        averaged.append(mean_results(members, label=f"{prefix}:{mode}"))
    return averaged


def format_table(results: Sequence[EvalResult]) -> str:
"""Render results as a compact fixed-width table for the CLI."""
if not results:
Expand Down
58 changes: 55 additions & 3 deletions docs/eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,9 +249,61 @@ pydantic, symbol level (172 cases, 22 071-chunk corpus)
⚠️ **But a 4-repo sweep (627 git-mined cases) shows it is *not* an aggregate win** — hybrid 0.442
vs adaptive 0.423 MRR; adaptive is a wash on the well-powered repos and the big CodeRAG-curated
gain turned out to be an artifact of unusually dense-friendly clean-NL queries (see the
*Multi-repo evaluation* section below / PR adding it). So adaptive stays **off by default** — it's
a **safe opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid
remains the default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.
*Multi-repo evaluation* section below). So adaptive stays **off by default** — it's a **safe
opt-in** (no catastrophic regression after this fix), not a default. Fixed 1:1 hybrid remains the
default. Enable per-session with `CODERAG_ADAPTIVE_FUSION=1`.

## Multi-repo evaluation (judging generalization)

Single-repo tuning overfits — the [external-repo validation](research/external-validation.md)
showed levers that won on CodeRAG reversing on `pydantic`. So a config should only be promoted
to a **default** once it wins on the *average of several repos*. `scripts/bench_multirepo.py`
runs the eval across a manifest of repos and prints each repo's table plus a **macro-averaged
aggregate** (each repo weighted equally, so a big repo can't dominate):

```bash
python scripts/bench_multirepo.py --manifest repos.json --level symbol --adaptive --rerank
```

```json
[
{"name": "coderag", "watched_dir": ".", "dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"},
{"name": "pydantic", "watched_dir": "/tmp/pydantic", "store_dir": "/tmp/pyd_store", "dataset": "/tmp/pyd_sym.jsonl"}
]
```

Each entry reuses a prepared index and dataset (indexing is the slow part). Pass `--index` to
(incrementally) build each repo's index first, and `--build` to mine a dataset from the repo's
git history when the entry's dataset file is missing. The aggregate rows are labelled
`mean:<mode>` and are printed only when more than one repo was evaluated. See
`coderag/eval/datasets/multirepo.example.json`. Programmatic API:
`from coderag.eval import aggregate_by_mode, mean_results`.

This is the gate for promoting **adaptive fusion** to default-on: it should be ≥ hybrid on the
aggregate *and* on every individual repo before the default flips.

### Result: adaptive fusion does **not** earn default-on

Run across four repos (627 git-mined symbol-level cases, `bge-small`, with the embedded-identifier
classifier):

```
coderag flask requests click | AGGREGATE (macro-avg, MRR)
dense 0.423 0.297 0.354 0.351 | 0.356
bm25 0.500 0.371 0.371 0.401 | 0.411
hybrid 0.564 0.363 0.415 0.427 | 0.442 ← best
adaptive 0.487 0.357 0.415 0.431 | 0.423
(cases) 13 219 126 269 | 627
```

**Hybrid 1:1 wins the aggregate (0.442) and is first or within 0.01 MRR of the best mode on
every repo (bm25 edges it on flask, adaptive on click); adaptive does not clear the bar.** On
the three well-powered repos adaptive ≈ hybrid (a wash), and it trails on the small/noisy
coderag set. The large curated-CodeRAG adaptive win reported above was an artifact of
unusually dense-friendly, clean natural-language queries; on realistic git-mined commit queries
**dense is consistently the weakest modality**, so "lean dense for NL" stops paying off.

**Decisions this locks in:** keep `adaptive_fusion` **off by default** (it's a safe opt-in after
the classifier fix — no catastrophic regression — but not an aggregate win); keep **1:1 hybrid as
the default**. The harness did its job: a single-repo "win" was correctly blocked from becoming a
default. (Reranking across repos and a code-aware reranker remain the open levers.)

## Dataset format

Expand Down
121 changes: 121 additions & 0 deletions scripts/bench_multirepo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python
"""Run the retrieval eval across several repos and report a generalization view.

Single-repo tuning overfits (see docs/research/external-validation.md), so a config should
only be promoted to a default once it wins on the *average of repos*. This driver scores each
repo with the eval harness, prints a per-repo table, then a macro-averaged aggregate (each
repo weighted equally) so you can see at a glance which mode generalizes — and whether any
repo regresses.

It is manifest-driven so indexing (the slow part) is done once and reused:

[
{"name": "coderag", "watched_dir": ".", "dataset": "coderag/eval/datasets/coderag_self_symbols.jsonl"},
{"name": "pydantic", "watched_dir": "/tmp/pydantic", "store_dir": "/tmp/pyd_store", "dataset": "/tmp/pyd_sym.jsonl"}
]

python scripts/bench_multirepo.py --manifest repos.json --level symbol --adaptive

Each entry needs a ``dataset`` (or pass ``--build`` to mine a symbol dataset from the repo's
git history). ``store_dir`` defaults to ``<watched_dir>/.coderag-eval``. Pass ``--index`` to
(incrementally) build each index first; otherwise an existing index is assumed.
"""

from __future__ import annotations

import argparse
import json
from collections import OrderedDict
from pathlib import Path
from typing import List

Check notice

Code scanning / CodeQL

Unused import Note

Import of 'List' is not used.

from coderag.api import CodeRAG
from coderag.config import Config
from coderag.eval import build_from_git, compare_modes, load_dataset, save_dataset
from coderag.eval.harness import EvalResult, aggregate_by_mode, format_table

Check notice

Code scanning / CodeQL

Unused import Note

Import of 'EvalResult' is not used.


def _dataset_for(cr: CodeRAG, entry: dict, level: str, build: bool) -> Path:
ds = entry.get("dataset")
if ds and Path(ds).exists():
return Path(ds)
if not build:
raise SystemExit(
f"{entry['name']}: no dataset at {ds!r} (pass --build to mine one)."
)
out = Path(ds) if ds else Path(f"{entry['name']}-eval.jsonl")
cases = build_from_git(
cr.config.watched_dir, max_cases=200, symbols=level == "symbol"
)
indexed = set(cr.store.all_file_paths())
cases = [
c
for c in cases
if c.relevant_files and all(f in indexed for f in c.relevant_files)
]
save_dataset(cases, out)
print(f" mined {len(cases)} case(s) -> {out}")
return out


def main() -> int:
    """Score every repo listed in the manifest and print per-repo and aggregate tables.

    For each manifest entry this builds a ``Config``/``CodeRAG`` pair, optionally indexes
    the repo (``--index``), resolves or mines the dataset, runs ``compare_modes`` and
    prints the resulting table. When more than one repo was scored, a macro-averaged
    aggregate table follows.

    Returns:
        ``0`` on success (used as the process exit status).

    Raises:
        SystemExit: On argument errors, or when a dataset is missing and ``--build``
            was not given.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's line breaks so the manifest example stays readable.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--manifest", required=True, help="JSON list of repo entries.")
    ap.add_argument("--model", default="BAAI/bge-small-en-v1.5")
    ap.add_argument("--level", choices=("file", "symbol"), default="symbol")
    ap.add_argument("--ks", default="1,5,10")
    ap.add_argument("--adaptive", action="store_true")
    ap.add_argument("--rerank", action="store_true")
    ap.add_argument("--build", action="store_true", help="Mine a dataset if missing.")
    ap.add_argument(
        "--index", action="store_true", help="(Incrementally) index each repo first."
    )
    args = ap.parse_args()

    # Explicit encoding: don't let the platform default decide how the manifest is decoded.
    entries = json.loads(Path(args.manifest).read_text(encoding="utf-8"))
    # Skip empty tokens so a trailing comma ("1,5,") doesn't crash int().
    ks = tuple(int(k) for k in args.ks.split(",") if k.strip())
    per_repo: "OrderedDict[str, List[EvalResult]]" = OrderedDict()

    for entry in entries:
        name = entry["name"]
        watched = Path(entry["watched_dir"]).expanduser().resolve()
        # Expand "~" in store_dir too, consistent with watched_dir above.
        store = Path(entry.get("store_dir") or watched / ".coderag-eval").expanduser()
        print(f"\n=== {name} === ({watched})")
        cfg = Config.from_env(
            provider="fastembed", model=args.model, watched_dir=watched, store_dir=store
        )
        cr = CodeRAG(cfg)
        try:
            if args.index:
                stats = cr.index()
                print(
                    f" indexed {stats.total_files} files / {stats.total_chunks} chunks"
                )
            ds = _dataset_for(cr, entry, args.level, args.build)
            cases = load_dataset(ds)

            reranker = None
            if args.rerank:
                # Imported lazily so the reranker module is only loaded when requested.
                from coderag.retrieval.rerank import get_reranker

                reranker = get_reranker(cfg.with_overrides(rerank=True))
            results = compare_modes(
                cr,
                cases,
                ks=ks,
                level=args.level,
                adaptive=args.adaptive,
                reranker=reranker,
            )
            per_repo[name] = results
            print(f"\n {name}: {len(cases)} case(s)\n")
            print(" " + format_table(results).replace("\n", "\n "))
        finally:
            # Always close, even when a dataset is missing or the eval raises.
            cr.close()

    if len(per_repo) > 1:
        print("\n=== AGGREGATE (macro-avg across repos, each weighted equally) ===\n")
        print(format_table(aggregate_by_mode(per_repo)))
    return 0


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
51 changes: 50 additions & 1 deletion tests/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
load_dataset,
save_dataset,
)
from coderag.eval.harness import best_label, format_table
from coderag.eval.harness import (
EvalResult,
aggregate_by_mode,
best_label,
format_table,
mean_results,
)
from coderag.eval.metrics import hit_at_k, mrr, ndcg_at_k, recall_at_k
from tests.conftest import write

Expand Down Expand Up @@ -190,6 +196,49 @@ def git(*args: str) -> None:
assert cases[0].source == "git"


def _mk(label: str, mrr: float, n: int = 10) -> EvalResult:
    """Build a symbol-level EvalResult whose every metric equals ``mrr`` at k=1 and k=5."""
    cutoffs = (1, 5)
    return EvalResult(
        label=label,
        level="symbol",
        n=n,
        ks=cutoffs,
        # Three separate dicts, one per metric, each mapping every cutoff to ``mrr``.
        recall=dict.fromkeys(cutoffs, mrr),
        hit=dict.fromkeys(cutoffs, mrr),
        ndcg=dict.fromkeys(cutoffs, mrr),
        mrr=mrr,
    )


def test_mean_results_macro_averages():
    """mean_results weights each repo equally and sums the case counts."""
    import pytest

    out = mean_results(
        [_mk("hybrid", 0.6, n=5), _mk("hybrid", 0.4, n=50)], label="mean"
    )
    assert out.label == "mean"
    # Equal weight per repo, not per case (a case-weighted mean would be ~0.418).
    # approx() keeps the check independent of float rounding in the average.
    assert out.mrr == pytest.approx(0.5)
    assert out.n == 55  # total cases carried through
    assert out.recall[1] == pytest.approx(0.5)
    # ks and level are taken from the inputs unchanged.
    assert out.ks == (1, 5)
    assert out.level == "symbol"


def test_aggregate_by_mode_groups_across_repos():
    """aggregate_by_mode averages each mode across repos and keeps first-seen order."""
    import pytest

    per_repo = {
        "repoA": [_mk("hybrid", 0.6), _mk("adaptive", 0.7)],
        "repoB": [_mk("hybrid", 0.4), _mk("adaptive", 0.5)],
    }
    agg = aggregate_by_mode(per_repo)
    by_label = {r.label: r.mrr for r in agg}
    # approx() on the mapping: same keys required, values compared with float tolerance.
    assert by_label == pytest.approx({"mean:hybrid": 0.5, "mean:adaptive": 0.6})
    # First-seen mode order preserved.
    assert [r.label for r in agg] == ["mean:hybrid", "mean:adaptive"]


def test_mean_results_empty_raises():
    """Averaging an empty sequence is rejected with ValueError."""
    from pytest import raises

    nothing = []
    with raises(ValueError):
        mean_results(nothing)


def test_extensions_for_uses_canonical_map():
from coderag.chunking.languages import extensions_for

Expand Down
Loading