Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 14 additions & 15 deletions benchmarks/baselines/ablations.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,19 @@
from benchmarks.run_benchmark import RepoResult, evaluate
from semble import SembleIndex
from semble.index.dense import _DEFAULT_MODEL_NAME
from semble.types import SearchMode

_MODES = ["bm25", "semantic", "semble-bm25", "semble-semantic"]

# Maps mode name -> (search_mode, alpha) for index.search()
# alpha=None → raw mode, no ranking pipeline
# alpha=None → raw mode, input depends on query
# alpha=0.0 → hybrid pipeline, BM25-only input
# alpha=1.0 → hybrid pipeline, semantic-only input
_MODE_PARAMS: dict[str, tuple[SearchMode, float | None]] = {
"bm25": (SearchMode.BM25, None),
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We also need to update the actual results I guess if we do this? The readme etc? I'm a bit torn on removing these, since specially bm25 is a very strong and standard baseline to report. But we can just call the private functions here to to keep supporting these ablations I think, wdyt?

"semantic": (SearchMode.SEMANTIC, None),
"semble-bm25": (SearchMode.HYBRID, 0.0),
"semble-semantic": (SearchMode.HYBRID, 1.0),
_MODE_PARAMS: dict[str, tuple[float | None, bool]] = {
"semble-bm25": (0.0, True),
"semble-semantic": (1.0, True),
"semble-auto": (None, True),
"semble-balanced": (0.5, True),
"unranked-bm25": (0.0, False),
"unranked-semantic": (1.0, False),
"unranked-auto": (None, False),
"unranked-balanced": (0.5, False),
}


Expand Down Expand Up @@ -65,10 +65,9 @@ def _bench(
index = SembleIndex.from_path(spec.benchmark_dir, model=model)
index_ms = (time.perf_counter() - started) * 1000

for mode in modes:
search_mode, alpha = _MODE_PARAMS[mode]
for mode, (alpha, rerank) in sorted(_MODE_PARAMS.items()):
ndcg5, ndcg10, latencies, by_category, tokens = evaluate(
index, tasks, mode=search_mode, alpha=alpha, verbose=verbose
index, tasks, alpha=alpha, verbose=verbose, rerank=rerank
)
p50, p90, p95, p99 = np.percentile(latencies, [50, 90, 95, 99]).tolist()
result = RepoResult(
Expand Down Expand Up @@ -100,15 +99,15 @@ def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="semble ablation benchmarks.")
add_filter_args(parser, verbose=True)
parser.add_argument(
"--mode", action="append", default=[], choices=_MODES, help="Mode(s) to evaluate (default: all)."
"--mode", action="append", default=[], choices=sorted(_MODE_PARAMS), help="Mode(s) to evaluate (default: all)."
)
return parser.parse_args()


def main() -> None:
"""Run the semble ablation benchmarks."""
args = _parse_args()
modes = args.mode or _MODES
modes = args.mode or sorted(_MODE_PARAMS)

repo_specs, tasks = load_filtered_tasks(args.repo or None, args.language or None)

Expand Down
Loading
Loading