From 2b86b62e45d25d8d768d39a70ebd6d73d5967c3b Mon Sep 17 00:00:00 2001 From: Robert Leonard <40375385+Robert-H-Leonard@users.noreply.github.com> Date: Tue, 2 Sep 2025 08:11:57 -0400 Subject: [PATCH] Update llm judge model for yourbench to 2.5-flash --- yourbench/utils/convert_to_atlas_module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yourbench/utils/convert_to_atlas_module.py b/yourbench/utils/convert_to_atlas_module.py index c32bda4a..0dbc1d50 100644 --- a/yourbench/utils/convert_to_atlas_module.py +++ b/yourbench/utils/convert_to_atlas_module.py @@ -13,7 +13,7 @@ def _scorer_yaml(name: str) -> str: type: llm_judge options: regex_pattern: '' - judge_model: google_gemini-2.5-flash-preview-05-20 + judge_model: google_gemini-2.5-flash judge_prompt: |- Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"]. First, I will give examples of each grade, and then you will grade a new example.