From 2b86b62e45d25d8d768d39a70ebd6d73d5967c3b Mon Sep 17 00:00:00 2001
From: Robert Leonard <40375385+Robert-H-Leonard@users.noreply.github.com>
Date: Tue, 2 Sep 2025 08:11:57 -0400
Subject: [PATCH] Update LLM judge model for yourbench to Gemini 2.5 Flash

---
 yourbench/utils/convert_to_atlas_module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yourbench/utils/convert_to_atlas_module.py b/yourbench/utils/convert_to_atlas_module.py
index c32bda4a..0dbc1d50 100644
--- a/yourbench/utils/convert_to_atlas_module.py
+++ b/yourbench/utils/convert_to_atlas_module.py
@@ -13,7 +13,7 @@ def _scorer_yaml(name: str) -> str:
           type: llm_judge
           options:
             regex_pattern: ''
-            judge_model: google_gemini-2.5-flash-preview-05-20
+            judge_model: google_gemini-2.5-flash
             judge_prompt: |-
               Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
               First, I will give examples of each grade, and then you will grade a new example.