2 changes: 1 addition & 1 deletion evaluation/benchmarks/README.md
@@ -6,7 +6,7 @@ We currently support the following datasets:
 - [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
 - [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
 - [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
-- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/simonjegou/LongBench-v2))

 Please refer to the README of each dataset for more information on how the Hugging Face dataset was generated.
6 changes: 3 additions & 3 deletions evaluation/benchmarks/longbenchv2/README.md
@@ -1,7 +1,7 @@
-# longbench dataset
+# LongBench-v2 dataset

-[longbench-v2](https://github.com/THUDM/LongBench).
+[longbench](https://github.com/THUDM/LongBench).

 ## Create Hugging Face dataset

-The processed Hugging Face dataset for longbench can be found [here](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
+The processed Hugging Face dataset for LongBench-v2 can be found [here](https://huggingface.co/datasets/simonjegou/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
70 changes: 11 additions & 59 deletions evaluation/benchmarks/longbenchv2/calculate_metrics.py
@@ -1,66 +1,18 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-import re
-
-
-def extract_answer(response):
-    response = response.replace("*", "")
-    match = re.search(r"The correct answer is \(([A-D])\)", response)
-    if match:
-        return match.group(1)
-    else:
-        match = re.search(r"The correct answer is ([A-D])", response)
-        if match:
-            return match.group(1)
-        else:
-            return None
+def score(predicted_answer, expected_answer):
+    # From https://github.com/THUDM/LongBench/blob/main/pred.py (extract_answer function)
+    predicted_answer = predicted_answer.replace("*", "")
+    r1 = f"The correct answer is ({expected_answer})" in predicted_answer
+    r2 = f"The correct answer is {expected_answer}" in predicted_answer
+    return r1 or r2


 def calculate_metrics(df):
-    predictions = df["predicted_answer"].tolist()
-    answers = df["answer"].tolist()
-    lengths = df["length"].tolist()
-    difficulties = df["difficulty"].tolist()
-    return scorer(predictions, answers, lengths, difficulties)
-
-
-def scorer(predictions, answers, lengths, difficulties):
-    compensated = False
-    easy, hard, short, medium, long = 0, 0, 0, 0, 0
-    easy_acc, hard_acc, short_acc, medium_acc, long_acc = 0, 0, 0, 0, 0
-    for pred, answer, length, difficulty in zip(predictions, answers, lengths, difficulties):
-        acc = int(extract_answer(pred) == answer)
-        if compensated and pred["pred"] is None:
-            acc = 0.25  # type:ignore[assignment]
-        if difficulty == "easy":
-            easy += 1
-            easy_acc += acc
-        else:
-            hard += 1
-            hard_acc += acc
-
-        if length == "short":
-            short += 1
-            short_acc += acc
-        elif length == "medium":
-            medium += 1
-            medium_acc += acc
-        else:
-            long += 1
-            long_acc += acc
-    scores = ["Overall\tEasy\tHard\tShort\tMedium\tLong"]
-    scores.append(
-        str(round(100 * (easy_acc + hard_acc) / len(predictions), 1))
-        + "\t"
-        + str(round(100 * easy_acc / easy, 1))
-        + "\t"
-        + str(round(100 * hard_acc / hard, 1))
-        + "\t"
-        + str(round(100 * short_acc / short, 1))
-        + "\t"
-        + str(round(100 * medium_acc / medium, 1))
-        + "\t"
-        + str(round(100 * long_acc / long, 1))
-    )
-    return scores
+    df["score"] = df.apply(lambda row: score(row["predicted_answer"], row["answer"]), axis=1)
+    metrics = {"average": df["score"].mean()}
+    metrics.update(df.groupby("difficulty")["score"].mean())
+    metrics.update(df.groupby("length")["score"].mean())
+    return metrics
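
For reference, a minimal sketch of how the new scoring path behaves on toy data (the DataFrame below is illustrative, and the import path simply mirrors the file location in this PR):

```python
import pandas as pd

# Import path assumed from the file location in this PR.
from evaluation.benchmarks.longbenchv2.calculate_metrics import calculate_metrics

# Toy predictions: two responses contain the expected letter, one does not.
df = pd.DataFrame(
    {
        "predicted_answer": [
            "The correct answer is (A)",
            "**The correct answer is B**",  # asterisks are stripped before matching
            "The correct answer is (D)",
        ],
        "answer": ["A", "B", "C"],
        "difficulty": ["easy", "hard", "hard"],
        "length": ["short", "medium", "long"],
    }
)

print(calculate_metrics(df))
# average ≈ 0.67; easy 1.0, hard 0.5; short 1.0, medium 1.0, long 0.0
```

Unlike the removed `scorer`, which tabulated counts by hand and returned a tab-separated string, this returns a flat dict of accuracies keyed by the difficulty and length buckets.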
73 changes: 28 additions & 45 deletions evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
@@ -1,55 +1,38 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0


 from datasets import Dataset, load_dataset

-# yarn_mistral_templates from: https://github.com/THUDM/LongBench/blob/main/pred.py
-context_prefix = {
-    "0shot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "cot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "rag": "Please read the following retrieved text chunks and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "nocontext": "",
-}
+# Templates from https://github.com/THUDM/LongBench/blob/main/prompts/0shot.txt
+context_template = """Please read the following text and answer the question below.
+<text>
+{context}
+</text>

-question_template = {
-    "0shot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "cot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "rag": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "nocontext": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-}
+"""

-answer_prefix = {
-    "0shot": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "cot": 'Format your response as follows: "The correct answer is (insert answer here)".\n\nLet’s think step by step:',
-    "rag": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "nocontext": 'What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".',
-}
+question_template = """What is the correct answer to this question: {question}
+Choices:
+(A) {A}
+(B) {B}
+(C) {C}
+(D) {D}

-DATA_NAME_TO_MAX_NEW_TOKENS = {"0shot": 128, "cot": 1024}
+Format your response as follows: "The correct answer is (insert answer here)."""

-# Longbench-v2
-for task in ["0shot", "cot"]:
-    dataset = load_dataset("THUDM/LongBench-v2", split="train")
-    dataset = dataset.map(lambda x: {"context": context_prefix[task].format(context=x["context"].strip())})
-    dataset = dataset.map(
-        lambda x: {
-            "question": question_template[task].format(
-                question=x["question"].strip(),
-                C_A=x["choice_A"].strip(),
-                C_B=x["choice_B"].strip(),
-                C_C=x["choice_C"].strip(),
-                C_D=x["choice_D"].strip(),
-            )
-        }
-    )
-    df = dataset.to_pandas()
-    df["answer_prefix"] = answer_prefix.get(task, "")
-    # df = df[["context", "question", "answer_prefix", "answers", "all_classes"]]
-    df["task"] = task
-    # be a bit more generous with token generation to avoid any cut-offs
-    df["max_new_tokens"] = DATA_NAME_TO_MAX_NEW_TOKENS[task] + 20
-
-    # Push to hub
-    dataset = Dataset.from_pandas(df)
-    dataset.push_to_hub("Xnhyacinth/LongBench-v2", config_name=task, split="test")
+df = load_dataset("THUDM/LongBench-v2", split="train").to_pandas()
+df["context"] = df["context"].apply(lambda x: context_template.format(context=x))
+df["question"] = df.apply(
+    lambda row: question_template.format(
+        question=row["question"],
+        A=row["choice_A"],
+        B=row["choice_B"],
+        C=row["choice_C"],
+        D=row["choice_D"],
+    ),
+    axis=1,
+)
+df["max_new_tokens"] = 16
+df["answer_prefix"] = ""
+Dataset.from_pandas(df).push_to_hub("simonjegou/LongBench-v2", config_name="0shot", split="test")
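
For reference, a quick sketch of the prompt a single row yields under the new templates (toy values; assumes the evaluation harness concatenates the `context` and `question` columns, with the empty `answer_prefix` appended last):

```python
# Assumes context_template and question_template as defined in the script above.
row = {
    "context": "The sky is blue.",
    "question": "What color is the sky?",
    "choice_A": "Blue",
    "choice_B": "Green",
    "choice_C": "Red",
    "choice_D": "Yellow",
}

prompt = context_template.format(context=row["context"]) + question_template.format(
    question=row["question"],
    A=row["choice_A"],
    B=row["choice_B"],
    C=row["choice_C"],
    D=row["choice_D"],
)
print(prompt)
# Please read the following text and answer the question below.
# <text>
# The sky is blue.
# </text>
#
# What is the correct answer to this question: What color is the sky?
# Choices:
# (A) Blue
# (B) Green
# (C) Red
# (D) Yellow
# ...
```

The flat `max_new_tokens = 16` budget suits this format, since the expected completion is just the short answer sentence, in contrast to the removed cot config's 1024-token budget.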
2 changes: 1 addition & 1 deletion evaluation/evaluate_registry.py
@@ -45,7 +45,7 @@
"infinitebench": "MaxJeblick/InfiniteBench",
"longbench": "Xnhyacinth/LongBench",
"longbench-e": "Xnhyacinth/LongBench",
"longbench-v2": "Xnhyacinth/LongBench-v2",
"longbench-v2": "simonjegou/LongBench-v2",
"needle_in_haystack": "alessiodevoto/paul_graham_essays",
# Datasets used to be used for decoding compression
"aime25": "alessiodevoto/aime25",
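
For reference, a sketch of how the updated registry entry would be consumed at evaluation time (`DATASET_REGISTRY` is an illustrative name; only the key/value pairs appear in this diff). The dataset was pushed above with `config_name="0shot"` and `split="test"`, so loading mirrors that:

```python
from datasets import load_dataset

# Illustrative registry mapping benchmark names to Hugging Face dataset repos;
# the real mapping lives in evaluation/evaluate_registry.py.
DATASET_REGISTRY = {
    "longbench-v2": "simonjegou/LongBench-v2",
}

dataset = load_dataset(DATASET_REGISTRY["longbench-v2"], "0shot", split="test")
print(dataset.column_names)  # expect context, question, answer_prefix, max_new_tokens, among others
```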