diff --git a/evaluation/benchmarks/README.md b/evaluation/benchmarks/README.md
index 47d8086e..a6d9ffe6 100644
--- a/evaluation/benchmarks/README.md
+++ b/evaluation/benchmarks/README.md
@@ -6,7 +6,7 @@ We currently support the following datasets:
- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
-- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [longbench-v2](longbenchv2/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/LongBench-v2))
Please refer to the README of each dataset for more information on how the Hugging Face dataset was generated.
diff --git a/evaluation/benchmarks/longbenchv2/README.md b/evaluation/benchmarks/longbenchv2/README.md
index a6b12ff8..62448dcf 100644
--- a/evaluation/benchmarks/longbenchv2/README.md
+++ b/evaluation/benchmarks/longbenchv2/README.md
@@ -1,7 +1,7 @@
-# longbench dataset
+# LongBench-v2 dataset
-[longbench-v2](https://github.com/THUDM/LongBench).
+[LongBench-v2](https://github.com/THUDM/LongBench).
## Create Hugging Face dataset
-The processed Hugging Face dataset for longbench can be found [here](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
\ No newline at end of file
+The processed Hugging Face dataset for LongBench-v2 can be found [here](https://huggingface.co/datasets/simonjegou/LongBench-v2). To reproduce it, run the `create_huggingface_dataset.py` script.
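+
+To load the processed dataset afterwards (a minimal sketch, assuming the standard `datasets` API and the `0shot` config pushed by the script):
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("simonjegou/LongBench-v2", "0shot", split="test")
+```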
diff --git a/evaluation/benchmarks/longbenchv2/calculate_metrics.py b/evaluation/benchmarks/longbenchv2/calculate_metrics.py
index f85eebb3..9ce24940 100644
--- a/evaluation/benchmarks/longbenchv2/calculate_metrics.py
+++ b/evaluation/benchmarks/longbenchv2/calculate_metrics.py
@@ -1,66 +1,18 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
-import re
-
-def extract_answer(response):
- response = response.replace("*", "")
- match = re.search(r"The correct answer is \(([A-D])\)", response)
- if match:
- return match.group(1)
- else:
- match = re.search(r"The correct answer is ([A-D])", response)
- if match:
- return match.group(1)
- else:
- return None
+def score(predicted_answer, expected_answer):
+ # From https://github.com/THUDM/LongBench/blob/main/pred.py (extract_answer function)
+ predicted_answer = predicted_answer.replace("*", "")
+ r1 = f"The correct answer is ({expected_answer})" in predicted_answer
+ r2 = f"The correct answer is {expected_answer}" in predicted_answer
+ return r1 or r2
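+
+
+# Examples (hypothetical model outputs):
+#   score("The correct answer is (B).", "B")   -> True
+#   score("**The correct answer is B**", "B")  -> True (asterisks are stripped first)
+#   score("B seems most plausible to me", "B") -> False (required format not followed)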
def calculate_metrics(df):
- predictions = df["predicted_answer"].tolist()
- answers = df["answer"].tolist()
- lengths = df["length"].tolist()
- difficulties = df["difficulty"].tolist()
- return scorer(predictions, answers, lengths, difficulties)
-
-
-def scorer(predictions, answers, lengths, difficulties):
- compensated = False
- easy, hard, short, medium, long = 0, 0, 0, 0, 0
- easy_acc, hard_acc, short_acc, medium_acc, long_acc = 0, 0, 0, 0, 0
- for pred, answer, length, difficulty in zip(predictions, answers, lengths, difficulties):
- acc = int(extract_answer(pred) == answer)
- if compensated and pred["pred"] is None:
- acc = 0.25 # type:ignore[assignment]
- if difficulty == "easy":
- easy += 1
- easy_acc += acc
- else:
- hard += 1
- hard_acc += acc
-
- if length == "short":
- short += 1
- short_acc += acc
- elif length == "medium":
- medium += 1
- medium_acc += acc
- else:
- long += 1
- long_acc += acc
- scores = ["Overall\tEasy\tHard\tShort\tMedium\tLong"]
- scores.append(
- str(round(100 * (easy_acc + hard_acc) / len(predictions), 1))
- + "\t"
- + str(round(100 * easy_acc / easy, 1))
- + "\t"
- + str(round(100 * hard_acc / hard, 1))
- + "\t"
- + str(round(100 * short_acc / short, 1))
- + "\t"
- + str(round(100 * medium_acc / medium, 1))
- + "\t"
- + str(round(100 * long_acc / long, 1))
- )
- return scores
+ df["score"] = df.apply(lambda row: score(row["predicted_answer"], row["answer"]), axis=1)
+ metrics = {"average": df["score"].mean()}
+ metrics.update(df.groupby("difficulty")["score"].mean())
+ metrics.update(df.groupby("length")["score"].mean())
+ return metrics
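+
+
+# Example usage (hypothetical data):
+#   import pandas as pd
+#   df = pd.DataFrame({
+#       "predicted_answer": ["The correct answer is (A)", "The correct answer is (C)"],
+#       "answer": ["A", "B"],
+#       "difficulty": ["easy", "hard"],
+#       "length": ["short", "long"],
+#   })
+#   calculate_metrics(df)  # {'average': 0.5, 'easy': 1.0, 'hard': 0.0, 'long': 0.0, 'short': 1.0}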
diff --git a/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py b/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
index 95c4b45a..c209f8d4 100644
--- a/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
+++ b/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
@@ -1,55 +1,38 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
-
from datasets import Dataset, load_dataset
-# yarn_mistral_templates from: https://github.com/THUDM/LongBench/blob/main/pred.py
-context_prefix = {
- "0shot": "Please read the following text and answer the question below.\n\n\n{context}\n\n\n",
- "cot": "Please read the following text and answer the question below.\n\n\n{context}\n\n\n",
- "rag": "Please read the following retrieved text chunks and answer the question below.\n\n\n{context}\n\n\n",
- "nocontext": "",
-}
+# Templates from https://github.com/THUDM/LongBench/blob/main/prompts/0shot.txt
+context_template = """Please read the following text and answer the question below.
+
+{context}
+
-question_template = {
- "0shot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
- "cot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
- "rag": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
- "nocontext": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-}
+"""
-answer_prefix = {
- "0shot": 'Format your response as follows: "The correct answer is (insert answer here)".',
- "cot": 'Format your response as follows: "The correct answer is (insert answer here)".\n\nLet’s think step by step:',
- "rag": 'Format your response as follows: "The correct answer is (insert answer here)".',
- "nocontext": 'What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".',
-}
+question_template = """What is the correct answer to this question: {question}
+Choices:
+(A) {A}
+(B) {B}
+(C) {C}
+(D) {D}
-DATA_NAME_TO_MAX_NEW_TOKENS = {"0shot": 128, "cot": 1024}
+
+Format your response as follows: "The correct answer is (insert answer here)"."""
# Longbench-v2
-for task in ["0shot", "cot"]:
- dataset = load_dataset("THUDM/LongBench-v2", split="train")
- dataset = dataset.map(lambda x: {"context": context_prefix[task].format(context=x["context"].strip())})
- dataset = dataset.map(
- lambda x: {
- "question": question_template[task].format(
- question=x["question"].strip(),
- C_A=x["choice_A"].strip(),
- C_B=x["choice_B"].strip(),
- C_C=x["choice_C"].strip(),
- C_D=x["choice_D"].strip(),
- )
- }
- )
- df = dataset.to_pandas()
- df["answer_prefix"] = answer_prefix.get(task, "")
- # df = df[["context", "question", "answer_prefix", "answers", "all_classes"]]
- df["task"] = task
- # be a bit more generous with token generation to avoid any cut-offs
- df["max_new_tokens"] = DATA_NAME_TO_MAX_NEW_TOKENS[task] + 20
-
- # Push to hub
- dataset = Dataset.from_pandas(df)
- dataset.push_to_hub("Xnhyacinth/LongBench-v2", config_name=task, split="test")
+df = load_dataset("THUDM/LongBench-v2", split="train").to_pandas()
+df["context"] = df["context"].apply(lambda x: context_template.format(context=x))
+df["question"] = df.apply(
+ lambda row: question_template.format(
+ question=row["question"],
+ A=row["choice_A"],
+ B=row["choice_B"],
+ C=row["choice_C"],
+ D=row["choice_D"],
+ ),
+ axis=1,
+)
+df["max_new_tokens"] = 16
+df["answer_prefix"] = ""
+Dataset.from_pandas(df).push_to_hub("simonjegou/LongBench-v2", config_name="0shot", split="test")
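+
+# Sanity check (hypothetical): reload the pushed config and print the first prompt
+# from datasets import load_dataset
+# print(load_dataset("simonjegou/LongBench-v2", "0shot", split="test")[0]["question"])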
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py
index 05579c6a..7501ad5f 100644
--- a/evaluation/evaluate_registry.py
+++ b/evaluation/evaluate_registry.py
@@ -45,7 +45,7 @@
"infinitebench": "MaxJeblick/InfiniteBench",
"longbench": "Xnhyacinth/LongBench",
"longbench-e": "Xnhyacinth/LongBench",
- "longbench-v2": "Xnhyacinth/LongBench-v2",
+ "longbench-v2": "simonjegou/LongBench-v2",
"needle_in_haystack": "alessiodevoto/paul_graham_essays",
# Datasets used to be used for decoding compression
"aime25": "alessiodevoto/aime25",