2 changes: 1 addition & 1 deletion evaluation/benchmarks/README.md
@@ -6,7 +6,7 @@ We currently support the following datasets:
 - [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
 - [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
 - [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
-- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/simonjegou/LongBench-v2))

 Please refer to the README of each dataset for more information on how the Hugging Face dataset was generated.
6 changes: 3 additions & 3 deletions evaluation/benchmarks/longbenchv2/README.md
@@ -1,7 +1,7 @@
-# longbench dataset
+# LongBench-v2 dataset

-[longbench-v2](https://github.com/THUDM/LongBench).
+[longbench](https://github.com/THUDM/LongBench).

 ## Create Hugging Face dataset

-The processed Hugging Face dataset for longbench can be found [here](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
+The processed Hugging Face dataset for LongBench-v2 can be found [here](https://huggingface.co/datasets/simonjegou/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
70 changes: 11 additions & 59 deletions evaluation/benchmarks/longbenchv2/calculate_metrics.py
@@ -1,66 +1,18 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-import re
-
-
-def extract_answer(response):
-    response = response.replace("*", "")
-    match = re.search(r"The correct answer is \(([A-D])\)", response)
-    if match:
-        return match.group(1)
-    else:
-        match = re.search(r"The correct answer is ([A-D])", response)
-        if match:
-            return match.group(1)
-        else:
-            return None
+def score(predicted_answer, expected_answer):
+    # From https://github.com/THUDM/LongBench/blob/main/pred.py (extract_answer function)
+    predicted_answer = predicted_answer.replace("*", "")
+    r1 = f"The correct answer is ({expected_answer})" in predicted_answer
+    r2 = f"The correct answer is {expected_answer}" in predicted_answer
+    return r1 or r2


 def calculate_metrics(df):
-    predictions = df["predicted_answer"].tolist()
-    answers = df["answer"].tolist()
-    lengths = df["length"].tolist()
-    difficulties = df["difficulty"].tolist()
-    return scorer(predictions, answers, lengths, difficulties)
-
-
-def scorer(predictions, answers, lengths, difficulties):
-    compensated = False
-    easy, hard, short, medium, long = 0, 0, 0, 0, 0
-    easy_acc, hard_acc, short_acc, medium_acc, long_acc = 0, 0, 0, 0, 0
-    for pred, answer, length, difficulty in zip(predictions, answers, lengths, difficulties):
-        acc = int(extract_answer(pred) == answer)
-        if compensated and pred["pred"] is None:
-            acc = 0.25  # type:ignore[assignment]
-        if difficulty == "easy":
-            easy += 1
-            easy_acc += acc
-        else:
-            hard += 1
-            hard_acc += acc
-
-        if length == "short":
-            short += 1
-            short_acc += acc
-        elif length == "medium":
-            medium += 1
-            medium_acc += acc
-        else:
-            long += 1
-            long_acc += acc
-    scores = ["Overall\tEasy\tHard\tShort\tMedium\tLong"]
-    scores.append(
-        str(round(100 * (easy_acc + hard_acc) / len(predictions), 1))
-        + "\t"
-        + str(round(100 * easy_acc / easy, 1))
-        + "\t"
-        + str(round(100 * hard_acc / hard, 1))
-        + "\t"
-        + str(round(100 * short_acc / short, 1))
-        + "\t"
-        + str(round(100 * medium_acc / medium, 1))
-        + "\t"
-        + str(round(100 * long_acc / long, 1))
-    )
-    return scores
+    df["score"] = df.apply(lambda row: score(row["predicted_answer"], row["answer"]), axis=1)
+    metrics = {"average": df["score"].mean()}
+    metrics.update(df.groupby("difficulty")["score"].mean())
+    metrics.update(df.groupby("length")["score"].mean())
+    return metrics
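
For reference, a minimal sketch of how the new scoring path behaves on toy data (the DataFrame below is illustrative, and the import path simply mirrors the file location in this PR):

```python
import pandas as pd

# Import path assumed from the file location in this PR.
from evaluation.benchmarks.longbenchv2.calculate_metrics import calculate_metrics

# Toy predictions: two responses contain the expected letter, one does not.
df = pd.DataFrame(
    {
        "predicted_answer": [
            "The correct answer is (A)",
            "**The correct answer is B**",  # asterisks are stripped before matching
            "The correct answer is (D)",
        ],
        "answer": ["A", "B", "C"],
        "difficulty": ["easy", "hard", "hard"],
        "length": ["short", "medium", "long"],
    }
)

print(calculate_metrics(df))
# average ≈ 0.67; easy 1.0, hard 0.5; short 1.0, medium 1.0, long 0.0
```

Unlike the removed `scorer`, which tabulated counts by hand and returned a tab-separated string, this returns a flat dict of accuracies keyed by the difficulty and length buckets.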
73 changes: 28 additions & 45 deletions evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
@@ -1,55 +1,38 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0


 from datasets import Dataset, load_dataset

-# yarn_mistral_templates from: https://github.com/THUDM/LongBench/blob/main/pred.py
-context_prefix = {
-    "0shot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "cot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "rag": "Please read the following retrieved text chunks and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "nocontext": "",
-}
+# Templates from https://github.com/THUDM/LongBench/blob/main/prompts/0shot.txt
+context_template = """Please read the following text and answer the question below.
+<text>
+{context}
+</text>

-question_template = {
-    "0shot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "cot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "rag": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "nocontext": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-}
+"""

-answer_prefix = {
-    "0shot": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "cot": 'Format your response as follows: "The correct answer is (insert answer here)".\n\nLet’s think step by step:',
-    "rag": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "nocontext": 'What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".',
-}
+question_template = """What is the correct answer to this question: {question}
+Choices:
+(A) {A}
+(B) {B}
+(C) {C}
+(D) {D}

-DATA_NAME_TO_MAX_NEW_TOKENS = {"0shot": 128, "cot": 1024}
+Format your response as follows: "The correct answer is (insert answer here)."""

-# Longbench-v2
-for task in ["0shot", "cot"]:
-    dataset = load_dataset("THUDM/LongBench-v2", split="train")
-    dataset = dataset.map(lambda x: {"context": context_prefix[task].format(context=x["context"].strip())})
-    dataset = dataset.map(
-        lambda x: {
-            "question": question_template[task].format(
-                question=x["question"].strip(),
-                C_A=x["choice_A"].strip(),
-                C_B=x["choice_B"].strip(),
-                C_C=x["choice_C"].strip(),
-                C_D=x["choice_D"].strip(),
-            )
-        }
-    )
-    df = dataset.to_pandas()
-    df["answer_prefix"] = answer_prefix.get(task, "")
-    # df = df[["context", "question", "answer_prefix", "answers", "all_classes"]]
-    df["task"] = task
-    # be a bit more generous with token generation to avoid any cut-offs
-    df["max_new_tokens"] = DATA_NAME_TO_MAX_NEW_TOKENS[task] + 20
-
-    # Push to hub
-    dataset = Dataset.from_pandas(df)
-    dataset.push_to_hub("Xnhyacinth/LongBench-v2", config_name=task, split="test")
+df = load_dataset("THUDM/LongBench-v2", split="train").to_pandas()
+df["context"] = df["context"].apply(lambda x: context_template.format(context=x))
+df["question"] = df.apply(
+    lambda row: question_template.format(
+        question=row["question"],
+        A=row["choice_A"],
+        B=row["choice_B"],
+        C=row["choice_C"],
+        D=row["choice_D"],
+    ),
+    axis=1,
+)
+df["max_new_tokens"] = 16
+df["answer_prefix"] = ""
+Dataset.from_pandas(df).push_to_hub("simonjegou/LongBench-v2", config_name="0shot", split="test")
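
For reference, a quick sketch of the prompt a single row yields under the new templates (toy values; assumes the evaluation harness concatenates the `context` and `question` columns, with the empty `answer_prefix` appended last):

```python
# Assumes context_template and question_template as defined in the script above.
row = {
    "context": "The sky is blue.",
    "question": "What color is the sky?",
    "choice_A": "Blue",
    "choice_B": "Green",
    "choice_C": "Red",
    "choice_D": "Yellow",
}

prompt = context_template.format(context=row["context"]) + question_template.format(
    question=row["question"],
    A=row["choice_A"],
    B=row["choice_B"],
    C=row["choice_C"],
    D=row["choice_D"],
)
print(prompt)
# Please read the following text and answer the question below.
# <text>
# The sky is blue.
# </text>
#
# What is the correct answer to this question: What color is the sky?
# Choices:
# (A) Blue
# (B) Green
# (C) Red
# (D) Yellow
# ...
```

The flat `max_new_tokens = 16` budget suits this format, since the expected completion is just the short answer sentence, in contrast to the removed cot config's 1024-token budget.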
2 changes: 1 addition & 1 deletion evaluation/evaluate_registry.py
@@ -45,7 +45,7 @@
"infinitebench": "MaxJeblick/InfiniteBench",
"longbench": "Xnhyacinth/LongBench",
"longbench-e": "Xnhyacinth/LongBench",
"longbench-v2": "Xnhyacinth/LongBench-v2",
"longbench-v2": "simonjegou/LongBench-v2",
"needle_in_haystack": "alessiodevoto/paul_graham_essays",
# Datasets used to be used for decoding compression
"aime25": "alessiodevoto/aime25",
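
For reference, a sketch of how the updated registry entry would be consumed at evaluation time (`DATASET_REGISTRY` is an illustrative name; only the key/value pairs appear in this diff). The dataset was pushed above with `config_name="0shot"` and `split="test"`, so loading mirrors that:

```python
from datasets import load_dataset

# Illustrative registry mapping benchmark names to Hugging Face dataset repos;
# the real mapping lives in evaluation/evaluate_registry.py.
DATASET_REGISTRY = {
    "longbench-v2": "simonjegou/LongBench-v2",
}

dataset = load_dataset(DATASET_REGISTRY["longbench-v2"], "0shot", split="test")
print(dataset.column_names)  # expect context, question, answer_prefix, max_new_tokens, among others
```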