diff --git a/evaluation/benchmarks/README.md b/evaluation/benchmarks/README.md
index 47d8086e..a6d9ffe6 100644
--- a/evaluation/benchmarks/README.md
+++ b/evaluation/benchmarks/README.md
@@ -6,7 +6,7 @@
 We currently support the following datasets:
 - [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
 - [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
 - [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
-- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/simonjegou/LongBench-v2))
 
 Please refer to the README of each dataset for more information on how the Hugging Face dataset was generated.
diff --git a/evaluation/benchmarks/longbenchv2/README.md b/evaluation/benchmarks/longbenchv2/README.md
index a6b12ff8..62448dcf 100644
--- a/evaluation/benchmarks/longbenchv2/README.md
+++ b/evaluation/benchmarks/longbenchv2/README.md
@@ -1,7 +1,7 @@
-# longbench dataset
+# LongBench-v2 dataset
 
-[longbench-v2](https://github.com/THUDM/LongBench).
+[LongBench-v2](https://github.com/THUDM/LongBench).
 
 ## Create Hugging Face dataset
 
-The processed Hugging Face dataset for longbench can be found [here](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
\ No newline at end of file
+The processed Hugging Face dataset for LongBench-v2 can be found [here](https://huggingface.co/datasets/simonjegou/LongBench-v2). To reproduce this dataset, simply run the `create_huggingface_dataset.py` script.
\ No newline at end of file
diff --git a/evaluation/benchmarks/longbenchv2/calculate_metrics.py b/evaluation/benchmarks/longbenchv2/calculate_metrics.py
index f85eebb3..9ce24940 100644
--- a/evaluation/benchmarks/longbenchv2/calculate_metrics.py
+++ b/evaluation/benchmarks/longbenchv2/calculate_metrics.py
@@ -1,66 +1,18 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-import re
-
-def extract_answer(response):
-    response = response.replace("*", "")
-    match = re.search(r"The correct answer is \(([A-D])\)", response)
-    if match:
-        return match.group(1)
-    else:
-        match = re.search(r"The correct answer is ([A-D])", response)
-        if match:
-            return match.group(1)
-        else:
-            return None
+
+
+def score(predicted_answer, expected_answer):
+    # From https://github.com/THUDM/LongBench/blob/main/pred.py (extract_answer function)
+    predicted_answer = predicted_answer.replace("*", "")
+    r1 = f"The correct answer is ({expected_answer})" in predicted_answer
+    r2 = f"The correct answer is {expected_answer}" in predicted_answer
+    return r1 or r2
 
 
 def calculate_metrics(df):
-    predictions = df["predicted_answer"].tolist()
-    answers = df["answer"].tolist()
-    lengths = df["length"].tolist()
-    difficulties = df["difficulty"].tolist()
-    return scorer(predictions, answers, lengths, difficulties)
-
-
-def scorer(predictions, answers, lengths, difficulties):
-    compensated = False
-    easy, hard, short, medium, long = 0, 0, 0, 0, 0
-    easy_acc, hard_acc, short_acc, medium_acc, long_acc = 0, 0, 0, 0, 0
-    for pred, answer, length, difficulty in zip(predictions, answers, lengths, difficulties):
-        acc = int(extract_answer(pred) == answer)
-        if compensated and pred["pred"] is None:
-            acc = 0.25  # type:ignore[assignment]
-        if difficulty == "easy":
-            easy += 1
-            easy_acc += acc
-        else:
-            hard += 1
-            hard_acc += acc
-
-        if length == "short":
-            short += 1
-            short_acc += acc
-        elif length == "medium":
-            medium += 1
-            medium_acc += acc
-        else:
-            long += 1
-            long_acc += acc
-    scores = ["Overall\tEasy\tHard\tShort\tMedium\tLong"]
-    scores.append(
-        str(round(100 * (easy_acc + hard_acc) / len(predictions), 1))
-        + "\t"
-        + str(round(100 * easy_acc / easy, 1))
-        + "\t"
-        + str(round(100 * hard_acc / hard, 1))
-        + "\t"
-        + str(round(100 * short_acc / short, 1))
-        + "\t"
-        + str(round(100 * medium_acc / medium, 1))
-        + "\t"
-        + str(round(100 * long_acc / long, 1))
-    )
-    return scores
+    df["score"] = df.apply(lambda row: score(row["predicted_answer"], row["answer"]), axis=1)
+    metrics = {"average": df["score"].mean()}
+    metrics.update(df.groupby("difficulty")["score"].mean())
+    metrics.update(df.groupby("length")["score"].mean())
+    return metrics
diff --git a/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py b/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
index 95c4b45a..c209f8d4 100644
--- a/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
+++ b/evaluation/benchmarks/longbenchv2/create_huggingface_dataset.py
@@ -1,55 +1,38 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
-
 from datasets import Dataset, load_dataset
 
-# yarn_mistral_templates from: https://github.com/THUDM/LongBench/blob/main/pred.py
-context_prefix = {
-    "0shot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "cot": "Please read the following text and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "rag": "Please read the following retrieved text chunks and answer the question below.\n\n<text>\n{context}\n</text>\n\n",
-    "nocontext": "",
-}
+# Templates from https://github.com/THUDM/LongBench/blob/main/prompts/0shot.txt
+context_template = """Please read the following text and answer the question below.
+
+<text>
+{context}
+</text>
+
 
-question_template = {
-    "0shot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "cot": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "rag": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-    "nocontext": "What is the correct answer to this question: {question}\nChoices:\n(A) {C_A}\n(B) {C_B}\n(C) {C_C}\n(D) {C_D}\n\n",
-}
+"""
 
-answer_prefix = {
-    "0shot": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "cot": 'Format your response as follows: "The correct answer is (insert answer here)".\n\nLet’s think step by step:',
-    "rag": 'Format your response as follows: "The correct answer is (insert answer here)".',
-    "nocontext": 'What is the single, most likely answer choice? Format your response as follows: "The correct answer is (insert answer here)".',
-}
+question_template = """What is the correct answer to this question: {question}
+Choices:
+(A) {A}
+(B) {B}
+(C) {C}
+(D) {D}
 
-DATA_NAME_TO_MAX_NEW_TOKENS = {"0shot": 128, "cot": 1024}
+
+Format your response as follows: "The correct answer is (insert answer here)"."""
 
 # Longbench-v2
-for task in ["0shot", "cot"]:
-    dataset = load_dataset("THUDM/LongBench-v2", split="train")
-    dataset = dataset.map(lambda x: {"context": context_prefix[task].format(context=x["context"].strip())})
-    dataset = dataset.map(
-        lambda x: {
-            "question": question_template[task].format(
-                question=x["question"].strip(),
-                C_A=x["choice_A"].strip(),
-                C_B=x["choice_B"].strip(),
-                C_C=x["choice_C"].strip(),
-                C_D=x["choice_D"].strip(),
-            )
-        }
-    )
-    df = dataset.to_pandas()
-    df["answer_prefix"] = answer_prefix.get(task, "")
-    # df = df[["context", "question", "answer_prefix", "answers", "all_classes"]]
-    df["task"] = task
-    # be a bit more generous with token generation to avoid any cut-offs
-    df["max_new_tokens"] = DATA_NAME_TO_MAX_NEW_TOKENS[task] + 20
-
-    # Push to hub
-    dataset = Dataset.from_pandas(df)
-    dataset.push_to_hub("Xnhyacinth/LongBench-v2", config_name=task, split="test")
+df = load_dataset("THUDM/LongBench-v2", split="train").to_pandas()
+df["context"] = df["context"].apply(lambda x: context_template.format(context=x))
+df["question"] = df.apply(
+    lambda row: question_template.format(
+        question=row["question"],
+        A=row["choice_A"],
+        B=row["choice_B"],
+        C=row["choice_C"],
+        D=row["choice_D"],
+    ),
+    axis=1,
+)
+df["max_new_tokens"] = 16
+df["answer_prefix"] = ""
+Dataset.from_pandas(df).push_to_hub("simonjegou/LongBench-v2", config_name="0shot", split="test")
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py
index 05579c6a..7501ad5f 100644
--- a/evaluation/evaluate_registry.py
+++ b/evaluation/evaluate_registry.py
@@ -45,7 +45,7 @@
     "infinitebench": "MaxJeblick/InfiniteBench",
     "longbench": "Xnhyacinth/LongBench",
     "longbench-e": "Xnhyacinth/LongBench",
-    "longbench-v2": "Xnhyacinth/LongBench-v2",
+    "longbench-v2": "simonjegou/LongBench-v2",
     "needle_in_haystack": "alessiodevoto/paul_graham_essays",
     # Datasets used to be used for decoding compression
     "aime25": "alessiodevoto/aime25",
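
Reviewer note: a minimal sketch of how the new scoring path behaves end to end on a toy DataFrame. It assumes the patched module is importable as evaluation.benchmarks.longbenchv2.calculate_metrics (repo root on PYTHONPATH); the rows are made up for illustration.

import pandas as pd

from evaluation.benchmarks.longbenchv2.calculate_metrics import calculate_metrics

# Toy predictions in the response format the prompt asks for.
df = pd.DataFrame(
    {
        "predicted_answer": [
            "The correct answer is (A)",
            "**The correct answer is B**",  # score() strips the markdown asterisks first
            "I am not sure.",  # no parsable answer, counts as wrong
        ],
        "answer": ["A", "B", "C"],
        "difficulty": ["easy", "hard", "hard"],
        "length": ["short", "medium", "long"],
    }
)

print(calculate_metrics(df))
# {'average': 0.666..., 'easy': 1.0, 'hard': 0.5, 'short': 1.0, 'medium': 1.0, 'long': 0.0}

Unlike the old scorer, which returned a tab-separated string, the returned dict keys the overall accuracy under "average" and one entry per difficulty and length bucket.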
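
And a quick check of the artifact the new create_huggingface_dataset.py produces. The "0shot" config and "test" split come from the push_to_hub call above; that a runner builds the prompt as context + question + answer_prefix is an assumption about the evaluation harness, not something this diff shows.

from datasets import load_dataset

ds = load_dataset("simonjegou/LongBench-v2", "0shot", split="test")
sample = ds[0]

# Columns written by the script above.
prompt = sample["context"] + sample["question"] + sample["answer_prefix"]
print(prompt[:300])
print(sample["max_new_tokens"])  # 16 for every row, enough for 'The correct answer is (X)'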