From e2fe544604736c1ffa0c9078adb09f0eda69e422 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Thu, 21 Aug 2025 15:36:39 +0000 Subject: [PATCH 1/9] needle Signed-off-by: alessiodevoto --- .../benchmarks/needle_in_haystack/__init__.py | 0 .../needle_in_haystack/calculate_metrics.py | 14 ++++++++++++++ evaluation/evaluate.py | 15 ++++++++++++++- evaluation/evaluate_registry.py | 1 + 4 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 evaluation/benchmarks/needle_in_haystack/__init__.py create mode 100644 evaluation/benchmarks/needle_in_haystack/calculate_metrics.py diff --git a/evaluation/benchmarks/needle_in_haystack/__init__.py b/evaluation/benchmarks/needle_in_haystack/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py new file mode 100644 index 00000000..06545629 --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pandas as pd +from rouge_score import rouge_scorer + +scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True) + +def calculate_metrics(df: pd.DataFrame) -> dict: + scores = [] + for index, row in df.iterrows(): + score = scorer.score(row["needle"], row["predicted_answer"])["rouge1"].fmeasure * 10 + scores.append(score) + return {"rouge1": sum(scores) / len(scores)} \ No newline at end of file diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index e7777dfc..cf444d73 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -42,6 +42,7 @@ class EvaluationConfig: max_new_tokens: Optional[int] = None max_context_length: Optional[int] = None compress_questions: bool = False + needle_depth: Optional[int] = None # Output and logging output_dir: str = "./results" @@ -87,6 +88,9 @@ def __post_init__(self): # Initialize model_kwargs if None if self.model_kwargs is None: self.model_kwargs = {} + + if self.dataset == "needle_in_haystack": + assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack" def get_results_dir(self, output_dir: Path) -> Path: """ @@ -338,6 +342,15 @@ def _prepare_data_for_inference(self): Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. 
""" compress_questions = self.config.compress_questions + + # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it + if self.config.dataset == "needle_in_haystack": + tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"], add_special_tokens=False) + context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts + self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]) + needle_index = int(context_length * self.config.needle_depth / 100) + self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) + self.df["context"] = "This is a very long story book: " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " ." if isinstance(self.press, FinchPress): if not compress_questions: @@ -400,7 +413,7 @@ def _save_results(self, save_filename: Path): if save_filename.exists(): logger.warning(f"Results CSV already exists at {save_filename}. 
Overwriting.") - self.df[["predicted_answer", "compression_ratio"]].to_csv(str(save_filename), index=False) # type: ignore[index] + self.df[list(set(self.df.columns) - set(["context"]))].to_csv(str(save_filename), index=False) # type: ignore[index] logger.info(f"Results saved to {save_filename}") def _calculate_and_save_metrics(self, save_filename: Path): diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py index 89b26596..af753dd2 100644 --- a/evaluation/evaluate_registry.py +++ b/evaluation/evaluate_registry.py @@ -41,6 +41,7 @@ "longbench": "Xnhyacinth/LongBench", "longbench-e": "Xnhyacinth/LongBench", "longbench-v2": "Xnhyacinth/LongBench-v2", + "needle_in_haystack": "alessiodevoto/paul_graham_essays", } SCORER_REGISTRY = { From 4ce1c43a1907bfe3c26288e9c810c8618e007d30 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Thu, 21 Aug 2025 16:18:58 +0000 Subject: [PATCH 2/9] niah Signed-off-by: alessiodevoto --- .../benchmarks/needle_in_haystack/README.md | 3 ++ evaluation/evaluate.py | 30 ++++++++++++++----- evaluation/evaluate_registry.py | 2 ++ 3 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 evaluation/benchmarks/needle_in_haystack/README.md diff --git a/evaluation/benchmarks/needle_in_haystack/README.md b/evaluation/benchmarks/needle_in_haystack/README.md new file mode 100644 index 00000000..324e260e --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/README.md @@ -0,0 +1,3 @@ +# Needle in a Haystack +This benchmark evaluates a model's ability to retrieve a specific piece of information, the "needle," hidden within a large body of text, the "haystack." The test challenges a model's long-context understanding and its ability to maintain information accuracy over increasing document lengths. +We follow the vast majority of the literature and use [Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays) as the haystack. 
\ No newline at end of file diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index cf444d73..dd28655f 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -208,7 +208,8 @@ def _setup_deterministic_seeds(self): def _setup_logging(self): """Configures the logging level based on the config.""" log_level = self.config.log_level.upper() - logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") + logging.basicConfig(level=logging.INFO) + # logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") def _setup_directories(self) -> Path: """ @@ -299,7 +300,6 @@ def _setup_model_pipeline(self): logger.info(f"No device specified, auto-detected device: {device}") model_kwargs = self.config.model_kwargs or {} - print(model_kwargs) if isinstance(self.press, ObservedAttentionPress): model_kwargs["attn_implementation"] = "eager" logger.info("ObservedAttentionPress detected, setting attn_implementation to 'eager'.") @@ -337,20 +337,34 @@ def _setup_model_pipeline(self): logger.info("Model pipeline loaded.") + + def _insert_needle_in_haystack(self): + """ + Inserts the needle in the haystack at the depth specified in the config. + Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack + To insert the needle, we need to tokenize the context, insert the needle at the depth specified in the config, and then detokenize it. + """ + logger.info(f"Preparing dataset for inference with needle in haystack. 
Needle: {self.df['needle'][0]}") + tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False) + context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts + needle_index = int(context_length * self.config.needle_depth / 100) + # tokenize the context + self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]) + # insert the needle at the depth specified in the config + self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) + # detokenize the context + self.df["context"] = "This is a very long story book: " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " ." + def _prepare_data_for_inference(self): """ Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. """ compress_questions = self.config.compress_questions + # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it if self.config.dataset == "needle_in_haystack": - tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"], add_special_tokens=False) - context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts - self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]) - needle_index = int(context_length * self.config.needle_depth / 100) - self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) - self.df["context"] = "This is a very long story book: " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " ." 
+ self._insert_needle_in_haystack() if isinstance(self.press, FinchPress): if not compress_questions: diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py index af753dd2..4798bb39 100644 --- a/evaluation/evaluate_registry.py +++ b/evaluation/evaluate_registry.py @@ -8,6 +8,7 @@ from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer +from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer from kvpress import ( AdaKVPress, @@ -52,6 +53,7 @@ "longbench": longbench_scorer, "longbench-e": longbench_scorer_e, "longbench-v2": longbenchv2_scorer, + "needle_in_haystack": needle_in_haystack_scorer, } From 5e777b51e69c428840349a46622fc61e3841d469 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 07:13:09 +0000 Subject: [PATCH 3/9] rouge scorer Signed-off-by: alessiodevoto --- .../benchmarks/needle_in_haystack/__init__.py | 2 ++ .../needle_in_haystack/calculate_metrics.py | 11 ++++---- evaluation/evaluate.py | 26 ++++++++++++------- evaluation/evaluate_registry.py | 2 +- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/evaluation/benchmarks/needle_in_haystack/__init__.py b/evaluation/benchmarks/needle_in_haystack/__init__.py index e69de29b..9adfdfcd 100644 --- a/evaluation/benchmarks/needle_in_haystack/__init__.py +++ b/evaluation/benchmarks/needle_in_haystack/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py index 06545629..76c8a2e3 100644 --- a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py +++ b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py @@ -2,13 +2,14 @@ # SPDX-License-Identifier: Apache-2.0 import pandas as pd -from rouge_score import rouge_scorer +from rouge import Rouge -scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True) +scorer = Rouge() -def calculate_metrics(df: pd.DataFrame) -> dict: + +def calculate_metrics(df: pd.DataFrame) -> list[dict]: scores = [] for index, row in df.iterrows(): - score = scorer.score(row["needle"], row["predicted_answer"])["rouge1"].fmeasure * 10 + score = scorer.get_scores(row["needle"], row["predicted_answer"])[0] scores.append(score) - return {"rouge1": sum(scores) / len(scores)} \ No newline at end of file + return scores diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index dd28655f..d3688d55 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -88,9 +88,10 @@ def __post_init__(self): # Initialize model_kwargs if None if self.model_kwargs is None: self.model_kwargs = {} - + if self.dataset == "needle_in_haystack": assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack" + assert self.max_context_length is not None, "max_context_length must be set for needle_in_haystack" def get_results_dir(self, output_dir: Path) -> Path: """ @@ -125,6 +126,8 @@ def get_results_dir(self, output_dir: Path) -> Path: components.append("compressed_questions") if self.key_channel_compression_ratio is not None: components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}") + if self.needle_depth is not None: + components.append(f"needle_depth{self.needle_depth:.1f}") dir_name = "__".join(filter(None, components)) # Filter None/empty strings config_dir 
= output_dir / dir_name @@ -208,8 +211,7 @@ def _setup_deterministic_seeds(self): def _setup_logging(self): """Configures the logging level based on the config.""" log_level = self.config.log_level.upper() - logging.basicConfig(level=logging.INFO) - # logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") + logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") def _setup_directories(self) -> Path: """ @@ -337,7 +339,6 @@ def _setup_model_pipeline(self): logger.info("Model pipeline loaded.") - def _insert_needle_in_haystack(self): """ Inserts the needle in the haystack at the depth specified in the config. @@ -346,23 +347,28 @@ def _insert_needle_in_haystack(self): """ logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {self.df['needle'][0]}") tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False) - context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts + context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts needle_index = int(context_length * self.config.needle_depth / 100) # tokenize the context - self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]) + self.df["context"] = self.df["context"].apply( + lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length] + ) # insert the needle at the depth specified in the config self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) # detokenize the context - self.df["context"] = "This is a very long story book: " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " ." 
+ self.df["context"] = ( + "This is a very long story book: " + + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + + " ." + ) def _prepare_data_for_inference(self): """ Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. """ compress_questions = self.config.compress_questions - - - # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it + + # if we have needle in a haystack, we need to insert it in the context if self.config.dataset == "needle_in_haystack": self._insert_needle_in_haystack() diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py index 4798bb39..0dc4bd03 100644 --- a/evaluation/evaluate_registry.py +++ b/evaluation/evaluate_registry.py @@ -6,9 +6,9 @@ from benchmarks.longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e from benchmarks.longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer +from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer -from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer from kvpress import ( AdaKVPress, From a46c963a2220881a15158567b3765eb854993d64 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 08:59:49 +0000 Subject: [PATCH 4/9] add inits Signed-off-by: alessiodevoto --- evaluation/benchmarks/infinite_bench/__init__.py | 2 ++ evaluation/benchmarks/longbench/__init__.py | 2 ++ evaluation/benchmarks/longbenchv2/__init__.py | 2 ++ 3 files changed, 6 insertions(+) create mode 
100644 evaluation/benchmarks/infinite_bench/__init__.py create mode 100644 evaluation/benchmarks/longbench/__init__.py create mode 100644 evaluation/benchmarks/longbenchv2/__init__.py diff --git a/evaluation/benchmarks/infinite_bench/__init__.py b/evaluation/benchmarks/infinite_bench/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/infinite_bench/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/longbench/__init__.py b/evaluation/benchmarks/longbench/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/longbench/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/longbenchv2/__init__.py b/evaluation/benchmarks/longbenchv2/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/longbenchv2/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 From 74128750342939a04e2b624b8977ff2ca59e4039 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 10:54:56 +0000 Subject: [PATCH 5/9] refactor Signed-off-by: alessiodevoto --- .../benchmarks/needle_in_haystack/utils.py | 49 +++++++++++++++++++ evaluation/evaluate.py | 34 +++---------- evaluation/evaluate_config.yaml | 7 +-- tests/integration/test_ruler.py | 2 +- 4 files changed, 62 insertions(+), 30 deletions(-) create mode 100644 evaluation/benchmarks/needle_in_haystack/utils.py diff --git a/evaluation/benchmarks/needle_in_haystack/utils.py b/evaluation/benchmarks/needle_in_haystack/utils.py new file mode 100644 index 00000000..14717cde --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/utils.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import pandas as pd +from transformers import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + + +def insert_needle_in_haystack( + df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_context_length: int, needle_depth: int +) -> pd.DataFrame: + """ + Inserts the "needle" string into the "context" of each row in the DataFrame at a specified depth. + Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing at least the columns "context" and "needle". + tokenizer : PreTrainedTokenizer + The tokenizer used to encode and decode the context and needle. + max_context_length : int + The maximum allowed length (in tokens) for the context, including the needle. + needle_depth : int + The percentage (0-100) indicating how deep into the context the needle should be inserted. + + Returns + ------- + pd.DataFrame + The DataFrame with the "context" column modified to include the needle at the specified depth. 
+ """ + logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {df['needle'][0]}") + tokenized_needle = tokenizer.encode(df["needle"][0], add_special_tokens=False) + context_length = max_context_length - len(tokenized_needle) - 150 # account for system prompts + needle_index = int(context_length * needle_depth / 100) + # tokenize the context + df["context"] = df["context"].apply(lambda x: tokenizer.encode(x, add_special_tokens=False)[:context_length]) + # insert the needle at the depth specified in the config + df["context"] = df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) + # detokenize the context + df["context"] = ( + "This is a very long story book: " + + df["context"].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True)) + + " ." + ) + return df diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index d3688d55..cc44a9c8 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -10,9 +10,10 @@ from typing import Any, Dict, Optional import numpy as np -import pandas as pd # Import pandas for DataFrame type hinting +import pandas as pd import torch import yaml # type: ignore[import-untyped] +from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack from datasets import load_dataset from evaluate_registry import DATASET_REGISTRY, PRESS_REGISTRY, SCORER_REGISTRY from fire import Fire @@ -286,7 +287,9 @@ def _setup_press(self): press.compression_ratio = compression_ratio logger.info(f"Set {press.__class__.__name__} compression_ratio to {compression_ratio}") else: - logger.warning(f"Press {press.__class__.__name__} has no 'compression_ratio' attribute.") + logger.warning( + f"Press {press.__class__.__name__} has no 'compression_ratio' attribute. This is expected is you set `no_press`." 
+ ) self.press = press # Set the press info in the config for saving to YAML @@ -339,29 +342,6 @@ def _setup_model_pipeline(self): logger.info("Model pipeline loaded.") - def _insert_needle_in_haystack(self): - """ - Inserts the needle in the haystack at the depth specified in the config. - Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack - To insert the needle, we need to tokenize the context, insert the needle at the depth specified in the config, and then detokenize it. - """ - logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {self.df['needle'][0]}") - tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False) - context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts - needle_index = int(context_length * self.config.needle_depth / 100) - # tokenize the context - self.df["context"] = self.df["context"].apply( - lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length] - ) - # insert the needle at the depth specified in the config - self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) - # detokenize the context - self.df["context"] = ( - "This is a very long story book: " - + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) - + " ." - ) - def _prepare_data_for_inference(self): """ Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. 
@@ -370,7 +350,9 @@ def _prepare_data_for_inference(self): # if we have needle in a haystack, we need to insert it in the context if self.config.dataset == "needle_in_haystack": - self._insert_needle_in_haystack() + self.df = insert_needle_in_haystack( + self.df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth + ) if isinstance(self.press, FinchPress): if not compress_questions: diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml index 9d23734d..f4f4cdec 100644 --- a/evaluation/evaluate_config.yaml +++ b/evaluation/evaluate_config.yaml @@ -5,16 +5,17 @@ output_dir: "./results" model: "meta-llama/Meta-Llama-3.1-8B-Instruct" dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py -data_dir: "4096" # Subdirectory of the dataset (if applicable) +data_dir: "4096" # Subdirectory of the dataset (if applicable) else leave "null" press_name: "knorm" # see PRESS_REGISTRY in evaluate_registry.py -compression_ratio: 1.0 # Compression ratio for the press (0.0 to 1.0) +compression_ratio: 0.5 # Compression ratio for the press (0.0 to 1.0) key_channel_compression_ratio: null # For ThinKPress and ComposedPress (0.0 to 1.0) fraction: 1.0 # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing max_new_tokens: null # Maximum new tokens to generate (null = use dataset default) max_context_length: null # Maximum context length (null = use model maximum) -compress_questions: false # Whether to compress questions with context +compress_questions: false # Whether to compress questions with context +needle_depth: null # Depth percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset device: null # Device to use (null = auto-detect, "cuda:0", "cpu", etc.) 
diff --git a/tests/integration/test_ruler.py b/tests/integration/test_ruler.py index 675f92f6..38495821 100644 --- a/tests/integration/test_ruler.py +++ b/tests/integration/test_ruler.py @@ -4,7 +4,7 @@ import datasets import pytest import torch -from transformers import DynamicCache, QuantoQuantizedCache, QuantizedCacheConfig +from transformers import DynamicCache, QuantizedCacheConfig, QuantoQuantizedCache from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available from tests.default_presses import default_presses From be5b6dc0b601bd9cdd592c5c75038acaf0ced4bb Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 14:05:14 +0000 Subject: [PATCH 6/9] refactor eval Signed-off-by: alessiodevoto --- evaluation/evaluate.py | 133 +++++++++++++++++++---------------------- 1 file changed, 61 insertions(+), 72 deletions(-) diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index cc44a9c8..4fc67366 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -195,25 +195,24 @@ def __init__(self, config: EvaluationConfig): def _setup_deterministic_seeds(self): """Set deterministic seeds for reproducible results.""" - seed = self.config.seed - - torch.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) + torch.manual_seed(self.config.seed) + np.random.seed(self.config.seed) + random.seed(self.config.seed) if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + torch.cuda.manual_seed(self.config.seed) + torch.cuda.manual_seed_all(self.config.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False + logger.info(f"Set deterministic seeds to {self.config.seed}") - logger.info(f"Set deterministic seeds to {seed}") def _setup_logging(self): """Configures the logging level based on the config.""" log_level = self.config.log_level.upper() logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") + 
def _setup_directories(self) -> Path: """ Creates the output directory for saving results if it doesn't exist. @@ -228,24 +227,6 @@ def _setup_directories(self) -> Path: logger.info(f"Output directory set to: {output_dir}") return output_dir - def _load_dataset(self): - """ - Loads the dataset specified in the config and applies sampling/filtering. - """ - dataset_name = self.config.dataset - data_dir = str(self.config.data_dir) if self.config.data_dir else None - fraction = self.config.fraction - - logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})") - df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas() - - if fraction < 1.0: - original_len = len(df) - df = df.sample(frac=fraction, random_state=self.config.seed) - logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.") - - self.df = df - logger.info(f"Dataset loaded with {len(self.df)} entries.") def _setup_press(self): """ @@ -296,6 +277,50 @@ def _setup_press(self): self.config.press_init_command = str(press) logger.info(f"KV Press '{press_name}' setup.") + + def _load_and_prepare_dataset(self): + """ + Loads the dataset specified in the config and applies sampling/filtering. 
+ """ + dataset_name = self.config.dataset + data_dir = str(self.config.data_dir) if self.config.data_dir else None + fraction = self.config.fraction + + logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})") + df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas() + + if fraction < 1.0: + original_len = len(df) + df = df.sample(frac=fraction, random_state=self.config.seed) + logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.") + + logger.info(f"Dataset loaded with {len(df)} entries.") + + # if we have needle in a haystack, we need to insert it in the context + if self.config.dataset == "needle_in_haystack": + df = insert_needle_in_haystack( + df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth + ) + + if isinstance(self.press, FinchPress): + if not self.config.compress_questions: + logger.error("FinchPress requires 'compress_questions' to be set to True.") + raise ValueError("FinchPress requires compress_questions to be set to True") + # FinchPress uses a delimiter token to separate context and question + # So we need to update the tokenizer and the model embeddings. 
+ logger.info("FinchPress detected, updating model and tokenizer with delimiter token.") + self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer) # type: ignore[attr-defined] + df["context"] = df["context"] + self.press.delimiter_token # type: ignore[attr-defined, index] + + if self.config.compress_questions: + logger.info("Compressing questions into context.") + df["context"] = df["context"] + df["question"] # type: ignore[index] + df["question"] = "" # type: ignore[index] + + self.df = df + logger.info(f"Dataset processed with {len(self.df)} entries.") + + def _setup_model_pipeline(self): model_name = self.config.model device = self.config.device @@ -319,55 +344,20 @@ def _setup_model_pipeline(self): pass logger.info(f"Loading model pipeline for: {model_name} on device: {device} with model_kwargs: {model_kwargs}") + pipeline_kwargs = { + "model": model_name, + "model_kwargs": model_kwargs, + "trust_remote_code": True, + } if device == "auto": - self.pipeline = pipeline( - "kv-press-text-generation", - model=model_name, - device_map="auto", - model_kwargs=model_kwargs, - trust_remote_code=True, - ) + pipeline_kwargs["device_map"] = "auto" else: - self.pipeline = pipeline( - "kv-press-text-generation", - model=model_name, - device=device, - model_kwargs=model_kwargs, - trust_remote_code=True, - ) - - # Ensure model is in eval mode for deterministic inference - if hasattr(self.pipeline, "model"): - self.pipeline.model.eval() + pipeline_kwargs["device"] = device + self.pipeline = pipeline("kv-press-text-generation", **pipeline_kwargs) + self.pipeline.model.eval() logger.info("Model pipeline loaded.") - def _prepare_data_for_inference(self): - """ - Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. 
- """ - compress_questions = self.config.compress_questions - - # if we have needle in a haystack, we need to insert it in the context - if self.config.dataset == "needle_in_haystack": - self.df = insert_needle_in_haystack( - self.df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth - ) - - if isinstance(self.press, FinchPress): - if not compress_questions: - logger.error("FinchPress requires 'compress_questions' to be set to True.") - raise ValueError("FinchPress requires compress_questions to be set to True") - # FinchPress uses a delimiter token to separate context and question - # So we need to update the tokenizer and the model embeddings. - logger.info("FinchPress detected, updating model and tokenizer with delimiter token.") - self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer) # type: ignore[attr-defined] - self.df["context"] = self.df["context"] + self.press.delimiter_token # type: ignore[attr-defined, index] - - if compress_questions: - logger.info("Compressing questions into context.") - self.df["context"] = self.df["context"] + self.df["question"] # type: ignore[index] - self.df["question"] = "" # type: ignore[index] @torch.inference_mode() def _run_inference(self): @@ -458,10 +448,9 @@ def run_evaluation(self): ) return - self._load_dataset() self._setup_press() self._setup_model_pipeline() - self._prepare_data_for_inference() + self._load_and_prepare_dataset() self._run_inference() self._save_results(predictions_filename) From e96247dea1f4b55a58268e3913e520da7a088016 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 14:05:38 +0000 Subject: [PATCH 7/9] add niah Signed-off-by: alessiodevoto --- evaluation/README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/evaluation/README.md b/evaluation/README.md index bd78f400..350bd637 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -41,12 +41,13 @@ We support evaluation with all 
the presses implemented in the library (and possi At the moment, we support the following standard popular benchmarks: -- [Loogle](loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle)) -- [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler)) -- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls)) -- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench)) -- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench)) -- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2)) +- [Loogle](benchmarks/loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle)) +- [RULER](benchmarks/ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler)) +- [Zero Scrolls](benchmarks/zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls)) +- [Infinitebench](benchmarks/infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench)) +- [longbench](benchmarks/longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench)) +- [longbench-v2](benchmarks/longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2)) +- [Needle in a Haystack](benchmarks/needle_in_haystack/README.md)([hf link: Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays)) 📚 **For detailed information** about each dataset or implementing custom benchmarks, see the individual README files in the benchmarks directory. 
From 2b7d181008a10da3c8578378e5dcd56ddda95264 Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 14:14:02 +0000 Subject: [PATCH 8/9] style Signed-off-by: alessiodevoto --- evaluation/evaluate.py | 6 ------ evaluation/evaluate_config.yaml | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index 4fc67366..6b2b497d 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -206,13 +206,11 @@ def _setup_deterministic_seeds(self): torch.backends.cudnn.benchmark = False logger.info(f"Set deterministic seeds to {self.config.seed}") - def _setup_logging(self): """Configures the logging level based on the config.""" log_level = self.config.log_level.upper() logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s") - def _setup_directories(self) -> Path: """ Creates the output directory for saving results if it doesn't exist. @@ -227,7 +225,6 @@ def _setup_directories(self) -> Path: logger.info(f"Output directory set to: {output_dir}") return output_dir - def _setup_press(self): """ Initializes the KVPress instance and applies compression ratios based on its type. @@ -277,7 +274,6 @@ def _setup_press(self): self.config.press_init_command = str(press) logger.info(f"KV Press '{press_name}' setup.") - def _load_and_prepare_dataset(self): """ Loads the dataset specified in the config and applies sampling/filtering. 
@@ -320,7 +316,6 @@ def _load_and_prepare_dataset(self): self.df = df logger.info(f"Dataset processed with {len(self.df)} entries.") - def _setup_model_pipeline(self): model_name = self.config.model device = self.config.device @@ -358,7 +353,6 @@ def _setup_model_pipeline(self): self.pipeline.model.eval() logger.info("Model pipeline loaded.") - @torch.inference_mode() def _run_inference(self): """ diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml index f4f4cdec..ba5d6714 100644 --- a/evaluation/evaluate_config.yaml +++ b/evaluation/evaluate_config.yaml @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -output_dir: "./results" +output_dir: "./test_eval" model: "meta-llama/Meta-Llama-3.1-8B-Instruct" dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py From 0eb4fa6a5ed88fa46ca145d494ffe3034a94105e Mon Sep 17 00:00:00 2001 From: alessiodevoto Date: Fri, 22 Aug 2025 14:21:15 +0000 Subject: [PATCH 9/9] revert Signed-off-by: alessiodevoto --- evaluation/evaluate_config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml index ba5d6714..f4f4cdec 100644 --- a/evaluation/evaluate_config.yaml +++ b/evaluation/evaluate_config.yaml @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -output_dir: "./test_eval" +output_dir: "./results" model: "meta-llama/Meta-Llama-3.1-8B-Instruct" dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py