diff --git a/evaluation/README.md b/evaluation/README.md index bd78f400..350bd637 100644 --- a/evaluation/README.md +++ b/evaluation/README.md @@ -41,12 +41,13 @@ We support evaluation with all the presses implemented in the library (and possi At the moment, we support the following standard popular benchmarks: -- [Loogle](loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle)) -- [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler)) -- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls)) -- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench)) -- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench)) -- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2)) +- [Loogle](benchmarks/loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle)) +- [RULER](benchmarks/ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler)) +- [Zero Scrolls](benchmarks/zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls)) +- [Infinitebench](benchmarks/infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench)) +- [longbench](benchmarks/longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench)) +- [longbench-v2](benchmarks/longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2)) +- [Needle in a Haystack](benchmarks/needle_in_haystack/README.md)([hf link](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays)) 📚 **For detailed information** about each dataset or implementing custom benchmarks, see the individual README files in the benchmarks directory. 
diff --git a/evaluation/benchmarks/infinite_bench/__init__.py b/evaluation/benchmarks/infinite_bench/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/infinite_bench/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/longbench/__init__.py b/evaluation/benchmarks/longbench/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/longbench/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/longbenchv2/__init__.py b/evaluation/benchmarks/longbenchv2/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/longbenchv2/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/needle_in_haystack/README.md b/evaluation/benchmarks/needle_in_haystack/README.md new file mode 100644 index 00000000..324e260e --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/README.md @@ -0,0 +1,3 @@ +# Needle in a Haystack +This benchmark evaluates a model's ability to retrieve a specific piece of information, the "needle," hidden within a large body of text, the "haystack." The test challenges a model's long-context understanding and its ability to maintain information accuracy over increasing document lengths. +We follow the vast majority of the literature and use [Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays) as the haystack. 
\ No newline at end of file diff --git a/evaluation/benchmarks/needle_in_haystack/__init__.py b/evaluation/benchmarks/needle_in_haystack/__init__.py new file mode 100644 index 00000000..9adfdfcd --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py new file mode 100644 index 00000000..76c8a2e3 --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import pandas as pd +from rouge import Rouge + +scorer = Rouge() + + +def calculate_metrics(df: pd.DataFrame) -> list[dict]: + scores = [] + for index, row in df.iterrows(): + score = scorer.get_scores(row["needle"], row["predicted_answer"])[0] + scores.append(score) + return scores diff --git a/evaluation/benchmarks/needle_in_haystack/utils.py b/evaluation/benchmarks/needle_in_haystack/utils.py new file mode 100644 index 00000000..14717cde --- /dev/null +++ b/evaluation/benchmarks/needle_in_haystack/utils.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import logging + +import pandas as pd +from transformers import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + + +def insert_needle_in_haystack( + df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_context_length: int, needle_depth: int +) -> pd.DataFrame: + """ + Inserts the "needle" string into the "context" of each row in the DataFrame at a specified depth. 
+ Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing at least the columns "context" and "needle". + tokenizer : PreTrainedTokenizer + The tokenizer used to encode and decode the context and needle. + max_context_length : int + The maximum allowed length (in tokens) for the context, including the needle. + needle_depth : int + The percentage (0-100) indicating how deep into the context the needle should be inserted. + + Returns + ------- + pd.DataFrame + The DataFrame with the "context" column modified to include the needle at the specified depth. + """ + logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {df['needle'][0]}") + tokenized_needle = tokenizer.encode(df["needle"][0], add_special_tokens=False) + context_length = max_context_length - len(tokenized_needle) - 150 # account for system prompts + needle_index = int(context_length * needle_depth / 100) + # tokenize the context + df["context"] = df["context"].apply(lambda x: tokenizer.encode(x, add_special_tokens=False)[:context_length]) + # insert the needle at the depth specified in the config + df["context"] = df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:]) + # detokenize the context + df["context"] = ( + "This is a very long story book: " + + df["context"].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True)) + + " ." 
+ ) + return df diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py index e7777dfc..6b2b497d 100644 --- a/evaluation/evaluate.py +++ b/evaluation/evaluate.py @@ -10,9 +10,10 @@ from typing import Any, Dict, Optional import numpy as np -import pandas as pd # Import pandas for DataFrame type hinting +import pandas as pd import torch import yaml # type: ignore[import-untyped] +from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack from datasets import load_dataset from evaluate_registry import DATASET_REGISTRY, PRESS_REGISTRY, SCORER_REGISTRY from fire import Fire @@ -42,6 +43,7 @@ class EvaluationConfig: max_new_tokens: Optional[int] = None max_context_length: Optional[int] = None compress_questions: bool = False + needle_depth: Optional[int] = None # Output and logging output_dir: str = "./results" @@ -88,6 +90,10 @@ def __post_init__(self): if self.model_kwargs is None: self.model_kwargs = {} + if self.dataset == "needle_in_haystack": + assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack" + assert self.max_context_length is not None, "max_context_length must be set for needle_in_haystack" + def get_results_dir(self, output_dir: Path) -> Path: """ Generates the unique save directory and filenames based on configuration parameters. 
@@ -121,6 +127,8 @@ def get_results_dir(self, output_dir: Path) -> Path: components.append("compressed_questions") if self.key_channel_compression_ratio is not None: components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}") + if self.needle_depth is not None: + components.append(f"needle_depth{self.needle_depth:.1f}") dir_name = "__".join(filter(None, components)) # Filter None/empty strings config_dir = output_dir / dir_name @@ -187,19 +195,16 @@ def __init__(self, config: EvaluationConfig): def _setup_deterministic_seeds(self): """Set deterministic seeds for reproducible results.""" - seed = self.config.seed - - torch.manual_seed(seed) - np.random.seed(seed) - random.seed(seed) + torch.manual_seed(self.config.seed) + np.random.seed(self.config.seed) + random.seed(self.config.seed) if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + torch.cuda.manual_seed(self.config.seed) + torch.cuda.manual_seed_all(self.config.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False - - logger.info(f"Set deterministic seeds to {seed}") + logger.info(f"Set deterministic seeds to {self.config.seed}") def _setup_logging(self): """Configures the logging level based on the config.""" @@ -220,25 +225,6 @@ def _setup_directories(self) -> Path: logger.info(f"Output directory set to: {output_dir}") return output_dir - def _load_dataset(self): - """ - Loads the dataset specified in the config and applies sampling/filtering. 
- """ - dataset_name = self.config.dataset - data_dir = str(self.config.data_dir) if self.config.data_dir else None - fraction = self.config.fraction - - logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})") - df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas() - - if fraction < 1.0: - original_len = len(df) - df = df.sample(frac=fraction, random_state=self.config.seed) - logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.") - - self.df = df - logger.info(f"Dataset loaded with {len(self.df)} entries.") - def _setup_press(self): """ Initializes the KVPress instance and applies compression ratios based on its type. @@ -279,13 +265,57 @@ def _setup_press(self): press.compression_ratio = compression_ratio logger.info(f"Set {press.__class__.__name__} compression_ratio to {compression_ratio}") else: - logger.warning(f"Press {press.__class__.__name__} has no 'compression_ratio' attribute.") + logger.warning( + f"Press {press.__class__.__name__} has no 'compression_ratio' attribute. This is expected is you set `no_press`." + ) self.press = press # Set the press info in the config for saving to YAML self.config.press_init_command = str(press) logger.info(f"KV Press '{press_name}' setup.") + def _load_and_prepare_dataset(self): + """ + Loads the dataset specified in the config and applies sampling/filtering. 
+ """ + dataset_name = self.config.dataset + data_dir = str(self.config.data_dir) if self.config.data_dir else None + fraction = self.config.fraction + + logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})") + df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas() + + if fraction < 1.0: + original_len = len(df) + df = df.sample(frac=fraction, random_state=self.config.seed) + logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.") + + logger.info(f"Dataset loaded with {len(df)} entries.") + + # if we have needle in a haystack, we need to insert it in the context + if self.config.dataset == "needle_in_haystack": + df = insert_needle_in_haystack( + df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth + ) + + if isinstance(self.press, FinchPress): + if not self.config.compress_questions: + logger.error("FinchPress requires 'compress_questions' to be set to True.") + raise ValueError("FinchPress requires compress_questions to be set to True") + # FinchPress uses a delimiter token to separate context and question + # So we need to update the tokenizer and the model embeddings. 
+ logger.info("FinchPress detected, updating model and tokenizer with delimiter token.") + self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer) # type: ignore[attr-defined] + df["context"] = df["context"] + self.press.delimiter_token # type: ignore[attr-defined, index] + + if self.config.compress_questions: + logger.info("Compressing questions into context.") + df["context"] = df["context"] + df["question"] # type: ignore[index] + df["question"] = "" # type: ignore[index] + + self.df = df + logger.info(f"Dataset processed with {len(self.df)} entries.") + def _setup_model_pipeline(self): model_name = self.config.model device = self.config.device @@ -295,7 +325,6 @@ def _setup_model_pipeline(self): logger.info(f"No device specified, auto-detected device: {device}") model_kwargs = self.config.model_kwargs or {} - print(model_kwargs) if isinstance(self.press, ObservedAttentionPress): model_kwargs["attn_implementation"] = "eager" logger.info("ObservedAttentionPress detected, setting attn_implementation to 'eager'.") @@ -310,50 +339,20 @@ def _setup_model_pipeline(self): pass logger.info(f"Loading model pipeline for: {model_name} on device: {device} with model_kwargs: {model_kwargs}") + pipeline_kwargs = { + "model": model_name, + "model_kwargs": model_kwargs, + "trust_remote_code": True, + } if device == "auto": - self.pipeline = pipeline( - "kv-press-text-generation", - model=model_name, - device_map="auto", - model_kwargs=model_kwargs, - trust_remote_code=True, - ) + pipeline_kwargs["device_map"] = "auto" else: - self.pipeline = pipeline( - "kv-press-text-generation", - model=model_name, - device=device, - model_kwargs=model_kwargs, - trust_remote_code=True, - ) - - # Ensure model is in eval mode for deterministic inference - if hasattr(self.pipeline, "model"): - self.pipeline.model.eval() + pipeline_kwargs["device"] = device + self.pipeline = pipeline("kv-press-text-generation", **pipeline_kwargs) + self.pipeline.model.eval() 
logger.info("Model pipeline loaded.") - def _prepare_data_for_inference(self): - """ - Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics. - """ - compress_questions = self.config.compress_questions - - if isinstance(self.press, FinchPress): - if not compress_questions: - logger.error("FinchPress requires 'compress_questions' to be set to True.") - raise ValueError("FinchPress requires compress_questions to be set to True") - # FinchPress uses a delimiter token to separate context and question - # So we need to update the tokenizer and the model embeddings. - logger.info("FinchPress detected, updating model and tokenizer with delimiter token.") - self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer) # type: ignore[attr-defined] - self.df["context"] = self.df["context"] + self.press.delimiter_token # type: ignore[attr-defined, index] - - if compress_questions: - logger.info("Compressing questions into context.") - self.df["context"] = self.df["context"] + self.df["question"] # type: ignore[index] - self.df["question"] = "" # type: ignore[index] - @torch.inference_mode() def _run_inference(self): """ @@ -400,7 +399,7 @@ def _save_results(self, save_filename: Path): if save_filename.exists(): logger.warning(f"Results CSV already exists at {save_filename}. 
Overwriting.") - self.df[["predicted_answer", "compression_ratio"]].to_csv(str(save_filename), index=False) # type: ignore[index] + self.df[list(set(self.df.columns) - set(["context"]))].to_csv(str(save_filename), index=False) # type: ignore[index] logger.info(f"Results saved to {save_filename}") def _calculate_and_save_metrics(self, save_filename: Path): @@ -443,10 +442,9 @@ def run_evaluation(self): ) return - self._load_dataset() self._setup_press() self._setup_model_pipeline() - self._prepare_data_for_inference() + self._load_and_prepare_dataset() self._run_inference() self._save_results(predictions_filename) diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml index 9d23734d..f4f4cdec 100644 --- a/evaluation/evaluate_config.yaml +++ b/evaluation/evaluate_config.yaml @@ -5,16 +5,17 @@ output_dir: "./results" model: "meta-llama/Meta-Llama-3.1-8B-Instruct" dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py -data_dir: "4096" # Subdirectory of the dataset (if applicable) +data_dir: "4096" # Subdirectory of the dataset (if applicable) else leave "null" press_name: "knorm" # see PRESS_REGISTRY in evaluate_registry.py -compression_ratio: 1.0 # Compression ratio for the press (0.0 to 1.0) +compression_ratio: 0.5 # Compression ratio for the press (0.0 to 1.0) key_channel_compression_ratio: null # For ThinKPress and ComposedPress (0.0 to 1.0) fraction: 1.0 # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing max_new_tokens: null # Maximum new tokens to generate (null = use dataset default) max_context_length: null # Maximum context length (null = use model maximum) -compress_questions: false # Whether to compress questions with context +compress_questions: false # Whether to compress questions with context +needle_depth: null # Depth percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset device: null # Device to use (null = auto-detect, "cuda:0", "cpu", etc.) 
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py index 89b26596..0dc4bd03 100644 --- a/evaluation/evaluate_registry.py +++ b/evaluation/evaluate_registry.py @@ -6,6 +6,7 @@ from benchmarks.longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e from benchmarks.longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer +from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer @@ -41,6 +42,7 @@ "longbench": "Xnhyacinth/LongBench", "longbench-e": "Xnhyacinth/LongBench", "longbench-v2": "Xnhyacinth/LongBench-v2", + "needle_in_haystack": "alessiodevoto/paul_graham_essays", } SCORER_REGISTRY = { @@ -51,6 +53,7 @@ "longbench": longbench_scorer, "longbench-e": longbench_scorer_e, "longbench-v2": longbenchv2_scorer, + "needle_in_haystack": needle_in_haystack_scorer, } diff --git a/tests/integration/test_ruler.py b/tests/integration/test_ruler.py index 675f92f6..38495821 100644 --- a/tests/integration/test_ruler.py +++ b/tests/integration/test_ruler.py @@ -4,7 +4,7 @@ import datasets import pytest import torch -from transformers import DynamicCache, QuantoQuantizedCache, QuantizedCacheConfig +from transformers import DynamicCache, QuantizedCacheConfig, QuantoQuantizedCache from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available from tests.default_presses import default_presses