13 changes: 7 additions & 6 deletions evaluation/README.md
@@ -41,12 +41,13 @@ We support evaluation with all the presses implemented in the library (and possi

At the moment, we support the following popular benchmarks:

- [Loogle](loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
- [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
- [Loogle](benchmarks/loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
- [RULER](benchmarks/ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
- [Zero Scrolls](benchmarks/zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
- [Infinitebench](benchmarks/infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
- [longbench](benchmarks/longbench/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
- [longbench-v2](benchmarks/longbenchv2/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
- [Needle in a Haystack](benchmarks/needle_in_haystack/README.md) ([hf link](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays))

📚 **For detailed information** about each dataset, or to implement a custom benchmark, see the individual README files in the benchmarks directory.
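
For example, a needle-in-a-haystack run can be configured programmatically. This is a minimal sketch: `EvaluationConfig` is defined in `evaluation/evaluate.py`, the field names mirror `evaluate_config.yaml`, and in practice the `fire`-based CLI in `evaluate.py` builds this config from command-line flags:

```python
from evaluate import EvaluationConfig  # defined in evaluation/evaluate.py

config = EvaluationConfig(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    dataset="needle_in_haystack",  # key into DATASET_REGISTRY
    press_name="knorm",            # key into PRESS_REGISTRY
    compression_ratio=0.5,
    max_context_length=4096,       # required for needle_in_haystack
    needle_depth=50,               # insert the needle 50% into the context
)
```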

2 changes: 2 additions & 0 deletions evaluation/benchmarks/infinite_bench/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
2 changes: 2 additions & 0 deletions evaluation/benchmarks/longbench/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
2 changes: 2 additions & 0 deletions evaluation/benchmarks/longbenchv2/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
3 changes: 3 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/README.md
@@ -0,0 +1,3 @@
# Needle in a Haystack
This benchmark evaluates a model's ability to retrieve a specific piece of information, the "needle," hidden within a large body of text, the "haystack." The test challenges a model's long-context understanding and its ability to maintain information accuracy over increasing document lengths.
We follow the vast majority of the literature and use [Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays) as the haystack.
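
A minimal sketch of the mechanics (illustrative values only; the repo's actual implementation is `insert_needle_in_haystack` in `utils.py` below):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
haystack = "Before college the two main things I worked on were writing and programming. " * 200
needle = "The magic number is 42."
depth = 50  # insert the needle 50% of the way into the context

haystack_ids = tokenizer.encode(haystack, add_special_tokens=False)
needle_ids = tokenizer.encode(needle, add_special_tokens=False)
idx = int(len(haystack_ids) * depth / 100)
context = tokenizer.decode(haystack_ids[:idx] + needle_ids + haystack_ids[idx:])
# The model is then prompted with `context` and asked to recall the magic number.
```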
2 changes: 2 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
15 changes: 15 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pandas as pd
from rouge import Rouge

scorer = Rouge()


def calculate_metrics(df: pd.DataFrame) -> list[dict]:
    # Score each predicted answer against its needle with ROUGE;
    # returns one dict of ROUGE-1/2/L precision/recall/F1 per row.
    scores = []
    for _, row in df.iterrows():
        score = scorer.get_scores(row["needle"], row["predicted_answer"])[0]
        scores.append(score)
    return scores
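
A quick usage sketch (toy data; assumes the module above is importable and the `rouge` package is installed):

```python
import pandas as pd

from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics

df = pd.DataFrame(
    {
        "needle": ["The magic number is 42."],
        "predicted_answer": ["The magic number is 42."],
    }
)
scores = calculate_metrics(df)
print(scores[0]["rouge-l"])  # perfect match, e.g. {'r': 1.0, 'p': 1.0, 'f': ...}
```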
49 changes: 49 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/utils.py
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import pandas as pd
from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)


def insert_needle_in_haystack(
    df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_context_length: int, needle_depth: int
) -> pd.DataFrame:
    """
    Inserts the "needle" string into the "context" of each row in the DataFrame at a specified depth.
    Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing at least the columns "context" and "needle".
    tokenizer : PreTrainedTokenizer
        The tokenizer used to encode and decode the context and needle.
    max_context_length : int
        The maximum allowed length (in tokens) for the context, including the needle.
    needle_depth : int
        The percentage (0-100) indicating how deep into the context the needle should be inserted.

    Returns
    -------
    pd.DataFrame
        The DataFrame with the "context" column modified to include the needle at the specified depth.
    """
    logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {df['needle'][0]}")
    tokenized_needle = tokenizer.encode(df["needle"][0], add_special_tokens=False)
    context_length = max_context_length - len(tokenized_needle) - 150  # account for system prompts
    needle_index = int(context_length * needle_depth / 100)
    # tokenize the context
    df["context"] = df["context"].apply(lambda x: tokenizer.encode(x, add_special_tokens=False)[:context_length])
    # insert the needle at the depth specified in the config
    df["context"] = df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
    # detokenize the context
    df["context"] = (
        "This is a very long story book: <book> "
        + df["context"].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True))
        + " </book>."
    )
    return df
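
A usage sketch with toy values (in the evaluation pipeline this is called with the pipeline tokenizer and the config's `max_context_length` and `needle_depth`):

```python
import pandas as pd
from transformers import AutoTokenizer

from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer choice
df = pd.DataFrame(
    {
        "context": ["All I know is this sentence, repeated over and over. " * 500],
        "needle": ["The magic number is 42."],
    }
)
df = insert_needle_in_haystack(df, tokenizer, max_context_length=1024, needle_depth=50)
print(df["context"][0][:60])  # "This is a very long story book: <book> ..."
```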
144 changes: 71 additions & 73 deletions evaluation/evaluate.py
@@ -10,9 +10,10 @@
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd # Import pandas for DataFrame type hinting
import pandas as pd
import torch
import yaml # type: ignore[import-untyped]
from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack
from datasets import load_dataset
from evaluate_registry import DATASET_REGISTRY, PRESS_REGISTRY, SCORER_REGISTRY
from fire import Fire
@@ -42,6 +43,7 @@ class EvaluationConfig:
    max_new_tokens: Optional[int] = None
    max_context_length: Optional[int] = None
    compress_questions: bool = False
    needle_depth: Optional[int] = None

    # Output and logging
    output_dir: str = "./results"
@@ -88,6 +90,10 @@ def __post_init__(self):
        if self.model_kwargs is None:
            self.model_kwargs = {}

        if self.dataset == "needle_in_haystack":
            assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack"
            assert self.max_context_length is not None, "max_context_length must be set for needle_in_haystack"

    def get_results_dir(self, output_dir: Path) -> Path:
        """
        Generates the unique save directory and filenames based on configuration parameters.
@@ -121,6 +127,8 @@ def get_results_dir(self, output_dir: Path) -> Path:
components.append("compressed_questions")
if self.key_channel_compression_ratio is not None:
components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}")
if self.needle_depth is not None:
components.append(f"needle_depth{self.needle_depth:.1f}")

dir_name = "__".join(filter(None, components)) # Filter None/empty strings
config_dir = output_dir / dir_name
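
For illustration, the joined components produce directory names of roughly this shape (the exact prefix components come from code collapsed in this diff, so treat the example as an assumption):

```python
components = ["knorm", "cr0.50", "needle_depth50.0"]  # assumed upstream components
dir_name = "__".join(filter(None, components))
print(dir_name)  # knorm__cr0.50__needle_depth50.0
```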
@@ -187,19 +195,16 @@ def __init__(self, config: EvaluationConfig):

    def _setup_deterministic_seeds(self):
        """Set deterministic seeds for reproducible results."""
        seed = self.config.seed

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(self.config.seed)
        np.random.seed(self.config.seed)
        random.seed(self.config.seed)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.cuda.manual_seed(self.config.seed)
            torch.cuda.manual_seed_all(self.config.seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        logger.info(f"Set deterministic seeds to {seed}")
        logger.info(f"Set deterministic seeds to {self.config.seed}")

    def _setup_logging(self):
        """Configures the logging level based on the config."""
@@ -220,25 +225,6 @@ def _setup_directories(self) -> Path:
logger.info(f"Output directory set to: {output_dir}")
return output_dir

    def _load_dataset(self):
        """
        Loads the dataset specified in the config and applies sampling/filtering.
        """
        dataset_name = self.config.dataset
        data_dir = str(self.config.data_dir) if self.config.data_dir else None
        fraction = self.config.fraction

        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()

        if fraction < 1.0:
            original_len = len(df)
            df = df.sample(frac=fraction, random_state=self.config.seed)
            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")

        self.df = df
        logger.info(f"Dataset loaded with {len(self.df)} entries.")

    def _setup_press(self):
        """
        Initializes the KVPress instance and applies compression ratios based on its type.
@@ -279,13 +265,57 @@ def _setup_press(self):
            press.compression_ratio = compression_ratio
            logger.info(f"Set {press.__class__.__name__} compression_ratio to {compression_ratio}")
        else:
            logger.warning(f"Press {press.__class__.__name__} has no 'compression_ratio' attribute.")
            logger.warning(
                f"Press {press.__class__.__name__} has no 'compression_ratio' attribute. This is expected if you set `no_press`."
            )

        self.press = press
        # Set the press info in the config for saving to YAML
        self.config.press_init_command = str(press)
        logger.info(f"KV Press '{press_name}' setup.")

    def _load_and_prepare_dataset(self):
        """
        Loads the dataset specified in the config and applies sampling/filtering.
        """
        dataset_name = self.config.dataset
        data_dir = str(self.config.data_dir) if self.config.data_dir else None
        fraction = self.config.fraction

        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()

        if fraction < 1.0:
            original_len = len(df)
            df = df.sample(frac=fraction, random_state=self.config.seed)
            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")

        logger.info(f"Dataset loaded with {len(df)} entries.")

        # if we have needle in a haystack, we need to insert it in the context
        if self.config.dataset == "needle_in_haystack":
            df = insert_needle_in_haystack(
                df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth
            )

        if isinstance(self.press, FinchPress):
            if not self.config.compress_questions:
                logger.error("FinchPress requires 'compress_questions' to be set to True.")
                raise ValueError("FinchPress requires compress_questions to be set to True")
            # FinchPress uses a delimiter token to separate context and question
            # So we need to update the tokenizer and the model embeddings.
            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
            df["context"] = df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]

        if self.config.compress_questions:
            logger.info("Compressing questions into context.")
            df["context"] = df["context"] + df["question"]  # type: ignore[index]
            df["question"] = ""  # type: ignore[index]

        self.df = df
        logger.info(f"Dataset processed with {len(self.df)} entries.")

    def _setup_model_pipeline(self):
        model_name = self.config.model
        device = self.config.device
@@ -295,7 +325,6 @@ def _setup_model_pipeline(self):
logger.info(f"No device specified, auto-detected device: {device}")

model_kwargs = self.config.model_kwargs or {}
print(model_kwargs)
if isinstance(self.press, ObservedAttentionPress):
model_kwargs["attn_implementation"] = "eager"
logger.info("ObservedAttentionPress detected, setting attn_implementation to 'eager'.")
@@ -310,50 +339,20 @@ def _setup_model_pipeline(self):
            pass

logger.info(f"Loading model pipeline for: {model_name} on device: {device} with model_kwargs: {model_kwargs}")
pipeline_kwargs = {
"model": model_name,
"model_kwargs": model_kwargs,
"trust_remote_code": True,
}
if device == "auto":
self.pipeline = pipeline(
"kv-press-text-generation",
model=model_name,
device_map="auto",
model_kwargs=model_kwargs,
trust_remote_code=True,
)
pipeline_kwargs["device_map"] = "auto"
else:
self.pipeline = pipeline(
"kv-press-text-generation",
model=model_name,
device=device,
model_kwargs=model_kwargs,
trust_remote_code=True,
)

# Ensure model is in eval mode for deterministic inference
if hasattr(self.pipeline, "model"):
self.pipeline.model.eval()
pipeline_kwargs["device"] = device
self.pipeline = pipeline("kv-press-text-generation", **pipeline_kwargs)

self.pipeline.model.eval()
logger.info("Model pipeline loaded.")

    def _prepare_data_for_inference(self):
        """
        Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
        """
        compress_questions = self.config.compress_questions

        if isinstance(self.press, FinchPress):
            if not compress_questions:
                logger.error("FinchPress requires 'compress_questions' to be set to True.")
                raise ValueError("FinchPress requires compress_questions to be set to True")
            # FinchPress uses a delimiter token to separate context and question
            # So we need to update the tokenizer and the model embeddings.
            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
            self.df["context"] = self.df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]

        if compress_questions:
            logger.info("Compressing questions into context.")
            self.df["context"] = self.df["context"] + self.df["question"]  # type: ignore[index]
            self.df["question"] = ""  # type: ignore[index]

    @torch.inference_mode()
    def _run_inference(self):
        """
Expand Down Expand Up @@ -400,7 +399,7 @@ def _save_results(self, save_filename: Path):
        if save_filename.exists():
            logger.warning(f"Results CSV already exists at {save_filename}. Overwriting.")

        self.df[["predicted_answer", "compression_ratio"]].to_csv(str(save_filename), index=False)  # type: ignore[index]
        self.df[list(set(self.df.columns) - set(["context"]))].to_csv(str(save_filename), index=False)  # type: ignore[index]
        logger.info(f"Results saved to {save_filename}")

    def _calculate_and_save_metrics(self, save_filename: Path):
@@ -443,10 +442,9 @@ def run_evaluation(self):
            )
            return

        self._load_dataset()
        self._setup_press()
        self._setup_model_pipeline()
        self._prepare_data_for_inference()
        self._load_and_prepare_dataset()

        self._run_inference()
        self._save_results(predictions_filename)
7 changes: 4 additions & 3 deletions evaluation/evaluate_config.yaml
@@ -5,16 +5,17 @@ output_dir: "./results"

model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py
data_dir: "4096" # Subdirectory of the dataset (if applicable)
data_dir: "4096" # Subdirectory of the dataset (if applicable) else leave "null"

press_name: "knorm" # see PRESS_REGISTRY in evaluate_registry.py
compression_ratio: 1.0 # Compression ratio for the press (0.0 to 1.0)
compression_ratio: 0.5 # Compression ratio for the press (0.0 to 1.0)
key_channel_compression_ratio: null # For ThinKPress and ComposedPress (0.0 to 1.0)

fraction: 1.0 # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing
max_new_tokens: null # Maximum new tokens to generate (null = use dataset default)
max_context_length: null # Maximum context length (null = use model maximum)
compress_questions: false # Whether to compress questions with context
compress_questions: false # Whether to compress questions with context
needle_depth: null # Depth percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset

device: null # Device to use (null = auto-detect, "cuda:0", "cpu", etc.)

3 changes: 3 additions & 0 deletions evaluation/evaluate_registry.py
@@ -6,6 +6,7 @@
from benchmarks.longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e
from benchmarks.longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer
from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer
from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer
from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer
from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer

@@ -41,6 +42,7 @@
"longbench": "Xnhyacinth/LongBench",
"longbench-e": "Xnhyacinth/LongBench",
"longbench-v2": "Xnhyacinth/LongBench-v2",
"needle_in_haystack": "alessiodevoto/paul_graham_essays",
}

SCORER_REGISTRY = {
@@ -51,6 +53,7 @@
"longbench": longbench_scorer,
"longbench-e": longbench_scorer_e,
"longbench-v2": longbenchv2_scorer,
"needle_in_haystack": needle_in_haystack_scorer,
}
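
A sketch of how these registries are consumed at run time (mirroring `_load_and_prepare_dataset` and the metrics step in `evaluate.py`; the glue code here is simplified):

```python
from datasets import load_dataset

name = "needle_in_haystack"
df = load_dataset(DATASET_REGISTRY[name], split="test").to_pandas()
# ... run the kv-press pipeline to fill df["predicted_answer"] ...
scores = SCORER_REGISTRY[name](df)
```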


2 changes: 1 addition & 1 deletion tests/integration/test_ruler.py
@@ -4,7 +4,7 @@
import datasets
import pytest
import torch
from transformers import DynamicCache, QuantoQuantizedCache, QuantizedCacheConfig
from transformers import DynamicCache, QuantizedCacheConfig, QuantoQuantizedCache
from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available

from tests.default_presses import default_presses