From e2fe544604736c1ffa0c9078adb09f0eda69e422 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Thu, 21 Aug 2025 15:36:39 +0000
Subject: [PATCH 1/9] needle

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 .../benchmarks/needle_in_haystack/__init__.py     |  0
 .../needle_in_haystack/calculate_metrics.py       | 14 ++++++++++++++
 evaluation/evaluate.py                            | 15 ++++++++++++++-
 evaluation/evaluate_registry.py                   |  1 +
 4 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 evaluation/benchmarks/needle_in_haystack/__init__.py
 create mode 100644 evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
diff --git a/evaluation/benchmarks/needle_in_haystack/__init__.py b/evaluation/benchmarks/needle_in_haystack/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
new file mode 100644
index 00000000..06545629
--- /dev/null
+++ b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import pandas as pd
+from rouge_score import rouge_scorer
+
+scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
+
+def calculate_metrics(df: pd.DataFrame) -> dict:
+    scores = []
+    for index, row in df.iterrows():
+        score = scorer.score(row["needle"], row["predicted_answer"])["rouge1"].fmeasure * 10
+        scores.append(score)
+    return {"rouge1": sum(scores) / len(scores)}
\ No newline at end of file
diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index e7777dfc..cf444d73 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -42,6 +42,7 @@ class EvaluationConfig:
     max_new_tokens: Optional[int] = None
     max_context_length: Optional[int] = None
     compress_questions: bool = False
+    needle_depth: Optional[int] = None
 
     # Output and logging
     output_dir: str = "./results"
@@ -87,6 +88,9 @@ def __post_init__(self):
         # Initialize model_kwargs if None
         if self.model_kwargs is None:
             self.model_kwargs = {}
+        
+        if self.dataset == "needle_in_haystack":
+            assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack"
 
     def get_results_dir(self, output_dir: Path) -> Path:
         """
@@ -338,6 +342,15 @@ def _prepare_data_for_inference(self):
         Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
         """
         compress_questions = self.config.compress_questions
+        
+        # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it
+        if self.config.dataset == "needle_in_haystack":
+            tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"], add_special_tokens=False)
+            context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts
+            self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length])
+            needle_index = int(context_length * self.config.needle_depth / 100)
+            self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
+            self.df["context"] = "This is a very long story book: <book> " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " </book>."
 
         if isinstance(self.press, FinchPress):
             if not compress_questions:
@@ -400,7 +413,7 @@ def _save_results(self, save_filename: Path):
         if save_filename.exists():
             logger.warning(f"Results CSV already exists at {save_filename}. Overwriting.")
 
-        self.df[["predicted_answer", "compression_ratio"]].to_csv(str(save_filename), index=False)  # type: ignore[index]
+        self.df[list(set(self.df.columns) - set(["context"]))].to_csv(str(save_filename), index=False)  # type: ignore[index]
         logger.info(f"Results saved to {save_filename}")
 
     def _calculate_and_save_metrics(self, save_filename: Path):
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py
index 89b26596..af753dd2 100644
--- a/evaluation/evaluate_registry.py
+++ b/evaluation/evaluate_registry.py
@@ -41,6 +41,7 @@
     "longbench": "Xnhyacinth/LongBench",
     "longbench-e": "Xnhyacinth/LongBench",
     "longbench-v2": "Xnhyacinth/LongBench-v2",
+    "needle_in_haystack": "alessiodevoto/paul_graham_essays",
 }
 
 SCORER_REGISTRY = {

From 4ce1c43a1907bfe3c26288e9c810c8618e007d30 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Thu, 21 Aug 2025 16:18:58 +0000
Subject: [PATCH 2/9] niah

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 .../benchmarks/needle_in_haystack/README.md   |  3 ++
 evaluation/evaluate.py                        | 30 ++++++++++++++-----
 evaluation/evaluate_registry.py               |  2 ++
 3 files changed, 27 insertions(+), 8 deletions(-)
 create mode 100644 evaluation/benchmarks/needle_in_haystack/README.md

diff --git a/evaluation/benchmarks/needle_in_haystack/README.md b/evaluation/benchmarks/needle_in_haystack/README.md
new file mode 100644
index 00000000..324e260e
--- /dev/null
+++ b/evaluation/benchmarks/needle_in_haystack/README.md
@@ -0,0 +1,3 @@
+# Needle in a Haystack
+This benchmark evaluates a model's ability to retrieve a specific piece of information, the "needle," hidden within a large body of text, the "haystack." The test challenges a model's long-context understanding and its ability to maintain information accuracy over increasing document lengths. 
+We follow the vast majority of the literature and use [Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays) as the haystack.
\ No newline at end of file
diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index cf444d73..dd28655f 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -208,7 +208,8 @@ def _setup_deterministic_seeds(self):
     def _setup_logging(self):
         """Configures the logging level based on the config."""
         log_level = self.config.log_level.upper()
-        logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
+        logging.basicConfig(level=logging.INFO)
+        # logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
 
     def _setup_directories(self) -> Path:
         """
@@ -299,7 +300,6 @@ def _setup_model_pipeline(self):
             logger.info(f"No device specified, auto-detected device: {device}")
 
         model_kwargs = self.config.model_kwargs or {}
-        print(model_kwargs)
         if isinstance(self.press, ObservedAttentionPress):
             model_kwargs["attn_implementation"] = "eager"
             logger.info("ObservedAttentionPress detected, setting attn_implementation to 'eager'.")
@@ -337,20 +337,34 @@ def _setup_model_pipeline(self):
 
         logger.info("Model pipeline loaded.")
 
+    
+    def _insert_needle_in_haystack(self):
+        """
+        Inserts the needle in the haystack at the depth specified in the config.
+        Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
+        To insert the needle, we need to tokenize the context, insert the needle at the depth specified in the config, and then detokenize it.
+        """
+        logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {self.df['needle'][0]}")
+        tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False)
+        context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts
+        needle_index = int(context_length * self.config.needle_depth / 100)
+        # tokenize the context
+        self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length])
+        # insert the needle at the depth specified in the config
+        self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
+        # detokenize the context
+        self.df["context"] = "This is a very long story book: <book> " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " </book>."
+
     def _prepare_data_for_inference(self):
         """
         Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
         """
         compress_questions = self.config.compress_questions
         
+        
         # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it
         if self.config.dataset == "needle_in_haystack":
-            tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"], add_special_tokens=False)
-            context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts
-            self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length])
-            needle_index = int(context_length * self.config.needle_depth / 100)
-            self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
-            self.df["context"] = "This is a very long story book: <book> " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " </book>."
+            self._insert_needle_in_haystack()
 
         if isinstance(self.press, FinchPress):
             if not compress_questions:
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py
index af753dd2..4798bb39 100644
--- a/evaluation/evaluate_registry.py
+++ b/evaluation/evaluate_registry.py
@@ -8,6 +8,7 @@
 from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer
 from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer
 from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer
+from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer
 
 from kvpress import (
     AdaKVPress,
@@ -52,6 +53,7 @@
     "longbench": longbench_scorer,
     "longbench-e": longbench_scorer_e,
     "longbench-v2": longbenchv2_scorer,
+    "needle_in_haystack": needle_in_haystack_scorer,
 }
 
 

From 5e777b51e69c428840349a46622fc61e3841d469 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 07:13:09 +0000
Subject: [PATCH 3/9] rouge scorer

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 .../benchmarks/needle_in_haystack/__init__.py |  2 ++
 .../needle_in_haystack/calculate_metrics.py   | 11 ++++----
 evaluation/evaluate.py                        | 26 ++++++++++++-------
 evaluation/evaluate_registry.py               |  2 +-
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/evaluation/benchmarks/needle_in_haystack/__init__.py b/evaluation/benchmarks/needle_in_haystack/__init__.py
index e69de29b..9adfdfcd 100644
--- a/evaluation/benchmarks/needle_in_haystack/__init__.py
+++ b/evaluation/benchmarks/needle_in_haystack/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
index 06545629..76c8a2e3 100644
--- a/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
+++ b/evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
@@ -2,13 +2,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import pandas as pd
-from rouge_score import rouge_scorer
+from rouge import Rouge
 
-scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
+scorer = Rouge()
 
-def calculate_metrics(df: pd.DataFrame) -> dict:
+
+def calculate_metrics(df: pd.DataFrame) -> list[dict]:
     scores = []
     for index, row in df.iterrows():
-        score = scorer.score(row["needle"], row["predicted_answer"])["rouge1"].fmeasure * 10
+        score = scorer.get_scores(row["needle"], row["predicted_answer"])[0]
         scores.append(score)
-    return {"rouge1": sum(scores) / len(scores)}
\ No newline at end of file
+    return scores
diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index dd28655f..d3688d55 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -88,9 +88,10 @@ def __post_init__(self):
         # Initialize model_kwargs if None
         if self.model_kwargs is None:
             self.model_kwargs = {}
-        
+
         if self.dataset == "needle_in_haystack":
             assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack"
+            assert self.max_context_length is not None, "max_context_length must be set for needle_in_haystack"
 
     def get_results_dir(self, output_dir: Path) -> Path:
         """
@@ -125,6 +126,8 @@ def get_results_dir(self, output_dir: Path) -> Path:
             components.append("compressed_questions")
         if self.key_channel_compression_ratio is not None:
             components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}")
+        if self.needle_depth is not None:
+            components.append(f"needle_depth{self.needle_depth:.1f}")
 
         dir_name = "__".join(filter(None, components))  # Filter None/empty strings
         config_dir = output_dir / dir_name
@@ -208,8 +211,7 @@ def _setup_deterministic_seeds(self):
     def _setup_logging(self):
         """Configures the logging level based on the config."""
         log_level = self.config.log_level.upper()
-        logging.basicConfig(level=logging.INFO)
-        # logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
+        logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
 
     def _setup_directories(self) -> Path:
         """
@@ -337,7 +339,6 @@ def _setup_model_pipeline(self):
 
         logger.info("Model pipeline loaded.")
 
-    
     def _insert_needle_in_haystack(self):
         """
         Inserts the needle in the haystack at the depth specified in the config.
@@ -346,23 +347,28 @@ def _insert_needle_in_haystack(self):
         """
         logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {self.df['needle'][0]}")
         tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False)
-        context_length = self.config.max_context_length - len(tokenized_needle) - 150 # account for system prompts
+        context_length = self.config.max_context_length - len(tokenized_needle) - 150  # account for system prompts
         needle_index = int(context_length * self.config.needle_depth / 100)
         # tokenize the context
-        self.df["context"] = self.df["context"].apply(lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length])
+        self.df["context"] = self.df["context"].apply(
+            lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]
+        )
         # insert the needle at the depth specified in the config
         self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
         # detokenize the context
-        self.df["context"] = "This is a very long story book: <book> " + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True)) + " </book>."
+        self.df["context"] = (
+            "This is a very long story book: <book> "
+            + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True))
+            + " </book>."
+        )
 
     def _prepare_data_for_inference(self):
         """
         Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
         """
         compress_questions = self.config.compress_questions
-        
-        
-        # if we have needle in a haystack, we need to tokenize the dataset, cut it to max_context_length, insert the needle at depth n%max_context_length, and then detokenize it
+
+        # if we have needle in a haystack, we need to insert it in the context
         if self.config.dataset == "needle_in_haystack":
             self._insert_needle_in_haystack()
 
diff --git a/evaluation/evaluate_registry.py b/evaluation/evaluate_registry.py
index 4798bb39..0dc4bd03 100644
--- a/evaluation/evaluate_registry.py
+++ b/evaluation/evaluate_registry.py
@@ -6,9 +6,9 @@
 from benchmarks.longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e
 from benchmarks.longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer
 from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer
+from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer
 from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer
 from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer
-from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer
 
 from kvpress import (
     AdaKVPress,

From a46c963a2220881a15158567b3765eb854993d64 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 08:59:49 +0000
Subject: [PATCH 4/9] add inits

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 evaluation/benchmarks/infinite_bench/__init__.py | 2 ++
 evaluation/benchmarks/longbench/__init__.py      | 2 ++
 evaluation/benchmarks/longbenchv2/__init__.py    | 2 ++
 3 files changed, 6 insertions(+)
 create mode 100644 evaluation/benchmarks/infinite_bench/__init__.py
 create mode 100644 evaluation/benchmarks/longbench/__init__.py
 create mode 100644 evaluation/benchmarks/longbenchv2/__init__.py

diff --git a/evaluation/benchmarks/infinite_bench/__init__.py b/evaluation/benchmarks/infinite_bench/__init__.py
new file mode 100644
index 00000000..9adfdfcd
--- /dev/null
+++ b/evaluation/benchmarks/infinite_bench/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evaluation/benchmarks/longbench/__init__.py b/evaluation/benchmarks/longbench/__init__.py
new file mode 100644
index 00000000..9adfdfcd
--- /dev/null
+++ b/evaluation/benchmarks/longbench/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/evaluation/benchmarks/longbenchv2/__init__.py b/evaluation/benchmarks/longbenchv2/__init__.py
new file mode 100644
index 00000000..9adfdfcd
--- /dev/null
+++ b/evaluation/benchmarks/longbenchv2/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0

From 74128750342939a04e2b624b8977ff2ca59e4039 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 10:54:56 +0000
Subject: [PATCH 5/9] refactor

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 .../benchmarks/needle_in_haystack/utils.py    | 49 +++++++++++++++++++
 evaluation/evaluate.py                        | 34 +++----------
 evaluation/evaluate_config.yaml               |  7 +--
 tests/integration/test_ruler.py               |  2 +-
 4 files changed, 62 insertions(+), 30 deletions(-)
 create mode 100644 evaluation/benchmarks/needle_in_haystack/utils.py

diff --git a/evaluation/benchmarks/needle_in_haystack/utils.py b/evaluation/benchmarks/needle_in_haystack/utils.py
new file mode 100644
index 00000000..14717cde
--- /dev/null
+++ b/evaluation/benchmarks/needle_in_haystack/utils.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+
+import pandas as pd
+from transformers import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+
+def insert_needle_in_haystack(
+    df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_context_length: int, needle_depth: int
+) -> pd.DataFrame:
+    """
+    Inserts the "needle" string into the "context" of each row in the DataFrame at a specified depth.
+    Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        The input DataFrame with columns "context" and "needle"; modified in place. The first row's needle is used.
+    tokenizer : PreTrainedTokenizer
+        The tokenizer used to encode and decode the context and needle.
+    max_context_length : int
+        The token budget: each haystack is cut to this length minus the needle and a 150-token prompt margin.
+    needle_depth : int
+        The percentage (0-100) of the truncated haystack's own length at which the needle is inserted.
+
+    Returns
+    -------
+    pd.DataFrame
+        The DataFrame with the "context" column modified to include the needle at the specified depth.
+    """
+    needle = df["needle"].iloc[0]  # positional: label 0 may be absent once the frame has been sampled
+    logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {needle}")
+    tokenized_needle = tokenizer.encode(needle, add_special_tokens=False)
+    context_length = max_context_length - len(tokenized_needle) - 150  # account for system prompts
+
+    def _with_needle(text: str) -> str:
+        tokens = tokenizer.encode(text, add_special_tokens=False)[:context_length]
+        # depth is relative to the actual haystack, which can be shorter than the token budget
+        needle_index = int(len(tokens) * needle_depth / 100)
+        tokens = tokens[:needle_index] + tokenized_needle + tokens[needle_index:]
+        decoded = tokenizer.decode(tokens, skip_special_tokens=True)
+        return "This is a very long story book: <book> " + decoded + " </book>."
+
+    df["context"] = df["context"].apply(_with_needle)
+    return df
diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index d3688d55..cc44a9c8 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -10,9 +10,10 @@
 from typing import Any, Dict, Optional
 
 import numpy as np
-import pandas as pd  # Import pandas for DataFrame type hinting
+import pandas as pd
 import torch
 import yaml  # type: ignore[import-untyped]
+from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack
 from datasets import load_dataset
 from evaluate_registry import DATASET_REGISTRY, PRESS_REGISTRY, SCORER_REGISTRY
 from fire import Fire
@@ -286,7 +287,9 @@ def _setup_press(self):
                 press.compression_ratio = compression_ratio
                 logger.info(f"Set {press.__class__.__name__} compression_ratio to {compression_ratio}")
             else:
-                logger.warning(f"Press {press.__class__.__name__} has no 'compression_ratio' attribute.")
+                logger.warning(
+                    f"Press {press.__class__.__name__} has no 'compression_ratio' attribute. This is expected is you set `no_press`."
+                )
 
         self.press = press
         # Set the press info in the config for saving to YAML
@@ -339,29 +342,6 @@ def _setup_model_pipeline(self):
 
         logger.info("Model pipeline loaded.")
 
-    def _insert_needle_in_haystack(self):
-        """
-        Inserts the needle in the haystack at the depth specified in the config.
-        Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack
-        To insert the needle, we need to tokenize the context, insert the needle at the depth specified in the config, and then detokenize it.
-        """
-        logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {self.df['needle'][0]}")
-        tokenized_needle = self.pipeline.tokenizer.encode(self.df["needle"][0], add_special_tokens=False)
-        context_length = self.config.max_context_length - len(tokenized_needle) - 150  # account for system prompts
-        needle_index = int(context_length * self.config.needle_depth / 100)
-        # tokenize the context
-        self.df["context"] = self.df["context"].apply(
-            lambda x: self.pipeline.tokenizer.encode(x, add_special_tokens=False)[:context_length]
-        )
-        # insert the needle at the depth specified in the config
-        self.df["context"] = self.df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
-        # detokenize the context
-        self.df["context"] = (
-            "This is a very long story book: <book> "
-            + self.df["context"].apply(lambda x: self.pipeline.tokenizer.decode(x, skip_special_tokens=True))
-            + " </book>."
-        )
-
     def _prepare_data_for_inference(self):
         """
         Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
@@ -370,7 +350,9 @@ def _prepare_data_for_inference(self):
 
         # if we have needle in a haystack, we need to insert it in the context
         if self.config.dataset == "needle_in_haystack":
-            self._insert_needle_in_haystack()
+            self.df = insert_needle_in_haystack(
+                self.df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth
+            )
 
         if isinstance(self.press, FinchPress):
             if not compress_questions:
diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml
index 9d23734d..f4f4cdec 100644
--- a/evaluation/evaluate_config.yaml
+++ b/evaluation/evaluate_config.yaml
@@ -5,16 +5,17 @@ output_dir: "./results"
 
 model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
 dataset: "ruler"                                  # see DATASET_REGISTRY in evaluate_registry.py
-data_dir: "4096"                                  # Subdirectory of the dataset (if applicable)
+data_dir: "4096"                                  # Subdirectory of the dataset (if applicable); otherwise set to null (unquoted)
 
 press_name: "knorm"                                # see PRESS_REGISTRY in evaluate_registry.py
-compression_ratio: 1.0                             # Compression ratio for the press (0.0 to 1.0)
+compression_ratio: 0.5                             # Compression ratio for the press (0.0 to 1.0)
 key_channel_compression_ratio: null                # For ThinKPress and ComposedPress (0.0 to 1.0)
 
 fraction: 1.0                                     # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing
 max_new_tokens: null                              # Maximum new tokens to generate (null = use dataset default)
 max_context_length: null                          # Maximum context length (null = use model maximum)
-compress_questions: false                          # Whether to compress questions with context
+compress_questions: false                         # Whether to compress questions with context
+needle_depth: null                                # Depth percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset
 
 device: null  # Device to use (null = auto-detect, "cuda:0", "cpu", etc.)
 
diff --git a/tests/integration/test_ruler.py b/tests/integration/test_ruler.py
index 675f92f6..38495821 100644
--- a/tests/integration/test_ruler.py
+++ b/tests/integration/test_ruler.py
@@ -4,7 +4,7 @@
 import datasets
 import pytest
 import torch
-from transformers import DynamicCache, QuantoQuantizedCache, QuantizedCacheConfig
+from transformers import DynamicCache, QuantizedCacheConfig, QuantoQuantizedCache
 from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available
 
 from tests.default_presses import default_presses

From be5b6dc0b601bd9cdd592c5c75038acaf0ced4bb Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 14:05:14 +0000
Subject: [PATCH 6/9] refactor eval

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 evaluation/evaluate.py | 133 +++++++++++++++++++----------------------
 1 file changed, 61 insertions(+), 72 deletions(-)

diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index cc44a9c8..4fc67366 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -195,25 +195,24 @@ def __init__(self, config: EvaluationConfig):
 
     def _setup_deterministic_seeds(self):
         """Set deterministic seeds for reproducible results."""
-        seed = self.config.seed
-
-        torch.manual_seed(seed)
-        np.random.seed(seed)
-        random.seed(seed)
+        torch.manual_seed(self.config.seed)
+        np.random.seed(self.config.seed)
+        random.seed(self.config.seed)
 
         if torch.cuda.is_available():
-            torch.cuda.manual_seed(seed)
-            torch.cuda.manual_seed_all(seed)
+            torch.cuda.manual_seed(self.config.seed)
+            torch.cuda.manual_seed_all(self.config.seed)
             torch.backends.cudnn.deterministic = True
             torch.backends.cudnn.benchmark = False
+        logger.info(f"Set deterministic seeds to {self.config.seed}")
 
-        logger.info(f"Set deterministic seeds to {seed}")
 
     def _setup_logging(self):
         """Configures the logging level based on the config."""
         log_level = self.config.log_level.upper()
         logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
 
+
     def _setup_directories(self) -> Path:
         """
         Creates the output directory for saving results if it doesn't exist.
@@ -228,24 +227,6 @@ def _setup_directories(self) -> Path:
         logger.info(f"Output directory set to: {output_dir}")
         return output_dir
 
-    def _load_dataset(self):
-        """
-        Loads the dataset specified in the config and applies sampling/filtering.
-        """
-        dataset_name = self.config.dataset
-        data_dir = str(self.config.data_dir) if self.config.data_dir else None
-        fraction = self.config.fraction
-
-        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
-        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()
-
-        if fraction < 1.0:
-            original_len = len(df)
-            df = df.sample(frac=fraction, random_state=self.config.seed)
-            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")
-
-        self.df = df
-        logger.info(f"Dataset loaded with {len(self.df)} entries.")
 
     def _setup_press(self):
         """
@@ -296,6 +277,50 @@ def _setup_press(self):
         self.config.press_init_command = str(press)
         logger.info(f"KV Press '{press_name}' setup.")
 
+
+    def _load_and_prepare_dataset(self):
+        """
+        Loads the dataset specified in the config and applies sampling/filtering.
+        """
+        dataset_name = self.config.dataset
+        data_dir = str(self.config.data_dir) if self.config.data_dir else None
+        fraction = self.config.fraction
+
+        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
+        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()
+
+        if fraction < 1.0:
+            original_len = len(df)
+            df = df.sample(frac=fraction, random_state=self.config.seed)
+            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")
+
+        logger.info(f"Dataset loaded with {len(df)} entries.")
+
+        # if we have needle in a haystack, we need to insert it in the context
+        if self.config.dataset == "needle_in_haystack":
+            df = insert_needle_in_haystack(
+                df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth
+            )
+
+        if isinstance(self.press, FinchPress):
+            if not self.config.compress_questions:
+                logger.error("FinchPress requires 'compress_questions' to be set to True.")
+                raise ValueError("FinchPress requires compress_questions to be set to True")
+            # FinchPress uses a delimiter token to separate context and question
+            # So we need to update the tokenizer and the model embeddings.
+            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
+            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
+            df["context"] = df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]
+
+        if self.config.compress_questions:
+            logger.info("Compressing questions into context.")
+            df["context"] = df["context"] + df["question"]  # type: ignore[index]
+            df["question"] = ""  # type: ignore[index]
+
+        self.df = df
+        logger.info(f"Dataset processed with {len(self.df)} entries.")
+
+
     def _setup_model_pipeline(self):
         model_name = self.config.model
         device = self.config.device
@@ -319,55 +344,20 @@ def _setup_model_pipeline(self):
                 pass
 
         logger.info(f"Loading model pipeline for: {model_name} on device: {device} with model_kwargs: {model_kwargs}")
+        pipeline_kwargs = {
+            "model": model_name,
+            "model_kwargs": model_kwargs,
+            "trust_remote_code": True,
+        }
         if device == "auto":
-            self.pipeline = pipeline(
-                "kv-press-text-generation",
-                model=model_name,
-                device_map="auto",
-                model_kwargs=model_kwargs,
-                trust_remote_code=True,
-            )
+            pipeline_kwargs["device_map"] = "auto"
         else:
-            self.pipeline = pipeline(
-                "kv-press-text-generation",
-                model=model_name,
-                device=device,
-                model_kwargs=model_kwargs,
-                trust_remote_code=True,
-            )
-
-        # Ensure model is in eval mode for deterministic inference
-        if hasattr(self.pipeline, "model"):
-            self.pipeline.model.eval()
+            pipeline_kwargs["device"] = device
+        self.pipeline = pipeline("kv-press-text-generation", **pipeline_kwargs)
 
+        self.pipeline.model.eval()
         logger.info("Model pipeline loaded.")
 
-    def _prepare_data_for_inference(self):
-        """
-        Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
-        """
-        compress_questions = self.config.compress_questions
-
-        # if we have needle in a haystack, we need to insert it in the context
-        if self.config.dataset == "needle_in_haystack":
-            self.df = insert_needle_in_haystack(
-                self.df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth
-            )
-
-        if isinstance(self.press, FinchPress):
-            if not compress_questions:
-                logger.error("FinchPress requires 'compress_questions' to be set to True.")
-                raise ValueError("FinchPress requires compress_questions to be set to True")
-            # FinchPress uses a delimiter token to separate context and question
-            # So we need to update the tokenizer and the model embeddings.
-            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
-            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
-            self.df["context"] = self.df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]
-
-        if compress_questions:
-            logger.info("Compressing questions into context.")
-            self.df["context"] = self.df["context"] + self.df["question"]  # type: ignore[index]
-            self.df["question"] = ""  # type: ignore[index]
 
     @torch.inference_mode()
     def _run_inference(self):
@@ -458,10 +448,9 @@ def run_evaluation(self):
             )
             return
 
-        self._load_dataset()
         self._setup_press()
         self._setup_model_pipeline()
-        self._prepare_data_for_inference()
+        self._load_and_prepare_dataset()
 
         self._run_inference()
         self._save_results(predictions_filename)

From e96247dea1f4b55a58268e3913e520da7a088016 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 14:05:38 +0000
Subject: [PATCH 7/9] add niah

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 evaluation/README.md | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/evaluation/README.md b/evaluation/README.md
index bd78f400..350bd637 100644
--- a/evaluation/README.md
+++ b/evaluation/README.md
@@ -41,12 +41,13 @@ We support evaluation with all the presses implemented in the library (and possi
 
 At the moment, we support the following standard popular benchmarks:
 
-- [Loogle](loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
-- [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
-- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
-- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
-- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
-- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [Loogle](benchmarks/loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
+- [RULER](benchmarks/ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
+- [Zero Scrolls](benchmarks/zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
+- [Infinitebench](benchmarks/infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
+- [longbench](benchmarks/longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
+- [longbench-v2](benchmarks/longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
+- [Needle in a Haystack](benchmarks/needle_in_haystack/README.md) ([hf link: Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays))
 
 📚 **For detailed information** about each dataset or implementing custom benchmarks, see the individual README files in the benchmarks directory.
 

From 2b7d181008a10da3c8578378e5dcd56ddda95264 Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 14:14:02 +0000
Subject: [PATCH 8/9] style

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 evaluation/evaluate.py          | 6 ------
 evaluation/evaluate_config.yaml | 2 +-
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
index 4fc67366..6b2b497d 100644
--- a/evaluation/evaluate.py
+++ b/evaluation/evaluate.py
@@ -206,13 +206,11 @@ def _setup_deterministic_seeds(self):
             torch.backends.cudnn.benchmark = False
         logger.info(f"Set deterministic seeds to {self.config.seed}")
 
-
     def _setup_logging(self):
         """Configures the logging level based on the config."""
         log_level = self.config.log_level.upper()
         logging.basicConfig(level=getattr(logging, log_level), format="%(asctime)s - %(levelname)s - %(message)s")
 
-
     def _setup_directories(self) -> Path:
         """
         Creates the output directory for saving results if it doesn't exist.
@@ -227,7 +225,6 @@ def _setup_directories(self) -> Path:
         logger.info(f"Output directory set to: {output_dir}")
         return output_dir
 
-
     def _setup_press(self):
         """
         Initializes the KVPress instance and applies compression ratios based on its type.
@@ -277,7 +274,6 @@ def _setup_press(self):
         self.config.press_init_command = str(press)
         logger.info(f"KV Press '{press_name}' setup.")
 
-
     def _load_and_prepare_dataset(self):
         """
         Loads the dataset specified in the config and applies sampling/filtering.
@@ -320,7 +316,6 @@ def _load_and_prepare_dataset(self):
         self.df = df
         logger.info(f"Dataset processed with {len(self.df)} entries.")
 
-
     def _setup_model_pipeline(self):
         model_name = self.config.model
         device = self.config.device
@@ -358,7 +353,6 @@ def _setup_model_pipeline(self):
         self.pipeline.model.eval()
         logger.info("Model pipeline loaded.")
 
-
     @torch.inference_mode()
     def _run_inference(self):
         """
diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml
index f4f4cdec..ba5d6714 100644
--- a/evaluation/evaluate_config.yaml
+++ b/evaluation/evaluate_config.yaml
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-output_dir: "./results"
+output_dir: "./test_eval"
 
 model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
 dataset: "ruler"                                  # see DATASET_REGISTRY in evaluate_registry.py

From 0eb4fa6a5ed88fa46ca145d494ffe3034a94105e Mon Sep 17 00:00:00 2001
From: alessiodevoto <devoto.alessio@gmail.com>
Date: Fri, 22 Aug 2025 14:21:15 +0000
Subject: [PATCH 9/9] revert

Signed-off-by: alessiodevoto <devoto.alessio@gmail.com>
---
 evaluation/evaluate_config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evaluation/evaluate_config.yaml b/evaluation/evaluate_config.yaml
index ba5d6714..f4f4cdec 100644
--- a/evaluation/evaluate_config.yaml
+++ b/evaluation/evaluate_config.yaml
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-output_dir: "./test_eval"
+output_dir: "./results"
 
 model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
 dataset: "ruler"                                  # see DATASET_REGISTRY in evaluate_registry.py