13 changes: 7 additions & 6 deletions evaluation/README.md
@@ -41,12 +41,13 @@ We support evaluation with all the presses implemented in the library (and possi

At the moment, we support the following popular benchmarks:

- [Loogle](loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
- [RULER](ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
- [Zero Scrolls](zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
- [Infinitebench](infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
- [longbench](longbench/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
- [longbench-v2](longbenchv2/README.md)([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
- [Loogle](benchmarks/loogle/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/loogle))
- [RULER](benchmarks/ruler/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/ruler))
- [Zero Scrolls](benchmarks/zero_scrolls/README.md) ([hf link](https://huggingface.co/datasets/simonjegou/zero_scrolls))
- [Infinitebench](benchmarks/infinite_bench/README.md) ([hf link](https://huggingface.co/datasets/MaxJeblick/InfiniteBench))
- [longbench](benchmarks/longbench/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench))
- [longbench-v2](benchmarks/longbenchv2/README.md) ([hf link](https://huggingface.co/datasets/Xnhyacinth/LongBench-v2))
- [Needle in a Haystack](benchmarks/needle_in_haystack/README.md) ([hf link](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays))

📚 **For detailed information** about each dataset, or to implement a custom benchmark, see the individual README files in the benchmarks directory.
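
For example, a needle-in-a-haystack run can be configured programmatically. This is a minimal sketch: `EvaluationConfig` is defined in `evaluation/evaluate.py`, the field names mirror `evaluate_config.yaml`, and in practice the `fire`-based CLI in `evaluate.py` builds this config from command-line flags:

```python
from evaluate import EvaluationConfig  # defined in evaluation/evaluate.py

config = EvaluationConfig(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    dataset="needle_in_haystack",  # key into DATASET_REGISTRY
    press_name="knorm",            # key into PRESS_REGISTRY
    compression_ratio=0.5,
    max_context_length=4096,       # required for needle_in_haystack
    needle_depth=50,               # insert the needle 50% into the context
)
```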

2 changes: 2 additions & 0 deletions evaluation/benchmarks/infinite_bench/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
2 changes: 2 additions & 0 deletions evaluation/benchmarks/longbench/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
2 changes: 2 additions & 0 deletions evaluation/benchmarks/longbenchv2/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
3 changes: 3 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/README.md
@@ -0,0 +1,3 @@
# Needle in a Haystack
This benchmark evaluates a model's ability to retrieve a specific piece of information, the "needle," hidden within a large body of text, the "haystack." The test challenges a model's long-context understanding and its ability to maintain information accuracy over increasing document lengths.
We follow the vast majority of the literature and use [Paul Graham's essays](https://huggingface.co/datasets/alessiodevoto/paul_graham_essays) as the haystack.
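
A minimal sketch of the mechanics (illustrative values only; the repo's actual implementation is `insert_needle_in_haystack` in `utils.py` below):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the sketch
haystack = "Before college the two main things I worked on were writing and programming. " * 200
needle = "The magic number is 42."
depth = 50  # insert the needle 50% of the way into the context

haystack_ids = tokenizer.encode(haystack, add_special_tokens=False)
needle_ids = tokenizer.encode(needle, add_special_tokens=False)
idx = int(len(haystack_ids) * depth / 100)
context = tokenizer.decode(haystack_ids[:idx] + needle_ids + haystack_ids[idx:])
# The model is then prompted with `context` and asked to recall the magic number.
```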
2 changes: 2 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/__init__.py
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
15 changes: 15 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/calculate_metrics.py
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import pandas as pd
from rouge import Rouge

scorer = Rouge()


def calculate_metrics(df: pd.DataFrame) -> list[dict]:
    # Score each predicted answer against its needle with ROUGE;
    # returns one dict of ROUGE-1/2/L precision/recall/F1 per row.
    scores = []
    for _, row in df.iterrows():
        score = scorer.get_scores(row["needle"], row["predicted_answer"])[0]
        scores.append(score)
    return scores
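
A quick usage sketch (toy data; assumes the module above is importable and the `rouge` package is installed):

```python
import pandas as pd

from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics

df = pd.DataFrame(
    {
        "needle": ["The magic number is 42."],
        "predicted_answer": ["The magic number is 42."],
    }
)
scores = calculate_metrics(df)
print(scores[0]["rouge-l"])  # perfect match, e.g. {'r': 1.0, 'p': 1.0, 'f': ...}
```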
49 changes: 49 additions & 0 deletions evaluation/benchmarks/needle_in_haystack/utils.py
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import logging

import pandas as pd
from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)


def insert_needle_in_haystack(
    df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_context_length: int, needle_depth: int
) -> pd.DataFrame:
    """
    Inserts the "needle" string into the "context" of each row in the DataFrame at a specified depth.
    Adapted from the original implementation: https://github.com/gkamradt/LLMTest_NeedleInAHaystack

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing at least the columns "context" and "needle".
    tokenizer : PreTrainedTokenizer
        The tokenizer used to encode and decode the context and needle.
    max_context_length : int
        The maximum allowed length (in tokens) for the context, including the needle.
    needle_depth : int
        The percentage (0-100) indicating how deep into the context the needle should be inserted.

    Returns
    -------
    pd.DataFrame
        The DataFrame with the "context" column modified to include the needle at the specified depth.
    """
    logger.info(f"Preparing dataset for inference with needle in haystack. Needle: {df['needle'][0]}")
    tokenized_needle = tokenizer.encode(df["needle"][0], add_special_tokens=False)
    context_length = max_context_length - len(tokenized_needle) - 150  # account for system prompts
    needle_index = int(context_length * needle_depth / 100)
    # tokenize the context
    df["context"] = df["context"].apply(lambda x: tokenizer.encode(x, add_special_tokens=False)[:context_length])
    # insert the needle at the depth specified in the config
    df["context"] = df["context"].apply(lambda x: x[:needle_index] + tokenized_needle + x[needle_index:])
    # detokenize the context
    df["context"] = (
        "This is a very long story book: <book> "
        + df["context"].apply(lambda x: tokenizer.decode(x, skip_special_tokens=True))
        + " </book>."
    )
    return df
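
A usage sketch with toy values (in the evaluation pipeline this is called with the pipeline tokenizer and the config's `max_context_length` and `needle_depth`):

```python
import pandas as pd
from transformers import AutoTokenizer

from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer choice
df = pd.DataFrame(
    {
        "context": ["All I know is this sentence, repeated over and over. " * 500],
        "needle": ["The magic number is 42."],
    }
)
df = insert_needle_in_haystack(df, tokenizer, max_context_length=1024, needle_depth=50)
print(df["context"][0][:60])  # "This is a very long story book: <book> ..."
```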
144 changes: 71 additions & 73 deletions evaluation/evaluate.py
@@ -10,9 +10,10 @@
from typing import Any, Dict, Optional

import numpy as np
import pandas as pd # Import pandas for DataFrame type hinting
import pandas as pd
import torch
import yaml # type: ignore[import-untyped]
from benchmarks.needle_in_haystack.utils import insert_needle_in_haystack
from datasets import load_dataset
from evaluate_registry import DATASET_REGISTRY, PRESS_REGISTRY, SCORER_REGISTRY
from fire import Fire
@@ -42,6 +43,7 @@ class EvaluationConfig:
    max_new_tokens: Optional[int] = None
    max_context_length: Optional[int] = None
    compress_questions: bool = False
    needle_depth: Optional[int] = None

    # Output and logging
    output_dir: str = "./results"
@@ -88,6 +90,10 @@ def __post_init__(self):
        if self.model_kwargs is None:
            self.model_kwargs = {}

        if self.dataset == "needle_in_haystack":
            assert self.needle_depth is not None, "needle_depth must be set for needle_in_haystack"
            assert self.max_context_length is not None, "max_context_length must be set for needle_in_haystack"

    def get_results_dir(self, output_dir: Path) -> Path:
        """
        Generates the unique save directory and filenames based on configuration parameters.
@@ -121,6 +127,8 @@ def get_results_dir(self, output_dir: Path) -> Path:
components.append("compressed_questions")
if self.key_channel_compression_ratio is not None:
components.append(f"key_channel_cr{self.key_channel_compression_ratio:.2f}")
if self.needle_depth is not None:
components.append(f"needle_depth{self.needle_depth:.1f}")

dir_name = "__".join(filter(None, components)) # Filter None/empty strings
config_dir = output_dir / dir_name
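
For illustration, the joined components produce directory names of roughly this shape (the exact prefix components come from code collapsed in this diff, so treat the example as an assumption):

```python
components = ["knorm", "cr0.50", "needle_depth50.0"]  # assumed upstream components
dir_name = "__".join(filter(None, components))
print(dir_name)  # knorm__cr0.50__needle_depth50.0
```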
@@ -187,19 +195,16 @@ def __init__(self, config: EvaluationConfig):

    def _setup_deterministic_seeds(self):
        """Set deterministic seeds for reproducible results."""
        seed = self.config.seed

        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(self.config.seed)
        np.random.seed(self.config.seed)
        random.seed(self.config.seed)

        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.cuda.manual_seed(self.config.seed)
            torch.cuda.manual_seed_all(self.config.seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        logger.info(f"Set deterministic seeds to {seed}")
        logger.info(f"Set deterministic seeds to {self.config.seed}")

    def _setup_logging(self):
        """Configures the logging level based on the config."""
@@ -220,25 +225,6 @@ def _setup_directories(self) -> Path:
logger.info(f"Output directory set to: {output_dir}")
return output_dir

    def _load_dataset(self):
        """
        Loads the dataset specified in the config and applies sampling/filtering.
        """
        dataset_name = self.config.dataset
        data_dir = str(self.config.data_dir) if self.config.data_dir else None
        fraction = self.config.fraction

        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()

        if fraction < 1.0:
            original_len = len(df)
            df = df.sample(frac=fraction, random_state=self.config.seed)
            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")

        self.df = df
        logger.info(f"Dataset loaded with {len(self.df)} entries.")

    def _setup_press(self):
        """
        Initializes the KVPress instance and applies compression ratios based on its type.
@@ -279,13 +265,57 @@ def _setup_press(self):
            press.compression_ratio = compression_ratio
            logger.info(f"Set {press.__class__.__name__} compression_ratio to {compression_ratio}")
        else:
            logger.warning(f"Press {press.__class__.__name__} has no 'compression_ratio' attribute.")
            logger.warning(
                f"Press {press.__class__.__name__} has no 'compression_ratio' attribute. This is expected if you set `no_press`."
            )

        self.press = press
        # Set the press info in the config for saving to YAML
        self.config.press_init_command = str(press)
        logger.info(f"KV Press '{press_name}' setup.")

    def _load_and_prepare_dataset(self):
        """
        Loads the dataset specified in the config and applies sampling/filtering.
        """
        dataset_name = self.config.dataset
        data_dir = str(self.config.data_dir) if self.config.data_dir else None
        fraction = self.config.fraction

        logger.info(f"Loading dataset: {DATASET_REGISTRY[dataset_name]} (data_dir: {data_dir})")
        df = load_dataset(DATASET_REGISTRY[dataset_name], data_dir=data_dir, split="test").to_pandas()

        if fraction < 1.0:
            original_len = len(df)
            df = df.sample(frac=fraction, random_state=self.config.seed)
            logger.info(f"Sampled {len(df)} samples ({fraction:.2f}) from original {original_len} samples.")

        logger.info(f"Dataset loaded with {len(df)} entries.")

        # if we have needle in a haystack, we need to insert it in the context
        if self.config.dataset == "needle_in_haystack":
            df = insert_needle_in_haystack(
                df, self.pipeline.tokenizer, self.config.max_context_length, self.config.needle_depth
            )

        if isinstance(self.press, FinchPress):
            if not self.config.compress_questions:
                logger.error("FinchPress requires 'compress_questions' to be set to True.")
                raise ValueError("FinchPress requires compress_questions to be set to True")
            # FinchPress uses a delimiter token to separate context and question
            # So we need to update the tokenizer and the model embeddings.
            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
            df["context"] = df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]

        if self.config.compress_questions:
            logger.info("Compressing questions into context.")
            df["context"] = df["context"] + df["question"]  # type: ignore[index]
            df["question"] = ""  # type: ignore[index]

        self.df = df
        logger.info(f"Dataset processed with {len(self.df)} entries.")

    def _setup_model_pipeline(self):
        model_name = self.config.model
        device = self.config.device
@@ -295,7 +325,6 @@ def _setup_model_pipeline(self):
logger.info(f"No device specified, auto-detected device: {device}")

model_kwargs = self.config.model_kwargs or {}
print(model_kwargs)
if isinstance(self.press, ObservedAttentionPress):
model_kwargs["attn_implementation"] = "eager"
logger.info("ObservedAttentionPress detected, setting attn_implementation to 'eager'.")
@@ -310,50 +339,20 @@ def _setup_model_pipeline(self):
            pass

logger.info(f"Loading model pipeline for: {model_name} on device: {device} with model_kwargs: {model_kwargs}")
pipeline_kwargs = {
"model": model_name,
"model_kwargs": model_kwargs,
"trust_remote_code": True,
}
if device == "auto":
self.pipeline = pipeline(
"kv-press-text-generation",
model=model_name,
device_map="auto",
model_kwargs=model_kwargs,
trust_remote_code=True,
)
pipeline_kwargs["device_map"] = "auto"
else:
self.pipeline = pipeline(
"kv-press-text-generation",
model=model_name,
device=device,
model_kwargs=model_kwargs,
trust_remote_code=True,
)

# Ensure model is in eval mode for deterministic inference
if hasattr(self.pipeline, "model"):
self.pipeline.model.eval()
pipeline_kwargs["device"] = device
self.pipeline = pipeline("kv-press-text-generation", **pipeline_kwargs)

self.pipeline.model.eval()
logger.info("Model pipeline loaded.")

    def _prepare_data_for_inference(self):
        """
        Prepares the dataset for inference, handling `compress_questions` and `FinchPress` specifics.
        """
        compress_questions = self.config.compress_questions

        if isinstance(self.press, FinchPress):
            if not compress_questions:
                logger.error("FinchPress requires 'compress_questions' to be set to True.")
                raise ValueError("FinchPress requires compress_questions to be set to True")
            # FinchPress uses a delimiter token to separate context and question
            # So we need to update the tokenizer and the model embeddings.
            logger.info("FinchPress detected, updating model and tokenizer with delimiter token.")
            self.press.update_model_and_tokenizer(self.pipeline.model, self.pipeline.tokenizer)  # type: ignore[attr-defined]
            self.df["context"] = self.df["context"] + self.press.delimiter_token  # type: ignore[attr-defined, index]

        if compress_questions:
            logger.info("Compressing questions into context.")
            self.df["context"] = self.df["context"] + self.df["question"]  # type: ignore[index]
            self.df["question"] = ""  # type: ignore[index]

    @torch.inference_mode()
    def _run_inference(self):
        """
Expand Down Expand Up @@ -400,7 +399,7 @@ def _save_results(self, save_filename: Path):
        if save_filename.exists():
            logger.warning(f"Results CSV already exists at {save_filename}. Overwriting.")

        self.df[["predicted_answer", "compression_ratio"]].to_csv(str(save_filename), index=False)  # type: ignore[index]
        self.df[list(set(self.df.columns) - set(["context"]))].to_csv(str(save_filename), index=False)  # type: ignore[index]
        logger.info(f"Results saved to {save_filename}")

    def _calculate_and_save_metrics(self, save_filename: Path):
@@ -443,10 +442,9 @@ def run_evaluation(self):
            )
            return

        self._load_dataset()
        self._setup_press()
        self._setup_model_pipeline()
        self._prepare_data_for_inference()
        self._load_and_prepare_dataset()

        self._run_inference()
        self._save_results(predictions_filename)
7 changes: 4 additions & 3 deletions evaluation/evaluate_config.yaml
@@ -5,16 +5,17 @@ output_dir: "./results"

model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
dataset: "ruler" # see DATASET_REGISTRY in evaluate_registry.py
data_dir: "4096" # Subdirectory of the dataset (if applicable)
data_dir: "4096" # Subdirectory of the dataset (if applicable) else leave "null"

press_name: "knorm" # see PRESS_REGISTRY in evaluate_registry.py
compression_ratio: 1.0 # Compression ratio for the press (0.0 to 1.0)
compression_ratio: 0.5 # Compression ratio for the press (0.0 to 1.0)
key_channel_compression_ratio: null # For ThinKPress and ComposedPress (0.0 to 1.0)

fraction: 1.0 # Fraction of dataset to evaluate (0.0 to 1.0), for quick testing
max_new_tokens: null # Maximum new tokens to generate (null = use dataset default)
max_context_length: null # Maximum context length (null = use model maximum)
compress_questions: false # Whether to compress questions with context
compress_questions: false # Whether to compress questions with context
needle_depth: null # Depth percentage of the needle in the haystack (0 to 100), only for needle_in_haystack dataset

device: null # Device to use (null = auto-detect, "cuda:0", "cpu", etc.)

3 changes: 3 additions & 0 deletions evaluation/evaluate_registry.py
@@ -6,6 +6,7 @@
from benchmarks.longbench.calculate_metrics import calculate_metrics_e as longbench_scorer_e
from benchmarks.longbenchv2.calculate_metrics import calculate_metrics as longbenchv2_scorer
from benchmarks.loogle.calculate_metrics import calculate_metrics as loogle_scorer
from benchmarks.needle_in_haystack.calculate_metrics import calculate_metrics as needle_in_haystack_scorer
from benchmarks.ruler.calculate_metrics import calculate_metrics as ruler_scorer
from benchmarks.zero_scrolls.calculate_metrics import calculate_metrics as zero_scrolls_scorer

@@ -41,6 +42,7 @@
"longbench": "Xnhyacinth/LongBench",
"longbench-e": "Xnhyacinth/LongBench",
"longbench-v2": "Xnhyacinth/LongBench-v2",
"needle_in_haystack": "alessiodevoto/paul_graham_essays",
}

SCORER_REGISTRY = {
@@ -51,6 +53,7 @@
"longbench": longbench_scorer,
"longbench-e": longbench_scorer_e,
"longbench-v2": longbenchv2_scorer,
"needle_in_haystack": needle_in_haystack_scorer,
}
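
A sketch of how these registries are consumed at run time (mirroring `_load_and_prepare_dataset` and the metrics step in `evaluate.py`; the glue code here is simplified):

```python
from datasets import load_dataset

name = "needle_in_haystack"
df = load_dataset(DATASET_REGISTRY[name], split="test").to_pandas()
# ... run the kv-press pipeline to fill df["predicted_answer"] ...
scores = SCORER_REGISTRY[name](df)
```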


2 changes: 1 addition & 1 deletion tests/integration/test_ruler.py
@@ -4,7 +4,7 @@
import datasets
import pytest
import torch
from transformers import DynamicCache, QuantoQuantizedCache, QuantizedCacheConfig
from transformers import DynamicCache, QuantizedCacheConfig, QuantoQuantizedCache
from transformers.utils import is_flash_attn_2_available, is_optimum_quanto_available

from tests.default_presses import default_presses