Merged
85 changes: 56 additions & 29 deletions examples/llm_eval/lm_eval_hf.py
@@ -42,15 +42,15 @@

import datasets
from lm_eval import utils
from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser
from packaging.version import Version

if not version("lm_eval").startswith("0.4.8"):
warnings.warn(
f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. "
"Later versions may have incompatible API changes."
)
if Version(version("lm_eval")) < Version("0.4.10"):
raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.")

from lm_eval._cli import HarnessCLI
from lm_eval.api.model import T
from lm_eval.models.huggingface import HFLM
from lm_eval.utils import setup_logging
from quantization_utils import quantize_model
from sparse_attention_utils import sparsify_model

@@ -160,9 +160,24 @@ def create_from_arg_string(
HFLM.create_from_arg_string = classmethod(create_from_arg_string)


def setup_parser_with_modelopt_args():
"""Extend the lm-eval argument parser with ModelOpt quantization and sparsity options."""
parser = setup_parser()
# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are
# moved out of the argparse namespace and into args.model_args so they reach
# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them).
_MODELOPT_ARG_KEYS = (
"quant_cfg",
"calib_batch_size",
"calib_size",
"auto_quantize_bits",
"auto_quantize_method",
"auto_quantize_score_size",
"auto_quantize_checkpoint",
"compress",
"sparse_cfg",
)


def _add_modelopt_args(parser):
"""Extend an lm-eval argument parser with ModelOpt quantization and sparsity options."""
parser.add_argument(
"--quant_cfg",
type=str,
Expand Down Expand Up @@ -221,33 +236,45 @@ def setup_parser_with_modelopt_args():
type=str,
help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
)
return parser


if __name__ == "__main__":
parser = setup_parser_with_modelopt_args()
args = parse_eval_args(parser)
model_args = utils.simple_parse_args_string(args.model_args)
def _inject_modelopt_args_into_model_args(args):
"""Move ModelOpt args from the argparse namespace into args.model_args.

args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt
keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't
reject them as unknown kwargs.
"""
model_args = dict(args.model_args) if args.model_args else {}

if args.trust_remote_code:
if getattr(args, "trust_remote_code", False):
# Propagate the user-provided --trust_remote_code flag (not hardcoded).
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
model_args["trust_remote_code"] = True
args.trust_remote_code = None

model_args.update(
{
"quant_cfg": args.quant_cfg,
"auto_quantize_bits": args.auto_quantize_bits,
"auto_quantize_method": args.auto_quantize_method,
"auto_quantize_score_size": args.auto_quantize_score_size,
"auto_quantize_checkpoint": args.auto_quantize_checkpoint,
"calib_batch_size": args.calib_batch_size,
"calib_size": args.calib_size,
"compress": args.compress,
"sparse_cfg": args.sparse_cfg,
}
)
for key in _MODELOPT_ARG_KEYS:
if hasattr(args, key):
model_args[key] = getattr(args, key)
delattr(args, key)

args.model_args = model_args

cli_evaluate(args)

if __name__ == "__main__":
setup_logging()
cli = HarnessCLI()
# The `run` subcommand owns the model/task arguments; extend that parser.
# `_subparsers` is private API; guard so a future lm-eval refactor surfaces a
# clear error instead of an opaque AttributeError.
try:
run_parser = cli._subparsers.choices["run"]
except (AttributeError, KeyError) as e:
raise RuntimeError(
"Cannot locate lm-eval's `run` subparser; the HarnessCLI internals may "
f"have changed. Installed lm-eval version: {version('lm_eval')}."
) from e
_add_modelopt_args(run_parser)
args = cli.parse_args()
_inject_modelopt_args_into_model_args(args)
cli.execute(args)
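
Below is an illustrative sketch (not part of the diff) of the namespace-to-model_args handoff: only the ModelOpt keys actually present on the namespace are copied over, and each is then deleted so lm-eval's config validation never sees it. The field values are hypothetical.

import argparse

args = argparse.Namespace(
    model_args={"pretrained": "gpt2"},
    trust_remote_code=False,
    quant_cfg="NVFP4_DEFAULT_CFG",  # hypothetical example value
    calib_size=512,
)
_inject_modelopt_args_into_model_args(args)
assert args.model_args["quant_cfg"] == "NVFP4_DEFAULT_CFG"
assert not hasattr(args, "quant_cfg")  # moved out of the namespace
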
2 changes: 1 addition & 1 deletion examples/llm_eval/requirements.txt
@@ -1,5 +1,5 @@
fire>=0.5.0
lm_eval[api,ifeval]==0.4.8
lm_eval[api,ifeval]>=0.4.10
peft>=0.5.0
rwkv>=0.7.3
torchvision
2 changes: 1 addition & 1 deletion examples/llm_sparsity/weight_sparsity/launch_finetune.sh
@@ -88,7 +88,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \
--save_total_limit 10 \
--learning_rate 2e-5 \
--weight_decay 0.1 \
--warmup_steps 0.0 \
--warmup_steps 0 \
--lr_scheduler_type cosine \
--logging_steps 1 \
--fsdp 'full_shard auto_wrap' \
1 change: 0 additions & 1 deletion examples/puzzletron/requirements.txt
@@ -1,4 +1,3 @@
lm-eval==0.4.8
math-verify
ray
# Likely works for transformers v5 also, but we need to test it
34 changes: 32 additions & 2 deletions examples/specdec_bench/specdec_bench/datasets/speed.py
@@ -737,10 +737,40 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data
}
table = table.replace_schema_metadata(new_meta or None)
dataset = HFDataset(table)
if self.num_samples is not None:
dataset = dataset.select(range(self.num_samples))
if self.num_samples is not None and self.num_samples < len(dataset):
dataset = self._stratified_select(dataset, self.num_samples)
return dataset

@staticmethod
def _stratified_select(dataset: "Dataset", n: int) -> "Dataset":
"""Select ``n`` samples uniformly across the ``category`` column.

Round-robin across categories until ``n`` rows are collected. The
resulting prefix is balanced; once a smaller category is exhausted
the remaining categories continue contributing, so exactly ``n``
rows are returned whenever ``n`` does not exceed the dataset size.
Falls back to ``range(n)`` when ``category`` is absent or there is
only one category. Indices come from ``range(category_size)`` (not
random) so behavior is deterministic.
"""
if "category" not in dataset.column_names:
return dataset.select(range(n))
cat_to_rows: dict[str, list[int]] = {}
for i, c in enumerate(dataset["category"]):
cat_to_rows.setdefault(c, []).append(i)
if len(cat_to_rows) <= 1:
return dataset.select(range(n))
cat_lists = list(cat_to_rows.values())
interleaved: list[int] = []
max_len = max(len(c) for c in cat_lists)
for i in range(max_len):
for c in cat_lists:
if i < len(c):
interleaved.append(c[i])
if len(interleaved) == n:
return dataset.select(interleaved)
return dataset.select(interleaved)

Comment on lines +745 to +773
⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Guard non-positive sample counts before interleaving.

When n <= 0, this method can still return non-empty results for multi-category datasets (because the round-robin loop appends before any len(interleaved) == n check can succeed). That makes num_samples=0 behave incorrectly.

Suggested fix
 @staticmethod
 def _stratified_select(dataset: "Dataset", n: int) -> "Dataset":
+    if n <= 0:
+        return dataset.select([])
+
     if "category" not in dataset.column_names:
         return dataset.select(range(n))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `examples/specdec_bench/specdec_bench/datasets/speed.py` around lines 745-773, add an early guard in _stratified_select to handle non-positive n: if n <= 0, return dataset.select(range(n)) before checking "category" presence or building cat_to_rows, so the interleaving code never runs for n <= 0 and the function returns an empty selection deterministically.

def _resolve_external_data(
self, dataset: "Dataset", speed_config: config_type | str
) -> "Dataset":
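
A minimal standalone sketch of the round-robin selection described in the `_stratified_select` docstring, including the n <= 0 guard proposed in the review above. `round_robin_indices` is a hypothetical helper for illustration, not code from this PR.

def round_robin_indices(categories: list[str], n: int) -> list[int]:
    """Return up to n row indices interleaved across category buckets."""
    if n <= 0:
        return []
    buckets: dict[str, list[int]] = {}
    for i, c in enumerate(categories):
        buckets.setdefault(c, []).append(i)
    if len(buckets) <= 1:
        return list(range(n))
    lists = list(buckets.values())
    out: list[int] = []
    for i in range(max(len(b) for b in lists)):
        for b in lists:
            if i < len(b):
                out.append(b[i])
                if len(out) == n:
                    return out
    return out

# Three categories, n=4: one row from each, then a second row from the first.
assert round_robin_indices(["a", "a", "b", "b", "c"], 4) == [0, 2, 4, 1]
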
8 changes: 8 additions & 0 deletions modelopt/onnx/export/nvfp4_exporter.py
@@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray:

Note: The first dimension of the array must be divisible by 2
as two FP4 values are packed into a single byte.

Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
compatibility shim. Do not rename or change the signature without updating that
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).
"""
array_f32_t = torch.from_numpy(array)
array_f32_t_shape = array_f32_t.shape
@@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq(
):
"""Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes.

Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq``
compatibility shim. Do not rename or change the signature without updating that
shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1).

Args:
graph: The ONNX graph containing the node to replace.
node: The node to be replaced.
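
A rough sketch of the byte packing the `_cast_fp4` docstring describes: two 4-bit codes share one byte, which is why the first dimension must be divisible by 2. This skips the FP32-to-FP4 rounding that `_cast_fp4` also performs, and the low/high nibble order is an assumption made for illustration.

import numpy as np

def pack_nibbles(codes: np.ndarray) -> np.ndarray:
    # codes: integer FP4 bit patterns in [0, 15]; first dim must be even.
    assert codes.shape[0] % 2 == 0
    lo = codes[0::2].astype(np.uint8) & 0x0F  # even rows -> low nibble (assumed)
    hi = codes[1::2].astype(np.uint8) & 0x0F  # odd rows -> high nibble (assumed)
    return lo | (hi << 4)

# Four 4-bit codes pack into two bytes.
assert list(pack_nibbles(np.array([1, 2, 3, 4]))) == [0x21, 0x43]
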
20 changes: 8 additions & 12 deletions modelopt/onnx/quantization/autotune/benchmark.py
@@ -31,7 +31,6 @@
import os
import re
import shutil
import subprocess # nosec B404
import tempfile
import time
from abc import ABC, abstractmethod
@@ -42,7 +41,7 @@
import torch

from modelopt.onnx.logging_config import logger
from modelopt.onnx.quantization.ort_utils import _check_for_trtexec
from modelopt.onnx.quantization.ort_utils import _check_for_trtexec, _run_trtexec

TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None
if TRT_AVAILABLE:
@@ -159,7 +158,6 @@ def __init__(
warmup_runs: int = 5,
timing_runs: int = 10,
plugin_libraries: list[str] | None = None,
trtexec_path: str = "trtexec",
trtexec_args: list[str] | None = None,
):
"""Initialize the trtexec benchmark.
Expand All @@ -169,14 +167,11 @@ def __init__(
warmup_runs: See :meth:`Benchmark.__init__`.
timing_runs: See :meth:`Benchmark.__init__`.
plugin_libraries: See :meth:`Benchmark.__init__`.
trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which
looks for the binary in PATH.
trtexec_args: Additional command-line arguments to pass to trtexec.
These are appended after the standard arguments.
Example: ['--fp16', '--workspace=4096', '--verbose']
"""
super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries)
self.trtexec_path = trtexec_path
self.trtexec_args = trtexec_args if trtexec_args is not None else []
self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_")
self.engine_path = os.path.join(self.temp_dir, "engine.trt")
@@ -186,7 +181,6 @@
self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms"

self._base_cmd = [
self.trtexec_path,
f"--avgRuns={self.timing_runs}",
f"--iterations={self.timing_runs}",
f"--warmUp={self.warmup_runs}",
@@ -268,13 +262,14 @@ def run(
self.logger.debug(f"Wrote model bytes to temporary file: {model_path}")

cmd = [*self._base_cmd, f"--onnx={model_path}"]
self.logger.debug(f"Running: {' '.join(cmd)}")
result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603
full_cmd = ["trtexec", *cmd]
self.logger.debug(f"Running: {' '.join(full_cmd)}")
result = _run_trtexec(cmd)
self._write_log_file(
log_file,
"\n".join(
[
f"Command: {' '.join(cmd)}",
f"Command: {' '.join(full_cmd)}",
f"Return code: {result.returncode}",
"=" * 80,
"STDOUT:",
@@ -301,8 +296,9 @@
self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms")
return latency
except FileNotFoundError:
self.logger.error(f"trtexec binary not found: {self.trtexec_path}")
self.logger.error("Please ensure TensorRT is installed and trtexec path is correct")
self.logger.error(
"'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
)
return float("inf")
except Exception as e:
self.logger.error(f"Benchmark failed: {e}")
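
For reference, a small self-contained check of how the `latency_pattern` regex above pulls the median latency out of a trtexec log line; the sample line is fabricated for the demonstration.

import re

latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms"
sample = "[I] Latency: min = 1.10 ms, max = 2.02 ms, mean = 1.31 ms, median = 1.25 ms"
match = re.search(latency_pattern, sample)
assert match is not None and float(match.group(1)) == 1.25
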
26 changes: 25 additions & 1 deletion modelopt/onnx/quantization/ort_utils.py
@@ -46,6 +46,30 @@ def _check_lib_in_ld_library_path(ld_library_path, lib_pattern):
return False, None


def _run_trtexec(
args: list[str] | None = None, timeout: float | None = None
) -> subprocess.CompletedProcess:
"""Run a 'trtexec' command via subprocess.

Args:
args: Arguments to pass to trtexec (without the 'trtexec' command itself).
timeout: Optional subprocess timeout in seconds.

Returns:
The completed subprocess result.

Raises:
FileNotFoundError: If the 'trtexec' binary is not found in PATH.
"""
cmd = ["trtexec", *(args or [])]
try:
return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # nosec B603
⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

Remove the inline Bandit suppression.

This subprocess call is already using an argv list, so the new # nosec B603 is just a policy violation on a security-sensitive path and should not be merged as-is.

Suggested fix
-        return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)  # nosec B603
+        return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)

As per coding guidelines, "Bandit security checks must pass without exceptions. # nosec comments are not allowed as a bypass for security checks."

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `modelopt/onnx/quantization/ort_utils.py` at line 66, remove the inline Bandit suppression by deleting the trailing "# nosec B603" on the subprocess.run call so the security scan can validate it. Keep the call as-is, using the argv list variable `cmd` with the existing parameters (capture_output=True, text=True, timeout=timeout), or switch to an explicitly stricter pattern if desired (e.g., adding check=True). Locate the return statement that invokes subprocess.run(cmd, ...) in ort_utils.py and remove only the "# nosec B603" comment.

except FileNotFoundError as e:
raise FileNotFoundError(
"'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH."
) from e


def _check_for_trtexec(min_version: str = "10.0") -> str:
"""Check if the `trtexec` CLI tool is available in PATH and is >= min_version.

Expand Down Expand Up @@ -89,7 +113,7 @@ def _parse_version_from_string(version_str: str) -> str | None:
)

try:
result = subprocess.run([trtexec_path], capture_output=True, text=True, timeout=5) # nosec B603
result = _run_trtexec(timeout=5)
banner_output = result.stdout + result.stderr
parsed_version = _parse_version_from_string(banner_output)

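
A hypothetical usage sketch of the new `_run_trtexec` helper: callers pass only the trtexec arguments and the helper prepends the binary name, which is why the per-instance `trtexec_path` could be dropped from the benchmark class. `--onnx` and `--fp16` are standard trtexec flags; the model path is made up.

from modelopt.onnx.quantization.ort_utils import _run_trtexec

result = _run_trtexec(["--onnx=model.onnx", "--fp16"], timeout=600)
if result.returncode != 0:
    print(result.stderr)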