diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py index 7d1f9f19935..51c0930e8f2 100755 --- a/examples/llm_eval/lm_eval_hf.py +++ b/examples/llm_eval/lm_eval_hf.py @@ -42,15 +42,15 @@ import datasets from lm_eval import utils -from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser +from packaging.version import Version -if not version("lm_eval").startswith("0.4.8"): - warnings.warn( - f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. " - "Later versions may have incompatible API changes." - ) +if Version(version("lm_eval")) < Version("0.4.10"): + raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.") + +from lm_eval._cli import HarnessCLI from lm_eval.api.model import T from lm_eval.models.huggingface import HFLM +from lm_eval.utils import setup_logging from quantization_utils import quantize_model from sparse_attention_utils import sparsify_model @@ -160,9 +160,24 @@ def create_from_arg_string( HFLM.create_from_arg_string = classmethod(create_from_arg_string) -def setup_parser_with_modelopt_args(): - """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options.""" - parser = setup_parser() +# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are +# moved out of the argparse namespace and into args.model_args so they reach +# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them). +_MODELOPT_ARG_KEYS = ( + "quant_cfg", + "calib_batch_size", + "calib_size", + "auto_quantize_bits", + "auto_quantize_method", + "auto_quantize_score_size", + "auto_quantize_checkpoint", + "compress", + "sparse_cfg", +) + + +def _add_modelopt_args(parser): + """Extend an lm-eval argument parser with ModelOpt quantization and sparsity options.""" parser.add_argument( "--quant_cfg", type=str, @@ -221,33 +236,45 @@ def setup_parser_with_modelopt_args(): type=str, help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)", ) - return parser -if __name__ == "__main__": - parser = setup_parser_with_modelopt_args() - args = parse_eval_args(parser) - model_args = utils.simple_parse_args_string(args.model_args) +def _inject_modelopt_args_into_model_args(args): + """Move ModelOpt args from the argparse namespace into args.model_args. + + args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt + keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't + reject them as unknown kwargs. + """ + model_args = dict(args.model_args) if args.model_args else {} - if args.trust_remote_code: + if getattr(args, "trust_remote_code", False): + # Propagate the user-provided --trust_remote_code flag (not hardcoded). 
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True model_args["trust_remote_code"] = True args.trust_remote_code = None - model_args.update( - { - "quant_cfg": args.quant_cfg, - "auto_quantize_bits": args.auto_quantize_bits, - "auto_quantize_method": args.auto_quantize_method, - "auto_quantize_score_size": args.auto_quantize_score_size, - "auto_quantize_checkpoint": args.auto_quantize_checkpoint, - "calib_batch_size": args.calib_batch_size, - "calib_size": args.calib_size, - "compress": args.compress, - "sparse_cfg": args.sparse_cfg, - } - ) + for key in _MODELOPT_ARG_KEYS: + if hasattr(args, key): + model_args[key] = getattr(args, key) + delattr(args, key) args.model_args = model_args - cli_evaluate(args) + +if __name__ == "__main__": + setup_logging() + cli = HarnessCLI() + # The `run` subcommand owns the model/task arguments; extend that parser. + # `_subparsers` is private API; guard so a future lm-eval refactor surfaces a + # clear error instead of an opaque AttributeError. + try: + run_parser = cli._subparsers.choices["run"] + except (AttributeError, KeyError) as e: + raise RuntimeError( + "Cannot locate lm-eval's `run` subparser; the HarnessCLI internals may " + f"have changed. Installed lm-eval version: {version('lm_eval')}." + ) from e + _add_modelopt_args(run_parser) + args = cli.parse_args() + _inject_modelopt_args_into_model_args(args) + cli.execute(args) diff --git a/examples/llm_eval/requirements.txt b/examples/llm_eval/requirements.txt index df47ac76c6b..2762c838c6a 100644 --- a/examples/llm_eval/requirements.txt +++ b/examples/llm_eval/requirements.txt @@ -1,5 +1,5 @@ fire>=0.5.0 -lm_eval[api,ifeval]==0.4.8 +lm_eval[api,ifeval]>=0.4.10 peft>=0.5.0 rwkv>=0.7.3 torchvision diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh index 7f8e71f255c..851e3a5aeed 100755 --- a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh +++ b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh @@ -88,7 +88,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \ --save_total_limit 10 \ --learning_rate 2e-5 \ --weight_decay 0.1 \ - --warmup_steps 0.0 \ + --warmup_steps 0 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --fsdp 'full_shard auto_wrap' \ diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt index 317a38f5eab..6f4c94e08a8 100644 --- a/examples/puzzletron/requirements.txt +++ b/examples/puzzletron/requirements.txt @@ -1,4 +1,3 @@ -lm-eval==0.4.8 math-verify ray # Likely works for transformers v5 also, but we need to test it diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index fe544bb353e..3552d71a1ad 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -737,10 +737,40 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data } table = table.replace_schema_metadata(new_meta or None) dataset = HFDataset(table) - if self.num_samples is not None: - dataset = dataset.select(range(self.num_samples)) + if self.num_samples is not None and self.num_samples < len(dataset): + dataset = self._stratified_select(dataset, self.num_samples) return dataset + @staticmethod + def _stratified_select(dataset: "Dataset", n: int) -> "Dataset": + """Select ``n`` samples uniformly across the ``category`` column. + + Round-robin across categories until ``n`` rows are collected. 
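+        (For example, with rows 0-2 in category ``A``, row 3 in ``B`` and rows
+        4-5 in ``C``, ``n=4`` selects row indices ``[0, 3, 4, 1]``.)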
The + resulting prefix is balanced; once a smaller category is exhausted + the remaining categories continue contributing, so exactly ``n`` + rows are returned whenever ``n`` does not exceed the dataset size. + Falls back to ``range(n)`` when ``category`` is absent or there is + only one category. Indices come from ``range(category_size)`` (not + random) so behavior is deterministic. + """ + if "category" not in dataset.column_names: + return dataset.select(range(n)) + cat_to_rows: dict[str, list[int]] = {} + for i, c in enumerate(dataset["category"]): + cat_to_rows.setdefault(c, []).append(i) + if len(cat_to_rows) <= 1: + return dataset.select(range(n)) + cat_lists = list(cat_to_rows.values()) + interleaved: list[int] = [] + max_len = max(len(c) for c in cat_lists) + for i in range(max_len): + for c in cat_lists: + if i < len(c): + interleaved.append(c[i]) + if len(interleaved) == n: + return dataset.select(interleaved) + return dataset.select(interleaved) + def _resolve_external_data( self, dataset: "Dataset", speed_config: config_type | str ) -> "Dataset": diff --git a/modelopt/onnx/export/nvfp4_exporter.py b/modelopt/onnx/export/nvfp4_exporter.py index a80a9845fb5..e8bdfa2db1f 100644 --- a/modelopt/onnx/export/nvfp4_exporter.py +++ b/modelopt/onnx/export/nvfp4_exporter.py @@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray: Note: The first dimension of the array must be divisible by 2 as two FP4 values are packed into a single byte. + + Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq`` + compatibility shim. Do not rename or change the signature without updating that + shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1). """ array_f32_t = torch.from_numpy(array) array_f32_t_shape = array_f32_t.shape @@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq( ): """Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes. + Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq`` + compatibility shim. Do not rename or change the signature without updating that + shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1). + Args: graph: The ONNX graph containing the node to replace. node: The node to be replaced. diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index df6dbc877d0..b87478a1572 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -31,7 +31,6 @@ import os import re import shutil -import subprocess # nosec B404 import tempfile import time from abc import ABC, abstractmethod @@ -42,7 +41,7 @@ import torch from modelopt.onnx.logging_config import logger -from modelopt.onnx.quantization.ort_utils import _check_for_trtexec +from modelopt.onnx.quantization.ort_utils import _check_for_trtexec, _run_trtexec TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None if TRT_AVAILABLE: @@ -159,7 +158,6 @@ def __init__( warmup_runs: int = 5, timing_runs: int = 10, plugin_libraries: list[str] | None = None, - trtexec_path: str = "trtexec", trtexec_args: list[str] | None = None, ): """Initialize the trtexec benchmark. @@ -169,14 +167,11 @@ def __init__( warmup_runs: See :meth:`Benchmark.__init__`. timing_runs: See :meth:`Benchmark.__init__`. plugin_libraries: See :meth:`Benchmark.__init__`. - trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which - looks for the binary in PATH. 
trtexec_args: Additional command-line arguments to pass to trtexec. These are appended after the standard arguments. Example: ['--fp16', '--workspace=4096', '--verbose'] """ super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries) - self.trtexec_path = trtexec_path self.trtexec_args = trtexec_args if trtexec_args is not None else [] self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_") self.engine_path = os.path.join(self.temp_dir, "engine.trt") @@ -186,7 +181,6 @@ def __init__( self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms" self._base_cmd = [ - self.trtexec_path, f"--avgRuns={self.timing_runs}", f"--iterations={self.timing_runs}", f"--warmUp={self.warmup_runs}", @@ -268,13 +262,14 @@ def run( self.logger.debug(f"Wrote model bytes to temporary file: {model_path}") cmd = [*self._base_cmd, f"--onnx={model_path}"] - self.logger.debug(f"Running: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603 + full_cmd = ["trtexec", *cmd] + self.logger.debug(f"Running: {' '.join(full_cmd)}") + result = _run_trtexec(cmd) self._write_log_file( log_file, "\n".join( [ - f"Command: {' '.join(cmd)}", + f"Command: {' '.join(full_cmd)}", f"Return code: {result.returncode}", "=" * 80, "STDOUT:", @@ -301,8 +296,9 @@ def run( self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms") return latency except FileNotFoundError: - self.logger.error(f"trtexec binary not found: {self.trtexec_path}") - self.logger.error("Please ensure TensorRT is installed and trtexec path is correct") + self.logger.error( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) return float("inf") except Exception as e: self.logger.error(f"Benchmark failed: {e}") diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 2c5a0b7d2da..f7799c634f0 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -46,6 +46,30 @@ def _check_lib_in_ld_library_path(ld_library_path, lib_pattern): return False, None +def _run_trtexec( + args: list[str] | None = None, timeout: float | None = None +) -> subprocess.CompletedProcess: + """Run a 'trtexec' command via subprocess. + + Args: + args: Arguments to pass to trtexec (without the 'trtexec' command itself). + timeout: Optional subprocess timeout in seconds. + + Returns: + The completed subprocess result. + + Raises: + FileNotFoundError: If the 'trtexec' binary is not found in PATH. + """ + cmd = ["trtexec", *(args or [])] + try: + return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # nosec B603 + except FileNotFoundError as e: + raise FileNotFoundError( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) from e + + def _check_for_trtexec(min_version: str = "10.0") -> str: """Check if the `trtexec` CLI tool is available in PATH and is >= min_version. 
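A minimal usage sketch of the new wrapper (``probe_trtexec_banner`` is a hypothetical helper; the timeout and stdout/stderr concatenation mirror the ``_check_for_trtexec`` call updated in the next hunk):

    from modelopt.onnx.quantization.ort_utils import _run_trtexec

    def probe_trtexec_banner() -> str | None:
        # A bare trtexec invocation prints its version banner.
        try:
            result = _run_trtexec(timeout=5)
        except FileNotFoundError:
            return None
        return result.stdout + result.stderr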
@@ -89,7 +113,7 @@ def _parse_version_from_string(version_str: str) -> str | None: ) try: - result = subprocess.run([trtexec_path], capture_output=True, text=True, timeout=5) # nosec B603 + result = _run_trtexec(timeout=5) banner_output = result.stdout + result.stderr parsed_version = _parse_version_from_string(banner_output) diff --git a/modelopt/onnx/quantization/qdq_utils.py b/modelopt/onnx/quantization/qdq_utils.py index 265bcf36b2a..28e6f8ada8b 100644 --- a/modelopt/onnx/quantization/qdq_utils.py +++ b/modelopt/onnx/quantization/qdq_utils.py @@ -15,6 +15,7 @@ """Various utils to support inserting Q/DQ nodes.""" +import warnings from collections.abc import Sequence from typing import Any @@ -31,7 +32,16 @@ get_tensor_producer_nodes, remove_redundant_cast_nodes, ) -from modelopt.onnx.quantization.quant_utils import get_num_bits +from modelopt.onnx.quantization.quant_utils import ( + compute_e8m0, + get_amax, + get_num_bits, + get_weights_scaling_factor, + get_weights_scaling_factor_2, + pack_weights_to_int4, + quantize, +) +from modelopt.onnx.utils import get_attribute, has_attribute, read_f16_tensor_as_fp32 QUANTIZE_NODE_NAME = "QuantizeLinear" DEQUANTIZE_NODE_NAME = "DequantizeLinear" @@ -1224,3 +1234,384 @@ def get_quantized_tensors(onnx_model: onnx.ModelProto) -> set[str]: logger.debug(f"Found {len(quantized_tensors)} dequantized tensors in ONNX model") return quantized_tensors + + +_LEGACY_LLM_EXPORT_DEPRECATION_MSG = ( + "{name} in modelopt.onnx.quantization.qdq_utils is deprecated and will be " + "removed in a future release. Use modelopt.onnx.export " + "(INT4QuantExporter / NVFP4QuantExporter / MXFP8QuantExporter), or migrate to " + "TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM)." +) + + +def quantize_weights_to_int4( + onnx_model: onnx.ModelProto, +) -> onnx.ModelProto: + """Deprecated: convert ONNX model weights to INT4 with graph optimization. + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.int4_exporter.INT4QuantExporter`. 
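+
+    Minimal call sketch (``model`` is assumed to be an ``onnx.ModelProto`` whose
+    weights carry DequantizeLinear nodes; the warning capture mirrors the shim
+    tests added in this change)::
+
+        import warnings
+
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            model = quantize_weights_to_int4(model)
+        assert any(issubclass(w.category, DeprecationWarning) for w in caught)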
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="quantize_weights_to_int4"), + DeprecationWarning, + stacklevel=2, + ) + + graph = onnx_model.graph + initializer_map = {initializer.name: initializer for initializer in graph.initializer} + value_info_map = {value_info.name: value_info for value_info in graph.value_info} + weight_dq_nodes = [node for node in graph.node if node.op_type == "DequantizeLinear"] + tensor_producer_map = get_tensor_producer_nodes(graph) + + nodes_to_remove = [] + for node in weight_dq_nodes: + weight_name = node.input[0] + scale_name = node.input[1] + logger.debug(f"Processing INT4 conversion for weight {weight_name}") + weight = numpy_helper.to_array(initializer_map[weight_name]) + if scale_name in initializer_map: + scale = numpy_helper.to_array(initializer_map[scale_name]) + else: + scale_constant_node = tensor_producer_map[scale_name] + for attr in scale_constant_node.attribute: + if attr.name == "value": + tensor = attr.t + scale = numpy_helper.to_array(tensor) + + weight = weight / scale + block_size = weight.shape[-1] + + # Convert DequantizeLinear -> Reshape -> Transpose -> MatMul/Gemm to DequantizeLinear -> Matmul/Gemm + dq_child_nodes = [n for n in graph.node if node.output[0] in n.input] + reshape_node = dq_child_nodes[0] + nodes_to_remove.append(reshape_node.name) + assert reshape_node.op_type == "Reshape", f"Expected Reshape node for {node.name}" + reshape_node_output = reshape_node.output[0] + + # Remove constant node from reshape node + shape_constant_name = next(input for input in reshape_node.input if "Constant" in input) + nodes_to_remove.append(tensor_producer_map[shape_constant_name].name) + + # Get the shape of the output of the reshape node + reshape_output_value_info = value_info_map.get(reshape_node_output) + if reshape_output_value_info is not None: + weight_shape = [ + dim.dim_value for dim in reshape_output_value_info.type.tensor_type.shape.dim + ] + else: + raise ValueError(f"Unable to determine shape of weight tensor {weight_name}") + + # Reshape weights and scales + weight = weight.reshape(weight_shape) + assert weight_shape[-1] % block_size == 0, ( + f"Block size {block_size} is not divisible by {weight_shape[-1]}" + ) + scale_shape = [*weight_shape[:-1], weight_shape[-1] // block_size] + scale = scale.reshape(scale_shape) + reshape_child_nodes = [n for n in graph.node if reshape_node.output[0] in n.input] + assert len(reshape_child_nodes) == 1, f"Expected exactly one child node for {node.name}" + + # Check if there's an optional Cast node between Reshape and Transpose/MatMul/Gemm + next_node = reshape_child_nodes[0] + if next_node.op_type == "Cast": + # Remove unnecessary Cast node + cast_node = next_node + nodes_to_remove.append(cast_node.name) + cast_child_nodes = [n for n in graph.node if cast_node.output[0] in n.input] + next_node = cast_child_nodes[0] + + # Transpose weights and scales if present + if next_node.op_type == "Transpose": + transpose_node = next_node + nodes_to_remove.append(transpose_node.name) + assert transpose_node.op_type == "Transpose", f"Expected Transpose node for {node.name}" + perm = None + for attr in transpose_node.attribute: + if attr.name == "perm": + perm = list(attr.ints) + assert perm is not None, f"Permutation not found for {node.name}" + weight = weight.transpose(perm) + scale = scale.transpose(perm) + transpose_child_nodes = [n for n in graph.node if transpose_node.output[0] in n.input] + assert len(transpose_child_nodes) == 1, ( + f"Expected exactly one matmul node for 
{node.name}" + ) + matmul_node = transpose_child_nodes[0] + else: + matmul_node = next_node + assert matmul_node.op_type in ["MatMul", "Gemm"], ( + f"Expected MatMul or Gemm node for {node.name}" + ) + matmul_node.input[1] = node.output[0] + + if scale_name not in initializer_map: + # Remove scale producer if it's a Constant node + scale_name = node.input[1] + scale_producer = tensor_producer_map[scale_name] + if scale_producer.op_type == "Constant": + graph.node.remove(scale_producer) + + # Create a new scale tensor + scale_name = scale_name.replace("Constant_output_0", "scale") + scale_tensor = onnx.numpy_helper.from_array(scale, scale_name) + graph.initializer.append(scale_tensor) + node.input[1] = scale_name + else: + scale_tensor = onnx.numpy_helper.from_array(scale, scale_name) + initializer_map[scale_name].CopyFrom(scale_tensor) + + # Convert weights to INT4 precision + weight_shape = weight.shape + weights_int4_np = pack_weights_to_int4(weight) + weights_int4_onnx = onnx.numpy_helper.from_array(weights_int4_np, weight_name) + weights_int4_onnx.data_type = onnx.TensorProto.INT4 + weights_int4_onnx.dims[0] = weight_shape[0] + initializer_map[weight_name].CopyFrom(weights_int4_onnx) + logger.debug(f"Converted {weight_name} to INT4 precision") + + def is_pre_quant_scale_node(node: onnx.NodeProto) -> bool: + has_pqs_input = any(input for input in node.input if "_pre_quant_scale" in input) + return node.op_type == "Mul" and has_pqs_input + + # Remove unnecessary Cast after Pre-quant scale + for node in graph.node: + if is_pre_quant_scale_node(node): + pqs_child_nodes = [n for n in graph.node if node.output[0] in n.input] + assert len(pqs_child_nodes) == 1, f"Expected exactly one child node for {node.name}" + cast_node = pqs_child_nodes[0] + assert cast_node.op_type == "Cast", f"Expected Cast node for {node.name}" + node.output.clear() + node.output.extend(cast_node.output) + nodes_to_remove.append(cast_node.name) + + # Remove transpose and reshape nodes + new_nodes = [node for node in graph.node if node.name not in nodes_to_remove] + del graph.node[:] + graph.node.extend(new_nodes) + + # Cast bias to float16 + for node in graph.node: + if node.op_type == "Add" and "proj/Add" in node.name: + cast_initializer_to_dtype(node, "Half", initializer_map) + + # Cast pre quant scales of o_proj and down_proj to float16 + for node in graph.node: + if node.op_type == "Mul" and ( + any( + x in node.name + for x in ("o_proj/input_quantizer/Mul", "down_proj/input_quantizer/Mul") + ) + ): + cast_initializer_to_dtype(node, "Half", initializer_map) + + return onnx_model + + +def quantize_weights_to_mxfp8( + onnx_model: onnx.ModelProto, +) -> onnx.ModelProto: + """Deprecated: convert weights to MXFP8 (FP8 with e8m0 per-block scales). + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.mxfp8_exporter.MXFP8QuantExporter`. 
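+
+    Per-block encoding performed in the body below (``e8_m0_bias`` is 127; this
+    is a sketch of the existing steps, not an additional transformation)::
+
+        se8m0      = compute_e8m0(amax, weight.shape, quant_axis, block_size)
+        fp8_weight = _cast_fp8(weight / np.exp2(se8m0 - 127))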
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="quantize_weights_to_mxfp8"), + DeprecationWarning, + stacklevel=2, + ) + + logger.info("Converting weights to MXFP8 precision") + graph = onnx_model.graph + initializer_map = {initializer.name: initializer for initializer in graph.initializer} + tensor_producer_map = get_tensor_producer_nodes(graph) + e8_m0_bias = 127 + weight_dq_nodes = [ + node + for node in graph.node + if node.op_type == "TRT_MXFP8DequantizeLinear" + and any(".weight" in input for input in node.input) + ] + gelu_nodes = [node for node in graph.node if node.op_type == "Gelu"] + logger.debug(f"Found {len(weight_dq_nodes)} weight DQ nodes and {len(gelu_nodes)} GELU nodes") + + for node in weight_dq_nodes: + # Get weights and node attributes + weight_name = node.input[0] + logger.debug(f"Processing MXFP8 conversion for weight {weight_name}") + weight = numpy_helper.to_array(initializer_map[weight_name]) + if has_attribute(node, "axis"): + quant_axis = int(get_attribute(node, "axis")) + else: + quant_axis = -1 + logger.warning( + "axis attribute not found for MXFP8DequantizeLinear node. Setting axis to -1" + ) + + if has_attribute(node, "block_size"): + block_size = int(get_attribute(node, "block_size")) + else: + block_size = 32 + logger.warning( + "block_size attribute not found for MXFP8DequantizeLinear node. Setting block_size to 32" + ) + + # Compute and save scales as uint8 + amax = get_amax(weight, quant_axis, block_size) + se8m0_fp32 = compute_e8m0(amax, weight.shape, quant_axis, block_size) + se8m0 = se8m0_fp32.astype(np.uint8) + + # Remove scale producer if it's a Constant node + scale_name = node.input[1] + scale_producer = tensor_producer_map[scale_name] + if scale_producer.op_type == "Constant": + graph.node.remove(scale_producer) + + # Create a new scale tensor + scale_name = scale_name.replace("Constant_output_0", "scale") + scale_tensor = onnx.numpy_helper.from_array(se8m0, scale_name) + graph.initializer.append(scale_tensor) + node.input[1] = scale_name + + # Convert weights to FP8 + # Expand block array so that it can be broadcasted with weight + se8m0_fp32 = np.repeat(se8m0_fp32, block_size, axis=quant_axis) + scaled_weight = weight / np.exp2(se8m0_fp32 - e8_m0_bias) + weights_e4m3 = onnx.helper.make_tensor( + name=weight_name, + data_type=onnx_dtype_map["Float8"], + dims=[*scaled_weight.shape], + vals=_cast_fp8(scaled_weight).tobytes(), + raw=True, + ) + initializer_map[weight_name].CopyFrom(weights_e4m3) + logger.debug(f"Converted {weight_name} to MXFP8") + + # set output type of DQ to FP16 + for node in graph.node: + if node.op_type in ["TRT_MXFP8DequantizeLinear"]: + for attr in node.attribute: + if attr.name == "output_dtype": + attr.i = onnx_dtype_map["Half"] + + # Currently only tanh approximation is supported for Gelu + for node in gelu_nodes: + for attr in node.attribute: + if attr.name == "approximate": + attr.s = b"tanh" + logger.debug(f"Updated GELU node {node.name} to use tanh approximation") + + return onnx_model + + +def fp4qdq_to_2dq(onnx_model: onnx.ModelProto, verbose: bool = False) -> onnx.ModelProto: + """Deprecated: convert FP32/FP16 weights of TRT_FP4QDQ nodes to FP4 + 2 DQ subgraph. + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.nvfp4_exporter.NVFP4QuantExporter`. 
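+
+    Approximate resulting weight subgraph (initializer names follow the suffix
+    pattern exercised by the shim tests in this change, e.g. ``linear.weight``
+    becomes ``linear.weight_f4`` / ``linear.weight_f8_scale`` /
+    ``linear.weight_f8_scale_f32_scale``)::
+
+        weight_f8_scale (FP8, per block) --DQ (scale: weight_f8_scale_f32_scale)--> block scale
+        weight_f4 (packed FP4)           --DQ (scale: block scale)--> MatMul / Gemm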
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="fp4qdq_to_2dq"), + DeprecationWarning, + stacklevel=2, + ) + + # Lazy import to avoid a circular import: nvfp4_exporter imports from this module. + from modelopt.onnx.export.nvfp4_exporter import _cast_fp4, _replace_fp4qdq_with_2dq + + logger.info("Converting model with FP4QDQ nodes to 2DQ only model") + graph = onnx_model.graph + initializers = graph.initializer + initializers_to_delete = [] + tensor_consumers = get_tensor_consumer_nodes(graph) + initializer_indices = { + initializer.name: idx for idx, initializer in enumerate(graph.initializer) + } + value_info_map = {vi.name: vi for vi in graph.value_info} + graph_inputs = {inp.name for inp in graph.input} + + def _cast_input_dtypes(node: onnx.NodeProto, precision_dtype: str): + # Change the input types to match weight precision (precision_dtype) + if node.op_type == "Transpose": + maybe_matmul = tensor_consumers[node.output[0]][0] + assert maybe_matmul.op_type == "MatMul" + node = maybe_matmul + + # Create Cast nodes for each input of the target node except bias + for i, input_name in enumerate(node.input[:2]): + cast_output_name = input_name + "_f16" + + cast_node = onnx.helper.make_node( + "Cast", + inputs=[input_name], + outputs=[cast_output_name], + to=onnx_dtype_map[precision_dtype], + ) + + graph.node.extend([cast_node]) + node.input[i] = cast_output_name + + def _get_precision_dtype() -> str: + precision_dtype = "Half" + for initializer in graph.initializer: + if initializer.data_type == onnx.TensorProto.BFLOAT16: + precision_dtype = "BFloat16" + break + return precision_dtype + + if verbose: + logger.info("Post-processing TRT_FP4QDQ nodes for TRT deployment") + precision_dtype = _get_precision_dtype() + logger.debug(f"Using precision dtype: {precision_dtype}") + fp4_qdq_nodes = [node for node in graph.node if node.op_type == "TRT_FP4QDQ"] + logger.debug(f"Found {len(fp4_qdq_nodes)} FP4QDQ nodes to convert") + + for node in fp4_qdq_nodes: + idx1 = initializer_indices.get(node.input[0], None) + assert idx1 is not None, f"Initializer for weight '{node.input[0]}' not found." + block_size_attr = next((attr for attr in node.attribute if attr.name == "block_size"), None) + assert block_size_attr is not None, f"block_size attribute not found for {node.name}" + block_size = block_size_attr.i + initializers_to_delete.append(initializers[idx1].name) + logger.debug( + f"Processing FP4QDQ node for weight {node.input[0]} with block size {block_size}" + ) + + tensor = initializers[idx1] + w32 = read_f16_tensor_as_fp32(tensor) + sw_f32_per_tensor = get_weights_scaling_factor_2(w32) + sw_f32_per_block = get_weights_scaling_factor(w32, block_size, sw_f32_per_tensor) + w_f32 = quantize(w32, block_size, sw_f32_per_block, sw_f32_per_tensor) + + # Real quantize the tensors + w_f4 = _cast_fp4(w_f32) + sw_f8_per_block = _cast_fp8(sw_f32_per_block) + + _replace_fp4qdq_with_2dq( + graph, + node, + initializer_indices, + value_info_map, + graph_inputs, + w_f4, + sw_f32_per_tensor, + sw_f8_per_block, + block_size, + ) + + # We need to change the bias etc. 
type + next_node = tensor_consumers[node.output[0]][0] + _cast_input_dtypes(next_node, precision_dtype) + + if verbose: + logger.debug(f"Replaced {node.name} with 2 DQ nodes") + + new_initializers = [ + init for init in graph.initializer if init.name not in initializers_to_delete + ] + graph.ClearField("initializer") + graph.initializer.extend(new_initializers) + logger.info(f"Removed {len(initializers_to_delete)} initializers") + + return onnx_model diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/constants.py b/modelopt/torch/_deploy/_runtime/tensorrt/constants.py index c4f387482e9..d9ace1645a3 100644 --- a/modelopt/torch/_deploy/_runtime/tensorrt/constants.py +++ b/modelopt/torch/_deploy/_runtime/tensorrt/constants.py @@ -32,10 +32,6 @@ ONE_GIBI_IN_BYTES = 1 << 30 # TensorRT conversion tool names -TRTEXEC = "trtexec" - -# trtexec path within docker -TRTEXEC_PATH = "trtexec" DEFAULT_ARTIFACT_DIR = "modelopt_build/trt_artifacts" # Default conversion params diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py index 055a1f26b27..bb8bbd292b8 100644 --- a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py +++ b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py @@ -15,10 +15,9 @@ import logging import shutil -import subprocess # nosec import sys from pathlib import Path -from tempfile import NamedTemporaryFile, TemporaryDirectory, gettempdir +from tempfile import TemporaryDirectory, gettempdir from ..._runtime.common import read_bytes, timeit, write_bytes, write_string from ..._runtime.tensorrt.layerwise_profiling import process_layerwise_result @@ -28,7 +27,6 @@ DEFAULT_NUM_INFERENCE_PER_RUN, SHA_256_HASH_LENGTH, TRT_MODE_FLAGS, - TRTEXEC_PATH, WARMUP_TIME_MS, TRTMode, ) @@ -41,31 +39,50 @@ ) -# TODO: Get rid of this function or get approval for `# nosec` usage if we want to include this -# as a non-compiled python file in the release. -def _run_command(cmd: list[str], cwd: Path | None = None) -> tuple[int, bytes]: - """Util function to execute a command. +def _run_trtexec_with_logging(args: list[str], cwd: Path | None = None) -> tuple[int, bytes]: + """Run a 'trtexec' command via subprocess, logging the cmd and any failure output. - This util will not direct stdout and stderr to console if the cmd succeeds. + The 'trtexec' binary is hardcoded as the executable; only its arguments may be supplied + by the caller. This restricts the function to trtexec invocations. + + Output handling: stdout and stderr are merged and captured in memory. + On failure (non-zero returncode) or timeout, the captured output is logged at ERROR level; + on success, this function emits nothing to the console. Args: - cmd: the command line list - cwd: current working directory + args: Arguments to pass to trtexec (without the 'trtexec' command itself). + cwd: Optional working directory for the subprocess. Returns: - return code: 0 means successful, otherwise means failed - log_string: the stdout and stderr output as a string + A tuple of (returncode, output) where output is the combined stdout/stderr bytes. + Raises: + FileNotFoundError: If the 'trtexec' binary is not found in PATH. + subprocess.TimeoutExpired: If trtexec does not finish within 60 minutes. + The captured output is logged before re-raising. 
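+
+    Example (the flags are illustrative; in this module the argument list is
+    assembled by ``_build_command``)::
+
+        ret_code, out = _run_trtexec_with_logging(["--onnx=model.onnx", "--fp16"])
+        if ret_code != 0:
+            raise RuntimeError(out.decode(errors="ignore"))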
""" + import subprocess # nosec + + cmd = ["trtexec", *args] logging.info(" ".join(cmd)) - with NamedTemporaryFile("w+b") as log: - p = subprocess.Popen(cmd, stdout=log, stderr=log, cwd=str(cwd) if cwd else None) # nosec - p.wait() - log.seek(0) - output = log.read() - if p.returncode != 0: - logging.error(output.decode(errors="ignore")) - return p.returncode, output + try: + result = subprocess.run( # nosec B603 - cmd[0] is hardcoded "trtexec" + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=str(cwd) if cwd else None, + timeout=3600, + ) + except FileNotFoundError as e: + raise FileNotFoundError( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) from e + except subprocess.TimeoutExpired as e: + logging.error((e.stdout or b"").decode(errors="ignore")) + raise + if result.returncode != 0: + logging.error(result.stdout.decode(errors="ignore")) + return result.returncode, result.stdout def _get_profiling_params(profiling_runs: int) -> list[str]: @@ -181,7 +198,7 @@ def _build_command( calib_cache_path: Path | None = None, timing_cache_path: Path | None = None, ) -> list[str]: - cmd = [TRTEXEC_PATH, f"--onnx={onnx_path}"] + cmd = [f"--onnx={onnx_path}"] cmd.extend(TRT_MODE_FLAGS[trt_mode]) if trt_mode == TRTMode.INT8 and calib_cache and calib_cache_path: @@ -235,7 +252,7 @@ def _setup_files_and_paths( cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path) try: - ret_code, out = _run_command(cmd) + ret_code, out = _run_trtexec_with_logging(cmd) if ret_code != 0: return None, out @@ -284,7 +301,7 @@ def profile_engine( """ def _build_command(engine_path: Path, profile_path: Path, layer_info_path: Path) -> list[str]: - cmd = [TRTEXEC_PATH, f"--loadEngine={engine_path}"] + cmd = [f"--loadEngine={engine_path}"] cmd += _get_profiling_params(profiling_runs) if enable_layerwise_profiling: @@ -320,7 +337,7 @@ def _setup_files_and_paths(tmp_dir_path: Path, engine_hash: str) -> tuple[Path, cmd = _build_command(engine_path, profile_path, layer_info_path) try: - ret_code, out = _run_command(cmd) + ret_code, out = _run_trtexec_with_logging(cmd) if ret_code != 0: return None, out diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 952ed1e39c1..8981d614843 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -62,6 +62,29 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: for idx in range(n): expert = nn.Module() + # If the gate_up source quantizer was never calibrated (rare expert + # that received no calibration tokens), derive its amax once from the + # FUSED tensor so gate and up share the same weight_scale_2 below. + # Why: vLLM fuses W1 (gate) and W3 (up) at load time and asserts a + # single per-tensor scale across the fusion. The per-projection + # fallback further down would otherwise compute amax independently from + # each half — gate's max and up's max generally differ — producing + # mismatched weight_scale_2 and garbled MoE output at inference. + gate_up_q = module.gate_up_proj_weight_quantizers[idx] + if getattr(gate_up_q, "is_enabled", False) and ( + not hasattr(gate_up_q, "_amax") + or gate_up_q._amax is None + or torch.all(gate_up_q._amax == 0) + ): + gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32) + warnings.warn( + f"Expert {idx} gate_up_proj weight quantizer was not calibrated " + f"(amax missing or zero). 
Using fused-tensor amax as fallback " + f"(shared by gate and up so weight_scale_2 stays consistent). " + f"Consider increasing calibration size to activate all experts.", + stacklevel=2, + ) + projections = [ ("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True), ("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True), diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ed6ed2fcf21..19deea08b45 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -59,6 +59,7 @@ ) from modelopt.torch.quantization.qtensor import MXFP8QTensor, NVFP4QTensor from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names +from modelopt.torch.utils.dataset_utils import _disable_use_cache try: from modelopt.torch.sparsity.attention_sparsity.conversion import export_sparse_attention_config @@ -217,11 +218,14 @@ def _output_hook(module, input, output): if not handles: return input_to_linear, output_to_layernorm - # Run dummy forward pass to collect modules sharing same input + # Run dummy forward pass to collect modules sharing same input. + # `_disable_use_cache` keeps the probe forward working on configs that don't + # set `use_cache` (e.g., stepfun-ai/Step-3.5-Flash's Step3p5Config). try: with ( torch.no_grad(), set_quantizer_by_cfg_context(model, [{"quantizer_name": "*", "enable": False}]), + _disable_use_cache(model), ): dummy_forward_fn() finally: diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 9cc729723e7..3370309156d 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -20,6 +20,8 @@ from contextlib import contextmanager import torch +import transformers +from packaging.version import Version from transformers import PreTrainedModel, Trainer, TrainerCallback from transformers import modeling_utils as tf_modeling_utils @@ -130,13 +132,18 @@ def _save_pretrained_with_checks(self, save_directory, *args, **kwargs): # [Fix for huggingface bug] deepspeed zero3 training backend only loads params into the model from # state_dict, but not buffers. So lets explicitly load the buffers into the model from state_dict. +# The `load_config` parameter was added to `_load_state_dict_into_zero3_model` in transformers 5.0. 
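+# Our replacement below keeps `load_config=None` in its signature so the same
+# patch imports and applies cleanly on both sides of that boundary; the argument
+# is only forwarded when the installed transformers version accepts it.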
+_TRANSFORMERS_GE_5_0 = Version(transformers.__version__) >= Version("5.0") + + def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict, load_config=None): buffer_names = [name for name, _ in model_to_load.named_buffers()] buffer_state_dict = {k: v for k, v in state_dict.items() if k in buffer_names} model_to_load.load_state_dict(buffer_state_dict, strict=False) - return tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"]( - model_to_load, state_dict, load_config - ) + cached_fn = tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"] + if _TRANSFORMERS_GE_5_0: + return cached_fn(model_to_load, state_dict, load_config) + return cached_fn(model_to_load, state_dict) pretrained_model_patch_methods = [ diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 0aec4411e0e..c7c666f8ad9 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -1307,6 +1307,21 @@ def postprocess(module, name): dtype=w_dtype, device=w_device, ) + # Mirror the calibrated postprocess path, gated on + # is_input_quantized so weight-only AWQ configs (where + # setup() never disabled input_quantizer) stay untouched. + # Collapse any per-channel _amax left over from cache_mode + # max_calibrate into a per-tensor scalar so + # preprocess_linear_fusion's numel==1 assertion passes, and + # re-enable the quantizer (awq_lite.setup disabled it). + if module.awq_lite.is_input_quantized: + if module.input_quantizer.amax is not None: + act_amax = module.input_quantizer.amax + module.input_quantizer._amax_for_smoothing = act_amax.cpu() + module.input_quantizer.reset_amax() + module.input_quantizer.axis = None + module.input_quantizer.amax = act_amax.amax() + module.input_quantizer.enable() else: with enable_weight_access_and_writeback(module, model, name_to_module): postprocess(module, name) diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 6ff31424c77..c3c3f164458 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -122,10 +122,16 @@ def get_weights_scaling_factor_from_quantizer( expected_shape = (*weight.shape[:-1], num_blocks_per_row) per_block_scale = per_block_scale.view(expected_shape) - # Quantize scales to FP8 + # Quantize scales to FP8. Saturate to the fp8_e4m3fn max (448) before the + # cast: when the [==0]=1.0 safety net above fires (per_block_amax was zero + # for an all-zero weight block) and global_amax is small, the pre-cast value + # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any + # value >= 480 casts to NaN — clamp first to keep the stored byte finite. 
if not keep_high_precision: - per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to( - torch.float8_e4m3fn + per_block_scale = ( + (per_block_scale * 448.0 / per_block_scale_max) + .clamp_(max=448.0) + .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 else: diff --git a/pyproject.toml b/pyproject.toml index a174c6218d8..0dc27f85086 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,14 +75,14 @@ onnx = [ hf = [ "accelerate>=1.0.0", "datasets>=3.0.0", - "deepspeed>=0.9.6; platform_system != 'Darwin' and platform_system != 'Windows'", + "deepspeed>=0.9.6,<0.19; platform_system != 'Darwin' and platform_system != 'Windows'", "diffusers>=0.32.2", "huggingface_hub>=0.24.0", "nltk", "peft>=0.17.0", - "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export + "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "tiktoken", - "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py + "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py "wonderwords", ] diff --git a/tests/examples/llm_eval/test_llm_eval.py b/tests/examples/llm_eval/test_llm_eval.py index 0abf78b53e9..356430ea6f6 100644 --- a/tests/examples/llm_eval/test_llm_eval.py +++ b/tests/examples/llm_eval/test_llm_eval.py @@ -15,16 +15,38 @@ import subprocess -from _test_utils.examples.models import TINY_LLAMA_PATH -from _test_utils.examples.run_command import run_llm_ptq_command +from _test_utils.examples.run_command import ( + extend_cmd_parts, + run_example_command, + run_llm_ptq_command, +) from _test_utils.torch.misc import minimum_sm +from _test_utils.torch.transformers_models import create_tiny_qwen3_dir + + +def test_lm_eval_hf(tmp_path): + model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True) + + cmd_parts = extend_cmd_parts( + ["python", "lm_eval_hf.py"], + model="hf", + model_args=f"pretrained={model_dir}", + tasks="mmlu", + num_fewshot=5, + limit=0.1, + batch_size=8, + ) + run_example_command(cmd_parts, "llm_eval") @minimum_sm(89) -def test_llama_eval_fp8(): +def test_qwen3_eval_fp8(tmp_path): + # Bump max_position_embeddings: TRT-LLM serve rejects prompts longer than + # max_seq_len, and the default (32) is shorter than even simple MMLU prompts. 
+ model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True, max_position_embeddings=2048) try: run_llm_ptq_command( - model=TINY_LLAMA_PATH, + model=str(model_dir), quant="fp8", tasks="mmlu,lm_eval,simple_eval", calib=64, diff --git a/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py b/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py index b1b3691a797..430b7ee4113 100644 --- a/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py +++ b/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py @@ -21,6 +21,7 @@ from modelopt.torch.quantization.calib import NVFP4MSECalibrator from modelopt.torch.quantization.config import QuantizerAttributeConfig from modelopt.torch.quantization.nn import NVFP4StaticQuantizer, TensorQuantizer +from modelopt.torch.quantization.qtensor import NVFP4QTensor from modelopt.torch.quantization.tensor_quant import ( scaled_e4m3_impl, static_blockwise_fp4_fake_quant, @@ -64,6 +65,51 @@ def test_global_amax_property(self, device): quantizer.global_amax = None assert quantizer.global_amax is None + def test_export_fp8_scale_no_nan_for_zero_amax_block(self, device): + """Regression: export must not emit fp8 NaN bytes for an all-zero block. + + When max-only calibration leaves ``_amax = 0`` for a fully-zero weight block, + the export's ``[per_block_scale == 0] = 1.0`` safety net drives the pre-cast + value to ``1.0 * 448 / (global_amax / 6)``. fp8_e4m3fn has no Inf, so any + pre-cast value >= 480 rounds to NaN — without a saturation clamp this writes + a 0x7F byte into ``weight_scale``. Reproduces the NaN seen in the saved + Kimi-K2.6-NVFP4-MSE checkpoint at expert 21 down_proj. + """ + block_size = 16 + cfg = QuantizerAttributeConfig( + num_bits=(2, 1), + block_sizes={-1: block_size, "type": "static", "scale_bits": (4, 3)}, + ) + quantizer = NVFP4StaticQuantizer(quant_attribute_cfg=cfg).to(device) + + # Two-block weight: block 0 is non-trivial; block 1 is all zeros so its + # per-block amax is exactly 0. + weight = torch.zeros(1, 2 * block_size, device=device, dtype=torch.bfloat16) + weight[0, :block_size] = 0.1 + + per_block_amax = weight.abs().reshape(1, 2, block_size).amax(dim=-1).flatten() + quantizer.amax = per_block_amax + quantizer.global_amax = per_block_amax.max() + + # Sanity: the bug only fires when the would-be cast value exceeds 480. + # With global_amax = 0.1, scale_in_fp8 for a zero block is + # 1.0 * 448 / (0.1 / 6) ≈ 26880 — well past the 480 NaN threshold. + assert (per_block_amax == 0).any() + assert quantizer.global_amax.float().item() < 1.0 + + weight_scale, _ = NVFP4QTensor.get_weights_scaling_factor_from_quantizer( + quantizer, weight, weights_scaling_factor_2=None + ) + assert weight_scale.dtype == torch.float8_e4m3fn + + # No fp8_e4m3fn NaN bytes (NaN encoding is (b & 0x7F) == 0x7F). + raw = weight_scale.view(torch.uint8) + n_nan = ((raw & 0x7F) == 0x7F).sum().item() + assert n_nan == 0, f"fp8 weight_scale contains {n_nan} NaN byte(s)" + + # The all-zero block's stored fp8 scale should saturate to 448 (max finite). 
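+        # (0x7E decodes as 2**(15 - 7) * 1.75 = 448, the largest finite e4m3fn
+        # value; the next pattern up, 0x7F, is the NaN encoding.)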
+ assert raw.flatten()[1].item() == 0x7E + def test_fake_quantize_with_both_amaxs(self, device): """Test _fake_quantize uses both _amax and _global_amax.""" num_blocks = 4 diff --git a/tests/unit/onnx/quantization/test_qdq_utils.py b/tests/unit/onnx/quantization/test_qdq_utils.py index 8af5f560dd0..0ff3686a610 100644 --- a/tests/unit/onnx/quantization/test_qdq_utils.py +++ b/tests/unit/onnx/quantization/test_qdq_utils.py @@ -1108,3 +1108,96 @@ def test_constant_node_scale_path_still_patched(self): scale_arr = numpy_helper.to_array(value_attr.t) assert not (scale_arr == 0).any() assert (scale_arr > 0).all() + + +class TestLegacyEdgeLLMShims: + """Smoke tests for the deprecated top-level shims kept for TensorRT-Edge-LLM 0.6.1. + + These are the functions edgellm 0.6.1 imports from + ``modelopt.onnx.quantization.qdq_utils`` directly (not via the staged exporters). + Tests verify each shim runs end-to-end on the same fixtures used for the staged + exporters and emits a ``DeprecationWarning``. + """ + + def test_quantize_weights_to_int4_shim(self): + import warnings + + from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_int4 + + model = create_test_model_with_int4_dq_reshape_transpose_matmul() + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + quantized_model = quantize_weights_to_int4(model) + + assert any( + issubclass(w.category, DeprecationWarning) + and "quantize_weights_to_int4" in str(w.message) + for w in caught + ) + + weight_tensor = next( + init for init in quantized_model.graph.initializer if init.name == "weight" + ) + assert weight_tensor.data_type == TensorProto.INT4 + + node_types = [node.op_type for node in quantized_model.graph.node] + assert "Reshape" not in node_types + assert "Transpose" not in node_types + + def test_quantize_weights_to_mxfp8_shim(self): + import warnings + + from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_mxfp8 + + model = create_test_model_with_mxfp8_dq() + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + quantized_model = quantize_weights_to_mxfp8(model) + + assert any( + issubclass(w.category, DeprecationWarning) + and "quantize_weights_to_mxfp8" in str(w.message) + for w in caught + ) + + weight_tensor = next( + init for init in quantized_model.graph.initializer if init.name == "linear.weight" + ) + assert weight_tensor.data_type == TensorProto.FLOAT8E4M3FN + + gelu_node = next(node for node in quantized_model.graph.node if node.op_type == "Gelu") + approximate_attr = next(attr for attr in gelu_node.attribute if attr.name == "approximate") + assert approximate_attr.s == b"tanh" + + @pytest.mark.parametrize("with_transpose", [False, True]) + def test_fp4qdq_to_2dq_shim(self, with_transpose): + import warnings + + from modelopt.onnx.quantization.qdq_utils import fp4qdq_to_2dq + + model = create_test_model_with_nvfp4_qdq(with_transpose=with_transpose) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + converted_model = fp4qdq_to_2dq(model) + + assert any( + issubclass(w.category, DeprecationWarning) and "fp4qdq_to_2dq" in str(w.message) + for w in caught + ) + + fp4qdq_nodes = [node for node in converted_model.graph.node if node.op_type == "TRT_FP4QDQ"] + assert len(fp4qdq_nodes) == 0 + + dq_nodes = [ + node for node in converted_model.graph.node if node.op_type == "DequantizeLinear" + ] + assert len(dq_nodes) == 2 + + initializer_names = {init.name for init in 
converted_model.graph.initializer} + assert "linear.weight_f4" in initializer_names + assert "linear.weight_f8_scale" in initializer_names + assert "linear.weight_f8_scale_f32_scale" in initializer_names + assert "linear.weight" not in initializer_names diff --git a/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py b/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py index 38fce51f4aa..ff7f77cf617 100755 --- a/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py +++ b/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py @@ -55,7 +55,7 @@ def setup_mocks(): with ( mock.patch( - "modelopt.torch._deploy._runtime.tensorrt.engine_builder._run_command" + "modelopt.torch._deploy._runtime.tensorrt.engine_builder._run_trtexec_with_logging" ) as mock_run, mock.patch( "modelopt.torch._deploy._runtime.tensorrt.engine_builder.TemporaryDirectory" diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 29435827748..e0ce2f0c66e 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -300,6 +300,94 @@ def test_export_creates_per_expert_submodules(self): if QuantModuleRegistry.get(expert_type) is not None: QuantModuleRegistry.unregister(expert_type) + def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch): + """gate_proj and up_proj must share weight_scale_2 even when an expert + was never routed during calibration. + + Regression for the bug where ``_export_fused_experts``'s per-projection + fallback computed amax independently from the gate and up halves of the + fused tensor — producing mismatched ``weight_scale_2`` values for any + uncalibrated expert. vLLM fuses W1 (gate) and W3 (up) at load time and + asserts a single shared scale; mismatched scales corrupted MoE output. + The fix derives the fallback amax once from the fused ``gate_up[idx]`` + tensor before the deepcopies, so gate's clone and up's clone start with + the same amax. + """ + from modelopt.torch.export.moe_utils import _export_fused_experts + + # Build experts where gate and up have very different magnitudes — + # any per-half fallback would clearly produce different amaxes. + experts = _SyntheticFusedExperts() + gate = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02 + up = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.20 + with torch.no_grad(): + experts.gate_up_proj.copy_(torch.cat([gate, up], dim=1)) + + expert_type = type(experts) + if QuantModuleRegistry.get(expert_type) is None: + QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})( + _QuantFusedExperts + ) + try: + converted = QuantModuleRegistry.convert(experts) + + # Leave every expert weight quantizer uncalibrated (no _amax). + # Mark them enabled to exercise the export-time fallback path. + for q in converted.gate_up_proj_weight_quantizers: + q._disabled = False + for q in converted.down_proj_weight_quantizers: + q._disabled = False + + # Capture the amax each per-projection wrapper carries into the + # FP4 quantization step. Patching here avoids needing CUDA / FP4. + seen = {} # (expert_idx, proj_name) -> amax tensor + + def _spy_export(wrapper, dtype): + # Identify which expert/projection this wrapper belongs to by + # matching the weight tensor against the fused parameters. 
+ w = wrapper.weight.data + # gate_up_proj is (N, 2*INTER, HIDDEN); split halves are + # contiguous .data views or .contiguous() copies — we can match + # by shape and value identity for this synthetic case. + amax = wrapper.weight_quantizer._amax.detach().clone() + # Identify by matching against gate vs. up slices of each expert. + for idx in range(NUM_EXPERTS): + g_slice = converted.gate_up_proj.data[idx, :INTERMEDIATE_DIM, :] + u_slice = converted.gate_up_proj.data[idx, INTERMEDIATE_DIM:, :] + d_slice = converted.down_proj.data[idx] + if w.shape == g_slice.shape and torch.equal(w, g_slice): + seen[(idx, "gate_proj")] = amax + return + if w.shape == u_slice.shape and torch.equal(w, u_slice): + seen[(idx, "up_proj")] = amax + return + if w.shape == d_slice.shape and torch.equal(w, d_slice): + seen[(idx, "down_proj")] = amax + return + + monkeypatch.setattr( + "modelopt.torch.export.unified_export_hf._export_quantized_weight", + _spy_export, + ) + + _export_fused_experts(converted, torch.float16) + + # Assert: for every expert, gate's amax matches up's amax. + for idx in range(NUM_EXPERTS): + g_amax = seen.get((idx, "gate_proj")) + u_amax = seen.get((idx, "up_proj")) + assert g_amax is not None and u_amax is not None, ( + f"Expert {idx}: missing recorded amax (gate={g_amax}, up={u_amax})" + ) + assert torch.allclose(g_amax, u_amax), ( + f"Expert {idx}: gate amax {g_amax.item()} != up amax {u_amax.item()}. " + f"Uncalibrated fused experts must share gate/up amax so that " + f"weight_scale_2 stays consistent across the fusion." + ) + finally: + if QuantModuleRegistry.get(expert_type) is not None: + QuantModuleRegistry.unregister(expert_type) + # --------------------------------------------------------------------------- # Tests for force_eager_experts_impl_on_the_fly diff --git a/tests/unit/torch/quantization/test_calib.py b/tests/unit/torch/quantization/test_calib.py index d2e6fdd03e8..a39ee55d9d2 100644 --- a/tests/unit/torch/quantization/test_calib.py +++ b/tests/unit/torch/quantization/test_calib.py @@ -312,6 +312,84 @@ def test_padded_awq(): model(torch.randn(2, 16, 16)) +class _TwoBranchModel(nn.Module): + """Two parallel linears; only the first is exercised by forward_loop.""" + + def __init__(self): + super().__init__() + self.calibrated = nn.Linear(16, 16, bias=False) + self.uncalibrated = nn.Linear(16, 16, bias=False) + + def forward(self, x, branch="calibrated"): + if branch == "calibrated": + return self.calibrated(x) + return self.uncalibrated(x) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="NVFP4 dynamic block quant is CUDA-only") +def test_awq_lite_uncalibrated_linear_keeps_input_quantizer_enabled(): + """Regression test for NVBug 6143871. + + awq_lite.setup() disables the input_quantizer at the start of search. The + calibrated branch re-enables it inside postprocess(); the uncalibrated + branch (no cache-pass tokens, e.g. an MoE expert that never gets routed) + must do the same — otherwise downstream export (set_expert_quantizer_amax + + _export_quantized_weight) drops the input_scale buffer and inference + runtimes that read per-expert input_scale (e.g. TRT-LLM CutlassFusedMoE) + crash with KeyError on '.w1.input_scale'. + + Also asserts the export-critical scalar amax invariant (axis=None, + numel==1) — preprocess_linear_fusion enforces it for fused-expert groups. 
+ """ + torch.manual_seed(0) + model = _TwoBranchModel().cuda() + + def _forward_loop(m): + for _ in range(2): + m(torch.randn(2, 16, 16, device="cuda"), branch="calibrated") + + mtq.quantize(model, mtq.NVFP4_AWQ_LITE_CFG, _forward_loop) + + assert model.calibrated.input_quantizer.is_enabled + assert model.uncalibrated.input_quantizer.is_enabled, ( + "Uncalibrated linear's input_quantizer must remain enabled after " + "awq_lite postprocess so export emits input_scale (NVBug 6143871)." + ) + uncal_q = model.uncalibrated.input_quantizer + # When amax exists (cache-hit but search-miss path), it must be the + # scalar form export expects — preprocess_linear_fusion asserts numel==1. + # When it's None (truly never routed), set_expert_quantizer_amax will + # populate it during export. + if uncal_q.amax is not None: + assert uncal_q.axis is None + assert uncal_q.amax.numel() == 1 + + +def test_awq_lite_uncalibrated_weight_only_keeps_input_quantizer_disabled(): + """Weight-only AWQ companion to NVBug 6143871. + + For weight-only AWQ configs (input_quantizer disabled), awq_lite.setup() + never touches the input_quantizer, so the postprocess uncalibrated branch + must NOT enable it — doing so turns on quantization the user's config had + explicitly opted out of. + """ + torch.manual_seed(0) + model = _TwoBranchModel() + + def _forward_loop(m): + for _ in range(2): + m(torch.randn(2, 16, 16), branch="calibrated") + + mtq.quantize(model, mtq.INT4_AWQ_CFG, _forward_loop) + + assert not model.calibrated.input_quantizer.is_enabled + assert not model.uncalibrated.input_quantizer.is_enabled, ( + "Weight-only AWQ must not flip on the input_quantizer for " + "uncalibrated layers — that would silently quantize activations " + "the user's config left in full precision." + ) + + def test_smoothquant_enable_disable(): torch.manual_seed(1234) model = _SimpleMLP()