diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py index 7d1f9f19935..51c0930e8f2 100755 --- a/examples/llm_eval/lm_eval_hf.py +++ b/examples/llm_eval/lm_eval_hf.py @@ -42,15 +42,15 @@ import datasets from lm_eval import utils -from lm_eval.__main__ import cli_evaluate, parse_eval_args, setup_parser +from packaging.version import Version -if not version("lm_eval").startswith("0.4.8"): - warnings.warn( - f"lm_eval_hf.py is tested with lm-eval 0.4.8; found {version('lm_eval')}. " - "Later versions may have incompatible API changes." - ) +if Version(version("lm_eval")) < Version("0.4.10"): + raise ImportError(f"lm_eval_hf.py requires lm-eval >= 0.4.10; found {version('lm_eval')}.") + +from lm_eval._cli import HarnessCLI from lm_eval.api.model import T from lm_eval.models.huggingface import HFLM +from lm_eval.utils import setup_logging from quantization_utils import quantize_model from sparse_attention_utils import sparsify_model @@ -160,9 +160,24 @@ def create_from_arg_string( HFLM.create_from_arg_string = classmethod(create_from_arg_string) -def setup_parser_with_modelopt_args(): - """Extend the lm-eval argument parser with ModelOpt quantization and sparsity options.""" - parser = setup_parser() +# ModelOpt-specific args that we add to lm-eval's parser. After parsing, these are +# moved out of the argparse namespace and into args.model_args so they reach +# HFLM.create_from_arg_obj (and so lm-eval's own arg validation doesn't reject them). +_MODELOPT_ARG_KEYS = ( + "quant_cfg", + "calib_batch_size", + "calib_size", + "auto_quantize_bits", + "auto_quantize_method", + "auto_quantize_score_size", + "auto_quantize_checkpoint", + "compress", + "sparse_cfg", +) + + +def _add_modelopt_args(parser): + """Extend an lm-eval argument parser with ModelOpt quantization and sparsity options.""" parser.add_argument( "--quant_cfg", type=str, @@ -221,33 +236,45 @@ def setup_parser_with_modelopt_args(): type=str, help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)", ) - return parser -if __name__ == "__main__": - parser = setup_parser_with_modelopt_args() - args = parse_eval_args(parser) - model_args = utils.simple_parse_args_string(args.model_args) +def _inject_modelopt_args_into_model_args(args): + """Move ModelOpt args from the argparse namespace into args.model_args. + + args.model_args is a dict (parsed by lm-eval's MergeDictAction). The ModelOpt + keys must be removed from the namespace so EvaluatorConfig.from_cli doesn't + reject them as unknown kwargs. + """ + model_args = dict(args.model_args) if args.model_args else {} - if args.trust_remote_code: + if getattr(args, "trust_remote_code", False): + # Propagate the user-provided --trust_remote_code flag (not hardcoded). 
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True model_args["trust_remote_code"] = True args.trust_remote_code = None - model_args.update( - { - "quant_cfg": args.quant_cfg, - "auto_quantize_bits": args.auto_quantize_bits, - "auto_quantize_method": args.auto_quantize_method, - "auto_quantize_score_size": args.auto_quantize_score_size, - "auto_quantize_checkpoint": args.auto_quantize_checkpoint, - "calib_batch_size": args.calib_batch_size, - "calib_size": args.calib_size, - "compress": args.compress, - "sparse_cfg": args.sparse_cfg, - } - ) + for key in _MODELOPT_ARG_KEYS: + if hasattr(args, key): + model_args[key] = getattr(args, key) + delattr(args, key) args.model_args = model_args - cli_evaluate(args) + +if __name__ == "__main__": + setup_logging() + cli = HarnessCLI() + # The `run` subcommand owns the model/task arguments; extend that parser. + # `_subparsers` is private API; guard so a future lm-eval refactor surfaces a + # clear error instead of an opaque AttributeError. + try: + run_parser = cli._subparsers.choices["run"] + except (AttributeError, KeyError) as e: + raise RuntimeError( + "Cannot locate lm-eval's `run` subparser; the HarnessCLI internals may " + f"have changed. Installed lm-eval version: {version('lm_eval')}." + ) from e + _add_modelopt_args(run_parser) + args = cli.parse_args() + _inject_modelopt_args_into_model_args(args) + cli.execute(args) diff --git a/examples/llm_eval/requirements.txt b/examples/llm_eval/requirements.txt index df47ac76c6b..2762c838c6a 100644 --- a/examples/llm_eval/requirements.txt +++ b/examples/llm_eval/requirements.txt @@ -1,5 +1,5 @@ fire>=0.5.0 -lm_eval[api,ifeval]==0.4.8 +lm_eval[api,ifeval]>=0.4.10 peft>=0.5.0 rwkv>=0.7.3 torchvision diff --git a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh index 7f8e71f255c..851e3a5aeed 100755 --- a/examples/llm_sparsity/weight_sparsity/launch_finetune.sh +++ b/examples/llm_sparsity/weight_sparsity/launch_finetune.sh @@ -88,7 +88,7 @@ CMD="accelerate launch --multi_gpu --mixed_precision bf16 finetune.py \ --save_total_limit 10 \ --learning_rate 2e-5 \ --weight_decay 0.1 \ - --warmup_steps 0.0 \ + --warmup_steps 0 \ --lr_scheduler_type cosine \ --logging_steps 1 \ --fsdp 'full_shard auto_wrap' \ diff --git a/examples/puzzletron/requirements.txt b/examples/puzzletron/requirements.txt index 317a38f5eab..6f4c94e08a8 100644 --- a/examples/puzzletron/requirements.txt +++ b/examples/puzzletron/requirements.txt @@ -1,4 +1,3 @@ -lm-eval==0.4.8 math-verify ray # Likely works for transformers v5 also, but we need to test it diff --git a/examples/specdec_bench/specdec_bench/datasets/speed.py b/examples/specdec_bench/specdec_bench/datasets/speed.py index fe544bb353e..3552d71a1ad 100644 --- a/examples/specdec_bench/specdec_bench/datasets/speed.py +++ b/examples/specdec_bench/specdec_bench/datasets/speed.py @@ -737,10 +737,40 @@ def _load_dataset(self, config_name_or_dataset_path: config_type | str) -> "Data } table = table.replace_schema_metadata(new_meta or None) dataset = HFDataset(table) - if self.num_samples is not None: - dataset = dataset.select(range(self.num_samples)) + if self.num_samples is not None and self.num_samples < len(dataset): + dataset = self._stratified_select(dataset, self.num_samples) return dataset + @staticmethod + def _stratified_select(dataset: "Dataset", n: int) -> "Dataset": + """Select ``n`` samples uniformly across the ``category`` column. + + Round-robin across categories until ``n`` rows are collected. 
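+        (For example, with rows 0-2 in category ``A``, row 3 in ``B`` and rows
+        4-5 in ``C``, ``n=4`` selects row indices ``[0, 3, 4, 1]``.)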
The + resulting prefix is balanced; once a smaller category is exhausted + the remaining categories continue contributing, so exactly ``n`` + rows are returned whenever ``n`` does not exceed the dataset size. + Falls back to ``range(n)`` when ``category`` is absent or there is + only one category. Indices come from ``range(category_size)`` (not + random) so behavior is deterministic. + """ + if "category" not in dataset.column_names: + return dataset.select(range(n)) + cat_to_rows: dict[str, list[int]] = {} + for i, c in enumerate(dataset["category"]): + cat_to_rows.setdefault(c, []).append(i) + if len(cat_to_rows) <= 1: + return dataset.select(range(n)) + cat_lists = list(cat_to_rows.values()) + interleaved: list[int] = [] + max_len = max(len(c) for c in cat_lists) + for i in range(max_len): + for c in cat_lists: + if i < len(c): + interleaved.append(c[i]) + if len(interleaved) == n: + return dataset.select(interleaved) + return dataset.select(interleaved) + def _resolve_external_data( self, dataset: "Dataset", speed_config: config_type | str ) -> "Dataset": diff --git a/modelopt/onnx/export/nvfp4_exporter.py b/modelopt/onnx/export/nvfp4_exporter.py index a80a9845fb5..e8bdfa2db1f 100644 --- a/modelopt/onnx/export/nvfp4_exporter.py +++ b/modelopt/onnx/export/nvfp4_exporter.py @@ -39,6 +39,10 @@ def _cast_fp4(array: np.ndarray) -> np.ndarray: Note: The first dimension of the array must be divisible by 2 as two FP4 values are packed into a single byte. + + Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq`` + compatibility shim. Do not rename or change the signature without updating that + shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1). """ array_f32_t = torch.from_numpy(array) array_f32_t_shape = array_f32_t.shape @@ -76,6 +80,10 @@ def _replace_fp4qdq_with_2dq( ): """Replaces the given node in the ONNX graph with a subgraph consisting of two DequantizeLinear nodes. + Also reused by the deprecated ``modelopt.onnx.quantization.qdq_utils.fp4qdq_to_2dq`` + compatibility shim. Do not rename or change the signature without updating that + shim (it is a load-bearing re-export for TensorRT-Edge-LLM 0.6.1). + Args: graph: The ONNX graph containing the node to replace. node: The node to be replaced. diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index df6dbc877d0..b87478a1572 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -31,7 +31,6 @@ import os import re import shutil -import subprocess # nosec B404 import tempfile import time from abc import ABC, abstractmethod @@ -42,7 +41,7 @@ import torch from modelopt.onnx.logging_config import logger -from modelopt.onnx.quantization.ort_utils import _check_for_trtexec +from modelopt.onnx.quantization.ort_utils import _check_for_trtexec, _run_trtexec TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None if TRT_AVAILABLE: @@ -159,7 +158,6 @@ def __init__( warmup_runs: int = 5, timing_runs: int = 10, plugin_libraries: list[str] | None = None, - trtexec_path: str = "trtexec", trtexec_args: list[str] | None = None, ): """Initialize the trtexec benchmark. @@ -169,14 +167,11 @@ def __init__( warmup_runs: See :meth:`Benchmark.__init__`. timing_runs: See :meth:`Benchmark.__init__`. plugin_libraries: See :meth:`Benchmark.__init__`. - trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which - looks for the binary in PATH. 
trtexec_args: Additional command-line arguments to pass to trtexec. These are appended after the standard arguments. Example: ['--fp16', '--workspace=4096', '--verbose'] """ super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries) - self.trtexec_path = trtexec_path self.trtexec_args = trtexec_args if trtexec_args is not None else [] self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_") self.engine_path = os.path.join(self.temp_dir, "engine.trt") @@ -186,7 +181,6 @@ def __init__( self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms" self._base_cmd = [ - self.trtexec_path, f"--avgRuns={self.timing_runs}", f"--iterations={self.timing_runs}", f"--warmUp={self.warmup_runs}", @@ -268,13 +262,14 @@ def run( self.logger.debug(f"Wrote model bytes to temporary file: {model_path}") cmd = [*self._base_cmd, f"--onnx={model_path}"] - self.logger.debug(f"Running: {' '.join(cmd)}") - result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603 + full_cmd = ["trtexec", *cmd] + self.logger.debug(f"Running: {' '.join(full_cmd)}") + result = _run_trtexec(cmd) self._write_log_file( log_file, "\n".join( [ - f"Command: {' '.join(cmd)}", + f"Command: {' '.join(full_cmd)}", f"Return code: {result.returncode}", "=" * 80, "STDOUT:", @@ -301,8 +296,9 @@ def run( self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms") return latency except FileNotFoundError: - self.logger.error(f"trtexec binary not found: {self.trtexec_path}") - self.logger.error("Please ensure TensorRT is installed and trtexec path is correct") + self.logger.error( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) return float("inf") except Exception as e: self.logger.error(f"Benchmark failed: {e}") diff --git a/modelopt/onnx/quantization/ort_utils.py b/modelopt/onnx/quantization/ort_utils.py index 2c5a0b7d2da..f7799c634f0 100755 --- a/modelopt/onnx/quantization/ort_utils.py +++ b/modelopt/onnx/quantization/ort_utils.py @@ -46,6 +46,30 @@ def _check_lib_in_ld_library_path(ld_library_path, lib_pattern): return False, None +def _run_trtexec( + args: list[str] | None = None, timeout: float | None = None +) -> subprocess.CompletedProcess: + """Run a 'trtexec' command via subprocess. + + Args: + args: Arguments to pass to trtexec (without the 'trtexec' command itself). + timeout: Optional subprocess timeout in seconds. + + Returns: + The completed subprocess result. + + Raises: + FileNotFoundError: If the 'trtexec' binary is not found in PATH. + """ + cmd = ["trtexec", *(args or [])] + try: + return subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) # nosec B603 + except FileNotFoundError as e: + raise FileNotFoundError( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) from e + + def _check_for_trtexec(min_version: str = "10.0") -> str: """Check if the `trtexec` CLI tool is available in PATH and is >= min_version. 
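A minimal usage sketch of the new wrapper (``probe_trtexec_banner`` is a hypothetical helper; the timeout and stdout/stderr concatenation mirror the ``_check_for_trtexec`` call updated in the next hunk):

    from modelopt.onnx.quantization.ort_utils import _run_trtexec

    def probe_trtexec_banner() -> str | None:
        # A bare trtexec invocation prints its version banner.
        try:
            result = _run_trtexec(timeout=5)
        except FileNotFoundError:
            return None
        return result.stdout + result.stderr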
@@ -89,7 +113,7 @@ def _parse_version_from_string(version_str: str) -> str | None: ) try: - result = subprocess.run([trtexec_path], capture_output=True, text=True, timeout=5) # nosec B603 + result = _run_trtexec(timeout=5) banner_output = result.stdout + result.stderr parsed_version = _parse_version_from_string(banner_output) diff --git a/modelopt/onnx/quantization/qdq_utils.py b/modelopt/onnx/quantization/qdq_utils.py index 265bcf36b2a..28e6f8ada8b 100644 --- a/modelopt/onnx/quantization/qdq_utils.py +++ b/modelopt/onnx/quantization/qdq_utils.py @@ -15,6 +15,7 @@ """Various utils to support inserting Q/DQ nodes.""" +import warnings from collections.abc import Sequence from typing import Any @@ -31,7 +32,16 @@ get_tensor_producer_nodes, remove_redundant_cast_nodes, ) -from modelopt.onnx.quantization.quant_utils import get_num_bits +from modelopt.onnx.quantization.quant_utils import ( + compute_e8m0, + get_amax, + get_num_bits, + get_weights_scaling_factor, + get_weights_scaling_factor_2, + pack_weights_to_int4, + quantize, +) +from modelopt.onnx.utils import get_attribute, has_attribute, read_f16_tensor_as_fp32 QUANTIZE_NODE_NAME = "QuantizeLinear" DEQUANTIZE_NODE_NAME = "DequantizeLinear" @@ -1224,3 +1234,384 @@ def get_quantized_tensors(onnx_model: onnx.ModelProto) -> set[str]: logger.debug(f"Found {len(quantized_tensors)} dequantized tensors in ONNX model") return quantized_tensors + + +_LEGACY_LLM_EXPORT_DEPRECATION_MSG = ( + "{name} in modelopt.onnx.quantization.qdq_utils is deprecated and will be " + "removed in a future release. Use modelopt.onnx.export " + "(INT4QuantExporter / NVFP4QuantExporter / MXFP8QuantExporter), or migrate to " + "TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM)." +) + + +def quantize_weights_to_int4( + onnx_model: onnx.ModelProto, +) -> onnx.ModelProto: + """Deprecated: convert ONNX model weights to INT4 with graph optimization. + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.int4_exporter.INT4QuantExporter`. 
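+
+    Minimal call sketch (``model`` is assumed to be an ``onnx.ModelProto`` whose
+    weights carry DequantizeLinear nodes; the warning capture mirrors the shim
+    tests added in this change)::
+
+        import warnings
+
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            model = quantize_weights_to_int4(model)
+        assert any(issubclass(w.category, DeprecationWarning) for w in caught)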
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="quantize_weights_to_int4"), + DeprecationWarning, + stacklevel=2, + ) + + graph = onnx_model.graph + initializer_map = {initializer.name: initializer for initializer in graph.initializer} + value_info_map = {value_info.name: value_info for value_info in graph.value_info} + weight_dq_nodes = [node for node in graph.node if node.op_type == "DequantizeLinear"] + tensor_producer_map = get_tensor_producer_nodes(graph) + + nodes_to_remove = [] + for node in weight_dq_nodes: + weight_name = node.input[0] + scale_name = node.input[1] + logger.debug(f"Processing INT4 conversion for weight {weight_name}") + weight = numpy_helper.to_array(initializer_map[weight_name]) + if scale_name in initializer_map: + scale = numpy_helper.to_array(initializer_map[scale_name]) + else: + scale_constant_node = tensor_producer_map[scale_name] + for attr in scale_constant_node.attribute: + if attr.name == "value": + tensor = attr.t + scale = numpy_helper.to_array(tensor) + + weight = weight / scale + block_size = weight.shape[-1] + + # Convert DequantizeLinear -> Reshape -> Transpose -> MatMul/Gemm to DequantizeLinear -> Matmul/Gemm + dq_child_nodes = [n for n in graph.node if node.output[0] in n.input] + reshape_node = dq_child_nodes[0] + nodes_to_remove.append(reshape_node.name) + assert reshape_node.op_type == "Reshape", f"Expected Reshape node for {node.name}" + reshape_node_output = reshape_node.output[0] + + # Remove constant node from reshape node + shape_constant_name = next(input for input in reshape_node.input if "Constant" in input) + nodes_to_remove.append(tensor_producer_map[shape_constant_name].name) + + # Get the shape of the output of the reshape node + reshape_output_value_info = value_info_map.get(reshape_node_output) + if reshape_output_value_info is not None: + weight_shape = [ + dim.dim_value for dim in reshape_output_value_info.type.tensor_type.shape.dim + ] + else: + raise ValueError(f"Unable to determine shape of weight tensor {weight_name}") + + # Reshape weights and scales + weight = weight.reshape(weight_shape) + assert weight_shape[-1] % block_size == 0, ( + f"Block size {block_size} is not divisible by {weight_shape[-1]}" + ) + scale_shape = [*weight_shape[:-1], weight_shape[-1] // block_size] + scale = scale.reshape(scale_shape) + reshape_child_nodes = [n for n in graph.node if reshape_node.output[0] in n.input] + assert len(reshape_child_nodes) == 1, f"Expected exactly one child node for {node.name}" + + # Check if there's an optional Cast node between Reshape and Transpose/MatMul/Gemm + next_node = reshape_child_nodes[0] + if next_node.op_type == "Cast": + # Remove unnecessary Cast node + cast_node = next_node + nodes_to_remove.append(cast_node.name) + cast_child_nodes = [n for n in graph.node if cast_node.output[0] in n.input] + next_node = cast_child_nodes[0] + + # Transpose weights and scales if present + if next_node.op_type == "Transpose": + transpose_node = next_node + nodes_to_remove.append(transpose_node.name) + assert transpose_node.op_type == "Transpose", f"Expected Transpose node for {node.name}" + perm = None + for attr in transpose_node.attribute: + if attr.name == "perm": + perm = list(attr.ints) + assert perm is not None, f"Permutation not found for {node.name}" + weight = weight.transpose(perm) + scale = scale.transpose(perm) + transpose_child_nodes = [n for n in graph.node if transpose_node.output[0] in n.input] + assert len(transpose_child_nodes) == 1, ( + f"Expected exactly one matmul node for 
{node.name}" + ) + matmul_node = transpose_child_nodes[0] + else: + matmul_node = next_node + assert matmul_node.op_type in ["MatMul", "Gemm"], ( + f"Expected MatMul or Gemm node for {node.name}" + ) + matmul_node.input[1] = node.output[0] + + if scale_name not in initializer_map: + # Remove scale producer if it's a Constant node + scale_name = node.input[1] + scale_producer = tensor_producer_map[scale_name] + if scale_producer.op_type == "Constant": + graph.node.remove(scale_producer) + + # Create a new scale tensor + scale_name = scale_name.replace("Constant_output_0", "scale") + scale_tensor = onnx.numpy_helper.from_array(scale, scale_name) + graph.initializer.append(scale_tensor) + node.input[1] = scale_name + else: + scale_tensor = onnx.numpy_helper.from_array(scale, scale_name) + initializer_map[scale_name].CopyFrom(scale_tensor) + + # Convert weights to INT4 precision + weight_shape = weight.shape + weights_int4_np = pack_weights_to_int4(weight) + weights_int4_onnx = onnx.numpy_helper.from_array(weights_int4_np, weight_name) + weights_int4_onnx.data_type = onnx.TensorProto.INT4 + weights_int4_onnx.dims[0] = weight_shape[0] + initializer_map[weight_name].CopyFrom(weights_int4_onnx) + logger.debug(f"Converted {weight_name} to INT4 precision") + + def is_pre_quant_scale_node(node: onnx.NodeProto) -> bool: + has_pqs_input = any(input for input in node.input if "_pre_quant_scale" in input) + return node.op_type == "Mul" and has_pqs_input + + # Remove unnecessary Cast after Pre-quant scale + for node in graph.node: + if is_pre_quant_scale_node(node): + pqs_child_nodes = [n for n in graph.node if node.output[0] in n.input] + assert len(pqs_child_nodes) == 1, f"Expected exactly one child node for {node.name}" + cast_node = pqs_child_nodes[0] + assert cast_node.op_type == "Cast", f"Expected Cast node for {node.name}" + node.output.clear() + node.output.extend(cast_node.output) + nodes_to_remove.append(cast_node.name) + + # Remove transpose and reshape nodes + new_nodes = [node for node in graph.node if node.name not in nodes_to_remove] + del graph.node[:] + graph.node.extend(new_nodes) + + # Cast bias to float16 + for node in graph.node: + if node.op_type == "Add" and "proj/Add" in node.name: + cast_initializer_to_dtype(node, "Half", initializer_map) + + # Cast pre quant scales of o_proj and down_proj to float16 + for node in graph.node: + if node.op_type == "Mul" and ( + any( + x in node.name + for x in ("o_proj/input_quantizer/Mul", "down_proj/input_quantizer/Mul") + ) + ): + cast_initializer_to_dtype(node, "Half", initializer_map) + + return onnx_model + + +def quantize_weights_to_mxfp8( + onnx_model: onnx.ModelProto, +) -> onnx.ModelProto: + """Deprecated: convert weights to MXFP8 (FP8 with e8m0 per-block scales). + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.mxfp8_exporter.MXFP8QuantExporter`. 
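+
+    Per-block encoding performed in the body below (``e8_m0_bias`` is 127; this
+    is a sketch of the existing steps, not an additional transformation)::
+
+        se8m0      = compute_e8m0(amax, weight.shape, quant_axis, block_size)
+        fp8_weight = _cast_fp8(weight / np.exp2(se8m0 - 127))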
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="quantize_weights_to_mxfp8"), + DeprecationWarning, + stacklevel=2, + ) + + logger.info("Converting weights to MXFP8 precision") + graph = onnx_model.graph + initializer_map = {initializer.name: initializer for initializer in graph.initializer} + tensor_producer_map = get_tensor_producer_nodes(graph) + e8_m0_bias = 127 + weight_dq_nodes = [ + node + for node in graph.node + if node.op_type == "TRT_MXFP8DequantizeLinear" + and any(".weight" in input for input in node.input) + ] + gelu_nodes = [node for node in graph.node if node.op_type == "Gelu"] + logger.debug(f"Found {len(weight_dq_nodes)} weight DQ nodes and {len(gelu_nodes)} GELU nodes") + + for node in weight_dq_nodes: + # Get weights and node attributes + weight_name = node.input[0] + logger.debug(f"Processing MXFP8 conversion for weight {weight_name}") + weight = numpy_helper.to_array(initializer_map[weight_name]) + if has_attribute(node, "axis"): + quant_axis = int(get_attribute(node, "axis")) + else: + quant_axis = -1 + logger.warning( + "axis attribute not found for MXFP8DequantizeLinear node. Setting axis to -1" + ) + + if has_attribute(node, "block_size"): + block_size = int(get_attribute(node, "block_size")) + else: + block_size = 32 + logger.warning( + "block_size attribute not found for MXFP8DequantizeLinear node. Setting block_size to 32" + ) + + # Compute and save scales as uint8 + amax = get_amax(weight, quant_axis, block_size) + se8m0_fp32 = compute_e8m0(amax, weight.shape, quant_axis, block_size) + se8m0 = se8m0_fp32.astype(np.uint8) + + # Remove scale producer if it's a Constant node + scale_name = node.input[1] + scale_producer = tensor_producer_map[scale_name] + if scale_producer.op_type == "Constant": + graph.node.remove(scale_producer) + + # Create a new scale tensor + scale_name = scale_name.replace("Constant_output_0", "scale") + scale_tensor = onnx.numpy_helper.from_array(se8m0, scale_name) + graph.initializer.append(scale_tensor) + node.input[1] = scale_name + + # Convert weights to FP8 + # Expand block array so that it can be broadcasted with weight + se8m0_fp32 = np.repeat(se8m0_fp32, block_size, axis=quant_axis) + scaled_weight = weight / np.exp2(se8m0_fp32 - e8_m0_bias) + weights_e4m3 = onnx.helper.make_tensor( + name=weight_name, + data_type=onnx_dtype_map["Float8"], + dims=[*scaled_weight.shape], + vals=_cast_fp8(scaled_weight).tobytes(), + raw=True, + ) + initializer_map[weight_name].CopyFrom(weights_e4m3) + logger.debug(f"Converted {weight_name} to MXFP8") + + # set output type of DQ to FP16 + for node in graph.node: + if node.op_type in ["TRT_MXFP8DequantizeLinear"]: + for attr in node.attribute: + if attr.name == "output_dtype": + attr.i = onnx_dtype_map["Half"] + + # Currently only tanh approximation is supported for Gelu + for node in gelu_nodes: + for attr in node.attribute: + if attr.name == "approximate": + attr.s = b"tanh" + logger.debug(f"Updated GELU node {node.name} to use tanh approximation") + + return onnx_model + + +def fp4qdq_to_2dq(onnx_model: onnx.ModelProto, verbose: bool = False) -> onnx.ModelProto: + """Deprecated: convert FP32/FP16 weights of TRT_FP4QDQ nodes to FP4 + 2 DQ subgraph. + + Preserved as a compatibility shim for TensorRT-Edge-LLM 0.6.1 and earlier. + New code should use :class:`modelopt.onnx.export.nvfp4_exporter.NVFP4QuantExporter`. 
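+
+    Approximate resulting weight subgraph (initializer names follow the suffix
+    pattern exercised by the shim tests in this change, e.g. ``linear.weight``
+    becomes ``linear.weight_f4`` / ``linear.weight_f8_scale`` /
+    ``linear.weight_f8_scale_f32_scale``)::
+
+        weight_f8_scale (FP8, per block) --DQ (scale: weight_f8_scale_f32_scale)--> block scale
+        weight_f4 (packed FP4)           --DQ (scale: block scale)--> MatMul / Gemm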
+ """ + warnings.warn( + _LEGACY_LLM_EXPORT_DEPRECATION_MSG.format(name="fp4qdq_to_2dq"), + DeprecationWarning, + stacklevel=2, + ) + + # Lazy import to avoid a circular import: nvfp4_exporter imports from this module. + from modelopt.onnx.export.nvfp4_exporter import _cast_fp4, _replace_fp4qdq_with_2dq + + logger.info("Converting model with FP4QDQ nodes to 2DQ only model") + graph = onnx_model.graph + initializers = graph.initializer + initializers_to_delete = [] + tensor_consumers = get_tensor_consumer_nodes(graph) + initializer_indices = { + initializer.name: idx for idx, initializer in enumerate(graph.initializer) + } + value_info_map = {vi.name: vi for vi in graph.value_info} + graph_inputs = {inp.name for inp in graph.input} + + def _cast_input_dtypes(node: onnx.NodeProto, precision_dtype: str): + # Change the input types to match weight precision (precision_dtype) + if node.op_type == "Transpose": + maybe_matmul = tensor_consumers[node.output[0]][0] + assert maybe_matmul.op_type == "MatMul" + node = maybe_matmul + + # Create Cast nodes for each input of the target node except bias + for i, input_name in enumerate(node.input[:2]): + cast_output_name = input_name + "_f16" + + cast_node = onnx.helper.make_node( + "Cast", + inputs=[input_name], + outputs=[cast_output_name], + to=onnx_dtype_map[precision_dtype], + ) + + graph.node.extend([cast_node]) + node.input[i] = cast_output_name + + def _get_precision_dtype() -> str: + precision_dtype = "Half" + for initializer in graph.initializer: + if initializer.data_type == onnx.TensorProto.BFLOAT16: + precision_dtype = "BFloat16" + break + return precision_dtype + + if verbose: + logger.info("Post-processing TRT_FP4QDQ nodes for TRT deployment") + precision_dtype = _get_precision_dtype() + logger.debug(f"Using precision dtype: {precision_dtype}") + fp4_qdq_nodes = [node for node in graph.node if node.op_type == "TRT_FP4QDQ"] + logger.debug(f"Found {len(fp4_qdq_nodes)} FP4QDQ nodes to convert") + + for node in fp4_qdq_nodes: + idx1 = initializer_indices.get(node.input[0], None) + assert idx1 is not None, f"Initializer for weight '{node.input[0]}' not found." + block_size_attr = next((attr for attr in node.attribute if attr.name == "block_size"), None) + assert block_size_attr is not None, f"block_size attribute not found for {node.name}" + block_size = block_size_attr.i + initializers_to_delete.append(initializers[idx1].name) + logger.debug( + f"Processing FP4QDQ node for weight {node.input[0]} with block size {block_size}" + ) + + tensor = initializers[idx1] + w32 = read_f16_tensor_as_fp32(tensor) + sw_f32_per_tensor = get_weights_scaling_factor_2(w32) + sw_f32_per_block = get_weights_scaling_factor(w32, block_size, sw_f32_per_tensor) + w_f32 = quantize(w32, block_size, sw_f32_per_block, sw_f32_per_tensor) + + # Real quantize the tensors + w_f4 = _cast_fp4(w_f32) + sw_f8_per_block = _cast_fp8(sw_f32_per_block) + + _replace_fp4qdq_with_2dq( + graph, + node, + initializer_indices, + value_info_map, + graph_inputs, + w_f4, + sw_f32_per_tensor, + sw_f8_per_block, + block_size, + ) + + # We need to change the bias etc. 
type + next_node = tensor_consumers[node.output[0]][0] + _cast_input_dtypes(next_node, precision_dtype) + + if verbose: + logger.debug(f"Replaced {node.name} with 2 DQ nodes") + + new_initializers = [ + init for init in graph.initializer if init.name not in initializers_to_delete + ] + graph.ClearField("initializer") + graph.initializer.extend(new_initializers) + logger.info(f"Removed {len(initializers_to_delete)} initializers") + + return onnx_model diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/constants.py b/modelopt/torch/_deploy/_runtime/tensorrt/constants.py index c4f387482e9..d9ace1645a3 100644 --- a/modelopt/torch/_deploy/_runtime/tensorrt/constants.py +++ b/modelopt/torch/_deploy/_runtime/tensorrt/constants.py @@ -32,10 +32,6 @@ ONE_GIBI_IN_BYTES = 1 << 30 # TensorRT conversion tool names -TRTEXEC = "trtexec" - -# trtexec path within docker -TRTEXEC_PATH = "trtexec" DEFAULT_ARTIFACT_DIR = "modelopt_build/trt_artifacts" # Default conversion params diff --git a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py index 055a1f26b27..bb8bbd292b8 100644 --- a/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py +++ b/modelopt/torch/_deploy/_runtime/tensorrt/engine_builder.py @@ -15,10 +15,9 @@ import logging import shutil -import subprocess # nosec import sys from pathlib import Path -from tempfile import NamedTemporaryFile, TemporaryDirectory, gettempdir +from tempfile import TemporaryDirectory, gettempdir from ..._runtime.common import read_bytes, timeit, write_bytes, write_string from ..._runtime.tensorrt.layerwise_profiling import process_layerwise_result @@ -28,7 +27,6 @@ DEFAULT_NUM_INFERENCE_PER_RUN, SHA_256_HASH_LENGTH, TRT_MODE_FLAGS, - TRTEXEC_PATH, WARMUP_TIME_MS, TRTMode, ) @@ -41,31 +39,50 @@ ) -# TODO: Get rid of this function or get approval for `# nosec` usage if we want to include this -# as a non-compiled python file in the release. -def _run_command(cmd: list[str], cwd: Path | None = None) -> tuple[int, bytes]: - """Util function to execute a command. +def _run_trtexec_with_logging(args: list[str], cwd: Path | None = None) -> tuple[int, bytes]: + """Run a 'trtexec' command via subprocess, logging the cmd and any failure output. - This util will not direct stdout and stderr to console if the cmd succeeds. + The 'trtexec' binary is hardcoded as the executable; only its arguments may be supplied + by the caller. This restricts the function to trtexec invocations. + + Output handling: stdout and stderr are merged and captured in memory. + On failure (non-zero returncode) or timeout, the captured output is logged at ERROR level; + on success, this function emits nothing to the console. Args: - cmd: the command line list - cwd: current working directory + args: Arguments to pass to trtexec (without the 'trtexec' command itself). + cwd: Optional working directory for the subprocess. Returns: - return code: 0 means successful, otherwise means failed - log_string: the stdout and stderr output as a string + A tuple of (returncode, output) where output is the combined stdout/stderr bytes. + Raises: + FileNotFoundError: If the 'trtexec' binary is not found in PATH. + subprocess.TimeoutExpired: If trtexec does not finish within 60 minutes. + The captured output is logged before re-raising. 
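+
+    Example (the flags are illustrative; in this module the argument list is
+    assembled by ``_build_command``)::
+
+        ret_code, out = _run_trtexec_with_logging(["--onnx=model.onnx", "--fp16"])
+        if ret_code != 0:
+            raise RuntimeError(out.decode(errors="ignore"))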
""" + import subprocess # nosec + + cmd = ["trtexec", *args] logging.info(" ".join(cmd)) - with NamedTemporaryFile("w+b") as log: - p = subprocess.Popen(cmd, stdout=log, stderr=log, cwd=str(cwd) if cwd else None) # nosec - p.wait() - log.seek(0) - output = log.read() - if p.returncode != 0: - logging.error(output.decode(errors="ignore")) - return p.returncode, output + try: + result = subprocess.run( # nosec B603 - cmd[0] is hardcoded "trtexec" + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=str(cwd) if cwd else None, + timeout=3600, + ) + except FileNotFoundError as e: + raise FileNotFoundError( + "'trtexec' binary not found. Please ensure TensorRT is installed and 'trtexec' is in PATH." + ) from e + except subprocess.TimeoutExpired as e: + logging.error((e.stdout or b"").decode(errors="ignore")) + raise + if result.returncode != 0: + logging.error(result.stdout.decode(errors="ignore")) + return result.returncode, result.stdout def _get_profiling_params(profiling_runs: int) -> list[str]: @@ -181,7 +198,7 @@ def _build_command( calib_cache_path: Path | None = None, timing_cache_path: Path | None = None, ) -> list[str]: - cmd = [TRTEXEC_PATH, f"--onnx={onnx_path}"] + cmd = [f"--onnx={onnx_path}"] cmd.extend(TRT_MODE_FLAGS[trt_mode]) if trt_mode == TRTMode.INT8 and calib_cache and calib_cache_path: @@ -235,7 +252,7 @@ def _setup_files_and_paths( cmd = _build_command(onnx_path, engine_path, calib_cache_path, timing_cache_path) try: - ret_code, out = _run_command(cmd) + ret_code, out = _run_trtexec_with_logging(cmd) if ret_code != 0: return None, out @@ -284,7 +301,7 @@ def profile_engine( """ def _build_command(engine_path: Path, profile_path: Path, layer_info_path: Path) -> list[str]: - cmd = [TRTEXEC_PATH, f"--loadEngine={engine_path}"] + cmd = [f"--loadEngine={engine_path}"] cmd += _get_profiling_params(profiling_runs) if enable_layerwise_profiling: @@ -320,7 +337,7 @@ def _setup_files_and_paths(tmp_dir_path: Path, engine_hash: str) -> tuple[Path, cmd = _build_command(engine_path, profile_path, layer_info_path) try: - ret_code, out = _run_command(cmd) + ret_code, out = _run_trtexec_with_logging(cmd) if ret_code != 0: return None, out diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py index 952ed1e39c1..8981d614843 100644 --- a/modelopt/torch/export/moe_utils.py +++ b/modelopt/torch/export/moe_utils.py @@ -62,6 +62,29 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None: for idx in range(n): expert = nn.Module() + # If the gate_up source quantizer was never calibrated (rare expert + # that received no calibration tokens), derive its amax once from the + # FUSED tensor so gate and up share the same weight_scale_2 below. + # Why: vLLM fuses W1 (gate) and W3 (up) at load time and asserts a + # single per-tensor scale across the fusion. The per-projection + # fallback further down would otherwise compute amax independently from + # each half — gate's max and up's max generally differ — producing + # mismatched weight_scale_2 and garbled MoE output at inference. + gate_up_q = module.gate_up_proj_weight_quantizers[idx] + if getattr(gate_up_q, "is_enabled", False) and ( + not hasattr(gate_up_q, "_amax") + or gate_up_q._amax is None + or torch.all(gate_up_q._amax == 0) + ): + gate_up_q.amax = gate_up[idx].abs().amax().to(torch.float32) + warnings.warn( + f"Expert {idx} gate_up_proj weight quantizer was not calibrated " + f"(amax missing or zero). 
Using fused-tensor amax as fallback " + f"(shared by gate and up so weight_scale_2 stays consistent). " + f"Consider increasing calibration size to activate all experts.", + stacklevel=2, + ) + projections = [ ("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True), ("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True), diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index ed6ed2fcf21..19deea08b45 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -59,6 +59,7 @@ ) from modelopt.torch.quantization.qtensor import MXFP8QTensor, NVFP4QTensor from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names +from modelopt.torch.utils.dataset_utils import _disable_use_cache try: from modelopt.torch.sparsity.attention_sparsity.conversion import export_sparse_attention_config @@ -217,11 +218,14 @@ def _output_hook(module, input, output): if not handles: return input_to_linear, output_to_layernorm - # Run dummy forward pass to collect modules sharing same input + # Run dummy forward pass to collect modules sharing same input. + # `_disable_use_cache` keeps the probe forward working on configs that don't + # set `use_cache` (e.g., stepfun-ai/Step-3.5-Flash's Step3p5Config). try: with ( torch.no_grad(), set_quantizer_by_cfg_context(model, [{"quantizer_name": "*", "enable": False}]), + _disable_use_cache(model), ): dummy_forward_fn() finally: diff --git a/modelopt/torch/opt/plugins/transformers.py b/modelopt/torch/opt/plugins/transformers.py index 9cc729723e7..3370309156d 100644 --- a/modelopt/torch/opt/plugins/transformers.py +++ b/modelopt/torch/opt/plugins/transformers.py @@ -20,6 +20,8 @@ from contextlib import contextmanager import torch +import transformers +from packaging.version import Version from transformers import PreTrainedModel, Trainer, TrainerCallback from transformers import modeling_utils as tf_modeling_utils @@ -130,13 +132,18 @@ def _save_pretrained_with_checks(self, save_directory, *args, **kwargs): # [Fix for huggingface bug] deepspeed zero3 training backend only loads params into the model from # state_dict, but not buffers. So lets explicitly load the buffers into the model from state_dict. +# The `load_config` parameter was added to `_load_state_dict_into_zero3_model` in transformers 5.0. 
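+# Our replacement below keeps `load_config=None` in its signature so the same
+# patch imports and applies cleanly on both sides of that boundary; the argument
+# is only forwarded when the installed transformers version accepts it.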
+_TRANSFORMERS_GE_5_0 = Version(transformers.__version__) >= Version("5.0") + + def _load_params_and_buffers_into_zero3_model(model_to_load, state_dict, load_config=None): buffer_names = [name for name, _ in model_to_load.named_buffers()] buffer_state_dict = {k: v for k, v in state_dict.items() if k in buffer_names} model_to_load.load_state_dict(buffer_state_dict, strict=False) - return tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"]( - model_to_load, state_dict, load_config - ) + cached_fn = tf_modeling_utils._modelopt_cache["_load_state_dict_into_zero3_model"] + if _TRANSFORMERS_GE_5_0: + return cached_fn(model_to_load, state_dict, load_config) + return cached_fn(model_to_load, state_dict) pretrained_model_patch_methods = [ diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py index 0aec4411e0e..c7c666f8ad9 100644 --- a/modelopt/torch/quantization/model_calib.py +++ b/modelopt/torch/quantization/model_calib.py @@ -1307,6 +1307,21 @@ def postprocess(module, name): dtype=w_dtype, device=w_device, ) + # Mirror the calibrated postprocess path, gated on + # is_input_quantized so weight-only AWQ configs (where + # setup() never disabled input_quantizer) stay untouched. + # Collapse any per-channel _amax left over from cache_mode + # max_calibrate into a per-tensor scalar so + # preprocess_linear_fusion's numel==1 assertion passes, and + # re-enable the quantizer (awq_lite.setup disabled it). + if module.awq_lite.is_input_quantized: + if module.input_quantizer.amax is not None: + act_amax = module.input_quantizer.amax + module.input_quantizer._amax_for_smoothing = act_amax.cpu() + module.input_quantizer.reset_amax() + module.input_quantizer.axis = None + module.input_quantizer.amax = act_amax.amax() + module.input_quantizer.enable() else: with enable_weight_access_and_writeback(module, model, name_to_module): postprocess(module, name) diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py index 6ff31424c77..c3c3f164458 100644 --- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py +++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py @@ -122,10 +122,16 @@ def get_weights_scaling_factor_from_quantizer( expected_shape = (*weight.shape[:-1], num_blocks_per_row) per_block_scale = per_block_scale.view(expected_shape) - # Quantize scales to FP8 + # Quantize scales to FP8. Saturate to the fp8_e4m3fn max (448) before the + # cast: when the [==0]=1.0 safety net above fires (per_block_amax was zero + # for an all-zero weight block) and global_amax is small, the pre-cast value + # explodes to ``1.0 * 448 / (global_amax/6)``. fp8_e4m3fn has no Inf, so any + # value >= 480 casts to NaN — clamp first to keep the stored byte finite. 
if not keep_high_precision: - per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to( - torch.float8_e4m3fn + per_block_scale = ( + (per_block_scale * 448.0 / per_block_scale_max) + .clamp_(max=448.0) + .to(torch.float8_e4m3fn) ) return per_block_scale, weights_scaling_factor_2 else: diff --git a/pyproject.toml b/pyproject.toml index a174c6218d8..0dc27f85086 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,14 +75,14 @@ onnx = [ hf = [ "accelerate>=1.0.0", "datasets>=3.0.0", - "deepspeed>=0.9.6; platform_system != 'Darwin' and platform_system != 'Windows'", + "deepspeed>=0.9.6,<0.19; platform_system != 'Darwin' and platform_system != 'Windows'", "diffusers>=0.32.2", "huggingface_hub>=0.24.0", "nltk", "peft>=0.17.0", - "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export + "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "tiktoken", - "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py + "transformers>=4.56,<5.6", # Should match modelopt/torch/__init__.py and noxfile.py "wonderwords", ] diff --git a/tests/examples/llm_eval/test_llm_eval.py b/tests/examples/llm_eval/test_llm_eval.py index 0abf78b53e9..356430ea6f6 100644 --- a/tests/examples/llm_eval/test_llm_eval.py +++ b/tests/examples/llm_eval/test_llm_eval.py @@ -15,16 +15,38 @@ import subprocess -from _test_utils.examples.models import TINY_LLAMA_PATH -from _test_utils.examples.run_command import run_llm_ptq_command +from _test_utils.examples.run_command import ( + extend_cmd_parts, + run_example_command, + run_llm_ptq_command, +) from _test_utils.torch.misc import minimum_sm +from _test_utils.torch.transformers_models import create_tiny_qwen3_dir + + +def test_lm_eval_hf(tmp_path): + model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True) + + cmd_parts = extend_cmd_parts( + ["python", "lm_eval_hf.py"], + model="hf", + model_args=f"pretrained={model_dir}", + tasks="mmlu", + num_fewshot=5, + limit=0.1, + batch_size=8, + ) + run_example_command(cmd_parts, "llm_eval") @minimum_sm(89) -def test_llama_eval_fp8(): +def test_qwen3_eval_fp8(tmp_path): + # Bump max_position_embeddings: TRT-LLM serve rejects prompts longer than + # max_seq_len, and the default (32) is shorter than even simple MMLU prompts. 
+ model_dir = create_tiny_qwen3_dir(tmp_path, with_tokenizer=True, max_position_embeddings=2048) try: run_llm_ptq_command( - model=TINY_LLAMA_PATH, + model=str(model_dir), quant="fp8", tasks="mmlu,lm_eval,simple_eval", calib=64, diff --git a/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py b/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py index b1b3691a797..430b7ee4113 100644 --- a/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py +++ b/tests/gpu/torch/quantization/test_nvfp4_static_quantizer_cuda.py @@ -21,6 +21,7 @@ from modelopt.torch.quantization.calib import NVFP4MSECalibrator from modelopt.torch.quantization.config import QuantizerAttributeConfig from modelopt.torch.quantization.nn import NVFP4StaticQuantizer, TensorQuantizer +from modelopt.torch.quantization.qtensor import NVFP4QTensor from modelopt.torch.quantization.tensor_quant import ( scaled_e4m3_impl, static_blockwise_fp4_fake_quant, @@ -64,6 +65,51 @@ def test_global_amax_property(self, device): quantizer.global_amax = None assert quantizer.global_amax is None + def test_export_fp8_scale_no_nan_for_zero_amax_block(self, device): + """Regression: export must not emit fp8 NaN bytes for an all-zero block. + + When max-only calibration leaves ``_amax = 0`` for a fully-zero weight block, + the export's ``[per_block_scale == 0] = 1.0`` safety net drives the pre-cast + value to ``1.0 * 448 / (global_amax / 6)``. fp8_e4m3fn has no Inf, so any + pre-cast value >= 480 rounds to NaN — without a saturation clamp this writes + a 0x7F byte into ``weight_scale``. Reproduces the NaN seen in the saved + Kimi-K2.6-NVFP4-MSE checkpoint at expert 21 down_proj. + """ + block_size = 16 + cfg = QuantizerAttributeConfig( + num_bits=(2, 1), + block_sizes={-1: block_size, "type": "static", "scale_bits": (4, 3)}, + ) + quantizer = NVFP4StaticQuantizer(quant_attribute_cfg=cfg).to(device) + + # Two-block weight: block 0 is non-trivial; block 1 is all zeros so its + # per-block amax is exactly 0. + weight = torch.zeros(1, 2 * block_size, device=device, dtype=torch.bfloat16) + weight[0, :block_size] = 0.1 + + per_block_amax = weight.abs().reshape(1, 2, block_size).amax(dim=-1).flatten() + quantizer.amax = per_block_amax + quantizer.global_amax = per_block_amax.max() + + # Sanity: the bug only fires when the would-be cast value exceeds 480. + # With global_amax = 0.1, scale_in_fp8 for a zero block is + # 1.0 * 448 / (0.1 / 6) ≈ 26880 — well past the 480 NaN threshold. + assert (per_block_amax == 0).any() + assert quantizer.global_amax.float().item() < 1.0 + + weight_scale, _ = NVFP4QTensor.get_weights_scaling_factor_from_quantizer( + quantizer, weight, weights_scaling_factor_2=None + ) + assert weight_scale.dtype == torch.float8_e4m3fn + + # No fp8_e4m3fn NaN bytes (NaN encoding is (b & 0x7F) == 0x7F). + raw = weight_scale.view(torch.uint8) + n_nan = ((raw & 0x7F) == 0x7F).sum().item() + assert n_nan == 0, f"fp8 weight_scale contains {n_nan} NaN byte(s)" + + # The all-zero block's stored fp8 scale should saturate to 448 (max finite). 
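+        # (0x7E decodes as 2**(15 - 7) * 1.75 = 448, the largest finite e4m3fn
+        # value; the next pattern up, 0x7F, is the NaN encoding.)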
+ assert raw.flatten()[1].item() == 0x7E + def test_fake_quantize_with_both_amaxs(self, device): """Test _fake_quantize uses both _amax and _global_amax.""" num_blocks = 4 diff --git a/tests/unit/onnx/quantization/test_qdq_utils.py b/tests/unit/onnx/quantization/test_qdq_utils.py index 8af5f560dd0..0ff3686a610 100644 --- a/tests/unit/onnx/quantization/test_qdq_utils.py +++ b/tests/unit/onnx/quantization/test_qdq_utils.py @@ -1108,3 +1108,96 @@ def test_constant_node_scale_path_still_patched(self): scale_arr = numpy_helper.to_array(value_attr.t) assert not (scale_arr == 0).any() assert (scale_arr > 0).all() + + +class TestLegacyEdgeLLMShims: + """Smoke tests for the deprecated top-level shims kept for TensorRT-Edge-LLM 0.6.1. + + These are the functions edgellm 0.6.1 imports from + ``modelopt.onnx.quantization.qdq_utils`` directly (not via the staged exporters). + Tests verify each shim runs end-to-end on the same fixtures used for the staged + exporters and emits a ``DeprecationWarning``. + """ + + def test_quantize_weights_to_int4_shim(self): + import warnings + + from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_int4 + + model = create_test_model_with_int4_dq_reshape_transpose_matmul() + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + quantized_model = quantize_weights_to_int4(model) + + assert any( + issubclass(w.category, DeprecationWarning) + and "quantize_weights_to_int4" in str(w.message) + for w in caught + ) + + weight_tensor = next( + init for init in quantized_model.graph.initializer if init.name == "weight" + ) + assert weight_tensor.data_type == TensorProto.INT4 + + node_types = [node.op_type for node in quantized_model.graph.node] + assert "Reshape" not in node_types + assert "Transpose" not in node_types + + def test_quantize_weights_to_mxfp8_shim(self): + import warnings + + from modelopt.onnx.quantization.qdq_utils import quantize_weights_to_mxfp8 + + model = create_test_model_with_mxfp8_dq() + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + quantized_model = quantize_weights_to_mxfp8(model) + + assert any( + issubclass(w.category, DeprecationWarning) + and "quantize_weights_to_mxfp8" in str(w.message) + for w in caught + ) + + weight_tensor = next( + init for init in quantized_model.graph.initializer if init.name == "linear.weight" + ) + assert weight_tensor.data_type == TensorProto.FLOAT8E4M3FN + + gelu_node = next(node for node in quantized_model.graph.node if node.op_type == "Gelu") + approximate_attr = next(attr for attr in gelu_node.attribute if attr.name == "approximate") + assert approximate_attr.s == b"tanh" + + @pytest.mark.parametrize("with_transpose", [False, True]) + def test_fp4qdq_to_2dq_shim(self, with_transpose): + import warnings + + from modelopt.onnx.quantization.qdq_utils import fp4qdq_to_2dq + + model = create_test_model_with_nvfp4_qdq(with_transpose=with_transpose) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + converted_model = fp4qdq_to_2dq(model) + + assert any( + issubclass(w.category, DeprecationWarning) and "fp4qdq_to_2dq" in str(w.message) + for w in caught + ) + + fp4qdq_nodes = [node for node in converted_model.graph.node if node.op_type == "TRT_FP4QDQ"] + assert len(fp4qdq_nodes) == 0 + + dq_nodes = [ + node for node in converted_model.graph.node if node.op_type == "DequantizeLinear" + ] + assert len(dq_nodes) == 2 + + initializer_names = {init.name for init in 
converted_model.graph.initializer} + assert "linear.weight_f4" in initializer_names + assert "linear.weight_f8_scale" in initializer_names + assert "linear.weight_f8_scale_f32_scale" in initializer_names + assert "linear.weight" not in initializer_names diff --git a/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py b/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py index 38fce51f4aa..ff7f77cf617 100755 --- a/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py +++ b/tests/unit/torch/deploy/_runtime/tensorrt/test_engine_builder.py @@ -55,7 +55,7 @@ def setup_mocks(): with ( mock.patch( - "modelopt.torch._deploy._runtime.tensorrt.engine_builder._run_command" + "modelopt.torch._deploy._runtime.tensorrt.engine_builder._run_trtexec_with_logging" ) as mock_run, mock.patch( "modelopt.torch._deploy._runtime.tensorrt.engine_builder.TemporaryDirectory" diff --git a/tests/unit/torch/quantization/plugins/test_fused_experts.py b/tests/unit/torch/quantization/plugins/test_fused_experts.py index 29435827748..e0ce2f0c66e 100644 --- a/tests/unit/torch/quantization/plugins/test_fused_experts.py +++ b/tests/unit/torch/quantization/plugins/test_fused_experts.py @@ -300,6 +300,94 @@ def test_export_creates_per_expert_submodules(self): if QuantModuleRegistry.get(expert_type) is not None: QuantModuleRegistry.unregister(expert_type) + def test_uncalibrated_expert_gate_up_share_amax(self, monkeypatch): + """gate_proj and up_proj must share weight_scale_2 even when an expert + was never routed during calibration. + + Regression for the bug where ``_export_fused_experts``'s per-projection + fallback computed amax independently from the gate and up halves of the + fused tensor — producing mismatched ``weight_scale_2`` values for any + uncalibrated expert. vLLM fuses W1 (gate) and W3 (up) at load time and + asserts a single shared scale; mismatched scales corrupted MoE output. + The fix derives the fallback amax once from the fused ``gate_up[idx]`` + tensor before the deepcopies, so gate's clone and up's clone start with + the same amax. + """ + from modelopt.torch.export.moe_utils import _export_fused_experts + + # Build experts where gate and up have very different magnitudes — + # any per-half fallback would clearly produce different amaxes. + experts = _SyntheticFusedExperts() + gate = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.02 + up = torch.randn(NUM_EXPERTS, INTERMEDIATE_DIM, HIDDEN_DIM) * 0.20 + with torch.no_grad(): + experts.gate_up_proj.copy_(torch.cat([gate, up], dim=1)) + + expert_type = type(experts) + if QuantModuleRegistry.get(expert_type) is None: + QuantModuleRegistry.register({expert_type: "test.SyntheticFusedExperts"})( + _QuantFusedExperts + ) + try: + converted = QuantModuleRegistry.convert(experts) + + # Leave every expert weight quantizer uncalibrated (no _amax). + # Mark them enabled to exercise the export-time fallback path. + for q in converted.gate_up_proj_weight_quantizers: + q._disabled = False + for q in converted.down_proj_weight_quantizers: + q._disabled = False + + # Capture the amax each per-projection wrapper carries into the + # FP4 quantization step. Patching here avoids needing CUDA / FP4. + seen = {} # (expert_idx, proj_name) -> amax tensor + + def _spy_export(wrapper, dtype): + # Identify which expert/projection this wrapper belongs to by + # matching the weight tensor against the fused parameters. 
+ w = wrapper.weight.data + # gate_up_proj is (N, 2*INTER, HIDDEN); split halves are + # contiguous .data views or .contiguous() copies — we can match + # by shape and value identity for this synthetic case. + amax = wrapper.weight_quantizer._amax.detach().clone() + # Identify by matching against gate vs. up slices of each expert. + for idx in range(NUM_EXPERTS): + g_slice = converted.gate_up_proj.data[idx, :INTERMEDIATE_DIM, :] + u_slice = converted.gate_up_proj.data[idx, INTERMEDIATE_DIM:, :] + d_slice = converted.down_proj.data[idx] + if w.shape == g_slice.shape and torch.equal(w, g_slice): + seen[(idx, "gate_proj")] = amax + return + if w.shape == u_slice.shape and torch.equal(w, u_slice): + seen[(idx, "up_proj")] = amax + return + if w.shape == d_slice.shape and torch.equal(w, d_slice): + seen[(idx, "down_proj")] = amax + return + + monkeypatch.setattr( + "modelopt.torch.export.unified_export_hf._export_quantized_weight", + _spy_export, + ) + + _export_fused_experts(converted, torch.float16) + + # Assert: for every expert, gate's amax matches up's amax. + for idx in range(NUM_EXPERTS): + g_amax = seen.get((idx, "gate_proj")) + u_amax = seen.get((idx, "up_proj")) + assert g_amax is not None and u_amax is not None, ( + f"Expert {idx}: missing recorded amax (gate={g_amax}, up={u_amax})" + ) + assert torch.allclose(g_amax, u_amax), ( + f"Expert {idx}: gate amax {g_amax.item()} != up amax {u_amax.item()}. " + f"Uncalibrated fused experts must share gate/up amax so that " + f"weight_scale_2 stays consistent across the fusion." + ) + finally: + if QuantModuleRegistry.get(expert_type) is not None: + QuantModuleRegistry.unregister(expert_type) + # --------------------------------------------------------------------------- # Tests for force_eager_experts_impl_on_the_fly diff --git a/tests/unit/torch/quantization/test_calib.py b/tests/unit/torch/quantization/test_calib.py index d2e6fdd03e8..a39ee55d9d2 100644 --- a/tests/unit/torch/quantization/test_calib.py +++ b/tests/unit/torch/quantization/test_calib.py @@ -312,6 +312,84 @@ def test_padded_awq(): model(torch.randn(2, 16, 16)) +class _TwoBranchModel(nn.Module): + """Two parallel linears; only the first is exercised by forward_loop.""" + + def __init__(self): + super().__init__() + self.calibrated = nn.Linear(16, 16, bias=False) + self.uncalibrated = nn.Linear(16, 16, bias=False) + + def forward(self, x, branch="calibrated"): + if branch == "calibrated": + return self.calibrated(x) + return self.uncalibrated(x) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="NVFP4 dynamic block quant is CUDA-only") +def test_awq_lite_uncalibrated_linear_keeps_input_quantizer_enabled(): + """Regression test for NVBug 6143871. + + awq_lite.setup() disables the input_quantizer at the start of search. The + calibrated branch re-enables it inside postprocess(); the uncalibrated + branch (no cache-pass tokens, e.g. an MoE expert that never gets routed) + must do the same — otherwise downstream export (set_expert_quantizer_amax + + _export_quantized_weight) drops the input_scale buffer and inference + runtimes that read per-expert input_scale (e.g. TRT-LLM CutlassFusedMoE) + crash with KeyError on '.w1.input_scale'. + + Also asserts the export-critical scalar amax invariant (axis=None, + numel==1) — preprocess_linear_fusion enforces it for fused-expert groups. 
+ """ + torch.manual_seed(0) + model = _TwoBranchModel().cuda() + + def _forward_loop(m): + for _ in range(2): + m(torch.randn(2, 16, 16, device="cuda"), branch="calibrated") + + mtq.quantize(model, mtq.NVFP4_AWQ_LITE_CFG, _forward_loop) + + assert model.calibrated.input_quantizer.is_enabled + assert model.uncalibrated.input_quantizer.is_enabled, ( + "Uncalibrated linear's input_quantizer must remain enabled after " + "awq_lite postprocess so export emits input_scale (NVBug 6143871)." + ) + uncal_q = model.uncalibrated.input_quantizer + # When amax exists (cache-hit but search-miss path), it must be the + # scalar form export expects — preprocess_linear_fusion asserts numel==1. + # When it's None (truly never routed), set_expert_quantizer_amax will + # populate it during export. + if uncal_q.amax is not None: + assert uncal_q.axis is None + assert uncal_q.amax.numel() == 1 + + +def test_awq_lite_uncalibrated_weight_only_keeps_input_quantizer_disabled(): + """Weight-only AWQ companion to NVBug 6143871. + + For weight-only AWQ configs (input_quantizer disabled), awq_lite.setup() + never touches the input_quantizer, so the postprocess uncalibrated branch + must NOT enable it — doing so turns on quantization the user's config had + explicitly opted out of. + """ + torch.manual_seed(0) + model = _TwoBranchModel() + + def _forward_loop(m): + for _ in range(2): + m(torch.randn(2, 16, 16), branch="calibrated") + + mtq.quantize(model, mtq.INT4_AWQ_CFG, _forward_loop) + + assert not model.calibrated.input_quantizer.is_enabled + assert not model.uncalibrated.input_quantizer.is_enabled, ( + "Weight-only AWQ must not flip on the input_quantizer for " + "uncalibrated layers — that would silently quantize activations " + "the user's config left in full precision." + ) + + def test_smoothquant_enable_disable(): torch.manual_seed(1234) model = _SimpleMLP()