From a9cf8ebc9009c1f700af44b4204d74fbb2b45a24 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 5 Feb 2026 00:58:19 +0000
Subject: [PATCH 01/12] Add support for Qwen3Omni30B thinking model

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py             |  26 ++-
 examples/llm_ptq/hf_ptq.py                    | 155 ++++++++++++++--
 modelopt/torch/export/model_utils.py          |  58 +++---
 modelopt/torch/export/unified_export_hf.py    |  10 +
 .../torch/quantization/plugins/huggingface.py |  18 ++
 modelopt/torch/utils/__init__.py              |   1 +
 modelopt/torch/utils/dataset_utils.py         | 127 +++++++++++--
 modelopt/torch/utils/image_processor.py       | 171 ++++++++++++++++++
 8 files changed, 496 insertions(+), 70 deletions(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 58eb676111..5fe9ab5ad6 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -45,7 +45,12 @@
 except ImportError:
     snapshot_download = None
 
-from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
+import modelopt.torch.quantization as mtq
+from modelopt.torch.utils.image_processor import (
+    BaseImageProcessor,
+    MllamaImageProcessor,
+    Qwen3OmniImageProcessor,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -284,7 +289,7 @@ def get_processor(
     if attn_implementation is not None:
         model_kwargs["attn_implementation"] = attn_implementation
 
-    if model_type == "whisper":
+    if model_type in ("whisper", "mllama", "qwen3omni"):
         processor = AutoProcessor.from_pretrained(
             ckpt_path,
             padding_side="left",
@@ -296,20 +301,11 @@ def get_processor(
             f"Pad token for {ckpt_path} cannot be set!"
         )
 
+        if model_type == "mllama":
+            return MllamaImageProcessor(processor, device)
+        elif model_type == "qwen3omni":
+            return Qwen3OmniImageProcessor(processor, device)
         return processor
-    elif model_type == "mllama":
-        processor = AutoProcessor.from_pretrained(
-            ckpt_path,
-            padding_side="left",
-            **model_kwargs,
-        )
-        if processor.tokenizer.pad_token is None:
-            processor.tokenizer.pad_token = processor.tokenizer.eos_token
-        assert processor.tokenizer.pad_token is not None, (
-            f"Pad token for {ckpt_path} cannot be set!"
-        )
-
-        return MllamaImageProcessor(processor, device)
     else:
         # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse)
         try:
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index b81dc60c01..4f7d3430f8 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -15,7 +15,9 @@
 
 import argparse
 import copy
+import io
 import random
+import sys
 import time
 import warnings
 from typing import Any
@@ -68,12 +70,26 @@
     create_forward_loop,
     get_dataset_dataloader,
     get_max_batch_size,
+    get_qwen3omni_text_dataloader,
     get_supported_datasets,
 )
-from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor
+from modelopt.torch.utils.image_processor import (
+    BaseImageProcessor,
+    MllamaImageProcessor,
+    Qwen3OmniImageProcessor,
+    Qwen3OmniTextProcessor,
+)
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
 from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
-from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
+from modelopt.torch.utils.video_dataset_utils import (
+    Qwen3OmniVideoProcessor,
+    get_supported_video_datasets,
+    get_video_dataset_dataloader,
+)
+from modelopt.torch.utils.vlm_dataset_utils import (
+    get_supported_vlm_datasets,
+    get_vlm_dataset_dataloader,
+)
 
 RAND_SEED = 1234
 
@@ -208,6 +224,51 @@ def make_calib_dataloader(
             batch_size=args.batch_size,
             num_samples=args.calib_size[0],
         )
+    elif model_type == "qwen3omni":
+        assert processor is not None, "The processor must be set for qwen3omni model."
+        dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail"
+        # Check if using video dataset (e.g., finevideo)
+        if dataset_name in get_supported_video_datasets():
+            video_processor = Qwen3OmniVideoProcessor(
+                processor.tokenizer if hasattr(processor, "tokenizer") else processor,
+                device=device,
+                dtype=language_model.dtype,
+                use_audio_in_video=True,
+            )
+            calib_dataloader = get_video_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=video_processor,
+                batch_size=args.batch_size,
+                num_samples=args.calib_size[0],
+            )
+        elif dataset_name in get_supported_vlm_datasets():
+            assert isinstance(processor, Qwen3OmniImageProcessor), (
+                "The Qwen3OmniImageProcessor must be set."
+            )
+            # Set the dtype for proper tensor conversion in collate_function
+            processor.dtype = language_model.dtype
+            calib_dataloader = get_vlm_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=processor,
+                batch_size=args.batch_size,
+                num_samples=args.calib_size[0],
+            )
+        else:
+            # Text-only datasets (e.g., cnn_dailymail)
+            # Use Qwen3OmniTextProcessor to apply proper conversation template
+            # See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
+            text_processor = Qwen3OmniTextProcessor(
+                processor=processor.tokenizer,  # Pass the underlying HF processor
+                device=device,
+                dtype=language_model.dtype,
+            )
+            calib_dataloader = get_qwen3omni_text_dataloader(
+                dataset_name=dataset_name,
+                processor=text_processor,
+                batch_size=args.batch_size,
+                num_samples=args.calib_size[0],
+            )
+        print(f"Selected dataset for calibration: {dataset_name}")
     elif model_type == "whisper":
         assert processor is not None and isinstance(processor, WhisperProcessor), (
             "The AutoProcessor must be set."
@@ -391,6 +452,9 @@ def load_model(args: argparse.Namespace):
         calibration_only = True
 
     model_type = get_model_type(full_model)
+    if model_type == "qwen3omni":
+        print("Disabling talker for Qwen3Omni model")
+        full_model.disable_talker()
 
     device = full_model.device
     if hasattr(full_model, "model"):
@@ -408,7 +472,7 @@ def load_model(args: argparse.Namespace):
         print("Nemotron VL model detected. Enabling image-text calibration by default.")
         args.calib_with_images = True
 
-    if model_type == "mllama":
+    if model_type in ["mllama", "qwen3omni"]:
         processor = get_processor(
             args.pyt_ckpt_path,
             model_type,
@@ -555,6 +619,15 @@ def mono_quantize(
         quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False}  # Nemotron-Parse specific
         print("Quantization will only be applied to the decoder (text generation) component")
 
+    # For Qwen3Omni models, disable quantization of conv layers
+    if model_type == "qwen3omni":
+        print(
+            "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
+        )
+        quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+
     if not model_is_already_quantized or calibration_only:
         # quantize the model
 
@@ -735,9 +808,10 @@ def pre_quantize(
 
     """
     # Only run single sample for preview
-    preview_input_ids = next(iter(calib_dataloader))[
-        "input_features" if model_type == "whisper" else "input_ids"
-    ][0:1]
+    calib_batch = next(iter(calib_dataloader))
+    preview_input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][
+        0:1
+    ]
 
     # Generate preview before quantization
     if args.skip_generate:
@@ -759,10 +833,21 @@ def pre_quantize(
             "before quantization",
             allow_fallback=False,
         )
+    elif model_type == "qwen3omni":
+        # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
+        # Pass full batch with all multimodal inputs
+        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        if isinstance(result, tuple):
+            text_ids, _ = result
+            generated_ids_before_ptq = (
+                text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
+            )
+        else:
+            generated_ids_before_ptq = result
     else:
         generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
 
-    return preview_input_ids, generated_ids_before_ptq
+    return preview_input_ids, generated_ids_before_ptq, calib_batch
 
 
 def post_quantize(
@@ -775,6 +860,7 @@ def post_quantize(
     generated_ids_before_ptq,
     is_nemotron_vl_model,
     first_text_speech_dataset,
+    calib_batch: dict | None = None,
 ):
     """
     Processing after the quantization.
@@ -785,18 +871,38 @@ def post_quantize(
     """
 
     if args.verbose:
-        try:
+        if args.quant_summary_path:
+            # Capture the summary output to a file
+            old_stdout = sys.stdout
+            sys.stdout = buffer = io.StringIO()
+            try:
+                mtq.print_quant_summary(full_model, args.export_path)
+            finally:
+                sys.stdout = old_stdout
+            summary = buffer.getvalue()
+            with open(args.quant_summary_path, "w") as f:
+                f.write(summary)
+            print(f"Quantization summary saved to {args.quant_summary_path}")
+        else:
             mtq.print_quant_summary(full_model, args.export_path)
-            save_expert_token_count_table(full_model, args.export_path)
-        except Exception as e:
-            print(f"Error saving quant summary: {e}")
-            print("Continuing with generation...")
+        save_expert_token_count_table(full_model, args.export_path)
 
     # Run some samples
     torch.cuda.empty_cache()
     generated_ids_after_ptq = None
     if generated_ids_before_ptq is None:
         pass
+    elif model_type == "qwen3omni":
+        # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
+        # Pass full batch with all multimodal inputs
+        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        if isinstance(result, tuple):
+            text_ids, _ = result
+            generated_ids_after_ptq = (
+                text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
+            )
+        else:
+            generated_ids_after_ptq = result
     elif model_type != "llama4" and not is_nemotron_vl_model:
         # Our fake quantizer may not be fully compatible with torch.compile.
         generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
@@ -815,12 +921,13 @@ def post_quantize(
         )
 
     def input_decode(input_ids):
-        if processor is not None and isinstance(processor, MllamaImageProcessor):
-            return processor.tokenizer.batch_decode(input_ids)
+        # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor
+        if processor is not None and isinstance(processor, BaseImageProcessor):
+            return processor.tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         elif processor is not None and isinstance(processor, WhisperProcessor):
             return first_text_speech_dataset
         elif tokenizer is not None:
-            return tokenizer.batch_decode(input_ids)
+            return tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         else:
             raise ValueError("The processor or tokenizer must be set")
 
@@ -832,6 +939,12 @@ def output_decode(generated_ids, input_shape):
                 return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
         elif processor is not None and isinstance(processor, MllamaImageProcessor):
             return processor.tokenizer.batch_decode(generated_ids[:, input_shape:])
+        elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor):
+            return processor.tokenizer.batch_decode(
+                generated_ids[:, input_shape:],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=False,
+            )
         elif tokenizer is not None:
             return tokenizer.batch_decode(generated_ids[:, input_shape:])
         else:
@@ -919,7 +1032,7 @@ def quantize_main(
     # Detect if this is a Nemotron VL model using architecture-based detection
     is_nemotron_vl_model = is_nemotron_vl(full_model)
 
-    preview_input_ids, generated_ids_before_ptq = pre_quantize(
+    preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize(
         args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model
     )
 
@@ -1014,6 +1127,7 @@ def quantize_main(
         generated_ids_before_ptq,
         is_nemotron_vl_model,
         first_text_speech_dataset,
+        calib_batch,
     )
     export_quantized(
         args,
@@ -1238,6 +1352,15 @@ def parse_args() -> argparse.Namespace:
         help="Export as vLLM fake-quant checkpoint (produces vllm_fq_modelopt_state.pth "
         "for use with vllm_serve_fakequant.py).",
     )
+    parser.add_argument(
+        "--quant_summary_path",
+        type=str,
+        default=None,
+        help=(
+            "Path to save the quantization summary. If not specified, summary is printed to stdout. "
+            "Requires --verbose to be enabled (default: True)."
+        ),
+    )
 
     args = parser.parse_args()
     if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0):
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 3bd72d9de9..b71e53bacf 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -17,45 +17,46 @@
 import torch.nn as nn
 
 MODEL_NAME_TO_TYPE = {
+    "ArcticForCausalLM": "llama",
+    "baichuan": "baichuan",
+    "Bart": "bart",
+    "Bloom": "bloom",
+    "ChatGLM": "chatglm",
+    "Dbrx": "dbrx",
+    "Deepseek": "deepseek",
+    "ExaoneForCausalLM": "exaone",
+    "FalconForCausalLM": "falcon",
+    "Gemma": "gemma",
+    "Gemma2": "gemma2",
+    "Gemma3": "gemma3",
+    "GLM": "glm",
     "GPT2": "gpt",
-    "Mllama": "mllama",
-    "Llama4": "llama4",
+    "GPTJ": "gptj",
+    "gptoss": "gptoss",
+    "InternLM2ForCausalLM": "internlm",
     "Llama": "llama",
+    "Llama4": "llama4",
     "Mistral": "llama",
-    "GPTJ": "gptj",
-    "FalconForCausalLM": "falcon",
-    "RWForCausalLM": "falcon",
-    "baichuan": "baichuan",
+    "MixtralForCausalLM": "llama",
+    "Mllama": "mllama",
     "MPT": "mpt",
-    "Bloom": "bloom",
-    "ChatGLM": "chatglm",
+    "Nemotron": "gpt",
+    "phi": "phi",
+    "phi3": "phi3",
+    "phi3small": "phi3small",
+    "Phi4MMForCausalLM": "phi4mm",
+    "PhiMoEForCausalLM": "phi3",
     "Qwen3Moe": "qwen3moe",
     "Qwen3Next": "qwen3next",
+    "Qwen3OmniMoeForConditionalGeneration": "qwen3omni",
     "QWen": "qwen",
     "RecurrentGemma": "recurrentgemma",
-    "Gemma3": "gemma3",
-    "Gemma2": "gemma2",
-    "Gemma": "gemma",
-    "phi3small": "phi3small",
-    "phi3": "phi3",
-    "PhiMoEForCausalLM": "phi3",
-    "Phi4MMForCausalLM": "phi4mm",
-    "phi": "phi",
-    "TLGv4ForCausalLM": "phi",
-    "MixtralForCausalLM": "llama",
-    "ArcticForCausalLM": "llama",
+    "RWForCausalLM": "falcon",
     "StarCoder": "gpt",
-    "Dbrx": "dbrx",
     "T5": "t5",
-    "Bart": "bart",
-    "GLM": "glm",
-    "InternLM2ForCausalLM": "internlm",
-    "ExaoneForCausalLM": "exaone",
+    "TLGv4ForCausalLM": "phi",
     "NemotronH": "nemotron_h",
-    "Nemotron": "gpt",
-    "Deepseek": "deepseek",
     "Whisper": "whisper",
-    "gptoss": "gptoss",
     "MiniMax": "minimax",
 }
 
@@ -149,6 +150,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None:
     if hasattr(model, "language_model"):
         return [model, model.language_model]
 
+    if hasattr(model, "thinker"):
+        return [model, model.thinker]
+
     # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model.
     # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder
     # models like T5, Bart, Whisper which also have .decoder.
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 14a12bcdf3..e105c46aa4 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -1181,6 +1181,16 @@ def export_hf_checkpoint(
         if getattr(model, "hf_quantizer", None) is not None:
             model.hf_quantizer = None
 
+        # Fix generation_config conflicts before saving
+        # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors
+        if hasattr(model, "generation_config") and model.generation_config is not None:
+            gen_config = model.generation_config
+            if not getattr(gen_config, "do_sample", True):
+                # Remove sampling-related params when do_sample is False
+                for attr in ["temperature", "top_p", "top_k"]:
+                    if hasattr(gen_config, attr):
+                        setattr(gen_config, attr, None)
+
         # Save model
         # Temporarily disable revert_weight_conversion if available — it doesn't handle
         # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 0d02716a6e..b03c46fd03 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -1180,6 +1180,24 @@ def unpack_weight(self):
     pass
 
 
+try:
+    from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
+        Qwen3OmniMoeTalkerTextSparseMoeBlock,
+        Qwen3OmniMoeThinkerTextSparseMoeBlock,
+    )
+
+    if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry:
+        QuantModuleRegistry.register(
+            {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"}
+        )(_QuantSparseMoe)
+    if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry:
+        QuantModuleRegistry.register(
+            {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"}
+        )(_QuantSparseMoe)
+except ImportError:
+    pass
+
+
 class _QuantGptOssExperts(_QuantFunctionalMixin):
     """Quantized wrapper for `transformers.GptOssExperts`.
 
diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py
index f026e747a8..354212d56e 100644
--- a/modelopt/torch/utils/__init__.py
+++ b/modelopt/torch/utils/__init__.py
@@ -27,4 +27,5 @@
 from .regex import *
 from .robust_json import *
 from .tensor import *
+from .video_dataset_utils import *
 from .vlm_dataset_utils import *
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 00cdff8877..9a5b7ccf98 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -112,6 +112,7 @@
     "get_dataset_samples",
     "get_jsonl_text_samples",
     "get_max_batch_size",
+    "get_qwen3omni_text_dataloader",
     "get_supported_datasets",
 ]
 
@@ -211,6 +212,84 @@ def _auto_preprocess_sample(
     )
 
 
+def get_qwen3omni_text_dataloader(
+    dataset_name: str | list[str] = "cnn_dailymail",
+    processor=None,
+    batch_size: int = 1,
+    num_samples: int | list[int] = 512,
+) -> DataLoader:
+    """Get a text-only dataloader for Qwen3-Omni with proper conversation template applied.
+
+    This function applies the Qwen3-Omni chat template to text samples before tokenization,
+    which is required for proper calibration of Qwen3-Omni models with text-only datasets.
+
+    See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
+
+    Args:
+        dataset_name: Name of the dataset(s) to load.
+        processor: Qwen3OmniTextProcessor instance wrapping the Qwen3OmniMoeProcessor.
+        batch_size: Batch size of the returned dataloader.
+        num_samples: Number of samples from the dataset.
+
+    Returns:
+        A DataLoader with properly formatted inputs for Qwen3-Omni.
+    """
+    assert processor is not None, "Please provide a Qwen3OmniTextProcessor."
+
+    if isinstance(num_samples, int):
+        num_samples = [num_samples]
+
+    if isinstance(dataset_name, str):
+        dataset_name = [dataset_name]
+
+    assert len(dataset_name) == len(num_samples), (
+        "dataset_name and num_samples must be the same length"
+    )
+
+    # Get raw text samples
+    all_samples = []
+    for ds_name, num_sample in zip(dataset_name, num_samples):
+        samples = get_dataset_samples(ds_name, num_sample)
+        all_samples.extend(samples)
+
+    # Preprocess each sample with the conversation template
+    processed_samples = []
+    for text in all_samples:
+        # Apply conversation template and tokenize
+        values = processor.preprocess_function(text)
+
+        # Convert to lists for dataset compatibility
+        sample_dict = {}
+        for key, val in values.items():
+            if val is not None and hasattr(val, "tolist"):
+                sample_dict[key] = val.tolist()
+            elif val is not None:
+                sample_dict[key] = val
+        processed_samples.append(sample_dict)
+
+    # Create dataset
+    class _Qwen3OmniTextDataset(torch.utils.data.Dataset):
+        def __init__(self, samples):
+            self.samples = samples
+
+        def __getitem__(self, idx):
+            return self.samples[idx]
+
+        def __len__(self):
+            return len(self.samples)
+
+    dataset = _Qwen3OmniTextDataset(processed_samples)
+
+    calib_dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=processor.collate_function,
+    )
+
+    return calib_dataloader
+
+
 def get_dataset_samples(
     dataset_name: str,
     num_samples: int,
@@ -452,8 +531,8 @@ def _get_free_gpu_mem():
     torch.cuda.empty_cache()
 
     free_mem_before, max_allocated_before = _get_free_gpu_mem()
-    is_enc_dec = model_type_is_enc_dec(model)
-    infer_method = model.generate if is_enc_dec else model.forward
+    use_generate = _should_use_generate(model)
+    infer_method = model.generate if use_generate else model.forward
 
     if sample_input_single_batch is None:
         sample_input_single_batch = (
@@ -519,11 +598,15 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
     Returns:
         The maximum batch size that worked successfully
     """
-    assert all(torch.is_tensor(data) or data is None for data in batch_data.values()), (
-        "batch_data values must be tensors"
+    # Separate tensor values from scalar parameters (like max_new_tokens)
+    tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None}
+    scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None}
+
+    assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), (
+        "tensor_data values must be tensors"
     )
     # Get the batch size of current data
-    batch_size = batch_data[next(iter(batch_data.keys()))].shape[0]
+    batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0]
 
     # If we know a smaller batch size works, preemptively split
     if max_working_batch_size is not None and batch_size > max_working_batch_size:
@@ -531,11 +614,13 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
         for i in range(0, batch_size, max_working_batch_size):
             end_idx = min(i + max_working_batch_size, batch_size)
             split_data = {}
-            for key in batch_data:
-                if batch_data[key] is None:
+            for key in tensor_data:
+                if tensor_data[key] is None:
                     split_data[key] = None
                 else:
-                    split_data[key] = batch_data[key][i:end_idx, ...]
+                    split_data[key] = tensor_data[key][i:end_idx, ...]
+            # Add back scalar data (non-tensor params like max_new_tokens)
+            split_data.update(scalar_data)
 
             max_working_batch_size = _process_batch(
                 split_data, infer_method, max_working_batch_size
@@ -562,8 +647,11 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
     # Split the batch in half
     mid = (batch_size + 1) // 2
     warn(f"CUDA out of memory with batch size {batch_size}, trying with batch size {mid}")
-    split_data_1 = {key: batch_data[key][:mid, ...] for key in batch_data}
-    split_data_2 = {key: batch_data[key][mid:, ...] for key in batch_data}
+    split_data_1 = {key: tensor_data[key][:mid, ...] for key in tensor_data}
+    split_data_2 = {key: tensor_data[key][mid:, ...] for key in tensor_data}
+    # Add back scalar data (non-tensor params like max_new_tokens)
+    split_data_1.update(scalar_data)
+    split_data_2.update(scalar_data)
 
     # Recursively process each half and track max working batch size
     max_working_batch_size = _process_batch(split_data_1, infer_method)
@@ -581,11 +669,14 @@ def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None:
         dataloader: DataLoader containing the batched input data
     """
     with torch.no_grad():
-        is_enc_dec = model_type_is_enc_dec(model)
-        infer_method = model.generate if is_enc_dec else model.forward
+        use_generate = _should_use_generate(model)
+        infer_method = model.generate if use_generate else model.forward
         max_working_batch_size = None  # Initialize max working batch size as None
 
         for _, data in enumerate(tqdm(dataloader)):
+            # For generate(), add max_new_tokens to prevent indefinite generation during calibration
+            if use_generate:
+                data["max_new_tokens"] = 1
             # Process batch and update max working batch size
             max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size)
 
@@ -753,3 +844,15 @@ def download_hf_dataset_as_jsonl(
         jsonl_paths.append(jsonl_file_path)
 
     return jsonl_paths
+
+
+def _should_use_generate(model):
+    """Check if model should use generate() instead of forward() for calibration.
+
+    Returns True for:
+    - Encoder-decoder models (t5, bart, whisper)
+    - Conditional generation models that don't support standard forward() (qwen3omni)
+    """
+    generate_model_list = ["qwen3omni"]
+    model_name = model.__class__.__name__.lower()
+    return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list)
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
index 6374642e3d..07deca7fc4 100644
--- a/modelopt/torch/utils/image_processor.py
+++ b/modelopt/torch/utils/image_processor.py
@@ -110,3 +110,174 @@ def collate_function(self, batch):
             ).to(self.device)
 
         return batch[0]
+
+
+class Qwen3OmniTextProcessor(BaseImageProcessor):
+    """Text-only processor for Qwen3-Omni that applies proper conversation template.
+
+    This processor wraps raw text in the Qwen3-Omni conversation format and applies
+    the chat template before tokenization. Use this for text-only calibration datasets.
+
+    See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
+    """
+
+    def __init__(self, processor, device="auto", dtype=None):
+        """Constructor.
+
+        Args:
+            processor: The Qwen3OmniMoeProcessor (from AutoProcessor.from_pretrained).
+            device: Device to move tensors to.
+            dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default.
+        """
+        super().__init__(processor, device)
+        self.dtype = dtype
+
+    def preprocess_function(self, text: str) -> dict:
+        """Preprocess a single text sample by applying conversation template.
+
+        Args:
+            text: Raw text string from dataset.
+
+        Returns:
+            Dictionary with tokenized inputs.
+        """
+        # Build conversation in Qwen format (text-only)
+        conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+        formatted_text = self.tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+
+        # Tokenize with the processor (no multimodal inputs)
+        values = self.tokenizer(
+            text=formatted_text,
+            audio=None,
+            images=None,
+            videos=None,
+            return_tensors="pt",
+            padding=True,
+        )
+
+        return values
+
+    def collate_function(self, batch):
+        """Collate text inputs into tensors; only the first item of ``batch`` is used."""
+        result = {}
+        first = batch[0]
+
+        if "input_ids" in first and first["input_ids"] is not None:
+            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
+        if "attention_mask" in first and first["attention_mask"] is not None:
+            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
+
+        return result
+
+
+class Qwen3OmniImageProcessor(BaseImageProcessor):
+    """Image processor for Qwen3-Omni multimodal model."""
+
+    def __init__(self, tokenizer, device="auto", use_audio_in_video=False):
+        """Constructor."""
+        super().__init__(tokenizer, device)
+        self.use_audio_in_video = use_audio_in_video
+        # Try to import qwen_omni_utils for multimodal processing
+        try:
+            from qwen_omni_utils import process_mm_info
+
+            self.process_mm_info = process_mm_info
+        except ImportError:
+            raise ImportError(
+                "qwen_omni_utils is required for Qwen3OmniImageProcessor. "
+                "Please install it from https://github.com/QwenLM/Qwen3-Omni"
+            )
+
+    def preprocess_function(self, examples):
+        """Preprocess function for Qwen3-Omni."""
+        question = examples.get("question", "Describe this image.")
+
+        # Build conversation in Qwen format
+        content = []
+        if examples.get("image") is not None:
+            content.append({"type": "image", "image": examples["image"]})
+        if examples.get("audio") is not None:
+            content.append({"type": "audio", "audio": examples["audio"]})
+        if examples.get("video") is not None:
+            content.append({"type": "video", "video": examples["video"]})
+        content.append({"type": "text", "text": question})
+
+        conversation = [{"role": "user", "content": content}]
+        text = self.tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+
+        # Extract multimodal info using qwen_omni_utils
+        audios, images, videos = self.process_mm_info(
+            conversation, use_audio_in_video=self.use_audio_in_video
+        )
+
+        # Process inputs with the processor
+        values = self.tokenizer(
+            text=text,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=self.use_audio_in_video,
+        )
+
+        # Define all possible keys to ensure consistent schema for Arrow serialization
+        all_keys = [
+            "input_ids",
+            "attention_mask",
+            "pixel_values",
+            "image_grid_thw",
+            "audio_features",
+            "audio_feature_lens",
+            "video_grid_thw",
+        ]
+
+        # Convert tensors to lists for Arrow serialization compatibility
+        # Tensor conversion back happens in collate_function
+        result = dict.fromkeys(all_keys)  # Initialize all keys to None
+        for key, val in values.items():
+            if val is not None and hasattr(val, "tolist"):
+                result[key] = val.tolist()
+            elif val is not None:
+                result[key] = val
+
+        return result
+
+    def collate_function(self, batch):
+        """Collate function to process inputs during data loading."""
+        result = {}
+
+        # Only the first item of the batch is used; any additional items are ignored
+        first = batch[0]
+
+        # Convert lists to tensors and move to device
+        if "input_ids" in first and first["input_ids"] is not None:
+            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
+        if "attention_mask" in first and first["attention_mask"] is not None:
+            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
+
+        # Handle pixel values for images
+        if first.get("pixel_values") is not None:
+            result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device)
+
+        # Handle image grid thw (temporal, height, width grid info)
+        if first.get("image_grid_thw") is not None:
+            result["image_grid_thw"] = torch.LongTensor(first["image_grid_thw"]).to(self.device)
+
+        # Handle audio features if present
+        if first.get("audio_feature_lens") is not None:
+            result["audio_feature_lens"] = torch.LongTensor(first["audio_feature_lens"]).to(
+                self.device
+            )
+        if first.get("audio_features") is not None:
+            result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device)
+
+        # Handle video features if present
+        if first.get("video_grid_thw") is not None:
+            result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device)
+
+        return result

From 0ccf7f9cbdc82d46a4494cd24c0fe107a79d24dc Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Fri, 6 Feb 2026 05:16:41 +0000
Subject: [PATCH 02/12] Optimize calibration for text data

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    | 97 ++++++++++---------
 modelopt/torch/export/unified_export_hf.py    | 10 +-
 .../torch/quantization/plugins/huggingface.py |  5 -
 modelopt/torch/utils/dataset_utils.py         | 25 +++--
 4 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 4f7d3430f8..d854c36454 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -70,14 +70,12 @@
     create_forward_loop,
     get_dataset_dataloader,
     get_max_batch_size,
-    get_qwen3omni_text_dataloader,
     get_supported_datasets,
 )
 from modelopt.torch.utils.image_processor import (
     BaseImageProcessor,
     MllamaImageProcessor,
     Qwen3OmniImageProcessor,
-    Qwen3OmniTextProcessor,
 )
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
 from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
@@ -225,50 +223,47 @@ def make_calib_dataloader(
             num_samples=args.calib_size[0],
         )
     elif model_type == "qwen3omni":
-        assert processor is not None, "The processor must be set for qwen3omni model."
         dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail"
         # Check if using video dataset (e.g., finevideo)
-        if dataset_name in get_supported_video_datasets():
-            video_processor = Qwen3OmniVideoProcessor(
-                processor.tokenizer if hasattr(processor, "tokenizer") else processor,
-                device=device,
-                dtype=language_model.dtype,
-                use_audio_in_video=True,
-            )
-            calib_dataloader = get_video_dataset_dataloader(
-                dataset_name=dataset_name,
-                processor=video_processor,
-                batch_size=args.batch_size,
-                num_samples=args.calib_size[0],
-            )
-        elif dataset_name in get_supported_vlm_datasets():
-            assert isinstance(processor, Qwen3OmniImageProcessor), (
-                "The Qwen3OmniImageProcessor must be set."
-            )
-            # Set the dtype for proper tensor conversion in collate_function
-            processor.dtype = language_model.dtype
-            calib_dataloader = get_vlm_dataset_dataloader(
-                dataset_name=dataset_name,
-                processor=processor,
-                batch_size=args.batch_size,
-                num_samples=args.calib_size[0],
-            )
+        if processor is not None:
+            if dataset_name in get_supported_video_datasets():
+                video_processor = Qwen3OmniVideoProcessor(
+                    processor.tokenizer if hasattr(processor, "tokenizer") else processor,
+                    device=device,
+                    dtype=language_model.dtype,
+                    use_audio_in_video=True,
+                )
+                calib_dataloader = get_video_dataset_dataloader(
+                    dataset_name=dataset_name,
+                    processor=video_processor,
+                    batch_size=args.batch_size,
+                    num_samples=args.calib_size[0],
+                )
+            elif dataset_name in get_supported_vlm_datasets():
+                assert isinstance(processor, Qwen3OmniImageProcessor), (
+                    "The Qwen3OmniImageProcessor must be set."
+                )
+                # Set the dtype for proper tensor conversion in collate_function
+                processor.dtype = language_model.dtype
+                calib_dataloader = get_vlm_dataset_dataloader(
+                    dataset_name=dataset_name,
+                    processor=processor,
+                    batch_size=args.batch_size,
+                    num_samples=args.calib_size[0],
+                )
         else:
-            # Text-only datasets (e.g., cnn_dailymail)
-            # Use Qwen3OmniTextProcessor to apply proper conversation template
-            # See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking
-            text_processor = Qwen3OmniTextProcessor(
-                processor=processor.tokenizer,  # Pass the underlying HF processor
-                device=device,
-                dtype=language_model.dtype,
+            # Labels are only needed for gradient-based auto_quantize
+            include_labels = (
+                args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
             )
-            calib_dataloader = get_qwen3omni_text_dataloader(
-                dataset_name=dataset_name,
-                processor=text_processor,
+            calib_dataloader = get_dataset_dataloader(
+                dataset_name=args.dataset,
+                tokenizer=tokenizer,
                 batch_size=args.batch_size,
-                num_samples=args.calib_size[0],
+                num_samples=args.calib_size,
+                device=device,
+                include_labels=include_labels,
             )
-        print(f"Selected dataset for calibration: {dataset_name}")
     elif model_type == "whisper":
         assert processor is not None and isinstance(processor, WhisperProcessor), (
             "The AutoProcessor must be set."
@@ -452,9 +447,6 @@ def load_model(args: argparse.Namespace):
         calibration_only = True
 
     model_type = get_model_type(full_model)
-    if model_type == "qwen3omni":
-        print("Disabling talker for Qwen3Omni model")
-        full_model.disable_talker()
 
     device = full_model.device
     if hasattr(full_model, "model"):
@@ -480,6 +472,14 @@ def load_model(args: argparse.Namespace):
             trust_remote_code=args.trust_remote_code,
             attn_implementation=args.attn_implementation,
         )
+        if model_type == "qwen3omni":
+            print("Disabling talker for Qwen3Omni model")
+            full_model.disable_talker()
+            language_model = full_model.thinker.model
+            tokenizer = processor.tokenizer.tokenizer
+            processor = None
+            default_padding_side = tokenizer.padding_side
+            default_pad_token = tokenizer.pad_token
     elif model_type == "whisper":
         processor = get_processor(
             args.pyt_ckpt_path,
@@ -620,6 +620,7 @@ def mono_quantize(
         print("Quantization will only be applied to the decoder (text generation) component")
 
     # For Qwen3Omni models, disable quantization of conv layers
+    generation_kwargs = {}
     if model_type == "qwen3omni":
         print(
             "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
@@ -627,6 +628,8 @@ def mono_quantize(
         quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+        generation_kwargs["return_audio"] = False
+        generation_kwargs["thinker_max_new_tokens"] = 1
 
     if not model_is_already_quantized or calibration_only:
         # quantize the model
@@ -642,7 +645,9 @@ def mono_quantize(
             if args.calib_with_images and is_nemotron_vl_model:
                 calibrate_loop = create_vlm_calibration_loop(full_model, calib_dataloader)
             else:
-                calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
+                calibrate_loop = create_forward_loop(
+                    dataloader=calib_dataloader, generation_kwargs=generation_kwargs
+                )
 
         if calibration_only:
             language_model = mtq.calibrate(
@@ -836,7 +841,7 @@ def pre_quantize(
     elif model_type == "qwen3omni":
         # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
         # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
         if isinstance(result, tuple):
             text_ids, _ = result
             generated_ids_before_ptq = (
@@ -895,7 +900,7 @@ def post_quantize(
     elif model_type == "qwen3omni":
         # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
         # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, max_new_tokens=100)
+        result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
         if isinstance(result, tuple):
             text_ids, _ = result
             generated_ids_after_ptq = (
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index e105c46aa4..17796e026d 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -357,9 +357,11 @@ def llm_dummy_forward():
                 [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype
             ).to(model.device)
 
-        if is_vl_model and "nemotron" in model_type:
-            # For Nemotron VL models, run optimization on just the language model/decoder.
-            # This avoids needing pixel_values for the vision encoder.
+        if getattr(model.config, "is_encoder_decoder", False):
+            # For encoder-decoder models, we need to pass both the encoder and decoder input ids
+            model(fake_input, decoder_input_ids=decoder_fake_input)
+        elif (is_vl_model and "nemotron" in model_type) or model_type.startswith("qwen3omni"):
+            # For Nemotron VL and Qwen3Omni models, run optimization on just the language model.
             language_model_lineage = get_language_model_from_vl(model)
 
             if language_model_lineage is not None:
@@ -371,7 +373,7 @@ def llm_dummy_forward():
                 language_model(fake_input, use_cache=False)
             else:
                 raise ValueError(
-                    f"Cannot extract language_model from Nemotron VL model (type: {model_type}). "
+                    f"Cannot extract language_model from VL model (type: {model_type}). "
                     "This is required for requantization/resmoothing optimization. "
                     "Please ensure the model architecture is supported or file an issue."
                 )
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index b03c46fd03..0a451e8ccb 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -1182,14 +1182,9 @@ def unpack_weight(self):
 
 try:
     from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
-        Qwen3OmniMoeTalkerTextSparseMoeBlock,
         Qwen3OmniMoeThinkerTextSparseMoeBlock,
     )
 
-    if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register(
-            {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"}
-        )(_QuantSparseMoe)
     if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry:
         QuantModuleRegistry.register(
             {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"}
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 9a5b7ccf98..a02a0717c8 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -587,12 +587,13 @@ def _get_free_gpu_mem():
         return 512
 
 
-def _process_batch(batch_data, infer_method, max_working_batch_size=None):
+def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_batch_size=None):
     """Process a batch of data through the model's inference method.
 
     Args:
         batch_data: Dictionary containing the batch data
         infer_method: Model's inference method (either forward or generate)
+        generation_kwargs: Keyword arguments to pass to the model.generate() method.
         max_working_batch_size: Maximum batch size known to work without OOM
 
     Returns:
@@ -630,7 +631,7 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
 
     # Try processing with current batch size
     try:
-        infer_method(**batch_data)
+        infer_method(**batch_data, **generation_kwargs)
         return (
             batch_size
             if max_working_batch_size is None
@@ -661,24 +662,27 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None):
     return max_working_batch_size
 
 
-def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None:
+def _forward_loop(
+    model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict | None = None
+) -> None:
     """Runs forward passes through the model using data from the dataloader.
 
     Args:
         model: The PyTorch model to run inference on
         dataloader: DataLoader containing the batched input data
+        generation_kwargs: Extra keyword arguments forwarded with every batch; None means none.
     """
     with torch.no_grad():
-        use_generate = _should_use_generate(model)
+        generation_kwargs = generation_kwargs or {}  # avoid a shared mutable default
+        use_generate = model_type_is_enc_dec(model)
         infer_method = model.generate if use_generate else model.forward
         max_working_batch_size = None  # Initialize max working batch size as None
 
         for _, data in enumerate(tqdm(dataloader)):
-            # For generate(), add max_new_tokens to prevent indefinite generation during calibration
-            if use_generate:
-                data["max_new_tokens"] = 1
             # Process batch and update max working batch size
-            max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size)
+            max_working_batch_size = _process_batch(
+                data, infer_method, generation_kwargs, max_working_batch_size
+            )
 
 
 def create_forward_loop(
@@ -691,6 +695,7 @@ def create_forward_loop(
     device: str | None = None,
     include_labels: bool = False,
     dataloader: DataLoader | None = None,
+    generation_kwargs: dict = {},
 ) -> Callable:
     """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer.
 
@@ -709,7 +714,7 @@ def create_forward_loop(
         device: Target device for the returned dataloader.
         include_labels: Whether to include labels in the dataloader.
         dataloader: If provided, use the provided dataloader instead.
-
+        generation_kwargs: Keyword arguments to pass to the model.generate() method.
     Example usage for quantization:
 
     .. code-block:: python
@@ -748,7 +753,7 @@ def create_forward_loop(
             include_labels=include_labels,
         )
 
-    return lambda model: _forward_loop(model, dataloader)
+    return lambda model: _forward_loop(model, dataloader, generation_kwargs)
 
 
 def model_type_is_enc_dec(model):

From df50095e60d30a7df671c3401ec2a5b81809a986 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Fri, 6 Feb 2026 06:11:27 +0000
Subject: [PATCH 03/12] Refactor model specific code to example_utils

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py          | 124 +++++++++++++++++++++
 examples/llm_ptq/hf_ptq.py                 |  97 ++++------------
 modelopt/torch/export/model_utils.py       |  56 +++++-----
 modelopt/torch/export/unified_export_hf.py |  10 +-
 modelopt/torch/quantization/model_quant.py |  36 +++---
 modelopt/torch/utils/dataset_utils.py      |   2 +-
 6 files changed, 200 insertions(+), 125 deletions(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 5fe9ab5ad6..1c17e2b5d9 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -46,11 +46,21 @@
     snapshot_download = None
 
 import modelopt.torch.quantization as mtq
+from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 from modelopt.torch.utils.image_processor import (
     BaseImageProcessor,
     MllamaImageProcessor,
     Qwen3OmniImageProcessor,
 )
+from modelopt.torch.utils.video_dataset_utils import (
+    Qwen3OmniVideoProcessor,
+    get_supported_video_datasets,
+    get_video_dataset_dataloader,
+)
+from modelopt.torch.utils.vlm_dataset_utils import (
+    get_supported_vlm_datasets,
+    get_vlm_dataset_dataloader,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -246,9 +256,45 @@ def build_quant_cfg(
         quant_cfg["quant_cfg"]["*image*"] = {"enable": False}
         quant_cfg["quant_cfg"]["*vision*"] = {"enable": False}
 
+    if model_type in ["qwen3moe", "qwen3next"] and qformat == "nvfp4":
+        # Disable the attention projection layers to retain accuracy
+        quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*k_proj*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["model*.*attn*v_proj*"] = {"enable": False}
+
+    if model_type == "deepseek":
+        # Disable MLA quantization for accuracy.
+        quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False}
+
+    if model_type == "qwen3omni":
+        print(
+            "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
+        )
+        quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
+        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
+
     return quant_cfg
 
 
+def get_generation_kwargs(model_type: str) -> dict[str, Any]:
+    """Return the extra generation kwargs used during calibration for ``model_type``.
+
+    Args:
+        model_type: The model type string.
+
+    Returns:
+        A new dict of generation kwargs; empty unless ``model_type`` is "qwen3omni".
+    """
+    extra: dict[str, Any] = {}
+    if model_type == "qwen3omni":
+        extra = {"return_audio": False, "thinker_max_new_tokens": 1}
+
+    return extra
+
+
 def is_speculative(hf_config):
     """Check if the model architecture is a speculative model."""
     return hf_config.architectures and any(
@@ -834,3 +880,81 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print(f"Successfully copied {len(copied_files)} custom model files to {export_path}")
     else:
         print("No custom model files found to copy")
+
+
+def get_qwen3omni_dataloader(
+    dataset_name: str | list[str] | None,
+    processor: Qwen3OmniImageProcessor | None,
+    tokenizer,
+    batch_size: int,
+    num_samples: int | list[int],
+    device: torch.device,
+    model_dtype: torch.dtype,
+    include_labels: bool = False,
+):
+    """Create a calibration dataloader for Qwen3Omni models.
+
+    With a processor, builds a video or VLM dataloader (ValueError for any other dataset).
+
+    Args:
+        dataset_name: Dataset name(s) for calibration; None picks the built-in default pair.
+        processor: The Qwen3OmniImageProcessor for multimodal inputs; None selects text-only.
+        tokenizer: The tokenizer used by the text-only path.
+        batch_size: Batch size for the dataloader.
+        num_samples: Samples to use (int or list); video/VLM paths use only the first entry.
+        device: Target device for tensors.
+        model_dtype: Model dtype for proper tensor conversion.
+        include_labels: Whether to include labels; only the text-only path forwards it.
+
+    Returns:
+        DataLoader for calibration.
+    """
+    if dataset_name is None:
+        dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
+
+    if processor is not None:
+        if dataset_name in get_supported_video_datasets():
+            assert isinstance(dataset_name, str)
+            video_processor = Qwen3OmniVideoProcessor(
+                processor.tokenizer if hasattr(processor, "tokenizer") else processor,
+                device=device,
+                dtype=model_dtype,
+                use_audio_in_video=True,
+            )
+            calib_dataloader = get_video_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=video_processor,
+                batch_size=batch_size,
+                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
+            )
+        elif dataset_name in get_supported_vlm_datasets():
+            assert isinstance(dataset_name, str)
+            assert isinstance(processor, Qwen3OmniImageProcessor), (
+                "The Qwen3OmniImageProcessor must be set."
+            )
+            # Set the dtype for proper tensor conversion in collate_function
+            processor.dtype = model_dtype
+            calib_dataloader = get_vlm_dataset_dataloader(
+                dataset_name=dataset_name,
+                processor=processor,
+                batch_size=batch_size,
+                num_samples=num_samples if isinstance(num_samples, int) else num_samples[0],
+            )
+        else:
+            raise ValueError(
+                f"Dataset '{dataset_name}' not supported for Qwen3Omni with processor. "
+                f"Supported video datasets: {get_supported_video_datasets()}, "
+                f"Supported VLM datasets: {get_supported_vlm_datasets()}"
+            )
+    else:
+        # No processor: wrap scalar arguments in lists for get_dataset_dataloader
+        calib_dataloader = get_dataset_dataloader(
+            dataset_name=dataset_name if isinstance(dataset_name, list) else [dataset_name],
+            tokenizer=tokenizer,
+            batch_size=batch_size,
+            num_samples=num_samples if isinstance(num_samples, list) else [num_samples],
+            device=device,
+            include_labels=include_labels,
+        )
+
+    return calib_dataloader
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index d854c36454..571cea125a 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -15,9 +15,7 @@
 
 import argparse
 import copy
-import io
 import random
-import sys
 import time
 import warnings
 from typing import Any
@@ -29,8 +27,10 @@
     build_quant_cfg,
     copy_custom_model_files,
     create_vlm_calibration_loop,
+    get_generation_kwargs,
     get_model,
     get_processor,
+    get_qwen3omni_dataloader,
     get_tokenizer,
     is_enc_dec,
     is_nemotron_vl,
@@ -79,15 +79,7 @@
 )
 from modelopt.torch.utils.memory_monitor import launch_memory_monitor
 from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader
-from modelopt.torch.utils.video_dataset_utils import (
-    Qwen3OmniVideoProcessor,
-    get_supported_video_datasets,
-    get_video_dataset_dataloader,
-)
-from modelopt.torch.utils.vlm_dataset_utils import (
-    get_supported_vlm_datasets,
-    get_vlm_dataset_dataloader,
-)
+from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader
 
 RAND_SEED = 1234
 
@@ -223,47 +215,20 @@ def make_calib_dataloader(
             num_samples=args.calib_size[0],
         )
     elif model_type == "qwen3omni":
-        dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail"
-        # Check if using video dataset (e.g., finevideo)
-        if processor is not None:
-            if dataset_name in get_supported_video_datasets():
-                video_processor = Qwen3OmniVideoProcessor(
-                    processor.tokenizer if hasattr(processor, "tokenizer") else processor,
-                    device=device,
-                    dtype=language_model.dtype,
-                    use_audio_in_video=True,
-                )
-                calib_dataloader = get_video_dataset_dataloader(
-                    dataset_name=dataset_name,
-                    processor=video_processor,
-                    batch_size=args.batch_size,
-                    num_samples=args.calib_size[0],
-                )
-            elif dataset_name in get_supported_vlm_datasets():
-                assert isinstance(processor, Qwen3OmniImageProcessor), (
-                    "The Qwen3OmniImageProcessor must be set."
-                )
-                # Set the dtype for proper tensor conversion in collate_function
-                processor.dtype = language_model.dtype
-                calib_dataloader = get_vlm_dataset_dataloader(
-                    dataset_name=dataset_name,
-                    processor=processor,
-                    batch_size=args.batch_size,
-                    num_samples=args.calib_size[0],
-                )
-        else:
-            # Labels are only needed for gradient-based auto_quantize
-            include_labels = (
-                args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
-            )
-            calib_dataloader = get_dataset_dataloader(
-                dataset_name=args.dataset,
-                tokenizer=tokenizer,
-                batch_size=args.batch_size,
-                num_samples=args.calib_size,
-                device=device,
-                include_labels=include_labels,
-            )
+        # Labels are only needed for gradient-based auto_quantize
+        include_labels = (
+            args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
+        )
+        calib_dataloader = get_qwen3omni_dataloader(
+            dataset_name=args.dataset[0] if args.dataset else None,
+            processor=processor,
+            tokenizer=tokenizer,
+            batch_size=args.batch_size,
+            num_samples=args.calib_size[0] if processor else args.calib_size,
+            device=device,
+            model_dtype=language_model.dtype,
+            include_labels=include_labels,
+        )
     elif model_type == "whisper":
         assert processor is not None and isinstance(processor, WhisperProcessor), (
             "The AutoProcessor must be set."
@@ -619,17 +584,8 @@ def mono_quantize(
         quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False}  # Nemotron-Parse specific
         print("Quantization will only be applied to the decoder (text generation) component")
 
-    # For Qwen3Omni models, disable quantization of conv layers
-    generation_kwargs = {}
-    if model_type == "qwen3omni":
-        print(
-            "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model"
-        )
-        quant_cfg["quant_cfg"]["*conv*"] = {"enable": False}
-        quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False}
-        quant_cfg["quant_cfg"]["*visual*"] = {"enable": False}
-        generation_kwargs["return_audio"] = False
-        generation_kwargs["thinker_max_new_tokens"] = 1
+    # Get model-specific generation kwargs (e.g., for Qwen3Omni)
+    generation_kwargs = get_generation_kwargs(model_type)
 
     if not model_is_already_quantized or calibration_only:
         # quantize the model
@@ -876,20 +832,7 @@ def post_quantize(
     """
 
     if args.verbose:
-        if args.quant_summary_path:
-            # Capture the summary output to a file
-            old_stdout = sys.stdout
-            sys.stdout = buffer = io.StringIO()
-            try:
-                mtq.print_quant_summary(full_model, args.export_path)
-            finally:
-                sys.stdout = old_stdout
-            summary = buffer.getvalue()
-            with open(args.quant_summary_path, "w") as f:
-                f.write(summary)
-            print(f"Quantization summary saved to {args.quant_summary_path}")
-        else:
-            mtq.print_quant_summary(full_model, args.export_path)
+        mtq.print_quant_summary(full_model, save_path=args.quant_summary_path)
         save_expert_token_count_table(full_model, args.export_path)
 
     # Run some samples
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index b71e53bacf..7501ed7bbc 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -17,47 +17,47 @@
 import torch.nn as nn
 
 MODEL_NAME_TO_TYPE = {
-    "ArcticForCausalLM": "llama",
-    "baichuan": "baichuan",
-    "Bart": "bart",
-    "Bloom": "bloom",
-    "ChatGLM": "chatglm",
-    "Dbrx": "dbrx",
-    "Deepseek": "deepseek",
-    "ExaoneForCausalLM": "exaone",
-    "FalconForCausalLM": "falcon",
-    "Gemma": "gemma",
-    "Gemma2": "gemma2",
-    "Gemma3": "gemma3",
-    "GLM": "glm",
     "GPT2": "gpt",
-    "GPTJ": "gptj",
-    "gptoss": "gptoss",
-    "InternLM2ForCausalLM": "internlm",
-    "Llama": "llama",
+    "Mllama": "mllama",
     "Llama4": "llama4",
+    "Llama": "llama",
     "Mistral": "llama",
-    "MixtralForCausalLM": "llama",
-    "Mllama": "mllama",
+    "GPTJ": "gptj",
+    "FalconForCausalLM": "falcon",
+    "RWForCausalLM": "falcon",
+    "baichuan": "baichuan",
     "MPT": "mpt",
-    "Nemotron": "gpt",
-    "phi": "phi",
-    "phi3": "phi3",
-    "phi3small": "phi3small",
-    "Phi4MMForCausalLM": "phi4mm",
-    "PhiMoEForCausalLM": "phi3",
+    "Bloom": "bloom",
+    "ChatGLM": "chatglm",
     "Qwen3Moe": "qwen3moe",
     "Qwen3Next": "qwen3next",
     "Qwen3OmniMoeForConditionalGeneration": "qwen3omni",
     "QWen": "qwen",
     "RecurrentGemma": "recurrentgemma",
-    "RWForCausalLM": "falcon",
-    "StarCoder": "gpt",
-    "T5": "t5",
+    "Gemma3": "gemma3",
+    "Gemma2": "gemma2",
+    "Gemma": "gemma",
+    "phi3small": "phi3small",
+    "phi3": "phi3",
+    "PhiMoEForCausalLM": "phi3",
+    "Phi4MMForCausalLM": "phi4mm",
+    "phi": "phi",
     "TLGv4ForCausalLM": "phi",
     "NemotronH": "nemotron_h",
+    "MixtralForCausalLM": "llama",
+    "ArcticForCausalLM": "llama",
+    "StarCoder": "gpt",
+    "Dbrx": "dbrx",
+    "T5": "t5",
+    "Bart": "bart",
+    "GLM": "glm",
+    "InternLM2ForCausalLM": "internlm",
+    "ExaoneForCausalLM": "exaone",
+    "Nemotron": "gpt",
+    "Deepseek": "deepseek",
     "Whisper": "whisper",
     "MiniMax": "minimax",
+    "gptoss": "gptoss",
 }
 
 __doc__ = f"""Utility functions for model type detection and classification.
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 17796e026d..e8a1e06857 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -1188,10 +1188,12 @@ def export_hf_checkpoint(
         if hasattr(model, "generation_config") and model.generation_config is not None:
             gen_config = model.generation_config
             if not getattr(gen_config, "do_sample", True):
-                # Remove sampling-related params when do_sample is False
-                for attr in ["temperature", "top_p", "top_k"]:
-                    if hasattr(gen_config, attr):
-                        setattr(gen_config, attr, None)
+                # Enable sampling if sampling params are present
+                if any(
+                    getattr(gen_config, attr, None) is not None
+                    for attr in ["temperature", "top_p", "top_k"]
+                ):
+                    gen_config.do_sample = True
 
         # Save model
         # Temporarily disable revert_weight_conversion if available — it doesn't handle
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index 4aa1ff46b4..782702703b 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -583,22 +583,28 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable):
 
 
 @atomic_print
-def print_quant_summary(model: nn.Module, output_dir: str | None = None):
-    """Print summary of all quantizer modules in the model."""
-    lines = [
-        f"{name:80} {mod}"
-        for name, mod in model.named_modules()
-        if isinstance(mod, TensorQuantizer)
-    ]
-    lines.append(f"{len(lines)} TensorQuantizers found in model")
-
-    if output_dir:
-        path = os.path.join(output_dir, ".quant_summary.txt")
-        with open(path, "w", encoding="utf-8") as f:
-            f.write("\n".join(lines) + "\n")
-        print(f"\033[1mQuant summary saved to {path}\033[0m")
+def print_quant_summary(model: nn.Module, save_path: str | None = None):
+    """Print summary of all quantizer modules in the model.
+
+    Args:
+        model: The model to summarize.
+        save_path: Optional path to save the summary to a file. If None, prints to stdout.
+    """
+    lines = []
+    count = 0
+    for name, mod in model.named_modules():
+        if isinstance(mod, TensorQuantizer):
+            lines.append(f"{name:80} {mod}")
+            count += 1
+    lines.append(f"{count} TensorQuantizers found in model")
+
+    summary = "\n".join(lines)
+    if save_path:
+        with open(save_path, "w") as f:
+            f.write(summary)
+        print(f"Quantization summary saved to {save_path}")
     else:
-        print("\n".join(lines))
+        print(summary)
 
 
 def fold_weight(model: nn.Module, keep_attrs: bool = False):
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index a02a0717c8..842a797afb 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -531,7 +531,7 @@ def _get_free_gpu_mem():
     torch.cuda.empty_cache()
 
     free_mem_before, max_allocated_before = _get_free_gpu_mem()
-    use_generate = _should_use_generate(model)
+    use_generate = model_type_is_enc_dec(model)
     infer_method = model.generate if use_generate else model.forward
 
     if sample_input_single_batch is None:

From 3f5859b2222f3d196e3bf4d2b2f255b215b10548 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Fri, 6 Feb 2026 19:11:26 +0000
Subject: [PATCH 04/12] Update hf configs for vLLM deployment

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py | 50 +++++++++++++++++++++++++++++++
 examples/llm_ptq/hf_ptq.py        |  4 +++
 2 files changed, 54 insertions(+)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 1c17e2b5d9..24c9cfa10f 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -882,6 +882,55 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print("No custom model files found to copy")
 
 
+def patch_config_for_unified_export(model_type: str, export_path: str) -> None:
+    """Patch config files to add missing exclusion patterns for unified HF export.
+
+    Adds exclusion patterns for modules that should not be quantized (e.g., audio tower,
+    visual encoder, lm_head) to both hf_quant_config.json and config.json. Best-effort:
+    absent files or absent exclusion lists are skipped and failures only print a warning.
+
+    Args:
+        model_type: Model type identifier; only ``"qwen3omni"`` is patched, others are no-ops.
+        export_path: Path to the exported model directory.
+    """
+    if model_type != "qwen3omni":
+        return
+
+    missing_patterns = ["thinker.audio_tower*", "thinker.visual*", "thinker.lm_head"]
+
+    # (filename, path_to_exclude_list)
+    configs = [
+        ("hf_quant_config.json", ["quantization", "exclude_modules"]),
+        ("config.json", ["quantization_config", "ignore"]),
+    ]
+
+    for filename, keys in configs:
+        filepath = os.path.join(export_path, filename)
+        if not os.path.exists(filepath):
+            continue
+        try:
+            with open(filepath, encoding="utf-8") as f:
+                config = json.load(f)
+
+            # Walk down to the dict holding the exclusion list; a null/missing level -> {}.
+            target = config
+            for key in keys[:-1]:
+                target = target.get(key) or {}
+
+            exclude_list = target.get(keys[-1])
+            if exclude_list is None:
+                continue
+
+            added = [p for p in missing_patterns if p not in exclude_list]
+            if added:
+                exclude_list.extend(added)
+                with open(filepath, "w", encoding="utf-8") as f:
+                    json.dump(config, f, indent=2)
+                print(f"Patched {filename} with exclusions: {added}")
+        except Exception as e:  # best-effort: a failed patch must not abort the export
+            print(f"Warning: Failed to patch {filename}: {e}")
+
+
 def get_qwen3omni_dataloader(
     dataset_name: str | list[str] | None,
     processor: Qwen3OmniImageProcessor | None,
@@ -911,6 +960,7 @@ def get_qwen3omni_dataloader(
     """
     if dataset_name is None:
         dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"]
+        num_samples = [512, 512]
 
     if processor is not None:
         if dataset_name in get_supported_video_datasets():
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 571cea125a..cbb8961efd 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -35,6 +35,7 @@
     is_enc_dec,
     is_nemotron_vl,
     load_mtp_weights,
+    patch_config_for_unified_export,
     run_nemotron_vl_preview,
 )
 from torch.utils.data import DataLoader
@@ -734,6 +735,9 @@ def export_quantized(
                     extra_state_dict=mtp_state_dict,
                 )
 
+            # Exclude non-quantized modules in config.json and hf_quant_config.json
+            patch_config_for_unified_export(model_type, export_path)
+
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side

From 41ee25b7c2d3607c48c8e4b5903568e7f93e937f Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Fri, 6 Feb 2026 20:14:12 +0000
Subject: [PATCH 05/12] Create a script to run vllm inference

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py | 111 ++++++++++++++++++++++++
 examples/llm_ptq/run_vllm.py      | 136 ++++++++++++++++++++++++++++++
 2 files changed, 247 insertions(+)
 create mode 100644 examples/llm_ptq/run_vllm.py

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 24c9cfa10f..fa71607ad0 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -46,6 +46,7 @@
     snapshot_download = None
 
 import modelopt.torch.quantization as mtq
+from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 from modelopt.torch.utils.image_processor import (
     BaseImageProcessor,
@@ -66,6 +67,116 @@
 
 SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"]
 
+# Files needed for tokenizer/processor that vLLM loads from model path
+TOKENIZER_FILES = [
+    "vocab.json",
+    "merges.txt",
+    "tokenizer.json",
+    "tokenizer_config.json",
+    "special_tokens_map.json",
+    "preprocessor_config.json",
+    "chat_template.json",
+]
+
+
+def get_model_type_from_config(model_path: str) -> str | None:
+    """Get model type from the config.json file.
+
+    Args:
+        model_path: Path to a local model directory containing ``config.json``.
+            A value that is not such a directory (e.g. a Hub model ID) yields None.
+
+    Returns:
+        Model type string (e.g., 'qwen3omni', 'llama', 'gpt') or None if not found.
+    """
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+
+    with open(config_path, encoding="utf-8") as f:
+        config = json.load(f)
+
+    # Architecture names are checked first, then the ``model_type`` field as a fallback.
+    # ``or`` keeps a field that is present but null from breaking the iteration below.
+    architectures = config.get("architectures") or []
+    model_type_field = config.get("model_type") or ""
+
+    # First substring match wins, in ``MODEL_NAME_TO_TYPE`` order.
+    for candidate in (*architectures, model_type_field):
+        candidate_lower = candidate.lower()
+        for key, model_type in MODEL_NAME_TO_TYPE.items():
+            if key.lower() in candidate_lower:
+                return model_type
+
+    return None
+
+
+def get_sampling_params_from_config(model_path: str) -> dict:
+    """Extract sampling params from generation_config.json if present.
+
+    Null values are skipped; ``max_new_tokens`` takes precedence over ``max_length``.
+    """
+    gen_config_path = Path(model_path) / "generation_config.json"
+    if not gen_config_path.exists():
+        return {}
+    gen_config = json.loads(gen_config_path.read_text(encoding="utf-8"))
+    sampling_keys = ("temperature", "top_p", "top_k")
+    params = {k: gen_config[k] for k in sampling_keys if gen_config.get(k) is not None}
+    for key in ("max_new_tokens", "max_length"):
+        if gen_config.get(key) is not None:
+            params["max_tokens"] = gen_config[key]
+            break
+    return params
+
+
+def get_quantization_format(model_path: str) -> str | None:
+    """Get quantization format from the model config.
+
+    Args:
+        model_path: Path to the model directory.
+
+    Returns:
+        ``'modelopt_fp4'`` for NVFP4 algos, ``'modelopt'`` for FP8, otherwise None.
+    """
+    hf_quant_config_path = os.path.join(model_path, "hf_quant_config.json")
+    if not os.path.exists(hf_quant_config_path):
+        return None
+    with open(hf_quant_config_path, encoding="utf-8") as f:
+        quant_config = json.load(f)
+    quant_algo = (quant_config.get("quantization") or {}).get("quant_algo") or ""  # null-safe
+    if "NVFP4" in quant_algo:
+        return "modelopt_fp4"
+    return "modelopt" if quant_algo == "FP8" else None
+
+
+def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
+    """Copy tokenizer files from HF model to local quantized model dir if missing.
+
+    Args:
+        model_path: Local model directory to fill in; anything else is a no-op.
+        source_model_id: Repo ID handed to ``snapshot_download`` as the file source.
+    """
+    if not os.path.isdir(model_path):
+        return  # Not a local path, nothing to do
+
+    missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))]
+    if not missing_files:
+        return
+
+    if snapshot_download is None:
+        print("Warning: huggingface_hub not installed, cannot download tokenizer files")
+        return
+
+    print(f"Copying missing tokenizer files from {source_model_id}...")
+    cache_dir = snapshot_download(source_model_id, allow_patterns=missing_files)
+
+    for fname in missing_files:
+        src = os.path.join(cache_dir, fname)
+        # The source repo may not contain every listed file; copy only what was fetched.
+        if os.path.exists(src):
+            shutil.copy2(src, os.path.join(model_path, fname))
+            print(f"  Copied {fname}")
+
 
 def run_nemotron_vl_preview(
     full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False
diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py
new file mode 100644
index 0000000000..748c5b13a0
--- /dev/null
+++ b/examples/llm_ptq/run_vllm.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unified HF checkpoint inference with vLLM.
+
+Usage:
+    python run_vllm.py --model /path/to/quantized/model
+    python run_vllm.py --model /path/to/model --tp 4
+"""
+
+from __future__ import annotations
+
+import argparse
+
+from example_utils import (
+    ensure_tokenizer_files,
+    get_model_type_from_config,
+    get_quantization_format,
+    get_sampling_params_from_config,
+)
+from transformers import AutoConfig, AutoProcessor
+from vllm import LLM, SamplingParams
+
+
+def main():
+    """Parse CLI args, load the checkpoint with vLLM, and print one text-only generation."""
+    parser = argparse.ArgumentParser(description="Run unified hf checkpoint inference with vLLM")
+    parser.add_argument("--model", type=str, required=True, help="Model ID or path")
+    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=None,
+        help="Max model length (auto-detected from config if not specified)",
+    )
+    parser.add_argument("--prompt", type=str, default="What is Nvidia?", help="Text prompt")
+    parser.add_argument(
+        "--tokenizer", type=str, default=None, help="Tokenizer ID or path (defaults to model path)"
+    )
+    parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature")
+    parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling")
+    parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)")
+    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate")
+
+    args = parser.parse_args()
+
+    # Detect model type from config
+    model_type = get_model_type_from_config(args.model)
+    print(f"Detected model type: {model_type}")
+
+    # Detect quantization format
+    quantization = get_quantization_format(args.model)
+    print(f"Detected quantization: {quantization}")
+
+    # Get max_model_len from config if not specified
+    if args.max_model_len is None:
+        config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+        args.max_model_len = getattr(config, "max_position_embeddings", 4096)
+        print(f"Using max_model_len from config: {args.max_model_len}")
+
+    # Load the processor (for its chat template) from --tokenizer, else from the model path
+    tokenizer_id = args.tokenizer or args.model
+    processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True)
+
+    # Text-only conversations
+    conversations = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": args.prompt}],
+            }
+        ],
+    ]
+
+    # Apply chat template
+    apply_chat_kwargs = {"add_generation_prompt": True, "tokenize": False}
+    # Qwen3Omni-specific: disable thinking mode
+    if model_type == "qwen3omni":
+        apply_chat_kwargs["enable_thinking"] = False
+
+    texts = processor.apply_chat_template(conversations, **apply_chat_kwargs)
+
+    # Ensure tokenizer files exist in local model dir (vLLM loads processor from model path)
+    if args.tokenizer:
+        ensure_tokenizer_files(args.model, args.tokenizer)
+
+    print(f"Loading model: {args.model}")
+    llm = LLM(
+        model=args.model,
+        tokenizer=tokenizer_id,
+        tensor_parallel_size=args.tp,
+        max_model_len=args.max_model_len,
+        trust_remote_code=True,
+        quantization=quantization,
+    )
+
+    # Get sampling params from config, with CLI/defaults as fallback. Null config entries
+    # are dropped first so they fall back too instead of reaching SamplingParams as None.
+    config_params = {
+        k: v for k, v in get_sampling_params_from_config(args.model).items() if v is not None
+    }
+    sampling_kwargs = {
+        "temperature": config_params.get("temperature", args.temperature),
+        "top_p": config_params.get("top_p", args.top_p),
+        "max_tokens": config_params.get("max_tokens", args.max_tokens),
+    }
+    top_k = config_params.get("top_k", args.top_k)
+    # Non-positive top_k means "disabled", so only forward positive values
+    if top_k > 0:
+        sampling_kwargs["top_k"] = top_k
+    print(f"Sampling params: {sampling_kwargs}")
+    sampling_params = SamplingParams(**sampling_kwargs)
+
+    print("Running inference...")
+    outputs = llm.generate(texts, sampling_params)
+
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print("-" * 80)
+        print(f"Generated: {generated_text}")
+
+
+if __name__ == "__main__":
+    main()

From a71c73b29bd0982eb04a03ddff3c5366cd8f53ab Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Mon, 9 Feb 2026 22:37:47 +0000
Subject: [PATCH 06/12] Add an option to supply host as an argument

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_eval/run_lm_eval_vllm.sh | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)
 mode change 100644 => 100755 examples/llm_eval/run_lm_eval_vllm.sh

diff --git a/examples/llm_eval/run_lm_eval_vllm.sh b/examples/llm_eval/run_lm_eval_vllm.sh
old mode 100644
new mode 100755
index ef94a66d14..18c52995c9
--- a/examples/llm_eval/run_lm_eval_vllm.sh
+++ b/examples/llm_eval/run_lm_eval_vllm.sh
@@ -19,12 +19,13 @@
 # Script to run lm-evaluation-harness against a running vLLM OpenAI-compatible server.
 #
 # Usage:
-#   bash run_lm_eval_vllm.sh <model_name> [port] [task]
+#   bash run_lm_eval_vllm.sh <model_name> [port] [task] [host]
 #
 # Arguments:
 #   <model_name>: The name of the model being served (e.g., Qwen/Qwen3-30B-A3B). Used for the 'model' argument in lm_eval.
 #   [port]:       The port the vLLM server is listening on (default: 8000).
 #   [task]:       The lm_eval task(s) to run (default: mmlu).
+#   [host]:       The IP address or hostname of the vLLM server (default: localhost).
 #
 # Example:
 #   # Start vLLM server first (in another terminal):
@@ -35,6 +36,9 @@
 #
 #   # Run for a different task, e.g., hellaswag:
 #   bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 hellaswag
+#
+#   # Run against a remote server:
+#   bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 mmlu 10.78.17.40
 # ---
 
 set -e
@@ -42,16 +46,17 @@ set -x
 
 # --- Argument Parsing ---
 if [ -z "$1" ]; then
-  echo "Usage: $0 <model_name> [port] [task]"
+  echo "Usage: $0 <model_name> [port] [task] [host]"
   exit 1
 fi
 MODEL_NAME=$1
 PORT=${2:-8000}       # Default port is 8000 if not provided
 TASK=${3:-mmlu}       # Default task is mmlu if not provided
+HOST=${4:-localhost}  # Default host is localhost if not provided
 
 # --- Environment Setup ---
 export OPENAI_API_KEY="local" # Not strictly required for local, but good practice
-BASE_URL="http://localhost:${PORT}/v1"
+BASE_URL="http://${HOST}:${PORT}/v1"
 COMPLETIONS_URL="${BASE_URL}/completions"
 
 # --- Evaluation ---

From e0e108e8befb3fb86ad40e49fb44f75064ec7805 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Mon, 9 Feb 2026 23:46:01 +0000
Subject: [PATCH 07/12] Add video dataset utils

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 modelopt/torch/utils/video_dataset_utils.py | 332 ++++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 modelopt/torch/utils/video_dataset_utils.py

diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py
new file mode 100644
index 0000000000..e022d7e24f
--- /dev/null
+++ b/modelopt/torch/utils/video_dataset_utils.py
@@ -0,0 +1,332 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions for getting samples and forward loop function for video datasets."""
+
+import os
+import tempfile
+from typing import Any
+
+import torch
+from torch.utils.data import DataLoader
+
+from .image_processor import BaseImageProcessor
+
+# Use dict to store the config for each dataset.
+SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = {
+    "finevideo": {
+        "config": {"path": "HuggingFaceFV/finevideo", "split": "train", "streaming": True}
+    },
+}
+
+__all__ = [
+    "Qwen3OmniVideoProcessor",
+    "get_supported_video_datasets",
+    "get_video_dataset_dataloader",
+]
+
+
+def _get_video_dataset(dataset_name: str, num_samples: int):
+    """Load a portion of train dataset with the dataset name and a given size.
+
+    Args:
+        dataset_name: Name of the dataset to load.
+        num_samples: Maximum number of samples to load from the dataset.
+
+    Returns:
+        A hugging face Dataset.
+
+    Raises:
+        NotImplementedError: If ``dataset_name`` is not a supported video dataset.
+    """
+    if dataset_name not in SUPPORTED_VIDEO_DATASET_CONFIG:
+        raise NotImplementedError(
+            f"dataset {dataset_name} is not supported. Please use one of the following:"
+            f" {get_supported_video_datasets()}."
+        )
+
+    from datasets import Dataset, load_dataset
+
+    config = SUPPORTED_VIDEO_DATASET_CONFIG[dataset_name]["config"]
+    dataset = load_dataset(**config)
+    if config.get("streaming", False):
+        # Streaming datasets have no random access: take() the prefix and materialize it.
+        return Dataset.from_list(list(dataset.take(num_samples)))
+
+    # Clamp so that requesting more samples than exist does not raise an IndexError.
+    return dataset.select(range(min(num_samples, len(dataset))))
+
+
+def get_supported_video_datasets() -> list[str]:
+    """Return the names of all supported video datasets.
+
+    Returns:
+        The keys of ``SUPPORTED_VIDEO_DATASET_CONFIG`` as a list of strings.
+
+    Example usage:
+
+    .. code-block:: python
+
+        from modelopt.torch.utils import get_supported_video_datasets
+
+        print("Supported video datasets:", get_supported_video_datasets())
+    """
+    return list(SUPPORTED_VIDEO_DATASET_CONFIG)
+
+
+def get_video_dataset_dataloader(
+    dataset_name: str = "finevideo",
+    processor: "Qwen3OmniVideoProcessor | None" = None,
+    batch_size: int = 1,
+    num_samples: int = 512,
+    cache_dir: str | None = None,
+) -> DataLoader:
+    """Get a dataloader with the dataset name and processor of the target model.
+
+    Args:
+        dataset_name: Name of the dataset to load.
+        processor: Processor used for encoding video and text data. Required.
+        batch_size: Batch size of the returned dataloader.
+        num_samples: Number of samples from the dataset.
+        cache_dir: Directory to cache the processed dataset. Defaults to a temp directory.
+            If the cache exists, it will be loaded instead of reprocessing.
+
+    Returns:
+        An instance of dataloader.
+    """
+    assert processor is not None, "Please provide a valid processor."
+
+    from datasets import Dataset
+
+    # Default cache_dir to temp directory
+    if cache_dir is None:
+        cache_dir = os.path.join(tempfile.gettempdir(), "modelopt_video_dataset_cache")
+    # NOTE: the cache key covers only dataset name and sample count, so a cache written
+    # with a different processor or processor settings is reused as-is.
+    cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt")
+
+    processed_dataset = None
+
+    # Try to load from cache (use torch.save/load to avoid Arrow 32-bit offset overflow)
+    if os.path.exists(cache_path):
+        try:
+            # preprocess_function stores tensors as plain lists, so the restricted unpickler
+            # is enough; anything it rejects (e.g. a tampered file) falls back to reprocessing.
+            processed_samples = torch.load(cache_path, weights_only=True)
+            processed_dataset = Dataset.from_list(processed_samples)
+            print(f"Loaded processed dataset from cache: {cache_path}")
+        except Exception as e:
+            print(f"Failed to load cache from {cache_path}: {e}. Reprocessing...")
+            processed_dataset = None
+
+    # Process dataset if not loaded from cache
+    if processed_dataset is None:
+        dataset = _get_video_dataset(dataset_name, num_samples=num_samples)
+
+        # Process samples manually to avoid Arrow 32-bit offset overflow
+        # (dataset.map() uses Arrow internally which can't handle large nested lists)
+        processed_samples = []
+        for i, sample in enumerate(dataset):
+            processed = processor.preprocess_function(sample)
+            processed_samples.append(processed)
+            if (i + 1) % 10 == 0:
+                print(f"Processed {i + 1}/{len(dataset)} samples...")
+
+        processed_dataset = Dataset.from_list(processed_samples)
+
+        # Save to cache using torch.save to avoid Arrow 32-bit offset overflow
+        os.makedirs(cache_dir, exist_ok=True)
+        torch.save(processed_samples, cache_path)
+        print(f"Saved processed dataset to cache: {cache_path}")
+
+    # Create DataLoader with the custom collate function
+    return DataLoader(
+        processed_dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=processor.collate_function,
+    )
+
+
+class Qwen3OmniVideoProcessor(BaseImageProcessor):
+    """Video processor for Qwen3-Omni multimodal model with finevideo dataset support."""
+
+    def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True):
+        """Constructor.
+
+        Args:
+            tokenizer: The Qwen3OmniMoeProcessor for tokenizing and processing inputs.
+            device: Device to move tensors to.
+            dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default.
+            use_audio_in_video: Whether to extract and use audio from video files.
+        """
+        # Import first so a missing dependency does not leave a temp directory behind
+        try:
+            from qwen_omni_utils import process_mm_info
+        except ImportError as err:
+            raise ImportError(
+                "qwen_omni_utils is required for Qwen3OmniVideoProcessor. "
+                "Please install it from https://github.com/QwenLM/Qwen3-Omni"
+            ) from err
+
+        super().__init__(tokenizer, device)
+        self.dtype = dtype
+        self.use_audio_in_video = use_audio_in_video
+        self.process_mm_info = process_mm_info
+        self._temp_dir = tempfile.mkdtemp(prefix="qwen3omni_video_")
+        self._video_counter = 0
+
+    def _save_video_bytes_to_file(self, video_bytes: bytes) -> str:
+        """Save video bytes to a temporary file and return the path.
+
+        Args:
+            video_bytes: Raw video bytes (e.g., from finevideo's 'mp4' field).
+
+        Returns:
+            Path to the temporary video file.
+        """
+        video_path = os.path.join(self._temp_dir, f"video_{self._video_counter}.mp4")
+        self._video_counter += 1
+        with open(video_path, "wb") as f:
+            f.write(video_bytes)
+        return video_path
+
+    def preprocess_function(self, examples):
+        """Preprocess function for Qwen3-Omni with video support.
+
+        Handles both standard video paths and raw video bytes (finevideo format).
+        """
+        # Get question/prompt - finevideo has metadata in 'json' field
+        if "json" in examples and examples["json"] is not None:
+            category = examples["json"].get("content_fine_category", "")
+            question = (
+                f"Describe what is happening in this video in detail. Category hint: {category}"
+            )
+        else:
+            question = examples.get("question", "Describe this video in detail.")
+
+        # Build conversation in Qwen format
+        content = []
+
+        # Handle video - check for raw bytes (finevideo format) or path
+        video_path = None
+        if examples.get("mp4") is not None:
+            # finevideo format: raw video bytes in 'mp4' field
+            video_path = self._save_video_bytes_to_file(examples["mp4"])
+        elif examples.get("video") is not None:
+            # Standard format: video path or URL
+            video_path = examples["video"]
+
+        if video_path is not None:
+            content.append({"type": "video", "video": video_path})
+
+        content.append({"type": "text", "text": question})
+
+        conversation = [{"role": "user", "content": content}]
+        text = self.tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+
+        # Extract multimodal info using qwen_omni_utils
+        audios, images, videos = self.process_mm_info(
+            conversation, use_audio_in_video=self.use_audio_in_video
+        )
+
+        # Process inputs with the processor
+        values = self.tokenizer(
+            text=text,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=self.use_audio_in_video,
+        )
+        # Define all possible keys to ensure consistent schema for Arrow serialization
+        all_keys = [
+            "input_ids",
+            "attention_mask",
+            "pixel_values_videos",
+            "video_grid_thw",
+            "video_second_per_grid",
+            "feature_attention_mask",
+            "input_features",
+        ]
+
+        # Convert tensors to lists for Arrow serialization compatibility
+        # Tensor conversion back happens in collate_function
+        result = dict.fromkeys(all_keys)  # Initialize all keys to None
+        for key, val in values.items():
+            if val is not None:
+                result[key] = val.tolist() if hasattr(val, "tolist") else val
+
+        return result
+
+    def collate_function(self, batch):
+        """Build model inputs on ``self.device`` from the first sample of ``batch``."""
+        result = {}
+
+        # Only the first sample is collated; warn rather than silently drop the rest
+        if len(batch) > 1:
+            import warnings
+
+            warnings.warn("Only batch[0] is collated; use batch_size=1.", stacklevel=2)
+        first = batch[0]
+
+        # Convert lists to tensors and move to device
+        if first.get("input_ids") is not None:
+            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
+        if first.get("attention_mask") is not None:
+            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
+
+        # Handle pixel values for video frames
+        if first.get("pixel_values_videos") is not None:
+            pv = torch.tensor(first["pixel_values_videos"])
+            if self.dtype is not None:
+                pv = pv.to(self.dtype)
+            result["pixel_values_videos"] = pv.to(self.device)
+
+        # Handle video grid thw (tile height width info)
+        if first.get("video_grid_thw") is not None:
+            result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device)
+
+        # Handle video second per grid (temporal info for rope)
+        if first.get("video_second_per_grid") is not None:
+            result["video_second_per_grid"] = torch.tensor(first["video_second_per_grid"]).to(
+                self.device
+            )
+
+        # Handle audio features if present
+        if first.get("feature_attention_mask") is not None:
+            result["feature_attention_mask"] = torch.LongTensor(first["feature_attention_mask"]).to(
+                self.device
+            )
+        if first.get("input_features") is not None:
+            inp_feat = torch.tensor(first["input_features"])
+            if self.dtype is not None:
+                inp_feat = inp_feat.to(self.dtype)
+            result["input_features"] = inp_feat.to(self.device)
+
+        # Pass use_audio_in_video flag to model.generate() for Qwen3Omni
+        result["use_audio_in_video"] = self.use_audio_in_video
+
+        return result
+
+    def cleanup(self):
+        """Clean up temporary video files."""
+        import shutil
+
+        if os.path.exists(self._temp_dir):
+            shutil.rmtree(self._temp_dir)

From d334c9a89f6547a336c0dc62ee8da89bbdbe738f Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Tue, 17 Mar 2026 21:30:29 +0000
Subject: [PATCH 08/12] Update documentation for post_quantize

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index cbb8961efd..4091c54493 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -827,11 +827,36 @@ def post_quantize(
     first_text_speech_dataset,
     calib_batch: dict | None = None,
 ):
-    """
-    Processing after the quantization.
+    """Processing after the quantization.
+
+    Runs one round of generation using the quantized model for a sample prompt and
+    compares it with the pre-quantize generation from ``pre_quantize()``.
 
-    Currently we run one round of generation using the quantized model for a sample prompt,
-    and compare it with pre-quantize generation.
+    Args:
+        args: Parsed CLI arguments. Used for ``verbose``, ``quant_summary_path``,
+            ``export_path``, ``pyt_ckpt_path``, and ``skip_generate`` flags.
+        full_model: The quantized model to run post-quantization generation on.
+        model_type: Model architecture identifier (e.g. ``"qwen3omni"``, ``"whisper"``,
+            ``"llama4"``, ``"deepseek"``). Controls model-specific generation and
+            decoding paths. ``None`` for generic models.
+        tokenizer: HF tokenizer for decoding generated token ids. May be ``None`` when
+            a ``processor`` is used instead (e.g. vision-language or speech models).
+        processor: HF image/audio processor for multimodal models. Used for decoding
+            outputs from vision-language (Mllama, Qwen3Omni) and speech (Whisper)
+            models. ``None`` for text-only models.
+        preview_input_ids: Input token ids (single sample) produced by ``pre_quantize()``
+            for the preview generation comparison.
+        generated_ids_before_ptq: Generation output from ``pre_quantize()`` to compare
+            against post-quantization output. ``None`` if generation was skipped.
+        is_nemotron_vl_model: Whether the model is a Nemotron VL model, which uses
+            ``model.chat()`` and returns text strings instead of token tensors.
+        first_text_speech_dataset: Text transcript of the first speech sample, used as
+            the display input for Whisper models since their ``input_ids`` are
+            mel-spectrogram features rather than decodable tokens.
+        calib_batch: Full calibration batch dict from ``pre_quantize``. Required for
+            multimodal models (e.g. Qwen3Omni) whose ``generate()`` needs the complete
+            input dict (audio features, attention masks, etc.) rather than just
+            ``input_ids``. For text-only models this is unused and may be ``None``.
 
     """
 

From 55f1e736e7532e534d07a87592f831e5fe86f48b Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Wed, 18 Mar 2026 02:35:17 +0000
Subject: [PATCH 09/12] Bug fixes

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py           |  7 +++++--
 examples/llm_ptq/run_vllm.py                | 14 +++++++++++---
 modelopt/torch/quantization/model_quant.py  |  1 -
 modelopt/torch/utils/dataset_utils.py       |  4 ++--
 modelopt/torch/utils/image_processor.py     | 16 ++++++++++++----
 modelopt/torch/utils/video_dataset_utils.py |  4 +++-
 6 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index fa71607ad0..b9d3dd5f86 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -45,7 +45,6 @@
 except ImportError:
     snapshot_download = None
 
-import modelopt.torch.quantization as mtq
 from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 from modelopt.torch.utils.image_processor import (
@@ -1074,6 +1073,9 @@ def get_qwen3omni_dataloader(
         num_samples = [512, 512]
 
     if processor is not None:
+        # Normalize single-element list to str for supported-dataset lookups
+        if isinstance(dataset_name, list) and len(dataset_name) == 1:
+            dataset_name = dataset_name[0]
         if dataset_name in get_supported_video_datasets():
             assert isinstance(dataset_name, str)
             video_processor = Qwen3OmniVideoProcessor(
@@ -1093,7 +1095,8 @@ def get_qwen3omni_dataloader(
             assert isinstance(processor, Qwen3OmniImageProcessor), (
                 "The Qwen3OmniImageProcessor must be set."
             )
-            # Set the dtype for proper tensor conversion in collate_function
+            # Set dtype for proper tensor conversion in collate_function.
+            # Processor is created before model_dtype is known, so we set it here.
             processor.dtype = model_dtype
             calib_dataloader = get_vlm_dataset_dataloader(
                 dataset_name=dataset_name,
diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py
index 748c5b13a0..3e69ab9de7 100644
--- a/examples/llm_ptq/run_vllm.py
+++ b/examples/llm_ptq/run_vllm.py
@@ -52,6 +52,12 @@ def main():
     parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling")
     parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)")
     parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate")
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        default=False,
+        help="Trust remote code from HuggingFace model repos",
+    )
 
     args = parser.parse_args()
 
@@ -65,7 +71,7 @@ def main():
 
     # Get max_model_len from config if not specified
     if args.max_model_len is None:
-        config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
+        config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
         args.max_model_len = getattr(config, "max_position_embeddings", 4096)
         print(f"Using max_model_len from config: {args.max_model_len}")
 
@@ -73,7 +79,9 @@ def main():
     tokenizer_id = args.tokenizer or args.model
 
     # Load processor for chat template
-    processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(
+        tokenizer_id, trust_remote_code=args.trust_remote_code
+    )
 
     # Text-only conversations
     conversations = [
@@ -106,7 +114,7 @@ def main():
         tokenizer=tokenizer_id,
         tensor_parallel_size=args.tp,
         max_model_len=args.max_model_len,
-        trust_remote_code=True,
+        trust_remote_code=args.trust_remote_code,
         quantization=quantization,
     )
 
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index 782702703b..a2dae3fbe9 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -17,7 +17,6 @@
 
 import fnmatch
 import inspect
-import os
 import warnings
 from collections.abc import Callable, Iterable
 from typing import Any
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index 842a797afb..cd538111c6 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -606,8 +606,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b
     assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), (
         "tensor_data values must be tensors"
     )
-    # Get the batch size of current data
-    batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0]
+    # Get the batch size from the first non-None tensor value
+    batch_size = next(v for v in tensor_data.values() if v is not None).shape[0]
 
     # If we know a smaller batch size works, preemptively split
     if max_working_batch_size is not None and batch_size > max_working_batch_size:
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
index 07deca7fc4..7691d65951 100644
--- a/modelopt/torch/utils/image_processor.py
+++ b/modelopt/torch/utils/image_processor.py
@@ -175,9 +175,10 @@ def collate_function(self, batch):
 class Qwen3OmniImageProcessor(BaseImageProcessor):
     """Image processor for Qwen3-Omni multimodal model."""
 
-    def __init__(self, tokenizer, device="auto", use_audio_in_video=False):
+    def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False):
         """Constructor."""
         super().__init__(tokenizer, device)
+        self.dtype = dtype
         self.use_audio_in_video = use_audio_in_video
         # Try to import qwen_omni_utils for multimodal processing
         try:
@@ -251,7 +252,8 @@ def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
         result = {}
 
-        # Take first item from batch (batch_size handling)
+        # Take first item only — multimodal inputs have variable-length sequences
+        # (images, audio) that cannot be stacked, so batch_size=1 is expected.
         first = batch[0]
 
         # Convert lists to tensors and move to device
@@ -262,7 +264,10 @@ def collate_function(self, batch):
 
         # Handle pixel values for images
         if first.get("pixel_values") is not None:
-            result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device)
+            pv = torch.tensor(first["pixel_values"])
+            if self.dtype is not None:
+                pv = pv.to(self.dtype)
+            result["pixel_values"] = pv.to(self.device)
 
         # Handle image grid thw (tile height width info)
         if first.get("image_grid_thw") is not None:
@@ -274,7 +279,10 @@ def collate_function(self, batch):
                 self.device
             )
         if first.get("audio_features") is not None:
-            result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device)
+            af = torch.tensor(first["audio_features"])
+            if self.dtype is not None:
+                af = af.to(self.dtype)
+            result["audio_features"] = af.to(self.device)
 
         # Handle video features if present
         if first.get("video_grid_thw") is not None:
diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py
index e022d7e24f..a48c29048b 100644
--- a/modelopt/torch/utils/video_dataset_utils.py
+++ b/modelopt/torch/utils/video_dataset_utils.py
@@ -121,6 +121,7 @@ def get_video_dataset_dataloader(
             try:
                 from datasets import Dataset
 
+                # weights_only=False is safe here: the cache file is self-generated by this module
                 processed_samples = torch.load(cache_path, weights_only=False)
                 processed_dataset = Dataset.from_list(processed_samples)
                 print(f"Loaded processed dataset from cache: {cache_path}")
@@ -282,7 +283,8 @@ def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
         result = {}
 
-        # Take first item from batch (batch_size handling)
+        # Take first item only — multimodal inputs have variable-length sequences
+        # (video frames, audio) that cannot be stacked, so batch_size=1 is expected.
         first = batch[0]
 
         # Convert lists to tensors and move to device

From 9e3b3991c0a2150a387f581b4defa22051b1fb36 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Wed, 18 Mar 2026 03:57:18 +0000
Subject: [PATCH 10/12] Update get_expert_linear_names

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/run_vllm.py         | 1 +
 modelopt/torch/export/layer_utils.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py
index 3e69ab9de7..60cfcb2cd1 100644
--- a/examples/llm_ptq/run_vllm.py
+++ b/examples/llm_ptq/run_vllm.py
@@ -116,6 +116,7 @@ def main():
         max_model_len=args.max_model_len,
         trust_remote_code=args.trust_remote_code,
         quantization=quantization,
+        enforce_eager=True,
     )
 
     # Get sampling params from config, with CLI/defaults as fallback
diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py
index 9a2cd4b2f0..641204d4f7 100755
--- a/modelopt/torch/export/layer_utils.py
+++ b/modelopt/torch/export/layer_utils.py
@@ -972,6 +972,7 @@ def module_match_name_list(module, name_list):
             "Qwen3MoeSparseMoeBlock",
             "Qwen3NextSparseMoeBlock",
             "Qwen3_5MoeSparseMoeBlock",
+            "Qwen3OmniMoeThinkerTextSparseMoeBlock",
             "DeepseekMoE",
         ],
     ):

From 20a4b33d5c0a3459b03e35588386261ea52b877f Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Wed, 1 Apr 2026 20:12:16 +0000
Subject: [PATCH 11/12] Address comments

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 examples/llm_ptq/example_utils.py           |  78 ++------
 examples/llm_ptq/hf_ptq.py                  |  63 ++++---
 modelopt/torch/export/model_utils.py        |  26 ++-
 modelopt/torch/export/unified_export_hf.py  |  17 +-
 modelopt/torch/utils/dataset_utils.py       | 130 ++++++-------
 modelopt/torch/utils/image_processor.py     | 194 +++++++++++---------
 modelopt/torch/utils/video_dataset_utils.py | 115 +++---------
 7 files changed, 288 insertions(+), 335 deletions(-)

diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index b9d3dd5f86..a39acf4c73 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -45,7 +45,7 @@
 except ImportError:
     snapshot_download = None
 
-from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE
+from modelopt.torch.export.model_utils import match_model_type_by_name
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 from modelopt.torch.utils.image_processor import (
     BaseImageProcessor,
@@ -95,19 +95,13 @@ def get_model_type_from_config(model_path: str) -> str | None:
         config = json.load(f)
 
     # Check architectures field first
-    architectures = config.get("architectures", [])
-    for arch in architectures:
-        for key, model_type in MODEL_NAME_TO_TYPE.items():
-            if key.lower() in arch.lower():
-                return model_type
+    for arch in config.get("architectures", []):
+        result = match_model_type_by_name(arch)
+        if result is not None:
+            return result
 
     # Fallback to model_type field
-    model_type_field = config.get("model_type", "")
-    for key, model_type in MODEL_NAME_TO_TYPE.items():
-        if key.lower() in model_type_field.lower():
-            return model_type
-
-    return None
+    return match_model_type_by_name(config.get("model_type", ""))
 
 
 def get_sampling_params_from_config(model_path: str) -> dict:
@@ -164,10 +158,13 @@ def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
 
     print(f"Copying missing tokenizer files from {source_model_id}...")
     # Download only tokenizer files from HF
-    cache_dir = snapshot_download(
-        source_model_id,
-        allow_patterns=TOKENIZER_FILES,
-    )
+    if os.path.isdir(source_model_id):
+        cache_dir = source_model_id
+    else:
+        cache_dir = snapshot_download(
+            source_model_id,
+            allow_patterns=TOKENIZER_FILES,
+        )
 
     for fname in TOKENIZER_FILES:
         src = os.path.join(cache_dir, fname)
@@ -992,55 +989,6 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print("No custom model files found to copy")
 
 
-def patch_config_for_unified_export(model_type: str, export_path: str) -> None:
-    """Patch config files to add missing exclusion patterns for unified HF export.
-
-    This function adds missing exclusion patterns for modules that should not be quantized
-    (e.g., audio tower, visual encoder, lm_head) to both hf_quant_config.json and config.json.
-
-    Args:
-        export_path: Path to the exported model directory.
-    """
-    if model_type == "qwen3omni":
-        missing_patterns = [
-            "thinker.audio_tower*",
-            "thinker.visual*",
-            "thinker.lm_head",
-        ]
-
-        # (filename, path_to_exclude_list)
-        configs = [
-            ("hf_quant_config.json", ["quantization", "exclude_modules"]),
-            ("config.json", ["quantization_config", "ignore"]),
-        ]
-
-        for filename, keys in configs:
-            filepath = os.path.join(export_path, filename)
-            if not os.path.exists(filepath):
-                continue
-            try:
-                with open(filepath) as f:
-                    config = json.load(f)
-
-                # Navigate to nested key
-                target = config
-                for key in keys[:-1]:
-                    target = target.get(key, {})
-
-                exclude_list = target.get(keys[-1])
-                if exclude_list is None:
-                    continue
-
-                added = [p for p in missing_patterns if p not in exclude_list]
-                if added:
-                    exclude_list.extend(added)
-                    with open(filepath, "w") as f:
-                        json.dump(config, f, indent=2)
-                    print(f"Patched {filename} with exclusions: {added}")
-            except Exception as e:
-                print(f"Warning: Failed to patch {filename}: {e}")
-
-
 def get_qwen3omni_dataloader(
     dataset_name: str | list[str] | None,
     processor: Qwen3OmniImageProcessor | None,
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 4091c54493..2d441f4b35 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -18,6 +18,7 @@
 import random
 import time
 import warnings
+from collections import namedtuple
 from typing import Any
 
 import numpy as np
@@ -35,7 +36,6 @@
     is_enc_dec,
     is_nemotron_vl,
     load_mtp_weights,
-    patch_config_for_unified_export,
     run_nemotron_vl_preview,
 )
 from torch.utils.data import DataLoader
@@ -735,9 +735,6 @@ def export_quantized(
                     extra_state_dict=mtp_state_dict,
                 )
 
-            # Exclude non-quantized modules in config.json and hf_quant_config.json
-            patch_config_for_unified_export(model_type, export_path)
-
         # Restore default padding and export the tokenizer as well.
         if tokenizer is not None:
             tokenizer.padding_side = default_padding_side
@@ -757,6 +754,23 @@ def export_quantized(
         )
 
 
+PreQuantizeResult = namedtuple(
+    "PreQuantizeResult", ["preview_input_ids", "generated_ids_before_ptq", "calib_batch"]
+)
+
+
+def _qwen3omni_generate(model, calib_batch):
+    """Run Qwen3Omni generate and unpack the result.
+
+    Qwen3Omni returns a (text_ids, audio) tuple; text_ids may have a .sequences attribute.
+    """
+    result = model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
+    if isinstance(result, tuple):
+        text_ids, _ = result
+        return text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
+    return result
+
+
 def pre_quantize(
     args: argparse.Namespace,
     full_model: torch.nn.Module,
@@ -799,20 +813,15 @@ def pre_quantize(
             allow_fallback=False,
         )
     elif model_type == "qwen3omni":
-        # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
-        # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
-        if isinstance(result, tuple):
-            text_ids, _ = result
-            generated_ids_before_ptq = (
-                text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
-            )
-        else:
-            generated_ids_before_ptq = result
+        # Use only a single sample for preview generation to avoid OOM
+        single_sample = {
+            k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items()
+        }
+        generated_ids_before_ptq = _qwen3omni_generate(full_model, single_sample)
     else:
         generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
 
-    return preview_input_ids, generated_ids_before_ptq, calib_batch
+    return PreQuantizeResult(preview_input_ids, generated_ids_before_ptq, calib_batch)
 
 
 def post_quantize(
@@ -861,25 +870,23 @@ def post_quantize(
     """
 
     if args.verbose:
-        mtq.print_quant_summary(full_model, save_path=args.quant_summary_path)
-        save_expert_token_count_table(full_model, args.export_path)
+        try:
+            mtq.print_quant_summary(full_model, save_path=args.quant_summary_path)
+            save_expert_token_count_table(full_model, args.export_path)
+        except Exception as e:
+            print(f"Warning: Failed to print quant summary: {e}")
 
     # Run some samples
     torch.cuda.empty_cache()
     generated_ids_after_ptq = None
     if generated_ids_before_ptq is None:
         pass
-    elif model_type == "qwen3omni":
-        # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences
-        # Pass full batch with all multimodal inputs
-        result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100)
-        if isinstance(result, tuple):
-            text_ids, _ = result
-            generated_ids_after_ptq = (
-                text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
-            )
-        else:
-            generated_ids_after_ptq = result
+    elif model_type == "qwen3omni" and calib_batch is not None:
+        # Use only a single sample for preview generation to avoid OOM
+        single_sample = {
+            k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items()
+        }
+        generated_ids_after_ptq = _qwen3omni_generate(full_model, single_sample)
     elif model_type != "llama4" and not is_nemotron_vl_model:
         # Our fake quantizer may not be fully compatible with torch.compile.
         generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100)
diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py
index 7501ed7bbc..17798d0837 100755
--- a/modelopt/torch/export/model_utils.py
+++ b/modelopt/torch/export/model_utils.py
@@ -67,17 +67,35 @@
         {MODEL_NAME_TO_TYPE=}
 """
 
-__all__ = ["get_language_model_from_vl", "get_model_type", "is_multimodal_model"]
+__all__ = [
+    "get_language_model_from_vl",
+    "get_model_type",
+    "is_multimodal_model",
+    "match_model_type_by_name",
+]
 
 
-def get_model_type(model):
-    """Try get the model type from the model name. If not found, return None."""
+def match_model_type_by_name(name: str) -> str | None:
+    """Match a model type from MODEL_NAME_TO_TYPE by case-insensitive substring match.
+
+    Args:
+        name: String to match against (e.g. class name, architecture string, model_type field).
+
+    Returns:
+        Matched model type string, or None.
+    """
+    name_lower = name.lower()
     for k, v in MODEL_NAME_TO_TYPE.items():
-        if k.lower() in type(model).__name__.lower():
+        if k.lower() in name_lower:
             return v
     return None
 
 
+def get_model_type(model):
+    """Try to get the model type from the model's class name. If not found, return None."""
+    return match_model_type_by_name(type(model).__name__)
+
+
 def is_multimodal_model(model):
     """Check if a model is a Vision-Language Model (VLM) or multimodal model.
 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index e8a1e06857..c60469a587 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -87,7 +87,7 @@
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_NVFP4_FP8,
 )
-from .model_utils import get_language_model_from_vl, is_multimodal_model
+from .model_utils import get_language_model_from_vl, get_model_type, is_multimodal_model
 from .plugins import SpeculativeDecodingExporter, has_spec_opt
 from .quant_utils import (
     fuse_prequant_layernorm,
@@ -781,6 +781,16 @@ def _export_transformers_checkpoint(
                 exclude_modules.append(pattern)
                 print(f"Adding MTP layer to quantization_config ignore: {pattern}")
 
+    # Add model-specific non-quantized module exclusions
+    _model_type_exclusions = {
+        "qwen3omni": ["thinker.audio_tower*", "thinker.visual*", "thinker.lm_head"],
+    }
+    model_type = get_model_type(model)
+    for pattern in _model_type_exclusions.get(model_type, []):
+        exclude_modules = quant_config["quantization"].setdefault("exclude_modules", [])
+        if pattern not in exclude_modules:
+            exclude_modules.append(pattern)
+
     # Safety net: sync any gate/up weight quantizer amaxes that
     # requantize_resmooth_fused_llm_layers did not reach (e.g. experts not
     # activated during the dummy forward, or non-standard expert naming).
@@ -1185,6 +1195,8 @@ def export_hf_checkpoint(
 
         # Fix generation_config conflicts before saving
         # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors
+        # Restore the original value after save to avoid mutating the caller's model.
+        _gen_config_restore = None
         if hasattr(model, "generation_config") and model.generation_config is not None:
             gen_config = model.generation_config
             if not getattr(gen_config, "do_sample", True):
@@ -1193,6 +1205,7 @@ def export_hf_checkpoint(
                     getattr(gen_config, attr, None) is not None
                     for attr in ["temperature", "top_p", "top_k"]
                 ):
+                    _gen_config_restore = gen_config.do_sample
                     gen_config.do_sample = True
 
         # Save model
@@ -1211,6 +1224,8 @@ def export_hf_checkpoint(
             )
         finally:
             _unpatch_revert_weight_conversion(_patches)
+            if _gen_config_restore is not None:
+                model.generation_config.do_sample = _gen_config_restore
 
         original_config = f"{export_dir}/config.json"
         config_data = {}
diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py
index cd538111c6..f5a64054fe 100644
--- a/modelopt/torch/utils/dataset_utils.py
+++ b/modelopt/torch/utils/dataset_utils.py
@@ -212,6 +212,44 @@ def _auto_preprocess_sample(
     )
 
 
+def _load_text_samples(dataset_name, num_samples, **kwargs):
+    """Normalize inputs and load raw text samples from one or more datasets.
+
+    Args:
+        dataset_name: Single name or list of names.
+        num_samples: Single count or list of counts (must match dataset_name length).
+        **kwargs: Forwarded to get_dataset_samples().
+
+    Returns:
+        List of raw text strings.
+    """
+    if isinstance(num_samples, int):
+        num_samples = [num_samples]
+    if isinstance(dataset_name, str):
+        dataset_name = [dataset_name]
+    assert len(dataset_name) == len(num_samples), (
+        "dataset_name and num_samples must be the same length"
+    )
+    all_samples = []
+    for ds_name, num_sample in zip(dataset_name, num_samples):
+        samples = get_dataset_samples(ds_name, num_sample, **kwargs)
+        all_samples.extend(samples)
+    return all_samples
+
+
+class _ListDataset(torch.utils.data.Dataset):
+    """Simple dataset wrapping a list of dicts."""
+
+    def __init__(self, samples):
+        self.samples = samples
+
+    def __getitem__(self, idx):
+        return self.samples[idx]
+
+    def __len__(self):
+        return len(self.samples)
+
+
 def get_qwen3omni_text_dataloader(
     dataset_name: str | list[str] = "cnn_dailymail",
     processor=None,
@@ -236,59 +274,25 @@ def get_qwen3omni_text_dataloader(
     """
     assert processor is not None, "Please provide a Qwen3OmniTextProcessor."
 
-    if isinstance(num_samples, int):
-        num_samples = [num_samples]
-
-    if isinstance(dataset_name, str):
-        dataset_name = [dataset_name]
-
-    assert len(dataset_name) == len(num_samples), (
-        "dataset_name and num_samples must be the same length"
-    )
+    all_samples = _load_text_samples(dataset_name, num_samples)
 
-    # Get raw text samples
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(ds_name, num_sample)
-        all_samples.extend(samples)
+    # Preprocess each sample with the conversation template and convert to lists
+    from .image_processor import _Qwen3OmniProcessorMixin
 
-    # Preprocess each sample with the conversation template
     processed_samples = []
     for text in all_samples:
-        # Apply conversation template and tokenize
         values = processor.preprocess_function(text)
+        processed_samples.append(
+            _Qwen3OmniProcessorMixin._serialize_for_arrow(values, list(values.keys()))
+        )
 
-        # Convert to lists for dataset compatibility
-        sample_dict = {}
-        for key, val in values.items():
-            if val is not None and hasattr(val, "tolist"):
-                sample_dict[key] = val.tolist()
-            elif val is not None:
-                sample_dict[key] = val
-        processed_samples.append(sample_dict)
-
-    # Create dataset
-    class _Qwen3OmniTextDataset(torch.utils.data.Dataset):
-        def __init__(self, samples):
-            self.samples = samples
-
-        def __getitem__(self, idx):
-            return self.samples[idx]
-
-        def __len__(self):
-            return len(self.samples)
-
-    dataset = _Qwen3OmniTextDataset(processed_samples)
-
-    calib_dataloader = DataLoader(
-        dataset,
+    return DataLoader(
+        _ListDataset(processed_samples),
         batch_size=batch_size,
         shuffle=False,
         collate_fn=processor.collate_function,
     )
 
-    return calib_dataloader
-
 
 def get_dataset_samples(
     dataset_name: str,
@@ -446,23 +450,13 @@ def get_dataset_dataloader(
             "Tokenizer with the right padding_side may impact calibration accuracy. Recommend set to left"
         )
 
-    if isinstance(num_samples, int):
-        num_samples = [num_samples]
-
-    if isinstance(dataset_name, str):
-        dataset_name = [dataset_name]
-
-    assert len(dataset_name) == len(num_samples), (
-        "dataset_name and num_samples must be the same length"
+    all_samples = _load_text_samples(
+        dataset_name,
+        num_samples,
+        apply_chat_template=apply_chat_template,
+        tokenizer=tokenizer,
     )
 
-    all_samples = []
-    for ds_name, num_sample in zip(dataset_name, num_samples):
-        samples = get_dataset_samples(
-            ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer
-        )
-        all_samples.extend(samples)
-
     batch_encoded = tokenizer(
         all_samples,
         return_tensors="pt",
@@ -531,7 +525,7 @@ def _get_free_gpu_mem():
     torch.cuda.empty_cache()
 
     free_mem_before, max_allocated_before = _get_free_gpu_mem()
-    use_generate = model_type_is_enc_dec(model)
+    use_generate = _should_use_generate(model)
     infer_method = model.generate if use_generate else model.forward
 
     if sample_input_single_batch is None:
@@ -587,7 +581,7 @@ def _get_free_gpu_mem():
         return 512
 
 
-def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_batch_size=None):
+def _process_batch(batch_data, infer_method, generation_kwargs=None, max_working_batch_size=None):
     """Process a batch of data through the model's inference method.
 
     Args:
@@ -599,6 +593,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b
     Returns:
         The maximum batch size that worked successfully
     """
+    if generation_kwargs is None:
+        generation_kwargs = {}
     # Separate tensor values from scalar parameters (like max_new_tokens)
     tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None}
     scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None}
@@ -663,7 +659,7 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b
 
 
 def _forward_loop(
-    model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict = {}
+    model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict | None = None
 ) -> None:
     """Runs forward passes through the model using data from the dataloader.
 
@@ -672,9 +668,10 @@ def _forward_loop(
         dataloader: DataLoader containing the batched input data
         generation_kwargs: Keyword arguments to pass to the model.generate() method.
     """
+    if generation_kwargs is None:
+        generation_kwargs = {}
     with torch.no_grad():
-        # use_generate = _should_use_generate(model)
-        use_generate = model_type_is_enc_dec(model)
+        use_generate = _should_use_generate(model)
         infer_method = model.generate if use_generate else model.forward
         max_working_batch_size = None  # Initialize max working batch size as None
 
@@ -695,7 +692,7 @@ def create_forward_loop(
     device: str | None = None,
     include_labels: bool = False,
     dataloader: DataLoader | None = None,
-    generation_kwargs: dict = {},
+    generation_kwargs: dict | None = None,
 ) -> Callable:
     """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer.
 
@@ -737,6 +734,8 @@ def create_forward_loop(
         A forward loop function that can be called with no arguments. When called, this function iterates over
             the dataset specified by `dataset_name`.
     """
+    if generation_kwargs is None:
+        generation_kwargs = {}
     if dataloader is None:
         if batch_size == 0:
             # We let the system to determine the max data batch for each forward.
@@ -860,4 +859,7 @@ def _should_use_generate(model):
     """
     generate_model_list = ["qwen3omni"]
     model_name = model.__class__.__name__.lower()
-    return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list)
+    needs_generate = model_type_is_enc_dec(model) or any(
+        name in model_name for name in generate_model_list
+    )
+    return needs_generate and hasattr(model, "generate")
diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py
index 7691d65951..2f226e41c5 100644
--- a/modelopt/torch/utils/image_processor.py
+++ b/modelopt/torch/utils/image_processor.py
@@ -16,6 +16,8 @@
 # Adapted from tensorrt_llm/quantization/image_processing.py
 """Utility classes for image processing."""
 
+from typing import Any
+
 import torch
 
 
@@ -39,6 +41,33 @@ def collate_function(self, examples):
         """Collate function to process images during data loading."""
         raise NotImplementedError("Each image processor must implement its own collate method")
 
+    def _collate_first_item(self, batch, long_keys=(), float_keys=(), dtype=None):
+        """Collate a one-sample batch into tensors on self.device (ValueError otherwise).
+
+        Args:
+            batch: List of sample dicts from the DataLoader; must contain exactly one sample.
+            long_keys: Keys to convert via torch.LongTensor.
+            float_keys: Keys to convert via torch.tensor; every one is cast to dtype if set.
+            dtype: Optional dtype for float_keys tensors; long_keys are never cast.
+
+        Returns:
+            Dict of tensors moved to self.device; keys missing or None in the sample are omitted.
+        """
+        if len(batch) != 1:
+            raise ValueError(f"{type(self).__name__} currently supports batch_size=1 only.")
+        first = batch[0]
+        result = {}
+        for key in long_keys:
+            if first.get(key) is not None:
+                result[key] = torch.LongTensor(first[key]).to(self.device)
+        for key in float_keys:
+            if first.get(key) is not None:
+                t = torch.tensor(first[key])
+                if dtype is not None:
+                    t = t.to(dtype)
+                result[key] = t.to(self.device)
+        return result
+
 
 # A light Encapsulation for Huggingface MllamaImageProcessor
 
@@ -161,20 +190,77 @@ def preprocess_function(self, text: str) -> dict:
 
     def collate_function(self, batch):
         """Collate function to process text inputs during data loading."""
-        result = {}
-        first = batch[0]
+        return self._collate_first_item(
+            batch,
+            long_keys=("input_ids", "attention_mask"),
+        )
+
+
+class _Qwen3OmniProcessorMixin:
+    """Shared preprocessing logic for Qwen3-Omni image/video processors."""
+
+    tokenizer: Any
+    process_mm_info: Any
+    use_audio_in_video: Any
 
-        if "input_ids" in first and first["input_ids"] is not None:
-            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
-        if "attention_mask" in first and first["attention_mask"] is not None:
-            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
+    def _tokenize_conversation(self, conversation):
+        """Tokenize a Qwen3-Omni conversation and return processor outputs.
 
+        Args:
+            conversation: List of conversation dicts in Qwen format.
+
+        Returns:
+            Processor output dict with tensors.
+        """
+        text = self.tokenizer.apply_chat_template(
+            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
+        )
+        audios, images, videos = self.process_mm_info(
+            conversation, use_audio_in_video=self.use_audio_in_video
+        )
+        return self.tokenizer(
+            text=text,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=self.use_audio_in_video,
+        )
+
+    @staticmethod
+    def _serialize_for_arrow(values, all_keys):
+        """Convert processor outputs to plain Python values for Arrow serialization.
+
+        Args:
+            values: Processor output dict; entries exposing tolist() are converted to lists.
+            all_keys: Keys always present in the result, keeping the schema consistent.
+
+        Returns:
+            Dict with all_keys preset to None, updated with each non-None entry of values (any key).
+        """
+        result = dict.fromkeys(all_keys)
+        for key, val in values.items():
+            if val is not None and hasattr(val, "tolist"):
+                result[key] = val.tolist()
+            elif val is not None:
+                result[key] = val
+        return result
 
 
-class Qwen3OmniImageProcessor(BaseImageProcessor):
+class Qwen3OmniImageProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor):
     """Image processor for Qwen3-Omni multimodal model."""
 
+    _ALL_KEYS = [
+        "input_ids",
+        "attention_mask",
+        "pixel_values",
+        "image_grid_thw",
+        "audio_features",
+        "audio_feature_lens",
+        "video_grid_thw",
+    ]
+
     def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False):
         """Constructor."""
         super().__init__(tokenizer, device)
@@ -206,86 +292,20 @@ def preprocess_function(self, examples):
         content.append({"type": "text", "text": question})
 
         conversation = [{"role": "user", "content": content}]
-        text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
-        )
-
-        # Extract multimodal info using qwen_omni_utils
-        audios, images, videos = self.process_mm_info(
-            conversation, use_audio_in_video=self.use_audio_in_video
-        )
-
-        # Process inputs with the processor
-        values = self.tokenizer(
-            text=text,
-            audio=audios,
-            images=images,
-            videos=videos,
-            return_tensors="pt",
-            padding=True,
-            use_audio_in_video=self.use_audio_in_video,
-        )
-
-        # Define all possible keys to ensure consistent schema for Arrow serialization
-        all_keys = [
-            "input_ids",
-            "attention_mask",
-            "pixel_values",
-            "image_grid_thw",
-            "audio_features",
-            "audio_feature_lens",
-            "video_grid_thw",
-        ]
-
-        # Convert tensors to lists for Arrow serialization compatibility
-        # Tensor conversion back happens in collate_function
-        result = dict.fromkeys(all_keys)  # Initialize all keys to None
-        for key, val in values.items():
-            if val is not None and hasattr(val, "tolist"):
-                result[key] = val.tolist()
-            elif val is not None:
-                result[key] = val
-
-        return result
+        values = self._tokenize_conversation(conversation)
+        return self._serialize_for_arrow(values, self._ALL_KEYS)
 
     def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
-        result = {}
-
-        # Take first item only — multimodal inputs have variable-length sequences
-        # (images, audio) that cannot be stacked, so batch_size=1 is expected.
-        first = batch[0]
-
-        # Convert lists to tensors and move to device
-        if "input_ids" in first and first["input_ids"] is not None:
-            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
-        if "attention_mask" in first and first["attention_mask"] is not None:
-            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
-
-        # Handle pixel values for images
-        if first.get("pixel_values") is not None:
-            pv = torch.tensor(first["pixel_values"])
-            if self.dtype is not None:
-                pv = pv.to(self.dtype)
-            result["pixel_values"] = pv.to(self.device)
-
-        # Handle image grid thw (tile height width info)
-        if first.get("image_grid_thw") is not None:
-            result["image_grid_thw"] = torch.LongTensor(first["image_grid_thw"]).to(self.device)
-
-        # Handle audio features if present
-        if first.get("audio_feature_lens") is not None:
-            result["audio_feature_lens"] = torch.LongTensor(first["audio_feature_lens"]).to(
-                self.device
-            )
-        if first.get("audio_features") is not None:
-            af = torch.tensor(first["audio_features"])
-            if self.dtype is not None:
-                af = af.to(self.dtype)
-            result["audio_features"] = af.to(self.device)
-
-        # Handle video features if present
-        if first.get("video_grid_thw") is not None:
-            result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device)
-
-        return result
+        return self._collate_first_item(
+            batch,
+            long_keys=(
+                "input_ids",
+                "attention_mask",
+                "image_grid_thw",
+                "audio_feature_lens",
+                "video_grid_thw",
+            ),
+            float_keys=("pixel_values", "audio_features"),
+            dtype=self.dtype,
+        )
diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py
index a48c29048b..d8b02b7ee1 100644
--- a/modelopt/torch/utils/video_dataset_utils.py
+++ b/modelopt/torch/utils/video_dataset_utils.py
@@ -22,7 +22,7 @@
 import torch
 from torch.utils.data import DataLoader
 
-from .image_processor import BaseImageProcessor
+from .image_processor import BaseImageProcessor, _Qwen3OmniProcessorMixin
 
 # Use dict to store the config for each dataset.
 SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = {
@@ -161,7 +161,7 @@ def get_video_dataset_dataloader(
     )
 
 
-class Qwen3OmniVideoProcessor(BaseImageProcessor):
+class Qwen3OmniVideoProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor):
     """Video processor for Qwen3-Omni multimodal model with finevideo dataset support."""
 
     def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True):
@@ -204,6 +204,16 @@ def _save_video_bytes_to_file(self, video_bytes: bytes) -> str:
             f.write(video_bytes)
         return video_path
 
+    _ALL_KEYS = [
+        "input_ids",
+        "attention_mask",
+        "pixel_values_videos",
+        "video_grid_thw",
+        "video_second_per_grid",
+        "feature_attention_mask",
+        "input_features",
+    ]
+
     def preprocess_function(self, examples):
         """Preprocess function for Qwen3-Omni with video support.
 
@@ -212,7 +222,6 @@ def preprocess_function(self, examples):
         # Get question/prompt - finevideo has metadata in 'json' field
         if "json" in examples and examples["json"] is not None:
             metadata = examples["json"]
-            # Try to get a meaningful question from metadata
             category = metadata.get("content_fine_category", "")
             question = (
                 f"Describe what is happening in this video in detail. Category hint: {category}"
@@ -226,10 +235,8 @@ def preprocess_function(self, examples):
         # Handle video - check for raw bytes (finevideo format) or path
         video_path = None
         if examples.get("mp4") is not None:
-            # finevideo format: raw video bytes in 'mp4' field
             video_path = self._save_video_bytes_to_file(examples["mp4"])
         elif examples.get("video") is not None:
-            # Standard format: video path or URL
             video_path = examples["video"]
 
         if video_path is not None:
@@ -238,92 +245,24 @@ def preprocess_function(self, examples):
         content.append({"type": "text", "text": question})
 
         conversation = [{"role": "user", "content": content}]
-        text = self.tokenizer.apply_chat_template(
-            conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False
-        )
-
-        # Extract multimodal info using qwen_omni_utils
-        audios, images, videos = self.process_mm_info(
-            conversation, use_audio_in_video=self.use_audio_in_video
-        )
-
-        # Process inputs with the processor
-        values = self.tokenizer(
-            text=text,
-            audio=audios,
-            images=images,
-            videos=videos,
-            return_tensors="pt",
-            padding=True,
-            use_audio_in_video=self.use_audio_in_video,
-        )
-        # Define all possible keys to ensure consistent schema for Arrow serialization
-        all_keys = [
-            "input_ids",
-            "attention_mask",
-            "pixel_values_videos",
-            "video_grid_thw",
-            "video_second_per_grid",
-            "feature_attention_mask",
-            "input_features",
-        ]
-
-        # Convert tensors to lists for Arrow serialization compatibility
-        # Tensor conversion back happens in collate_function
-        result = dict.fromkeys(all_keys)  # Initialize all keys to None
-        for key, val in values.items():
-            if val is not None and hasattr(val, "tolist"):
-                result[key] = val.tolist()
-            elif val is not None:
-                result[key] = val
-
-        return result
+        values = self._tokenize_conversation(conversation)
+        return self._serialize_for_arrow(values, self._ALL_KEYS)
 
     def collate_function(self, batch):
         """Collate function to process inputs during data loading."""
-        result = {}
-
-        # Take first item only — multimodal inputs have variable-length sequences
-        # (video frames, audio) that cannot be stacked, so batch_size=1 is expected.
-        first = batch[0]
-
-        # Convert lists to tensors and move to device
-        if first.get("input_ids") is not None:
-            result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device)
-        if first.get("attention_mask") is not None:
-            result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device)
-
-        # Handle pixel values for video frames
-        if first.get("pixel_values_videos") is not None:
-            pv = torch.tensor(first["pixel_values_videos"])
-            if self.dtype is not None:
-                pv = pv.to(self.dtype)
-            result["pixel_values_videos"] = pv.to(self.device)
-
-        # Handle video grid thw (tile height width info)
-        if first.get("video_grid_thw") is not None:
-            result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device)
-
-        # Handle video second per grid (temporal info for rope)
-        if first.get("video_second_per_grid") is not None:
-            result["video_second_per_grid"] = torch.tensor(first["video_second_per_grid"]).to(
-                self.device
-            )
-
-        # Handle audio features if present
-        if first.get("feature_attention_mask") is not None:
-            result["feature_attention_mask"] = torch.LongTensor(first["feature_attention_mask"]).to(
-                self.device
-            )
-        if first.get("input_features") is not None:
-            inp_feat = torch.tensor(first["input_features"])
-            if self.dtype is not None:
-                inp_feat = inp_feat.to(self.dtype)
-            result["input_features"] = inp_feat.to(self.device)
-
+        long_keys = ("input_ids", "attention_mask", "video_grid_thw", "feature_attention_mask")
+        result = self._collate_first_item(
+            batch,
+            long_keys=long_keys,
+            float_keys=("pixel_values_videos", "input_features"),
+            dtype=self.dtype,
+        )
+        # video_second_per_grid is temporal info for rope, not a feature tensor: collate it
+        # without dtype so it is not down-cast to self.dtype along with the features.
+        second_per_grid = self._collate_first_item(batch, float_keys=("video_second_per_grid",))
+        result.update(second_per_grid)
         # Pass use_audio_in_video flag to model.generate() for Qwen3Omni
         result["use_audio_in_video"] = self.use_audio_in_video
-
         return result
 
     def cleanup(self):
@@ -332,3 +271,10 @@ def cleanup(self):
 
         if os.path.exists(self._temp_dir):
             shutil.rmtree(self._temp_dir)
+
+    def __del__(self):
+        """Best-effort removal of temporary files when the processor is garbage collected."""
+        # __del__ also runs on partially constructed objects (e.g. when __init__ raised
+        # before _temp_dir was assigned); skip cleanup then instead of raising.
+        if getattr(self, "_temp_dir", None) is not None:
+            self.cleanup()

From ee95177dbcf0018ab0ef10952d9c90cf6cd8bcbc Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Thu, 2 Apr 2026 00:04:44 +0000
Subject: [PATCH 12/12] Remove manual registration of sparse moe block

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 modelopt/torch/quantization/plugins/huggingface.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py
index 0a451e8ccb..0d02716a6e 100644
--- a/modelopt/torch/quantization/plugins/huggingface.py
+++ b/modelopt/torch/quantization/plugins/huggingface.py
@@ -1180,19 +1180,6 @@ def unpack_weight(self):
     pass
 
 
-try:
-    from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import (
-        Qwen3OmniMoeThinkerTextSparseMoeBlock,
-    )
-
-    if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry:
-        QuantModuleRegistry.register(
-            {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"}
-        )(_QuantSparseMoe)
-except ImportError:
-    pass
-
-
 class _QuantGptOssExperts(_QuantFunctionalMixin):
     """Quantized wrapper for `transformers.GptOssExperts`.