From a9cf8ebc9009c1f700af44b4204d74fbb2b45a24 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 5 Feb 2026 00:58:19 +0000 Subject: [PATCH 01/12] Add support for Qwen3Omni30B thinking model Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 26 ++- examples/llm_ptq/hf_ptq.py | 155 ++++++++++++++-- modelopt/torch/export/model_utils.py | 58 +++--- modelopt/torch/export/unified_export_hf.py | 10 + .../torch/quantization/plugins/huggingface.py | 18 ++ modelopt/torch/utils/__init__.py | 1 + modelopt/torch/utils/dataset_utils.py | 127 +++++++++++-- modelopt/torch/utils/image_processor.py | 171 ++++++++++++++++++ 8 files changed, 496 insertions(+), 70 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 58eb676111..5fe9ab5ad6 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -45,7 +45,12 @@ except ImportError: snapshot_download = None -from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor +import modelopt.torch.quantization as mtq +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, +) logger = logging.getLogger(__name__) @@ -284,7 +289,7 @@ def get_processor( if attn_implementation is not None: model_kwargs["attn_implementation"] = attn_implementation - if model_type == "whisper": + if model_type in ("whisper", "mllama", "qwen3omni"): processor = AutoProcessor.from_pretrained( ckpt_path, padding_side="left", @@ -296,20 +301,11 @@ def get_processor( f"Pad token for {ckpt_path} cannot be set!" 
) + if model_type == "mllama": + return MllamaImageProcessor(processor, device) + elif model_type == "qwen3omni": + return Qwen3OmniImageProcessor(processor, device) return processor - elif model_type == "mllama": - processor = AutoProcessor.from_pretrained( - ckpt_path, - padding_side="left", - **model_kwargs, - ) - if processor.tokenizer.pad_token is None: - processor.tokenizer.pad_token = processor.tokenizer.eos_token - assert processor.tokenizer.pad_token is not None, ( - f"Pad token for {ckpt_path} cannot be set!" - ) - - return MllamaImageProcessor(processor, device) else: # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) try: diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b81dc60c01..4f7d3430f8 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -15,7 +15,9 @@ import argparse import copy +import io import random +import sys import time import warnings from typing import Any @@ -68,12 +70,26 @@ create_forward_loop, get_dataset_dataloader, get_max_batch_size, + get_qwen3omni_text_dataloader, get_supported_datasets, ) -from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor +from modelopt.torch.utils.image_processor import ( + BaseImageProcessor, + MllamaImageProcessor, + Qwen3OmniImageProcessor, + Qwen3OmniTextProcessor, +) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader -from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader +from modelopt.torch.utils.video_dataset_utils import ( + Qwen3OmniVideoProcessor, + get_supported_video_datasets, + get_video_dataset_dataloader, +) +from modelopt.torch.utils.vlm_dataset_utils import ( + get_supported_vlm_datasets, + get_vlm_dataset_dataloader, +) RAND_SEED = 1234 @@ -208,6 +224,51 @@ def make_calib_dataloader( batch_size=args.batch_size, num_samples=args.calib_size[0], ) + elif 
model_type == "qwen3omni": + assert processor is not None, "The processor must be set for qwen3omni model." + dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail" + # Check if using video dataset (e.g., finevideo) + if dataset_name in get_supported_video_datasets(): + video_processor = Qwen3OmniVideoProcessor( + processor.tokenizer if hasattr(processor, "tokenizer") else processor, + device=device, + dtype=language_model.dtype, + use_audio_in_video=True, + ) + calib_dataloader = get_video_dataset_dataloader( + dataset_name=dataset_name, + processor=video_processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) + elif dataset_name in get_supported_vlm_datasets(): + assert isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + # Set the dtype for proper tensor conversion in collate_function + processor.dtype = language_model.dtype + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=dataset_name, + processor=processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) + else: + # Text-only datasets (e.g., cnn_dailymail) + # Use Qwen3OmniTextProcessor to apply proper conversation template + # See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + text_processor = Qwen3OmniTextProcessor( + processor=processor.tokenizer, # Pass the underlying HF processor + device=device, + dtype=language_model.dtype, + ) + calib_dataloader = get_qwen3omni_text_dataloader( + dataset_name=dataset_name, + processor=text_processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) + print(f"Selected dataset for calibration: {dataset_name}") elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." 
@@ -391,6 +452,9 @@ def load_model(args: argparse.Namespace): calibration_only = True model_type = get_model_type(full_model) + if model_type == "qwen3omni": + print("Disabling talker for Qwen3Omni model") + full_model.disable_talker() device = full_model.device if hasattr(full_model, "model"): @@ -408,7 +472,7 @@ def load_model(args: argparse.Namespace): print("Nemotron VL model detected. Enabling image-text calibration by default.") args.calib_with_images = True - if model_type == "mllama": + if model_type in ["mllama", "qwen3omni"]: processor = get_processor( args.pyt_ckpt_path, model_type, @@ -555,6 +619,15 @@ def mono_quantize( quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") + # For Qwen3Omni models, disable quantization of conv layers + if model_type == "qwen3omni": + print( + "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" + ) + quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} + quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} + quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + if not model_is_already_quantized or calibration_only: # quantize the model @@ -735,9 +808,10 @@ def pre_quantize( """ # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + calib_batch = next(iter(calib_dataloader)) + preview_input_ids = calib_batch["input_features" if model_type == "whisper" else "input_ids"][ + 0:1 + ] # Generate preview before quantization if args.skip_generate: @@ -759,10 +833,21 @@ def pre_quantize( "before quantization", allow_fallback=False, ) + elif model_type == "qwen3omni": + # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) + if 
isinstance(result, tuple): + text_ids, _ = result + generated_ids_before_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_before_ptq = result else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) - return preview_input_ids, generated_ids_before_ptq + return preview_input_ids, generated_ids_before_ptq, calib_batch def post_quantize( @@ -775,6 +860,7 @@ def post_quantize( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + calib_batch: dict | None = None, ): """ Processing after the quantization. @@ -785,18 +871,38 @@ def post_quantize( """ if args.verbose: - try: + if args.quant_summary_path: + # Capture the summary output to a file + old_stdout = sys.stdout + sys.stdout = buffer = io.StringIO() + try: + mtq.print_quant_summary(full_model, args.export_path) + finally: + sys.stdout = old_stdout + summary = buffer.getvalue() + with open(args.quant_summary_path, "w") as f: + f.write(summary) + print(f"Quantization summary saved to {args.quant_summary_path}") + else: mtq.print_quant_summary(full_model, args.export_path) - save_expert_token_count_table(full_model, args.export_path) - except Exception as e: - print(f"Error saving quant summary: {e}") - print("Continuing with generation...") + save_expert_token_count_table(full_model, args.export_path) # Run some samples torch.cuda.empty_cache() generated_ids_after_ptq = None if generated_ids_before_ptq is None: pass + elif model_type == "qwen3omni": + # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences + # Pass full batch with all multimodal inputs + result = full_model.generate(**calib_batch, max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + generated_ids_after_ptq = ( + text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + ) + else: + generated_ids_after_ptq = result elif model_type != "llama4" and not is_nemotron_vl_model: # Our fake quantizer 
may not be fully compatible with torch.compile. generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) @@ -815,12 +921,13 @@ def post_quantize( ) def input_decode(input_ids): - if processor is not None and isinstance(processor, MllamaImageProcessor): - return processor.tokenizer.batch_decode(input_ids) + # BaseImageProcessor covers MllamaImageProcessor and Qwen3OmniImageProcessor + if processor is not None and isinstance(processor, BaseImageProcessor): + return processor.tokenizer.batch_decode(input_ids, skip_special_tokens=True) elif processor is not None and isinstance(processor, WhisperProcessor): return first_text_speech_dataset elif tokenizer is not None: - return tokenizer.batch_decode(input_ids) + return tokenizer.batch_decode(input_ids, skip_special_tokens=True) else: raise ValueError("The processor or tokenizer must be set") @@ -832,6 +939,12 @@ def output_decode(generated_ids, input_shape): return tokenizer.batch_decode(generated_ids, skip_special_tokens=True) elif processor is not None and isinstance(processor, MllamaImageProcessor): return processor.tokenizer.batch_decode(generated_ids[:, input_shape:]) + elif processor is not None and isinstance(processor, Qwen3OmniImageProcessor): + return processor.tokenizer.batch_decode( + generated_ids[:, input_shape:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) elif tokenizer is not None: return tokenizer.batch_decode(generated_ids[:, input_shape:]) else: @@ -919,7 +1032,7 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - preview_input_ids, generated_ids_before_ptq = pre_quantize( + preview_input_ids, generated_ids_before_ptq, calib_batch = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) @@ -1014,6 +1127,7 @@ def quantize_main( generated_ids_before_ptq, is_nemotron_vl_model, first_text_speech_dataset, + 
calib_batch, ) export_quantized( args, @@ -1238,6 +1352,15 @@ def parse_args() -> argparse.Namespace: help="Export as vLLM fake-quant checkpoint (produces vllm_fq_modelopt_state.pth " "for use with vllm_serve_fakequant.py).", ) + parser.add_argument( + "--quant_summary_path", + type=str, + default=None, + help=( + "Path to save the quantization summary. If not specified, summary is printed to stdout. " + "Requires --verbose to be enabled (default: True)." + ), + ) args = parser.parse_args() if args.moe_calib_experts_ratio is not None and not (0.0 < args.moe_calib_experts_ratio <= 1.0): diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 3bd72d9de9..b71e53bacf 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -17,45 +17,46 @@ import torch.nn as nn MODEL_NAME_TO_TYPE = { + "ArcticForCausalLM": "llama", + "baichuan": "baichuan", + "Bart": "bart", + "Bloom": "bloom", + "ChatGLM": "chatglm", + "Dbrx": "dbrx", + "Deepseek": "deepseek", + "ExaoneForCausalLM": "exaone", + "FalconForCausalLM": "falcon", + "Gemma": "gemma", + "Gemma2": "gemma2", + "Gemma3": "gemma3", + "GLM": "glm", "GPT2": "gpt", - "Mllama": "mllama", - "Llama4": "llama4", + "GPTJ": "gptj", + "gptoss": "gptoss", + "InternLM2ForCausalLM": "internlm", "Llama": "llama", + "Llama4": "llama4", "Mistral": "llama", - "GPTJ": "gptj", - "FalconForCausalLM": "falcon", - "RWForCausalLM": "falcon", - "baichuan": "baichuan", + "MixtralForCausalLM": "llama", + "Mllama": "mllama", "MPT": "mpt", - "Bloom": "bloom", - "ChatGLM": "chatglm", + "Nemotron": "gpt", + "phi": "phi", + "phi3": "phi3", + "phi3small": "phi3small", + "Phi4MMForCausalLM": "phi4mm", + "PhiMoEForCausalLM": "phi3", "Qwen3Moe": "qwen3moe", "Qwen3Next": "qwen3next", + "Qwen3OmniMoeForConditionalGeneration": "qwen3omni", "QWen": "qwen", "RecurrentGemma": "recurrentgemma", - "Gemma3": "gemma3", - "Gemma2": "gemma2", - "Gemma": "gemma", - "phi3small": "phi3small", - 
"phi3": "phi3", - "PhiMoEForCausalLM": "phi3", - "Phi4MMForCausalLM": "phi4mm", - "phi": "phi", - "TLGv4ForCausalLM": "phi", - "MixtralForCausalLM": "llama", - "ArcticForCausalLM": "llama", + "RWForCausalLM": "falcon", "StarCoder": "gpt", - "Dbrx": "dbrx", "T5": "t5", - "Bart": "bart", - "GLM": "glm", - "InternLM2ForCausalLM": "internlm", - "ExaoneForCausalLM": "exaone", + "TLGv4ForCausalLM": "phi", "NemotronH": "nemotron_h", - "Nemotron": "gpt", - "Deepseek": "deepseek", "Whisper": "whisper", - "gptoss": "gptoss", "MiniMax": "minimax", } @@ -149,6 +150,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] + if hasattr(model, "thinker"): + return [model, model.thinker] + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder # models like T5, Bart, Whisper which also have .decoder. 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 14a12bcdf3..e105c46aa4 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -1181,6 +1181,16 @@ def export_hf_checkpoint( if getattr(model, "hf_quantizer", None) is not None: model.hf_quantizer = None + # Fix generation_config conflicts before saving + # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors + if hasattr(model, "generation_config") and model.generation_config is not None: + gen_config = model.generation_config + if not getattr(gen_config, "do_sample", True): + # Remove sampling-related params when do_sample is False + for attr in ["temperature", "top_p", "top_k"]: + if hasattr(gen_config, attr): + setattr(gen_config, attr, None) + # Save model # Temporarily disable revert_weight_conversion if available — it doesn't handle # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). 
diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 0d02716a6e..b03c46fd03 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -1180,6 +1180,24 @@ def unpack_weight(self): pass +try: + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerTextSparseMoeBlock, + Qwen3OmniMoeThinkerTextSparseMoeBlock, + ) + + if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} + )(_QuantSparseMoe) + if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: + QuantModuleRegistry.register( + {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} + )(_QuantSparseMoe) +except ImportError: + pass + + class _QuantGptOssExperts(_QuantFunctionalMixin): """Quantized wrapper for `transformers.GptOssExperts`. 
diff --git a/modelopt/torch/utils/__init__.py b/modelopt/torch/utils/__init__.py index f026e747a8..354212d56e 100644 --- a/modelopt/torch/utils/__init__.py +++ b/modelopt/torch/utils/__init__.py @@ -27,4 +27,5 @@ from .regex import * from .robust_json import * from .tensor import * +from .video_dataset_utils import * from .vlm_dataset_utils import * diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 00cdff8877..9a5b7ccf98 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -112,6 +112,7 @@ "get_dataset_samples", "get_jsonl_text_samples", "get_max_batch_size", + "get_qwen3omni_text_dataloader", "get_supported_datasets", ] @@ -211,6 +212,84 @@ def _auto_preprocess_sample( ) +def get_qwen3omni_text_dataloader( + dataset_name: str | list[str] = "cnn_dailymail", + processor=None, + batch_size: int = 1, + num_samples: int | list[int] = 512, +) -> DataLoader: + """Get a text-only dataloader for Qwen3-Omni with proper conversation template applied. + + This function applies the Qwen3-Omni chat template to text samples before tokenization, + which is required for proper calibration of Qwen3-Omni models with text-only datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + + Args: + dataset_name: Name of the dataset(s) to load. + processor: Qwen3OmniTextProcessor instance wrapping the Qwen3OmniMoeProcessor. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + + Returns: + A DataLoader with properly formatted inputs for Qwen3-Omni. + """ + assert processor is not None, "Please provide a Qwen3OmniTextProcessor." 
+ + if isinstance(num_samples, int): + num_samples = [num_samples] + + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + assert len(dataset_name) == len(num_samples), ( + "dataset_name and num_samples must be the same length" + ) + + # Get raw text samples + all_samples = [] + for ds_name, num_sample in zip(dataset_name, num_samples): + samples = get_dataset_samples(ds_name, num_sample) + all_samples.extend(samples) + + # Preprocess each sample with the conversation template + processed_samples = [] + for text in all_samples: + # Apply conversation template and tokenize + values = processor.preprocess_function(text) + + # Convert to lists for dataset compatibility + sample_dict = {} + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + sample_dict[key] = val.tolist() + elif val is not None: + sample_dict[key] = val + processed_samples.append(sample_dict) + + # Create dataset + class _Qwen3OmniTextDataset(torch.utils.data.Dataset): + def __init__(self, samples): + self.samples = samples + + def __getitem__(self, idx): + return self.samples[idx] + + def __len__(self): + return len(self.samples) + + dataset = _Qwen3OmniTextDataset(processed_samples) + + calib_dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + return calib_dataloader + + def get_dataset_samples( dataset_name: str, num_samples: int, @@ -452,8 +531,8 @@ def _get_free_gpu_mem(): torch.cuda.empty_cache() free_mem_before, max_allocated_before = _get_free_gpu_mem() - is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward + use_generate = _should_use_generate(model) + infer_method = model.generate if use_generate else model.forward if sample_input_single_batch is None: sample_input_single_batch = ( @@ -519,11 +598,15 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): Returns: The maximum batch size that 
worked successfully """ - assert all(torch.is_tensor(data) or data is None for data in batch_data.values()), ( - "batch_data values must be tensors" + # Separate tensor values from scalar parameters (like max_new_tokens) + tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None} + scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None} + + assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), ( + "tensor_data values must be tensors" ) # Get the batch size of current data - batch_size = batch_data[next(iter(batch_data.keys()))].shape[0] + batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0] # If we know a smaller batch size works, preemptively split if max_working_batch_size is not None and batch_size > max_working_batch_size: @@ -531,11 +614,13 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): for i in range(0, batch_size, max_working_batch_size): end_idx = min(i + max_working_batch_size, batch_size) split_data = {} - for key in batch_data: - if batch_data[key] is None: + for key in tensor_data: + if tensor_data[key] is None: split_data[key] = None else: - split_data[key] = batch_data[key][i:end_idx, ...] + split_data[key] = tensor_data[key][i:end_idx, ...] + # Add back scalar data (non-tensor params like max_new_tokens) + split_data.update(scalar_data) max_working_batch_size = _process_batch( split_data, infer_method, max_working_batch_size @@ -562,8 +647,11 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): # Split the batch in half mid = (batch_size + 1) // 2 warn(f"CUDA out of memory with batch size {batch_size}, trying with batch size {mid}") - split_data_1 = {key: batch_data[key][:mid, ...] for key in batch_data} - split_data_2 = {key: batch_data[key][mid:, ...] for key in batch_data} + split_data_1 = {key: tensor_data[key][:mid, ...] 
for key in tensor_data} + split_data_2 = {key: tensor_data[key][mid:, ...] for key in tensor_data} + # Add back scalar data (non-tensor params like max_new_tokens) + split_data_1.update(scalar_data) + split_data_2.update(scalar_data) # Recursively process each half and track max working batch size max_working_batch_size = _process_batch(split_data_1, infer_method) @@ -581,11 +669,14 @@ def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None: dataloader: DataLoader containing the batched input data """ with torch.no_grad(): - is_enc_dec = model_type_is_enc_dec(model) - infer_method = model.generate if is_enc_dec else model.forward + use_generate = _should_use_generate(model) + infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None for _, data in enumerate(tqdm(dataloader)): + # For generate(), add max_new_tokens to prevent indefinite generation during calibration + if use_generate: + data["max_new_tokens"] = 1 # Process batch and update max working batch size max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size) @@ -753,3 +844,15 @@ def download_hf_dataset_as_jsonl( jsonl_paths.append(jsonl_file_path) return jsonl_paths + + +def _should_use_generate(model): + """Check if model should use generate() instead of forward() for calibration. 
+ + Returns True for: + - Encoder-decoder models (t5, bart, whisper) + - Conditional generation models that don't support standard forward() (qwen3omni) + """ + generate_model_list = ["qwen3omni"] + model_name = model.__class__.__name__.lower() + return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list) diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 6374642e3d..07deca7fc4 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -110,3 +110,174 @@ def collate_function(self, batch): ).to(self.device) return batch[0] + + +class Qwen3OmniTextProcessor(BaseImageProcessor): + """Text-only processor for Qwen3-Omni that applies proper conversation template. + + This processor wraps raw text in the Qwen3-Omni conversation format and applies + the chat template before tokenization. Use this for text-only calibration datasets. + + See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking + """ + + def __init__(self, processor, device="auto", dtype=None): + """Constructor. + + Args: + processor: The Qwen3OmniMoeProcessor (from AutoProcessor.from_pretrained). + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. + """ + super().__init__(processor, device) + self.dtype = dtype + + def preprocess_function(self, text: str) -> dict: + """Preprocess a single text sample by applying conversation template. + + Args: + text: Raw text string from dataset. + + Returns: + Dictionary with tokenized inputs. 
+ """ + # Build conversation in Qwen format (text-only) + conversation = [{"role": "user", "content": [{"type": "text", "text": text}]}] + formatted_text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + + # Tokenize with the processor (no multimodal inputs) + values = self.tokenizer( + text=formatted_text, + audio=None, + images=None, + videos=None, + return_tensors="pt", + padding=True, + ) + + return values + + def collate_function(self, batch): + """Collate function to process text inputs during data loading.""" + result = {} + first = batch[0] + + if "input_ids" in first and first["input_ids"] is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if "attention_mask" in first and first["attention_mask"] is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + return result + + +class Qwen3OmniImageProcessor(BaseImageProcessor): + """Image processor for Qwen3-Omni multimodal model.""" + + def __init__(self, tokenizer, device="auto", use_audio_in_video=False): + """Constructor.""" + super().__init__(tokenizer, device) + self.use_audio_in_video = use_audio_in_video + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniImageProcessor. 
" + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni.""" + question = examples.get("question", "Describe this image.") + + # Build conversation in Qwen format + content = [] + if examples.get("image") is not None: + content.append({"type": "image", "image": examples["image"]}) + if examples.get("audio") is not None: + content.append({"type": "audio", "audio": examples["audio"]}) + if examples.get("video") is not None: + content.append({"type": "video", "video": examples["video"]}) + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + + # Extract multimodal info using qwen_omni_utils + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + + # Process inputs with the processor + values = self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + + # Define all possible keys to ensure consistent schema for Arrow serialization + all_keys = [ + "input_ids", + "attention_mask", + "pixel_values", + "image_grid_thw", + "audio_features", + "audio_feature_lens", + "video_grid_thw", + ] + + # Convert tensors to lists for Arrow serialization compatibility + # Tensor conversion back happens in collate_function + result = dict.fromkeys(all_keys) # Initialize all keys to None + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val + + return result + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + result = {} + + # Take first item from batch (batch_size handling) + first = 
batch[0] + + # Convert lists to tensors and move to device + if "input_ids" in first and first["input_ids"] is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if "attention_mask" in first and first["attention_mask"] is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + # Handle pixel values for images + if first.get("pixel_values") is not None: + result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device) + + # Handle image grid thw (tile height width info) + if first.get("image_grid_thw") is not None: + result["image_grid_thw"] = torch.LongTensor(first["image_grid_thw"]).to(self.device) + + # Handle audio features if present + if first.get("audio_feature_lens") is not None: + result["audio_feature_lens"] = torch.LongTensor(first["audio_feature_lens"]).to( + self.device + ) + if first.get("audio_features") is not None: + result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device) + + # Handle video features if present + if first.get("video_grid_thw") is not None: + result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) + + return result From 0ccf7f9cbdc82d46a4494cd24c0fe107a79d24dc Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Fri, 6 Feb 2026 05:16:41 +0000 Subject: [PATCH 02/12] Optimize calibration for text data Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 97 ++++++++++--------- modelopt/torch/export/unified_export_hf.py | 10 +- .../torch/quantization/plugins/huggingface.py | 5 - modelopt/torch/utils/dataset_utils.py | 25 +++-- 4 files changed, 72 insertions(+), 65 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 4f7d3430f8..d854c36454 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -70,14 +70,12 @@ create_forward_loop, 
get_dataset_dataloader, get_max_batch_size, - get_qwen3omni_text_dataloader, get_supported_datasets, ) from modelopt.torch.utils.image_processor import ( BaseImageProcessor, MllamaImageProcessor, Qwen3OmniImageProcessor, - Qwen3OmniTextProcessor, ) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader @@ -225,50 +223,47 @@ def make_calib_dataloader( num_samples=args.calib_size[0], ) elif model_type == "qwen3omni": - assert processor is not None, "The processor must be set for qwen3omni model." dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail" # Check if using video dataset (e.g., finevideo) - if dataset_name in get_supported_video_datasets(): - video_processor = Qwen3OmniVideoProcessor( - processor.tokenizer if hasattr(processor, "tokenizer") else processor, - device=device, - dtype=language_model.dtype, - use_audio_in_video=True, - ) - calib_dataloader = get_video_dataset_dataloader( - dataset_name=dataset_name, - processor=video_processor, - batch_size=args.batch_size, - num_samples=args.calib_size[0], - ) - elif dataset_name in get_supported_vlm_datasets(): - assert isinstance(processor, Qwen3OmniImageProcessor), ( - "The Qwen3OmniImageProcessor must be set." 
- ) - # Set the dtype for proper tensor conversion in collate_function - processor.dtype = language_model.dtype - calib_dataloader = get_vlm_dataset_dataloader( - dataset_name=dataset_name, - processor=processor, - batch_size=args.batch_size, - num_samples=args.calib_size[0], - ) + if processor is not None: + if dataset_name in get_supported_video_datasets(): + video_processor = Qwen3OmniVideoProcessor( + processor.tokenizer if hasattr(processor, "tokenizer") else processor, + device=device, + dtype=language_model.dtype, + use_audio_in_video=True, + ) + calib_dataloader = get_video_dataset_dataloader( + dataset_name=dataset_name, + processor=video_processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) + elif dataset_name in get_supported_vlm_datasets(): + assert isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + # Set the dtype for proper tensor conversion in collate_function + processor.dtype = language_model.dtype + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=dataset_name, + processor=processor, + batch_size=args.batch_size, + num_samples=args.calib_size[0], + ) else: - # Text-only datasets (e.g., cnn_dailymail) - # Use Qwen3OmniTextProcessor to apply proper conversation template - # See: https://huggingface.co/Qwen/Qwen3-Omni-30B-A3B-Thinking - text_processor = Qwen3OmniTextProcessor( - processor=processor.tokenizer, # Pass the underlying HF processor - device=device, - dtype=language_model.dtype, + # Labels are only needed for gradient-based auto_quantize + include_labels = ( + args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient" ) - calib_dataloader = get_qwen3omni_text_dataloader( - dataset_name=dataset_name, - processor=text_processor, + calib_dataloader = get_dataset_dataloader( + dataset_name=args.dataset, + tokenizer=tokenizer, batch_size=args.batch_size, - num_samples=args.calib_size[0], + num_samples=args.calib_size, + device=device, + 
include_labels=include_labels, ) - print(f"Selected dataset for calibration: {dataset_name}") elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." @@ -452,9 +447,6 @@ def load_model(args: argparse.Namespace): calibration_only = True model_type = get_model_type(full_model) - if model_type == "qwen3omni": - print("Disabling talker for Qwen3Omni model") - full_model.disable_talker() device = full_model.device if hasattr(full_model, "model"): @@ -480,6 +472,14 @@ def load_model(args: argparse.Namespace): trust_remote_code=args.trust_remote_code, attn_implementation=args.attn_implementation, ) + if model_type == "qwen3omni": + print("Disabling talker for Qwen3Omni model") + full_model.disable_talker() + language_model = full_model.thinker.model + tokenizer = processor.tokenizer.tokenizer + processor = None + default_padding_side = tokenizer.padding_side + default_pad_token = tokenizer.pad_token elif model_type == "whisper": processor = get_processor( args.pyt_ckpt_path, @@ -620,6 +620,7 @@ def mono_quantize( print("Quantization will only be applied to the decoder (text generation) component") # For Qwen3Omni models, disable quantization of conv layers + generation_kwargs = {} if model_type == "qwen3omni": print( "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" @@ -627,6 +628,8 @@ def mono_quantize( quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + generation_kwargs["return_audio"] = False + generation_kwargs["thinker_max_new_tokens"] = 1 if not model_is_already_quantized or calibration_only: # quantize the model @@ -642,7 +645,9 @@ def mono_quantize( if args.calib_with_images and is_nemotron_vl_model: calibrate_loop = create_vlm_calibration_loop(full_model, calib_dataloader) else: - calibrate_loop = 
create_forward_loop(dataloader=calib_dataloader) + calibrate_loop = create_forward_loop( + dataloader=calib_dataloader, generation_kwargs=generation_kwargs + ) if calibration_only: language_model = mtq.calibrate( @@ -836,7 +841,7 @@ def pre_quantize( elif model_type == "qwen3omni": # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences # Pass full batch with all multimodal inputs - result = full_model.generate(**calib_batch, max_new_tokens=100) + result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) if isinstance(result, tuple): text_ids, _ = result generated_ids_before_ptq = ( @@ -895,7 +900,7 @@ def post_quantize( elif model_type == "qwen3omni": # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences # Pass full batch with all multimodal inputs - result = full_model.generate(**calib_batch, max_new_tokens=100) + result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) if isinstance(result, tuple): text_ids, _ = result generated_ids_after_ptq = ( diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index e105c46aa4..17796e026d 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -357,9 +357,11 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - if is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, run optimization on just the language model/decoder. - # This avoids needing pixel_values for the vision encoder. 
+ if getattr(model.config, "is_encoder_decoder", False): + # For encoder-decoder models, we need to pass both the encoder and decoder input ids + model(fake_input, decoder_input_ids=decoder_fake_input) + elif (is_vl_model and "nemotron" in model_type) or model_type.startswith("qwen3omni"): + # For Nemotron VL models, try to run optimization on just the language model part language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -371,7 +373,7 @@ def llm_dummy_forward(): language_model(fake_input, use_cache=False) else: raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + f"Cannot extract language_model from VL model (type: {model_type}). " "This is required for requantization/resmoothing optimization. " "Please ensure the model architecture is supported or file an issue." ) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index b03c46fd03..0a451e8ccb 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -1182,14 +1182,9 @@ def unpack_weight(self): try: from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( - Qwen3OmniMoeTalkerTextSparseMoeBlock, Qwen3OmniMoeThinkerTextSparseMoeBlock, ) - if Qwen3OmniMoeTalkerTextSparseMoeBlock not in QuantModuleRegistry: - QuantModuleRegistry.register( - {Qwen3OmniMoeTalkerTextSparseMoeBlock: "hf.Qwen3OmniMoeTalkerTextSparseMoeBlock"} - )(_QuantSparseMoe) if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: QuantModuleRegistry.register( {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 9a5b7ccf98..a02a0717c8 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -587,12 +587,13 @@ def 
_get_free_gpu_mem(): return 512 -def _process_batch(batch_data, infer_method, max_working_batch_size=None): +def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_batch_size=None): """Process a batch of data through the model's inference method. Args: batch_data: Dictionary containing the batch data infer_method: Model's inference method (either forward or generate) + generation_kwargs: Keyword arguments to pass to the model.generate() method. max_working_batch_size: Maximum batch size known to work without OOM Returns: @@ -630,7 +631,7 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): # Try processing with current batch size try: - infer_method(**batch_data) + infer_method(**batch_data, **generation_kwargs) return ( batch_size if max_working_batch_size is None @@ -661,24 +662,27 @@ def _process_batch(batch_data, infer_method, max_working_batch_size=None): return max_working_batch_size -def _forward_loop(model: torch.nn.Module, dataloader: DataLoader) -> None: +def _forward_loop( + model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict = {} +) -> None: """Runs forward passes through the model using data from the dataloader. Args: model: The PyTorch model to run inference on dataloader: DataLoader containing the batched input data + generation_kwargs: Keyword arguments to pass to the model.generate() method. 
""" with torch.no_grad(): - use_generate = _should_use_generate(model) + # use_generate = _should_use_generate(model) + use_generate = model_type_is_enc_dec(model) infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None for _, data in enumerate(tqdm(dataloader)): - # For generate(), add max_new_tokens to prevent indefinite generation during calibration - if use_generate: - data["max_new_tokens"] = 1 # Process batch and update max working batch size - max_working_batch_size = _process_batch(data, infer_method, max_working_batch_size) + max_working_batch_size = _process_batch( + data, infer_method, generation_kwargs, max_working_batch_size + ) def create_forward_loop( @@ -691,6 +695,7 @@ def create_forward_loop( device: str | None = None, include_labels: bool = False, dataloader: DataLoader | None = None, + generation_kwargs: dict = {}, ) -> Callable: """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer. @@ -709,7 +714,7 @@ def create_forward_loop( device: Target device for the returned dataloader. include_labels: Whether to include labels in the dataloader. dataloader: If provided, use the provided dataloader instead. - + generation_kwargs: Keyword arguments to pass to the model.generate() method. Example usage for quantization: .. 
code-block:: python @@ -748,7 +753,7 @@ def create_forward_loop( include_labels=include_labels, ) - return lambda model: _forward_loop(model, dataloader) + return lambda model: _forward_loop(model, dataloader, generation_kwargs) def model_type_is_enc_dec(model): From df50095e60d30a7df671c3401ec2a5b81809a986 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Fri, 6 Feb 2026 06:11:27 +0000 Subject: [PATCH 03/12] Refactor model specific code to example_utils Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 124 +++++++++++++++++++++ examples/llm_ptq/hf_ptq.py | 97 ++++------------ modelopt/torch/export/model_utils.py | 56 +++++----- modelopt/torch/export/unified_export_hf.py | 10 +- modelopt/torch/quantization/model_quant.py | 36 +++--- modelopt/torch/utils/dataset_utils.py | 2 +- 6 files changed, 200 insertions(+), 125 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 5fe9ab5ad6..1c17e2b5d9 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -46,11 +46,21 @@ snapshot_download = None import modelopt.torch.quantization as mtq +from modelopt.torch.utils.dataset_utils import get_dataset_dataloader from modelopt.torch.utils.image_processor import ( BaseImageProcessor, MllamaImageProcessor, Qwen3OmniImageProcessor, ) +from modelopt.torch.utils.video_dataset_utils import ( + Qwen3OmniVideoProcessor, + get_supported_video_datasets, + get_video_dataset_dataloader, +) +from modelopt.torch.utils.vlm_dataset_utils import ( + get_supported_vlm_datasets, + get_vlm_dataset_dataloader, +) logger = logging.getLogger(__name__) @@ -246,9 +256,45 @@ def build_quant_cfg( quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} + if model_type in ["qwen3moe", "qwen3next"] and qformat == "nvfp4": + # Disable the attention projection layers to retain 
accuracy + quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False} + quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False} + quant_cfg["quant_cfg"]["model*.*attn*k_proj*"] = {"enable": False} + quant_cfg["quant_cfg"]["model*.*attn*v_proj*"] = {"enable": False} + + if model_type == "deepseek": + # Disable MLA quantization for accuracy. + quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False} + quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False} + + if model_type == "qwen3omni": + print( + "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" + ) + quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} + quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} + quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + return quant_cfg +def get_generation_kwargs(model_type: str) -> dict[str, Any]: + """Get model-specific generation kwargs for calibration. + + Args: + model_type: The model type string. + + Returns: + Dictionary of generation kwargs for the model. + """ + generation_kwargs = {} + if model_type == "qwen3omni": + generation_kwargs["return_audio"] = False + generation_kwargs["thinker_max_new_tokens"] = 1 + return generation_kwargs + + def is_speculative(hf_config): """Check if the model architecture is a speculative model.""" return hf_config.architectures and any( @@ -834,3 +880,81 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod print(f"Successfully copied {len(copied_files)} custom model files to {export_path}") else: print("No custom model files found to copy") + + +def get_qwen3omni_dataloader( + dataset_name: str | list[str] | None, + processor: Qwen3OmniImageProcessor | None, + tokenizer, + batch_size: int, + num_samples: int | list[int], + device: torch.device, + model_dtype: torch.dtype, + include_labels: bool = False, +): + """Create a calibration dataloader for Qwen3Omni models. 
+ + Handles video, VLM, and text-only dataset configurations. + + Args: + dataset_name: Name of the dataset(s) to use for calibration. + processor: The Qwen3OmniImageProcessor for multimodal inputs. + tokenizer: The tokenizer for text-only fallback. + batch_size: Batch size for the dataloader. + num_samples: Number of samples to use (int or list for multi-dataset). + device: Target device for tensors. + model_dtype: Model dtype for proper tensor conversion. + include_labels: Whether to include labels (for gradient-based auto_quantize). + + Returns: + DataLoader for calibration. + """ + if dataset_name is None: + dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + + if processor is not None: + if dataset_name in get_supported_video_datasets(): + assert isinstance(dataset_name, str) + video_processor = Qwen3OmniVideoProcessor( + processor.tokenizer if hasattr(processor, "tokenizer") else processor, + device=device, + dtype=model_dtype, + use_audio_in_video=True, + ) + calib_dataloader = get_video_dataset_dataloader( + dataset_name=dataset_name, + processor=video_processor, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, int) else num_samples[0], + ) + elif dataset_name in get_supported_vlm_datasets(): + assert isinstance(dataset_name, str) + assert isinstance(processor, Qwen3OmniImageProcessor), ( + "The Qwen3OmniImageProcessor must be set." + ) + # Set the dtype for proper tensor conversion in collate_function + processor.dtype = model_dtype + calib_dataloader = get_vlm_dataset_dataloader( + dataset_name=dataset_name, + processor=processor, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, int) else num_samples[0], + ) + else: + raise ValueError( + f"Dataset '{dataset_name}' not supported for Qwen3Omni with processor. 
" + f"Supported video datasets: {get_supported_video_datasets()}, " + f"Supported VLM datasets: {get_supported_vlm_datasets()}" + ) + else: + # Text-only fallback + calib_dataloader = get_dataset_dataloader( + dataset_name=dataset_name if isinstance(dataset_name, list) else [dataset_name], + tokenizer=tokenizer, + batch_size=batch_size, + num_samples=num_samples if isinstance(num_samples, list) else [num_samples], + device=device, + include_labels=include_labels, + ) + + return calib_dataloader diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index d854c36454..571cea125a 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -15,9 +15,7 @@ import argparse import copy -import io import random -import sys import time import warnings from typing import Any @@ -29,8 +27,10 @@ build_quant_cfg, copy_custom_model_files, create_vlm_calibration_loop, + get_generation_kwargs, get_model, get_processor, + get_qwen3omni_dataloader, get_tokenizer, is_enc_dec, is_nemotron_vl, @@ -79,15 +79,7 @@ ) from modelopt.torch.utils.memory_monitor import launch_memory_monitor from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader -from modelopt.torch.utils.video_dataset_utils import ( - Qwen3OmniVideoProcessor, - get_supported_video_datasets, - get_video_dataset_dataloader, -) -from modelopt.torch.utils.vlm_dataset_utils import ( - get_supported_vlm_datasets, - get_vlm_dataset_dataloader, -) +from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader RAND_SEED = 1234 @@ -223,47 +215,20 @@ def make_calib_dataloader( num_samples=args.calib_size[0], ) elif model_type == "qwen3omni": - dataset_name = args.dataset[0] if args.dataset else "cnn_dailymail" - # Check if using video dataset (e.g., finevideo) - if processor is not None: - if dataset_name in get_supported_video_datasets(): - video_processor = Qwen3OmniVideoProcessor( - processor.tokenizer if hasattr(processor, "tokenizer") else processor, - 
device=device, - dtype=language_model.dtype, - use_audio_in_video=True, - ) - calib_dataloader = get_video_dataset_dataloader( - dataset_name=dataset_name, - processor=video_processor, - batch_size=args.batch_size, - num_samples=args.calib_size[0], - ) - elif dataset_name in get_supported_vlm_datasets(): - assert isinstance(processor, Qwen3OmniImageProcessor), ( - "The Qwen3OmniImageProcessor must be set." - ) - # Set the dtype for proper tensor conversion in collate_function - processor.dtype = language_model.dtype - calib_dataloader = get_vlm_dataset_dataloader( - dataset_name=dataset_name, - processor=processor, - batch_size=args.batch_size, - num_samples=args.calib_size[0], - ) - else: - # Labels are only needed for gradient-based auto_quantize - include_labels = ( - args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient" - ) - calib_dataloader = get_dataset_dataloader( - dataset_name=args.dataset, - tokenizer=tokenizer, - batch_size=args.batch_size, - num_samples=args.calib_size, - device=device, - include_labels=include_labels, - ) + # Labels are only needed for gradient-based auto_quantize + include_labels = ( + args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient" + ) + calib_dataloader = get_qwen3omni_dataloader( + dataset_name=args.dataset[0] if args.dataset else None, + processor=processor, + tokenizer=tokenizer, + batch_size=args.batch_size, + num_samples=args.calib_size[0] if processor else args.calib_size, + device=device, + model_dtype=language_model.dtype, + include_labels=include_labels, + ) elif model_type == "whisper": assert processor is not None and isinstance(processor, WhisperProcessor), ( "The AutoProcessor must be set." 
@@ -619,17 +584,8 @@ def mono_quantize( quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") - # For Qwen3Omni models, disable quantization of conv layers - generation_kwargs = {} - if model_type == "qwen3omni": - print( - "Disabling quantization for conv layers, audio tower and visual encoder in Qwen3Omni model" - ) - quant_cfg["quant_cfg"]["*conv*"] = {"enable": False} - quant_cfg["quant_cfg"]["*audio_tower*"] = {"enable": False} - quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} - generation_kwargs["return_audio"] = False - generation_kwargs["thinker_max_new_tokens"] = 1 + # Get model-specific generation kwargs (e.g., for Qwen3Omni) + generation_kwargs = get_generation_kwargs(model_type) if not model_is_already_quantized or calibration_only: # quantize the model @@ -876,20 +832,7 @@ def post_quantize( """ if args.verbose: - if args.quant_summary_path: - # Capture the summary output to a file - old_stdout = sys.stdout - sys.stdout = buffer = io.StringIO() - try: - mtq.print_quant_summary(full_model, args.export_path) - finally: - sys.stdout = old_stdout - summary = buffer.getvalue() - with open(args.quant_summary_path, "w") as f: - f.write(summary) - print(f"Quantization summary saved to {args.quant_summary_path}") - else: - mtq.print_quant_summary(full_model, args.export_path) + mtq.print_quant_summary(full_model, save_path=args.quant_summary_path) save_expert_token_count_table(full_model, args.export_path) # Run some samples diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index b71e53bacf..7501ed7bbc 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -17,47 +17,47 @@ import torch.nn as nn MODEL_NAME_TO_TYPE = { - "ArcticForCausalLM": "llama", - "baichuan": "baichuan", - "Bart": "bart", - "Bloom": "bloom", - "ChatGLM": "chatglm", - "Dbrx": "dbrx", - 
"Deepseek": "deepseek", - "ExaoneForCausalLM": "exaone", - "FalconForCausalLM": "falcon", - "Gemma": "gemma", - "Gemma2": "gemma2", - "Gemma3": "gemma3", - "GLM": "glm", "GPT2": "gpt", - "GPTJ": "gptj", - "gptoss": "gptoss", - "InternLM2ForCausalLM": "internlm", - "Llama": "llama", + "Mllama": "mllama", "Llama4": "llama4", + "Llama": "llama", "Mistral": "llama", - "MixtralForCausalLM": "llama", - "Mllama": "mllama", + "GPTJ": "gptj", + "FalconForCausalLM": "falcon", + "RWForCausalLM": "falcon", + "baichuan": "baichuan", "MPT": "mpt", - "Nemotron": "gpt", - "phi": "phi", - "phi3": "phi3", - "phi3small": "phi3small", - "Phi4MMForCausalLM": "phi4mm", - "PhiMoEForCausalLM": "phi3", + "Bloom": "bloom", + "ChatGLM": "chatglm", "Qwen3Moe": "qwen3moe", "Qwen3Next": "qwen3next", "Qwen3OmniMoeForConditionalGeneration": "qwen3omni", "QWen": "qwen", "RecurrentGemma": "recurrentgemma", - "RWForCausalLM": "falcon", - "StarCoder": "gpt", - "T5": "t5", + "Gemma3": "gemma3", + "Gemma2": "gemma2", + "Gemma": "gemma", + "phi3small": "phi3small", + "phi3": "phi3", + "PhiMoEForCausalLM": "phi3", + "Phi4MMForCausalLM": "phi4mm", + "phi": "phi", "TLGv4ForCausalLM": "phi", "NemotronH": "nemotron_h", + "MixtralForCausalLM": "llama", + "ArcticForCausalLM": "llama", + "StarCoder": "gpt", + "Dbrx": "dbrx", + "T5": "t5", + "Bart": "bart", + "GLM": "glm", + "InternLM2ForCausalLM": "internlm", + "ExaoneForCausalLM": "exaone", + "Nemotron": "gpt", + "Deepseek": "deepseek", "Whisper": "whisper", "MiniMax": "minimax", + "gptoss": "gptoss", } __doc__ = f"""Utility functions for model type detection and classification. 
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 17796e026d..e8a1e06857 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -1188,10 +1188,12 @@ def export_hf_checkpoint( if hasattr(model, "generation_config") and model.generation_config is not None: gen_config = model.generation_config if not getattr(gen_config, "do_sample", True): - # Remove sampling-related params when do_sample is False - for attr in ["temperature", "top_p", "top_k"]: - if hasattr(gen_config, attr): - setattr(gen_config, attr, None) + # Enable sampling if sampling params are present + if any( + getattr(gen_config, attr, None) is not None + for attr in ["temperature", "top_p", "top_k"] + ): + gen_config.do_sample = True # Save model # Temporarily disable revert_weight_conversion if available — it doesn't handle diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 4aa1ff46b4..782702703b 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -583,22 +583,28 @@ def enable_quantizer(model: nn.Module, wildcard_or_filter_func: str | Callable): @atomic_print -def print_quant_summary(model: nn.Module, output_dir: str | None = None): - """Print summary of all quantizer modules in the model.""" - lines = [ - f"{name:80} {mod}" - for name, mod in model.named_modules() - if isinstance(mod, TensorQuantizer) - ] - lines.append(f"{len(lines)} TensorQuantizers found in model") - - if output_dir: - path = os.path.join(output_dir, ".quant_summary.txt") - with open(path, "w", encoding="utf-8") as f: - f.write("\n".join(lines) + "\n") - print(f"\033[1mQuant summary saved to {path}\033[0m") +def print_quant_summary(model: nn.Module, save_path: str | None = None): + """Print summary of all quantizer modules in the model. + + Args: + model: The model to summarize. 
+ save_path: Optional path to save the summary to a file. If None, prints to stdout. + """ + lines = [] + count = 0 + for name, mod in model.named_modules(): + if isinstance(mod, TensorQuantizer): + lines.append(f"{name:80} {mod}") + count += 1 + lines.append(f"{count} TensorQuantizers found in model") + + summary = "\n".join(lines) + if save_path: + with open(save_path, "w") as f: + f.write(summary) + print(f"Quantization summary saved to {save_path}") else: - print("\n".join(lines)) + print(summary) def fold_weight(model: nn.Module, keep_attrs: bool = False): diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index a02a0717c8..842a797afb 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -531,7 +531,7 @@ def _get_free_gpu_mem(): torch.cuda.empty_cache() free_mem_before, max_allocated_before = _get_free_gpu_mem() - use_generate = _should_use_generate(model) + use_generate = model_type_is_enc_dec(model) infer_method = model.generate if use_generate else model.forward if sample_input_single_batch is None: From 3f5859b2222f3d196e3bf4d2b2f255b215b10548 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Fri, 6 Feb 2026 19:11:26 +0000 Subject: [PATCH 04/12] Update hf configs for vLLM deployment Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 50 +++++++++++++++++++++++++++++++ examples/llm_ptq/hf_ptq.py | 4 +++ 2 files changed, 54 insertions(+) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 1c17e2b5d9..24c9cfa10f 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -882,6 +882,55 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod print("No custom model files found to copy") +def patch_config_for_unified_export(model_type: str, export_path: str) -> None: + """Patch config 
files to add missing exclusion patterns for unified HF export. + + This function adds missing exclusion patterns for modules that should not be quantized + (e.g., audio tower, visual encoder, lm_head) to both hf_quant_config.json and config.json. + + Args: + export_path: Path to the exported model directory. + """ + if model_type == "qwen3omni": + missing_patterns = [ + "thinker.audio_tower*", + "thinker.visual*", + "thinker.lm_head", + ] + + # (filename, path_to_exclude_list) + configs = [ + ("hf_quant_config.json", ["quantization", "exclude_modules"]), + ("config.json", ["quantization_config", "ignore"]), + ] + + for filename, keys in configs: + filepath = os.path.join(export_path, filename) + if not os.path.exists(filepath): + continue + try: + with open(filepath) as f: + config = json.load(f) + + # Navigate to nested key + target = config + for key in keys[:-1]: + target = target.get(key, {}) + + exclude_list = target.get(keys[-1]) + if exclude_list is None: + continue + + added = [p for p in missing_patterns if p not in exclude_list] + if added: + exclude_list.extend(added) + with open(filepath, "w") as f: + json.dump(config, f, indent=2) + print(f"Patched {filename} with exclusions: {added}") + except Exception as e: + print(f"Warning: Failed to patch {filename}: {e}") + + def get_qwen3omni_dataloader( dataset_name: str | list[str] | None, processor: Qwen3OmniImageProcessor | None, @@ -911,6 +960,7 @@ def get_qwen3omni_dataloader( """ if dataset_name is None: dataset_name = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + num_samples = [512, 512] if processor is not None: if dataset_name in get_supported_video_datasets(): diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 571cea125a..cbb8961efd 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -35,6 +35,7 @@ is_enc_dec, is_nemotron_vl, load_mtp_weights, + patch_config_for_unified_export, run_nemotron_vl_preview, ) from torch.utils.data import 
DataLoader @@ -734,6 +735,9 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) + # Exclude non-quantized modules in config.json and hf_quant_config.json + patch_config_for_unified_export(model_type, export_path) + # Restore default padding and export the tokenizer as well. if tokenizer is not None: tokenizer.padding_side = default_padding_side From 41ee25b7c2d3607c48c8e4b5903568e7f93e937f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Fri, 6 Feb 2026 20:14:12 +0000 Subject: [PATCH 05/12] Create a script to run vllm inference Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 111 ++++++++++++++++++++++++ examples/llm_ptq/run_vllm.py | 136 ++++++++++++++++++++++++++++++ 2 files changed, 247 insertions(+) create mode 100644 examples/llm_ptq/run_vllm.py diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 24c9cfa10f..fa71607ad0 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -46,6 +46,7 @@ snapshot_download = None import modelopt.torch.quantization as mtq +from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE from modelopt.torch.utils.dataset_utils import get_dataset_dataloader from modelopt.torch.utils.image_processor import ( BaseImageProcessor, @@ -66,6 +67,116 @@ SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"] +# Files needed for tokenizer/processor that vLLM loads from model path +TOKENIZER_FILES = [ + "vocab.json", + "merges.txt", + "tokenizer.json", + "tokenizer_config.json", + "special_tokens_map.json", + "preprocessor_config.json", + "chat_template.json", +] + + +def get_model_type_from_config(model_path: str) -> str | None: + """Get model type from the config.json file. + + Args: + model_path: Path to the model directory or HuggingFace model ID. + + Returns: + Model type string (e.g., 'qwen3omni', 'llama', 'gpt') or None if not found. 
+ """ + config_path = os.path.join(model_path, "config.json") + if not os.path.exists(config_path): + return None + + with open(config_path) as f: + config = json.load(f) + + # Check architectures field first + architectures = config.get("architectures", []) + for arch in architectures: + for key, model_type in MODEL_NAME_TO_TYPE.items(): + if key.lower() in arch.lower(): + return model_type + + # Fallback to model_type field + model_type_field = config.get("model_type", "") + for key, model_type in MODEL_NAME_TO_TYPE.items(): + if key.lower() in model_type_field.lower(): + return model_type + + return None + + +def get_sampling_params_from_config(model_path: str) -> dict: + """Extract sampling params from generation_config.json if present.""" + gen_config_path = Path(model_path) / "generation_config.json" + if not gen_config_path.exists(): + return {} + + gen_config = json.loads(gen_config_path.read_text()) + + params = {k: gen_config[k] for k in ("temperature", "top_p", "top_k") if k in gen_config} + + for key in ("max_new_tokens", "max_length"): + if key in gen_config: + params["max_tokens"] = gen_config[key] + break + + return params + + +def get_quantization_format(model_path: str) -> str | None: + """Get quantization format from the model config. + + Args: + model_path: Path to the model directory. + + Returns: + vLLM quantization string ('modelopt', 'modelopt_fp4') or None if not quantized. 
+ """ + hf_quant_config_path = os.path.join(model_path, "hf_quant_config.json") + if os.path.exists(hf_quant_config_path): + with open(hf_quant_config_path) as f: + quant_config = json.load(f) + quant_algo = quant_config.get("quantization", {}).get("quant_algo", "") + if "NVFP4" in quant_algo: + return "modelopt_fp4" + + return None + + +def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None: + """Copy tokenizer files from HF model to local quantized model dir if missing.""" + if not os.path.isdir(model_path): + return # Not a local path, nothing to do + + # Check if tokenizer files are missing + missing_files = [f for f in TOKENIZER_FILES if not os.path.exists(os.path.join(model_path, f))] + if not missing_files: + return + + if snapshot_download is None: + print("Warning: huggingface_hub not installed, cannot download tokenizer files") + return + + print(f"Copying missing tokenizer files from {source_model_id}...") + # Download only tokenizer files from HF + cache_dir = snapshot_download( + source_model_id, + allow_patterns=TOKENIZER_FILES, + ) + + for fname in TOKENIZER_FILES: + src = os.path.join(cache_dir, fname) + dst = os.path.join(model_path, fname) + if os.path.exists(src) and not os.path.exists(dst): + shutil.copy2(src, dst) + print(f" Copied {fname}") + def run_nemotron_vl_preview( full_model, tokenizer, input_ids, pyt_ckpt_path, stage_name, allow_fallback=False diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py new file mode 100644 index 0000000000..748c5b13a0 --- /dev/null +++ b/examples/llm_ptq/run_vllm.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unified HF checkpoint inference with vLLM. + +Usage: + python run_vllm.py --model /path/to/quantized/model + python run_vllm.py --model /path/to/model --tp 4 +""" + +from __future__ import annotations + +import argparse + +from example_utils import ( + ensure_tokenizer_files, + get_model_type_from_config, + get_quantization_format, + get_sampling_params_from_config, +) +from transformers import AutoConfig, AutoProcessor +from vllm import LLM, SamplingParams + + +def main(): + parser = argparse.ArgumentParser(description="Run unified hf checkpoint inference with vLLM") + parser.add_argument("--model", type=str, required=True, help="Model ID or path") + parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size") + parser.add_argument( + "--max-model-len", + type=int, + default=None, + help="Max model length (auto-detected from config if not specified)", + ) + parser.add_argument("--prompt", type=str, default="What in Nvidia?", help="Text prompt") + parser.add_argument( + "--tokenizer", type=str, default=None, help="Tokenizer ID or path (defaults to model path)" + ) + parser.add_argument("--temperature", type=float, default=0.7, help="Sampling temperature") + parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling") + parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)") + parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate") + + args = parser.parse_args() + + # Detect model type from config + model_type = 
get_model_type_from_config(args.model) + print(f"Detected model type: {model_type}") + + # Detect quantization format + quantization = get_quantization_format(args.model) + print(f"Detected quantization: {quantization}") + + # Get max_model_len from config if not specified + if args.max_model_len is None: + config = AutoConfig.from_pretrained(args.model, trust_remote_code=True) + args.max_model_len = getattr(config, "max_position_embeddings", 4096) + print(f"Using max_model_len from config: {args.max_model_len}") + + # Determine tokenizer source + tokenizer_id = args.tokenizer or args.model + + # Load processor for chat template + processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True) + + # Text-only conversations + conversations = [ + [ + { + "role": "user", + "content": [{"type": "text", "text": args.prompt}], + } + ], + ] + + # Apply chat template + apply_chat_kwargs = { + "add_generation_prompt": True, + "tokenize": False, + } + # Qwen3Omni-specific: disable thinking mode + if model_type == "qwen3omni": + apply_chat_kwargs["enable_thinking"] = False + + texts = processor.apply_chat_template(conversations, **apply_chat_kwargs) + + # Ensure tokenizer files exist in local model dir (vLLM loads processor from model path) + if args.tokenizer: + ensure_tokenizer_files(args.model, args.tokenizer) + + print(f"Loading model: {args.model}") + llm = LLM( + model=args.model, + tokenizer=tokenizer_id, + tensor_parallel_size=args.tp, + max_model_len=args.max_model_len, + trust_remote_code=True, + quantization=quantization, + ) + + # Get sampling params from config, with CLI/defaults as fallback + config_params = get_sampling_params_from_config(args.model) + sampling_kwargs = { + "temperature": config_params.get("temperature", args.temperature), + "top_p": config_params.get("top_p", args.top_p), + "max_tokens": config_params.get("max_tokens", args.max_tokens), + } + top_k = config_params.get("top_k", args.top_k) + if top_k > 0: + 
sampling_kwargs["top_k"] = top_k + print(f"Sampling params: {sampling_kwargs}") + sampling_params = SamplingParams(**sampling_kwargs) + + print("Running inference...") + outputs = llm.generate(texts, sampling_params) + + for output in outputs: + generated_text = output.outputs[0].text + print("-" * 80) + print(f"Generated: {generated_text}") + + +if __name__ == "__main__": + main() From a71c73b29bd0982eb04a03ddff3c5366cd8f53ab Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Mon, 9 Feb 2026 22:37:47 +0000 Subject: [PATCH 06/12] Add an option to supply host as an argument Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_eval/run_lm_eval_vllm.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) mode change 100644 => 100755 examples/llm_eval/run_lm_eval_vllm.sh diff --git a/examples/llm_eval/run_lm_eval_vllm.sh b/examples/llm_eval/run_lm_eval_vllm.sh old mode 100644 new mode 100755 index ef94a66d14..18c52995c9 --- a/examples/llm_eval/run_lm_eval_vllm.sh +++ b/examples/llm_eval/run_lm_eval_vllm.sh @@ -19,12 +19,13 @@ # Script to run lm-evaluation-harness against a running vLLM OpenAI-compatible server. # # Usage: -# bash run_lm_eval_vllm.sh <model_name> [port] [task] +# bash run_lm_eval_vllm.sh <model_name> [port] [task] [host] # # Arguments: # <model_name>: The name of the model being served (e.g., Qwen/Qwen3-30B-A3B). Used for the 'model' argument in lm_eval. # [port]: The port the vLLM server is listening on (default: 8000). # [task]: The lm_eval task(s) to run (default: mmlu). +# [host]: The IP address or hostname of the vLLM server (default: localhost). 
# # Example: # # Start vLLM server first (in another terminal): @@ -35,6 +36,9 @@ # # # Run for a different task, e.g., hellaswag: # bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 hellaswag +# +# # Run against a remote server: +# bash run_lm_eval_vllm.sh Qwen/Qwen3-30B-A3B 8000 mmlu 10.78.17.40 # --- set -e @@ -42,16 +46,17 @@ set -x # --- Argument Parsing --- if [ -z "$1" ]; then - echo "Usage: $0 <model_name> [port] [task]" + echo "Usage: $0 <model_name> [port] [task] [host]" exit 1 fi MODEL_NAME=$1 PORT=${2:-8000} # Default port is 8000 if not provided TASK=${3:-mmlu} # Default task is mmlu if not provided +HOST=${4:-localhost} # Default host is localhost if not provided # --- Environment Setup --- export OPENAI_API_KEY="local" # Not strictly required for local, but good practice -BASE_URL="http://localhost:${PORT}/v1" +BASE_URL="http://${HOST}:${PORT}/v1" COMPLETIONS_URL="${BASE_URL}/completions" # --- Evaluation --- From e0e108e8befb3fb86ad40e49fb44f75064ec7805 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Mon, 9 Feb 2026 23:46:01 +0000 Subject: [PATCH 07/12] Add video dataset utils Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- modelopt/torch/utils/video_dataset_utils.py | 332 ++++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 modelopt/torch/utils/video_dataset_utils.py diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py new file mode 100644 index 0000000000..e022d7e24f --- /dev/null +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -0,0 +1,332 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for getting samples and forward loop function for video datasets.""" + +import os +import tempfile +from typing import Any + +import torch +from torch.utils.data import DataLoader + +from .image_processor import BaseImageProcessor + +# Use dict to store the config for each dataset. +SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = { + "finevideo": { + "config": {"path": "HuggingFaceFV/finevideo", "split": "train", "streaming": True} + }, +} + +__all__ = [ + "Qwen3OmniVideoProcessor", + "get_supported_video_datasets", + "get_video_dataset_dataloader", +] + + +def _get_video_dataset(dataset_name: str, num_samples: int): + """Load a portion of train dataset with the dataset name and a given size. + + Args: + dataset_name: Name of the dataset to load. + num_samples: Number of samples to load from the dataset. + + Returns: + A hugging face Dataset. + """ + if dataset_name in SUPPORTED_VIDEO_DATASET_CONFIG: + from datasets import Dataset, load_dataset + + config = SUPPORTED_VIDEO_DATASET_CONFIG[dataset_name]["config"] + is_streaming = config.get("streaming", False) + + dataset = load_dataset(**config) + + if is_streaming: + # For streaming datasets, use take() and convert to list then Dataset + samples = list(dataset.take(num_samples)) + return Dataset.from_list(samples) + else: + return dataset.select(range(num_samples)) + else: + raise NotImplementedError( + f"dataset {dataset_name} is not supported. Please use one of the following:" + f" {get_supported_video_datasets()}." 
+ ) + + +def get_supported_video_datasets() -> list[str]: + """Retrieves a list of video datasets supported. + + Returns: + A list of strings, where each string is the name of a supported dataset. + + Example usage: + + .. code-block:: python + + from modelopt.torch.utils import get_supported_video_datasets + + print("Supported video datasets:", get_supported_video_datasets()) + """ + return list(SUPPORTED_VIDEO_DATASET_CONFIG.keys()) + + +def get_video_dataset_dataloader( + dataset_name: str = "finevideo", + processor: "Qwen3OmniVideoProcessor" = None, + batch_size: int = 1, + num_samples: int = 512, + cache_dir: str | None = None, +) -> DataLoader: + """Get a dataloader with the dataset name and processor of the target model. + + Args: + dataset_name: Name of the dataset to load. + processor: Processor used for encoding video and text data. + batch_size: Batch size of the returned dataloader. + num_samples: Number of samples from the dataset. + cache_dir: Directory to cache the processed dataset. Defaults to a temp directory. + If the cache exists, it will be loaded instead of reprocessing. + + Returns: + An instance of dataloader. + """ + assert processor is not None, "Please provide a valid processor." + + # Default cache_dir to temp directory + if cache_dir is None: + cache_dir = os.path.join(tempfile.gettempdir(), "modelopt_video_dataset_cache") + + processed_dataset = None + + # Try to load from cache (use torch.save/load to avoid Arrow 32-bit offset overflow) + if cache_dir is not None: + cache_path = os.path.join(cache_dir, f"{dataset_name}_n{num_samples}_processed.pt") + if os.path.exists(cache_path): + try: + from datasets import Dataset + + processed_samples = torch.load(cache_path, weights_only=False) + processed_dataset = Dataset.from_list(processed_samples) + print(f"Loaded processed dataset from cache: {cache_path}") + except Exception as e: + print(f"Failed to load cache from {cache_path}: {e}. 
Reprocessing...") + processed_dataset = None + + # Process dataset if not loaded from cache + if processed_dataset is None: + from datasets import Dataset + + dataset = _get_video_dataset(dataset_name, num_samples=num_samples) + + # Process samples manually to avoid Arrow 32-bit offset overflow + # (dataset.map() uses Arrow internally which can't handle large nested lists) + processed_samples = [] + for i, sample in enumerate(dataset): + processed = processor.preprocess_function(sample) + processed_samples.append(processed) + if (i + 1) % 10 == 0: + print(f"Processed {i + 1}/{len(dataset)} samples...") + + processed_dataset = Dataset.from_list(processed_samples) + + # Save to cache using torch.save to avoid Arrow 32-bit offset overflow + if cache_dir is not None: + os.makedirs(cache_dir, exist_ok=True) + torch.save(processed_samples, cache_path) + print(f"Saved processed dataset to cache: {cache_path}") + + # Create DataLoader with the custom collate function + return DataLoader( + processed_dataset, + batch_size=batch_size, + shuffle=False, + collate_fn=processor.collate_function, + ) + + +class Qwen3OmniVideoProcessor(BaseImageProcessor): + """Video processor for Qwen3-Omni multimodal model with finevideo dataset support.""" + + def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True): + """Constructor. + + Args: + tokenizer: The Qwen3OmniMoeProcessor for tokenizing and processing inputs. + device: Device to move tensors to. + dtype: dtype for float tensors (e.g., torch.bfloat16). If None, uses default. + use_audio_in_video: Whether to extract and use audio from video files. 
+ """ + super().__init__(tokenizer, device) + self.dtype = dtype + self.use_audio_in_video = use_audio_in_video + self._temp_dir = tempfile.mkdtemp(prefix="qwen3omni_video_") + self._video_counter = 0 + # Try to import qwen_omni_utils for multimodal processing + try: + from qwen_omni_utils import process_mm_info + + self.process_mm_info = process_mm_info + except ImportError: + raise ImportError( + "qwen_omni_utils is required for Qwen3OmniVideoProcessor. " + "Please install it from https://github.com/QwenLM/Qwen3-Omni" + ) + + def _save_video_bytes_to_file(self, video_bytes: bytes) -> str: + """Save video bytes to a temporary file and return the path. + + Args: + video_bytes: Raw video bytes (e.g., from finevideo's 'mp4' field). + + Returns: + Path to the temporary video file. + """ + video_path = os.path.join(self._temp_dir, f"video_{self._video_counter}.mp4") + self._video_counter += 1 + with open(video_path, "wb") as f: + f.write(video_bytes) + return video_path + + def preprocess_function(self, examples): + """Preprocess function for Qwen3-Omni with video support. + + Handles both standard video paths and raw video bytes (finevideo format). + """ + # Get question/prompt - finevideo has metadata in 'json' field + if "json" in examples and examples["json"] is not None: + metadata = examples["json"] + # Try to get a meaningful question from metadata + category = metadata.get("content_fine_category", "") + question = ( + f"Describe what is happening in this video in detail. 
Category hint: {category}" + ) + else: + question = examples.get("question", "Describe this video in detail.") + + # Build conversation in Qwen format + content = [] + + # Handle video - check for raw bytes (finevideo format) or path + video_path = None + if examples.get("mp4") is not None: + # finevideo format: raw video bytes in 'mp4' field + video_path = self._save_video_bytes_to_file(examples["mp4"]) + elif examples.get("video") is not None: + # Standard format: video path or URL + video_path = examples["video"] + + if video_path is not None: + content.append({"type": "video", "video": video_path}) + + content.append({"type": "text", "text": question}) + + conversation = [{"role": "user", "content": content}] + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + + # Extract multimodal info using qwen_omni_utils + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + + # Process inputs with the processor + values = self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + # Define all possible keys to ensure consistent schema for Arrow serialization + all_keys = [ + "input_ids", + "attention_mask", + "pixel_values_videos", + "video_grid_thw", + "video_second_per_grid", + "feature_attention_mask", + "input_features", + ] + + # Convert tensors to lists for Arrow serialization compatibility + # Tensor conversion back happens in collate_function + result = dict.fromkeys(all_keys) # Initialize all keys to None + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val + + return result + + def collate_function(self, batch): + """Collate function to process inputs during data loading.""" + result = {} + + # Take first item from 
batch (batch_size handling) + first = batch[0] + + # Convert lists to tensors and move to device + if first.get("input_ids") is not None: + result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) + if first.get("attention_mask") is not None: + result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + + # Handle pixel values for video frames + if first.get("pixel_values_videos") is not None: + pv = torch.tensor(first["pixel_values_videos"]) + if self.dtype is not None: + pv = pv.to(self.dtype) + result["pixel_values_videos"] = pv.to(self.device) + + # Handle video grid thw (tile height width info) + if first.get("video_grid_thw") is not None: + result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) + + # Handle video second per grid (temporal info for rope) + if first.get("video_second_per_grid") is not None: + result["video_second_per_grid"] = torch.tensor(first["video_second_per_grid"]).to( + self.device + ) + + # Handle audio features if present + if first.get("feature_attention_mask") is not None: + result["feature_attention_mask"] = torch.LongTensor(first["feature_attention_mask"]).to( + self.device + ) + if first.get("input_features") is not None: + inp_feat = torch.tensor(first["input_features"]) + if self.dtype is not None: + inp_feat = inp_feat.to(self.dtype) + result["input_features"] = inp_feat.to(self.device) + + # Pass use_audio_in_video flag to model.generate() for Qwen3Omni + result["use_audio_in_video"] = self.use_audio_in_video + + return result + + def cleanup(self): + """Clean up temporary video files.""" + import shutil + + if os.path.exists(self._temp_dir): + shutil.rmtree(self._temp_dir) From d334c9a89f6547a336c0dc62ee8da89bbdbe738f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Tue, 17 Mar 2026 21:30:29 +0000 Subject: [PATCH 08/12] Update documentation for post_quantize Signed-off-by: ajrasane 
<131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/hf_ptq.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index cbb8961efd..4091c54493 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -827,11 +827,36 @@ def post_quantize( first_text_speech_dataset, calib_batch: dict | None = None, ): - """ - Processing after the quantization. + """Processing after the quantization. + + Runs one round of generation using the quantized model for a sample prompt and + compares it with the pre-quantize generation from ``pre_quantize()``. - Currently we run one round of generation using the quantized model for a sample prompt, - and compare it with pre-quantize generation. + Args: + args: Parsed CLI arguments. Used for ``verbose``, ``quant_summary_path``, + ``export_path``, ``pyt_ckpt_path``, and ``skip_generate`` flags. + full_model: The quantized model to run post-quantization generation on. + model_type: Model architecture identifier (e.g. ``"qwen3omni"``, ``"whisper"``, + ``"llama4"``, ``"deepseek"``). Controls model-specific generation and + decoding paths. ``None`` for generic models. + tokenizer: HF tokenizer for decoding generated token ids. May be ``None`` when + a ``processor`` is used instead (e.g. vision-language or speech models). + processor: HF image/audio processor for multimodal models. Used for decoding + outputs from vision-language (Mllama, Qwen3Omni) and speech (Whisper) + models. ``None`` for text-only models. + preview_input_ids: Input token ids (single sample) produced by ``pre_quantize()`` + for the preview generation comparison. + generated_ids_before_ptq: Generation output from ``pre_quantize()`` to compare + against post-quantization output. ``None`` if generation was skipped. 
+ is_nemotron_vl_model: Whether the model is a Nemotron VL model, which uses + ``model.chat()`` and returns text strings instead of token tensors. + first_text_speech_dataset: Text transcript of the first speech sample, used as + the display input for Whisper models since their ``input_ids`` are + mel-spectrogram features rather than decodable tokens. + calib_batch: Full calibration batch dict from ``pre_quantize``. Required for + multimodal models (e.g. Qwen3Omni) whose ``generate()`` needs the complete + input dict (audio features, attention masks, etc.) rather than just + ``input_ids``. For text-only models this is unused and may be ``None``. """ From 55f1e736e7532e534d07a87592f831e5fe86f48b Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 18 Mar 2026 02:35:17 +0000 Subject: [PATCH 09/12] Bug fixes Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 7 +++++-- examples/llm_ptq/run_vllm.py | 14 +++++++++++--- modelopt/torch/quantization/model_quant.py | 1 - modelopt/torch/utils/dataset_utils.py | 4 ++-- modelopt/torch/utils/image_processor.py | 16 ++++++++++++---- modelopt/torch/utils/video_dataset_utils.py | 4 +++- 6 files changed, 33 insertions(+), 13 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index fa71607ad0..b9d3dd5f86 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -45,7 +45,6 @@ except ImportError: snapshot_download = None -import modelopt.torch.quantization as mtq from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE from modelopt.torch.utils.dataset_utils import get_dataset_dataloader from modelopt.torch.utils.image_processor import ( @@ -1074,6 +1073,9 @@ def get_qwen3omni_dataloader( num_samples = [512, 512] if processor is not None: + # Normalize single-element list to str for supported-dataset lookups + if isinstance(dataset_name, list) and 
len(dataset_name) == 1: + dataset_name = dataset_name[0] if dataset_name in get_supported_video_datasets(): assert isinstance(dataset_name, str) video_processor = Qwen3OmniVideoProcessor( @@ -1093,7 +1095,8 @@ def get_qwen3omni_dataloader( assert isinstance(processor, Qwen3OmniImageProcessor), ( "The Qwen3OmniImageProcessor must be set." ) - # Set the dtype for proper tensor conversion in collate_function + # Set dtype for proper tensor conversion in collate_function. + # Processor is created before model_dtype is known, so we set it here. processor.dtype = model_dtype calib_dataloader = get_vlm_dataset_dataloader( dataset_name=dataset_name, diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py index 748c5b13a0..3e69ab9de7 100644 --- a/examples/llm_ptq/run_vllm.py +++ b/examples/llm_ptq/run_vllm.py @@ -52,6 +52,12 @@ def main(): parser.add_argument("--top-p", type=float, default=0.9, help="Top-p sampling") parser.add_argument("--top-k", type=int, default=-1, help="Top-k sampling (-1 to disable)") parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens to generate") + parser.add_argument( + "--trust-remote-code", + action="store_true", + default=False, + help="Trust remote code from HuggingFace model repos", + ) args = parser.parse_args() @@ -65,7 +71,7 @@ def main(): # Get max_model_len from config if not specified if args.max_model_len is None: - config = AutoConfig.from_pretrained(args.model, trust_remote_code=True) + config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code) args.max_model_len = getattr(config, "max_position_embeddings", 4096) print(f"Using max_model_len from config: {args.max_model_len}") @@ -73,7 +79,9 @@ def main(): tokenizer_id = args.tokenizer or args.model # Load processor for chat template - processor = AutoProcessor.from_pretrained(tokenizer_id, trust_remote_code=True) + processor = AutoProcessor.from_pretrained( + tokenizer_id, trust_remote_code=args.trust_remote_code 
+ ) # Text-only conversations conversations = [ @@ -106,7 +114,7 @@ def main(): tokenizer=tokenizer_id, tensor_parallel_size=args.tp, max_model_len=args.max_model_len, - trust_remote_code=True, + trust_remote_code=args.trust_remote_code, quantization=quantization, ) diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py index 782702703b..a2dae3fbe9 100644 --- a/modelopt/torch/quantization/model_quant.py +++ b/modelopt/torch/quantization/model_quant.py @@ -17,7 +17,6 @@ import fnmatch import inspect -import os import warnings from collections.abc import Callable, Iterable from typing import Any diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index 842a797afb..cd538111c6 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -606,8 +606,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b assert all(torch.is_tensor(data) or data is None for data in tensor_data.values()), ( "tensor_data values must be tensors" ) - # Get the batch size of current data - batch_size = tensor_data[next(iter(batch_data.keys()))].shape[0] + # Get the batch size from the first non-None tensor value + batch_size = next(v for v in tensor_data.values() if v is not None).shape[0] # If we know a smaller batch size works, preemptively split if max_working_batch_size is not None and batch_size > max_working_batch_size: diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 07deca7fc4..7691d65951 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -175,9 +175,10 @@ def collate_function(self, batch): class Qwen3OmniImageProcessor(BaseImageProcessor): """Image processor for Qwen3-Omni multimodal model.""" - def __init__(self, tokenizer, device="auto", use_audio_in_video=False): + def __init__(self, tokenizer, device="auto", dtype=None, 
use_audio_in_video=False): """Constructor.""" super().__init__(tokenizer, device) + self.dtype = dtype self.use_audio_in_video = use_audio_in_video # Try to import qwen_omni_utils for multimodal processing try: @@ -251,7 +252,8 @@ def collate_function(self, batch): """Collate function to process inputs during data loading.""" result = {} - # Take first item from batch (batch_size handling) + # Take first item only — multimodal inputs have variable-length sequences + # (images, audio) that cannot be stacked, so batch_size=1 is expected. first = batch[0] # Convert lists to tensors and move to device @@ -262,7 +264,10 @@ def collate_function(self, batch): # Handle pixel values for images if first.get("pixel_values") is not None: - result["pixel_values"] = torch.tensor(first["pixel_values"]).to(self.device) + pv = torch.tensor(first["pixel_values"]) + if self.dtype is not None: + pv = pv.to(self.dtype) + result["pixel_values"] = pv.to(self.device) # Handle image grid thw (tile height width info) if first.get("image_grid_thw") is not None: @@ -274,7 +279,10 @@ def collate_function(self, batch): self.device ) if first.get("audio_features") is not None: - result["audio_features"] = torch.tensor(first["audio_features"]).to(self.device) + af = torch.tensor(first["audio_features"]) + if self.dtype is not None: + af = af.to(self.dtype) + result["audio_features"] = af.to(self.device) # Handle video features if present if first.get("video_grid_thw") is not None: diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py index e022d7e24f..a48c29048b 100644 --- a/modelopt/torch/utils/video_dataset_utils.py +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -121,6 +121,7 @@ def get_video_dataset_dataloader( try: from datasets import Dataset + # weights_only=False is safe here: the cache file is self-generated at line 151 processed_samples = torch.load(cache_path, weights_only=False) processed_dataset = 
Dataset.from_list(processed_samples) print(f"Loaded processed dataset from cache: {cache_path}") @@ -282,7 +283,8 @@ def collate_function(self, batch): """Collate function to process inputs during data loading.""" result = {} - # Take first item from batch (batch_size handling) + # Take first item only — multimodal inputs have variable-length sequences + # (video frames, audio) that cannot be stacked, so batch_size=1 is expected. first = batch[0] # Convert lists to tensors and move to device From 9e3b3991c0a2150a387f581b4defa22051b1fb36 Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 18 Mar 2026 03:57:18 +0000 Subject: [PATCH 10/12] Update get_expert_linear_names Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/run_vllm.py | 1 + modelopt/torch/export/layer_utils.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/llm_ptq/run_vllm.py b/examples/llm_ptq/run_vllm.py index 3e69ab9de7..60cfcb2cd1 100644 --- a/examples/llm_ptq/run_vllm.py +++ b/examples/llm_ptq/run_vllm.py @@ -116,6 +116,7 @@ def main(): max_model_len=args.max_model_len, trust_remote_code=args.trust_remote_code, quantization=quantization, + enforce_eager=True, ) # Get sampling params from config, with CLI/defaults as fallback diff --git a/modelopt/torch/export/layer_utils.py b/modelopt/torch/export/layer_utils.py index 9a2cd4b2f0..641204d4f7 100755 --- a/modelopt/torch/export/layer_utils.py +++ b/modelopt/torch/export/layer_utils.py @@ -972,6 +972,7 @@ def module_match_name_list(module, name_list): "Qwen3MoeSparseMoeBlock", "Qwen3NextSparseMoeBlock", "Qwen3_5MoeSparseMoeBlock", + "Qwen3OmniMoeThinkerTextSparseMoeBlock", "DeepseekMoE", ], ): From 20a4b33d5c0a3459b03e35588386261ea52b877f Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Wed, 1 Apr 2026 20:12:16 +0000 Subject: [PATCH 11/12] Address comments Signed-off-by: ajrasane 
<131806219+ajrasane@users.noreply.github.com> --- examples/llm_ptq/example_utils.py | 78 ++------ examples/llm_ptq/hf_ptq.py | 63 ++++--- modelopt/torch/export/model_utils.py | 26 ++- modelopt/torch/export/unified_export_hf.py | 17 +- modelopt/torch/utils/dataset_utils.py | 130 ++++++------- modelopt/torch/utils/image_processor.py | 194 +++++++++++--------- modelopt/torch/utils/video_dataset_utils.py | 115 +++--------- 7 files changed, 288 insertions(+), 335 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index b9d3dd5f86..a39acf4c73 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -45,7 +45,7 @@ except ImportError: snapshot_download = None -from modelopt.torch.export.model_utils import MODEL_NAME_TO_TYPE +from modelopt.torch.export.model_utils import match_model_type_by_name from modelopt.torch.utils.dataset_utils import get_dataset_dataloader from modelopt.torch.utils.image_processor import ( BaseImageProcessor, @@ -95,19 +95,13 @@ def get_model_type_from_config(model_path: str) -> str | None: config = json.load(f) # Check architectures field first - architectures = config.get("architectures", []) - for arch in architectures: - for key, model_type in MODEL_NAME_TO_TYPE.items(): - if key.lower() in arch.lower(): - return model_type + for arch in config.get("architectures", []): + result = match_model_type_by_name(arch) + if result is not None: + return result # Fallback to model_type field - model_type_field = config.get("model_type", "") - for key, model_type in MODEL_NAME_TO_TYPE.items(): - if key.lower() in model_type_field.lower(): - return model_type - - return None + return match_model_type_by_name(config.get("model_type", "")) def get_sampling_params_from_config(model_path: str) -> dict: @@ -164,10 +158,13 @@ def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None: print(f"Copying missing tokenizer files from {source_model_id}...") # Download only 
tokenizer files from HF - cache_dir = snapshot_download( - source_model_id, - allow_patterns=TOKENIZER_FILES, - ) + if os.path.isdir(source_model_id): + cache_dir = source_model_id + else: + cache_dir = snapshot_download( + source_model_id, + allow_patterns=TOKENIZER_FILES, + ) for fname in TOKENIZER_FILES: src = os.path.join(cache_dir, fname) @@ -992,55 +989,6 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod print("No custom model files found to copy") -def patch_config_for_unified_export(model_type: str, export_path: str) -> None: - """Patch config files to add missing exclusion patterns for unified HF export. - - This function adds missing exclusion patterns for modules that should not be quantized - (e.g., audio tower, visual encoder, lm_head) to both hf_quant_config.json and config.json. - - Args: - export_path: Path to the exported model directory. - """ - if model_type == "qwen3omni": - missing_patterns = [ - "thinker.audio_tower*", - "thinker.visual*", - "thinker.lm_head", - ] - - # (filename, path_to_exclude_list) - configs = [ - ("hf_quant_config.json", ["quantization", "exclude_modules"]), - ("config.json", ["quantization_config", "ignore"]), - ] - - for filename, keys in configs: - filepath = os.path.join(export_path, filename) - if not os.path.exists(filepath): - continue - try: - with open(filepath) as f: - config = json.load(f) - - # Navigate to nested key - target = config - for key in keys[:-1]: - target = target.get(key, {}) - - exclude_list = target.get(keys[-1]) - if exclude_list is None: - continue - - added = [p for p in missing_patterns if p not in exclude_list] - if added: - exclude_list.extend(added) - with open(filepath, "w") as f: - json.dump(config, f, indent=2) - print(f"Patched {filename} with exclusions: {added}") - except Exception as e: - print(f"Warning: Failed to patch {filename}: {e}") - - def get_qwen3omni_dataloader( dataset_name: str | list[str] | None, processor: Qwen3OmniImageProcessor | 
None, diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 4091c54493..2d441f4b35 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -18,6 +18,7 @@ import random import time import warnings +from collections import namedtuple from typing import Any import numpy as np @@ -35,7 +36,6 @@ is_enc_dec, is_nemotron_vl, load_mtp_weights, - patch_config_for_unified_export, run_nemotron_vl_preview, ) from torch.utils.data import DataLoader @@ -735,9 +735,6 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) - # Exclude non-quantized modules in config.json and hf_quant_config.json - patch_config_for_unified_export(model_type, export_path) - # Restore default padding and export the tokenizer as well. if tokenizer is not None: tokenizer.padding_side = default_padding_side @@ -757,6 +754,23 @@ def export_quantized( ) +PreQuantizeResult = namedtuple( + "PreQuantizeResult", ["preview_input_ids", "generated_ids_before_ptq", "calib_batch"] +) + + +def _qwen3omni_generate(model, calib_batch): + """Run Qwen3Omni generate and unpack the result. + + Qwen3Omni returns a (text_ids, audio) tuple; text_ids may have a .sequences attribute. 
+ """ + result = model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) + if isinstance(result, tuple): + text_ids, _ = result + return text_ids.sequences if hasattr(text_ids, "sequences") else text_ids + return result + + def pre_quantize( args: argparse.Namespace, full_model: torch.nn.Module, @@ -799,20 +813,15 @@ def pre_quantize( allow_fallback=False, ) elif model_type == "qwen3omni": - # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - # Pass full batch with all multimodal inputs - result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) - if isinstance(result, tuple): - text_ids, _ = result - generated_ids_before_ptq = ( - text_ids.sequences if hasattr(text_ids, "sequences") else text_ids - ) - else: - generated_ids_before_ptq = result + # Use only a single sample for preview generation to avoid OOM + single_sample = { + k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items() + } + generated_ids_before_ptq = _qwen3omni_generate(full_model, single_sample) else: generated_ids_before_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) - return preview_input_ids, generated_ids_before_ptq, calib_batch + return PreQuantizeResult(preview_input_ids, generated_ids_before_ptq, calib_batch) def post_quantize( @@ -861,25 +870,23 @@ def post_quantize( """ if args.verbose: - mtq.print_quant_summary(full_model, save_path=args.quant_summary_path) - save_expert_token_count_table(full_model, args.export_path) + try: + mtq.print_quant_summary(full_model, save_path=args.quant_summary_path) + save_expert_token_count_table(full_model, args.export_path) + except Exception as e: + print(f"Warning: Failed to print quant summary: {e}") # Run some samples torch.cuda.empty_cache() generated_ids_after_ptq = None if generated_ids_before_ptq is None: pass - elif model_type == "qwen3omni": - # Qwen3Omni returns (text_ids, audio) tuple; text_ids has .sequences - # Pass full 
batch with all multimodal inputs - result = full_model.generate(**calib_batch, return_audio=False, thinker_max_new_tokens=100) - if isinstance(result, tuple): - text_ids, _ = result - generated_ids_after_ptq = ( - text_ids.sequences if hasattr(text_ids, "sequences") else text_ids - ) - else: - generated_ids_after_ptq = result + elif model_type == "qwen3omni" and calib_batch is not None: + # Use only a single sample for preview generation to avoid OOM + single_sample = { + k: v[0:1] if isinstance(v, torch.Tensor) else v for k, v in calib_batch.items() + } + generated_ids_after_ptq = _qwen3omni_generate(full_model, single_sample) elif model_type != "llama4" and not is_nemotron_vl_model: # Our fake quantizer may not be fully compatible with torch.compile. generated_ids_after_ptq = full_model.generate(preview_input_ids, max_new_tokens=100) diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 7501ed7bbc..17798d0837 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -67,17 +67,35 @@ {MODEL_NAME_TO_TYPE=} """ -__all__ = ["get_language_model_from_vl", "get_model_type", "is_multimodal_model"] +__all__ = [ + "get_language_model_from_vl", + "get_model_type", + "is_multimodal_model", + "match_model_type_by_name", +] -def get_model_type(model): - """Try get the model type from the model name. If not found, return None.""" +def match_model_type_by_name(name: str) -> str | None: + """Match a model type from MODEL_NAME_TO_TYPE by case-insensitive substring match. + + Args: + name: String to match against (e.g. class name, architecture string, model_type field). + + Returns: + Matched model type string, or None. + """ + name_lower = name.lower() for k, v in MODEL_NAME_TO_TYPE.items(): - if k.lower() in type(model).__name__.lower(): + if k.lower() in name_lower: return v return None +def get_model_type(model): + """Try get the model type from the model name. 
If not found, return None.""" + return match_model_type_by_name(type(model).__name__) + + def is_multimodal_model(model): """Check if a model is a Vision-Language Model (VLM) or multimodal model. diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index e8a1e06857..c60469a587 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -87,7 +87,7 @@ QUANTIZATION_W4A8_AWQ, QUANTIZATION_W4A8_NVFP4_FP8, ) -from .model_utils import get_language_model_from_vl, is_multimodal_model +from .model_utils import get_language_model_from_vl, get_model_type, is_multimodal_model from .plugins import SpeculativeDecodingExporter, has_spec_opt from .quant_utils import ( fuse_prequant_layernorm, @@ -781,6 +781,16 @@ def _export_transformers_checkpoint( exclude_modules.append(pattern) print(f"Adding MTP layer to quantization_config ignore: {pattern}") + # Add model-specific non-quantized module exclusions + _model_type_exclusions = { + "qwen3omni": ["thinker.audio_tower*", "thinker.visual*", "thinker.lm_head"], + } + model_type = get_model_type(model) + for pattern in _model_type_exclusions.get(model_type, []): + exclude_modules = quant_config["quantization"].setdefault("exclude_modules", []) + if pattern not in exclude_modules: + exclude_modules.append(pattern) + # Safety net: sync any gate/up weight quantizer amaxes that # requantize_resmooth_fused_llm_layers did not reach (e.g. experts not # activated during the dummy forward, or non-standard expert naming). @@ -1185,6 +1195,8 @@ def export_hf_checkpoint( # Fix generation_config conflicts before saving # Some models have temperature/top_p/top_k set but do_sample=False which causes validation errors + # Restore the original value after save to avoid mutating the caller's model. 
+ _gen_config_restore = None if hasattr(model, "generation_config") and model.generation_config is not None: gen_config = model.generation_config if not getattr(gen_config, "do_sample", True): @@ -1193,6 +1205,7 @@ def export_hf_checkpoint( getattr(gen_config, attr, None) is not None for attr in ["temperature", "top_p", "top_k"] ): + _gen_config_restore = gen_config.do_sample gen_config.do_sample = True # Save model @@ -1211,6 +1224,8 @@ def export_hf_checkpoint( ) finally: _unpatch_revert_weight_conversion(_patches) + if _gen_config_restore is not None: + model.generation_config.do_sample = _gen_config_restore original_config = f"{export_dir}/config.json" config_data = {} diff --git a/modelopt/torch/utils/dataset_utils.py b/modelopt/torch/utils/dataset_utils.py index cd538111c6..f5a64054fe 100644 --- a/modelopt/torch/utils/dataset_utils.py +++ b/modelopt/torch/utils/dataset_utils.py @@ -212,6 +212,44 @@ def _auto_preprocess_sample( ) +def _load_text_samples(dataset_name, num_samples, **kwargs): + """Normalize inputs and load raw text samples from one or more datasets. + + Args: + dataset_name: Single name or list of names. + num_samples: Single count or list of counts (must match dataset_name length). + **kwargs: Forwarded to get_dataset_samples(). + + Returns: + List of raw text strings. 
+ """ + if isinstance(num_samples, int): + num_samples = [num_samples] + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + assert len(dataset_name) == len(num_samples), ( + "dataset_name and num_samples must be the same length" + ) + all_samples = [] + for ds_name, num_sample in zip(dataset_name, num_samples): + samples = get_dataset_samples(ds_name, num_sample, **kwargs) + all_samples.extend(samples) + return all_samples + + +class _ListDataset(torch.utils.data.Dataset): + """Simple dataset wrapping a list of dicts.""" + + def __init__(self, samples): + self.samples = samples + + def __getitem__(self, idx): + return self.samples[idx] + + def __len__(self): + return len(self.samples) + + def get_qwen3omni_text_dataloader( dataset_name: str | list[str] = "cnn_dailymail", processor=None, @@ -236,59 +274,25 @@ def get_qwen3omni_text_dataloader( """ assert processor is not None, "Please provide a Qwen3OmniTextProcessor." - if isinstance(num_samples, int): - num_samples = [num_samples] - - if isinstance(dataset_name, str): - dataset_name = [dataset_name] - - assert len(dataset_name) == len(num_samples), ( - "dataset_name and num_samples must be the same length" - ) + all_samples = _load_text_samples(dataset_name, num_samples) - # Get raw text samples - all_samples = [] - for ds_name, num_sample in zip(dataset_name, num_samples): - samples = get_dataset_samples(ds_name, num_sample) - all_samples.extend(samples) + # Preprocess each sample with the conversation template and convert to lists + from .image_processor import _Qwen3OmniProcessorMixin - # Preprocess each sample with the conversation template processed_samples = [] for text in all_samples: - # Apply conversation template and tokenize values = processor.preprocess_function(text) + processed_samples.append( + _Qwen3OmniProcessorMixin._serialize_for_arrow(values, list(values.keys())) + ) - # Convert to lists for dataset compatibility - sample_dict = {} - for key, val in values.items(): - if val is 
not None and hasattr(val, "tolist"): - sample_dict[key] = val.tolist() - elif val is not None: - sample_dict[key] = val - processed_samples.append(sample_dict) - - # Create dataset - class _Qwen3OmniTextDataset(torch.utils.data.Dataset): - def __init__(self, samples): - self.samples = samples - - def __getitem__(self, idx): - return self.samples[idx] - - def __len__(self): - return len(self.samples) - - dataset = _Qwen3OmniTextDataset(processed_samples) - - calib_dataloader = DataLoader( - dataset, + return DataLoader( + _ListDataset(processed_samples), batch_size=batch_size, shuffle=False, collate_fn=processor.collate_function, ) - return calib_dataloader - def get_dataset_samples( dataset_name: str, @@ -446,23 +450,13 @@ def get_dataset_dataloader( "Tokenizer with the right padding_side may impact calibration accuracy. Recommend set to left" ) - if isinstance(num_samples, int): - num_samples = [num_samples] - - if isinstance(dataset_name, str): - dataset_name = [dataset_name] - - assert len(dataset_name) == len(num_samples), ( - "dataset_name and num_samples must be the same length" + all_samples = _load_text_samples( + dataset_name, + num_samples, + apply_chat_template=apply_chat_template, + tokenizer=tokenizer, ) - all_samples = [] - for ds_name, num_sample in zip(dataset_name, num_samples): - samples = get_dataset_samples( - ds_name, num_sample, apply_chat_template=apply_chat_template, tokenizer=tokenizer - ) - all_samples.extend(samples) - batch_encoded = tokenizer( all_samples, return_tensors="pt", @@ -531,7 +525,7 @@ def _get_free_gpu_mem(): torch.cuda.empty_cache() free_mem_before, max_allocated_before = _get_free_gpu_mem() - use_generate = model_type_is_enc_dec(model) + use_generate = _should_use_generate(model) infer_method = model.generate if use_generate else model.forward if sample_input_single_batch is None: @@ -587,7 +581,7 @@ def _get_free_gpu_mem(): return 512 -def _process_batch(batch_data, infer_method, generation_kwargs={}, 
max_working_batch_size=None): +def _process_batch(batch_data, infer_method, generation_kwargs=None, max_working_batch_size=None): """Process a batch of data through the model's inference method. Args: @@ -599,6 +593,8 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b Returns: The maximum batch size that worked successfully """ + if generation_kwargs is None: + generation_kwargs = {} # Separate tensor values from scalar parameters (like max_new_tokens) tensor_data = {k: v for k, v in batch_data.items() if torch.is_tensor(v) or v is None} scalar_data = {k: v for k, v in batch_data.items() if not torch.is_tensor(v) and v is not None} @@ -663,7 +659,7 @@ def _process_batch(batch_data, infer_method, generation_kwargs={}, max_working_b def _forward_loop( - model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict = {} + model: torch.nn.Module, dataloader: DataLoader, generation_kwargs: dict | None = None ) -> None: """Runs forward passes through the model using data from the dataloader. @@ -672,9 +668,10 @@ def _forward_loop( dataloader: DataLoader containing the batched input data generation_kwargs: Keyword arguments to pass to the model.generate() method. """ + if generation_kwargs is None: + generation_kwargs = {} with torch.no_grad(): - # use_generate = _should_use_generate(model) - use_generate = model_type_is_enc_dec(model) + use_generate = _should_use_generate(model) infer_method = model.generate if use_generate else model.forward max_working_batch_size = None # Initialize max working batch size as None @@ -695,7 +692,7 @@ def create_forward_loop( device: str | None = None, include_labels: bool = False, dataloader: DataLoader | None = None, - generation_kwargs: dict = {}, + generation_kwargs: dict | None = None, ) -> Callable: """Creates and returns a forward loop function configured for a specific model, dataset, and tokenizer. 
@@ -737,6 +734,8 @@ def create_forward_loop( A forward loop function that can be called with no arguments. When called, this function iterates over the dataset specified by `dataset_name`. """ + if generation_kwargs is None: + generation_kwargs = {} if dataloader is None: if batch_size == 0: # We let the system to determine the max data batch for each forward. @@ -860,4 +859,7 @@ def _should_use_generate(model): """ generate_model_list = ["qwen3omni"] model_name = model.__class__.__name__.lower() - return model_type_is_enc_dec(model) or any(name in model_name for name in generate_model_list) + needs_generate = model_type_is_enc_dec(model) or any( + name in model_name for name in generate_model_list + ) + return needs_generate and hasattr(model, "generate") diff --git a/modelopt/torch/utils/image_processor.py b/modelopt/torch/utils/image_processor.py index 7691d65951..2f226e41c5 100644 --- a/modelopt/torch/utils/image_processor.py +++ b/modelopt/torch/utils/image_processor.py @@ -16,6 +16,8 @@ # Adapted from tensorrt_llm/quantization/image_processing.py """Utility classes for image processing.""" +from typing import Any + import torch @@ -39,6 +41,33 @@ def collate_function(self, examples): """Collate function to process images during data loading.""" raise NotImplementedError("Each image processor must implement its own collate method") + def _collate_first_item(self, batch, long_keys=(), float_keys=(), dtype=None): + """Shared collate helper: validates batch_size=1, converts lists to tensors. + + Args: + batch: List of sample dicts from the DataLoader. + long_keys: Keys to convert via torch.LongTensor. + float_keys: Keys to convert via torch.tensor with optional dtype cast. + dtype: Optional dtype for float_keys tensors. + + Returns: + Dict of tensors moved to self.device. 
+ """ + if len(batch) != 1: + raise ValueError(f"{type(self).__name__} currently supports batch_size=1 only.") + first = batch[0] + result = {} + for key in long_keys: + if first.get(key) is not None: + result[key] = torch.LongTensor(first[key]).to(self.device) + for key in float_keys: + if first.get(key) is not None: + t = torch.tensor(first[key]) + if dtype is not None: + t = t.to(dtype) + result[key] = t.to(self.device) + return result + # A light Encapsulation for Huggingface MllamaImageProcessor @@ -161,20 +190,77 @@ def preprocess_function(self, text: str) -> dict: def collate_function(self, batch): """Collate function to process text inputs during data loading.""" - result = {} - first = batch[0] + return self._collate_first_item( + batch, + long_keys=("input_ids", "attention_mask"), + ) + + +class _Qwen3OmniProcessorMixin: + """Shared preprocessing logic for Qwen3-Omni image/video processors.""" + + tokenizer: Any + process_mm_info: Any + use_audio_in_video: Any - if "input_ids" in first and first["input_ids"] is not None: - result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) - if "attention_mask" in first and first["attention_mask"] is not None: - result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) + def _tokenize_conversation(self, conversation): + """Tokenize a Qwen3-Omni conversation and return processor outputs. + Args: + conversation: List of conversation dicts in Qwen format. + + Returns: + Processor output dict with tensors. 
+ """ + text = self.tokenizer.apply_chat_template( + conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False + ) + audios, images, videos = self.process_mm_info( + conversation, use_audio_in_video=self.use_audio_in_video + ) + return self.tokenizer( + text=text, + audio=audios, + images=images, + videos=videos, + return_tensors="pt", + padding=True, + use_audio_in_video=self.use_audio_in_video, + ) + + @staticmethod + def _serialize_for_arrow(values, all_keys): + """Convert processor outputs to lists for Arrow serialization. + + Args: + values: Processor output dict (may contain tensors). + all_keys: List of keys to include in the result (ensures consistent schema). + + Returns: + Dict with all_keys initialized to None, populated from values. + """ + result = dict.fromkeys(all_keys) + for key, val in values.items(): + if val is not None and hasattr(val, "tolist"): + result[key] = val.tolist() + elif val is not None: + result[key] = val return result -class Qwen3OmniImageProcessor(BaseImageProcessor): +class Qwen3OmniImageProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor): """Image processor for Qwen3-Omni multimodal model.""" + _ALL_KEYS = [ + "input_ids", + "attention_mask", + "pixel_values", + "image_grid_thw", + "audio_features", + "audio_feature_lens", + "video_grid_thw", + ] + def __init__(self, tokenizer, device="auto", dtype=None, use_audio_in_video=False): """Constructor.""" super().__init__(tokenizer, device) @@ -206,86 +292,20 @@ def preprocess_function(self, examples): content.append({"type": "text", "text": question}) conversation = [{"role": "user", "content": content}] - text = self.tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False - ) - - # Extract multimodal info using qwen_omni_utils - audios, images, videos = self.process_mm_info( - conversation, use_audio_in_video=self.use_audio_in_video - ) - - # Process inputs with the processor - values = 
self.tokenizer( - text=text, - audio=audios, - images=images, - videos=videos, - return_tensors="pt", - padding=True, - use_audio_in_video=self.use_audio_in_video, - ) - - # Define all possible keys to ensure consistent schema for Arrow serialization - all_keys = [ - "input_ids", - "attention_mask", - "pixel_values", - "image_grid_thw", - "audio_features", - "audio_feature_lens", - "video_grid_thw", - ] - - # Convert tensors to lists for Arrow serialization compatibility - # Tensor conversion back happens in collate_function - result = dict.fromkeys(all_keys) # Initialize all keys to None - for key, val in values.items(): - if val is not None and hasattr(val, "tolist"): - result[key] = val.tolist() - elif val is not None: - result[key] = val - - return result + values = self._tokenize_conversation(conversation) + return self._serialize_for_arrow(values, self._ALL_KEYS) def collate_function(self, batch): """Collate function to process inputs during data loading.""" - result = {} - - # Take first item only — multimodal inputs have variable-length sequences - # (images, audio) that cannot be stacked, so batch_size=1 is expected. 
- first = batch[0] - - # Convert lists to tensors and move to device - if "input_ids" in first and first["input_ids"] is not None: - result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) - if "attention_mask" in first and first["attention_mask"] is not None: - result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) - - # Handle pixel values for images - if first.get("pixel_values") is not None: - pv = torch.tensor(first["pixel_values"]) - if self.dtype is not None: - pv = pv.to(self.dtype) - result["pixel_values"] = pv.to(self.device) - - # Handle image grid thw (tile height width info) - if first.get("image_grid_thw") is not None: - result["image_grid_thw"] = torch.LongTensor(first["image_grid_thw"]).to(self.device) - - # Handle audio features if present - if first.get("audio_feature_lens") is not None: - result["audio_feature_lens"] = torch.LongTensor(first["audio_feature_lens"]).to( - self.device - ) - if first.get("audio_features") is not None: - af = torch.tensor(first["audio_features"]) - if self.dtype is not None: - af = af.to(self.dtype) - result["audio_features"] = af.to(self.device) - - # Handle video features if present - if first.get("video_grid_thw") is not None: - result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) - - return result + return self._collate_first_item( + batch, + long_keys=( + "input_ids", + "attention_mask", + "image_grid_thw", + "audio_feature_lens", + "video_grid_thw", + ), + float_keys=("pixel_values", "audio_features"), + dtype=self.dtype, + ) diff --git a/modelopt/torch/utils/video_dataset_utils.py b/modelopt/torch/utils/video_dataset_utils.py index a48c29048b..d8b02b7ee1 100644 --- a/modelopt/torch/utils/video_dataset_utils.py +++ b/modelopt/torch/utils/video_dataset_utils.py @@ -22,7 +22,7 @@ import torch from torch.utils.data import DataLoader -from .image_processor import BaseImageProcessor +from .image_processor import BaseImageProcessor, 
_Qwen3OmniProcessorMixin # Use dict to store the config for each dataset. SUPPORTED_VIDEO_DATASET_CONFIG: dict[str, dict[str, Any]] = { @@ -161,7 +161,7 @@ def get_video_dataset_dataloader( ) -class Qwen3OmniVideoProcessor(BaseImageProcessor): +class Qwen3OmniVideoProcessor(_Qwen3OmniProcessorMixin, BaseImageProcessor): """Video processor for Qwen3-Omni multimodal model with finevideo dataset support.""" def __init__(self, tokenizer, device="cuda", dtype=None, use_audio_in_video=True): @@ -204,6 +204,16 @@ def _save_video_bytes_to_file(self, video_bytes: bytes) -> str: f.write(video_bytes) return video_path + _ALL_KEYS = [ + "input_ids", + "attention_mask", + "pixel_values_videos", + "video_grid_thw", + "video_second_per_grid", + "feature_attention_mask", + "input_features", + ] + def preprocess_function(self, examples): """Preprocess function for Qwen3-Omni with video support. @@ -212,7 +222,6 @@ def preprocess_function(self, examples): # Get question/prompt - finevideo has metadata in 'json' field if "json" in examples and examples["json"] is not None: metadata = examples["json"] - # Try to get a meaningful question from metadata category = metadata.get("content_fine_category", "") question = ( f"Describe what is happening in this video in detail. 
Category hint: {category}" @@ -226,10 +235,8 @@ def preprocess_function(self, examples): # Handle video - check for raw bytes (finevideo format) or path video_path = None if examples.get("mp4") is not None: - # finevideo format: raw video bytes in 'mp4' field video_path = self._save_video_bytes_to_file(examples["mp4"]) elif examples.get("video") is not None: - # Standard format: video path or URL video_path = examples["video"] if video_path is not None: @@ -238,92 +245,24 @@ def preprocess_function(self, examples): content.append({"type": "text", "text": question}) conversation = [{"role": "user", "content": content}] - text = self.tokenizer.apply_chat_template( - conversation, add_generation_prompt=True, tokenize=False, enable_thinking=False - ) - - # Extract multimodal info using qwen_omni_utils - audios, images, videos = self.process_mm_info( - conversation, use_audio_in_video=self.use_audio_in_video - ) - - # Process inputs with the processor - values = self.tokenizer( - text=text, - audio=audios, - images=images, - videos=videos, - return_tensors="pt", - padding=True, - use_audio_in_video=self.use_audio_in_video, - ) - # Define all possible keys to ensure consistent schema for Arrow serialization - all_keys = [ - "input_ids", - "attention_mask", - "pixel_values_videos", - "video_grid_thw", - "video_second_per_grid", - "feature_attention_mask", - "input_features", - ] - - # Convert tensors to lists for Arrow serialization compatibility - # Tensor conversion back happens in collate_function - result = dict.fromkeys(all_keys) # Initialize all keys to None - for key, val in values.items(): - if val is not None and hasattr(val, "tolist"): - result[key] = val.tolist() - elif val is not None: - result[key] = val - - return result + values = self._tokenize_conversation(conversation) + return self._serialize_for_arrow(values, self._ALL_KEYS) def collate_function(self, batch): """Collate function to process inputs during data loading.""" - result = {} - - # Take first 
item only — multimodal inputs have variable-length sequences - # (video frames, audio) that cannot be stacked, so batch_size=1 is expected. - first = batch[0] - - # Convert lists to tensors and move to device - if first.get("input_ids") is not None: - result["input_ids"] = torch.LongTensor(first["input_ids"]).to(self.device) - if first.get("attention_mask") is not None: - result["attention_mask"] = torch.LongTensor(first["attention_mask"]).to(self.device) - - # Handle pixel values for video frames - if first.get("pixel_values_videos") is not None: - pv = torch.tensor(first["pixel_values_videos"]) - if self.dtype is not None: - pv = pv.to(self.dtype) - result["pixel_values_videos"] = pv.to(self.device) - - # Handle video grid thw (tile height width info) - if first.get("video_grid_thw") is not None: - result["video_grid_thw"] = torch.LongTensor(first["video_grid_thw"]).to(self.device) - - # Handle video second per grid (temporal info for rope) - if first.get("video_second_per_grid") is not None: - result["video_second_per_grid"] = torch.tensor(first["video_second_per_grid"]).to( - self.device - ) - - # Handle audio features if present - if first.get("feature_attention_mask") is not None: - result["feature_attention_mask"] = torch.LongTensor(first["feature_attention_mask"]).to( - self.device - ) - if first.get("input_features") is not None: - inp_feat = torch.tensor(first["input_features"]) - if self.dtype is not None: - inp_feat = inp_feat.to(self.dtype) - result["input_features"] = inp_feat.to(self.device) - + result = self._collate_first_item( + batch, + long_keys=( + "input_ids", + "attention_mask", + "video_grid_thw", + "feature_attention_mask", + ), + float_keys=("pixel_values_videos", "video_second_per_grid", "input_features"), + dtype=self.dtype, + ) # Pass use_audio_in_video flag to model.generate() for Qwen3Omni result["use_audio_in_video"] = self.use_audio_in_video - return result def cleanup(self): @@ -332,3 +271,7 @@ def cleanup(self): if 
os.path.exists(self._temp_dir): shutil.rmtree(self._temp_dir) + + def __del__(self): + """Ensure temporary files are cleaned up when the processor is garbage collected.""" + self.cleanup() From ee95177dbcf0018ab0ef10952d9c90cf6cd8bcbc Mon Sep 17 00:00:00 2001 From: ajrasane <131806219+ajrasane@users.noreply.github.com> Date: Thu, 2 Apr 2026 00:04:44 +0000 Subject: [PATCH 12/12] Remove manual registration of sparse moe block Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com> --- modelopt/torch/quantization/plugins/huggingface.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/modelopt/torch/quantization/plugins/huggingface.py b/modelopt/torch/quantization/plugins/huggingface.py index 0a451e8ccb..0d02716a6e 100644 --- a/modelopt/torch/quantization/plugins/huggingface.py +++ b/modelopt/torch/quantization/plugins/huggingface.py @@ -1180,19 +1180,6 @@ def unpack_weight(self): pass -try: - from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( - Qwen3OmniMoeThinkerTextSparseMoeBlock, - ) - - if Qwen3OmniMoeThinkerTextSparseMoeBlock not in QuantModuleRegistry: - QuantModuleRegistry.register( - {Qwen3OmniMoeThinkerTextSparseMoeBlock: "hf.Qwen3OmniMoeThinkerTextSparseMoeBlock"} - )(_QuantSparseMoe) -except ImportError: - pass - - class _QuantGptOssExperts(_QuantFunctionalMixin): """Quantized wrapper for `transformers.GptOssExperts`.