From 9aef8905a8417deb5d386c860ae1f975cb44f530 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 10:31:29 +0800 Subject: [PATCH 01/27] add Qwen3OmniMoe --- gptqmodel/models/auto.py | 2 ++ gptqmodel/models/definitions/__init__.py | 1 + .../models/definitions/qwen3_omni_moe.py | 31 +++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 gptqmodel/models/definitions/qwen3_omni_moe.py diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py index da8c0729a..397f67caa 100644 --- a/gptqmodel/models/auto.py +++ b/gptqmodel/models/auto.py @@ -116,6 +116,7 @@ from .definitions.starcoder2 import Starcoder2QModel # noqa: E402 from .definitions.telechat2 import TeleChat2QModel from .definitions.xverse import XverseQModel # noqa: E402 +from .definitions.qwen3_omni_moe import Qwen3OmniMoeGPTQ # make quants and inference more determinisitc torch.manual_seed(787) @@ -180,6 +181,7 @@ "qwen2_vl": Qwen2VLQModel, "qwen2_5_vl": Qwen2_5_VLQModel, "qwen2_5_omni": Qwen2_5_OmniGPTQ, + "qwen3_omni_moe": Qwen3OmniMoeGPTQ, "dbrx": DbrxQModel, "dbrx_converted": DbrxConvertedQModel, "deepseek_v2": DeepSeekV2QModel, diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py index 64d608f67..2a9459aa4 100644 --- a/gptqmodel/models/definitions/__init__.py +++ b/gptqmodel/models/definitions/__init__.py @@ -63,3 +63,4 @@ from .klear import KlearQModel from .llava_qwen2 import LlavaQwen2QModel from .nemotron_h import NemotronHQModel +from .qwen3_omni_moe import Qwen3OmniMoeGPTQ \ No newline at end of file diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py new file mode 100644 index 000000000..527b964cc --- /dev/null +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai +# SPDX-License-Identifier: Apache-2.0 +# Contact: qubitium@modelcloud.ai, x.com/qubitium + +from transformers import AutoModelForTextToWaveform +from ..base import BaseQModel + + +class Qwen3OmniMoeGPTQ(BaseQModel): + loader = AutoModelForTextToWaveform + + pre_lm_head_norm_module = "thinker.model.norm" + + module_tree = [ + "thinker", + "model", + "layers", + "#", + { + "input_layernorm": ("input_layernorm:!",), + "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"), + "post_attention_layernorm": ("post_attention_layernorm:!",), + "mlp": { + "gate": ("gate",), + "experts": { + "#": ("gate_proj:0", "up_proj:0", "down_proj:1"), + }, + }, + } + ] \ No newline at end of file From dd1d400e341b7e61acf4bf8bd6a000c022ef2f28 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 13:40:58 +0800 Subject: [PATCH 02/27] Fixing now: if offload_to_disk=False, we should not load the model into meta first. 
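In short: when quantize_config.offload_to_disk is False we now load real weights
directly and skip the meta "shell" + "turtle" pair entirely. A minimal sketch of
the intended branching (illustrative only; it reuses the build_shell_model helper
and cls.loader.from_pretrained call that appear in the diff below, and the
standalone function wrapper is hypothetical):

    def load_for_quant(cls, model_local_path, config, quantize_config, **model_init_kwargs):
        if quantize_config.offload_to_disk:
            # meta-device "shell" model plus a low-memory "turtle" model whose
            # weights are materialized per submodule on demand
            model = build_shell_model(cls.loader, config=config, **model_init_kwargs)
            turtle_model = cls.loader.from_pretrained(
                model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs
            )
        else:
            # load weights directly to CPU; there is no turtle model, so callers
            # such as shell_module_materialize must tolerate turtle_model=None
            model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs)
            turtle_model = None
        return model, turtle_model

The base.py hunk below covers the second case: shell_module_materialize returns the
target submodule as-is when no turtle model exists.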
--- gptqmodel/models/base.py | 3 +++ gptqmodel/models/loader.py | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 4ee513de5..9fa2776c1 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -1046,6 +1046,9 @@ def shell_module_materialize( device: torch.device, non_blocking: bool = False, ) -> torch.nn.Module: + if self.turtle_model is None: + return target_submodule + module = alias_from_turtle_for_submodule( target_model=self.model, turtle_model=self.turtle_model, diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 90993d7c1..73e3e9117 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -180,14 +180,21 @@ def skip(*args, **kwargs): cls.before_model_load(cls, load_quantized_model=False) from ..utils.hf import build_shell_model - #model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs) - print("shell model-----------") - model = build_shell_model(cls.loader, config=config, **model_init_kwargs) - model._model_init_kwargs = model_init_kwargs - - print_module_tree(model=model) - # enable mmap with low_cpu_mem_usage - turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) + if quantize_config.offload_to_disk: + print("shell model-----------") + model = build_shell_model(cls.loader, config=config, **model_init_kwargs) + model._model_init_kwargs = model_init_kwargs + print_module_tree(model=model) + + # enable mmap with low_cpu_mem_usage + turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) + else: + print("loading model directly to CPU (not using meta device or turtle_model)-----------") + model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs) + model._model_init_kwargs = model_init_kwargs + print_module_tree(model=model) + + turtle_model = None # TODO FIX ME...temp store model_init args turtle_model._model_init_kwargs = model_init_kwargs From 964a92f0d38541960dbd057e57f3d949bc1f9518 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 13:42:09 +0800 Subject: [PATCH 03/27] cleanup --- gptqmodel/models/loader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 73e3e9117..39db57469 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -196,10 +196,10 @@ def skip(*args, **kwargs): turtle_model = None - # TODO FIX ME...temp store model_init args - turtle_model._model_init_kwargs = model_init_kwargs - # print("actual turtle model-----------") - # print_module_tree(model=turtle_model) + # TODO FIX ME...temp store model_init args + turtle_model._model_init_kwargs = model_init_kwargs + # print("actual turtle model-----------") + # print_module_tree(model=turtle_model) model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions", "multimodal_max_length"] From f60ae0c4c4b6fa91ca1eb4b3d25c24f0a7e3f9fd Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 13:43:56 +0800 Subject: [PATCH 04/27] fix nonetype --- gptqmodel/models/loader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 39db57469..84b1a24ac 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py 
@@ -188,6 +188,11 @@ def skip(*args, **kwargs): # enable mmap with low_cpu_mem_usage turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) + + # TODO FIX ME...temp store model_init args + turtle_model._model_init_kwargs = model_init_kwargs + # print("actual turtle model-----------") + # print_module_tree(model=turtle_model) else: print("loading model directly to CPU (not using meta device or turtle_model)-----------") model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs) @@ -196,11 +201,6 @@ def skip(*args, **kwargs): turtle_model = None - # TODO FIX ME...temp store model_init args - turtle_model._model_init_kwargs = model_init_kwargs - # print("actual turtle model-----------") - # print_module_tree(model=turtle_model) - model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions", "multimodal_max_length"] config_seq_len = find_config_seq_len(model_config, seq_len_keys) From 5864c49474fc825b0e991eaf8840e0c8cda77ece Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 13:45:34 +0800 Subject: [PATCH 05/27] fix none --- gptqmodel/models/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 84b1a24ac..5ced21725 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -211,7 +211,7 @@ def skip(*args, **kwargs): model.seqlen = 4096 model.eval() - turtle_model.eval() + turtle_model.eval() if turtle_model is not None else None tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id_or_path, trust_remote_code=trust_remote_code) From 7d54a92d9f70b398b6997acbefb91f6aef8a5157 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:05:00 +0800 Subject: [PATCH 06/27] add qwen3 omine moe support --- gptqmodel/looper/module_looper.py | 6 ++--- gptqmodel/models/base.py | 2 ++ .../models/definitions/qwen3_omni_moe.py | 22 +++++++++++++++++-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 517d065fb..7442f5841 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -109,7 +109,7 @@ def store_input_hook(module, args, kwargs): # Keyword arguments. # TODO FIX ME..why is Qwen2_5OmniDecoderLayer harded here? 
- if kwargs.get("attention_mask") is not None and str(type(module)) != "": + if kwargs.get("attention_mask") is not None and module.__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: attention_masks.append(kwargs["attention_mask"].to(device=data_device)) else: attention_masks.append(None) @@ -160,7 +160,7 @@ def store_input_hook(module, args, kwargs): for example in calibration_data: for k, v in example.items(): - if str(type(layers[0])) == "": + if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: data_device = self.gptq_model.quantize_config.device else: data_device = self.gptq_model.quantize_config.device if k == "pixel_values" else cur_layer_device @@ -175,7 +175,7 @@ def store_input_hook(module, args, kwargs): v = v.unsqueeze(0) example[k] = move_to(v, device=data_device) try: - if str(type(layers[0])) == "": + if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: self.gptq_model.model.generate(**example, return_audio=False) else: self.gptq_model.model(**example, use_cache=use_cache) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 9fa2776c1..01f0208dc 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -275,6 +275,8 @@ def build_moe_modules_if_need(cls, model_config, layer_modules, is_awq_quantize: def get_num_experts(cls, model_config): if hasattr(model_config, "text_config"): num_experts = getattr(model_config.text_config, cls.dynamic_expert_index) + elif hasattr(model_config, "thinker_config"): + num_experts = getattr(model_config.thinker_config.text_config, cls.dynamic_expert_index) else: num_experts = getattr(model_config, cls.dynamic_expert_index) return num_experts diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index 527b964cc..b40e0c607 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -5,11 +5,13 @@ from transformers import AutoModelForTextToWaveform from ..base import BaseQModel - +from .._const import CPU class Qwen3OmniMoeGPTQ(BaseQModel): loader = AutoModelForTextToWaveform + dynamic_expert_index = "num_experts" + pre_lm_head_norm_module = "thinker.model.norm" module_tree = [ @@ -28,4 +30,20 @@ class Qwen3OmniMoeGPTQ(BaseQModel): }, }, } - ] \ No newline at end of file + ] + + def pre_quantize_generate_hook_start(self): + self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(self.quantize_config.device) + self.model.thinker.visual = self.model.thinker.visual.to(self.quantize_config.device) + self.model.thinker.audio_tower = self.model.thinker.audio_tower.to(self.quantize_config.device) + + self.model.thinker.visual.rotary_pos_emb = self.model.thinker.visual.rotary_pos_emb.to(self.quantize_config.device) + self.model.thinker.model.rotary_emb = self.model.thinker.model.rotary_emb.to(self.quantize_config.device) + + def pre_quantize_generate_hook_end(self): + self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(CPU) + self.model.thinker.visual = self.model.thinker.visual.to(CPU) + self.model.thinker.audio_tower = self.model.thinker.audio_tower.to(CPU) + + self.model.thinker.visual.rotary_pos_emb = self.model.thinker.visual.rotary_pos_emb.to(CPU) + self.model.thinker.model.rotary_emb = self.model.thinker.model.rotary_emb.to(CPU) \ No newline at end of file From e42a21db10adf0ffac7ec66764d4ad741951e590 Mon Sep 17 
00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:14:38 +0800 Subject: [PATCH 07/27] require attention_mask to be int type if model decoder layer type is Qwen2_5OmniDecoderLayer or Qwen3OmniMoeThinkerTextDecoderLayer --- gptqmodel/looper/module_looper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 7442f5841..536df3a9e 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -176,6 +176,9 @@ def store_input_hook(module, args, kwargs): example[k] = move_to(v, device=data_device) try: if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: + # require attention_mask to be int type if model decoder layer type is Qwen2_5OmniDecoderLayer or Qwen3OmniMoeThinkerTextDecoderLayer + if example["attention_mask"] is not None: + example["attention_mask"] = example["attention_mask"].int() self.gptq_model.model.generate(**example, return_audio=False) else: self.gptq_model.model(**example, use_cache=use_cache) From 37165ba5c2c5258077f03d337b7b46d25bf5adc5 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:15:30 +0800 Subject: [PATCH 08/27] Revert "require attention_mask to be int type if model decoder layer type is Qwen2_5OmniDecoderLayer or Qwen3OmniMoeThinkerTextDecoderLayer" This reverts commit e42a21db10adf0ffac7ec66764d4ad741951e590. --- gptqmodel/looper/module_looper.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 536df3a9e..7442f5841 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -176,9 +176,6 @@ def store_input_hook(module, args, kwargs): example[k] = move_to(v, device=data_device) try: if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: - # require attention_mask to be int type if model decoder layer type is Qwen2_5OmniDecoderLayer or Qwen3OmniMoeThinkerTextDecoderLayer - if example["attention_mask"] is not None: - example["attention_mask"] = example["attention_mask"].int() self.gptq_model.model.generate(**example, return_audio=False) else: self.gptq_model.model(**example, use_cache=use_cache) From f20f37d84c6973b4705294b60de13882dbef31eb Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:20:59 +0800 Subject: [PATCH 09/27] for compatibility, attention_mask should be of type long. 
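Concretely, the collate path now keeps attention_mask as 0/1 integers instead of
booleans. A minimal illustration (the example tensor is hypothetical, mirroring the
dtype change in the diff below):

    import torch

    # 1 = real token, 0 = right-padding; kept as torch.long for compatibility
    mask = torch.as_tensor([1, 1, 1, 0], dtype=torch.long)
    assert mask.dtype is torch.long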
--- gptqmodel/utils/data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gptqmodel/utils/data.py b/gptqmodel/utils/data.py index 7e30f0ae4..2f275a983 100644 --- a/gptqmodel/utils/data.py +++ b/gptqmodel/utils/data.py @@ -167,7 +167,7 @@ def collate_data(batch: List[Dict[str, List[List[int]]]], pad_token_id: int) -> for r in range(len(ids_list)): ids = torch.as_tensor(ids_list[r], dtype=torch.long) # make mask boolean immediately - msk = torch.as_tensor(msk_list[r], dtype=torch.bool) + msk = torch.as_tensor(msk_list[r], dtype=torch.long) if ids.numel() != msk.numel(): raise ValueError("Row has mismatched lengths between input_ids and attention_mask") @@ -193,11 +193,11 @@ def right_pad(row: torch.Tensor, pad_value, dtype=None) -> torch.Tensor: padded_ids = [right_pad(t, pad_token_id, dtype=torch.long) for t in rows_ids] # pad masks with False, not 0 - padded_msk = [right_pad(t, False, dtype=torch.bool) for t in rows_mask] + padded_msk = [right_pad(t, 0, dtype=torch.long) for t in rows_mask] # Stack into [total_rows_in_batch, max_len] input_ids = torch.stack(padded_ids, dim=0) if padded_ids else torch.empty((0, 0), dtype=torch.long) - attention_mask = torch.stack(padded_msk, dim=0) if padded_msk else torch.empty((0, 0), dtype=torch.bool) + attention_mask = torch.stack(padded_msk, dim=0) if padded_msk else torch.empty((0, 0), dtype=torch.long) return { "input_ids": input_ids, From f7cd5ae628d1574db1ce7e14302d4ad3bdb199a0 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:35:46 +0800 Subject: [PATCH 10/27] add support_offload_to_disk --- gptqmodel/models/base.py | 2 ++ gptqmodel/models/definitions/opt.py | 2 ++ gptqmodel/models/definitions/qwen3_omni_moe.py | 2 ++ gptqmodel/models/loader.py | 4 ++++ 4 files changed, 10 insertions(+) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 01f0208dc..8b47d3caa 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -149,6 +149,8 @@ class BaseQModel(nn.Module): support_batch_quantize = True + support_offload_to_disk = True + def __init__( self, model: PreTrainedModel, diff --git a/gptqmodel/models/definitions/opt.py b/gptqmodel/models/definitions/opt.py index 36d443a4b..b829e223d 100644 --- a/gptqmodel/models/definitions/opt.py +++ b/gptqmodel/models/definitions/opt.py @@ -13,6 +13,8 @@ class OptQModel(BaseQModel): pre_lm_head_norm_module = "model.decoder.final_layer_norm" + support_offload_to_disk = False + module_tree = [ "model", "decoder", diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index b40e0c607..e440b81eb 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -14,6 +14,8 @@ class Qwen3OmniMoeGPTQ(BaseQModel): pre_lm_head_norm_module = "thinker.model.norm" + support_offload_to_disk = False + module_tree = [ "thinker", "model", diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 5ced21725..e027353a5 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -180,6 +180,10 @@ def skip(*args, **kwargs): cls.before_model_load(cls, load_quantized_model=False) from ..utils.hf import build_shell_model + if not cls.support_offload_to_disk: + quantize_config.offload_to_disk = False + log.info(f"Current model class {cls.__name__} does not support offload_to_disk feature; quantize_config.offload_to_disk has been forcibly disabled.") + if quantize_config.offload_to_disk: print("shell model-----------") model = 
build_shell_model(cls.loader, config=config, **model_init_kwargs) From f3ff5c7316519e9497810aef85cdb0dab618d5c0 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 15:53:15 +0800 Subject: [PATCH 11/27] cleanup --- gptqmodel/looper/module_looper.py | 10 ++++++---- gptqmodel/models/base.py | 4 ++++ gptqmodel/models/definitions/base_qwen2_5_omni.py | 4 ++++ gptqmodel/models/definitions/qwen3_omni_moe.py | 4 ++++ gptqmodel/utils/data.py | 6 +++--- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 7442f5841..34ca7a1b0 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -108,8 +108,7 @@ def store_input_hook(module, args, kwargs): layer_inputs.append(layer_input) # Keyword arguments. - # TODO FIX ME..why is Qwen2_5OmniDecoderLayer harded here? - if kwargs.get("attention_mask") is not None and module.__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: + if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: attention_masks.append(kwargs["attention_mask"].to(device=data_device)) else: attention_masks.append(None) @@ -160,7 +159,7 @@ def store_input_hook(module, args, kwargs): for example in calibration_data: for k, v in example.items(): - if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: + if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: data_device = self.gptq_model.quantize_config.device else: data_device = self.gptq_model.quantize_config.device if k == "pixel_values" else cur_layer_device @@ -175,7 +174,10 @@ def store_input_hook(module, args, kwargs): v = v.unsqueeze(0) example[k] = move_to(v, device=data_device) try: - if layers[0].__class__.__name__ in ["Qwen2_5OmniDecoderLayer", "Qwen3OmniMoeThinkerTextDecoderLayer"]: + if example.get("attention_mask") is not None and example["attention_mask"].dtype != self.gptq_model.ATTENTION_MASKS_DTYPE: + example["attention_mask"] = example["attention_mask"].to(self.gptq_model.ATTENTION_MASKS_DTYPE) + + if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: self.gptq_model.model.generate(**example, return_audio=False) else: self.gptq_model.model(**example, use_cache=use_cache) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 8b47d3caa..87df5b9eb 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -151,6 +151,10 @@ class BaseQModel(nn.Module): support_offload_to_disk = True + ATTENTION_MASKS_DTYPE = torch.bool # default to bool + + ATTENTION_MASKS_REQUIRED_FOR_INPUT: bool = False + def __init__( self, model: PreTrainedModel, diff --git a/gptqmodel/models/definitions/base_qwen2_5_omni.py b/gptqmodel/models/definitions/base_qwen2_5_omni.py index 16347813c..c59f1f0c3 100644 --- a/gptqmodel/models/definitions/base_qwen2_5_omni.py +++ b/gptqmodel/models/definitions/base_qwen2_5_omni.py @@ -13,9 +13,13 @@ from ...utils.model import MODALITY from .._const import CPU from ..base import BaseQModel +import torch class BaseQwen2_5_OmniGPTQ(BaseQModel): + ATTENTION_MASKS_REQUIRED_FOR_INPUT = True + ATTENTION_MASKS_DTYPE = torch.long + loader = AutoModelForTextToWaveform pre_lm_head_norm_module = "thinker.model.norm" diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index e440b81eb..609c5dc68 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -6,8 +6,12 @@ from 
transformers import AutoModelForTextToWaveform from ..base import BaseQModel from .._const import CPU +import torch class Qwen3OmniMoeGPTQ(BaseQModel): + ATTENTION_MASKS_REQUIRED_FOR_INPUT = True + ATTENTION_MASKS_DTYPE = torch.long + loader = AutoModelForTextToWaveform dynamic_expert_index = "num_experts" diff --git a/gptqmodel/utils/data.py b/gptqmodel/utils/data.py index 2f275a983..7e30f0ae4 100644 --- a/gptqmodel/utils/data.py +++ b/gptqmodel/utils/data.py @@ -167,7 +167,7 @@ def collate_data(batch: List[Dict[str, List[List[int]]]], pad_token_id: int) -> for r in range(len(ids_list)): ids = torch.as_tensor(ids_list[r], dtype=torch.long) # make mask boolean immediately - msk = torch.as_tensor(msk_list[r], dtype=torch.long) + msk = torch.as_tensor(msk_list[r], dtype=torch.bool) if ids.numel() != msk.numel(): raise ValueError("Row has mismatched lengths between input_ids and attention_mask") @@ -193,11 +193,11 @@ def right_pad(row: torch.Tensor, pad_value, dtype=None) -> torch.Tensor: padded_ids = [right_pad(t, pad_token_id, dtype=torch.long) for t in rows_ids] # pad masks with False, not 0 - padded_msk = [right_pad(t, 0, dtype=torch.long) for t in rows_mask] + padded_msk = [right_pad(t, False, dtype=torch.bool) for t in rows_mask] # Stack into [total_rows_in_batch, max_len] input_ids = torch.stack(padded_ids, dim=0) if padded_ids else torch.empty((0, 0), dtype=torch.long) - attention_mask = torch.stack(padded_msk, dim=0) if padded_msk else torch.empty((0, 0), dtype=torch.long) + attention_mask = torch.stack(padded_msk, dim=0) if padded_msk else torch.empty((0, 0), dtype=torch.bool) return { "input_ids": input_ids, From 0765e286360bab9a37afdd1c187669ae92014884 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 16:02:38 +0800 Subject: [PATCH 12/27] cleanup --- gptqmodel/looper/module_looper.py | 4 ++-- gptqmodel/models/base.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 34ca7a1b0..c6b974331 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -174,8 +174,8 @@ def store_input_hook(module, args, kwargs): v = v.unsqueeze(0) example[k] = move_to(v, device=data_device) try: - if example.get("attention_mask") is not None and example["attention_mask"].dtype != self.gptq_model.ATTENTION_MASKS_DTYPE: - example["attention_mask"] = example["attention_mask"].to(self.gptq_model.ATTENTION_MASKS_DTYPE) + if self.gptq_model.ATTENTION_MASKS_DTYPE is torch.bool: + example["attention_mask"] = example["attention_mask"].long() if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: self.gptq_model.model.generate(**example, return_audio=False) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 87df5b9eb..092e542b4 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -155,6 +155,8 @@ class BaseQModel(nn.Module): ATTENTION_MASKS_REQUIRED_FOR_INPUT: bool = False + INPUT_EMBEDDING_EXTRA_ARGS = None + def __init__( self, model: PreTrainedModel, From 7fda54ab4b8186c9527d39fd2a6817bfcb560423 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 16:04:33 +0800 Subject: [PATCH 13/27] typo --- gptqmodel/looper/module_looper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index c6b974331..f94fc0e3c 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -174,7 +174,7 @@ def 
store_input_hook(module, args, kwargs): v = v.unsqueeze(0) example[k] = move_to(v, device=data_device) try: - if self.gptq_model.ATTENTION_MASKS_DTYPE is torch.bool: + if self.gptq_model.ATTENTION_MASKS_DTYPE is torch.long: example["attention_mask"] = example["attention_mask"].long() if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: From 0560d814c093754902a8dfb3880bd7ccdb4bcba0 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 16:08:18 +0800 Subject: [PATCH 14/27] check none --- gptqmodel/looper/module_looper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index f94fc0e3c..2f27f2db9 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -108,7 +108,7 @@ def store_input_hook(module, args, kwargs): layer_inputs.append(layer_input) # Keyword arguments. - if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: + if kwargs.get("attention_mask") is not None and self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: attention_masks.append(kwargs["attention_mask"].to(device=data_device)) else: attention_masks.append(None) From 1c9c91a3f0656b2a67a392c8ed76d2b4d1e004ea Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 16:31:02 +0800 Subject: [PATCH 15/27] cleanup --- gptqmodel/models/base.py | 2 - gptqmodel/models/definitions/opt.py | 2 - .../models/definitions/qwen3_omni_moe.py | 13 +++---- gptqmodel/models/loader.py | 39 +++++++------------ 4 files changed, 19 insertions(+), 37 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 092e542b4..922eb8cda 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -149,8 +149,6 @@ class BaseQModel(nn.Module): support_batch_quantize = True - support_offload_to_disk = True - ATTENTION_MASKS_DTYPE = torch.bool # default to bool ATTENTION_MASKS_REQUIRED_FOR_INPUT: bool = False diff --git a/gptqmodel/models/definitions/opt.py b/gptqmodel/models/definitions/opt.py index b829e223d..36d443a4b 100644 --- a/gptqmodel/models/definitions/opt.py +++ b/gptqmodel/models/definitions/opt.py @@ -13,8 +13,6 @@ class OptQModel(BaseQModel): pre_lm_head_norm_module = "model.decoder.final_layer_norm" - support_offload_to_disk = False - module_tree = [ "model", "decoder", diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index 609c5dc68..d9c9b135c 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -18,8 +18,6 @@ class Qwen3OmniMoeGPTQ(BaseQModel): pre_lm_head_norm_module = "thinker.model.norm" - support_offload_to_disk = False - module_tree = [ "thinker", "model", @@ -39,12 +37,11 @@ class Qwen3OmniMoeGPTQ(BaseQModel): ] def pre_quantize_generate_hook_start(self): - self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(self.quantize_config.device) - self.model.thinker.visual = self.model.thinker.visual.to(self.quantize_config.device) - self.model.thinker.audio_tower = self.model.thinker.audio_tower.to(self.quantize_config.device) - - self.model.thinker.visual.rotary_pos_emb = self.model.thinker.visual.rotary_pos_emb.to(self.quantize_config.device) - self.model.thinker.model.rotary_emb = self.model.thinker.model.rotary_emb.to(self.quantize_config.device) + self.shell_module_materialize(self.model.thinker.model.embed_tokens, self.quantize_config.device) + self.shell_module_materialize(self.model.thinker.visual, 
self.quantize_config.device) + self.shell_module_materialize(self.model.thinker.audio_tower, self.quantize_config.device) + self.shell_module_materialize(self.model.thinker.visual.rotary_pos_emb, self.quantize_config.device) + self.shell_module_materialize(self.model.thinker.model.rotary_emb, self.quantize_config.device) def pre_quantize_generate_hook_end(self): self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(CPU) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index e027353a5..d4139db64 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -180,30 +180,19 @@ def skip(*args, **kwargs): cls.before_model_load(cls, load_quantized_model=False) from ..utils.hf import build_shell_model - if not cls.support_offload_to_disk: - quantize_config.offload_to_disk = False - log.info(f"Current model class {cls.__name__} does not support offload_to_disk feature; quantize_config.offload_to_disk has been forcibly disabled.") - - if quantize_config.offload_to_disk: - print("shell model-----------") - model = build_shell_model(cls.loader, config=config, **model_init_kwargs) - model._model_init_kwargs = model_init_kwargs - print_module_tree(model=model) - - # enable mmap with low_cpu_mem_usage - turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) - - # TODO FIX ME...temp store model_init args - turtle_model._model_init_kwargs = model_init_kwargs - # print("actual turtle model-----------") - # print_module_tree(model=turtle_model) - else: - print("loading model directly to CPU (not using meta device or turtle_model)-----------") - model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs) - model._model_init_kwargs = model_init_kwargs - print_module_tree(model=model) - - turtle_model = None + print("shell model-----------") + model = build_shell_model(cls.loader, config=config, **model_init_kwargs) + model._model_init_kwargs = model_init_kwargs + print_module_tree(model=model) + + # enable mmap with low_cpu_mem_usage + turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) + + # TODO FIX ME...temp store model_init args + turtle_model._model_init_kwargs = model_init_kwargs + # print("actual turtle model-----------") + # print_module_tree(model=turtle_model) + model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions", "multimodal_max_length"] @@ -215,7 +204,7 @@ def skip(*args, **kwargs): model.seqlen = 4096 model.eval() - turtle_model.eval() if turtle_model is not None else None + turtle_model.eval() tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id_or_path, trust_remote_code=trust_remote_code) From 9ef86796831605a43eaad3b8155a278cebddd311 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 16:56:56 +0800 Subject: [PATCH 16/27] offload to disk --- .../models/definitions/qwen3_omni_moe.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index d9c9b135c..4b60f00de 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -6,6 +6,7 @@ from transformers import AutoModelForTextToWaveform from ..base import BaseQModel from .._const import CPU +from ...utils.offload import offload_to_disk import torch class 
Qwen3OmniMoeGPTQ(BaseQModel): @@ -44,9 +45,27 @@ def pre_quantize_generate_hook_start(self): self.shell_module_materialize(self.model.thinker.model.rotary_emb, self.quantize_config.device) def pre_quantize_generate_hook_end(self): - self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(CPU) - self.model.thinker.visual = self.model.thinker.visual.to(CPU) - self.model.thinker.audio_tower = self.model.thinker.audio_tower.to(CPU) + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.model.embed_tokens, + disk_path=self.quantize_config.offload_to_disk_path, + ) - self.model.thinker.visual.rotary_pos_emb = self.model.thinker.visual.rotary_pos_emb.to(CPU) - self.model.thinker.model.rotary_emb = self.model.thinker.model.rotary_emb.to(CPU) \ No newline at end of file + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.visual, + disk_path=self.quantize_config.offload_to_disk_path, + ) + + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.audio_tower, + disk_path=self.quantize_config.offload_to_disk_path, + ) + + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.visual.rotary_pos_emb, + disk_path=self.quantize_config.offload_to_disk_path, + ) + + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.model.rotary_emb, + disk_path=self.quantize_config.offload_to_disk_path, + ) From 2f5b277c00f95eded450041e99b934fcf0be89a8 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sat, 27 Sep 2025 17:02:27 +0800 Subject: [PATCH 17/27] cleanup --- gptqmodel/models/definitions/qwen3_omni_moe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index 4b60f00de..fe2fb39d6 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -50,17 +50,17 @@ def pre_quantize_generate_hook_end(self): disk_path=self.quantize_config.offload_to_disk_path, ) - offload_to_disk(model=self.model.thinker.model, + offload_to_disk(model=self.model.thinker, module=self.model.thinker.visual, disk_path=self.quantize_config.offload_to_disk_path, ) - offload_to_disk(model=self.model.thinker.model, + offload_to_disk(model=self.model.thinker, module=self.model.thinker.audio_tower, disk_path=self.quantize_config.offload_to_disk_path, ) - offload_to_disk(model=self.model.thinker.model, + offload_to_disk(model=self.model.thinker.visual, module=self.model.thinker.visual.rotary_pos_emb, disk_path=self.quantize_config.offload_to_disk_path, ) From 78f67586da98992349544ffa7b984251d145ab8f Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:03:38 +0800 Subject: [PATCH 18/27] update --- gptqmodel/looper/module_looper.py | 4 ++-- gptqmodel/models/definitions/base_qwen2_5_omni.py | 4 ++++ gptqmodel/models/definitions/qwen3_omni_moe.py | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 2f27f2db9..38dc5c53c 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -177,8 +177,8 @@ def store_input_hook(module, args, kwargs): if self.gptq_model.ATTENTION_MASKS_DTYPE is torch.long: example["attention_mask"] = example["attention_mask"].long() - if self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT: - self.gptq_model.model.generate(**example, return_audio=False) + if 
self.gptq_model.INPUT_EMBEDDING_EXTRA_ARGS: + self.gptq_model.model.generate(**example, **self.gptq_model.INPUT_EMBEDDING_EXTRA_ARGS) else: self.gptq_model.model(**example, use_cache=use_cache) except StopForward: diff --git a/gptqmodel/models/definitions/base_qwen2_5_omni.py b/gptqmodel/models/definitions/base_qwen2_5_omni.py index c59f1f0c3..4fca6b67f 100644 --- a/gptqmodel/models/definitions/base_qwen2_5_omni.py +++ b/gptqmodel/models/definitions/base_qwen2_5_omni.py @@ -20,6 +20,10 @@ class BaseQwen2_5_OmniGPTQ(BaseQModel): ATTENTION_MASKS_REQUIRED_FOR_INPUT = True ATTENTION_MASKS_DTYPE = torch.long + INPUT_EMBEDDING_EXTRA_ARGS = { + "return_audio": False, + } + loader = AutoModelForTextToWaveform pre_lm_head_norm_module = "thinker.model.norm" diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index fe2fb39d6..65ce7d62a 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -12,6 +12,10 @@ class Qwen3OmniMoeGPTQ(BaseQModel): ATTENTION_MASKS_REQUIRED_FOR_INPUT = True ATTENTION_MASKS_DTYPE = torch.long + + INPUT_EMBEDDING_EXTRA_ARGS = { + "return_audio": False, + } loader = AutoModelForTextToWaveform From 4ade4d0d6be5f14274a3b6a724d506875cbcfc47 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:22:14 +0800 Subject: [PATCH 19/27] mod filter_not_quantize_module --- gptqmodel/models/base.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 922eb8cda..7a82237b4 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -22,7 +22,7 @@ from ..nn_modules.qlinear import BaseQuantLinear from ..nn_modules.qlinear.torch import TorchQuantLinear from ..quantization import QuantizeConfig -from ..quantization.config import FORMAT, METHOD, QUANTIZE_BLACK_LIST +from ..quantization.config import FORMAT, METHOD, QUANTIZE_BLACK_LIST, dynamic_get from ..quantization.rotation.rotation import fuse_layer_norms, rotate_model from ..utils.backend import BACKEND from ..utils.data import collate_data @@ -56,14 +56,6 @@ def classproperty(func): return _ClassPropertyDescriptor(func) -def filter_not_quantize_module(layer_modules): - return [ - [name for name in block if NOT_QUANTIZE_FLAG not in name] - for block in layer_modules - if any(NOT_QUANTIZE_FLAG not in name for name in block) - ] - - def generate_node_for_awq_scaling(inp, prev_op, module_kwargs, nodes_size, subset, module2inspect): n = { "prev_op": prev_op, @@ -287,6 +279,21 @@ def get_num_experts(cls, model_config): num_experts = getattr(model_config, cls.dynamic_expert_index) return num_experts + @classmethod + def filter_not_quantize_module(cls,layer_modules): + layer_modules = [ + [name for name in block if NOT_QUANTIZE_FLAG not in name] + for block in layer_modules + if any(NOT_QUANTIZE_FLAG not in name for name in block) + ] + + if cls.quantize_config.dynamic: + for module in layer_modules: + if not dynamic_get(cls.quantize_config.dynamic, module_name=module): + layer_modules.remove(module) + + return layer_modules + # Inside each `LlamaDecoderLayer` layer are many internal modules # List them in the order executed in model forward() code # Many models have same execution order of: attention (q_k_v) projection, attention (output) projection, mlp (n) projections @@ -296,8 +303,8 @@ def simple_layer_modules(cls, model_config, is_awq_quantize: bool = False): layer_modules = 
cls.build_moe_modules_if_need(model_config, layer_modules, is_awq_quantize) - layer_modules = filter_not_quantize_module(layer_modules) - # print(f"simple_layer_modules layer_modules: {layer_modules}") + layer_modules = cls.filter_not_quantize_module(layer_modules) + print(f"simple_layer_modules layer_modules: {layer_modules}") return layer_modules @classmethod From 59cf8c28da952a413c700cd41ba5f804dc7b4562 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:28:32 +0800 Subject: [PATCH 20/27] if offload_to_disk=False, we should not load the model into meta first. --- gptqmodel/models/loader.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index d4139db64..5ced21725 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -180,19 +180,26 @@ def skip(*args, **kwargs): cls.before_model_load(cls, load_quantized_model=False) from ..utils.hf import build_shell_model - print("shell model-----------") - model = build_shell_model(cls.loader, config=config, **model_init_kwargs) - model._model_init_kwargs = model_init_kwargs - print_module_tree(model=model) - - # enable mmap with low_cpu_mem_usage - turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) - - # TODO FIX ME...temp store model_init args - turtle_model._model_init_kwargs = model_init_kwargs - # print("actual turtle model-----------") - # print_module_tree(model=turtle_model) - + if quantize_config.offload_to_disk: + print("shell model-----------") + model = build_shell_model(cls.loader, config=config, **model_init_kwargs) + model._model_init_kwargs = model_init_kwargs + print_module_tree(model=model) + + # enable mmap with low_cpu_mem_usage + turtle_model = cls.loader.from_pretrained(model_local_path, config=config, low_cpu_mem_usage=True, **model_init_kwargs) + + # TODO FIX ME...temp store model_init args + turtle_model._model_init_kwargs = model_init_kwargs + # print("actual turtle model-----------") + # print_module_tree(model=turtle_model) + else: + print("loading model directly to CPU (not using meta device or turtle_model)-----------") + model = cls.loader.from_pretrained(model_local_path, config=config, **model_init_kwargs) + model._model_init_kwargs = model_init_kwargs + print_module_tree(model=model) + + turtle_model = None model_config = model.config.to_dict() seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions", "multimodal_max_length"] @@ -204,7 +211,7 @@ def skip(*args, **kwargs): model.seqlen = 4096 model.eval() - turtle_model.eval() + turtle_model.eval() if turtle_model is not None else None tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id_or_path, trust_remote_code=trust_remote_code) From 41fb9f4c8ea4813c5ab6738e38e7c91caf101fcd Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:32:18 +0800 Subject: [PATCH 21/27] mod base.py --- gptqmodel/models/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 7a82237b4..98ce8e080 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -1062,6 +1062,9 @@ def shell_module_materialize( non_blocking: bool = False, ) -> torch.nn.Module: if self.turtle_model is None: + if target_submodule.device != device: + target_submodule.to(device) + return target_submodule module = alias_from_turtle_for_submodule( From 
e3fc3ca24a241f7ef49309f9ea9c262f0561b6a9 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:35:23 +0800 Subject: [PATCH 22/27] mod pre_quantize_generate_hook_end --- .../models/definitions/qwen3_omni_moe.py | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/gptqmodel/models/definitions/qwen3_omni_moe.py b/gptqmodel/models/definitions/qwen3_omni_moe.py index 65ce7d62a..4487136cc 100644 --- a/gptqmodel/models/definitions/qwen3_omni_moe.py +++ b/gptqmodel/models/definitions/qwen3_omni_moe.py @@ -49,27 +49,36 @@ def pre_quantize_generate_hook_start(self): self.shell_module_materialize(self.model.thinker.model.rotary_emb, self.quantize_config.device) def pre_quantize_generate_hook_end(self): - offload_to_disk(model=self.model.thinker.model, - module=self.model.thinker.model.embed_tokens, - disk_path=self.quantize_config.offload_to_disk_path, - ) + if self.quantize_config.offload_to_disk: + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.model.embed_tokens, + disk_path=self.quantize_config.offload_to_disk_path, + ) - offload_to_disk(model=self.model.thinker, - module=self.model.thinker.visual, - disk_path=self.quantize_config.offload_to_disk_path, - ) - - offload_to_disk(model=self.model.thinker, - module=self.model.thinker.audio_tower, - disk_path=self.quantize_config.offload_to_disk_path, - ) + offload_to_disk(model=self.model.thinker, + module=self.model.thinker.visual, + disk_path=self.quantize_config.offload_to_disk_path, + ) + + offload_to_disk(model=self.model.thinker, + module=self.model.thinker.audio_tower, + disk_path=self.quantize_config.offload_to_disk_path, + ) - offload_to_disk(model=self.model.thinker.visual, - module=self.model.thinker.visual.rotary_pos_emb, - disk_path=self.quantize_config.offload_to_disk_path, - ) - - offload_to_disk(model=self.model.thinker.model, - module=self.model.thinker.model.rotary_emb, - disk_path=self.quantize_config.offload_to_disk_path, - ) + offload_to_disk(model=self.model.thinker.visual, + module=self.model.thinker.visual.rotary_pos_emb, + disk_path=self.quantize_config.offload_to_disk_path, + ) + + offload_to_disk(model=self.model.thinker.model, + module=self.model.thinker.model.rotary_emb, + disk_path=self.quantize_config.offload_to_disk_path, + ) + return + + self.model.thinker.model.embed_tokens = self.model.thinker.model.embed_tokens.to(CPU) + self.model.thinker.visual = self.model.thinker.visual.to(CPU) + self.model.thinker.audio_tower = self.model.thinker.audio_tower.to(CPU) + + self.model.thinker.visual.rotary_pos_emb = self.model.thinker.visual.rotary_pos_emb.to(CPU) + self.model.thinker.model.rotary_emb = self.model.thinker.model.rotary_emb.to(CPU) From 58e6d791fbd83decf61ca1a8b309d11f01cf9db9 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:43:35 +0800 Subject: [PATCH 23/27] fix has no attr quantize_config --- gptqmodel/looper/module_looper.py | 2 +- gptqmodel/models/base.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 38dc5c53c..05a2ca9d2 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -242,7 +242,7 @@ def loop(self, fail_safe: bool = False, **kwargs): for processor in self.processors: processor.release_calibration_dataset() - layer_modules = self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config) + layer_modules = 
self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config, quantize_config=self.gptq_model.quantize_config) if not self.gptq_model.quantize_config.true_sequential: layer_modules = [sum(layer_modules, [])] diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 98ce8e080..a92cfde6a 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -280,16 +280,16 @@ def get_num_experts(cls, model_config): return num_experts @classmethod - def filter_not_quantize_module(cls,layer_modules): + def filter_not_quantize_module(cls,layer_modules, quantize_config): layer_modules = [ [name for name in block if NOT_QUANTIZE_FLAG not in name] for block in layer_modules if any(NOT_QUANTIZE_FLAG not in name for name in block) ] - if cls.quantize_config.dynamic: + if quantize_config.dynamic: for module in layer_modules: - if not dynamic_get(cls.quantize_config.dynamic, module_name=module): + if not dynamic_get(quantize_config.dynamic, module_name=module): layer_modules.remove(module) return layer_modules @@ -298,12 +298,12 @@ def filter_not_quantize_module(cls,layer_modules): # List them in the order executed in model forward() code # Many models have same execution order of: attention (q_k_v) projection, attention (output) projection, mlp (n) projections @classmethod - def simple_layer_modules(cls, model_config, is_awq_quantize: bool = False): + def simple_layer_modules(cls, model_config, quantize_config, is_awq_quantize: bool = False): layer_modules = cls.build_layer_modules(cls.module_tree) layer_modules = cls.build_moe_modules_if_need(model_config, layer_modules, is_awq_quantize) - layer_modules = cls.filter_not_quantize_module(layer_modules) + layer_modules = cls.filter_not_quantize_module(layer_modules, quantize_config) print(f"simple_layer_modules layer_modules: {layer_modules}") return layer_modules From acb5f8031f483e8df0b2c95dd7ddb9d5b22e4dee Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 09:56:32 +0800 Subject: [PATCH 24/27] fix filter --- gptqmodel/models/base.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index a92cfde6a..d9f08e91d 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -288,9 +288,10 @@ def filter_not_quantize_module(cls,layer_modules, quantize_config): ] if quantize_config.dynamic: - for module in layer_modules: - if not dynamic_get(quantize_config.dynamic, module_name=module): - layer_modules.remove(module) + for modules in layer_modules: + for module in modules: + if dynamic_get(quantize_config.dynamic, module_name=module) == False: + modules.remove(module) return layer_modules @@ -1062,7 +1063,7 @@ def shell_module_materialize( non_blocking: bool = False, ) -> torch.nn.Module: if self.turtle_model is None: - if target_submodule.device != device: + if get_device(target_submodule) != device: target_submodule.to(device) return target_submodule From 5be44b7708bc3b09eaf69e14a809cee4a4ee5d61 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 10:08:50 +0800 Subject: [PATCH 25/27] check none --- gptqmodel/utils/structure.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gptqmodel/utils/structure.py b/gptqmodel/utils/structure.py index f187ee096..44b57f1fa 100644 --- a/gptqmodel/utils/structure.py +++ b/gptqmodel/utils/structure.py @@ -608,6 +608,9 @@ def alias_all_from_turtle_if_meta( Logs each swap via log.info(). 
""" + if turtle_model is None: + return 0 + turtle_map = dict(turtle_model.named_modules()) swapped = 0 From c20d9d20e369310e6c99f0b8a58a94a9b39b5103 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 11:02:48 +0800 Subject: [PATCH 26/27] need config --- gptqmodel/models/loader.py | 2 +- gptqmodel/models/writer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py index 5ced21725..2576583bb 100644 --- a/gptqmodel/models/loader.py +++ b/gptqmodel/models/loader.py @@ -469,7 +469,7 @@ def skip(*args, **kwargs): continue if not any(name.startswith(prefix) for prefix in cls.extract_layers_node()) or any(name.startswith(ignore_module) for ignore_module in ignore_modules) or all( - not name.endswith(ignore_module) for sublist in cls.simple_layer_modules(config) for ignore_module in sublist + not name.endswith(ignore_module) for sublist in cls.simple_layer_modules(config, qcfg) for ignore_module in sublist ): # log non-lm-head quantized modules only if name is not cls.lm_head: diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py index 9aae554f9..eccdbb264 100644 --- a/gptqmodel/models/writer.py +++ b/gptqmodel/models/writer.py @@ -443,7 +443,7 @@ def skip(*args, **kwargs): continue if any(name.startswith(ignore_module) for ignore_module in ignore_modules) or all( - not name.endswith(ignore_module) for sublist in self.simple_layer_modules(config) for ignore_module in sublist + not name.endswith(ignore_module) for sublist in self.simple_layer_modules(config, qcfg) for ignore_module in sublist ): # log non-lm-head quantizerd modules only if name is not self.lm_head: From 65eb981148be8b5a321627a02d7075f23cc47f21 Mon Sep 17 00:00:00 2001 From: LRL2-ModelCloud Date: Sun, 28 Sep 2025 14:59:46 +0800 Subject: [PATCH 27/27] fix filter_not_quantize_module --- gptqmodel/models/base.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index d9f08e91d..654feb303 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -280,19 +280,24 @@ def get_num_experts(cls, model_config): return num_experts @classmethod - def filter_not_quantize_module(cls,layer_modules, quantize_config): + def filter_not_quantize_module(cls, layer_modules, quantize_config): layer_modules = [ [name for name in block if NOT_QUANTIZE_FLAG not in name] for block in layer_modules - if any(NOT_QUANTIZE_FLAG not in name for name in block) ] + layer_modules = [block for block in layer_modules if block] # 去掉空 block - if quantize_config.dynamic: + if getattr(quantize_config, "dynamic", None): + new_layer_modules = [] for modules in layer_modules: - for module in modules: - if dynamic_get(quantize_config.dynamic, module_name=module) == False: - modules.remove(module) - + filtered = [ + m for m in modules + if dynamic_get(quantize_config.dynamic, module_name=m) is not False + ] + if filtered: + new_layer_modules.append(filtered) + layer_modules = new_layer_modules + return layer_modules # Inside each `LlamaDecoderLayer` layer are many internal modules @@ -305,6 +310,7 @@ def simple_layer_modules(cls, model_config, quantize_config, is_awq_quantize: bo layer_modules = cls.build_moe_modules_if_need(model_config, layer_modules, is_awq_quantize) layer_modules = cls.filter_not_quantize_module(layer_modules, quantize_config) + print(f"simple_layer_modules layer_modules: {layer_modules}") return layer_modules