From 0fd26e3498fb00f590b33d1792e4ea398858f27d Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 16 Oct 2025 16:04:32 +0800 Subject: [PATCH 1/4] mlp.gate cannot be skipped Signed-off-by: ZX-ModelCloud --- gptqmodel/models/definitions/qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/definitions/qwen3_moe.py b/gptqmodel/models/definitions/qwen3_moe.py index 31e788838..0f0dd9919 100644 --- a/gptqmodel/models/definitions/qwen3_moe.py +++ b/gptqmodel/models/definitions/qwen3_moe.py @@ -26,7 +26,7 @@ class Qwen3MoeQModel(BaseQModel): "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"), "post_attention_layernorm": ("post_attention_layernorm:!",), "mlp": { - "gate": ("gate:!",), # <-- 0.5MB per layer. Not worth quantizing + "gate": ("gate",), "experts": { "#": ("gate_proj:0", "up_proj:0", "down_proj:1"), }, From 8dd881d3896aa3007108869e474496962a451109 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 16 Oct 2025 17:31:34 +0800 Subject: [PATCH 2/4] Revert "mlp.gate cannot be skipped" This reverts commit 0fd26e3498fb00f590b33d1792e4ea398858f27d. --- gptqmodel/models/definitions/qwen3_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/definitions/qwen3_moe.py b/gptqmodel/models/definitions/qwen3_moe.py index 0f0dd9919..31e788838 100644 --- a/gptqmodel/models/definitions/qwen3_moe.py +++ b/gptqmodel/models/definitions/qwen3_moe.py @@ -26,7 +26,7 @@ class Qwen3MoeQModel(BaseQModel): "self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"), "post_attention_layernorm": ("post_attention_layernorm:!",), "mlp": { - "gate": ("gate",), + "gate": ("gate:!",), # <-- 0.5MB per layer. 
Not worth quantizing "experts": { "#": ("gate_proj:0", "up_proj:0", "down_proj:1"), }, From f19d863ba314c0dd97c29cef49dfbdf8906afa2d Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 16 Oct 2025 19:22:37 +0800 Subject: [PATCH 3/4] add "module_tree_overrides" field Signed-off-by: ZX-ModelCloud --- gptqmodel/models/base.py | 30 ++++++++++++++++++++++- gptqmodel/models/definitions/qwen3_moe.py | 11 +++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index bfffa308c..726bbc4a3 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -110,6 +110,25 @@ def check_support_param_buffer_assignment(*args, **kwargs): return False +def apply_module_tree_override(module_tree, override): + """ + Recursively merge ``override`` into ``module_tree``: matching dict/list values are merged in place, while all other matching keys are replaced. + """ + if isinstance(module_tree, dict) and isinstance(override, dict): + for k, v in override.items(): + if k in module_tree and isinstance(module_tree[k], (dict, list)) and isinstance(v, (dict, list)): + module_tree[k] = apply_module_tree_override(module_tree[k], v) + else: + module_tree[k] = v + elif isinstance(module_tree, list) and isinstance(override, list): + for o in override: + if isinstance(o, dict): + for b in module_tree: + if isinstance(b, dict): + apply_module_tree_override(b, o) + return module_tree + + NOT_QUANTIZE_FLAG = ":!" 
@@ -125,6 +144,8 @@ class BaseQModel(nn.Module): # a tree node of all the roots that contain quantizable modules module_tree: List[str] = None + # Override module_tree according to different QUANT_METHOD + module_tree_overrides: dict[METHOD, List[str]] = None # Strict=True -> all layer_modules must exists in model # Some models (deepseek2-lite) dynamically create lora modules based on config.rank @@ -198,6 +219,13 @@ def __init__( ): super().__init__() + quant_method = quantize_config.quant_method + # override module_tree if need + if self.module_tree_overrides.get(quant_method) is not None: + log.info(f'Module Tree: overridden by METHOD.{quant_method.upper()}') + # setting cls.module_tree + type(self).module_tree = apply_module_tree_override(self.module_tree, self.module_tree_overrides[quant_method]) + # record configuration early so model lifecycle hooks can rely on them self.compiled = False # set to True while compile() is triggered successfully self.quantized = quantized @@ -794,7 +822,7 @@ def quantize( ) if not self.support_batch_quantize: - log.warn("Quantize: batch_size overriden by model class definition to `disabled`") + log.warn("Quantize: batch_size overridden by model class definition to `disabled`") batch_size = 1 # but actually disabled if self.quantize_config.format == FORMAT.MARLIN: diff --git a/gptqmodel/models/definitions/qwen3_moe.py b/gptqmodel/models/definitions/qwen3_moe.py index 31e788838..e0f56602a 100644 --- a/gptqmodel/models/definitions/qwen3_moe.py +++ b/gptqmodel/models/definitions/qwen3_moe.py @@ -4,6 +4,7 @@ # Contact: qubitium@modelcloud.ai, x.com/qubitium from ..base import BaseQModel +from ...quantization import METHOD class Qwen3MoeQModel(BaseQModel): @@ -33,3 +34,13 @@ class Qwen3MoeQModel(BaseQModel): }, } ] + + module_tree_overrides = { + METHOD.AWQ: [ + { + "mlp": { + "gate": ("gate",), + } + } + ] + } From bb2bf937137957e81209a36ca29d84237a9c3d6d Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 16 Oct 2025 19:40:15 
+0800 Subject: [PATCH 4/4] check self.module_tree_overrides is not None Signed-off-by: ZX-ModelCloud --- gptqmodel/models/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 726bbc4a3..358e255f7 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -221,7 +221,7 @@ def __init__( quant_method = quantize_config.quant_method # override module_tree if need - if self.module_tree_overrides.get(quant_method) is not None: + if self.module_tree_overrides is not None and self.module_tree_overrides.get(quant_method) is not None: log.info(f'Module Tree: overridden by METHOD.{quant_method.upper()}') # setting cls.module_tree type(self).module_tree = apply_module_tree_override(self.module_tree, self.module_tree_overrides[quant_method])