16 changes: 8 additions & 8 deletions gptqmodel/looper/awq_processor.py
@@ -204,13 +204,13 @@ def _layer_input_features(self, state: _AWQLayerState) -> Dict[str, torch.Tensor
# tuple(features[name].shape),
# )

for root, tensors in root_buckets.items():
if not tensors or root in features:
continue
try:
features[root] = torch.cat(tensors, dim=0)
except RuntimeError:
features[root] = tensors[0]
# for root, tensors in root_buckets.items():
# if not tensors or root in features:
# continue
# try:
# features[root] = torch.cat(tensors, dim=0)
# except RuntimeError:
# features[root] = tensors[0]
return features

def _refresh_forward_kwargs_from_cache(self) -> None:
@@ -1233,7 +1233,7 @@ def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tenso
def hook(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
if not inp:
return
feature = inp[0]
feature = inp
if isinstance(feature, (tuple, list)) and feature:
feature = feature[0]
self._record_input_feature(name, feature)
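Review note on the hook change above: PyTorch forward hooks receive the module's positional inputs as a tuple, so the old `feature = inp[0]` already grabbed the first tensor; taking `inp` whole and then unwrapping one tuple/list level makes the capture tolerant of modules whose first positional argument is itself a nested sequence. A minimal, self-contained sketch of the same normalization pattern (the helper and variable names here are illustrative, not the project's API):

```python
import torch
from torch import nn

def _first_tensor(feature):
    # Hook inputs arrive as a tuple of positional args; unwrap one nesting level.
    if isinstance(feature, (tuple, list)) and feature:
        feature = feature[0]
    return feature

captured = []

def capture_hook(module, inp, out):
    if not inp:
        return
    captured.append(_first_tensor(inp).detach())

layer = nn.Linear(4, 4)
handle = layer.register_forward_hook(capture_hook)
layer(torch.randn(2, 4))
handle.remove()
assert captured[0].shape == (2, 4)
```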
7 changes: 6 additions & 1 deletion gptqmodel/looper/module_looper.py
@@ -33,6 +33,7 @@
from ..looper.named_module import NamedModule
from ..models import BaseQModel
from ..models._const import SUPPORTS_MODULE_TYPES
from ..models.base import CAPTURE_ONLY_FLAG
from ..nn_modules.hooked_linear import (STOP_FORWARD_EXCEPTION, HookedLinear,
StopForward, replace_module_with_hooked_legacy)
from ..quantization.config import VRAMStrategy
@@ -1251,11 +1252,15 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs):

return total_log

def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]:
def crate_named_modules(self, module, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]:
subset = {}
for n in names:
if n in full:
subset[n] = full[n]
elif n.endswith(CAPTURE_ONLY_FLAG):
# Obtain the CAPTURE_ONLY_FLAG Module separately
n = n.split(CAPTURE_ONLY_FLAG, 1)[0]
subset[n], _ = get_module_by_name_prefix(module, module_name=n)
# some modules have layer_modules that are dynamic based on config
# ref: deepseek v2/v3/r1
elif self.gptq_model.layer_modules_strict:
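Review note on the `crate_named_modules` change: a name carrying the capture-only suffix never appears in `full` (which only lists quantizable leaf modules), so the subset builder now strips the flag and resolves the parent module straight from the decoder layer passed in as `module`. A self-contained sketch of that strip-and-resolve step, using a plain attribute walk instead of the project's `get_module_by_name_prefix` helper; the literal `":?"` value for `CAPTURE_ONLY_FLAG` is an assumption inferred from the `"mlp:?"` entries in the tests:

```python
from torch import nn

CAPTURE_ONLY_FLAG = ":?"  # assumption: the suffix seen on "mlp:?" module-tree entries

def resolve_capture_only(layer: nn.Module, flagged_name: str):
    # "mlp:?" -> ("mlp", layer.mlp); dotted names walk the children one segment at a time.
    name = flagged_name.split(CAPTURE_ONLY_FLAG, 1)[0]
    target = layer
    for part in name.split("."):
        target = getattr(target, part)
    return name, target

class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(8, 8), nn.GELU())

name, module = resolve_capture_only(ToyLayer(), "mlp" + CAPTURE_ONLY_FLAG)
assert name == "mlp" and isinstance(module, nn.Sequential)
```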
18 changes: 13 additions & 5 deletions gptqmodel/looper/named_module.py
@@ -54,12 +54,14 @@ def __init__(self, module: torch.nn.Module, name: str, full_name:str, layer_inde
in_features = module.weight.shape[0]
out_features = module.weight.shape[1]
else:
raise NotImplementedError(f"Unsupported module.module type: `{type(module)}`")
in_features = None
out_features = None

self.state.update({
"in_features": in_features,
"out_features": out_features,
})
if in_features and out_features:
self.state.update({
"in_features": in_features,
"out_features": out_features,
})

def parameters(self, recurse: bool = True):
return self.module.parameters(recurse=recurse)
@@ -73,6 +75,12 @@ def buffers(self, recurse: bool = True):
def named_buffers(self, prefix: str = "", recurse: bool = True):
return self.module.named_buffers(prefix=prefix, recurse=recurse)

def register_forward_hook(
self, *args, **kwargs
):
with self._parent_lock:
return self.module.register_forward_hook(*args, **kwargs)

def register_buffer(
self, name: str, tensor: Optional[Tensor], persistent: bool = True
) -> None:
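Review note on the `NamedModule` changes: wrapping a module whose weight does not expose a 2D shape no longer raises; its in/out feature sizes are simply left out of `state`. In addition, `register_forward_hook` is now forwarded to the wrapped module under the parent lock, so hooks attach to the real forward and registration is serialized across threads. A minimal sketch of that delegation pattern on a generic wrapper (the class and lock names here are illustrative, not the project's implementation):

```python
import threading
from torch import nn

class ModuleProxy(nn.Module):
    """Toy stand-in for a NamedModule-style wrapper."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module
        self._parent_lock = threading.Lock()  # assumption: a plain lock guards hook registration

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def register_forward_hook(self, *args, **kwargs):
        # Register on the wrapped module so the hook fires on the real forward,
        # serialized against concurrent registration from other threads.
        with self._parent_lock:
            return self.module.register_forward_hook(*args, **kwargs)
```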
1 change: 1 addition & 0 deletions gptqmodel/looper/stage_subset.py
@@ -79,6 +79,7 @@ def run_subset_stage(
is_awq_processor = processor_name_lower.startswith("awq")

subset = looper.crate_named_modules(
module=module,
full=full,
is_lm_head_module=is_lm_head_module,
layer_index=layer_index,
176 changes: 140 additions & 36 deletions gptqmodel/models/base.py

Large diffs are not rendered by default.

22 changes: 18 additions & 4 deletions gptqmodel/models/definitions/qwen2_moe.py
@@ -21,12 +21,26 @@ class Qwen2MoeQModel(BaseQModel):
"input_layernorm": ("input_layernorm:!",),
"self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
"post_attention_layernorm": ("post_attention_layernorm:!",),
"mlp": {
"gate": ("gate:0",),
"shared_expert": ("gate_proj:0", "up_proj:0", "down_proj:1"),
"experts": {
"mlp:?": {
"gate": ("gate:!",),
"shared_expert:0": ("gate_proj:0", "up_proj:0", "down_proj:1"),
"experts:0": {
"#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
},
},
}
]

# module_tree_overrides = {
# METHOD.AWQ: [
# {
# "mlp:?": {
# "gate": ("gate:!",),
# "shared_expert": None,
# "experts": {
# "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
# },
# },
# }
# ]
# }
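For reviewers who want to see what the new `mlp:?`, `:0`, and `gate:!` annotations expand to, the flattened per-layer blocks can be printed with the same call the new test exercises; a quick local probe might look like this (output is whatever `build_layer_modules` returns for the current tree, shown only for inspection):

```python
from gptqmodel.models.definitions.qwen2_moe import Qwen2MoeQModel

blocks = Qwen2MoeQModel.build_layer_modules(Qwen2MoeQModel.module_tree)
for idx, block in enumerate(blocks):
    print(idx, block)

# Per tests/test_subset_parsing.py below, the shared_expert projections are expected to
# land in the same block as the templated expert projections, e.g.
# "mlp.shared_expert.gate_proj" alongside "mlp.experts.{expert_index}.gate_proj".
```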
4 changes: 2 additions & 2 deletions gptqmodel/quantization/gptq.py
@@ -571,8 +571,8 @@ def materialize_global_hessian(self, target_device: Optional[torch.device] = Non
for partial_device, partial in self._device_hessian_partials.items():
if partial.device != result_accum.device or partial.dtype != torch.float32:
# TODO FIXME multi-3090 using P2P is revaling an issue where result_accum and/or partial is not ready for consolidation on the main thread
# when parials are calculated on the individual
try:
# when parials are calculated on the individual
try:
result_accum.add_(partial.to(device=result_accum.device, dtype=torch.float32))
except:
log.warn(f"Quantization: Module `{self.name}` -> Retry partial.to in 0.25s")
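Context for the reordered comment and `try` above: `materialize_global_hessian` folds per-device float32 Hessian partials into a single accumulator and retries the transfer when a partial is not yet ready on the target device. A rough sketch of that accumulate-with-retry shape (function name, retry count, and backoff are illustrative, not the project's implementation):

```python
import time
import torch

def accumulate_partials(result_accum: torch.Tensor, partials: dict, retries: int = 3) -> torch.Tensor:
    # Fold per-device partial Hessians into one float32 accumulator on its device.
    for _device, partial in partials.items():
        for _attempt in range(retries):
            try:
                result_accum.add_(partial.to(device=result_accum.device, dtype=torch.float32))
                break
            except RuntimeError:
                # Device-to-device copies can race; back off briefly and retry.
                time.sleep(0.25)
    return result_accum
```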
2 changes: 1 addition & 1 deletion tests/models/model_test.py
@@ -58,14 +58,14 @@ def is_flash_attn_2_available(): # type: ignore
return False

from gptqmodel import BACKEND, DEBUG_ON, GPTQModel # noqa: E402
from gptqmodel.looper.module_looper import StopMainLoop # noqa: E402
from gptqmodel.models.base import BaseQModel # noqa: E402
from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402
from gptqmodel.quantization import FORMAT, METHOD # noqa: E402
from gptqmodel.quantization.config import QuantizeConfig, VRAMStrategy # noqa: E402
from gptqmodel.utils.eval import EVAL # noqa: E402
from gptqmodel.utils.model import MODALITY # noqa: E402
from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
from gptqmodel.looper.module_looper import StopMainLoop # noqa: E402


RAND_SEED = 898
4 changes: 1 addition & 3 deletions tests/models/test_glm4_moe._awq.py
@@ -4,10 +4,8 @@
# Contact: qubitium@modelcloud.ai, x.com/qubitium
from model_test import ModelTest

from gptqmodel.quantization.config import VRAMStrategy
from gptqmodel.utils.eval import EVAL
from gptqmodel.quantization import FORMAT, METHOD

from gptqmodel.utils.eval import EVAL


# | Metric | MARLIN |
2 changes: 0 additions & 2 deletions tests/models/test_qwen2_moe_quant.py
@@ -17,8 +17,6 @@ class TestQwen2_5_Moe(ModelTest):
"acc_norm": {"value": 0.3055, "floor_pct": 0.2},
},
}
TRUST_REMOTE_CODE = False
EVAL_BATCH_SIZE = 6

def test_qwen2_5(self):
self.quant_lm_eval()
5 changes: 2 additions & 3 deletions tests/test_awq.py
@@ -47,15 +47,14 @@ def setUpClass(cls):
if requested_samples is not None:
sample_count = max(1, int(requested_samples))
else:
total_mem_gb = 0
if torch.cuda.is_available():
try:
total_mem_gb = (
(
torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
/ (1024 ** 3)
)
except Exception:
total_mem_gb = 0
pass

# if total_mem_gb >= 80:
# sample_count = 1024
5 changes: 3 additions & 2 deletions tests/test_awq_weight_mean.py
@@ -1,12 +1,13 @@
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7" #"expandable_segments:True"

import time
import torch
import pytest

import pytest
import torch
from parameterized import parameterized
from pytest import MonkeyPatch
from torch import nn
47 changes: 35 additions & 12 deletions tests/test_subset_parsing.py
@@ -3,23 +3,32 @@
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium
import os
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Callable, Dict, List, Optional

import torch
from transformers import Qwen3MoeConfig, Qwen3MoeForCausalLM
from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock


repo_root = Path(__file__).resolve().parents[1]
repo_str = str(repo_root)
if repo_str not in sys.path:
sys.path.insert(0, repo_str)

from gptqmodel.looper.awq_processor import AWQProcessor
from gptqmodel.looper.loop_processor import LoopProcessor
from gptqmodel.looper.module_looper import ModuleLooper
from gptqmodel.looper.stage_subset import run_subset_stage
from gptqmodel.looper.named_module import NamedModule
from gptqmodel.looper.stage_subset import run_subset_stage
from gptqmodel.models.definitions.qwen2_moe import Qwen2MoeQModel
from gptqmodel.models.definitions.qwen3_moe import Qwen3MoeQModel
from gptqmodel.nn_modules.hooked_linear import replace_module_with_hooked_legacy
from gptqmodel.quantization import FORMAT, METHOD
from gptqmodel.quantization.config import QuantizeConfig, VRAMStrategy
from gptqmodel.nn_modules.hooked_linear import replace_module_with_hooked_legacy
from gptqmodel.utils.model import find_modules, get_module_by_name_prefix
from gptqmodel.models.definitions.qwen3_moe import Qwen3MoeQModel


# honour the request to bind the test harness to GPU index 5 when CUDA is available
@@ -71,7 +80,7 @@ def test_mlp_capture_flag_propagates_to_layer_modules():
include_capture_only=True,
)
capture_blocks = [block for block in full if any(":?" in name for name in block)]
assert capture_blocks and capture_blocks[0] == ["mlp:?"]
assert capture_blocks and "mlp:?" in capture_blocks[0]

simple = Qwen3MoeQModel.simple_layer_modules(
model_config=model_config,
@@ -85,6 +94,20 @@
assert isinstance(mlp_module, Qwen3MoeSparseMoeBlock)


def test_qwen2_moe_shared_expert_merges_with_experts():
blocks = Qwen2MoeQModel.build_layer_modules(Qwen2MoeQModel.module_tree)

gate_block = next(block for block in blocks if "mlp.shared_expert.gate_proj" in block)
assert "mlp.experts.{expert_index}.gate_proj" in gate_block
assert "mlp.experts.{expert_index}.up_proj" in gate_block

down_block = next(block for block in blocks if "mlp.shared_expert.down_proj" in block)
assert "mlp.experts.{expert_index}.down_proj" in down_block

expert_gate_blocks = [block for block in blocks if "mlp.experts.{expert_index}.gate_proj" in block]
assert len(expert_gate_blocks) == 1


def test_awq_processor_enables_subset_early_stop():
calibration = [{"input_ids": torch.tensor([1, 2, 3])}]
qcfg = _make_quant_config()
@@ -165,14 +188,14 @@
processor: str,
):
self.events.append(
dict(
stage=stage,
layer_idx=layer_idx,
subset_index=subset_index,
subset_total=subset_total,
module_names=module_names,
processor=processor,
)
{
"stage": stage,
"layer_idx": layer_idx,
"subset_index": subset_index,
"subset_total": subset_total,
"module_names": module_names,
"processor": processor,
}
)

