16 changes: 8 additions & 8 deletions gptqmodel/looper/awq_processor.py
@@ -204,13 +204,13 @@ def _layer_input_features(self, state: _AWQLayerState) -> Dict[str, torch.Tensor
# tuple(features[name].shape),
# )

for root, tensors in root_buckets.items():
if not tensors or root in features:
continue
try:
features[root] = torch.cat(tensors, dim=0)
except RuntimeError:
features[root] = tensors[0]
# for root, tensors in root_buckets.items():
# if not tensors or root in features:
# continue
# try:
# features[root] = torch.cat(tensors, dim=0)
# except RuntimeError:
# features[root] = tensors[0]
return features

def _refresh_forward_kwargs_from_cache(self) -> None:
@@ -1233,7 +1233,7 @@ def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tenso
def hook(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
if not inp:
return
feature = inp[0]
feature = inp
if isinstance(feature, (tuple, list)) and feature:
feature = feature[0]
self._record_input_feature(name, feature)
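Review note on the hook change above: PyTorch forward hooks receive the module's positional inputs as a tuple, so the old `feature = inp[0]` already grabbed the first tensor; taking `inp` whole and then unwrapping one tuple/list level makes the capture tolerant of modules whose first positional argument is itself a nested sequence. A minimal, self-contained sketch of the same normalization pattern (the helper and variable names here are illustrative, not the project's API):

```python
import torch
from torch import nn

def _first_tensor(feature):
    # Hook inputs arrive as a tuple of positional args; unwrap one nesting level.
    if isinstance(feature, (tuple, list)) and feature:
        feature = feature[0]
    return feature

captured = []

def capture_hook(module, inp, out):
    if not inp:
        return
    captured.append(_first_tensor(inp).detach())

layer = nn.Linear(4, 4)
handle = layer.register_forward_hook(capture_hook)
layer(torch.randn(2, 4))
handle.remove()
assert captured[0].shape == (2, 4)
```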
7 changes: 6 additions & 1 deletion gptqmodel/looper/module_looper.py
@@ -33,6 +33,7 @@
from ..looper.named_module import NamedModule
from ..models import BaseQModel
from ..models._const import SUPPORTS_MODULE_TYPES
from ..models.base import CAPTURE_ONLY_FLAG
from ..nn_modules.hooked_linear import (STOP_FORWARD_EXCEPTION, HookedLinear,
StopForward, replace_module_with_hooked_legacy)
from ..quantization.config import VRAMStrategy
@@ -1251,11 +1252,15 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs):

return total_log

def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]:
def crate_named_modules(self, module, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]:
subset = {}
for n in names:
if n in full:
subset[n] = full[n]
elif n.endswith(CAPTURE_ONLY_FLAG):
# Obtain the CAPTURE_ONLY_FLAG Module separately
n = n.split(CAPTURE_ONLY_FLAG, 1)[0]
subset[n], _ = get_module_by_name_prefix(module, module_name=n)
# some modules have layer_modules that are dynamic based on config
# ref: deepseek v2/v3/r1
elif self.gptq_model.layer_modules_strict:
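Review note on the `crate_named_modules` change: a name carrying the capture-only suffix never appears in `full` (which only lists quantizable leaf modules), so the subset builder now strips the flag and resolves the parent module straight from the decoder layer passed in as `module`. A self-contained sketch of that strip-and-resolve step, using a plain attribute walk instead of the project's `get_module_by_name_prefix` helper; the literal `":?"` value for `CAPTURE_ONLY_FLAG` is an assumption inferred from the `"mlp:?"` entries in the tests:

```python
from torch import nn

CAPTURE_ONLY_FLAG = ":?"  # assumption: the suffix seen on "mlp:?" module-tree entries

def resolve_capture_only(layer: nn.Module, flagged_name: str):
    # "mlp:?" -> ("mlp", layer.mlp); dotted names walk the children one segment at a time.
    name = flagged_name.split(CAPTURE_ONLY_FLAG, 1)[0]
    target = layer
    for part in name.split("."):
        target = getattr(target, part)
    return name, target

class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(8, 8), nn.GELU())

name, module = resolve_capture_only(ToyLayer(), "mlp" + CAPTURE_ONLY_FLAG)
assert name == "mlp" and isinstance(module, nn.Sequential)
```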
18 changes: 13 additions & 5 deletions gptqmodel/looper/named_module.py
@@ -54,12 +54,14 @@ def __init__(self, module: torch.nn.Module, name: str, full_name:str, layer_inde
in_features = module.weight.shape[0]
out_features = module.weight.shape[1]
else:
raise NotImplementedError(f"Unsupported module.module type: `{type(module)}`")
in_features = None
out_features = None

self.state.update({
"in_features": in_features,
"out_features": out_features,
})
if in_features and out_features:
self.state.update({
"in_features": in_features,
"out_features": out_features,
})

def parameters(self, recurse: bool = True):
return self.module.parameters(recurse=recurse)
@@ -73,6 +75,12 @@ def buffers(self, recurse: bool = True):
def named_buffers(self, prefix: str = "", recurse: bool = True):
return self.module.named_buffers(prefix=prefix, recurse=recurse)

def register_forward_hook(
self, *args, **kwargs
):
with self._parent_lock:
return self.module.register_forward_hook(*args, **kwargs)

def register_buffer(
self, name: str, tensor: Optional[Tensor], persistent: bool = True
) -> None:
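Review note on the `NamedModule` changes: wrapping a module whose weight does not expose a 2D shape no longer raises; its in/out feature sizes are simply left out of `state`. In addition, `register_forward_hook` is now forwarded to the wrapped module under the parent lock, so hooks attach to the real forward and registration is serialized across threads. A minimal sketch of that delegation pattern on a generic wrapper (the class and lock names here are illustrative, not the project's implementation):

```python
import threading
from torch import nn

class ModuleProxy(nn.Module):
    """Toy stand-in for a NamedModule-style wrapper."""

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module
        self._parent_lock = threading.Lock()  # assumption: a plain lock guards hook registration

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)

    def register_forward_hook(self, *args, **kwargs):
        # Register on the wrapped module so the hook fires on the real forward,
        # serialized against concurrent registration from other threads.
        with self._parent_lock:
            return self.module.register_forward_hook(*args, **kwargs)
```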
1 change: 1 addition & 0 deletions gptqmodel/looper/stage_subset.py
@@ -79,6 +79,7 @@ def run_subset_stage(
is_awq_processor = processor_name_lower.startswith("awq")

subset = looper.crate_named_modules(
module=module,
full=full,
is_lm_head_module=is_lm_head_module,
layer_index=layer_index,
176 changes: 140 additions & 36 deletions gptqmodel/models/base.py

Large diffs are not rendered by default.

22 changes: 18 additions & 4 deletions gptqmodel/models/definitions/qwen2_moe.py
@@ -21,12 +21,26 @@ class Qwen2MoeQModel(BaseQModel):
"input_layernorm": ("input_layernorm:!",),
"self_attn": ("q_proj:0", "k_proj:0", "v_proj:0", "o_proj:1"),
"post_attention_layernorm": ("post_attention_layernorm:!",),
"mlp": {
"gate": ("gate:0",),
"shared_expert": ("gate_proj:0", "up_proj:0", "down_proj:1"),
"experts": {
"mlp:?": {
"gate": ("gate:!",),
"shared_expert:0": ("gate_proj:0", "up_proj:0", "down_proj:1"),
"experts:0": {
"#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
},
},
}
]

# module_tree_overrides = {
# METHOD.AWQ: [
# {
# "mlp:?": {
# "gate": ("gate:!",),
# "shared_expert": None,
# "experts": {
# "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
# },
# },
# }
# ]
# }
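For reviewers who want to see what the new `mlp:?`, `:0`, and `gate:!` annotations expand to, the flattened per-layer blocks can be printed with the same call the new test exercises; a quick local probe might look like this (output is whatever `build_layer_modules` returns for the current tree, shown only for inspection):

```python
from gptqmodel.models.definitions.qwen2_moe import Qwen2MoeQModel

blocks = Qwen2MoeQModel.build_layer_modules(Qwen2MoeQModel.module_tree)
for idx, block in enumerate(blocks):
    print(idx, block)

# Per tests/test_subset_parsing.py below, the shared_expert projections are expected to
# land in the same block as the templated expert projections, e.g.
# "mlp.shared_expert.gate_proj" alongside "mlp.experts.{expert_index}.gate_proj".
```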
4 changes: 2 additions & 2 deletions gptqmodel/quantization/gptq.py
@@ -571,8 +571,8 @@ def materialize_global_hessian(self, target_device: Optional[torch.device] = Non
for partial_device, partial in self._device_hessian_partials.items():
if partial.device != result_accum.device or partial.dtype != torch.float32:
# TODO FIXME multi-3090 using P2P is revaling an issue where result_accum and/or partial is not ready for consolidation on the main thread
# when parials are calculated on the individual
try:
# when parials are calculated on the individual
try:
result_accum.add_(partial.to(device=result_accum.device, dtype=torch.float32))
except:
log.warn(f"Quantization: Module `{self.name}` -> Retry partial.to in 0.25s")
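Context for the reordered comment and `try` above: `materialize_global_hessian` folds per-device float32 Hessian partials into a single accumulator and retries the transfer when a partial is not yet ready on the target device. A rough sketch of that accumulate-with-retry shape (function name, retry count, and backoff are illustrative, not the project's implementation):

```python
import time
import torch

def accumulate_partials(result_accum: torch.Tensor, partials: dict, retries: int = 3) -> torch.Tensor:
    # Fold per-device partial Hessians into one float32 accumulator on its device.
    for _device, partial in partials.items():
        for _attempt in range(retries):
            try:
                result_accum.add_(partial.to(device=result_accum.device, dtype=torch.float32))
                break
            except RuntimeError:
                # Device-to-device copies can race; back off briefly and retry.
                time.sleep(0.25)
    return result_accum
```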
2 changes: 1 addition & 1 deletion tests/models/model_test.py
@@ -58,14 +58,14 @@ def is_flash_attn_2_available(): # type: ignore
return False

from gptqmodel import BACKEND, DEBUG_ON, GPTQModel # noqa: E402
from gptqmodel.looper.module_looper import StopMainLoop # noqa: E402
from gptqmodel.models.base import BaseQModel # noqa: E402
from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402
from gptqmodel.quantization import FORMAT, METHOD # noqa: E402
from gptqmodel.quantization.config import QuantizeConfig, VRAMStrategy # noqa: E402
from gptqmodel.utils.eval import EVAL # noqa: E402
from gptqmodel.utils.model import MODALITY # noqa: E402
from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
from gptqmodel.looper.module_looper import StopMainLoop # noqa: E402


RAND_SEED = 898
4 changes: 1 addition & 3 deletions tests/models/test_glm4_moe._awq.py
@@ -4,10 +4,8 @@
# Contact: qubitium@modelcloud.ai, x.com/qubitium
from model_test import ModelTest

from gptqmodel.quantization.config import VRAMStrategy
from gptqmodel.utils.eval import EVAL
from gptqmodel.quantization import FORMAT, METHOD

from gptqmodel.utils.eval import EVAL


# | Metric | MARLIN |
2 changes: 0 additions & 2 deletions tests/models/test_qwen2_moe_quant.py
@@ -17,8 +17,6 @@ class TestQwen2_5_Moe(ModelTest):
"acc_norm": {"value": 0.3055, "floor_pct": 0.2},
},
}
TRUST_REMOTE_CODE = False
EVAL_BATCH_SIZE = 6

def test_qwen2_5(self):
self.quant_lm_eval()
5 changes: 2 additions & 3 deletions tests/test_awq.py
@@ -47,15 +47,14 @@ def setUpClass(cls):
if requested_samples is not None:
sample_count = max(1, int(requested_samples))
else:
total_mem_gb = 0
if torch.cuda.is_available():
try:
total_mem_gb = (
(
torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
/ (1024 ** 3)
)
except Exception:
total_mem_gb = 0
pass

# if total_mem_gb >= 80:
# sample_count = 1024
5 changes: 3 additions & 2 deletions tests/test_awq_weight_mean.py
@@ -1,12 +1,13 @@
import os


os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256,garbage_collection_threshold:0.7" #"expandable_segments:True"

import time
import torch
import pytest

import pytest
import torch
from parameterized import parameterized
from pytest import MonkeyPatch
from torch import nn
47 changes: 35 additions & 12 deletions tests/test_subset_parsing.py
@@ -3,23 +3,32 @@
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium
import os
import sys
from pathlib import Path
from types import SimpleNamespace
from typing import Callable, Dict, List, Optional

import torch
from transformers import Qwen3MoeConfig, Qwen3MoeForCausalLM
from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock


repo_root = Path(__file__).resolve().parents[1]
repo_str = str(repo_root)
if repo_str not in sys.path:
sys.path.insert(0, repo_str)

from gptqmodel.looper.awq_processor import AWQProcessor
from gptqmodel.looper.loop_processor import LoopProcessor
from gptqmodel.looper.module_looper import ModuleLooper
from gptqmodel.looper.stage_subset import run_subset_stage
from gptqmodel.looper.named_module import NamedModule
from gptqmodel.looper.stage_subset import run_subset_stage
from gptqmodel.models.definitions.qwen2_moe import Qwen2MoeQModel
from gptqmodel.models.definitions.qwen3_moe import Qwen3MoeQModel
from gptqmodel.nn_modules.hooked_linear import replace_module_with_hooked_legacy
from gptqmodel.quantization import FORMAT, METHOD
from gptqmodel.quantization.config import QuantizeConfig, VRAMStrategy
from gptqmodel.nn_modules.hooked_linear import replace_module_with_hooked_legacy
from gptqmodel.utils.model import find_modules, get_module_by_name_prefix
from gptqmodel.models.definitions.qwen3_moe import Qwen3MoeQModel


# honour the request to bind the test harness to GPU index 5 when CUDA is available
@@ -71,7 +80,7 @@ def test_mlp_capture_flag_propagates_to_layer_modules():
include_capture_only=True,
)
capture_blocks = [block for block in full if any(":?" in name for name in block)]
assert capture_blocks and capture_blocks[0] == ["mlp:?"]
assert capture_blocks and "mlp:?" in capture_blocks[0]

simple = Qwen3MoeQModel.simple_layer_modules(
model_config=model_config,
@@ -85,6 +94,20 @@
assert isinstance(mlp_module, Qwen3MoeSparseMoeBlock)


def test_qwen2_moe_shared_expert_merges_with_experts():
blocks = Qwen2MoeQModel.build_layer_modules(Qwen2MoeQModel.module_tree)

gate_block = next(block for block in blocks if "mlp.shared_expert.gate_proj" in block)
assert "mlp.experts.{expert_index}.gate_proj" in gate_block
assert "mlp.experts.{expert_index}.up_proj" in gate_block

down_block = next(block for block in blocks if "mlp.shared_expert.down_proj" in block)
assert "mlp.experts.{expert_index}.down_proj" in down_block

expert_gate_blocks = [block for block in blocks if "mlp.experts.{expert_index}.gate_proj" in block]
assert len(expert_gate_blocks) == 1


def test_awq_processor_enables_subset_early_stop():
calibration = [{"input_ids": torch.tensor([1, 2, 3])}]
qcfg = _make_quant_config()
@@ -165,14 +188,14 @@
processor: str,
):
self.events.append(
dict(
stage=stage,
layer_idx=layer_idx,
subset_index=subset_index,
subset_total=subset_total,
module_names=module_names,
processor=processor,
)
{
"stage": stage,
"layer_idx": layer_idx,
"subset_index": subset_index,
"subset_total": subset_total,
"module_names": module_names,
"processor": processor,
}
)

