1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,4 @@ example.py
/gptqmodel_ext/marlin/kernel_fp16_ku8b128.cu
/gptqmodel_offload/
/gptqmodel_ext/machete/generated/
AGENT.md
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
</p>

## Latest News
* 10/30/2025 5.1.0-dev: +Marin model. +AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor.
* 10/30/2025 5.1.0-dev: 🎉AWQ support out of beta with full feature support, including multi-gpu quant and MoE vram saving.
* 10/30/2025 5.1.0-dev: ✨Marin model. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor.
* 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage for large MoE on multi-3090 (24GB) devices.
* 10/24/2025 [5.0.0](https://github.com/ModelCloud/GPTQModel/releases/tag/v5.0.0): 🎉 Data-parallel quant support for `MoE` models on multi-gpu using `nogil` Python. `offload_to_disk` support enabled by
default to massively reduce `cpu` ram usage. New `Intel` and `AMD` cpu hw accelerated `TorchFused` kernel. Packing stage is now 4x faster and inlined with quantization. `Vram` pressure for large models is reduced during quantization.
Expand Down Expand Up @@ -135,12 +136,12 @@ GPT-QModel not only supports GPTQ but also QQQ, GPTQv2, Eora with more quantizat

GPT-QModel uses a modular design supporting multiple quantization methods and feature extensions.

| Quantization Feature | GPT-QModel | Transformers | vLLM | SGLang | Lora Training |
| Quantization Feature | GPT-QModel | Transformers | vLLM | SGLang | Lora Training |
|---------------------------|------------|---|---|---|---------------|
| GPTQ | ✅ | ✅ | ✅ | ✅ | ✅ |
| AWQ | ✅ | ✅ | ✅ | ✅ | ✅ |
| EoRA | ✅ | ✅ | ✅ | ✅ | x |
| Group Aware Act Reordering | ✅ | ✅ | ✅ | ✅ | ✅ |
| AWQ | ✅ | ✅* | ✅* | ✅* | ✅* |
| QQQ | ✅ | x | x | x | x |
| Rotation | ✅ | x | x | x | x |
| GPTAQ | ✅ | ✅ | ✅ | ✅ | ✅ |
Expand Down
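
For context on the AWQ and `VramStrategy.Balanced` entries above, here is a minimal quantization sketch. It assumes the public `GPTQModel.load` / `quantize` / `save` flow and the `QuantizeConfig(bits=..., group_size=...)` constructor; the commented-out `quant_method` and `vram_strategy` options are assumptions inferred from the news entries, not confirmed by this diff.

```python
# Minimal sketch of quantizing a model with GPT-QModel.
# Assumption: QuantizeConfig also exposes a quantization-method selector and a
# vram_strategy option (names below are illustrative; check the docs for exact spelling).
from gptqmodel import GPTQModel, QuantizeConfig

calibration = [
    "GPT-QModel is a modular quantization toolkit.",
    "Calibration data should resemble the model's target domain.",
]

quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    # quant_method="awq",                    # assumed selector for AWQ instead of GPTQ
    # vram_strategy=VramStrategy.Balanced,   # assumed property from the news entry above
)

model = GPTQModel.load("Qwen/Qwen2.5-0.5B-Instruct", quant_config)
model.quantize(calibration)
model.save("Qwen2.5-0.5B-Instruct-4bit")
```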
902 changes: 652 additions & 250 deletions gptqmodel/looper/awq_processor.py

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions gptqmodel/looper/dequantize_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

from typing import Dict
from typing import Dict, Optional

import torch

Expand All @@ -28,7 +28,15 @@ def set_calibration_dataset(self, calibration_dataset):
self.num_batches = 0

# de-quantize weights
def process(self, module: NamedModule):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
device = module.weight.device

# TODO fix num_itr param..need to calculate this before dequant
Expand Down
10 changes: 9 additions & 1 deletion gptqmodel/looper/eora_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,15 @@ def _finalize_eigen_scaling_matrix(self, name: str) -> torch.Tensor:

return merge_eora_segments(segment_pairs)

def process(self, module: NamedModule):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
assert isinstance(module.adapter_cfg, Lora)

self.pb.title(f"EoRA: Processing {module.name} ({module.module_dtype}) in layer").draw()
Expand Down
12 changes: 10 additions & 2 deletions gptqmodel/looper/gptq_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import copy
import threading
import time
from typing import Callable, Optional, Tuple
from typing import Callable, Dict, Optional, Tuple

import torch
from torch.nn import Module
Expand Down Expand Up @@ -104,7 +104,15 @@ def tmp(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
del inp, out
return tmp

def process(self, module: NamedModule):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
# Reset peak memory stats
#torch.cuda.reset_peak_memory_stats()
self.pb.title(f"Quantizing {module.name} in layer ").draw()
Expand Down
10 changes: 9 additions & 1 deletion gptqmodel/looper/loop_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,15 @@ def pre_process_fwd_hook(self, name: str) -> Callable[[Module, Tuple[torch.Tenso
pass

# do work and return processor.self state which will updated/merged
def process(self, module: NamedModule, device: torch.device = None):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
pass

# last step, after all loop processor is called
Expand Down
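
The hunks above widen the base `LoopProcessor.process` signature so every processor receives the subset context for the current layer. Below is a minimal sketch of a hypothetical subclass consuming the new keyword arguments; only the signature and the `gptqmodel.looper` import paths are taken from this diff, the subclass itself is illustrative.

```python
from typing import Dict, Optional

import torch

from gptqmodel.looper.loop_processor import LoopProcessor
from gptqmodel.looper.named_module import NamedModule


# Hypothetical processor showing how the widened process() signature can be used.
class LoggingProcessor(LoopProcessor):
    def process(
        self,
        module: NamedModule,
        device: torch.device = None,
        subset: Optional[Dict[str, NamedModule]] = None,
        previous_subset: Optional[Dict[str, NamedModule]] = None,
        subset_index: Optional[int] = None,
        subset_total: Optional[int] = None,
    ):
        # subset_index/subset_total locate this subset within the layer;
        # previous_subset exposes the modules finished in the prior subset pass.
        position = f"{subset_index + 1}/{subset_total}" if subset_index is not None else "?"
        peers = list(subset or {})
        prior = list(previous_subset or {})
        print(f"{module.name}: subset {position}, peers={peers}, previous={prior}")
```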
40 changes: 22 additions & 18 deletions gptqmodel/looper/module_looper.py
Original file line number Diff line number Diff line change
Expand Up @@ -1030,7 +1030,12 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs):
if region_timer is not None:
region_timer.flush()

layer_modules = self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config, quantize_config=self.gptq_model.quantize_config)
is_awq_quantize = any(isinstance(proc, AWQProcessor) for proc in self.processors)
layer_modules = self.gptq_model.simple_layer_modules(
model_config=self.gptq_model.model.config,
quantize_config=self.gptq_model.quantize_config,
is_awq_quantize=is_awq_quantize,
)

# true-sequential will replay the quantized activations after each subset has been quantized to be used for next subset quantization
# this should always be true for gptq unless you want a lower but misleading error_loss, which will lead to a lower-quality post-quantized model
Expand Down Expand Up @@ -1143,8 +1148,7 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs):

return total_log

def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe) -> Dict[str, NamedModule]:
is_awq_quant = isinstance(processor, AWQProcessor)
def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefix, names, processor, fail_safe, layer_module=None) -> Dict[str, NamedModule]:
subset = {}
for n in names:
if n in full:
Expand All @@ -1168,20 +1172,20 @@ def crate_named_modules(self, full, is_lm_head_module, layer_index, layers_prefi

subset[name] = named_module
full[name] = named_module
if layer_module is not None:
named_module.state.setdefault("layer_module", layer_module)

if not is_awq_quant:
if isinstance(processor, GPTQProcessor):
processor.preprocess(subset[name], fail_safe=fail_safe)
else:
processor.preprocess(subset[name])
# some modules are skipped
if processor.is_skipped(subset[name]):
skipped_modules.append(name)

if not is_awq_quant:
for name in skipped_modules:
subset.pop(name)
task_map = getattr(processor, "tasks", None)
if task_map is not None:
task_map.pop(name, None)
if isinstance(processor, GPTQProcessor):
processor.preprocess(subset[name], fail_safe=fail_safe)
else:
processor.preprocess(subset[name])
# some modules are skipped
if processor.is_skipped(subset[name]):
skipped_modules.append(name)

for name in skipped_modules:
subset.pop(name)
task_map = getattr(processor, "tasks", None)
if task_map is not None:
task_map.pop(name, None)
return subset
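
`crate_named_modules` now also records the owning layer module under the `"layer_module"` key of each `NamedModule`'s state when a `layer_module` argument is supplied. A short sketch of reading it back inside a hypothetical `process()` body (only the state key comes from this diff):

```python
# Inside a hypothetical processor.process() implementation:
layer_module = module.state.get("layer_module")  # populated by crate_named_modules when layer_module is passed
if layer_module is not None:
    # e.g. enumerate sibling submodules of the owning decoder layer
    sibling_names = [name for name, _ in layer_module.named_modules() if name]
```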
12 changes: 10 additions & 2 deletions gptqmodel/looper/native_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

from typing import Callable, Optional, Tuple
from typing import Callable, Dict, Optional, Tuple

import torch
from torch.nn import Module
Expand Down Expand Up @@ -66,7 +66,15 @@ def tmp(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):

return tmp

def process(self, module: NamedModule):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
module.state[NATIVE_INPUTS_STATE_KEY] = self.native_inp_caches.pop(module.name)

def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
Expand Down
12 changes: 10 additions & 2 deletions gptqmodel/looper/qqq_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import contextlib
import copy
from typing import Callable, Optional, Tuple
from typing import Callable, Dict, Optional, Tuple

import torch
from torch.nn import Module
Expand Down Expand Up @@ -100,7 +100,15 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
q.add_batch(inp[0].data, out.data) # noqa: F821
return tmp

def process(self, module: NamedModule):
def process(
self,
module: NamedModule,
device: torch.device = None,
subset: Optional[Dict[str, NamedModule]] = None,
previous_subset: Optional[Dict[str, NamedModule]] = None,
subset_index: Optional[int] = None,
subset_total: Optional[int] = None,
):
self.pb.title(f"Quantizing {module.name} in layer ").draw()
qqq = self.tasks

Expand Down
57 changes: 23 additions & 34 deletions gptqmodel/looper/stage_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@

from __future__ import annotations

import logging
import threading
import time
from concurrent.futures import as_completed
from contextlib import nullcontext
from typing import TYPE_CHECKING, Dict, List, Optional

import torch
Expand All @@ -20,10 +20,8 @@
from ..looper.gptq_processor import GPTQProcessor
from ..looper.named_module import NamedModule
from ..looper.qqq_processor import QQQProcessor
from ..utils.ctx import ctx
from ..utils.device import get_device, get_device_new
from ..utils.logger import log_time_block, setup_logger
from ..utils.looper_helpers import device_ctx
from ..utils.model import find_modules, get_module
from ..utils.offload import offload_to_disk
from ..utils.torch import CPU, torch_sync
Expand Down Expand Up @@ -90,37 +88,6 @@ def run_layer_stage(
# merge all subsets into one
modules = [sum(modules, [])]

# AWQ does per-layer itself; skip here
if isinstance(processor, AWQProcessor):
named_childs = dict()
for index, names in enumerate(modules):
named_modules = looper.crate_named_modules(full=full,
is_lm_head_module=is_lm_head_module,
layer_index=layer_index, layers_prefix=layers_prefix,
names=names,
processor=processor,
fail_safe=fail_safe)
named_childs.update(named_modules)

lock_ctx = nullcontext()
device_for_ctx = cur_layer_device if getattr(cur_layer_device, 'type', None) != 'meta' else None
if device_for_ctx is not None:
lock_ctx = DEVICE_THREAD_POOL.read_lock(cur_layer_device)
with ctx(lock_ctx, device_ctx(device_for_ctx)):
processor.layer_quantize(module, cur_layer_device, named_childs)
if p_index == len(looper.processors) - 1:
looper._emit_layer_complete(
layer_idx=layer_index,
submodule_finalized=False,
raise_in_place=True,
)
looper._emit_layer_complete(
layer_idx=layer_index,
submodule_finalized=True,
raise_in_place=True,
)
continue

layer_inputs = processor.inputs_cache.layer_inputs
if is_lm_head_module:
layer_inputs = looper.gptq_model.lm_head_pre_quantize_generate_hook(layer_inputs)
Expand All @@ -131,8 +98,28 @@ def run_layer_stage(
processed_subset: Dict[str, NamedModule] = {}
last_subset_context: Optional[SubsetForwardContext] = None
subset_total = len(modules)
previous_subset_processed: Optional[Dict[str, NamedModule]] = None

for index, names in enumerate(modules):
if isinstance(processor, AWQProcessor):
log.info(
"StageLayer[awq]: layer=%s subset=%s/%s size=%s names=%s",
layer_index,
index + 1,
subset_total,
len(names),
names[:5],
)
elif log.isEnabledFor(logging.DEBUG):
log.debug(
"StageLayer: layer=%s subset=%s/%s processor=%s size=%s names=%s",
layer_index,
index + 1,
subset_total,
processor.name(),
len(names),
names[:8],
)
subset_result = run_subset_stage(
looper,
processor=processor,
Expand All @@ -156,10 +143,12 @@ def run_layer_stage(
pb=pb,
log=log,
region_timer=region_timer,
previous_processed_subset=previous_subset_processed,
)

layer_inputs = subset_result.layer_inputs
processed_subset.update(subset_result.processed_subset)
previous_subset_processed = subset_result.processed_subset
if subset_result.forward_context is not None:
last_subset_context = subset_result.forward_context

Expand Down
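
The `stage_layer` changes drop the AWQ special case: every processor, AWQ included, now walks the regular subset loop, and each iteration's processed subset is handed to the next `run_subset_stage` call via `previous_processed_subset`. A self-contained toy of that threading pattern follows; the names below are illustrative, not the real `run_subset_stage` signature.

```python
from typing import Dict, List, Optional


def process_subset(names: List[str], previous: Optional[Dict[str, str]]) -> Dict[str, str]:
    # Stand-in for run_subset_stage: handle one subset while seeing what the prior subset produced.
    context = sorted(previous) if previous else []
    return {name: f"quantized(prev={context})" for name in names}


subsets = [["q_proj", "k_proj", "v_proj"], ["o_proj"], ["mlp.up_proj", "mlp.down_proj"]]
processed_all: Dict[str, str] = {}
previous_processed: Optional[Dict[str, str]] = None

for names in subsets:
    result = process_subset(names, previous_processed)
    processed_all.update(result)
    previous_processed = result  # the just-finished subset becomes context for the next one
```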