29 changes: 15 additions & 14 deletions README.md
@@ -17,6 +17,7 @@
</p>

## Latest News
* 10/31/2025 5.1.0-dev: ✨IBM Granite Nano support. New `calibration_concat_separator` config option.
* 10/30/2025 5.1.0-dev: 🎉AWQ support out of beta with full feature support, including multi-gpu quant and MoE vram saving.
* 10/30/2025 5.1.0-dev: ✨Marin model support. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor.
* 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage with large MoE models on multi-3090 (24GB) devices.
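For readers who want to try the new `calibration_concat_separator` option from the 10/31 entry, here is a minimal usage sketch. It assumes the option is passed through `QuantizeConfig` alongside the usual quantization parameters; its exact placement and semantics should be confirmed against the 5.1.0-dev docs, and the model id, output path, and calibration samples below are placeholders.

```python
from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder calibration data; real runs should use a few hundred representative samples.
calibration_dataset = [
    "GPTQModel quantizes large language models to low-bit formats.",
    "Calibration text is packed into batches before quantization.",
]

# Assumption: the new option is a QuantizeConfig field controlling the separator
# inserted between concatenated calibration samples.
quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    calibration_concat_separator="\n\n",
)

model_id = "path/or/hf-id-of-a-granite-nano-model"  # placeholder
model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration_dataset)
model.save("granite-nano-gptq-4bit")  # placeholder output directory
```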
@@ -179,20 +180,20 @@ Native support for some of the most popular multi-modal models:
<img src=https://github.com/user-attachments/assets/c1b89394-f8f6-44e5-9949-bef15a124723 width="51%"> <img src=https://github.com/user-attachments/assets/23901236-10c5-4435-ac2f-06cf2e097f1e width="47%">

## Model Support
| Model | | | | | | | | | |
|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------------|---|
| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2/2.5 VL | ✅ |
| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | LING/RING | ✅ | MPT | ✅ | RefinedWeb | ✅ |
| CodeGen | ✅ | GPTBigCode | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
| Cohere 1-2 | ✅ | GPT-Neo/GPT-NeoX | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
| Deci | ✅ | GPT-J | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
| Dream | ✅ | GRIN-MoE | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | | |
| ERNIE 4.5 | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | | |
| Model | | | | | | | | | |
|-------------------|---|-------------|---|----------------|---|----------------|---|---------------------|---|
| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2/2.5/3 VL | ✅ |
| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | LING/RING | ✅ | MPT | ✅ | RefinedWeb | ✅ |
| CodeGen | ✅ | GPTBigCode | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
| Cohere 1-2 | ✅ | GPT-Neo(X) | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
| Deci | ✅ | GPT-J | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
| Dream | ✅ | GRIN-MoE | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | Minimax M2 | ✅ |
| ERNIE 4.5 | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | GLM 4.X | ✅ |


## Platform and HW Support
25 changes: 23 additions & 2 deletions gptqmodel/quantization/gptq.py
@@ -594,6 +594,10 @@ def hessian_inverse(self, H: torch.Tensor):
    mean = torch.mean(current_diag)
    damp = self.qcfg.damp_percent

    damp_recovery_started = False
    recovery_initial_damp = None
    recovery_last_damp = None

    while 0 < damp < 1:
        try:
            diag_view.add_(damp * mean)
@@ -602,19 +606,36 @@ def hessian_inverse(self, H: torch.Tensor):
            diag_view.copy_(current_diag)
            del H2
            used_damp = damp
            if damp_recovery_started:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Damp recovery succeeded at `damp_percent={damp:.5f}` "
                    f"(started at {recovery_initial_damp:.5f})."
                )
            return Hinv_result, used_damp
        except torch._C._LinAlgError as e:
            last_error = e
            diag_view.copy_(current_diag)
            if self.qcfg.damp_auto_increment != 0:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Current `damp_percent = {damp:.5f}` is too low, auto-incrementing by `{self.qcfg.damp_auto_increment:.5f}`")
                if not damp_recovery_started:
                    damp_recovery_started = True
                    recovery_initial_damp = damp
                    log.warn(
                        f"Quantization: Module `{self.name}` -> Starting damp recovery at "
                        f"`damp_percent={damp:.5f}`, increment step `{self.qcfg.damp_auto_increment:.5f}`."
                    )
                damp += self.qcfg.damp_auto_increment
                recovery_last_damp = damp
            else:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Hessian Cholesky failed with `damp_percent={damp:.5f}` and no auto increment configured.")
                break

    if damp_recovery_started:
        final_damp = recovery_last_damp if recovery_last_damp is not None else damp
        log.warn(
            f"Quantization: Module `{self.name}` -> Damp recovery failed after reaching `damp_percent={final_damp:.5f}`."
        )

    attempt += 1

    log.error(
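To make the recovery flow above easier to follow in isolation, here is a self-contained sketch of the same damping pattern: add `damp * mean(diag(H))` to the diagonal, attempt the Cholesky-based inverse, and on failure restore the diagonal and raise the damping. Names like `damped_cholesky_inverse` are illustrative; this is not the library's actual `hessian_inverse`, which additionally tracks attempts, module names, and structured logging.

```python
import torch


def damped_cholesky_inverse(H: torch.Tensor, damp: float = 0.05, damp_auto_increment: float = 0.01):
    """Sketch of the damp-recovery loop: retry Cholesky inversion with increasing diagonal damping."""
    diag_view = torch.diagonal(H)
    original_diag = diag_view.clone()
    mean = original_diag.mean()

    while 0 < damp < 1:
        try:
            diag_view.add_(damp * mean)        # dampen the diagonal in place
            L = torch.linalg.cholesky(H)
            Hinv = torch.cholesky_inverse(L)
            diag_view.copy_(original_diag)     # undo damping so H is returned unchanged
            return Hinv, damp                  # report the damp level that worked
        except torch.linalg.LinAlgError:
            diag_view.copy_(original_diag)     # roll back before retrying
            if damp_auto_increment == 0:
                break                          # no auto increment configured: give up
            damp += damp_auto_increment        # raise damping and try again

    raise RuntimeError(f"Cholesky failed even at damp_percent={damp:.5f}")
```

Feeding it a deliberately ill-conditioned matrix shows the returned damp level climbing until the factorization succeeds, which is the behavior the new recovery logging above reports.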
16 changes: 9 additions & 7 deletions gptqmodel/utils/nogil_patcher.py
@@ -138,7 +138,7 @@ def patched_check_disk_cache(self, tuning_key, configs, bench_fn):
    )
    return False, bench_time, configs_timings, best_config

def _get_config_for_key(self, key, nargs, args, kwargs):
def _get_config_for_key(self, key, args, kwargs):
    with self._cache_lock:
        cached = self._cache.get(key)
        if cached is not None:
@@ -157,18 +157,17 @@ def _get_config_for_key(self, key, nargs, args, kwargs):
        if future.error is not None:
            raise future.error
        return future.config, future.used_cached_result, future.bench_time

    pruned_configs = self.prune_configs(kwargs, nargs)
    pruned_configs = self.prune_configs(kwargs)

    def benchmark():
        bench_start = time.time()
        timings = {
            config: self._bench(nargs, *args, config=config, **kwargs)
            config: self._bench(*args, config=config, **kwargs)
            for config in pruned_configs
        }
        bench_duration = time.time() - bench_start
        best_config = builtins_mod.min(timings, key=timings.get)
        full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
        full_nargs_local = {**self.nargs, **kwargs, **best_config.all_kwargs()}
        self.pre_hook(full_nargs_local, reset_only=True)
        return timings, bench_duration, best_config

@@ -203,6 +202,7 @@ def benchmark():

def patched_run(self, *args, **kwargs):
    nargs = dict(zip(self.arg_names, args))
    self.nargs = nargs
    used_cached_result = True
    bench_time = None
    key = None
@@ -214,7 +214,7 @@ def patched_run(self, *args, **kwargs):
            if hasattr(arg, "dtype"):
                key_values.append(str(arg.dtype))
        key = tuple(key_values)
        config, used_cached_result, bench_time = _get_config_for_key(self, key, nargs, args, kwargs)
        config, used_cached_result, bench_time = _get_config_for_key(self, key, args, kwargs)
    else:
        config = self.configs[0]

@@ -231,11 +231,13 @@ def patched_run(self, *args, **kwargs):
    full_nargs = {**nargs, **kwargs, **config.all_kwargs()}
    if config.pre_hook is not None:
        config.pre_hook(full_nargs)
    return self.fn.run(
    result = self.fn.run(
        *args,
        **kwargs,
        **config.all_kwargs(),
    )
    self.nargs = None
    return result

autotuner_cls.__init__ = patched_init
autotuner_cls.check_disk_cache = patched_check_disk_cache
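The `_cache_lock`/future handling in the patched autotuner follows a common pattern for free-threaded (no-GIL) Python: the first thread to see a tuning key runs the benchmark, and concurrent callers wait on an event instead of benchmarking the same key again. Below is a stripped-down, hypothetical sketch of that pattern; the class and method names are illustrative and not taken from the patcher itself.

```python
import threading


class _PendingResult:
    """Published by the owning thread; other threads wait on `event`."""

    def __init__(self):
        self.event = threading.Event()
        self.value = None
        self.error = None


class KeyedBenchCache:
    """Sketch: per-key cache where only one thread runs the expensive benchmark."""

    def __init__(self, bench_fn):
        self._bench_fn = bench_fn
        self._lock = threading.Lock()
        self._cache = {}    # key -> finished result
        self._pending = {}  # key -> _PendingResult being computed

    def get(self, key, *args, **kwargs):
        with self._lock:
            if key in self._cache:
                return self._cache[key]
            pending = self._pending.get(key)
            if pending is None:
                pending = _PendingResult()
                self._pending[key] = pending
                owner = True
            else:
                owner = False

        if not owner:
            pending.event.wait()              # block until the owner publishes
            if pending.error is not None:
                raise pending.error
            return pending.value

        try:
            pending.value = self._bench_fn(key, *args, **kwargs)
            with self._lock:
                self._cache[key] = pending.value
                self._pending.pop(key, None)
        except Exception as exc:              # propagate the failure to waiters too
            pending.error = exc
            with self._lock:
                self._pending.pop(key, None)
            raise
        finally:
            pending.event.set()               # wake every waiter
        return pending.value
```

In the real patcher the cached value is the winning Triton config plus its benchmark time, and the key is derived from the autotuner's key arguments and tensor dtypes, as the diff above shows.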