diff --git a/README.md b/README.md
index 89b696f92..f86ccffbd 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@
## Latest News
+* 10/31/2025 5.1.0-dev: ✨ IBM Granite Nano support. New `calibration_concat_separator` config option.
* 10/30/2025 5.1.0-dev: AWQ support out of beta with full feature support, including multi-GPU quant and MoE VRAM saving.
* 10/30/2025 5.1.0-dev: ✨ Marin model support. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor.
* 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage with large MoE models on multi-3090 (24GB) devices.
@@ -179,20 +180,20 @@ Native support for some of the most popular multi-modal models:
## Model Support
-| Model | | | | | | | | | |
-|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------------|---|
-| Apertus           | ✅ | EXAONE 3.0        | ✅ | InternLM 1/2.5 | ✅ | Mixtral        | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
-| Baichuan          | ✅ | Falcon (H1)       | ✅ | Kimi K2        | ✅ | MobileLLM      | ✅ | Qwen 2/2.5 VL       | ✅ |
-| Bloom             | ✅ | FastVLM           | ✅ | Klear          | ✅ | MOSS           | ✅ | Qwen 2.5/3 Omni     | ✅ |
-| ChatGLM           | ✅ | Gemma 1/2/3       | ✅ | LING/RING      | ✅ | MPT            | ✅ | RefinedWeb          | ✅ |
-| CodeGen           | ✅ | GPTBigCod         | ✅ | Llama 1-3.3    | ✅ | Nemotron H     | ✅ | StableLM            | ✅ |
-| Cohere 1-2        | ✅ | GPTQ-Neo/GPT-NeoX | ✅ | Llama 3.2 VL   | ✅ | Nemotron Ultra | ✅ | StarCoder2          | ✅ |
-| DBRX Converted    | ✅ | GPT-2             | ✅ | Llama 4        | ✅ | OPT            | ✅ | TeleChat2           | ✅ |
-| Deci              | ✅ | GPT-J             | ✅ | LongCatFlash   | ✅ | OLMo2          | ✅ | Yi                  | ✅ |
-| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS           | ✅ | LongLLaMA      | ✅ | Ovis 1.6/2     | ✅ | Seed-OSS            | ✅ |
-| DeepSeek-V2-Lite  | ✅ | Granite           | ✅ | Instella       | ✅ | Phi 1-4        | ✅ | XVERSE              | ✅ |
-| Dream             | ✅ | GRIN-MoE          | ✅ | MiniCPM3       | ✅ | PanGu-α        | ✅ |                     |   |
-| ERNIE 4.5         | ✅ | Hymba             | ✅ | Mistral        | ✅ | Qwen 1/2/3     | ✅ |                     |   |
+| Model | | | | | | | | | |
+|-------------------|---|-------------|---|----------------|---|----------------|---|---------------------|---|
+| Apertus           | ✅ | EXAONE 3.0  | ✅ | InternLM 1/2.5 | ✅ | Mixtral        | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
+| Baichuan          | ✅ | Falcon (H1) | ✅ | Kimi K2        | ✅ | MobileLLM      | ✅ | Qwen 2/2.5/3 VL     | ✅ |
+| Bloom             | ✅ | FastVLM     | ✅ | Klear          | ✅ | MOSS           | ✅ | Qwen 2.5/3 Omni     | ✅ |
+| ChatGLM           | ✅ | Gemma 1/2/3 | ✅ | LING/RING      | ✅ | MPT            | ✅ | RefinedWeb          | ✅ |
+| CodeGen           | ✅ | GPTBigCode  | ✅ | Llama 1-3.3    | ✅ | Nemotron H     | ✅ | StableLM            | ✅ |
+| Cohere 1-2        | ✅ | GPT-Neo(X)  | ✅ | Llama 3.2 VL   | ✅ | Nemotron Ultra | ✅ | StarCoder2          | ✅ |
+| DBRX Converted    | ✅ | GPT-2       | ✅ | Llama 4        | ✅ | OPT            | ✅ | TeleChat2           | ✅ |
+| Deci              | ✅ | GPT-J       | ✅ | LongCatFlash   | ✅ | OLMo2          | ✅ | Yi                  | ✅ |
+| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS     | ✅ | LongLLaMA      | ✅ | Ovis 1.6/2     | ✅ | Seed-OSS            | ✅ |
+| DeepSeek-V2-Lite  | ✅ | Granite     | ✅ | Instella       | ✅ | Phi 1-4        | ✅ | XVERSE              | ✅ |
+| Dream             | ✅ | GRIN-MoE    | ✅ | MiniCPM3       | ✅ | PanGu-α        | ✅ | Minimax M2          | ✅ |
+| ERNIE 4.5         | ✅ | Hymba       | ✅ | Mistral        | ✅ | Qwen 1/2/3     | ✅ | GLM 4.X             | ✅ |
## Platform and HW Support
diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py
index 7321d5c98..4f7723a02 100644
--- a/gptqmodel/quantization/gptq.py
+++ b/gptqmodel/quantization/gptq.py
@@ -594,6 +594,10 @@ def hessian_inverse(self, H: torch.Tensor):
mean = torch.mean(current_diag)
damp = self.qcfg.damp_percent
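+        # Track damp auto-increment recovery: whether it has started, the damp value it started from,
+        # and the last damp value attempted, so the outcome can be logged once recovery finishes.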
+ damp_recovery_started = False
+ recovery_initial_damp = None
+ recovery_last_damp = None
+
while 0 < damp < 1:
try:
diag_view.add_(damp * mean)
@@ -602,19 +606,36 @@ def hessian_inverse(self, H: torch.Tensor):
diag_view.copy_(current_diag)
del H2
used_damp = damp
+ if damp_recovery_started:
+ log.warn(
+ f"Quantization: Module `{self.name}` -> Damp recovery succeeded at `damp_percent={damp:.5f}` "
+ f"(started at {recovery_initial_damp:.5f})."
+ )
return Hinv_result, used_damp
except torch._C._LinAlgError as e:
last_error = e
diag_view.copy_(current_diag)
if self.qcfg.damp_auto_increment != 0:
- log.warn(
- f"Quantization: Module `{self.name}` -> Current `damp_percent = {damp:.5f}` is too low, auto-incrementing by `{self.qcfg.damp_auto_increment:.5f}`")
+ if not damp_recovery_started:
+ damp_recovery_started = True
+ recovery_initial_damp = damp
+ log.warn(
+ f"Quantization: Module `{self.name}` -> Starting damp recovery at "
+ f"`damp_percent={damp:.5f}`, increment step `{self.qcfg.damp_auto_increment:.5f}`."
+ )
damp += self.qcfg.damp_auto_increment
+ recovery_last_damp = damp
else:
log.warn(
f"Quantization: Module `{self.name}` -> Hessian Cholesky failed with `damp_percent={damp:.5f}` and no auto increment configured.")
break
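+        # Recovery was attempted but no damp value produced a successful Cholesky factorization;
+        # report the final damp reached before falling through to the error path.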
+ if damp_recovery_started:
+ final_damp = recovery_last_damp if recovery_last_damp is not None else damp
+ log.warn(
+ f"Quantization: Module `{self.name}` -> Damp recovery failed after reaching `damp_percent={final_damp:.5f}`."
+ )
+
attempt += 1
log.error(
diff --git a/gptqmodel/utils/nogil_patcher.py b/gptqmodel/utils/nogil_patcher.py
index e6b85d42f..5b2c8a166 100644
--- a/gptqmodel/utils/nogil_patcher.py
+++ b/gptqmodel/utils/nogil_patcher.py
@@ -138,7 +138,7 @@ def patched_check_disk_cache(self, tuning_key, configs, bench_fn):
)
return False, bench_time, configs_timings, best_config
- def _get_config_for_key(self, key, nargs, args, kwargs):
+ def _get_config_for_key(self, key, args, kwargs):
with self._cache_lock:
cached = self._cache.get(key)
if cached is not None:
@@ -157,18 +157,17 @@ def _get_config_for_key(self, key, nargs, args, kwargs):
if future.error is not None:
raise future.error
return future.config, future.used_cached_result, future.bench_time
-
- pruned_configs = self.prune_configs(kwargs, nargs)
+ pruned_configs = self.prune_configs(kwargs)
def benchmark():
bench_start = time.time()
timings = {
- config: self._bench(nargs, *args, config=config, **kwargs)
+ config: self._bench(*args, config=config, **kwargs)
for config in pruned_configs
}
bench_duration = time.time() - bench_start
best_config = builtins_mod.min(timings, key=timings.get)
- full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
+ full_nargs_local = {**self.nargs, **kwargs, **best_config.all_kwargs()}
self.pre_hook(full_nargs_local, reset_only=True)
return timings, bench_duration, best_config
@@ -203,6 +202,7 @@ def benchmark():
def patched_run(self, *args, **kwargs):
nargs = dict(zip(self.arg_names, args))
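+        # Stash the name->argument mapping on the instance so _get_config_for_key and its benchmark
+        # closure can read it via self.nargs (prune_configs and _bench no longer take an explicit nargs).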
+ self.nargs = nargs
used_cached_result = True
bench_time = None
key = None
@@ -214,7 +214,7 @@ def patched_run(self, *args, **kwargs):
if hasattr(arg, "dtype"):
key_values.append(str(arg.dtype))
key = tuple(key_values)
- config, used_cached_result, bench_time = _get_config_for_key(self, key, nargs, args, kwargs)
+ config, used_cached_result, bench_time = _get_config_for_key(self, key, args, kwargs)
else:
config = self.configs[0]
@@ -231,11 +231,13 @@ def patched_run(self, *args, **kwargs):
full_nargs = {**nargs, **kwargs, **config.all_kwargs()}
if config.pre_hook is not None:
config.pre_hook(full_nargs)
- return self.fn.run(
+ result = self.fn.run(
*args,
**kwargs,
**config.all_kwargs(),
)
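+        # Clear the stashed mapping so tensor arguments are not kept alive on the autotuner between runs.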
+ self.nargs = None
+ return result
autotuner_cls.__init__ = patched_init
autotuner_cls.check_disk_cache = patched_check_disk_cache