29 changes: 15 additions & 14 deletions README.md
@@ -17,6 +17,7 @@
</p>

## Latest News
* 10/31/2025 5.1.0-dev: ✨IBM Granite Nano support. New `calibration_concat_separator` config option.
* 10/30/2025 5.1.0-dev: 🎉AWQ support out of beta with full feature support, including multi-gpu quant and MoE vram saving.
* 10/30/2025 5.1.0-dev: ✨Marin model support. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refactor.
* 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage with large MoE models on multi-3090 (24GB) devices.
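For readers who want to try the new `calibration_concat_separator` option from the 10/31 entry, here is a minimal usage sketch. It assumes the option is passed through `QuantizeConfig` alongside the usual quantization parameters; its exact placement and semantics should be confirmed against the 5.1.0-dev docs, and the model id, output path, and calibration samples below are placeholders.

```python
from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder calibration data; real runs should use a few hundred representative samples.
calibration_dataset = [
    "GPTQModel quantizes large language models to low-bit formats.",
    "Calibration text is packed into batches before quantization.",
]

# Assumption: the new option is a QuantizeConfig field controlling the separator
# inserted between concatenated calibration samples.
quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    calibration_concat_separator="\n\n",
)

model_id = "path/or/hf-id-of-a-granite-nano-model"  # placeholder
model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration_dataset)
model.save("granite-nano-gptq-4bit")  # placeholder output directory
```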
@@ -179,20 +180,20 @@ Native support for some of the most popular multi-modal models:
<img src=https://github.com/user-attachments/assets/c1b89394-f8f6-44e5-9949-bef15a124723 width="51%"> <img src=https://github.com/user-attachments/assets/23901236-10c5-4435-ac2f-06cf2e097f1e width="47%">

## Model Support
| Model | | | | | | | | | |
|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------------|---|
| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2/2.5 VL | ✅ |
| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | LING/RING | ✅ | MPT | ✅ | RefinedWeb | ✅ |
| CodeGen | ✅ | GPTBigCode | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
| Cohere 1-2 | ✅ | GPT-Neo/GPT-NeoX | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
| Deci | ✅ | GPT-J | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
| Dream | ✅ | GRIN-MoE | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | | |
| ERNIE 4.5 | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | | |
| Model | | | | | | | | | |
|-------------------|---|-------------|---|----------------|---|----------------|---|---------------------|---|
| Apertus | ✅ | EXAONE 3.0 | ✅ | InternLM 1/2.5 | ✅ | Mixtral | ✅ | Qwen 2/3 (Next/MoE) | ✅ |
| Baichuan | ✅ | Falcon (H1) | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2/2.5/3 VL | ✅ |
| Bloom | ✅ | FastVLM | ✅ | Klear | ✅ | MOSS | ✅ | Qwen 2.5/3 Omni | ✅ |
| ChatGLM | ✅ | Gemma 1/2/3 | ✅ | LING/RING | ✅ | MPT | ✅ | RefinedWeb | ✅ |
| CodeGen | ✅ | GPTBigCode | ✅ | Llama 1-3.3 | ✅ | Nemotron H | ✅ | StableLM | ✅ |
| Cohere 1-2 | ✅ | GPT-Neo(X) | ✅ | Llama 3.2 VL | ✅ | Nemotron Ultra | ✅ | StarCoder2 | ✅ |
| DBRX Converted | ✅ | GPT-2 | ✅ | Llama 4 | ✅ | OPT | ✅ | TeleChat2 | ✅ |
| Deci | ✅ | GPT-J | ✅ | LongCatFlash | ✅ | OLMo2 | ✅ | Yi | ✅ |
| DeepSeek-V2/V3/R1 | ✅ | GPT-OSS | ✅ | LongLLaMA | ✅ | Ovis 1.6/2 | ✅ | Seed-OSS | ✅ |
| DeepSeek-V2-Lite | ✅ | Granite | ✅ | Instella | ✅ | Phi 1-4 | ✅ | XVERSE | ✅ |
| Dream | ✅ | GRIN-MoE | ✅ | MiniCPM3 | ✅ | PanGu-α | ✅ | Minimax M2 | ✅ |
| ERNIE 4.5 | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3 | ✅ | GLM 4.X | ✅ |


## Platform and HW Support
25 changes: 23 additions & 2 deletions gptqmodel/quantization/gptq.py
@@ -594,6 +594,10 @@ def hessian_inverse(self, H: torch.Tensor):
    mean = torch.mean(current_diag)
    damp = self.qcfg.damp_percent

    damp_recovery_started = False
    recovery_initial_damp = None
    recovery_last_damp = None

    while 0 < damp < 1:
        try:
            diag_view.add_(damp * mean)
@@ -602,19 +606,36 @@ def hessian_inverse(self, H: torch.Tensor):
            diag_view.copy_(current_diag)
            del H2
            used_damp = damp
            if damp_recovery_started:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Damp recovery succeeded at `damp_percent={damp:.5f}` "
                    f"(started at {recovery_initial_damp:.5f})."
                )
            return Hinv_result, used_damp
        except torch._C._LinAlgError as e:
            last_error = e
            diag_view.copy_(current_diag)
            if self.qcfg.damp_auto_increment != 0:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Current `damp_percent = {damp:.5f}` is too low, auto-incrementing by `{self.qcfg.damp_auto_increment:.5f}`")
                if not damp_recovery_started:
                    damp_recovery_started = True
                    recovery_initial_damp = damp
                    log.warn(
                        f"Quantization: Module `{self.name}` -> Starting damp recovery at "
                        f"`damp_percent={damp:.5f}`, increment step `{self.qcfg.damp_auto_increment:.5f}`."
                    )
                damp += self.qcfg.damp_auto_increment
                recovery_last_damp = damp
            else:
                log.warn(
                    f"Quantization: Module `{self.name}` -> Hessian Cholesky failed with `damp_percent={damp:.5f}` and no auto increment configured.")
                break

    if damp_recovery_started:
        final_damp = recovery_last_damp if recovery_last_damp is not None else damp
        log.warn(
            f"Quantization: Module `{self.name}` -> Damp recovery failed after reaching `damp_percent={final_damp:.5f}`."
        )

    attempt += 1

    log.error(
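To make the recovery flow above easier to follow in isolation, here is a self-contained sketch of the same damping pattern: add `damp * mean(diag(H))` to the diagonal, attempt the Cholesky-based inverse, and on failure restore the diagonal and raise the damping. Names like `damped_cholesky_inverse` are illustrative; this is not the library's actual `hessian_inverse`, which additionally tracks attempts, module names, and structured logging.

```python
import torch


def damped_cholesky_inverse(H: torch.Tensor, damp: float = 0.05, damp_auto_increment: float = 0.01):
    """Sketch of the damp-recovery loop: retry Cholesky inversion with increasing diagonal damping."""
    diag_view = torch.diagonal(H)
    original_diag = diag_view.clone()
    mean = original_diag.mean()

    while 0 < damp < 1:
        try:
            diag_view.add_(damp * mean)        # dampen the diagonal in place
            L = torch.linalg.cholesky(H)
            Hinv = torch.cholesky_inverse(L)
            diag_view.copy_(original_diag)     # undo damping so H is returned unchanged
            return Hinv, damp                  # report the damp level that worked
        except torch.linalg.LinAlgError:
            diag_view.copy_(original_diag)     # roll back before retrying
            if damp_auto_increment == 0:
                break                          # no auto increment configured: give up
            damp += damp_auto_increment        # raise damping and try again

    raise RuntimeError(f"Cholesky failed even at damp_percent={damp:.5f}")
```

Feeding it a deliberately ill-conditioned matrix shows the returned damp level climbing until the factorization succeeds, which is the behavior the new recovery logging above reports.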
16 changes: 9 additions & 7 deletions gptqmodel/utils/nogil_patcher.py
@@ -138,7 +138,7 @@ def patched_check_disk_cache(self, tuning_key, configs, bench_fn):
    )
    return False, bench_time, configs_timings, best_config

def _get_config_for_key(self, key, nargs, args, kwargs):
def _get_config_for_key(self, key, args, kwargs):
    with self._cache_lock:
        cached = self._cache.get(key)
        if cached is not None:
@@ -157,18 +157,17 @@ def _get_config_for_key(self, key, nargs, args, kwargs):
        if future.error is not None:
            raise future.error
        return future.config, future.used_cached_result, future.bench_time

    pruned_configs = self.prune_configs(kwargs, nargs)
    pruned_configs = self.prune_configs(kwargs)

    def benchmark():
        bench_start = time.time()
        timings = {
            config: self._bench(nargs, *args, config=config, **kwargs)
            config: self._bench(*args, config=config, **kwargs)
            for config in pruned_configs
        }
        bench_duration = time.time() - bench_start
        best_config = builtins_mod.min(timings, key=timings.get)
        full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
        full_nargs_local = {**self.nargs, **kwargs, **best_config.all_kwargs()}
        self.pre_hook(full_nargs_local, reset_only=True)
        return timings, bench_duration, best_config

@@ -203,6 +202,7 @@ def benchmark():

def patched_run(self, *args, **kwargs):
    nargs = dict(zip(self.arg_names, args))
    self.nargs = nargs
    used_cached_result = True
    bench_time = None
    key = None
@@ -214,7 +214,7 @@ def patched_run(self, *args, **kwargs):
            if hasattr(arg, "dtype"):
                key_values.append(str(arg.dtype))
        key = tuple(key_values)
        config, used_cached_result, bench_time = _get_config_for_key(self, key, nargs, args, kwargs)
        config, used_cached_result, bench_time = _get_config_for_key(self, key, args, kwargs)
    else:
        config = self.configs[0]

@@ -231,11 +231,13 @@ def patched_run(self, *args, **kwargs):
    full_nargs = {**nargs, **kwargs, **config.all_kwargs()}
    if config.pre_hook is not None:
        config.pre_hook(full_nargs)
    return self.fn.run(
    result = self.fn.run(
        *args,
        **kwargs,
        **config.all_kwargs(),
    )
    self.nargs = None
    return result

autotuner_cls.__init__ = patched_init
autotuner_cls.check_disk_cache = patched_check_disk_cache
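The `_cache_lock`/future handling in the patched autotuner follows a common pattern for free-threaded (no-GIL) Python: the first thread to see a tuning key runs the benchmark, and concurrent callers wait on an event instead of benchmarking the same key again. Below is a stripped-down, hypothetical sketch of that pattern; the class and method names are illustrative and not taken from the patcher itself.

```python
import threading


class _PendingResult:
    """Published by the owning thread; other threads wait on `event`."""

    def __init__(self):
        self.event = threading.Event()
        self.value = None
        self.error = None


class KeyedBenchCache:
    """Sketch: per-key cache where only one thread runs the expensive benchmark."""

    def __init__(self, bench_fn):
        self._bench_fn = bench_fn
        self._lock = threading.Lock()
        self._cache = {}    # key -> finished result
        self._pending = {}  # key -> _PendingResult being computed

    def get(self, key, *args, **kwargs):
        with self._lock:
            if key in self._cache:
                return self._cache[key]
            pending = self._pending.get(key)
            if pending is None:
                pending = _PendingResult()
                self._pending[key] = pending
                owner = True
            else:
                owner = False

        if not owner:
            pending.event.wait()              # block until the owner publishes
            if pending.error is not None:
                raise pending.error
            return pending.value

        try:
            pending.value = self._bench_fn(key, *args, **kwargs)
            with self._lock:
                self._cache[key] = pending.value
                self._pending.pop(key, None)
        except Exception as exc:              # propagate the failure to waiters too
            pending.error = exc
            with self._lock:
                self._pending.pop(key, None)
            raise
        finally:
            pending.event.set()               # wake every waiter
        return pending.value
```

In the real patcher the cached value is the winning Triton config plus its benchmark time, and the key is derived from the autotuner's key arguments and tensor dtypes, as the diff above shows.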