diff --git a/README.md b/README.md
index 89b696f92..f86ccffbd 100644
--- a/README.md
+++ b/README.md
@@ -17,6 +17,7 @@

## Latest News
+* 10/31/2025 5.1.0-dev: ✨IBM Granite Nano support. New `calibration_concat_separator` config option.
* 10/30/2025 5.1.0-dev: πŸŽ‰AWQ support out of beta with full feature support in including multi-gpu quant and MoE vram saving.
* 10/30/2025 5.1.0-dev: ✨Marin model. New AWQ Torch reference kernel. Fix AWQ Marlin kernel for bf16. Fix GLM 4.5/4.6 MoE missing `mtp` layers on model save (HF bug). Modular refractor.
* 10/28/2025 5.1.0-dev: Minimax M2 support with [ModelCloud BF16 M2 Model](https://huggingface.co/ModelCloud/MiniMax-M2-BF16). New `VramStrategy.Balanced` quantization property for reduced memory usage for large MoE on multi-3090 (24GB) devices.
@@ -179,20 +180,20 @@ Native support support some of the most popular multi-modal models:

## Model Support

-| Model             |   |                   |   |                |   |                |   |                     |   |
-|-------------------|---|-------------------|---|----------------|---|----------------|---|---------------------|---|
-| Apertus           | βœ… | EXAONE 3.0        | βœ… | InternLM 1/2.5 | βœ… | Mixtral        | βœ… | Qwen 2/3 (Next/MoE) | βœ… |
-| Baichuan          | βœ… | Falcon (H1)       | βœ… | Kimi K2        | βœ… | MobileLLM      | βœ… | Qwen 2/2.5 VL       | βœ… |
-| Bloom             | βœ… | FastVLM           | βœ… | Klear          | βœ… | MOSS           | βœ… | Qwen 2.5/3 Omni     | βœ… |
-| ChatGLM           | βœ… | Gemma 1/2/3       | βœ… | LING/RING      | βœ… | MPT            | βœ… | RefinedWeb          | βœ… |
-| CodeGen           | βœ… | GPTBigCod         | βœ… | Llama 1-3.3    | βœ… | Nemotron H     | βœ… | StableLM            | βœ… |
-| Cohere 1-2        | βœ… | GPTQ-Neo/GPT-NeoX | βœ… | Llama 3.2 VL   | βœ… | Nemotron Ultra | βœ… | StarCoder2          | βœ… |
-| DBRX Converted    | βœ… | GPT-2             | βœ… | Llama 4        | βœ… | OPT            | βœ… | TeleChat2           | βœ… |
-| Deci              | βœ… | GPT-J             | βœ… | LongCatFlash   | βœ… | OLMo2          | βœ… | Yi                  | βœ… |
-| DeepSeek-V2/V3/R1 | βœ… | GPT-OSS           | βœ… | LongLLaMA      | βœ… | Ovis 1.6/2     | βœ… | Seed-OSS            | βœ… |
-| DeepSeek-V2-Lite  | βœ… | Granite           | βœ… | Instella       | βœ… | Phi 1-4        | βœ… | XVERSE              | βœ… |
-| Dream             | βœ… | GRIN-MoE          | βœ… | MiniCPM3       | βœ… | PanGu-Ξ±        | βœ… |                     |   |
-| ERNIE 4.5         | βœ… | Hymba             | βœ… | Mistral        | βœ… | Qwen 1/2/3     | βœ… |                     |   |
+| Model             |   |             |   |                |   |                |   |                     |   |
+|-------------------|---|-------------|---|----------------|---|----------------|---|---------------------|---|
+| Apertus           | βœ… | EXAONE 3.0  | βœ… | InternLM 1/2.5 | βœ… | Mixtral        | βœ… | Qwen 2/3 (Next/MoE) | βœ… |
+| Baichuan          | βœ… | Falcon (H1) | βœ… | Kimi K2        | βœ… | MobileLLM      | βœ… | Qwen 2/2.5/3 VL     | βœ… |
+| Bloom             | βœ… | FastVLM     | βœ… | Klear          | βœ… | MOSS           | βœ… | Qwen 2.5/3 Omni     | βœ… |
+| ChatGLM           | βœ… | Gemma 1/2/3 | βœ… | LING/RING      | βœ… | MPT            | βœ… | RefinedWeb          | βœ… |
+| CodeGen           | βœ… | GPTBigCode  | βœ… | Llama 1-3.3    | βœ… | Nemotron H     | βœ… | StableLM            | βœ… |
+| Cohere 1-2        | βœ… | GPT-Neo(X)  | βœ… | Llama 3.2 VL   | βœ… | Nemotron Ultra | βœ… | StarCoder2          | βœ… |
+| DBRX Converted    | βœ… | GPT-2       | βœ… | Llama 4        | βœ… | OPT            | βœ… | TeleChat2           | βœ… |
+| Deci              | βœ… | GPT-J       | βœ… | LongCatFlash   | βœ… | OLMo2          | βœ… | Yi                  | βœ… |
+| DeepSeek-V2/V3/R1 | βœ… | GPT-OSS     | βœ… | LongLLaMA      | βœ… | Ovis 1.6/2     | βœ… | Seed-OSS            | βœ… |
+| DeepSeek-V2-Lite  | βœ… | Granite     | βœ… | Instella       | βœ… | Phi 1-4        | βœ… | XVERSE              | βœ… |
+| Dream             | βœ… | GRIN-MoE    | βœ… | MiniCPM3       | βœ… | PanGu-Ξ±        | βœ… | Minimax M2          | βœ… |
+| ERNIE 4.5         | βœ… | Hymba       | βœ… | Mistral        | βœ… | Qwen 1/2/3     | βœ… | GLM 4.X             | βœ… |

## Platform and HW Support
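The `calibration_concat_separator` option is only named in the news entry above; the snippet below is a minimal sketch of where such an option would sit in the usual GPTQModel quantize flow. Its placement on `QuantizeConfig`, the example value, and the model id are assumptions, not something this diff confirms.

```python
# Hedged sketch: standard GPTQModel quantize flow; the new option's name comes
# from the news entry, but its placement on QuantizeConfig is an assumption.
from gptqmodel import GPTQModel, QuantizeConfig

quant_config = QuantizeConfig(
    bits=4,
    group_size=128,
    calibration_concat_separator="\n\n",  # assumed: separator used when concatenating calibration rows
)

model = GPTQModel.load("ibm-granite/granite-nano-example", quant_config)  # placeholder model id
model.quantize(["calibration sample one", "calibration sample two"])      # tiny toy calibration set
model.save("granite-nano-gptq-4bit")
```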
diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py
index 7321d5c98..4f7723a02 100644
--- a/gptqmodel/quantization/gptq.py
+++ b/gptqmodel/quantization/gptq.py
@@ -594,6 +594,10 @@ def hessian_inverse(self, H: torch.Tensor):
        mean = torch.mean(current_diag)
        damp = self.qcfg.damp_percent
+        damp_recovery_started = False
+        recovery_initial_damp = None
+        recovery_last_damp = None
+
        while 0 < damp < 1:
            try:
                diag_view.add_(damp * mean)
@@ -602,19 +606,36 @@
                diag_view.copy_(current_diag)
                del H2
                used_damp = damp
+                if damp_recovery_started:
+                    log.warn(
+                        f"Quantization: Module `{self.name}` -> Damp recovery succeeded at `damp_percent={damp:.5f}` "
+                        f"(started at {recovery_initial_damp:.5f})."
+                    )
                return Hinv_result, used_damp
            except torch._C._LinAlgError as e:
                last_error = e
                diag_view.copy_(current_diag)
                if self.qcfg.damp_auto_increment != 0:
-                    log.warn(
-                        f"Quantization: Module `{self.name}` -> Current `damp_percent = {damp:.5f}` is too low, auto-incrementing by `{self.qcfg.damp_auto_increment:.5f}`")
+                    if not damp_recovery_started:
+                        damp_recovery_started = True
+                        recovery_initial_damp = damp
+                        log.warn(
+                            f"Quantization: Module `{self.name}` -> Starting damp recovery at "
+                            f"`damp_percent={damp:.5f}`, increment step `{self.qcfg.damp_auto_increment:.5f}`."
+                        )
                    damp += self.qcfg.damp_auto_increment
+                    recovery_last_damp = damp
                else:
                    log.warn(
                        f"Quantization: Module `{self.name}` -> Hessian Cholesky failed with `damp_percent={damp:.5f}` and no auto increment configured.")
                    break
+        if damp_recovery_started:
+            final_damp = recovery_last_damp if recovery_last_damp is not None else damp
+            log.warn(
+                f"Quantization: Module `{self.name}` -> Damp recovery failed after reaching `damp_percent={final_damp:.5f}`."
+            )
+
        attempt += 1
        log.error(
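The gptq.py hunks above only add logging around an existing retry loop: damp the Hessian diagonal, attempt the Cholesky-based inverse, and on a linear-algebra failure restore the diagonal and bump `damp` by `damp_auto_increment`. Below is a standalone sketch of that control flow; the function name and `print` logging are hypothetical stand-ins, not the library API.

```python
# Hypothetical, self-contained sketch of the damp-recovery loop; it mirrors the
# control flow in hessian_inverse() but is not the library function.
import torch

def damped_cholesky_inverse(H: torch.Tensor, damp: float = 0.01, damp_auto_increment: float = 0.0025):
    diag_view = torch.diagonal(H)           # view into H's diagonal, edited in place
    original_diag = diag_view.clone()
    mean = torch.mean(original_diag)
    recovery_started_at = None

    while 0 < damp < 1:
        try:
            diag_view.add_(damp * mean)     # apply damping in place
            L = torch.linalg.cholesky(H)    # raises LinAlgError if H is not positive definite
            Hinv = torch.cholesky_inverse(L)
            diag_view.copy_(original_diag)  # restore H for the caller
            if recovery_started_at is not None:
                print(f"damp recovery succeeded at {damp:.5f} (started at {recovery_started_at:.5f})")
            return Hinv, damp
        except torch.linalg.LinAlgError:
            diag_view.copy_(original_diag)  # undo the failed damping before retrying
            if damp_auto_increment == 0:
                break
            if recovery_started_at is None:
                recovery_started_at = damp
            damp += damp_auto_increment

    raise RuntimeError(f"damp recovery failed after reaching damp={damp:.5f}")
```

The new log lines in the diff record exactly these three events: when recovery starts, the damp value that finally succeeds, and the last value reached if it never does.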
diff --git a/gptqmodel/utils/nogil_patcher.py b/gptqmodel/utils/nogil_patcher.py
index e6b85d42f..5b2c8a166 100644
--- a/gptqmodel/utils/nogil_patcher.py
+++ b/gptqmodel/utils/nogil_patcher.py
@@ -138,7 +138,7 @@ def patched_check_disk_cache(self, tuning_key, configs, bench_fn):
    )
    return False, bench_time, configs_timings, best_config

-def _get_config_for_key(self, key, nargs, args, kwargs):
+def _get_config_for_key(self, key, args, kwargs):
    with self._cache_lock:
        cached = self._cache.get(key)
        if cached is not None:
@@ -157,18 +157,17 @@ def _get_config_for_key(self, key, nargs, args, kwargs):
        if future.error is not None:
            raise future.error
        return future.config, future.used_cached_result, future.bench_time
-
-    pruned_configs = self.prune_configs(kwargs, nargs)
+    pruned_configs = self.prune_configs(kwargs)

    def benchmark():
        bench_start = time.time()
        timings = {
-            config: self._bench(nargs, *args, config=config, **kwargs)
+            config: self._bench(*args, config=config, **kwargs)
            for config in pruned_configs
        }
        bench_duration = time.time() - bench_start
        best_config = builtins_mod.min(timings, key=timings.get)
-        full_nargs_local = {**nargs, **kwargs, **best_config.all_kwargs()}
+        full_nargs_local = {**self.nargs, **kwargs, **best_config.all_kwargs()}
        self.pre_hook(full_nargs_local, reset_only=True)
        return timings, bench_duration, best_config
@@ -203,6 +202,7 @@ def benchmark():

def patched_run(self, *args, **kwargs):
    nargs = dict(zip(self.arg_names, args))
+    self.nargs = nargs
    used_cached_result = True
    bench_time = None
    key = None
@@ -214,7 +214,7 @@
            if hasattr(arg, "dtype"):
                key_values.append(str(arg.dtype))
        key = tuple(key_values)
-        config, used_cached_result, bench_time = _get_config_for_key(self, key, nargs, args, kwargs)
+        config, used_cached_result, bench_time = _get_config_for_key(self, key, args, kwargs)
    else:
        config = self.configs[0]
@@ -231,11 +231,13 @@
    full_nargs = {**nargs, **kwargs, **config.all_kwargs()}
    if config.pre_hook is not None:
        config.pre_hook(full_nargs)
-    return self.fn.run(
+    result = self.fn.run(
        *args,
        **kwargs,
        **config.all_kwargs(),
    )
+    self.nargs = None
+    return result

autotuner_cls.__init__ = patched_init
autotuner_cls.check_disk_cache = patched_check_disk_cache
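The nogil_patcher.py change stops threading `nargs` through explicit parameters: `patched_run` now stashes the per-call `nargs` on the autotuner instance before tuning (so helpers such as `prune_configs` and `_bench` are called with their upstream signatures and read `self.nargs` internally, mirroring Triton's own `Autotuner`) and clears it once `fn.run` returns. A minimal sketch of that pattern with a hypothetical class, not Triton's API:

```python
# Hypothetical illustration of the "stash per-call state on self" pattern used
# by patched_run; class and method names are stand-ins, not Triton's API.
class TunerSketch:
    def __init__(self, arg_names):
        self.arg_names = arg_names
        self.nargs = None                    # per-call mapping, populated by run()

    def _pick_config(self, kwargs):
        # Helper reads self.nargs instead of receiving an explicit nargs argument.
        full_nargs = {**self.nargs, **kwargs}
        return max(full_nargs.values())

    def run(self, *args, **kwargs):
        self.nargs = dict(zip(self.arg_names, args))   # stash before helpers run
        try:
            return self._pick_config(kwargs)
        finally:
            self.nargs = None                          # clear per-call state when done

print(TunerSketch(["x", "y"]).run(3, 7, z=5))  # prints 7
```

The sketch clears the attribute in a `finally` so it is reset even on error; the patch itself resets it right after `fn.run` returns.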