From 6d5cc65cb7d81321c0943dcd929680716a0a6d0b Mon Sep 17 00:00:00 2001
From: LRL2-ModelCloud
Date: Wed, 29 Oct 2025 16:54:13 +0800
Subject: [PATCH] add support_offload_to_disk var

---
 gptqmodel/models/base.py                | 2 ++
 gptqmodel/models/definitions/gpt_oss.py | 2 ++
 gptqmodel/models/definitions/llama4.py  | 1 +
 gptqmodel/models/loader.py              | 6 ++++++
 4 files changed, 11 insertions(+)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 491622c87..90ec9daae 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -205,6 +205,8 @@ class BaseQModel(nn.Module):
 
     support_batch_quantize = True
 
+    support_offload_to_disk = True
+
     ATTENTION_MASKS_DTYPE = torch.bool  # default to bool
 
     ATTENTION_MASKS_REQUIRED_FOR_INPUT: bool = False
diff --git a/gptqmodel/models/definitions/gpt_oss.py b/gptqmodel/models/definitions/gpt_oss.py
index a4cfbeb69..9bbe72fac 100644
--- a/gptqmodel/models/definitions/gpt_oss.py
+++ b/gptqmodel/models/definitions/gpt_oss.py
@@ -126,6 +126,8 @@ def forward(self, hidden_states):
         return router_scores, router_indices
 
 class GPTOSSGPTQ(BaseQModel):
+    support_offload_to_disk = False
+
     dynamic_expert_index = "num_local_experts"
 
     pre_lm_head_norm_module = "model.norm"
diff --git a/gptqmodel/models/definitions/llama4.py b/gptqmodel/models/definitions/llama4.py
index 146fd90bc..16742d2a0 100644
--- a/gptqmodel/models/definitions/llama4.py
+++ b/gptqmodel/models/definitions/llama4.py
@@ -12,6 +12,7 @@ class Llama4QModel(BaseQModel):
     # some bug in the attention_mask of transformers.modeling_llama4,
     # so batch quantization for Llama4 is temporarily not supported.
     support_batch_quantize = False
+    support_offload_to_disk = False
 
     loader = AutoModelForImageTextToText
     pre_lm_head_norm_module = "language_model.model.norm"
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
index 58cad6730..28baf7fe2 100644
--- a/gptqmodel/models/loader.py
+++ b/gptqmodel/models/loader.py
@@ -205,6 +205,12 @@ def skip(*args, **kwargs):
         if hasattr(config, "hidden_act") and config.hidden_act == "xielu":
             quantize_config.offload_to_disk = False
 
+        # Some models (e.g. GPT-OSS and Llama4) need to convert their MoE experts
+        # after the model is loaded, so offload_to_disk is not supported for them.
+        if not cls.support_offload_to_disk:
+            quantize_config.offload_to_disk = False
+            log.warn(f"{cls} does not support offload_to_disk; setting quantize_config.offload_to_disk to False.")
+
         if quantize_config.offload_to_disk:
             model = build_shell_model(cls.loader, config=config, **model_init_kwargs)
             model._model_init_kwargs = model_init_kwargs
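
Note: a minimal sketch of how a model definition would opt out via the new flag, assuming the import path shown below; the class name MyMoEQModel is hypothetical. GPTOSSGPTQ and Llama4QModel in this patch follow the same pattern, while all other definitions inherit the BaseQModel default of True.

    from gptqmodel.models.base import BaseQModel

    class MyMoEQModel(BaseQModel):
        # Hypothetical model definition: its MoE experts are converted after
        # load, so the loader must not build a disk-offloaded shell model.
        # The loader then forces quantize_config.offload_to_disk to False
        # and logs a warning, regardless of what the user configured.
        support_offload_to_disk = False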