From 350229d8d1f41302ba7ac4787b64aed42f455d9e Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Fri, 26 Sep 2025 02:51:52 +0000
Subject: [PATCH] remove calibration_enable_gpu_cache toggle

Signed-off-by: Qubitium
---
 gptqmodel/looper/module_looper.py | 9 ++++-----
 gptqmodel/models/auto.py          | 2 --
 gptqmodel/models/base.py          | 7 +------
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py
index 6ce20bf80..158c47c13 100644
--- a/gptqmodel/looper/module_looper.py
+++ b/gptqmodel/looper/module_looper.py
@@ -83,14 +83,14 @@ def hook(module, inputs, output):
             return inner_hook(module, new_inputs, new_output)
         return hook
 
-    def cache_inputs(self, layers, calibration_data, calibration_enable_gpu_cache, use_cache):
+    def cache_inputs(self, layers, calibration_data, use_cache):
         layer_inputs = []
         attention_masks = []
         position_ids = []
         layer_input_kwargs = []
 
         cur_layer_device = get_device(layers[0])
-        data_device = cur_layer_device if calibration_enable_gpu_cache else CPU
+        data_device = cur_layer_device
 
         # TODO HookLinear add register_forward_pre_hook()
         def store_input_hook(module, args, kwargs):
@@ -188,7 +188,7 @@ def store_input_hook(module, args, kwargs):
                           attention_masks=attention_masks)
 
     @torch.inference_mode
-    def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwargs):
+    def loop(self, fail_safe: bool = False, **kwargs):
         if self.gptq_model.quantize_config.lm_head:
             if self.gptq_model.model.config.tie_word_embeddings and hasattr(self.gptq_model.model.model, "_tied_weights_keys"):
                 tied_keys = self.gptq_model.model._tied_weights_keys
@@ -231,7 +231,6 @@ def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwa
 
                 input_cache = self.cache_inputs(layers=layers,
                                                 calibration_data=processor.calibration_dataset,
-                                                calibration_enable_gpu_cache=calibration_enable_gpu_cache,
                                                 use_cache=False)
                 processor.receive_input_cache(input_cache)
 
@@ -513,7 +512,7 @@ def process_module(name, m):
 
                     layer_output = move_to(
                         layer_output,
-                        device=cur_layer_device if calibration_enable_gpu_cache else CPU,
+                        device=cur_layer_device,
                     )
 
                     layer_outputs.append([layer_output])
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 11b41fa4a..da8c0729a 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -610,7 +610,6 @@ def generate(
         calibration_dataset_concat_size: Optional[int] = None,
         calibration_dataset_sort: Optional[str] = None,
         batch_size: Optional[int] = 1,
-        calibration_enable_gpu_cache: Optional[bool] = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
         # pass-through vars for load()
@@ -657,7 +656,6 @@ def generate(
             calibration_dataset_concat_size=calibration_dataset_concat_size,
             calibration_dataset_sort=calibration_dataset_sort,
             batch_size=batch_size,
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
             tokenizer=tokenizer,
             logger_board=logger_board,
         )
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 5cf830416..4f00a8826 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -487,7 +487,6 @@ def quantize(
         calibration_concat_size: Optional[int] = None,
         calibration_sort: Optional[str] = None,  # valid values are asc, desc, shuffle
         batch_size: int = 1,
-        calibration_enable_gpu_cache: bool = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
         backend: Optional[BACKEND] = BACKEND.AUTO,
@@ -669,7 +668,6 @@ def quantize(
         module_looper = ModuleLooper(self, processors=processors)
 
         return module_looper.loop(
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
             backend=backend,
             fail_safe=self.quantize_config.fail_safe,
         )
@@ -683,7 +681,6 @@ def _eora_generate(
         calibration_dataset_concat_size: Optional[int] = None,
         calibration_dataset_sort: Optional[str] = None,
         batch_size: int = 1,
-        calibration_enable_gpu_cache: bool = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
     ):
@@ -728,9 +725,7 @@ def _eora_generate(
         # prepare processor worker (looper)
         module_looper = ModuleLooper(model=self, processors=processors)
 
-        module_looper.loop(
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
-        )
+        module_looper.loop()
 
         self.eora_save(save_dir=adapter.path, model_save_dir=self.model_local_path)
         return
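
Below is a minimal caller-facing sketch of what this patch implies: quantize() no longer accepts a calibration_enable_gpu_cache argument, and calibration tensors are always cached on the current layer's device. The model id, calibration text, and save path are illustrative placeholders, not taken from this patch.

    # Hypothetical usage sketch; assumes the public GPTQModel.load()/quantize()/save() API.
    from gptqmodel import GPTQModel, QuantizeConfig

    quant_config = QuantizeConfig(bits=4, group_size=128)
    model = GPTQModel.load("facebook/opt-125m", quant_config)  # placeholder model id

    calibration = ["gptqmodel is an llm quantization toolkit."]  # placeholder calibration data
    model.quantize(calibration, batch_size=1)  # note: no calibration_enable_gpu_cache kwarg
    model.save("opt-125m-4bit")  # placeholder output path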