From 350229d8d1f41302ba7ac4787b64aed42f455d9e Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Fri, 26 Sep 2025 02:51:52 +0000
Subject: [PATCH] remove calibration_enable_gpu_cache toggle

Signed-off-by: Qubitium
---
 gptqmodel/looper/module_looper.py | 9 ++++-----
 gptqmodel/models/auto.py          | 2 --
 gptqmodel/models/base.py          | 7 +------
 3 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py
index 6ce20bf80..158c47c13 100644
--- a/gptqmodel/looper/module_looper.py
+++ b/gptqmodel/looper/module_looper.py
@@ -83,14 +83,14 @@ def hook(module, inputs, output):
             return inner_hook(module, new_inputs, new_output)
         return hook
 
-    def cache_inputs(self, layers, calibration_data, calibration_enable_gpu_cache, use_cache):
+    def cache_inputs(self, layers, calibration_data, use_cache):
         layer_inputs = []
         attention_masks = []
         position_ids = []
         layer_input_kwargs = []
 
         cur_layer_device = get_device(layers[0])
-        data_device = cur_layer_device if calibration_enable_gpu_cache else CPU
+        data_device = cur_layer_device
 
         # TODO HookLinear add register_forward_pre_hook()
         def store_input_hook(module, args, kwargs):
@@ -188,7 +188,7 @@ def store_input_hook(module, args, kwargs):
                           attention_masks=attention_masks)
 
     @torch.inference_mode
-    def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwargs):
+    def loop(self, fail_safe: bool = False, **kwargs):
         if self.gptq_model.quantize_config.lm_head:
             if self.gptq_model.model.config.tie_word_embeddings and hasattr(self.gptq_model.model.model, "_tied_weights_keys"):
                 tied_keys = self.gptq_model.model._tied_weights_keys
@@ -231,7 +231,6 @@ def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwa
 
                 input_cache = self.cache_inputs(layers=layers,
                                                 calibration_data=processor.calibration_dataset,
-                                                calibration_enable_gpu_cache=calibration_enable_gpu_cache,
                                                 use_cache=False)
                 processor.receive_input_cache(input_cache)
 
@@ -513,7 +512,7 @@ def process_module(name, m):
 
                     layer_output = move_to(
                         layer_output,
-                        device=cur_layer_device if calibration_enable_gpu_cache else CPU,
+                        device=cur_layer_device,
                     )
 
                     layer_outputs.append([layer_output])
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 11b41fa4a..da8c0729a 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -610,7 +610,6 @@ def generate(
         calibration_dataset_concat_size: Optional[int] = None,
         calibration_dataset_sort: Optional[str] = None,
         batch_size: Optional[int] = 1,
-        calibration_enable_gpu_cache: Optional[bool] = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
         # pass-through vars for load()
@@ -657,7 +656,6 @@ def generate(
             calibration_dataset_concat_size=calibration_dataset_concat_size,
             calibration_dataset_sort=calibration_dataset_sort,
             batch_size=batch_size,
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
             tokenizer=tokenizer,
             logger_board=logger_board,
         )
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 5cf830416..4f00a8826 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -487,7 +487,6 @@ def quantize(
         calibration_concat_size: Optional[int] = None,
         calibration_sort: Optional[str] = None,  # valid values are asc, desc, shuffle
         batch_size: int = 1,
-        calibration_enable_gpu_cache: bool = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
         backend: Optional[BACKEND] = BACKEND.AUTO,
@@ -669,7 +668,6 @@ def quantize(
         module_looper = ModuleLooper(self, processors=processors)
 
         return module_looper.loop(
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
             backend=backend,
             fail_safe=self.quantize_config.fail_safe,
         )
@@ -683,7 +681,6 @@ def _eora_generate(
         calibration_dataset_concat_size: Optional[int] = None,
         calibration_dataset_sort: Optional[str] = None,
         batch_size: int = 1,
-        calibration_enable_gpu_cache: bool = True,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         logger_board: Optional[str] = None,
     ):
@@ -728,9 +725,7 @@ def _eora_generate(
         # prepare processor worker (looper)
         module_looper = ModuleLooper(model=self, processors=processors)
 
-        module_looper.loop(
-            calibration_enable_gpu_cache=calibration_enable_gpu_cache,
-        )
+        module_looper.loop()
 
         self.eora_save(save_dir=adapter.path, model_save_dir=self.model_local_path)
         return
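
Below is a minimal caller-facing sketch of what this patch implies: quantize() no longer accepts a calibration_enable_gpu_cache argument, and calibration tensors are always cached on the current layer's device. The model id, calibration text, and save path are illustrative placeholders, not taken from this patch.

    # Hypothetical usage sketch; assumes the public GPTQModel.load()/quantize()/save() API.
    from gptqmodel import GPTQModel, QuantizeConfig

    quant_config = QuantizeConfig(bits=4, group_size=128)
    model = GPTQModel.load("facebook/opt-125m", quant_config)  # placeholder model id

    calibration = ["gptqmodel is an llm quantization toolkit."]  # placeholder calibration data
    model.quantize(calibration, batch_size=1)  # note: no calibration_enable_gpu_cache kwarg
    model.save("opt-125m-4bit")  # placeholder output path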