Merged
9 changes: 4 additions & 5 deletions gptqmodel/looper/module_looper.py
@@ -83,14 +83,14 @@ def hook(module, inputs, output):
return inner_hook(module, new_inputs, new_output)
return hook

-def cache_inputs(self, layers, calibration_data, calibration_enable_gpu_cache, use_cache):
+def cache_inputs(self, layers, calibration_data, use_cache):
layer_inputs = []
attention_masks = []
position_ids = []
layer_input_kwargs = []

cur_layer_device = get_device(layers[0])
-data_device = cur_layer_device if calibration_enable_gpu_cache else CPU
+data_device = cur_layer_device

# TODO HookLinear add register_forward_pre_hook()
def store_input_hook(module, args, kwargs):
@@ -188,7 +188,7 @@ def store_input_hook(module, args, kwargs):
attention_masks=attention_masks)

@torch.inference_mode
-def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwargs):
+def loop(self, fail_safe: bool = False, **kwargs):
if self.gptq_model.quantize_config.lm_head:
if self.gptq_model.model.config.tie_word_embeddings and hasattr(self.gptq_model.model.model, "_tied_weights_keys"):
tied_keys = self.gptq_model.model._tied_weights_keys
@@ -231,7 +231,6 @@ def loop(self, calibration_enable_gpu_cache=True, fail_safe: bool = False, **kwargs):

input_cache = self.cache_inputs(layers=layers,
calibration_data=processor.calibration_dataset,
-                                        calibration_enable_gpu_cache=calibration_enable_gpu_cache,
use_cache=False)
processor.receive_input_cache(input_cache)

@@ -513,7 +512,7 @@ def process_module(name, m):

layer_output = move_to(
layer_output,
-    device=cur_layer_device if calibration_enable_gpu_cache else CPU,
+    device=cur_layer_device,
)

layer_outputs.append([layer_output])
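For context, a small runnable sketch of the placement rule the hunks above now encode: cached calibration tensors follow the layer's device unconditionally, with no CPU-offload branch. This is illustrative only; get_layer_device below is a stand-in for the repo's get_device helper, and the toy Linear layer stands in for the first transformer block.

import torch

def get_layer_device(layer: torch.nn.Module) -> torch.device:
    # Stand-in for the repo's get_device() helper: report where the layer's weights live.
    return next(layer.parameters()).device

layer = torch.nn.Linear(8, 8)              # toy stand-in for the first transformer block
data_device = get_layer_device(layer)      # previously: CPU when calibration_enable_gpu_cache was False
hidden_states = torch.randn(2, 8).to(data_device)
print(hidden_states.device)                # cached inputs now always match the layer's device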
2 changes: 0 additions & 2 deletions gptqmodel/models/auto.py
@@ -610,7 +610,6 @@ def generate(
calibration_dataset_concat_size: Optional[int] = None,
calibration_dataset_sort: Optional[str] = None,
batch_size: Optional[int] = 1,
-calibration_enable_gpu_cache: Optional[bool] = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
logger_board: Optional[str] = None,
# pass-through vars for load()
@@ -657,7 +656,6 @@ def generate(
calibration_dataset_concat_size=calibration_dataset_concat_size,
calibration_dataset_sort=calibration_dataset_sort,
batch_size=batch_size,
-calibration_enable_gpu_cache=calibration_enable_gpu_cache,
tokenizer=tokenizer,
logger_board=logger_board,
)
7 changes: 1 addition & 6 deletions gptqmodel/models/base.py
@@ -487,7 +487,6 @@ def quantize(
calibration_concat_size: Optional[int] = None,
calibration_sort: Optional[str] = None, # valid values are asc, desc, shuffle
batch_size: int = 1,
-calibration_enable_gpu_cache: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
logger_board: Optional[str] = None,
backend: Optional[BACKEND] = BACKEND.AUTO,
@@ -669,7 +668,6 @@ def quantize(
module_looper = ModuleLooper(self, processors=processors)

return module_looper.loop(
-    calibration_enable_gpu_cache=calibration_enable_gpu_cache,
backend=backend,
fail_safe=self.quantize_config.fail_safe,
)
@@ -683,7 +681,6 @@ def _eora_generate(
calibration_dataset_concat_size: Optional[int] = None,
calibration_dataset_sort: Optional[str] = None,
batch_size: int = 1,
-calibration_enable_gpu_cache: bool = True,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
logger_board: Optional[str] = None,
):
@@ -728,9 +725,7 @@ def _eora_generate(
# prepare processor worker (looper)
module_looper = ModuleLooper(model=self, processors=processors)

-module_looper.loop(
-    calibration_enable_gpu_cache=calibration_enable_gpu_cache,
-)
+module_looper.loop()

self.eora_save(save_dir=adapter.path, model_save_dir=self.model_local_path)
return
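For downstream callers, a minimal usage sketch after the kwarg removal. The GPTQModel.load / quantize / save entry points are the library's public API as I understand it; the model id, config values, and calibration text are placeholders, and calibration_enable_gpu_cache is simply no longer passed anywhere.

from gptqmodel import GPTQModel, QuantizeConfig

# Placeholder calibration set; in practice this is a few hundred representative samples.
calibration_dataset = [
    "GPTQModel quantizes transformer weights using a small calibration set.",
    "Calibration activations now always stay on each layer's device.",
]

model = GPTQModel.load("meta-llama/Llama-3.2-1B", QuantizeConfig(bits=4, group_size=128))
model.quantize(calibration_dataset, batch_size=1)  # no calibration_enable_gpu_cache kwarg anymore
model.save("Llama-3.2-1B-gptq-4bit")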