diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py
index d231f0583..c718870f5 100644
--- a/gptqmodel/looper/module_looper.py
+++ b/gptqmodel/looper/module_looper.py
@@ -493,7 +493,8 @@ def store_input_hook(module, args, kwargs):
             layer_inputs.append(layer_input)
 
             # Keyword arguments.
-            if kwargs.get("attention_mask") is not None and self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT:
+            # Always capture attention_mask so downstream masking can drop padded tokens
+            if kwargs.get("attention_mask") is not None:
                 attention_masks.append(kwargs["attention_mask"].to(device=data_device))
             else:
                 attention_masks.append(None)
@@ -632,6 +633,8 @@ def loop(self, fail_safe: bool = False, **kwargs):
         layer_modules = self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config,
                                                              quantize_config=self.gptq_model.quantize_config)
 
+        # true_sequential replays the quantized activations after each subset is quantized, so the next subset is quantized against them
+        # this should always be true for gptq: disabling it reports a lower but misleading error_loss and yields a worse post-quantized model
         if not self.gptq_model.quantize_config.true_sequential:
             layer_modules = [sum(layer_modules, [])]
 
diff --git a/tests/models/model_test.py b/tests/models/model_test.py
index 62f5602d7..d85ce2ad2 100644
--- a/tests/models/model_test.py
+++ b/tests/models/model_test.py
@@ -13,6 +13,13 @@
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
+# The following makes test results more deterministic but much slower
+# # the CUBLAS env var is required for torch.use_deterministic_algorithms
+# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+#
+# import torch
+# torch.use_deterministic_algorithms(True)
+
 # -- end do not touch
 
 from pathlib import Path  # noqa: E402
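
Below is a minimal, self-contained sketch (not part of this PR, and not the GPTQModel implementation) of what the true-sequential comment above describes: after one subset of a layer's modules is quantized, the layer forward is replayed so the next subset is calibrated against quantized activations rather than full-precision ones. `fake_quantize_` and `ToyLayer` are hypothetical stand-ins, and the per-tensor rounding is a crude substitute for GPTQ; a real quantizer would calibrate on the replayed `hidden` activations.

```python
# Illustrative sketch of true-sequential subset replay; names are hypothetical.
import torch

def fake_quantize_(linear: torch.nn.Linear, bits: int = 4) -> None:
    """Round weights onto a symmetric per-tensor grid, in place (stand-in for GPTQ)."""
    with torch.no_grad():
        scale = linear.weight.abs().max() / (2 ** (bits - 1) - 1)
        linear.weight.copy_(torch.round(linear.weight / scale) * scale)

class ToyLayer(torch.nn.Module):
    def __init__(self, dim: int = 16):
        super().__init__()
        self.up = torch.nn.Linear(dim, dim)    # subset 1
        self.down = torch.nn.Linear(dim, dim)  # subset 2
    def forward(self, x):
        return self.down(torch.relu(self.up(x)))

layer, x = ToyLayer(), torch.randn(8, 16)

# True-sequential order: quantize subset 1 first, then replay the forward pass
# so subset 2 sees subset 1's *quantized* output during its own quantization.
fake_quantize_(layer.up)
hidden = torch.relu(layer.up(x))   # replayed activations, now post-quantization
fake_quantize_(layer.down)         # a real quantizer would calibrate on `hidden`
out = layer.down(hidden)
```

Skipping the replay (i.e. `true_sequential=False`) would quantize both subsets against full-precision activations, which makes each subset's measured error look smaller but lets the errors compound unchecked at inference time.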