5 changes: 4 additions & 1 deletion gptqmodel/looper/module_looper.py
@@ -493,7 +493,8 @@ def store_input_hook(module, args, kwargs):
layer_inputs.append(layer_input)

# Keyword arguments.
-if kwargs.get("attention_mask") is not None and self.gptq_model.ATTENTION_MASKS_REQUIRED_FOR_INPUT:
+# Always capture attention_mask so downstream masking can drop padded tokens
+if kwargs.get("attention_mask") is not None:
attention_masks.append(kwargs["attention_mask"].to(device=data_device))
else:
attention_masks.append(None)
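The hook now captures the mask unconditionally instead of only when `ATTENTION_MASKS_REQUIRED_FOR_INPUT` is set. A minimal sketch of the kind of downstream masking this enables, assuming the usual convention of 1 = real token and 0 = padding; `drop_padded_tokens` is a hypothetical helper for illustration, not a function in this PR:

import torch

def drop_padded_tokens(hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # hidden_states: [batch, seq_len, hidden_dim]
    # attention_mask: [batch, seq_len], 1 for real tokens, 0 for padding (assumed convention)
    keep = attention_mask.to(dtype=torch.bool)
    # Boolean indexing over the first two dims keeps only unpadded positions
    return hidden_states[keep]  # -> [num_real_tokens, hidden_dim]

x = torch.randn(2, 4, 8)
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])
print(drop_padded_tokens(x, mask).shape)  # torch.Size([5, 8])

Filtering like this keeps calibration statistics (e.g. the GPTQ Hessian accumulation) from being skewed by padded positions.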
@@ -632,6 +633,8 @@ def loop(self, fail_safe: bool = False, **kwargs):

layer_modules = self.gptq_model.simple_layer_modules(model_config=self.gptq_model.model.config, quantize_config=self.gptq_model.quantize_config)

+# true_sequential replays the quantized activations after each subset is quantized, so the next subset is calibrated on them
+# this should always be true for gptq: disabling it yields a lower but misleading error_loss and a worse post-quantized model
if not self.gptq_model.quantize_config.true_sequential:
layer_modules = [sum(layer_modules, [])]
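When `true_sequential` is disabled, the per-layer module subsets are merged into one pass, which is exactly what `[sum(layer_modules, [])]` does. A quick illustration with hypothetical module names (the real names come from `simple_layer_modules`):

layer_modules = [["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
                 ["self_attn.o_proj"],
                 ["mlp.up_proj", "mlp.gate_proj"],
                 ["mlp.down_proj"]]

# sum(..., []) concatenates the inner lists; wrapping in [...] keeps the
# "list of subsets" shape expected by the loop, now with a single subset
flattened = [sum(layer_modules, [])]
print(flattened)
# [['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
#   'mlp.up_proj', 'mlp.gate_proj', 'mlp.down_proj']]

With one merged subset there is no replay between subsets, so every module sees the original (unquantized) activations: hence the lower but misleading error_loss.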

7 changes: 7 additions & 0 deletions tests/models/model_test.py
@@ -13,6 +13,13 @@
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

+# The following makes test results more deterministic but much slower
+# # the CUBLAS env is required for use_deterministic_algorithms
+# os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+#
+# import torch
+# torch.use_deterministic_algorithms(True)

# -- end do not touch

from pathlib import Path # noqa: E402
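For reference, a sketch of what enabling that commented-out block amounts to in a test entry point (assumed usage, not part of this diff): the CUBLAS workspace variable must be set before any CUDA context is created, and `torch.use_deterministic_algorithms(True)` makes PyTorch raise on nondeterministic ops instead of silently running them.

import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # must precede CUDA init; required by cuBLAS for determinism

import torch
torch.manual_seed(0)                      # fix RNG so test inputs are reproducible (illustrative addition)
torch.use_deterministic_algorithms(True)  # error out on kernels with no deterministic implementation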