From 2a44f4a7544b98dcc05105e57778146b8540a29b Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 1 Nov 2025 08:57:55 +0000 Subject: [PATCH 1/2] rename gptq_v2 to gptaq --- gptqmodel/looper/gptq_processor.py | 8 ++-- gptqmodel/looper/module_looper.py | 2 +- gptqmodel/looper/native_processor.py | 20 +++++----- gptqmodel/models/base.py | 2 +- gptqmodel/models/loader.py | 2 +- gptqmodel/models/writer.py | 12 +++--- gptqmodel/quantization/config.py | 52 ++++++++++++++++++++----- gptqmodel/quantization/gptqv2.py | 2 +- tests/test_out_of_model_tensor_files.py | 5 ++- tests/test_writer_attention.py | 5 ++- 10 files changed, 72 insertions(+), 38 deletions(-) diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py index d0427288c..2d961205d 100644 --- a/gptqmodel/looper/gptq_processor.py +++ b/gptqmodel/looper/gptq_processor.py @@ -85,15 +85,15 @@ def preprocess(self, module: NamedModule, fail_safe: bool): qcfg_clone.act_group_aware = act_group_aware_override qcfg_clone.damp_percent = self.qcfg.dynamic_get(module.full_name, "damp_percent", qcfg_clone.damp_percent) qcfg_clone.static_groups = self.qcfg.dynamic_get(module.full_name, "static_groups", qcfg_clone.static_groups) - qcfg_clone.v2 = self.qcfg.dynamic_get(module.full_name, "v2", qcfg_clone.v2) - qcfg_clone.v2_alpha = self.qcfg.dynamic_get(module.full_name, "v2_alpha", qcfg_clone.v2_alpha) + qcfg_clone.gptaq = self.qcfg.dynamic_get(module.full_name, "gptaq", qcfg_clone.gptaq) + qcfg_clone.gptaq_alpha = self.qcfg.dynamic_get(module.full_name, "gptaq_alpha", qcfg_clone.gptaq_alpha) qcfg_clone._resolve_activation_ordering(desc_act_override, act_group_aware_override) # store last used qcfg_dynamic self.qcfg_dynamic = qcfg_clone - if qcfg_clone.v2 is True: + if qcfg_clone.gptaq is True: tmp = GPTQv2(module=module, qcfg=qcfg_clone) else: tmp = GPTQ(module=module, qcfg=qcfg_clone) @@ -383,4 +383,4 @@ def verify_calibration_dataset(self, processor_index: int) -> bool: def name(self) -> str: # TODO fix me..this hacks inherited base class logic, why not override name in gptqv2? qcfg = self.qcfg_dynamic if self.qcfg_dynamic is not None else self.qcfg - return "gptq v2" if qcfg.v2 else "gptq" + return "gptaq" if qcfg.gptaq else "gptq" diff --git a/gptqmodel/looper/module_looper.py b/gptqmodel/looper/module_looper.py index 054cff7ea..6a414a0aa 100644 --- a/gptqmodel/looper/module_looper.py +++ b/gptqmodel/looper/module_looper.py @@ -1006,7 +1006,7 @@ def _loop_impl(self, fail_safe: bool = False, **kwargs): for p_index, processor in enumerate(self.processors): if not processor.verify_calibration_dataset(p_index): if isinstance(processor, EoraProcessor) or\ - (isinstance(processor, GPTQProcessor) and self.gptq_model.quantize_config.v2): + (isinstance(processor, GPTQProcessor) and self.gptq_model.quantize_config.gptaq): prev_processor = self.processors[p_index - 1] processor.set_calibration_dataset(prev_processor.calibration_dataset) # If calibration_dataset is None or Empty, the input_cache of the previous processor is used. diff --git a/gptqmodel/looper/native_processor.py b/gptqmodel/looper/native_processor.py index 44fc81f6e..05f37fffc 100644 --- a/gptqmodel/looper/native_processor.py +++ b/gptqmodel/looper/native_processor.py @@ -65,19 +65,19 @@ def tmp(module, inp: Tuple[torch.Tensor, ...], out: torch.Tensor): # gptq is mutable. 
             inp = inp[0].detach()
 
-            if self.qcfg.v2_memory_device == "auto":
-                v2_memory_device = DEVICE_1
-            elif self.qcfg.v2_memory_device == "cpu":
+            if self.qcfg.gptaq_memory_device == "auto":
+                target_device = DEVICE_1
+            elif self.qcfg.gptaq_memory_device == "cpu":
                 # slower but >= 4x vram memory reduction
-                v2_memory_device = CPU
-            elif isinstance(self.qcfg.v2_memory_device, str):
-                v2_memory_device = torch.device(self.qcfg.v2_memory_device)
-            elif isinstance(self.qcfg.v2_memory_device, torch.device):
-                v2_memory_device = self.qcfg.v2_memory_device
+                target_device = CPU
+            elif isinstance(self.qcfg.gptaq_memory_device, str):
+                target_device = torch.device(self.qcfg.gptaq_memory_device)
+            elif isinstance(self.qcfg.gptaq_memory_device, torch.device):
+                target_device = self.qcfg.gptaq_memory_device
             else:
-                v2_memory_device = DEVICE_1
+                target_device = DEVICE_1
 
-            self.native_inp_caches[name] += [inp.to(device=v2_memory_device)]
+            self.native_inp_caches[name] += [inp.to(device=target_device)]
             del inp, out
 
         return tmp
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 870d8cae7..b02d8f9d6 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -618,7 +618,7 @@ def quantize(
             GPTQProcessor(**args),
         ]
 
-        if self.quantize_config.v2 is True:
+        if self.quantize_config.gptaq is True:
             from ..looper.native_processor import NativeProcessor
 
             # During the deepcopy process, self.prepare_dataset will be deeply copied along with self. However,
diff --git a/gptqmodel/models/loader.py b/gptqmodel/models/loader.py
index 6ac77302f..5edef870d 100644
--- a/gptqmodel/models/loader.py
+++ b/gptqmodel/models/loader.py
@@ -768,7 +768,7 @@ def assign(mod, device_id):
 
         if qcfg.format == FORMAT.GPTQ:
             # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase
-            if not qcfg.sym and not qcfg.is_quantized_by_v2():
+            if not qcfg.sym and not qcfg.is_quantized_by_gptaq():
                 raise ValueError(
                     f"Format: Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}"
                 )
diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py
index 6ad7aa6c8..5d98fda3c 100644
--- a/gptqmodel/models/writer.py
+++ b/gptqmodel/models/writer.py
@@ -35,8 +35,8 @@
     META_FIELD_STATIC_GROUPS,
     META_FIELD_TRUE_SEQUENTIAL,
     META_FIELD_URI,
-    META_FIELD_V2_ALPHA,
-    META_FIELD_V2_ENABLED,
+    META_FIELD_GPTAQ_ALPHA,
+    META_FIELD_GPTAQ_ENABLED,
     META_QUANTIZER_GPTQMODEL,
     META_VALUE_URI,
     MIN_VERSION_WITH_V2,
@@ -199,13 +199,13 @@ def save_quantized(
         )
 
         self.quantize_config.meta_set(
-            key=META_FIELD_V2_ENABLED,
-            value=self.quantize_config.v2
+            key=META_FIELD_GPTAQ_ENABLED,
+            value=self.quantize_config.gptaq
         )
 
         self.quantize_config.meta_set(
-            key=META_FIELD_V2_ALPHA,
-            value=self.quantize_config.v2_alpha
+            key=META_FIELD_GPTAQ_ALPHA,
+            value=self.quantize_config.gptaq_alpha
         )
 
         self.quantize_config.meta_set(
diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
index df35ec839..79a7057c4 100644
--- a/gptqmodel/quantization/config.py
+++ b/gptqmodel/quantization/config.py
@@ -51,9 +51,9 @@
 META_FIELD_MSE = "mse"
 META_FIELD_ACT_GROUP_AWARE = "act_group_aware"
 
-META_FIELD_V2_ENABLED = "v2"
-META_FIELD_V2_ALPHA = "v2_alpha"
-META_FIELD_V2_MEMORY_DEVICE = "v2_memory_device"
+META_FIELD_GPTAQ_ENABLED = "gptaq"
+META_FIELD_GPTAQ_ALPHA = "gptaq_alpha"
+META_FIELD_GPTAQ_MEMORY_DEVICE = "gptaq_memory_device"
 
 ADAPTER_FIELD = "adapter"
 
@@ -112,10 +112,19 @@ class VRAMStrategy(str, Enum):
     "q_group_size": GROUP_SIZE_FIELD_CODE,  # AWQ compat
     "version" : FORMAT_FIELD_CODE,
+    "v2": "gptaq",
+    "v2_alpha": "gptaq_alpha",
+    "v2_memory_device": "gptaq_memory_device",
     # map format field (checkpoint_format) to class/code (format)
     FORMAT_FIELD_CHECKPOINT: FORMAT_FIELD_CODE,
 }
 
+DYNAMIC_FIELD_SYNONYMS = {
+    "gptaq": ("v2",),
+    "gptaq_alpha": ("v2_alpha",),
+    "gptaq_memory_device": ("v2_memory_device",),
+}
+
 def dict_scale_dtype_to_str(d: Dict[str, Any]) -> None:
     """
     Checks whether the passed dictionary and its nested dicts have a *scale_dtype* key and if it's not None,
@@ -145,12 +154,23 @@ def dynamic_get(dynamic: Dict[str, Dict[str, Union[int, bool]]], module_name: st
     # subkey example: Lora override format: `{ "adapter": { "rank": 512 } }`
     if sub_key:
         sub_value = overrides.get(key, None)
+        if sub_value is None and key in DYNAMIC_FIELD_SYNONYMS:
+            for legacy_key in DYNAMIC_FIELD_SYNONYMS[key]:
+                if legacy_key in overrides:
+                    sub_value = overrides[legacy_key]
+                    break
         if isinstance(sub_value, Dict):
             return sub_value.get(sub_key, default)
         else:
             log.info(f"QuantConfig: Dynamic `sub_key`: `{sub_key}` failed extraction from `sub_value`: `{sub_value}`")
     else:
-        return overrides.get(key, default)
+        if key in overrides:
+            return overrides[key]
+        if key in DYNAMIC_FIELD_SYNONYMS:
+            for legacy_key in DYNAMIC_FIELD_SYNONYMS[key]:
+                if legacy_key in overrides:
+                    return overrides[legacy_key]
+        return default
     return default
 
 @dataclass
@@ -222,10 +242,10 @@ class QuantizeConfig():
     # use mock quantization to quantize module so the gptq process can continue and not fail
     fail_safe: bool = field(default=False)
 
-    # gptq v2* only:
-    v2: bool = field(default=False)
-    v2_alpha: float = field(default=0.25)
-    v2_memory_device: str = field(default="auto")
+    # gptaq only:
+    gptaq: bool = field(default=False)
+    gptaq_alpha: float = field(default=0.25)
+    gptaq_memory_device: str = field(default="auto")
 
     # awq only:
     zero_point: bool = field(default=True)
@@ -449,8 +469,8 @@ def meta_get_versionable(self, key: str) -> List[Tuple[str, str]]:
                 result.append((parts[0].lower(), parts[1].lower()))
         return result
 
-    # is quantized model quantized or packed by gptqmodel version with v2 format code
-    def is_quantized_by_v2(self) -> bool:
+    # is quantized model quantized or packed by gptqmodel version with gptaq format code
+    def is_quantized_by_gptaq(self) -> bool:
         # check meta.quantizer
         result = self.meta_get_versionable(META_FIELD_QUANTIZER)
         if len(result) > 0:
@@ -550,6 +570,18 @@ def from_quant_config(cls, quantize_cfg, format: str = None):
                 "QuantizeConfig: config does not contain `sym` (symmetric quantization). This may result in silent errors. Defaulting to `sym=True`."
             )
 
+        dynamic_overrides = normalized.get("dynamic")
+        if isinstance(dynamic_overrides, dict):
+            for overrides in dynamic_overrides.values():
+                if not isinstance(overrides, dict):
+                    continue
+                if "v2" in overrides and "gptaq" not in overrides:
+                    overrides["gptaq"] = overrides.pop("v2")
+                if "v2_alpha" in overrides and "gptaq_alpha" not in overrides:
+                    overrides["gptaq_alpha"] = overrides.pop("v2_alpha")
+                if "v2_memory_device" in overrides and "gptaq_memory_device" not in overrides:
+                    overrides["gptaq_memory_device"] = overrides.pop("v2_memory_device")
+
         return cls(**normalized)
 
     @classmethod
diff --git a/gptqmodel/quantization/gptqv2.py b/gptqmodel/quantization/gptqv2.py
index 77deea81f..640b708b0 100644
--- a/gptqmodel/quantization/gptqv2.py
+++ b/gptqmodel/quantization/gptqv2.py
@@ -177,7 +177,7 @@ def quantize(
         Q = torch.zeros_like(W)
 
         Hinv, damp = self.hessian_inverse(H)
-        P = self.qcfg.v2_alpha * ((self.dXXT @ Hinv.T).triu(diagonal=1)) @ Hinv
+        P = self.qcfg.gptaq_alpha * ((self.dXXT @ Hinv.T).triu(diagonal=1)) @ Hinv
         del self.dXXT
 
         for i1 in range(0, self.columns, blocksize):
diff --git a/tests/test_out_of_model_tensor_files.py b/tests/test_out_of_model_tensor_files.py
index a8241dfc0..38d333dca 100644
--- a/tests/test_out_of_model_tensor_files.py
+++ b/tests/test_out_of_model_tensor_files.py
@@ -28,8 +28,9 @@ class _DummyQuantizeConfig:
     static_groups = False
     true_sequential = False
     mse = False
-    v2 = False
-    v2_alpha = 0.0
+    gptaq = False
+    gptaq_alpha = 0.0
+    gptaq_memory_device = "auto"
     act_group_aware = False
     adapter = None
     dynamic = False
diff --git a/tests/test_writer_attention.py b/tests/test_writer_attention.py
index bc3110388..87a474c75 100644
--- a/tests/test_writer_attention.py
+++ b/tests/test_writer_attention.py
@@ -23,8 +23,9 @@ class _DummyQuantizeConfig:
     static_groups = False
     true_sequential = False
     mse = False
-    v2 = False
-    v2_alpha = 0.0
+    gptaq = False
+    gptaq_alpha = 0.0
+    gptaq_memory_device = "auto"
     act_group_aware = False
     adapter = None
     dynamic = False

From cc34f9fda9cefd19133b096cbd7c0db2206d8a28 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Sat, 1 Nov 2025 09:04:34 +0000
Subject: [PATCH 2/2] fix ci test

---
 gptqmodel/models/writer.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/models/writer.py b/gptqmodel/models/writer.py
index 5d98fda3c..e9460a7b8 100644
--- a/gptqmodel/models/writer.py
+++ b/gptqmodel/models/writer.py
@@ -236,9 +236,32 @@ def save_quantized(
         config.quantization_config = quantize_config.to_dict()
         self.model.config = config
 
-        # Save model config, including generation_config
-        # Use empty state_dict hack to bypass saving weights
-        self.model.save_pretrained(save_dir, state_dict={}, is_main_process=True)
+        def strip_attention_impl_fields(target: Any) -> Dict[str, Any]:
+            removed: Dict[str, Any] = {}
+            for attr in ("attn_implementation", "_attn_implementation"):
+                if hasattr(target, attr):
+                    removed[attr] = getattr(target, attr)
+                    delattr(target, attr)
+            return removed
+
+        generation_config = getattr(self.model, "generation_config", None)
+        removed_config_attention_attrs: Dict[str, Any] = {}
+        removed_generation_attention_attrs: Dict[str, Any] = {}
+
+        try:
+            removed_config_attention_attrs = strip_attention_impl_fields(self.model.config)
+            if generation_config is not None:
+                removed_generation_attention_attrs = strip_attention_impl_fields(generation_config)
+
+            # Save model config, including generation_config
+            # Use empty state_dict hack to bypass saving weights
+            self.model.save_pretrained(save_dir, state_dict={}, is_main_process=True)
+        finally:
+            for attr, value in removed_config_attention_attrs.items():
+                setattr(self.model.config, attr, value)
+            if generation_config is not None:
+                for attr, value in removed_generation_attention_attrs.items():
+                    setattr(generation_config, attr, value)
 
         gen_config_path = os.path.join(save_dir, "generation_config.json")
         if sanitize_generation_config_file(gen_config_path):
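
Backward-compatibility sketch (not part of the diff): the synonym tables added
above are what keep pre-rename checkpoints loadable. The snippet below is a
minimal illustration, assuming `QuantizeConfig.from_quant_config` accepts a
plain config dict and applies `QUANT_CONFIG_ARG_SYNONYMS` while normalizing
keys, as the mapping suggests; the `legacy` payload is hypothetical:

    from gptqmodel.quantization.config import QuantizeConfig

    # A quantize_config.json payload written before the rename, still using
    # the old `v2*` names at the top level and inside `dynamic` overrides.
    legacy = {
        "bits": 4,
        "group_size": 128,
        "v2": True,       # expected to map to `gptaq` via QUANT_CONFIG_ARG_SYNONYMS
        "v2_alpha": 0.3,  # expected to map to `gptaq_alpha`
        "dynamic": {
            "lm_head": {"v2": False},  # rewritten in from_quant_config
        },
    }

    qcfg = QuantizeConfig.from_quant_config(legacy)
    assert qcfg.gptaq is True
    assert qcfg.gptaq_alpha == 0.3
    assert qcfg.dynamic["lm_head"] == {"gptaq": False}

Configs that bypass from_quant_config still resolve the old names at lookup
time, since dynamic_get falls back through DYNAMIC_FIELD_SYNONYMS whenever the
new key is absent from a per-module override.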