From 9d6ee277a4fdb1c03f2ca263a74e6252fdb26e1b Mon Sep 17 00:00:00 2001 From: avtc Date: Sat, 1 Nov 2025 22:27:38 +0200 Subject: [PATCH 1/2] retry partial.to to fix accelerate invalid argument error for first moe layer for multi-GPU (>4) setups --- gptqmodel/quantization/gptq.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 4f7723a02..4ff849447 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -500,9 +500,23 @@ def _materialize_global_hessian(self, target_device: Optional[torch.device] = No for partial_device, partial in self._device_hessian_partials.items(): if partial.device != result_accum.device or partial.dtype != torch.float32: - tmp = partial.to(device=result_accum.device, dtype=torch.float32) - result_accum.add_(tmp) - del tmp + try: + tmp = partial.to(device=result_accum.device, dtype=torch.float32) + result_accum.add_(tmp) + del tmp + except: + log.warn(f"Quantization: Module `{self.name}` -> Retry 1/2 partial.to in 0.5s") + time.sleep(0.5) + try: + tmp = partial.to(device=result_accum.device, dtype=torch.float32) + result_accum.add_(tmp) + del tmp + except: + log.warn(f"Quantization: Module `{self.name}` -> Retry 2/2 partial.to in 0.5s") + time.sleep(0.5) + tmp = partial.to(device=result_accum.device, dtype=torch.float32) + result_accum.add_(tmp) + del tmp else: result_accum.add_(partial) From c8f2a6428510c46dab5a68d6f9280f0b65f45537 Mon Sep 17 00:00:00 2001 From: Qubitium-ModelCloud Date: Sun, 2 Nov 2025 20:10:51 +0800 Subject: [PATCH 2/2] remove second loop, reduce delay time --- gptqmodel/quantization/gptq.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py index 4ff849447..3ef45433d 100644 --- a/gptqmodel/quantization/gptq.py +++ b/gptqmodel/quantization/gptq.py @@ -500,23 +500,19 @@ def 
_materialize_global_hessian(self, target_device: Optional[torch.device] = No for partial_device, partial in self._device_hessian_partials.items(): if partial.device != result_accum.device or partial.dtype != torch.float32: + # TODO FIXME multi-3090 using P2P is revealing an issue where result_accum and/or partial is not ready for consolidation on the main thread + # when partials are calculated on the individual devices try: tmp = partial.to(device=result_accum.device, dtype=torch.float32) result_accum.add_(tmp) del tmp except: log.warn(f"Quantization: Module `{self.name}` -> Retry 1/2 partial.to in 0.5s") - time.sleep(0.5) - try: - tmp = partial.to(device=result_accum.device, dtype=torch.float32) - result_accum.add_(tmp) - del tmp - except: - log.warn(f"Quantization: Module `{self.name}` -> Retry 2/2 partial.to in 0.5s") - time.sleep(0.5) - tmp = partial.to(device=result_accum.device, dtype=torch.float32) - result_accum.add_(tmp) - del tmp + time.sleep(0.25) + tmp = partial.to(device=result_accum.device, dtype=torch.float32) + result_accum.add_(tmp) + del tmp + else: result_accum.add_(partial)