From 9d6ee277a4fdb1c03f2ca263a74e6252fdb26e1b Mon Sep 17 00:00:00 2001
From: avtc <tarasenkov@gmail.com>
Date: Sat, 1 Nov 2025 22:27:38 +0200
Subject: [PATCH 1/2] retry partial.to to fix accelerate invalid argument error
 for first moe layer for multi-GPU (>4) setups

---
 gptqmodel/quantization/gptq.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py
index 4f7723a02..4ff849447 100644
--- a/gptqmodel/quantization/gptq.py
+++ b/gptqmodel/quantization/gptq.py
@@ -500,9 +500,23 @@ def _materialize_global_hessian(self, target_device: Optional[torch.device] = No
 
             for partial_device, partial in self._device_hessian_partials.items():
                 if partial.device != result_accum.device or partial.dtype != torch.float32:
-                    tmp = partial.to(device=result_accum.device, dtype=torch.float32)
-                    result_accum.add_(tmp)
-                    del tmp
+                    try:
+                        tmp = partial.to(device=result_accum.device, dtype=torch.float32)
+                        result_accum.add_(tmp)
+                        del tmp
+                    except:
+                        log.warn(f"Quantization: Module `{self.name}` -> Retry 1/2 partial.to in 0.5s")
+                        time.sleep(0.5)
+                        try:
+                            tmp = partial.to(device=result_accum.device, dtype=torch.float32)
+                            result_accum.add_(tmp)
+                            del tmp
+                        except:
+                            log.warn(f"Quantization: Module `{self.name}` -> Retry 2/2 partial.to in 0.5s")
+                            time.sleep(0.5)
+                            tmp = partial.to(device=result_accum.device, dtype=torch.float32)
+                            result_accum.add_(tmp)
+                            del tmp
                 else:
                     result_accum.add_(partial)
 

From c8f2a6428510c46dab5a68d6f9280f0b65f45537 Mon Sep 17 00:00:00 2001
From: Qubitium-ModelCloud <qubitium@modelcloud.ai>
Date: Sun, 2 Nov 2025 20:10:51 +0800
Subject: [PATCH 2/2] remove second loop, reduce delay time

---
 gptqmodel/quantization/gptq.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py
index 4ff849447..3ef45433d 100644
--- a/gptqmodel/quantization/gptq.py
+++ b/gptqmodel/quantization/gptq.py
@@ -500,23 +500,19 @@ def _materialize_global_hessian(self, target_device: Optional[torch.device] = No
 
             for partial_device, partial in self._device_hessian_partials.items():
                 if partial.device != result_accum.device or partial.dtype != torch.float32:
+                    # TODO FIXME: multi-3090 setups using P2P are revealing an issue where result_accum and/or partial is not ready for consolidation on the main thread
+                    # when partials are calculated on the individual devices.
                     try:
                         tmp = partial.to(device=result_accum.device, dtype=torch.float32)
                         result_accum.add_(tmp)
                         del tmp
                     except:
                         log.warn(f"Quantization: Module `{self.name}` -> Retry 1/2 partial.to in 0.5s")
-                        time.sleep(0.5)
-                        try:
-                            tmp = partial.to(device=result_accum.device, dtype=torch.float32)
-                            result_accum.add_(tmp)
-                            del tmp
-                        except:
-                            log.warn(f"Quantization: Module `{self.name}` -> Retry 2/2 partial.to in 0.5s")
-                            time.sleep(0.5)
-                            tmp = partial.to(device=result_accum.device, dtype=torch.float32)
-                            result_accum.add_(tmp)
-                            del tmp
+                        time.sleep(0.25)
+                        tmp = partial.to(device=result_accum.device, dtype=torch.float32)
+                        result_accum.add_(tmp)
+                        del tmp
+                       
                 else:
                     result_accum.add_(partial)