ModelCloud · Qubitium · Sep 27, 2025 · Sep 26, 2025 · Sep 27, 2025 · Sep 27, 2025
diff --git a/gptqmodel/looper/gptq_processor.py b/gptqmodel/looper/gptq_processor.py
@@ -19,6 +19,7 @@
 from ..quantization.config import METHOD, QuantizeConfig
 from ..utils.importer import select_quant_linear
 from ..utils.logger import setup_logger
+from ..utils.memory import MEM_LORD
 from ..utils.model import create_quant_module, find_modules, move_to, pack_model, pack_module
 from ..utils.offload import undo_offload_to_disk
 from ..utils.torch import HAS_CUDA, torch_streamCtx, torch_sync
@@ -126,6 +127,7 @@ def process(self, module: NamedModule):
             g = self.tasks[module.name]
 
         wq, q_scales, q_zeros, q_g_idx, duration, avg_loss, damp_percent, nsamples = g.quantize()
+        MEM_LORD.free((q_scales, q_zeros, q_g_idx))
 
         with self.lock:
             module.state.update({"q_scales": q_scales})
@@ -196,6 +198,7 @@ def process(self, module: NamedModule):
                 "wq": wq,  # fp16, quantized weight but not int4 (packed qweight)
             })
 
+        MEM_LORD.free(module.weight)
         module.weight.data = wq
 
     # submodule_finalized is called in reverse after all next sequential processes are called
@@ -248,6 +251,7 @@ def submodule_finalize(self, module: NamedModule, model: BaseQModel, **kwargs):
         with self.lock:
             self.result_pop(module.full_name)
 
+        # MEM_LORD.free(module.weight)
         module.unregister_parameter("weight")
 
     def finalize(self, model: BaseQModel, **kwargs):
@@ -256,6 +260,8 @@ def finalize(self, model: BaseQModel, **kwargs):
             torch_sync()
 
         model.model = undo_offload_to_disk(module=model.model, include_buffers=True, delete_offload_folders=True)
+        MEM_LORD.free(model.model)
+
         # print("finalize")
         # print_module_tree(model.model)
 

diff --git a/gptqmodel/nn_modules/qlinear/marlin.py b/gptqmodel/nn_modules/qlinear/marlin.py
@@ -207,16 +207,15 @@ def validate(cls, **args) -> Tuple[bool, Optional[Exception]]:
     @classmethod
     def validate_device(cls, device: DEVICE):
         super().validate_device(device)
-        CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES")
         if device == DEVICE.CUDA:
             if IS_ROCM:
                 raise NotImplementedError("Marlin kernel is not supported on ROCm.")
 
-            if CUDA_VISIBLE_DEVICES is None:
-                has_cuda_v8 = all(torch.cuda.get_device_capability(i)[0] >= 8 for i in range(torch.cuda.device_count()))
-            else:
-                has_cuda_v8 = all(
-                    torch.cuda.get_device_capability(i)[0] >= 8 for i in range(len(CUDA_VISIBLE_DEVICES.split(","))))
+            # device_count() already respects CUDA_VISIBLE_DEVICES; check every visible device
+            has_cuda_v8 = all(
+                torch.cuda.get_device_capability(i)[0] >= 8
+                for i in range(torch.cuda.device_count())
+            )
             if not has_cuda_v8:
                 raise NotImplementedError("Marlin kernel only supports compute capability >= 8.0.")
 

diff --git a/gptqmodel/quantization/gptq.py b/gptqmodel/quantization/gptq.py
@@ -25,6 +25,7 @@
 from ..utils.torch import HAS_CUDA, HAS_XPU, device_next
 from .gar import compose_final_perm, compute_global_perm, compute_local_perms, invert_perm
 from .quantizer import HF_OPTIMUM, Quantizer
+from ..utils.memory import MEM_LORD
 
 log = setup_logger()
 
@@ -522,6 +523,7 @@ def quantize(
             avg_loss = 999999999
 
         del Losses
+        MEM_LORD.free(self.H)
         del self.H
 
         group_size = self.qcfg.group_size if self.qcfg.group_size != -1 else self.columns