From 7427018eb61f35577a81f569b72753eed1155986 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 25 Sep 2025 17:16:30 +0800 Subject: [PATCH 1/2] fix test_kernel_output_torch_fused Signed-off-by: ZX-ModelCloud --- gptqmodel/nn_modules/qlinear/__init__.py | 4 ++-- tests/test_kernel_output_torch_fused.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 5efc44a12..b9dc99589 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -415,8 +415,8 @@ def post_init(self, **kwargs): dtype=t.int32, ).reshape(1, 3, 12).to(device=self.g_idx.device) - self.wf_unsqueeze_zero = wf.unsqueeze(0).to(device=self.g_idx.device) - self.wf_unsqueeze_neg_one = wf.unsqueeze(-1).to(device=self.g_idx.device) + self.register_buffer("wf_unsqueeze_zero", wf.unsqueeze(0).to(device=self.g_idx.device), persistent=False) + self.register_buffer("wf_unsqueeze_neg_one", wf.unsqueeze(-1).to(device=self.g_idx.device), persistent=False) super().post_init(**kwargs) diff --git a/tests/test_kernel_output_torch_fused.py b/tests/test_kernel_output_torch_fused.py index 3a0ce93b1..386d455f2 100644 --- a/tests/test_kernel_output_torch_fused.py +++ b/tests/test_kernel_output_torch_fused.py @@ -19,7 +19,7 @@ class TestKernelOutput(unittest.TestCase): BACKEND.TORCH_FUSED: TorchFusedQuantLinear, } target = 'model.layers.6.self_attn.v_proj' - device_map = "cpu" + device = "xpu" m = [1, 16, 64, 256, 1024] k = 2048 dtype = torch.float16 @@ -29,7 +29,7 @@ class TestKernelOutput(unittest.TestCase): @classmethod def setUp(self): - self.torch_model = GPTQModel.load(self.model_path, backend=BACKEND.TORCH, device_map=self.device_map, dtype=self.dtype) + self.torch_model = GPTQModel.load(self.model_path, backend=BACKEND.TORCH, device=self.device, dtype=self.dtype) self.x = [] self.torch_kernel_outs = [] for dim_0 in self.m: @@ -61,8 +61,8 @@ def assert_on_mismatch(self, a: Tensor, b: Tensor, rtol=0.00005, atol=0.00005): (BACKEND.TORCH_FUSED, r_tolerance, a_tolerance), ]) def test_kernel_output(self, backend: BACKEND, r_tolerance: float, a_tolerance: float): - model = GPTQModel.load(self.model_path, backend=backend, device_map=self.device_map, dtype=self.dtype) - log.info(f"device_map: {self.device_map} ") + model = GPTQModel.load(self.model_path, backend=backend, device=self.device, dtype=self.dtype) + log.info(f"device: {self.device} ") log.info(f"backend: {backend} ") for i in range(len(self.x)): out = self.forward(model, self.x[i], backend=backend) From a2f1f12dfdd77ac83bb3324604ddd2b3b62c7c49 Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud Date: Thu, 25 Sep 2025 17:55:07 +0800 Subject: [PATCH 2/2] TorchFusedQuantLinear also supports CPU Signed-off-by: ZX-ModelCloud --- gptqmodel/nn_modules/qlinear/torch_fused.py | 4 ++-- tests/test_kernel_output_torch_fused.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gptqmodel/nn_modules/qlinear/torch_fused.py b/gptqmodel/nn_modules/qlinear/torch_fused.py index 66ad7b362..db7017bac 100644 --- a/gptqmodel/nn_modules/qlinear/torch_fused.py +++ b/gptqmodel/nn_modules/qlinear/torch_fused.py @@ -46,7 +46,7 @@ class TorchFusedQuantLinear(PackableQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [1] # optimized for XPU but should run on all - SUPPORTS_DEVICES = [DEVICE.XPU] # change this to XPU to limit to Intel XPU + SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU] # change this to XPU to limit to Intel XPU SUPPORTS_PLATFORM = [PLATFORM.ALL] SUPPORTS_PACK_DTYPES = [torch.int32] SUPPORTS_ADAPTERS = [Lora] @@ -174,7 +174,7 @@ def forward(self, x: torch.Tensor): def _forward(self, x, out_shape): num_itr = self.g_idx.shape[0] // x.shape[-1] - if not self.training and not self.transformed and TORCH_HAS_XPU_FUSED_OPS: + if not self.training and not self.transformed and TORCH_HAS_XPU_FUSED_OPS and "xpu" == x.device.type: # one-time transform per module for xpu aten fused ops self.transform(x.dtype) self.transformed = True diff --git a/tests/test_kernel_output_torch_fused.py b/tests/test_kernel_output_torch_fused.py index 386d455f2..c4a0e460b 100644 --- a/tests/test_kernel_output_torch_fused.py +++ b/tests/test_kernel_output_torch_fused.py @@ -19,7 +19,7 @@ class TestKernelOutput(unittest.TestCase): BACKEND.TORCH_FUSED: TorchFusedQuantLinear, } target = 'model.layers.6.self_attn.v_proj' - device = "xpu" + device = "cpu" m = [1, 16, 64, 256, 1024] k = 2048 dtype = torch.float16 @@ -75,7 +75,7 @@ class TestKernelOutputBFloat16(TestKernelOutput): @unittest.skipUnless(hasattr(torch, "xpu") and torch.xpu.is_available(), reason="Test requires XPU") class TestKernelOutputXPU(TestKernelOutput): - device_map = "xpu:0" + device = "xpu:0" a_tolerance = 0.0005