diff --git a/gptqmodel/nn_modules/qlinear/__init__.py b/gptqmodel/nn_modules/qlinear/__init__.py index 5efc44a12..b9dc99589 100644 --- a/gptqmodel/nn_modules/qlinear/__init__.py +++ b/gptqmodel/nn_modules/qlinear/__init__.py @@ -415,8 +415,8 @@ def post_init(self, **kwargs): dtype=t.int32, ).reshape(1, 3, 12).to(device=self.g_idx.device) - self.wf_unsqueeze_zero = wf.unsqueeze(0).to(device=self.g_idx.device) - self.wf_unsqueeze_neg_one = wf.unsqueeze(-1).to(device=self.g_idx.device) + self.register_buffer("wf_unsqueeze_zero", wf.unsqueeze(0).to(device=self.g_idx.device), persistent=False) + self.register_buffer("wf_unsqueeze_neg_one", wf.unsqueeze(-1).to(device=self.g_idx.device), persistent=False) super().post_init(**kwargs) diff --git a/gptqmodel/nn_modules/qlinear/torch_fused.py b/gptqmodel/nn_modules/qlinear/torch_fused.py index 66ad7b362..db7017bac 100644 --- a/gptqmodel/nn_modules/qlinear/torch_fused.py +++ b/gptqmodel/nn_modules/qlinear/torch_fused.py @@ -46,7 +46,7 @@ class TorchFusedQuantLinear(PackableQuantLinear): SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [1] # optimized for XPU but should run on all - SUPPORTS_DEVICES = [DEVICE.XPU] # change this to XPU to limit to Intel XPU + SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU] # change this to XPU to limit to Intel XPU SUPPORTS_PLATFORM = [PLATFORM.ALL] SUPPORTS_PACK_DTYPES = [torch.int32] SUPPORTS_ADAPTERS = [Lora] @@ -174,7 +174,7 @@ def forward(self, x: torch.Tensor): def _forward(self, x, out_shape): num_itr = self.g_idx.shape[0] // x.shape[-1] - if not self.training and not self.transformed and TORCH_HAS_XPU_FUSED_OPS: + if not self.training and not self.transformed and TORCH_HAS_XPU_FUSED_OPS and "xpu" == x.device.type: # one-time transform per module for xpu aten fused ops self.transform(x.dtype) self.transformed = True diff --git a/tests/test_kernel_output_torch_fused.py b/tests/test_kernel_output_torch_fused.py index 3a0ce93b1..c4a0e460b 100644 --- a/tests/test_kernel_output_torch_fused.py +++ b/tests/test_kernel_output_torch_fused.py @@ -19,7 +19,7 @@ class TestKernelOutput(unittest.TestCase): BACKEND.TORCH_FUSED: TorchFusedQuantLinear, } target = 'model.layers.6.self_attn.v_proj' - device_map = "cpu" + device = "cpu" m = [1, 16, 64, 256, 1024] k = 2048 dtype = torch.float16 @@ -29,7 +29,7 @@ class TestKernelOutput(unittest.TestCase): @classmethod def setUp(self): - self.torch_model = GPTQModel.load(self.model_path, backend=BACKEND.TORCH, device_map=self.device_map, dtype=self.dtype) + self.torch_model = GPTQModel.load(self.model_path, backend=BACKEND.TORCH, device=self.device, dtype=self.dtype) self.x = [] self.torch_kernel_outs = [] for dim_0 in self.m: @@ -61,8 +61,8 @@ def assert_on_mismatch(self, a: Tensor, b: Tensor, rtol=0.00005, atol=0.00005): (BACKEND.TORCH_FUSED, r_tolerance, a_tolerance), ]) def test_kernel_output(self, backend: BACKEND, r_tolerance: float, a_tolerance: float): - model = GPTQModel.load(self.model_path, backend=backend, device_map=self.device_map, dtype=self.dtype) - log.info(f"device_map: {self.device_map} ") + model = GPTQModel.load(self.model_path, backend=backend, device=self.device, dtype=self.dtype) + log.info(f"device: {self.device} ") log.info(f"backend: {backend} ") for i in range(len(self.x)): out = self.forward(model, self.x[i], backend=backend) @@ -75,7 +75,7 @@ class TestKernelOutputBFloat16(TestKernelOutput): @unittest.skipUnless(hasattr(torch, "xpu") and torch.xpu.is_available(), reason="Test requires XPU") class TestKernelOutputXPU(TestKernelOutput): - device_map = "xpu:0" + device = "xpu:0" a_tolerance = 0.0005