From 365aa57c434d0cf52af2c1cd07740a3bb0509ec9 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 15 Oct 2025 01:56:49 +0000
Subject: [PATCH 1/4] thread safety

Signed-off-by: Qubitium
---
 gptqmodel_ext/pack_block_cpu.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gptqmodel_ext/pack_block_cpu.cpp b/gptqmodel_ext/pack_block_cpu.cpp
index eae714322..223b0c109 100644
--- a/gptqmodel_ext/pack_block_cpu.cpp
+++ b/gptqmodel_ext/pack_block_cpu.cpp
@@ -80,11 +80,6 @@ std::tuple pack_block_cpu(
     const int32_t* gidx_ptr = g_idx_i32.const_data_ptr<int32_t>();
     int32_t* qweight_ptr = qweight.data_ptr<int32_t>();
 
-    int old_threads = at::get_num_threads();
-    if (threads > 0) {
-        at::set_num_threads(static_cast<int>(threads));
-    }
-
     const int64_t out_stride = in_features;
     const int64_t scales_stride = out_features;
 
@@ -92,6 +87,15 @@ std::tuple pack_block_cpu(
     if (grain_size <= 0) {
         grain_size = 1;
     }
+    if (threads > 0) {
+        // Limit the number of parallel chunks to roughly `threads` without
+        // mutating the global ATen thread configuration, keeping the kernel reentrant.
+        int64_t target_chunk = (num_blocks + threads - 1) / threads;
+        if (target_chunk <= 0) {
+            target_chunk = 1;
+        }
+        grain_size = std::max(grain_size, target_chunk);
+    }
 
     at::parallel_for(0, num_blocks, grain_size, [&](int64_t block_begin, int64_t block_end) {
         std::array qvals{};
@@ -163,10 +167,6 @@ std::tuple pack_block_cpu(
         }
     });
 
-    if (threads > 0) {
-        at::set_num_threads(old_threads);
-    }
-
     at::Tensor zeros_i32_contig = zeros_i32.contiguous();
     const int32_t* zeros_ptr = zeros_i32_contig.const_data_ptr<int32_t>();
     const int64_t zeros_stride = zeros_i32_contig.size(1);

From f2e7759e0e34ffe079a7c1c6f2fe5587048fc8ca Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 15 Oct 2025 02:23:26 +0000
Subject: [PATCH 2/4] add timing

Signed-off-by: Qubitium
---
 tests/test_pack.py | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/tests/test_pack.py b/tests/test_pack.py
index e898b9c53..057492ff8 100644
--- a/tests/test_pack.py
+++ b/tests/test_pack.py
@@ -3,6 +3,7 @@
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 
 import math
+import time
 import unittest
 
 import torch
@@ -62,6 +63,8 @@ def _run_impl(self, impl: str, linear, scales, zeros, g_idx):
         scales_T = scales.t().contiguous()
         zeros_T = zeros.t().contiguous()
 
+        start = time.perf_counter()
+
         if impl == "original":
             qlinear.pack_original(linear, scales_T, zeros_T, g_idx=g_idx)
         elif impl == "pack_block":
@@ -84,6 +87,9 @@ def _run_impl(self, impl: str, linear, scales, zeros, g_idx):
         else:
             raise ValueError(f"Unknown impl `{impl}`")
 
+        end = time.perf_counter()
+        duration = end - start
+
         # Move buffers to CPU for comparisons
         result = {
             "qweight": qlinear.qweight.detach().cpu(),
@@ -93,7 +99,7 @@ def _run_impl(self, impl: str, linear, scales, zeros, g_idx):
         }
         if hasattr(qlinear, "bias") and qlinear.bias is not None:
             result["bias"] = qlinear.bias.detach().cpu()
-        return result
+        return result, duration
 
     @parameterized.expand(
         [
@@ -109,16 +115,17 @@ def test_pack_consistency(self, bits, group_size):
 
         linear, scales, zeros, g_idx = self._build_inputs(bits, group_size)
 
-        baseline = self._run_impl("original", linear, scales, zeros, g_idx)
-        pack_cpu = self._run_impl("pack_block", linear, scales, zeros, g_idx)
-        results = {"pack_block": pack_cpu}
+        baseline, baseline_time = self._run_impl("original", linear, scales, zeros, g_idx)
+        pack_cpu, pack_cpu_time = self._run_impl("pack_block", linear, scales, zeros, g_idx)
+        results = {"pack_block": (pack_cpu, pack_cpu_time)}
 
         if torch.cuda.is_available():
-            results["pack_gpu"] = self._run_impl("gpu", linear, scales, zeros, g_idx)
+            pack_gpu, pack_gpu_time = self._run_impl("gpu", linear, scales, zeros, g_idx)
+            results["pack_gpu"] = (pack_gpu, pack_gpu_time)
 
         rows = []
-        rows.append([f"pack_original (bits={bits}, g={group_size})", 0.0, 0.0, 0.0, 0.0])
-        for name, tensors in results.items():
+        rows.append([f"pack_original (bits={bits}, g={group_size})", 0.0, 0.0, 0.0, 0.0, baseline_time * 1e3])
+        for name, (tensors, duration) in results.items():
             diff_qweight = (tensors["qweight"].to(dtype=baseline["qweight"].dtype) - baseline["qweight"]).abs().max().item()
             diff_qzeros = (tensors["qzeros"].to(dtype=baseline["qzeros"].dtype) - baseline["qzeros"]).abs().max().item()
             diff_scales = (tensors["scales"].to(dtype=baseline["scales"].dtype) - baseline["scales"]).abs().max().item()
@@ -129,6 +136,7 @@ def test_pack_consistency(self, bits, group_size):
                 diff_qzeros,
                 diff_scales,
                 diff_gidx,
+                duration * 1e3,
             ])
 
             self.assertTrue(torch.equal(tensors["qweight"], baseline["qweight"]))
@@ -139,7 +147,7 @@ def test_pack_consistency(self, bits, group_size):
         print(
             tabulate(
                 rows,
-                headers=["impl", "max|Δ qweight|", "max|Δ qzeros|", "max|Δ scales|", "max|Δ g_idx|"],
+                headers=["impl", "max|Δ qweight|", "max|Δ qzeros|", "max|Δ scales|", "max|Δ g_idx|", "time [ms]"],
                 floatfmt=".3e",
             )
         )
@@ -155,8 +163,8 @@ def test_pack_negative_g_idx(self):
         g_idx_neg = g_idx.to(dtype=torch.int32)
         g_idx_neg[::7] -= groups
 
-        baseline = self._run_impl("original", linear, scales, zeros, g_idx_neg)
-        pack_cpu = self._run_impl("pack_block", linear, scales, zeros, g_idx_neg)
+        baseline, _ = self._run_impl("original", linear, scales, zeros, g_idx_neg)
+        pack_cpu, _ = self._run_impl("pack_block", linear, scales, zeros, g_idx_neg)
 
         self.assertTrue(torch.equal(pack_cpu["qweight"], baseline["qweight"]))
         self.assertTrue(torch.equal(pack_cpu["qzeros"], baseline["qzeros"]))

From fcbe85b4d6a2ecd01c6f8e187ea23d30b75e022b Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 15 Oct 2025 02:23:43 +0000
Subject: [PATCH 3/4] compiled triton will clash with _dynamo config

Signed-off-by: Qubitium
---
 gptqmodel/utils/torch.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/gptqmodel/utils/torch.py b/gptqmodel/utils/torch.py
index d5e5f0744..24803ef26 100644
--- a/gptqmodel/utils/torch.py
+++ b/gptqmodel/utils/torch.py
@@ -65,11 +65,14 @@ def timed_gc_collect(*args, **kwargs) -> int:
     return collected
 
 # reset dynamo cache on each model load since during ci loop model inference may exhaust cache
-torch._dynamo.reset()
-
-# Increase the dynamo cache size limit, default of 8 is too low
-if torch._dynamo.config.cache_size_limit < 128:
-    torch._dynamo.config.cache_size_limit = 128
+try:
+    torch._dynamo.reset()
+    # Increase the dynamo cache size limit, default of 8 is too low
+    if torch._dynamo.config.cache_size_limit < 128:
+        torch._dynamo.config.cache_size_limit = 128
+except BaseException:
+    # triton built from source may be incompatible with the _dynamo private API
+    pass
 
 if hasattr(torch, "cuda") and hasattr(torch.cuda, "is_available") and torch.cuda.is_available():
     HAS_CUDA = True

From 92f5e64dc4ac408ca7bdd11a011e4daacdc7efc1 Mon Sep 17 00:00:00 2001
From: Qubitium
Date: Wed, 15 Oct 2025 02:32:57 +0000
Subject: [PATCH 4/4] import

Signed-off-by: Qubitium
---
 tests/models/model_test.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/models/model_test.py b/tests/models/model_test.py
index bb70ab5a4..1bce752ea 100644
--- a/tests/models/model_test.py
+++ b/tests/models/model_test.py
@@ -40,7 +40,11 @@
 import torch.cuda  # noqa: E402
 from datasets import load_dataset  # noqa: E402
-from ovis.image_to_test_dataset import get_calib_dataset  # noqa: E402
+try:
+    from ovis.image_to_test_dataset import get_calib_dataset  # noqa: E402
+except BaseException:
+    pass
+
 from transformers import AutoProcessor, AutoTokenizer  # noqa: E402
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402
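
Reviewer note (appended after the series, not part of any patch): PATCH 1/4 removes the at::set_num_threads() save/restore, which mutated global ATen state and made concurrent calls into pack_block_cpu unsafe, and instead raises grain_size to roughly ceil(num_blocks / threads). The standalone sketch below illustrates why that bounds parallelism: a larger grain size means at::parallel_for hands out fewer, larger chunks. It assumes only a libtorch build; clamp_grain_size is a hypothetical helper written for this note, not code taken from the patch.

// Minimal sketch (assumes libtorch): clamping grain_size caps how many chunks
// at::parallel_for can create, mirroring the approach in PATCH 1/4 without
// touching the global intra-op thread configuration.
#include <ATen/Parallel.h>
#include <algorithm>
#include <atomic>
#include <cstdint>
#include <iostream>

int64_t clamp_grain_size(int64_t grain_size, int64_t num_blocks, int64_t threads) {
    if (grain_size <= 0) {
        grain_size = 1;
    }
    if (threads > 0) {
        // ceil(num_blocks / threads): at most roughly `threads` chunks of work.
        int64_t target_chunk = (num_blocks + threads - 1) / threads;
        if (target_chunk <= 0) {
            target_chunk = 1;
        }
        grain_size = std::max(grain_size, target_chunk);
    }
    return grain_size;
}

int main() {
    const int64_t num_blocks = 1000;
    const int64_t threads = 8;
    const int64_t grain_size = clamp_grain_size(/*grain_size=*/1, num_blocks, threads);

    std::atomic<int64_t> chunks{0};
    at::parallel_for(0, num_blocks, grain_size, [&](int64_t begin, int64_t end) {
        (void)begin;
        (void)end;
        // Count how many chunks the parallel backend actually dispatched.
        chunks.fetch_add(1, std::memory_order_relaxed);
    });

    // grain_size = ceil(1000 / 8) = 125, so at most 8 chunks are produced no
    // matter how large the global intra-op thread pool is.
    std::cout << "grain_size=" << grain_size << " chunks=" << chunks.load() << std::endl;
    return 0;
}

The trade-off is coarser work chunks when ATen's pool has fewer threads than requested, but since no global thread setting is written, reentrant callers can no longer clobber each other's configuration, which is the thread-safety goal of the patch.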