From 5bf62f67dc2c1418def52497300df22759e07550 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 16 Oct 2025 09:24:32 +0000 Subject: [PATCH 1/3] update scores --- tests/models/test_qwen3_moe.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/models/test_qwen3_moe.py b/tests/models/test_qwen3_moe.py index 4916361f6..a36f4e480 100644 --- a/tests/models/test_qwen3_moe.py +++ b/tests/models/test_qwen3_moe.py @@ -6,11 +6,12 @@ from model_test import ModelTest + class TestQwen3Moe(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-30B-A3B" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.2 - NATIVE_ARC_CHALLENGE_ACC = 0.3700 - NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3700 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.04 + NATIVE_ARC_CHALLENGE_ACC = 0.3788 # a100 4,5,6,7 + NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3899 # a100 4,5,6,7 # TRUST_REMOTE_CODE = False APPLY_CHAT_TEMPLATE = True # EVAL_BATCH_SIZE = 6 From f1e1ebe564df86d69716c9c9c33babaa53f12cf2 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 16 Oct 2025 17:12:42 +0000 Subject: [PATCH 2/3] cleanup --- gptqmodel/__init__.py | 2 +- tests/models/test_qwen3_next.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index 10559ead4..b54284928 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -16,7 +16,7 @@ DEVICE_THREAD_POOL = DeviceThreadPool( inference_mode=True, workers={ - "cuda:per": 2, + "cuda:per": 4, "xpu:per": 1, "mps": 8, "cpu": 8, diff --git a/tests/models/test_qwen3_next.py b/tests/models/test_qwen3_next.py index 2c45f9dc7..dcb960d14 100644 --- a/tests/models/test_qwen3_next.py +++ b/tests/models/test_qwen3_next.py @@ -8,13 +8,21 @@ class TestQwen3Next(ModelTest): NATIVE_MODEL_ID = "/monster/data/model/Qwen3-Next-80B-A3B-Instruct" - QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.05 + QUANT_ARC_MAX_DELTA_FLOOR_PERCENT = 0.04 NATIVE_ARC_CHALLENGE_ACC = 0.3900 NATIVE_ARC_CHALLENGE_ACC_NORM = 0.3900 TRUST_REMOTE_CODE 
= True APPLY_CHAT_TEMPLATE = True - EVAL_BATCH_SIZE = 6 - #DATASET_SIZE = 1024 + EVAL_BATCH_SIZE = 4 + V2 = False + DEBUG = True + ACT_GROUP_AWARE = True + DESC_ACT = False + DATASET_SIZE = 1024 + DATASET_SORT = "desc" + QUANT_BATCH_SIZE = 4 + CALIB_NOISE_MODE = "unseen" + CALIB_NOISE_PERCENT = 0.025 def test_mimo(self): self.quant_lm_eval() From 0d3f45995886c0325e3d57ad290d6ae47d6f4abd Mon Sep 17 00:00:00 2001 From: Qubitium Date: Thu, 16 Oct 2025 17:14:41 +0000 Subject: [PATCH 3/3] cleanup --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9365145a4..5611545b1 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,8 @@

## Latest News -* 09/30/2025 5.0.0-dev `main`: 👀: New Data Parallel + Multi-GPU + Python 3.13g (PYTHON_GIL=0) equals 80%+ overall quant time reduction of large MoE models va v4.2.5. +* 10/17/2025 5.0.0-dev `main`: 👀: EoRA now multi-gpu compatible. Fixed both quality stability of multi-gpu quants and vram usage. New LFM and Ling models support. +* 09/30/2025 5.0.0-dev `main`: 👀: New Data Parallel + Multi-GPU + Python 3.13T (PYTHON_GIL=0) equals 80%+ overall quant time reduction of large MoE models vs v4.2.5. +* 09/29/2025 5.0.0-dev `main`: 🎉 New Qwen3 Omni model support. AWQ Marlin kernel integrated + many disk offload, threading, and memory usage fixes. +* 09/24/2025 5.0.0-dev `main`: 🎉 Up to 90% cpu mem saving for large MoE models with faster/inline packing! 26% quant time reduction for Qwen3 MoE! AWQ Marlin kernel added. AWQ Gemm loading bug fixes. `act_group_aware` now faster and auto enabled for GPTQ when `desc_act` is False for higher quality recovery. +* 09/19/2025 5.0.0-dev `main`: 👀 Cpu memory saving of ~73.5% during quantization stage with new `offload_to_disk` quantization config property default to `True`.