From a044eb5e83a564ca412fe430659dded0bcd769ee Mon Sep 17 00:00:00 2001 From: gufengc Date: Mon, 13 Oct 2025 16:12:54 +0800 Subject: [PATCH 1/2] feat(model): Add qwen3-235B-int4 --- pyproject.toml | 1 + src/backend/server/static_config.py | 21 ++++++++++++++++++--- src/parallax/launch.py | 1 + src/scheduling/layer_allocation.py | 2 +- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c52667dc..1032b410 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ packages = [ dependencies = [ "msgpack>=1.0.7", "safetensors>=0.5.1", + "huggingface-hub" "numpy>=1.26", "pyzmq>=25.0", "psutil>=5.9.5", diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 8324e7ba..d5c82873 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -1,4 +1,5 @@ -from mlx_lm.utils import get_model_path, load_config +import json +from pathlib import Path from scheduling.model_info import ModelInfo @@ -20,6 +21,7 @@ # "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", + "Qwen/Qwen3-235B-A22B-GPTQ-Int4", # "Qwen/Qwen2.5-3B-Instruct", # "Qwen/Qwen2.5-7B-Instruct", # "Qwen/Qwen2.5-14B-Instruct", @@ -35,8 +37,21 @@ def get_model_info(model_name): - model_path = get_model_path(model_name)[0] - config = load_config(model_path) + def _load_config_only(name: str) -> dict: + local_path = Path(name) + if local_path.exists(): + config_path = local_path / "config.json" + with open(config_path, "r") as f: + return json.load(f) + + # Hugging Face only – download just config.json + from huggingface_hub import hf_hub_download # type: ignore + + config_file = hf_hub_download(repo_id=name, filename="config.json") + with open(config_file, "r") as f: + return json.load(f) + + config = _load_config_only(model_name) # get quant method quant_method = config.get("quant_method", None) diff --git 
a/src/parallax/launch.py b/src/parallax/launch.py index cb02b4b5..a68e16d7 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -36,6 +36,7 @@ "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", + "Qwen/Qwen3-235B-A22B-GPTQ-Int4": "mlx-community/Qwen3-235B-A22B-4bit", "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", } diff --git a/src/scheduling/layer_allocation.py b/src/scheduling/layer_allocation.py index cd73edd1..91a8e127 100644 --- a/src/scheduling/layer_allocation.py +++ b/src/scheduling/layer_allocation.py @@ -815,7 +815,7 @@ def global_allocation(self) -> bool: total_cap = sum(node.get_decoder_layer_capacity() for node in self.nodes) if num_layers <= 0 or num_nodes == 0 or total_cap < num_layers: - logger.debug( + logger.warning( "[DP] Insufficient resources: nodes=%d, layers=%d, total_cap=%d", num_nodes, num_layers, From f06167825cae548011de4e63296b03d762c8c900 Mon Sep 17 00:00:00 2001 From: gufengc Date: Mon, 13 Oct 2025 16:13:36 +0800 Subject: [PATCH 2/2] fix(deps): add missing trailing comma after huggingface-hub in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1032b410..fdc6d91f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ packages = [ dependencies = [ "msgpack>=1.0.7", "safetensors>=0.5.1", - "huggingface-hub" + "huggingface-hub", "numpy>=1.26", "pyzmq>=25.0", "psutil>=5.9.5",