diff --git a/pyproject.toml b/pyproject.toml
index c52667dc..fdc6d91f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ packages = [
 dependencies = [
     "msgpack>=1.0.7",
     "safetensors>=0.5.1",
+    "huggingface-hub",
     "numpy>=1.26",
     "pyzmq>=25.0",
     "psutil>=5.9.5",
diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index 8324e7ba..d5c82873 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -1,4 +1,5 @@
-from mlx_lm.utils import get_model_path, load_config
+import json
+from pathlib import Path
 
 from scheduling.model_info import ModelInfo
 
@@ -20,6 +21,7 @@
     # "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8",
     "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8",
     "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8",
+    "Qwen/Qwen3-235B-A22B-GPTQ-Int4",
     # "Qwen/Qwen2.5-3B-Instruct",
     # "Qwen/Qwen2.5-7B-Instruct",
     # "Qwen/Qwen2.5-14B-Instruct",
@@ -35,8 +37,21 @@
 
 def get_model_info(model_name):
-    model_path = get_model_path(model_name)[0]
-    config = load_config(model_path)
+    def _load_config_only(name: str) -> dict:
+        local_path = Path(name)
+        if local_path.exists():
+            config_path = local_path / "config.json"
+            with open(config_path, "r") as f:
+                return json.load(f)
+
+        # Hugging Face only – download just config.json
+        from huggingface_hub import hf_hub_download  # type: ignore
+
+        config_file = hf_hub_download(repo_id=name, filename="config.json")
+        with open(config_file, "r") as f:
+            return json.load(f)
+
+    config = _load_config_only(model_name)
 
     # get quant method
     quant_method = config.get("quant_method", None)
diff --git a/src/parallax/launch.py b/src/parallax/launch.py
index cb02b4b5..a68e16d7 100644
--- a/src/parallax/launch.py
+++ b/src/parallax/launch.py
@@ -36,6 +36,7 @@
     "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit",
     "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit",
     "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit",
+    "Qwen/Qwen3-235B-A22B-GPTQ-Int4": "mlx-community/Qwen3-235B-A22B-4bit",
     "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit",
 }
diff --git a/src/scheduling/layer_allocation.py b/src/scheduling/layer_allocation.py
index cd73edd1..91a8e127 100644
--- a/src/scheduling/layer_allocation.py
+++ b/src/scheduling/layer_allocation.py
@@ -815,7 +815,7 @@ def global_allocation(self) -> bool:
         total_cap = sum(node.get_decoder_layer_capacity() for node in self.nodes)
 
         if num_layers <= 0 or num_nodes == 0 or total_cap < num_layers:
-            logger.debug(
+            logger.warning(
                 "[DP] Insufficient resources: nodes=%d, layers=%d, total_cap=%d",
                 num_nodes,
                 num_layers,