From 5aa76e7fa8c7d61ad6ccebcc3c9abb9c8dbfc848 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 15:18:57 +0800 Subject: [PATCH 01/13] add model --- src/backend/server/static_config.py | 1 + src/parallax/launch.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 39dd184a..d2691c39 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -50,6 +50,7 @@ "deepseek-ai/DeepSeek-V3", "deepseek-ai/DeepSeek-V2", "MiniMaxAI/MiniMax-M2", + "zai-org/GLM-4.6", ] NODE_JOIN_COMMAND_LOCAL_NETWORK = """parallax join""" diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 7627388d..e4443c23 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -41,6 +41,7 @@ "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "MiniMaxAI/MiniMax-M2": "mlx-community/MiniMax-M2-4bit", + "zai-org/GLM-4.6": "mlx-community/GLM-4.6-4bit", } if __name__ == "__main__": From e5e40a9298c739d691ecd748cff4305b6ba316ba Mon Sep 17 00:00:00 2001 From: Alien mac air <2214632589@qq.com> Date: Wed, 29 Oct 2025 17:09:18 +0800 Subject: [PATCH 02/13] fix params --- src/backend/server/static_config.py | 29 +++++++++++++++++++++++++---- src/parallax/launch.py | 15 +-------------- src/scheduling/model_info.py | 5 +++++ src/scheduling/node.py | 17 ++++++++++++++--- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index d2691c39..761100a2 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -1,6 +1,6 @@ import json from pathlib import Path - +import logging from scheduling.model_info import ModelInfo # Supported model list @@ -53,6 +53,21 @@ "zai-org/GLM-4.6", ] +MLX_MODEL_NAME_MAP = { + "openai/gpt-oss-20b": "mlx-community/gpt-oss-20b-MXFP4-Q8", + "openai/gpt-oss-120b": "mlx-community/gpt-oss-120b-4bit", + "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit", + "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", + "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", + "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", + "Qwen/Qwen3-235B-A22B-GPTQ-Int4": "mlx-community/Qwen3-235B-A22B-4bit", + "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", + "MiniMaxAI/MiniMax-M2": "mlx-community/MiniMax-M2-4bit", + "zai-org/GLM-4.6": "mlx-community/GLM-4.6-4bit", +} + +logger = logging.getLogger(__name__) NODE_JOIN_COMMAND_LOCAL_NETWORK = """parallax join""" NODE_JOIN_COMMAND_PUBLIC_NETWORK = """parallax join -s {scheduler_addr} """ @@ -76,6 +91,8 @@ def _load_config_only(name: str) -> dict: config = _load_config_only(model_name) # get quant method + # logger.info(f"Loading model config from {model_name}") + quant_method = config.get("quant_method", None) quantization_config = config.get("quantization_config", None) if quant_method is None and quantization_config is not None: @@ -88,9 +105,12 @@ def _load_config_only(name: str) -> dict: elif quant_method in ("mxfp4", "int4", "awq", "gptq"): param_bytes_per_element = 0.5 - # Only for hack, fix it when support different 
quantization bits - # if "minimax-m2" in model_name.lower(): - # param_bytes_per_element = 0.5 + mlx_param_bytes_per_element = param_bytes_per_element + if model_name in MLX_MODEL_NAME_MAP: + mlx_config = _load_config_only(MLX_MODEL_NAME_MAP[model_name]) + mlx_quant_dict = mlx_config.get("quantization_config", None) + if "bits" in mlx_quant_dict: + mlx_param_bytes_per_element = mlx_quant_dict["bits"] / 8 # get local experts num_local_experts = config.get("num_local_experts", None) @@ -112,6 +132,7 @@ def _load_config_only(name: str) -> dict: num_layers=config.get("num_hidden_layers", 0), ffn_num_projections=3, param_bytes_per_element=param_bytes_per_element, + mlx_param_bytes_per_element=mlx_param_bytes_per_element, cache_bytes_per_element=2, embedding_bytes_per_element=2, num_local_experts=num_local_experts, diff --git a/src/parallax/launch.py b/src/parallax/launch.py index e4443c23..3795dccf 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -26,23 +26,10 @@ from parallax.utils.utils import get_current_device from parallax_utils.ascii_anime import display_parallax_join from parallax_utils.logging_config import get_logger, set_log_level +from backend.server.static_config import MLX_MODEL_NAME_MAP logger = get_logger("parallax.launch") -"""Currently hard code model name for MAC""" -MLX_MODEL_NAME_MAP = { - "openai/gpt-oss-20b": "mlx-community/gpt-oss-20b-MXFP4-Q8", - "openai/gpt-oss-120b": "mlx-community/gpt-oss-120b-4bit", - "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit", - "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", - "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", - "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", - "Qwen/Qwen3-235B-A22B-GPTQ-Int4": "mlx-community/Qwen3-235B-A22B-4bit", - "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", - "MiniMaxAI/MiniMax-M2": "mlx-community/MiniMax-M2-4bit", - "zai-org/GLM-4.6": "mlx-community/GLM-4.6-4bit", -} if __name__ == "__main__": multiprocessing.set_start_method("spawn", force=True) diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py index 5d6cd9f2..1736aecf 100644 --- a/src/scheduling/model_info.py +++ b/src/scheduling/model_info.py @@ -37,6 +37,7 @@ class ModelInfo: tie_embedding: bool = False # Default int8 param_bytes_per_element: float = 1 + mlx_param_bytes_per_element: float = 1 cache_bytes_per_element: int = 1 embedding_bytes_per_element: int = 1 @@ -70,6 +71,10 @@ def k_dim(self) -> int: """Return key head dim.""" return self.num_kv_heads * self.head_size_k + @property + def mlx_bit_factor(self) -> float: + return self.mlx_param_bytes_per_element / self.param_bytes_per_element + @property def embedding_io_bytes(self) -> int: """Estimate memory for input_embeddings / or lm_head.""" diff --git a/src/scheduling/node.py b/src/scheduling/node.py index 5824ddae..f0ea4a6c 100644 --- a/src/scheduling/node.py +++ b/src/scheduling/node.py @@ -15,6 +15,7 @@ from typing import Callable, Dict, List, Optional from parallax_utils.logging_config import get_logger +from parallax.utils.utils import get_current_device from parallax_utils.utils import bytes_per_element, compute_max_batch_size from scheduling.model_info import ModelInfo @@ -294,9 +295,19 @@ def get_decoder_layer_capacity( available_memory_bytes, 
self.model_info.decoder_layer_io_bytes(roofline=False), ) - return floor( - available_memory_bytes / self.model_info.decoder_layer_io_bytes(roofline=False) - ) + if get_current_device() == "mlx": + # For mlx, consider mlx bit factor + return floor( + available_memory_bytes + / ( + self.model_info.decoder_layer_io_bytes(roofline=False) + * self.model_info.mlx_bit_factor + ) + ) + else: + return floor( + available_memory_bytes / self.model_info.decoder_layer_io_bytes(roofline=False) + ) @property def per_decoder_layer_kv_cache_memory(self) -> Optional[int]: From 7c47b3b55538db7d92f1481e7e8598a49056de1b Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 18:00:42 +0800 Subject: [PATCH 03/13] update --- src/backend/server/static_config.py | 3 ++- src/parallax/launch.py | 2 +- src/scheduling/node.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 761100a2..3c70de3f 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -1,6 +1,7 @@ import json -from pathlib import Path import logging +from pathlib import Path + from scheduling.model_info import ModelInfo # Supported model list diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 3795dccf..736118fb 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -18,6 +18,7 @@ import tempfile import threading +from backend.server.static_config import MLX_MODEL_NAME_MAP from common.version_check import check_latest_release from parallax.p2p.server import ServerState, launch_p2p_server from parallax.server.executor import Executor @@ -26,7 +27,6 @@ from parallax.utils.utils import get_current_device from parallax_utils.ascii_anime import display_parallax_join from parallax_utils.logging_config import get_logger, set_log_level -from backend.server.static_config import MLX_MODEL_NAME_MAP logger = get_logger("parallax.launch") diff --git a/src/scheduling/node.py b/src/scheduling/node.py index f0ea4a6c..6f4882ea 100644 --- a/src/scheduling/node.py +++ b/src/scheduling/node.py @@ -14,8 +14,8 @@ from math import floor from typing import Callable, Dict, List, Optional -from parallax_utils.logging_config import get_logger from parallax.utils.utils import get_current_device +from parallax_utils.logging_config import get_logger from parallax_utils.utils import bytes_per_element, compute_max_batch_size from scheduling.model_info import ModelInfo From 65aa8106a8e43a61dd68d96cbc1ecf0d280e461e Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 18:55:40 +0800 Subject: [PATCH 04/13] update device get way --- src/backend/server/rpc_connection_handler.py | 18 ++++++++++++------ src/backend/server/static_config.py | 4 +++- src/parallax/server/server_info.py | 4 ++++ src/scheduling/node.py | 4 ++-- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/backend/server/rpc_connection_handler.py b/src/backend/server/rpc_connection_handler.py index 0fee9922..6fc4ec4c 100644 --- a/src/backend/server/rpc_connection_handler.py +++ b/src/backend/server/rpc_connection_handler.py @@ -140,12 +140,18 @@ def get_layer_allocation(self, current_node_id): list_node_allocations = self.scheduler.list_node_allocations() for node_id, start_layer, end_layer in list_node_allocations: if current_node_id == node_id: - return { - "node_id": node_id, - "model_name": self.scheduler.model_info.model_name, - "start_layer": start_layer, - "end_layer": end_layer, - } + node = 
self.scheduler.get_node(node_id) + if node: + return { + "node_id": node_id, + "model_name": ( + node.model_info.model_name + if node.hardware.device != "mlx" + else node.model_info.mlx_model_name + ), + "start_layer": start_layer, + "end_layer": end_layer, + } return {} def build_node(self, node_json: dict): diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 3c70de3f..f2c10e18 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -107,8 +107,10 @@ def _load_config_only(name: str) -> dict: param_bytes_per_element = 0.5 mlx_param_bytes_per_element = param_bytes_per_element + mlx_model_name = model_name if model_name in MLX_MODEL_NAME_MAP: - mlx_config = _load_config_only(MLX_MODEL_NAME_MAP[model_name]) + mlx_model_name = MLX_MODEL_NAME_MAP[model_name] + mlx_config = _load_config_only(mlx_model_name) mlx_quant_dict = mlx_config.get("quantization_config", None) if "bits" in mlx_quant_dict: mlx_param_bytes_per_element = mlx_quant_dict["bits"] / 8 diff --git a/src/parallax/server/server_info.py b/src/parallax/server/server_info.py index 4d056c42..e83d675e 100644 --- a/src/parallax/server/server_info.py +++ b/src/parallax/server/server_info.py @@ -182,6 +182,7 @@ def detect_node_hardware(node_id: Optional[str]) -> Dict[str, Any]: "gpu_name": "Unknown", "memory_gb": 16.0, "memory_bandwidth_gbps": 100.0, + "device": "Unknown", } if isinstance(hw, NvidiaHardwareInfo): @@ -191,6 +192,7 @@ def detect_node_hardware(node_id: Optional[str]) -> Dict[str, Any]: "gpu_name": hw.chip, "memory_gb": hw.vram_gb, "memory_bandwidth_gbps": hw.memory_bandwidth_gbps, + "device": "cuda", } if isinstance(hw, AppleSiliconHardwareInfo): # Use unified memory size as memory_gb; bandwidth rough estimate per family @@ -201,6 +203,7 @@ def detect_node_hardware(node_id: Optional[str]) -> Dict[str, Any]: "gpu_name": hw.chip, "memory_gb": hw.total_ram_gb, "memory_bandwidth_gbps": est_bandwidth, + "device": "mlx", } # Generic fallback return { @@ -209,6 +212,7 @@ def detect_node_hardware(node_id: Optional[str]) -> Dict[str, Any]: "gpu_name": "Unknown", "memory_gb": 16.0, "memory_bandwidth_gbps": 100.0, + "device": "Unknown", } diff --git a/src/scheduling/node.py b/src/scheduling/node.py index 6f4882ea..d33276c7 100644 --- a/src/scheduling/node.py +++ b/src/scheduling/node.py @@ -14,7 +14,6 @@ from math import floor from typing import Callable, Dict, List, Optional -from parallax.utils.utils import get_current_device from parallax_utils.logging_config import get_logger from parallax_utils.utils import bytes_per_element, compute_max_batch_size from scheduling.model_info import ModelInfo @@ -36,6 +35,7 @@ class NodeHardwareInfo: gpu_name: str memory_gb: float memory_bandwidth_gbps: float + device: str @dataclass @@ -295,7 +295,7 @@ def get_decoder_layer_capacity( available_memory_bytes, self.model_info.decoder_layer_io_bytes(roofline=False), ) - if get_current_device() == "mlx": + if self.hardware.device == "mlx": # For mlx, consider mlx bit factor return floor( available_memory_bytes From 83c99ac94283ca8f9280e5175eadc41bdcb8e918 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 18:56:51 +0800 Subject: [PATCH 05/13] update --- src/scheduling/model_info.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py index 1736aecf..bd16da69 100644 --- a/src/scheduling/model_info.py +++ b/src/scheduling/model_info.py @@ -23,6 +23,7 @@ class ModelInfo: """ model_name: str + mlx_model_name: 
str = None head_size: int hidden_dim: int intermediate_dim: int From 5d9ab45f84bb3e167457824ceb658100d9cb7c5e Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 19:04:03 +0800 Subject: [PATCH 06/13] update --- src/scheduling/model_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py index bd16da69..b79d9e45 100644 --- a/src/scheduling/model_info.py +++ b/src/scheduling/model_info.py @@ -23,7 +23,7 @@ class ModelInfo: """ model_name: str - mlx_model_name: str = None + mlx_model_name: str head_size: int hidden_dim: int intermediate_dim: int From 8b548c2ddfed8cc30f2f0d68c47dfb49c56c2684 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 19:18:39 +0800 Subject: [PATCH 07/13] update --- src/backend/server/rpc_connection_handler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/backend/server/rpc_connection_handler.py b/src/backend/server/rpc_connection_handler.py index 6fc4ec4c..234652c6 100644 --- a/src/backend/server/rpc_connection_handler.py +++ b/src/backend/server/rpc_connection_handler.py @@ -183,10 +183,12 @@ def build_hardware(self, hardware_json): gpu_name = hardware_json.get("gpu_name") memory_gb = hardware_json.get("memory_gb") memory_bandwidth_gbps = hardware_json.get("memory_bandwidth_gbps") + device = hardware_json.get("device") return NodeHardwareInfo( node_id=node_id, tflops_fp16=tflops_fp16, gpu_name=gpu_name, memory_gb=memory_gb, memory_bandwidth_gbps=memory_bandwidth_gbps, + device=device, ) From d7f6c33c2b12ea1b934ef31cf34481712f269229 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 19:24:11 +0800 Subject: [PATCH 08/13] update --- src/backend/server/rpc_connection_handler.py | 2 +- src/backend/server/static_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/backend/server/rpc_connection_handler.py b/src/backend/server/rpc_connection_handler.py index 234652c6..0288f26c 100644 --- a/src/backend/server/rpc_connection_handler.py +++ b/src/backend/server/rpc_connection_handler.py @@ -140,7 +140,7 @@ def get_layer_allocation(self, current_node_id): list_node_allocations = self.scheduler.list_node_allocations() for node_id, start_layer, end_layer in list_node_allocations: if current_node_id == node_id: - node = self.scheduler.get_node(node_id) + node = self.scheduler.node_id_to_node.get(node_id) if node: return { "node_id": node_id, diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index f2c10e18..d8934c92 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -124,6 +124,7 @@ def _load_config_only(name: str) -> dict: model_info = ModelInfo( model_name=model_name, + mlx_model_name=mlx_model_name, head_size=config.get("head_dim", 128), qk_nope_head_dim=config.get("qk_nope_head_dim", None), qk_rope_head_dim=config.get("qk_rope_head_dim", None), From 1dcf6025821f8b768fb0c008b4fc84396a69b292 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Wed, 29 Oct 2025 19:28:15 +0800 Subject: [PATCH 09/13] update test --- .../scheduler_tests/test_layer_allocation.py | 8 ++--- tests/scheduler_tests/test_scheduler.py | 1 + tests/scheduler_tests/test_utils.py | 29 ++++++++++++++++--- 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/tests/scheduler_tests/test_layer_allocation.py b/tests/scheduler_tests/test_layer_allocation.py index 780a0de8..972950f0 100644 --- a/tests/scheduler_tests/test_layer_allocation.py +++ 
b/tests/scheduler_tests/test_layer_allocation.py @@ -26,10 +26,10 @@ def _build_node(gpu_type: str, model: ModelInfo, id_suffix: str = "") -> Node: hw_map = { - "a100-80g": NodeHardwareInfo("a100-80g" + id_suffix, 312.0, "", 80.0, 2039.0), - "a100-40g": NodeHardwareInfo("a100-40g" + id_suffix, 312.0, "", 40.0, 1935.0), - "rtx5090": NodeHardwareInfo("rtx5090" + id_suffix, 165, "", 32.0, 1792.0), - "rtx4090": NodeHardwareInfo("rtx4090" + id_suffix, 82.6, "", 24.0, 1008.0), + "a100-80g": NodeHardwareInfo("a100-80g" + id_suffix, 312.0, "", 80.0, 2039.0, "cuda"), + "a100-40g": NodeHardwareInfo("a100-40g" + id_suffix, 312.0, "", 40.0, 1935.0, "cuda"), + "rtx5090": NodeHardwareInfo("rtx5090" + id_suffix, 165, "", 32.0, 1792.0, "cuda"), + "rtx4090": NodeHardwareInfo("rtx4090" + id_suffix, 82.6, "", 24.0, 1008.0, "cuda"), } hw = hw_map[gpu_type] return Node(node_id=hw.node_id, hardware=hw, model_info=model) diff --git a/tests/scheduler_tests/test_scheduler.py b/tests/scheduler_tests/test_scheduler.py index 2c8f6083..e8e4ff53 100644 --- a/tests/scheduler_tests/test_scheduler.py +++ b/tests/scheduler_tests/test_scheduler.py @@ -18,6 +18,7 @@ def _build_node(node_id: str, model: ModelInfo, *, tflops: float, mem_gb: float) gpu_name="", memory_gb=mem_gb, memory_bandwidth_gbps=1000.0, + device="cuda", ) n = Node(node_id=node_id, hardware=hw, model_info=model) # Ensure latency estimation uses a defined speedup diff --git a/tests/scheduler_tests/test_utils.py b/tests/scheduler_tests/test_utils.py index d888b88c..b663aa88 100644 --- a/tests/scheduler_tests/test_utils.py +++ b/tests/scheduler_tests/test_utils.py @@ -11,16 +11,36 @@ from scheduling.node import Node, NodeHardwareInfo A100_80G = NodeHardwareInfo( - node_id="a100-80g", tflops_fp16=312.0, gpu_name="", memory_gb=80.0, memory_bandwidth_gbps=2039 + node_id="a100-80g", + tflops_fp16=312.0, + gpu_name="", + memory_gb=80.0, + memory_bandwidth_gbps=2039, + device="cuda", ) A100_40G = NodeHardwareInfo( - node_id="a100-40g", tflops_fp16=312.0, gpu_name="", memory_gb=40.0, memory_bandwidth_gbps=1935 + node_id="a100-40g", + tflops_fp16=312.0, + gpu_name="", + memory_gb=40.0, + memory_bandwidth_gbps=1935, + device="cuda", ) RTX5090 = NodeHardwareInfo( - node_id="rtx5090", tflops_fp16=104.8, gpu_name="", memory_gb=32.0, memory_bandwidth_gbps=1792 + node_id="rtx5090", + tflops_fp16=104.8, + gpu_name="", + memory_gb=32.0, + memory_bandwidth_gbps=1792, + device="cuda", ) RTX4090 = NodeHardwareInfo( - node_id="rtx4090", tflops_fp16=82.6, gpu_name="", memory_gb=24.0, memory_bandwidth_gbps=1008 + node_id="rtx4090", + tflops_fp16=82.6, + gpu_name="", + memory_gb=24.0, + memory_bandwidth_gbps=1008, + device="cuda", ) @@ -60,6 +80,7 @@ def build_node( gpu_name="", memory_gb=mem_gb, memory_bandwidth_gbps=mem_bandwidth_gbps, + device="cuda", ) n = Node(node_id=node_id, hardware=hw, model_info=model, _force_max_concurrent_requests=True) # Attach coordinates for RTT synthesis in tests From a3b1538b2b39493ce70c382806a15991f5f42b5f Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Thu, 30 Oct 2025 13:34:44 +0800 Subject: [PATCH 10/13] update --- tests/scheduler_tests/test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/scheduler_tests/test_utils.py b/tests/scheduler_tests/test_utils.py index b663aa88..47ffcfe1 100644 --- a/tests/scheduler_tests/test_utils.py +++ b/tests/scheduler_tests/test_utils.py @@ -48,6 +48,7 @@ def build_model_info(num_layers: int) -> ModelInfo: """Build a model config used across tests (matches allocation tests).""" return 
ModelInfo( model_name=f"GPUOss-{num_layers}L", + mlx_model_name=f"MLXOss-{num_layers}L", head_size=64, hidden_dim=2880, intermediate_dim=2880, @@ -59,6 +60,7 @@ def build_model_info(num_layers: int) -> ModelInfo: num_local_experts=128, num_experts_per_tok=4, param_bytes_per_element=1, + mlx_param_bytes_per_element=1, cache_bytes_per_element=2, embedding_bytes_per_element=2, ) From f405c9b57ebbda9dc0969739d64ca72a52420837 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Thu, 30 Oct 2025 14:28:55 +0800 Subject: [PATCH 11/13] update --- src/backend/server/static_config.py | 103 ++++++++++++---------------- src/parallax/launch.py | 14 +--- 2 files changed, 48 insertions(+), 69 deletions(-) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index d8934c92..ff01b309 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -4,68 +4,55 @@ from scheduling.model_info import ModelInfo -# Supported model list -MODEL_LIST = [ - "Qwen/Qwen3-0.6B", - "openai/gpt-oss-20b", - "openai/gpt-oss-120b", - "moonshotai/Kimi-K2-Instruct", - "moonshotai/Kimi-K2-Instruct-0905", - "Qwen/Qwen3-Next-80B-A3B-Instruct", - "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8", - "Qwen/Qwen3-Next-80B-A3B-Thinking", - "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8", - "Qwen/Qwen3-0.6B-FP8", - "Qwen/Qwen3-1.7B", - "Qwen/Qwen3-1.7B-FP8", - "Qwen/Qwen3-4B", - "Qwen/Qwen3-4B-FP8", - "Qwen/Qwen3-4B-Instruct-2507", - "Qwen/Qwen3-4B-Instruct-2507-FP8", - "Qwen/Qwen3-4B-Thinking-2507", - "Qwen/Qwen3-4B-Thinking-2507-FP8", - "Qwen/Qwen3-8B", - "Qwen/Qwen3-8B-FP8", - "Qwen/Qwen3-14B", - "Qwen/Qwen3-14B-FP8", - "Qwen/Qwen3-32B", - "Qwen/Qwen3-32B-FP8", - "Qwen/Qwen3-30B-A3B", - "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", - "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", - "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8", - "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8", - "Qwen/Qwen3-235B-A22B-GPTQ-Int4", - "Qwen/Qwen2.5-0.5B-Instruct", - "Qwen/Qwen2.5-1.5B-Instruct", - "Qwen/Qwen2.5-3B-Instruct", - "Qwen/Qwen2.5-7B-Instruct", - "Qwen/Qwen2.5-14B-Instruct", - "Qwen/Qwen2.5-32B-Instruct", - "Qwen/Qwen2.5-72B-Instruct", - "nvidia/Llama-3.3-70B-Instruct-FP8", - "nvidia/Llama-3.1-70B-Instruct-FP8", - "nvidia/Llama-3.1-8B-Instruct-FP8", - "deepseek-ai/DeepSeek-V3.1", - "deepseek-ai/DeepSeek-R1", - "deepseek-ai/DeepSeek-V3", - "deepseek-ai/DeepSeek-V2", - "MiniMaxAI/MiniMax-M2", - "zai-org/GLM-4.6", -] - -MLX_MODEL_NAME_MAP = { +# Supported model list - key: model name, value: MLX model name (same as key if no MLX variant) +MODELS = { + "Qwen/Qwen3-0.6B": "Qwen/Qwen3-0.6B", "openai/gpt-oss-20b": "mlx-community/gpt-oss-20b-MXFP4-Q8", "openai/gpt-oss-120b": "mlx-community/gpt-oss-120b-4bit", + "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", + "moonshotai/Kimi-K2-Instruct-0905": "mlx-community/Kimi-K2-Instruct-0905-mlx-DQ3_K_M", + "Qwen/Qwen3-Next-80B-A3B-Instruct": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit", "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit", + "Qwen/Qwen3-Next-80B-A3B-Thinking": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", + "Qwen/Qwen3-0.6B-FP8": "Qwen/Qwen3-0.6B", + "Qwen/Qwen3-1.7B": "Qwen/Qwen3-1.7B", + "Qwen/Qwen3-1.7B-FP8": "Qwen/Qwen3-1.7B", + "Qwen/Qwen3-4B": "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B-FP8": "Qwen/Qwen3-4B", + "Qwen/Qwen3-4B-Instruct-2507": "Qwen/Qwen3-4B-Instruct-2507", + "Qwen/Qwen3-4B-Instruct-2507-FP8": 
"Qwen/Qwen3-4B-Instruct-2507-FP8", + "Qwen/Qwen3-4B-Thinking-2507": "Qwen/Qwen3-4B-Thinking-2507", + "Qwen/Qwen3-4B-Thinking-2507-FP8": "Qwen/Qwen3-4B-Thinking-2507-FP8", + "Qwen/Qwen3-8B": "Qwen/Qwen3-8B", + "Qwen/Qwen3-8B-FP8": "Qwen/Qwen3-8B-FP8", + "Qwen/Qwen3-14B": "Qwen/Qwen3-14B", + "Qwen/Qwen3-14B-FP8": "Qwen/Qwen3-14B-FP8", + "Qwen/Qwen3-32B": "Qwen/Qwen3-32B", + "Qwen/Qwen3-32B-FP8": "Qwen/Qwen3-32B-FP8", + "Qwen/Qwen3-30B-A3B": "Qwen/Qwen3-30B-A3B", + "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8": "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8", + "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8": "Qwen/Qwen3-30B-A3B-Thinking-2507-FP8", "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", "Qwen/Qwen3-235B-A22B-GPTQ-Int4": "mlx-community/Qwen3-235B-A22B-4bit", - "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", + "Qwen/Qwen2.5-0.5B-Instruct": "Qwen/Qwen2.5-0.5B-Instruct", + "Qwen/Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct", + "Qwen/Qwen2.5-3B-Instruct": "Qwen/Qwen2.5-3B-Instruct", + "Qwen/Qwen2.5-7B-Instruct": "Qwen/Qwen2.5-7B-Instruct", + "Qwen/Qwen2.5-14B-Instruct": "Qwen/Qwen2.5-14B-Instruct", + "Qwen/Qwen2.5-32B-Instruct": "Qwen/Qwen2.5-32B-Instruct", + "Qwen/Qwen2.5-72B-Instruct": "Qwen/Qwen2.5-72B-Instruct", + "nvidia/Llama-3.3-70B-Instruct-FP8": "nvidia/Llama-3.3-70B-Instruct-FP8", + "nvidia/Llama-3.1-70B-Instruct-FP8": "nvidia/Llama-3.1-70B-Instruct-FP8", + "nvidia/Llama-3.1-8B-Instruct-FP8": "nvidia/Llama-3.1-8B-Instruct-FP8", + "deepseek-ai/DeepSeek-V3.1": "deepseek-ai/DeepSeek-V3.1", + "deepseek-ai/DeepSeek-R1": "deepseek-ai/DeepSeek-R1", + "deepseek-ai/DeepSeek-V3": "deepseek-ai/DeepSeek-V3", + "deepseek-ai/DeepSeek-V2": "deepseek-ai/DeepSeek-V2", "MiniMaxAI/MiniMax-M2": "mlx-community/MiniMax-M2-4bit", "zai-org/GLM-4.6": "mlx-community/GLM-4.6-4bit", + "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", } logger = logging.getLogger(__name__) @@ -107,12 +94,12 @@ def _load_config_only(name: str) -> dict: param_bytes_per_element = 0.5 mlx_param_bytes_per_element = param_bytes_per_element - mlx_model_name = model_name - if model_name in MLX_MODEL_NAME_MAP: - mlx_model_name = MLX_MODEL_NAME_MAP[model_name] + mlx_model_name = MODELS.get(model_name, model_name) + + if mlx_model_name != model_name: mlx_config = _load_config_only(mlx_model_name) mlx_quant_dict = mlx_config.get("quantization_config", None) - if "bits" in mlx_quant_dict: + if mlx_quant_dict and "bits" in mlx_quant_dict: mlx_param_bytes_per_element = mlx_quant_dict["bits"] / 8 # get local experts @@ -147,7 +134,7 @@ def _load_config_only(name: str) -> dict: def get_model_list(): - return MODEL_LIST + return list(MODELS.keys()) def get_node_join_command(scheduler_addr, is_local_network): diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 736118fb..49dd41ad 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -18,7 +18,6 @@ import tempfile import threading -from backend.server.static_config import MLX_MODEL_NAME_MAP from common.version_check import check_latest_release from parallax.p2p.server import ServerState, launch_p2p_server from parallax.server.executor import Executor @@ -52,11 +51,7 @@ logger.debug(f"executor_input_addr: {args.executor_input_ipc}") 
logger.debug(f"executor_output_addr: {args.executor_output_ipc}") # Hard code for mlx-community models - if get_current_device() == "mlx": - mlx_model_repo = MLX_MODEL_NAME_MAP.get(args.model_path, None) - if mlx_model_repo is not None: - args.model_path = mlx_model_repo - logger.debug(f"Replace mlx model path: {mlx_model_repo}") + logger.debug(f"self.model_path before mlx check: {args.model_path}") if args.scheduler_addr is None: if args.log_level != "DEBUG": display_parallax_join(args.model_path) @@ -109,11 +104,8 @@ args.end_layer = gradient_server.block_end_index args.model_path = gradient_server.model_name # Hard code for mlx-community models - if get_current_device() == "mlx": - mlx_model_repo = MLX_MODEL_NAME_MAP.get(args.model_path, None) - if mlx_model_repo is not None: - args.model_path = mlx_model_repo - logger.debug(f"Replace mlx model path: {mlx_model_repo}") + + logger.debug(f"self.model_path after mlx check: {args.model_path}") logger.debug( f"Start Executor with start_layer: {args.start_layer}, end_layer: {args.end_layer}" ) From 9f63d43509f5fa8b1785ef86e0fa57ca829b3f26 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Thu, 30 Oct 2025 14:31:06 +0800 Subject: [PATCH 12/13] update --- src/parallax/launch.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 49dd41ad..94b0a001 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -23,7 +23,6 @@ from parallax.server.executor import Executor from parallax.server.http_server import launch_http_server from parallax.server.server_args import parse_args -from parallax.utils.utils import get_current_device from parallax_utils.ascii_anime import display_parallax_join from parallax_utils.logging_config import get_logger, set_log_level From 44829d533c77bb080814132bc5138eee0ab639c3 Mon Sep 17 00:00:00 2001 From: yuhao_zhang Date: Fri, 31 Oct 2025 10:58:58 +0800 Subject: [PATCH 13/13] update --- src/backend/server/static_config.py | 1 - src/parallax/launch.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index ff01b309..5974e221 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -52,7 +52,6 @@ "deepseek-ai/DeepSeek-V2": "deepseek-ai/DeepSeek-V2", "MiniMaxAI/MiniMax-M2": "mlx-community/MiniMax-M2-4bit", "zai-org/GLM-4.6": "mlx-community/GLM-4.6-4bit", - "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", } logger = logging.getLogger(__name__) diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 94b0a001..be706465 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -49,8 +49,6 @@ logger.debug(f"executor_input_addr: {args.executor_input_ipc}") logger.debug(f"executor_output_addr: {args.executor_output_ipc}") - # Hard code for mlx-community models - logger.debug(f"self.model_path before mlx check: {args.model_path}") if args.scheduler_addr is None: if args.log_level != "DEBUG": display_parallax_join(args.model_path) @@ -103,8 +101,6 @@ args.end_layer = gradient_server.block_end_index args.model_path = gradient_server.model_name # Hard code for mlx-community models - - logger.debug(f"self.model_path after mlx check: {args.model_path}") logger.debug( f"Start Executor with start_layer: {args.start_layer}, end_layer: {args.end_layer}" )