From da5dafd6b0b84f3e5dda3785b01dd1f7bb63e1ce Mon Sep 17 00:00:00 2001
From: TianyiZhao1437
Date: Thu, 9 Oct 2025 14:17:49 +0800
Subject: [PATCH 1/8] fix:layer allocator parameter size calculation

---
 src/backend/server/static_config.py |  7 ++++++-
 src/scheduling/layer_allocation.py  |  7 +++++++
 src/scheduling/model_info.py        | 19 +++++++++++++++++--
 src/scheduling/node.py              |  5 +++++
 4 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index 64230299..e7267368 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -60,6 +60,11 @@ def get_model_info(model_name):
     elif quant_method in ("mxfp4", "int4", "awq", "gptq"):
         param_bytes_per_element = 0.5
 
+    # get local experts
+    num_local_experts = config.get("num_local_experts", None)
+    if num_local_experts is None:
+        num_local_experts = num_local_experts = config.get("num_experts", None)
+
     model_info = ModelInfo(
         model_name=model_name,
         head_size=config.get("head_dim", 128),
@@ -75,7 +80,7 @@ def get_model_info(model_name):
         param_bytes_per_element=param_bytes_per_element,
         cache_bytes_per_element=2,
         embedding_bytes_per_element=2,
-        num_local_experts=config.get("num_experts", None),
+        num_local_experts=num_local_experts,
         num_experts_per_tok=config.get("num_experts_per_tok", None),
     )
     return model_info
diff --git a/src/scheduling/layer_allocation.py b/src/scheduling/layer_allocation.py
index b6850170..cd73edd1 100644
--- a/src/scheduling/layer_allocation.py
+++ b/src/scheduling/layer_allocation.py
@@ -822,6 +822,13 @@ def global_allocation(self) -> bool:
                 total_cap,
             )
             return False
+        else:
+            logger.debug(
+                "[DP] Sufficient resources: nodes=%d, layers=%d, total_cap=%d",
+                num_nodes,
+                num_layers,
+                total_cap,
+            )
         # used for pruning
         suffix_sum = [0] * (num_nodes + 1)
         for i in range(num_nodes - 1, -1, -1):
diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py
index bd49e060..096b6930 100644
--- a/src/scheduling/model_info.py
+++ b/src/scheduling/model_info.py
@@ -10,6 +10,10 @@
 from dataclasses import dataclass
 from typing import Optional
 
+from parallax_utils.logging_config import get_logger
+
+logger = get_logger(__name__)
+
 
 @dataclass
 class ModelInfo:
@@ -50,6 +54,11 @@ def __init__(self, **kwargs):
             self.head_size_k = self.head_size
             self.head_size_v = self.head_size
 
+    @property
+    def q_dim(self) -> int:
+        """Return query head dim."""
+        return self.num_attention_heads * self.head_size
+
     @property
     def v_dim(self) -> int:
         """Return key and value head dim."""
@@ -143,8 +152,8 @@ def decoder_layer_io_bytes(
             source_seq_len: Source sequence length (prompt tokens)
         """
         # Attention params
-        qo_params = self.param_bytes_per_element * self.hidden_dim * self.hidden_dim
-        kv_params = self.param_bytes_per_element * self.hidden_dim * (self.k_dim + self.v_dim) // 2
+        qo_params = self.param_bytes_per_element * self.hidden_dim * self.q_dim * 2
+        kv_params = self.param_bytes_per_element * self.hidden_dim * (self.k_dim + self.v_dim)
         attention_params = qo_params + kv_params
 
         # FFN params
@@ -168,6 +177,12 @@ def decoder_layer_io_bytes(
             ffn_params *= self.num_local_experts
 
         kv_cache_size = 0
+        logger.debug(
+            "Model Info ffn_params=%d, kv_cache_size=%d, attention_params=%d",
+            ffn_params,
+            kv_cache_size,
+            attention_params,
+        )
         return round(ffn_params + kv_cache_size + attention_params)
 
     def lm_head_flops(self, target_seq_len: int = 1) -> int:
diff --git a/src/scheduling/node.py b/src/scheduling/node.py
index 6e40b7eb..5824ddae 100644
--- a/src/scheduling/node.py
+++ b/src/scheduling/node.py
@@ -289,6 +289,11 @@ def get_decoder_layer_capacity(
         if not (include_input_embed and self.model_info.tie_embedding):
             available_memory_bytes -= self.model_info.embedding_io_bytes
 
+        logger.debug(
+            "Node available_memory_bytes=%d, decoder_layer_io_bytes=%d",
+            available_memory_bytes,
+            self.model_info.decoder_layer_io_bytes(roofline=False),
+        )
         return floor(
             available_memory_bytes / self.model_info.decoder_layer_io_bytes(roofline=False)
         )
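The sizing fix above boils down to counting the Q/O projections against the model's real query width and the K/V projections against the full key/value width. A minimal standalone sketch of that arithmetic follows; the dimensions are hypothetical placeholders, not values from any shipped config:

# Sketch of the corrected attention parameter-byte estimate from PATCH 1/8.
# All dimensions below are illustrative only.
param_bytes_per_element = 1                      # int8 weights
hidden_dim = 2048
num_attention_heads = 32
num_key_value_heads = 8                          # GQA: fewer KV heads than query heads
head_size = 128

q_dim = num_attention_heads * head_size          # 4096; may differ from hidden_dim
k_dim = v_dim = num_key_value_heads * head_size  # 1024 each

qo_params = param_bytes_per_element * hidden_dim * q_dim * 2         # Q and O projections
kv_params = param_bytes_per_element * hidden_dim * (k_dim + v_dim)   # K and V projections
attention_params = qo_params + kv_params
# The replaced code used hidden_dim * hidden_dim for Q/O and halved the K/V term,
# which miscounts models whose query width differs from hidden_dim.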
From 91b847676ff2633c1b37976f329469ec41a2c64b Mon Sep 17 00:00:00 2001
From: TianyiZhao1437
Date: Thu, 9 Oct 2025 15:59:11 +0800
Subject: [PATCH 2/8] update layer allocation tests

---
 tests/scheduler_tests/test_layer_allocation.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tests/scheduler_tests/test_layer_allocation.py b/tests/scheduler_tests/test_layer_allocation.py
index 14539f44..780a0de8 100644
--- a/tests/scheduler_tests/test_layer_allocation.py
+++ b/tests/scheduler_tests/test_layer_allocation.py
@@ -56,12 +56,12 @@ def test_capacity_sanity_check():
     "num_layers,gpu_types,expected_layers",
     [
         (21, ["a100-80g", "rtx5090", "rtx4090"], [13, 5, 3]),
-        (15, ["a100-80g", "rtx5090"], [10, 5]),
+        (15, ["a100-80g", "rtx5090"], [11, 4]),
         # (20 * 312 : 20 * 165 : 20 * 82.6) / 559.6 = 11.1 : 5.8 : 2.9 -> 12 : 5 : 3
         (20, ["a100-80g", "rtx5090", "rtx4090"], [12, 5, 3]),
         (25, ["a100-80g", "rtx5090", "rtx4090", "rtx4090"], [13, 5, 4, 3]),
         (29, ["rtx4090", "a100-80g", "rtx5090", "rtx5090", "rtx4090"], [3, 13, 5, 5, 3]),
-        (9, ["rtx5090", "rtx5090"], [5, 4]),
+        (8, ["rtx5090", "rtx5090"], [4, 4]),
         (7, ["a100-40g", "rtx5090"], [5, 2]),
     ],
 )
@@ -155,25 +155,25 @@ def _test_gap_patch_rebalance(allocator: BaseLayerAllocator):
             ],
             "dp",
         ),
-        # 14 Layers, capacity (13, 5, 5, 3, 3) -> greedy assigns (9, 5)
+        # 14 Layers, capacity (13, 5, 5, 3, 3) -> greedy assigns (10, 4)
         (
             14,
             (1, 0, 2, 2),
             [
-                (0, 9),
-                (9, 14),
+                (0, 10),
+                (10, 14),
             ],
             "greedy",
         ),
-        # 7 Layers, capacity (6, 5, 5, 3, 3) -> greedy assigns (5, 2, 5, 2)
+        # 7 Layers, capacity (6, 5, 5, 3, 3) -> greedy assigns (5, 2, 4, 3)
         (
             7,
             (0, 1, 2, 2),
             [
                 (0, 5),
                 (5, 7),
-                (0, 5),
-                (5, 7),
+                (0, 4),
+                (4, 7),
             ],
             "greedy",
         ),
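The revised expectations follow the proportional split already spelled out in the in-test comment for the 20-layer case. Below is a short sketch that only reproduces that comment's arithmetic, assuming 312 / 165 / 82.6 are per-GPU TFLOPS weights; the allocator's real rounding and capacity handling live in layer_allocation.py:

# Reproduces the proportional split from the test comment
# "(20 * 312 : 20 * 165 : 20 * 82.6) / 559.6 = 11.1 : 5.8 : 2.9 -> 12 : 5 : 3".
weights = {"a100-80g": 312.0, "rtx5090": 165.0, "rtx4090": 82.6}
gpus = ["a100-80g", "rtx5090", "rtx4090"]
num_layers = 20

total = sum(weights[g] for g in gpus)                  # 559.6
raw = [num_layers * weights[g] / total for g in gpus]  # ~[11.15, 5.90, 2.95]
print(raw)  # the allocator then settles on [12, 5, 3] so all 20 layers are covered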
From d832d48c317448953e142ae332ac3b5fef370884 Mon Sep 17 00:00:00 2001
From: TianyiZhao1437
Date: Fri, 10 Oct 2025 03:52:04 +0000
Subject: [PATCH 3/8] fix(scheduler): moe intermediate size for ffn param calculation

---
 src/backend/server/static_config.py | 5 ++++-
 src/scheduling/model_info.py        | 7 ++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index e7267368..0c33f4da 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -11,7 +11,9 @@
     "openai/gpt-oss-120b",
     "moonshotai/Kimi-K2-Instruct",
     "moonshotai/Kimi-K2-Instruct-0905",
-    "Qwen/Qwen3-Next-80B-A3B-Instruct",
+    "
+    "
+    "",
     "Qwen/Qwen3-Next-80B-A3B-Thinking",
     # "Qwen/Qwen3-8B",
     # "Qwen/Qwen3-8B-FP8",
@@ -82,6 +84,7 @@ def get_model_info(model_name):
         embedding_bytes_per_element=2,
         num_local_experts=num_local_experts,
         num_experts_per_tok=config.get("num_experts_per_tok", None),
+        moe_intermediate_dim=config.get("moe_intermediate_size", None),
     )
     return model_info
 
diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py
index 096b6930..bef7894a 100644
--- a/src/scheduling/model_info.py
+++ b/src/scheduling/model_info.py
@@ -33,6 +33,7 @@ class ModelInfo:
     ffn_num_projections: int = 3
     num_local_experts: Optional[int] = None
     num_experts_per_tok: Optional[int] = None
+    moe_intermediate_dim: Optional[int] = None
     tie_embedding: bool = False
     # Default int8
     param_bytes_per_element: float = 1
@@ -161,8 +162,12 @@ def decoder_layer_io_bytes(
             self.param_bytes_per_element
             * self.ffn_num_projections
             * self.hidden_dim
-            * self.intermediate_dim
         )
+        if self.moe_intermediate_dim is not None:
+            ffn_params *= self.moe_intermediate_dim
+        else:
+            ffn_params *= self.intermediate_dim
+
         if roofline:
             expected_experts = self.expected_num_activated_experts(
                 batch_size=batch_size, target_seq_len=target_seq_len

From d8b818dfcce07a74d198b5506cae757e4ec5e137 Mon Sep 17 00:00:00 2001
From: TianyiZhao1437
Date: Fri, 10 Oct 2025 03:53:53 +0000
Subject: [PATCH 4/8] update

---
 src/backend/server/static_config.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index 0c33f4da..daab4c98 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -11,9 +11,7 @@
     "openai/gpt-oss-120b",
     "moonshotai/Kimi-K2-Instruct",
     "moonshotai/Kimi-K2-Instruct-0905",
-    "
-    "
-    "",
+    "Qwen/Qwen3-Next-80B-A3B-Instruct",
     "Qwen/Qwen3-Next-80B-A3B-Thinking",
     # "Qwen/Qwen3-8B",
     # "Qwen/Qwen3-8B-FP8",

From ab9b871db86f7abb63ed54b4e9e195871238d7d6 Mon Sep 17 00:00:00 2001
From: TianyiZhao1437
Date: Fri, 10 Oct 2025 11:55:46 +0800
Subject: [PATCH 5/8] update

---
 src/scheduling/model_info.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py
index bef7894a..5d6cd9f2 100644
--- a/src/scheduling/model_info.py
+++ b/src/scheduling/model_info.py
@@ -158,11 +158,7 @@ def decoder_layer_io_bytes(
         attention_params = qo_params + kv_params
 
         # FFN params
-        ffn_params = (
-            self.param_bytes_per_element
-            * self.ffn_num_projections
-            * self.hidden_dim
-        )
+        ffn_params = self.param_bytes_per_element * self.ffn_num_projections * self.hidden_dim
         if self.moe_intermediate_dim is not None:
             ffn_params *= self.moe_intermediate_dim
         else:
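With the two patches above, MoE layers are sized from the per-expert moe_intermediate_size when the config provides one, falling back to the dense intermediate_size otherwise. A standalone illustration with made-up dimensions (the real branch is the one added to ModelInfo.decoder_layer_io_bytes):

# Illustration of the per-layer FFN byte estimate after PATCH 3/8 and 5/8.
# All dimensions are hypothetical; a dense model would leave moe_intermediate_dim as None.
param_bytes_per_element = 1
ffn_num_projections = 3          # gate / up / down
hidden_dim = 2048
intermediate_dim = 5120          # dense FFN width
moe_intermediate_dim = 512       # per-expert FFN width
num_local_experts = 64

ffn_params = param_bytes_per_element * ffn_num_projections * hidden_dim
if moe_intermediate_dim is not None:
    ffn_params *= moe_intermediate_dim   # size one expert from the MoE width
else:
    ffn_params *= intermediate_dim       # dense fallback
ffn_params *= num_local_experts          # non-roofline path counts every expert's weights, as in PATCH 1/8
# Sizing experts with the dense intermediate_dim (the pre-patch behaviour) misstates
# per-layer memory whenever the two widths differ.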
"mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", } if __name__ == "__main__": From 756726cd866e0700145f90baa966833ba2f80ed9 Mon Sep 17 00:00:00 2001 From: TianyiZhao1437 Date: Fri, 10 Oct 2025 04:39:18 +0000 Subject: [PATCH 7/8] add mlx kimi-k2 model name map --- src/parallax/launch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/parallax/launch.py b/src/parallax/launch.py index c9582c1c..f8f36a07 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -34,6 +34,7 @@ "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit", "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit", "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit", + "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit", } if __name__ == "__main__": From 3a6c84dc303488da0eb2750c159d3e4d76421e5a Mon Sep 17 00:00:00 2001 From: TianyiZhao1437 Date: Fri, 10 Oct 2025 04:45:27 +0000 Subject: [PATCH 8/8] fix kimi experts num --- src/backend/server/static_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 3902670d..b03f833b 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -63,7 +63,9 @@ def get_model_info(model_name): # get local experts num_local_experts = config.get("num_local_experts", None) if num_local_experts is None: - num_local_experts = num_local_experts = config.get("num_experts", None) + num_local_experts = config.get("num_experts", None) + if num_local_experts is None: + num_local_experts = config.get("n_routed_experts", None) model_info = ModelInfo( model_name=model_name,