14 changes: 11 additions & 3 deletions src/backend/server/static_config.py
@@ -11,8 +11,8 @@
     "openai/gpt-oss-120b",
     "moonshotai/Kimi-K2-Instruct",
     "moonshotai/Kimi-K2-Instruct-0905",
-    "Qwen/Qwen3-Next-80B-A3B-Instruct",
-    "Qwen/Qwen3-Next-80B-A3B-Thinking",
+    "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8",
+    "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8",
     # "Qwen/Qwen3-8B",
     # "Qwen/Qwen3-8B-FP8",
     "Qwen/Qwen3-32B",
@@ -60,6 +60,13 @@ def get_model_info(model_name):
     elif quant_method in ("mxfp4", "int4", "awq", "gptq"):
         param_bytes_per_element = 0.5
 
+    # get local experts
+    num_local_experts = config.get("num_local_experts", None)
+    if num_local_experts is None:
+        num_local_experts = config.get("num_experts", None)
+    if num_local_experts is None:
+        num_local_experts = config.get("n_routed_experts", None)
+
     model_info = ModelInfo(
         model_name=model_name,
         head_size=config.get("head_dim", 128),
@@ -75,8 +82,9 @@ def get_model_info(model_name)
         param_bytes_per_element=param_bytes_per_element,
         cache_bytes_per_element=2,
         embedding_bytes_per_element=2,
-        num_local_experts=config.get("num_experts", None),
+        num_local_experts=num_local_experts,
         num_experts_per_tok=config.get("num_experts_per_tok", None),
+        moe_intermediate_dim=config.get("moe_intermediate_size", None),
     )
     return model_info
 
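Note: the fallback chain added above exists because Hugging Face configs expose the routed-expert count under different keys depending on the model family. A minimal standalone sketch of the same lookup (the helper name and the example configs below are illustrative, not part of this PR):

def _lookup_num_local_experts(config: dict):
    # Try the keys in the same order as the change above; dense models return None.
    for key in ("num_local_experts", "num_experts", "n_routed_experts"):
        if config.get(key) is not None:
            return config[key]
    return None

_lookup_num_local_experts({"num_experts": 512})       # -> 512
_lookup_num_local_experts({"n_routed_experts": 256})  # -> 256
_lookup_num_local_experts({"hidden_size": 4096})      # -> None (dense model)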
5 changes: 5 additions & 0 deletions src/parallax/launch.py
@@ -30,6 +30,11 @@
 MLX_MODEL_NAME_MAP = {
     "openai/gpt-oss-20b": "mlx-community/gpt-oss-20b-MXFP4-Q8",
     "openai/gpt-oss-120b": "mlx-community/gpt-oss-120b-4bit",
+    "Qwen/Qwen3-Next-80B-A3B-Instruct-FP8": "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit",
+    "Qwen/Qwen3-Next-80B-A3B-Thinking-FP8": "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit",
+    "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit",
+    "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": "mlx-community/Qwen3-235B-A22B-Thinking-2507-4bit",
+    "moonshotai/Kimi-K2-Instruct": "mlx-community/Kimi-K2-Instruct-4bit",
 }
 
 if __name__ == "__main__":
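The new entries map FP8 repos on Hugging Face to mlx-community quantized equivalents. Presumably this table is consulted at launch time on the MLX backend so the FP8 checkpoint is swapped for a locally runnable quantization; a minimal sketch of such a lookup, assuming the name falls through unchanged when no mapping exists (the helper is hypothetical, not code from this PR):

def resolve_mlx_model_name(model_name: str) -> str:
    # Use the registered mlx-community repo if present, otherwise keep the original name.
    return MLX_MODEL_NAME_MAP.get(model_name, model_name)

resolve_mlx_model_name("Qwen/Qwen3-Next-80B-A3B-Instruct-FP8")
# -> "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
resolve_mlx_model_name("Qwen/Qwen3-32B")  # -> "Qwen/Qwen3-32B" (no override registered)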
7 changes: 7 additions & 0 deletions src/scheduling/layer_allocation.py
@@ -822,6 +822,13 @@ def global_allocation(self) -> bool:
                 total_cap,
             )
             return False
+        else:
+            logger.debug(
+                "[DP] Sufficient resources: nodes=%d, layers=%d, total_cap=%d",
+                num_nodes,
+                num_layers,
+                total_cap,
+            )
         # used for pruning
         suffix_sum = [0] * (num_nodes + 1)
         for i in range(num_nodes - 1, -1, -1):
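The added branch only logs that capacity is sufficient; the pruning itself relies on the suffix_sum array started in the context lines, whose loop body the diff truncates. A self-contained sketch of that pruning idea with made-up capacities (the loop body and helper below are assumptions, not lines from this PR):

capacities = [13, 5, 5, 3]  # hypothetical per-node decoder-layer capacities
num_nodes = len(capacities)

# suffix_sum[i] = total capacity of nodes i..num_nodes-1
suffix_sum = [0] * (num_nodes + 1)
for i in range(num_nodes - 1, -1, -1):
    suffix_sum[i] = suffix_sum[i + 1] + capacities[i]

def can_still_place(node_idx: int, layers_left: int) -> bool:
    # Prune a partial allocation once the remaining nodes cannot hold the remaining layers.
    return layers_left <= suffix_sum[node_idx]

can_still_place(0, 14)  # -> True  (26 layers of capacity remain)
can_still_place(1, 14)  # -> False (nodes 1..3 hold at most 13 layers)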
32 changes: 24 additions & 8 deletions src/scheduling/model_info.py
@@ -10,6 +10,10 @@
 from dataclasses import dataclass
 from typing import Optional
 
+from parallax_utils.logging_config import get_logger
+
+logger = get_logger(__name__)
+
 
 @dataclass
 class ModelInfo:
@@ -29,6 +33,7 @@ class ModelInfo:
     ffn_num_projections: int = 3
     num_local_experts: Optional[int] = None
     num_experts_per_tok: Optional[int] = None
+    moe_intermediate_dim: Optional[int] = None
     tie_embedding: bool = False
     # Default int8
     param_bytes_per_element: float = 1
@@ -50,6 +55,11 @@ def __init__(self, **kwargs):
         self.head_size_k = self.head_size
         self.head_size_v = self.head_size
 
+    @property
+    def q_dim(self) -> int:
+        """Return query head dim."""
+        return self.num_attention_heads * self.head_size
+
     @property
     def v_dim(self) -> int:
         """Return key and value head dim."""
@@ -143,17 +153,17 @@ def decoder_layer_io_bytes(
             source_seq_len: Source sequence length (prompt tokens)
         """
         # Attention params
-        qo_params = self.param_bytes_per_element * self.hidden_dim * self.hidden_dim
-        kv_params = self.param_bytes_per_element * self.hidden_dim * (self.k_dim + self.v_dim) // 2
+        qo_params = self.param_bytes_per_element * self.hidden_dim * self.q_dim * 2
+        kv_params = self.param_bytes_per_element * self.hidden_dim * (self.k_dim + self.v_dim)
         attention_params = qo_params + kv_params
 
         # FFN params
-        ffn_params = (
-            self.param_bytes_per_element
-            * self.ffn_num_projections
-            * self.hidden_dim
-            * self.intermediate_dim
-        )
+        ffn_params = self.param_bytes_per_element * self.ffn_num_projections * self.hidden_dim
+        if self.moe_intermediate_dim is not None:
+            ffn_params *= self.moe_intermediate_dim
+        else:
+            ffn_params *= self.intermediate_dim
+
         if roofline:
             expected_experts = self.expected_num_activated_experts(
                 batch_size=batch_size, target_seq_len=target_seq_len
@@ -168,6 +178,12 @@
             ffn_params *= self.num_local_experts
             kv_cache_size = 0
 
+        logger.debug(
+            "Model Info ffn_params=%d, kv_cache_size=%d, attention_params=%d",
+            ffn_params,
+            kv_cache_size,
+            attention_params,
+        )
         return round(ffn_params + kv_cache_size + attention_params)
 
     def lm_head_flops(self, target_seq_len: int = 1) -> int:
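The reworked attention terms size the Q/O projections from q_dim * 2 rather than assuming hidden_dim equals num_heads * head_size, the K/V term drops the old halving, and MoE models now use moe_intermediate_size for the expert FFN. A worked example with made-up dimensions (none of these numbers come from a real config):

param_bytes = 1                                   # int8 weights
hidden_dim = 2048
num_heads, num_kv_heads, head_size = 32, 4, 128

q_dim = num_heads * head_size                     # 4096
kv_dim = num_kv_heads * head_size * 2             # k_dim + v_dim = 1024

qo_params = param_bytes * hidden_dim * q_dim * 2  # W_q plus W_o: ~16.8 MB
kv_params = param_bytes * hidden_dim * kv_dim     # W_k plus W_v: ~2.1 MB

num_local_experts, moe_intermediate_dim = 64, 512
ffn_params = param_bytes * 3 * hidden_dim * moe_intermediate_dim * num_local_experts  # ~201.3 MB

per_layer_weight_bytes = qo_params + kv_params + ffn_params  # ~220 MB per decoder layer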
5 changes: 5 additions & 0 deletions src/scheduling/node.py
@@ -289,6 +289,11 @@ def get_decoder_layer_capacity(
         if not (include_input_embed and self.model_info.tie_embedding):
             available_memory_bytes -= self.model_info.embedding_io_bytes
 
+        logger.debug(
+            "Node available_memory_bytes=%d, decoder_layer_io_bytes=%d",
+            available_memory_bytes,
+            self.model_info.decoder_layer_io_bytes(roofline=False),
+        )
         return floor(
             available_memory_bytes / self.model_info.decoder_layer_io_bytes(roofline=False)
         )
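The added log exposes both operands of the division below, which makes the per-node capacity easy to sanity-check by hand. With illustrative numbers (not measured values):

from math import floor

available_memory_bytes = 24 * 1024**3  # 24 GiB left after KV-cache and embedding budgets
per_layer_bytes = 1_600_000_000        # decoder_layer_io_bytes(roofline=False) for some model

floor(available_memory_bytes / per_layer_bytes)  # -> 16 decoder layers fit on this node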
16 changes: 8 additions & 8 deletions tests/scheduler_tests/test_layer_allocation.py
@@ -56,12 +56,12 @@ def test_capacity_sanity_check():
     "num_layers,gpu_types,expected_layers",
     [
         (21, ["a100-80g", "rtx5090", "rtx4090"], [13, 5, 3]),
-        (15, ["a100-80g", "rtx5090"], [10, 5]),
+        (15, ["a100-80g", "rtx5090"], [11, 4]),
         # (20 * 312 : 20 * 165 : 20 * 82.6) / 559.6 = 11.1 : 5.8 : 2.9 -> 12 : 5 : 3
         (20, ["a100-80g", "rtx5090", "rtx4090"], [12, 5, 3]),
         (25, ["a100-80g", "rtx5090", "rtx4090", "rtx4090"], [13, 5, 4, 3]),
         (29, ["rtx4090", "a100-80g", "rtx5090", "rtx5090", "rtx4090"], [3, 13, 5, 5, 3]),
-        (9, ["rtx5090", "rtx5090"], [5, 4]),
+        (8, ["rtx5090", "rtx5090"], [4, 4]),
         (7, ["a100-40g", "rtx5090"], [5, 2]),
     ],
 )
@@ -155,25 +155,25 @@ def _test_gap_patch_rebalance(allocator: BaseLayerAllocator):
             ],
             "dp",
         ),
-        # 14 Layers, capacity (13, 5, 5, 3, 3) -> greedy assigns (9, 5)
+        # 14 Layers, capacity (13, 5, 5, 3, 3) -> greedy assigns (10, 4)
         (
             14,
             (1, 0, 2, 2),
             [
-                (0, 9),
-                (9, 14),
+                (0, 10),
+                (10, 14),
             ],
             "greedy",
         ),
-        # 7 Layers, capacity (6, 5, 5, 3, 3) -> greedy assigns (5, 2, 5, 2)
+        # 7 Layers, capacity (6, 5, 5, 3, 3) -> greedy assigns (5, 2, 4, 3)
         (
             7,
             (0, 1, 2, 2),
             [
                 (0, 5),
                 (5, 7),
-                (0, 5),
-                (5, 7),
+                (0, 4),
+                (4, 7),
             ],
             "greedy",
         ),