NVIDIA · podkidyshev · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
@@ -118,7 +118,12 @@ class LLMServingCmdArgs(CmdArgs, Generic[LLMServingArgsT]):
 
     docker_image_url: str
     model: str
-    port: int = Field(default=8000, ge=1, le=65535)
+    port: int = Field(default=8300, ge=1, le=65535)
+    host: str = Field(default="0.0.0.0", description="Host/interface for serve or router processes to bind to.")
+    bench_host: str | None = Field(
+        default=None,
+        description="Hostname used by the benchmark client. Defaults to the allocated node hostname.",
+    )
     serve_wait_seconds: int = 300
     prefill: LLMServingArgsT | None = Field(default=None)
     decode: LLMServingArgsT
@@ -363,6 +368,19 @@ def disaggregated_role_host(self, role: str) -> str:
             return "${DECODE_NODE}"
         raise ValueError(f"Unknown disaggregated role: {role}")
 
+    @property
+    def bind_host(self) -> str:  # Host/interface passed as --host to the serve and router/proxy commands.
+        return self.tdef.cmd_args.host  # Taken verbatim from the `host` field of the test's cmd_args.
+
+    @property
+    def bench_host(self) -> str:  # Host the benchmark client puts in its --base-url.
+        configured_host = self.tdef.cmd_args.bench_host
+        if configured_host:  # Any non-empty override wins; None and "" both fall through to the defaults.
+            return configured_host
+        if self.is_disaggregated:
+            return "${PREFILL_NODE}"  # Literal shell variable, expanded by the generated script at run time.
+        return "${NODE}"  # Literal shell variable; the single-node script assigns NODE before the benchmark runs.
+
     def generate_disaggregated_node_setup(self) -> str:
         if not self.is_disaggregated:
             return ""

@@ -40,15 +40,14 @@ def workload_name(self) -> str:
 
     def get_serve_commands(self) -> list[list[str]]:
         cmd_args = self.tdef.cmd_args
-        bind_host = "0.0.0.0"
 
         base_cmd = ["python3", "-m", cmd_args.serve_module, "--model-path", cmd_args.model]
         if not cmd_args.prefill:
             return [
                 [
                     *base_cmd,
                     "--host",
-                    bind_host,
+                    self.bind_host,
                     "--port",
                     str(self.serve_port),
                     *cmd_args.decode.serve_args,
@@ -57,8 +56,8 @@ def get_serve_commands(self) -> list[list[str]]:
 
         commands: list[list[str]] = []
         for host, port, mode, args in [
-            (bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
-            (bind_host, self.decode_port, "decode", cmd_args.decode),
+            (self.bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
+            (self.bind_host, self.decode_port, "decode", cmd_args.decode),
         ]:
             commands.append(
                 [
@@ -88,7 +87,7 @@ def get_helper_command(self) -> list[str]:
             "--decode",
             f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}",
             "--host",
-            "0.0.0.0",
+            self.bind_host,
             "--port",
             str(self.serve_port),
         ]
@@ -103,7 +102,7 @@ def get_bench_command(self) -> list[str]:
             "-m",
             self.tdef.cmd_args.bench_module,
             f"--backend {bench_args.backend}",
-            f"--base-url http://127.0.0.1:{self.serve_port}",
+            f"--base-url http://{self.bench_host}:{self.serve_port}",
             f"--model {self.tdef.cmd_args.model}",
             f"--dataset-name {bench_args.dataset_name}",
             f"--num-prompts {bench_args.num_prompts}",

@@ -45,7 +45,7 @@ def get_serve_commands(self) -> list[list[str]]:
         tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test)
         cmd_args: VllmCmdArgs = tdef.cmd_args
 
-        base_cmd = ["vllm", "serve", cmd_args.model]
+        base_cmd = ["vllm", "serve", cmd_args.model, "--host", self.bind_host]
         if not tdef.cmd_args.prefill:
             return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)]]
 
@@ -88,6 +88,8 @@ def get_helper_command(self) -> list[str]:
         return [
             "python3",
             self.tdef.cmd_args.proxy_script,
+            "--host",
+            self.bind_host,
             "--port",
             str(self.serve_port),
             "--prefiller-hosts",
@@ -109,7 +111,7 @@ def get_bench_command(self) -> list[str]:
             "bench",
             "serve",
             f"--model {self.tdef.cmd_args.model}",
-            f"--base-url http://127.0.0.1:{self.serve_port}",
+            f"--base-url http://{self.bench_host}:{self.serve_port}",
             f"--random-input-len {bench_args.random_input_len}",
             f"--random-output-len {bench_args.random_output_len}",
             f"--max-concurrency {bench_args.max_concurrency}",

@@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-decode.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
 DECODE_PID=$!
 
 echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-router.log \
-    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
+    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
@@ -52,25 +52,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-decode.log \
-    env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
 DECODE_PID=$!
 
 echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-router.log \
-    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
+    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
@@ -41,14 +41,14 @@ wait_for_health() {
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-serve.log \
-    env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 &
+    env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 &
 SERVE_PID=$!
 
 NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 echo "Waiting for SGLang on $NODE to be ready..."
-wait_for_health "http://${NODE}:8000/health" || exit 1
+wait_for_health "http://${NODE}:8300/health" || exit 1
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
@@ -60,25 +60,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-decode.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
 DECODE_PID=$!
 
 echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-router.log \
-    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
+    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result