From 294f8b4da4dbda894c3ba9b5eb9db236a364b2aa Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 21 Apr 2026 14:46:59 +0200
Subject: [PATCH 1/2] fix various vllm/sglang bugs

---
 src/cloudai/workloads/common/llm_serving.py   | 20 ++++++++++++++++++-
 .../sglang/slurm_command_gen_strategy.py      | 11 +++++-----
 .../vllm/slurm_command_gen_strategy.py        |  6 ++++--
 tests/ref_data/sglang-disagg-2nodes.sbatch    |  2 +-
 tests/ref_data/sglang-disagg.sbatch           |  2 +-
 tests/ref_data/sglang.sbatch                  |  2 +-
 tests/ref_data/vllm-disagg-2nodes.sbatch      |  8 ++++----
 tests/ref_data/vllm-disagg.sbatch             |  8 ++++----
 tests/ref_data/vllm.sbatch                    |  4 ++--
 .../sglang/test_command_gen_strategy_slurm.py |  7 ++++---
 tests/workloads/test_llm_serving.py           | 16 ++++++++-------
 .../vllm/test_command_gen_strategy_slurm.py   | 14 ++++++++++---
 12 files changed, 65 insertions(+), 35 deletions(-)

diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py
index f7b2618f0..213421f56 100644
--- a/src/cloudai/workloads/common/llm_serving.py
+++ b/src/cloudai/workloads/common/llm_serving.py
@@ -118,7 +118,12 @@ class LLMServingCmdArgs(CmdArgs, Generic[LLMServingArgsT]):
 
     docker_image_url: str
     model: str
-    port: int = Field(default=8000, ge=1, le=65535)
+    port: int = Field(default=8300, ge=1, le=65535)
+    host: str = Field(default="0.0.0.0", description="Host/interface for serve or router processes to bind to.")
+    bench_host: str | None = Field(
+        default=None,
+        description="Hostname used by the benchmark client. Defaults to the allocated node hostname.",
+    )
     serve_wait_seconds: int = 300
     prefill: LLMServingArgsT | None = Field(default=None)
     decode: LLMServingArgsT
@@ -363,6 +368,19 @@ def disaggregated_role_host(self, role: str) -> str:
             return "${DECODE_NODE}"
         raise ValueError(f"Unknown disaggregated role: {role}")
 
+    @property
+    def bind_host(self) -> str:
+        return self.tdef.cmd_args.host
+
+    @property
+    def bench_host(self) -> str:
+        configured_host = self.tdef.cmd_args.bench_host
+        if configured_host:
+            return configured_host
+        if self.is_disaggregated:
+            return "${PREFILL_NODE}"
+        return "${NODE}"
+
     def generate_disaggregated_node_setup(self) -> str:
         if not self.is_disaggregated:
             return ""
diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
index 4b2333f1a..fd2548b18 100644
--- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
@@ -40,7 +40,6 @@ def workload_name(self) -> str:
 
     def get_serve_commands(self) -> list[list[str]]:
         cmd_args = self.tdef.cmd_args
-        bind_host = "0.0.0.0"
 
         base_cmd = ["python3", "-m", cmd_args.serve_module, "--model-path", cmd_args.model]
         if not cmd_args.prefill:
@@ -48,7 +47,7 @@ def get_serve_commands(self) -> list[list[str]]:
                 [
                     *base_cmd,
                     "--host",
-                    bind_host,
+                    self.bind_host,
                     "--port",
                     str(self.serve_port),
                     *cmd_args.decode.serve_args,
@@ -57,8 +56,8 @@ def get_serve_commands(self) -> list[list[str]]:
 
         commands: list[list[str]] = []
         for host, port, mode, args in [
-            (bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
-            (bind_host, self.decode_port, "decode", cmd_args.decode),
+            (self.bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
+            (self.bind_host, self.decode_port, "decode", cmd_args.decode),
         ]:
             commands.append(
                 [
@@ -88,7 +87,7 @@ def get_helper_command(self) -> list[str]:
             "--decode",
             f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}",
             "--host",
-            "0.0.0.0",
+            self.bind_host,
             "--port",
             str(self.serve_port),
         ]
@@ -103,7 +102,7 @@ def get_bench_command(self) -> list[str]:
             "-m",
             self.tdef.cmd_args.bench_module,
             f"--backend {bench_args.backend}",
-            f"--base-url http://127.0.0.1:{self.serve_port}",
+            f"--base-url http://{self.bench_host}:{self.serve_port}",
             f"--model {self.tdef.cmd_args.model}",
             f"--dataset-name {bench_args.dataset_name}",
             f"--num-prompts {bench_args.num_prompts}",
diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
index f55d085a9..1fb40d83d 100644
--- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
+++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
@@ -45,7 +45,7 @@ def get_serve_commands(self) -> list[list[str]]:
         tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test)
         cmd_args: VllmCmdArgs = tdef.cmd_args
 
-        base_cmd = ["vllm", "serve", cmd_args.model]
+        base_cmd = ["vllm", "serve", cmd_args.model, "--host", self.bind_host]
         if not tdef.cmd_args.prefill:
             return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)]]
 
@@ -88,6 +88,8 @@ def get_helper_command(self) -> list[str]:
         return [
             "python3",
             self.tdef.cmd_args.proxy_script,
+            "--host",
+            self.bind_host,
             "--port",
             str(self.serve_port),
             "--prefiller-hosts",
@@ -109,7 +111,7 @@ def get_bench_command(self) -> list[str]:
             "bench",
             "serve",
             f"--model {self.tdef.cmd_args.model}",
-            f"--base-url http://127.0.0.1:{self.serve_port}",
+            f"--base-url http://{self.bench_host}:{self.serve_port}",
             f"--random-input-len {bench_args.random_input_len}",
             f"--random-output-len {bench_args.random_output_len}",
             f"--max-concurrency {bench_args.max_concurrency}",
diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch
index adca0b32f..f00f2ed63 100644
--- a/tests/ref_data/sglang-disagg-2nodes.sbatch
+++ b/tests/ref_data/sglang-disagg-2nodes.sbatch
@@ -77,4 +77,4 @@ HELPER_PID=$!
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch
index 2f6e6133b..a821a126d 100644
--- a/tests/ref_data/sglang-disagg.sbatch
+++ b/tests/ref_data/sglang-disagg.sbatch
@@ -73,4 +73,4 @@ HELPER_PID=$!
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
diff --git a/tests/ref_data/sglang.sbatch b/tests/ref_data/sglang.sbatch
index 415cd8b9e..f979d3d6f 100644
--- a/tests/ref_data/sglang.sbatch
+++ b/tests/ref_data/sglang.sbatch
@@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch
index 009cec246..60c7d99e3 100644
--- a/tests/ref_data/vllm-disagg-2nodes.sbatch
+++ b/tests/ref_data/vllm-disagg-2nodes.sbatch
@@ -60,12 +60,12 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-decode.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
 DECODE_PID=$!
 
 echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
@@ -75,10 +75,10 @@ wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
 echo "Starting router..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-router.log \
-    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
+    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch
index 32c835842..f309a32ed 100644
--- a/tests/ref_data/vllm-disagg.sbatch
+++ b/tests/ref_data/vllm-disagg.sbatch
@@ -56,12 +56,12 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-decode.log \
-    env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
+    env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
 DECODE_PID=$!
 
 echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
@@ -71,10 +71,10 @@ wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
 echo "Starting router..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-router.log \
-    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
+    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch
index 733a49f51..de8c74ddf 100644
--- a/tests/ref_data/vllm.sbatch
+++ b/tests/ref_data/vllm.sbatch
@@ -41,7 +41,7 @@ wait_for_health() {
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-serve.log \
-    vllm serve Qwen/Qwen3-0.6B --port 8000 &
+    vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000 &
 SERVE_PID=$!
 
 NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
@@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
index 00a835ba0..18c9d739b 100644
--- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py
@@ -94,7 +94,7 @@ def test_get_sglang_serve_commands_aggregated(sglang_cmd_gen_strategy: SglangSlu
         "--model-path",
         cmd_args.model,
         "--host",
-        "0.0.0.0",
+        cmd_args.host,
         "--port",
         str(cmd_args.port),
     ]
@@ -132,6 +132,7 @@ def test_get_sglang_bench_command_writes_jsonl(
     command = sglang_cmd_gen_strategy.get_bench_command()
     output_file_args = [part for part in command if part.startswith("--output-file ")]
     assert len(output_file_args) == 1
+    assert f"--base-url http://${{NODE}}:{sglang_cmd_gen_strategy.test_run.test.cmd_args.port}" in command
     assert output_file_args[0].endswith(f"/{SGLANG_BENCH_JSONL_FILE}")
 
 
@@ -150,7 +151,7 @@ def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slur
     assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command
     assert "--prefill http://${PREFILL_NODE}:8100" in srun_command
     assert "--decode http://${DECODE_NODE}:8200" in srun_command
-    assert "--base-url http://127.0.0.1:8000" in srun_command
+    assert "--base-url http://${PREFILL_NODE}:8000" in srun_command
     assert f"--output={strategy.test_run.output_path.absolute()}/{SGLANG_BENCH_LOG_FILE}" in srun_command
 
 
@@ -171,7 +172,7 @@ def test_gen_srun_command_contains_expected_two_node_flow(
     assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command
     assert "--prefill http://${PREFILL_NODE}:8100" in srun_command
     assert "--decode http://${DECODE_NODE}:8200" in srun_command
-    assert "--base-url http://127.0.0.1:8000" in srun_command
+    assert "--base-url http://${PREFILL_NODE}:8000" in srun_command
 
 
 def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None:
diff --git a/tests/workloads/test_llm_serving.py b/tests/workloads/test_llm_serving.py
index aba270858..e2a7e219d 100644
--- a/tests/workloads/test_llm_serving.py
+++ b/tests/workloads/test_llm_serving.py
@@ -259,11 +259,13 @@ def test_two_node_disagg_uses_shared_gpu_ids_and_role_hosts(self, slurm_system:
         strategy = FakeLLMSlurmStrategy(slurm_system, tr)
 
         assert strategy.workload_slug == "fake-llm"
-        assert strategy.serve_port == 8000
+        assert strategy.serve_port == 8300
         assert strategy.prefill_gpu_ids == [0, 1, 2, 3]
         assert strategy.decode_gpu_ids == [0, 1, 2, 3]
-        assert strategy.prefill_port == 8100
-        assert strategy.decode_port == 8200
+        assert strategy.prefill_port == 8400
+        assert strategy.decode_port == 8500
+        assert strategy.bind_host == "0.0.0.0"
+        assert strategy.bench_host == "${PREFILL_NODE}"
         assert strategy.disaggregated_role_host("prefill") == "${PREFILL_NODE}"
         assert strategy.disaggregated_role_host("decode") == "${DECODE_NODE}"
         assert strategy.prefill_log_file == "fake-llm-prefill.log"
@@ -285,16 +287,16 @@ def test_single_node_disagg_wait_block_uses_role_hosts(self, slurm_system: Slurm
             strategy.generate_wait_for_health_block(
                 "Fake LLM",
                 [
-                    "http://${PREFILL_NODE}:8100/health",
-                    "http://${DECODE_NODE}:8200/health",
+                    "http://${PREFILL_NODE}:8400/health",
+                    "http://${DECODE_NODE}:8500/health",
                 ],
                 host_setup="",
                 host_display="$PREFILL_NODE and $DECODE_NODE",
             )
             == """\
 echo "Waiting for Fake LLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1"""
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1"""
         )
         assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in strategy.generate_disaggregated_node_setup()
 
diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
index deb1b1fb0..2f3810ab4 100644
--- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py
+++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py
@@ -159,7 +159,7 @@ def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGen
             "bench",
             "serve",
             f"--model {cmd_args.model}",
-            f"--base-url http://127.0.0.1:{cmd_args.port}",
+            f"--base-url http://${{NODE}}:{cmd_args.port}",
             f"--random-input-len {bench_args.random_input_len}",
             f"--random-output-len {bench_args.random_output_len}",
             f"--max-concurrency {bench_args.max_concurrency}",
@@ -193,7 +193,7 @@ def test_get_vllm_serve_commands_single_gpu(self, vllm_cmd_gen_strategy: VllmSlu
         commands = vllm_cmd_gen_strategy.get_serve_commands()
 
         assert len(commands) == 1
-        assert commands[0] == ["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)]
+        assert commands[0] == ["vllm", "serve", cmd_args.model, "--host", cmd_args.host, "--port", str(cmd_args.port)]
 
     def test_get_vllm_serve_commands_convert_boolean_flags(
         self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem
@@ -208,6 +208,8 @@ def test_get_vllm_serve_commands_convert_boolean_flags(
             "vllm",
             "serve",
             vllm.cmd_args.model,
+            "--host",
+            vllm.cmd_args.host,
             "--enable-expert-parallel",
             "--port",
             str(vllm.cmd_args.port),
@@ -304,6 +306,8 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur
             "vllm",
             "serve",
             cmd_args.model,
+            "--host",
+            cmd_args.host,
             "--port",
             str(cmd_args.port + 100),
             "--kv-transfer-config",
@@ -313,6 +317,8 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur
             "vllm",
             "serve",
             cmd_args.model,
+            "--host",
+            cmd_args.host,
             "--port",
             str(cmd_args.port + 200),
             "--kv-transfer-config",
@@ -329,6 +335,8 @@ def test_get_helper_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSy
         assert command == [
             "python3",
             cmd_args.proxy_script,
+            "--host",
+            cmd_args.host,
             "--port",
             str(cmd_args.port),
             "--prefiller-hosts",
@@ -441,7 +449,7 @@ def test_gen_srun_command_disagg_two_nodes_flow(
         assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command
         assert "--prefiller-hosts ${PREFILL_NODE}" in srun_command
         assert "--decoder-hosts ${DECODE_NODE}" in srun_command
-        assert "--base-url http://127.0.0.1:8000" in srun_command
+        assert "--base-url http://${PREFILL_NODE}:8000" in srun_command
 
     def test_disagg_more_than_two_nodes_is_rejected(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None:
         vllm_disagg_tr.num_nodes = 3

From 6f3cc1991301f276aa371ca815d28df3f78597d1 Mon Sep 17 00:00:00 2001
From: Ivan Podkidyshev <ipodkidyshev@nvidia.com>
Date: Tue, 21 Apr 2026 15:09:13 +0200
Subject: [PATCH 2/2] update tests to use default port

---
 tests/ref_data/sglang-disagg-2nodes.sbatch | 12 ++++++------
 tests/ref_data/sglang-disagg.sbatch        | 12 ++++++------
 tests/ref_data/sglang.sbatch               |  6 +++---
 tests/ref_data/vllm-disagg-2nodes.sbatch   | 12 ++++++------
 tests/ref_data/vllm-disagg.sbatch          | 12 ++++++------
 tests/ref_data/vllm.sbatch                 |  6 +++---
 tests/test_acceptance.py                   |  6 ------
 7 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch
index f00f2ed63..d5f2b72b1 100644
--- a/tests/ref_data/sglang-disagg-2nodes.sbatch
+++ b/tests/ref_data/sglang-disagg-2nodes.sbatch
@@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-decode.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
 DECODE_PID=$!
 
 echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-router.log \
-    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
+    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch
index a821a126d..5469c972b 100644
--- a/tests/ref_data/sglang-disagg.sbatch
+++ b/tests/ref_data/sglang-disagg.sbatch
@@ -52,25 +52,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-decode.log \
-    env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
+    env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
 DECODE_PID=$!
 
 echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-router.log \
-    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
+    python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
diff --git a/tests/ref_data/sglang.sbatch b/tests/ref_data/sglang.sbatch
index f979d3d6f..5219284e7 100644
--- a/tests/ref_data/sglang.sbatch
+++ b/tests/ref_data/sglang.sbatch
@@ -41,14 +41,14 @@ wait_for_health() {
 echo "Starting SGLang instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-serve.log \
-    env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 &
+    env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 &
 SERVE_PID=$!
 
 NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 echo "Waiting for SGLang on $NODE to be ready..."
-wait_for_health "http://${NODE}:8000/health" || exit 1
+wait_for_health "http://${NODE}:8300/health" || exit 1
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/sglang-bench.log \
-    python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
+    python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch
index 60c7d99e3..5a9ccdf4a 100644
--- a/tests/ref_data/vllm-disagg-2nodes.sbatch
+++ b/tests/ref_data/vllm-disagg-2nodes.sbatch
@@ -60,25 +60,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-decode.log \
-    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
 DECODE_PID=$!
 
 echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-router.log \
-    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
+    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch
index f309a32ed..1d53a16fe 100644
--- a/tests/ref_data/vllm-disagg.sbatch
+++ b/tests/ref_data/vllm-disagg.sbatch
@@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-prefill.log \
-    env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
+    env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
 PREFILL_PID=$!
 
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-decode.log \
-    env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
+    env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
 DECODE_PID=$!
 
 echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
-wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
+wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
+wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1
 
 echo "Starting router..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-router.log \
-    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
+    python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 &
 HELPER_PID=$!
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch
index de8c74ddf..f19956c6f 100644
--- a/tests/ref_data/vllm.sbatch
+++ b/tests/ref_data/vllm.sbatch
@@ -41,14 +41,14 @@ wait_for_health() {
 echo "Starting vLLM instances..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-serve.log \
-    vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000 &
+    vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8300 &
 SERVE_PID=$!
 
 NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
 echo "Waiting for vLLM on $NODE to be ready..."
-wait_for_health "http://${NODE}:8000/health" || exit 1
+wait_for_health "http://${NODE}:8300/health" || exit 1
 
 echo "Running benchmark..."
 srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
     --output=__OUTPUT_DIR__/output/vllm-bench.log \
-    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
+    vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py
index 0dabd46d0..1eea029a5 100644
--- a/tests/test_acceptance.py
+++ b/tests/test_acceptance.py
@@ -587,7 +587,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=VllmCmdArgs(
                     docker_image_url="nvcr.io/nvidia/vllm:latest",
                     model="Qwen/Qwen3-0.6B",
-                    port=8000,
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"},
             ),
@@ -602,7 +601,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=SglangCmdArgs(
                     docker_image_url="docker.io/lmsysorg/sglang:dev",
                     model="Qwen/Qwen3-8B",
-                    port=8000,
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"},
             ),
@@ -617,7 +615,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=SglangCmdArgs(
                     docker_image_url="docker.io/lmsysorg/sglang:dev",
                     model="Qwen/Qwen3-8B",
-                    port=8000,
                     prefill=SglangArgs(),
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"},
@@ -633,7 +630,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=SglangCmdArgs(
                     docker_image_url="docker.io/lmsysorg/sglang:dev",
                     model="Qwen/Qwen3-8B",
-                    port=8000,
                     prefill=SglangArgs(),
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"},
@@ -649,7 +645,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=VllmCmdArgs(
                     docker_image_url="nvcr.io/nvidia/vllm:latest",
                     model="Qwen/Qwen3-0.6B",
-                    port=8000,
                     prefill=VllmArgs(),
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"},
@@ -665,7 +660,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) -
                 cmd_args=VllmCmdArgs(
                     docker_image_url="nvcr.io/nvidia/vllm:latest",
                     model="Qwen/Qwen3-0.6B",
-                    port=8000,
                     prefill=VllmArgs(),
                 ),
                 extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"},