From 294f8b4da4dbda894c3ba9b5eb9db236a364b2aa Mon Sep 17 00:00:00 2001 From: Ivan Podkidyshev Date: Tue, 21 Apr 2026 14:46:59 +0200 Subject: [PATCH 1/2] fix various vllm/sglang bugs --- src/cloudai/workloads/common/llm_serving.py | 20 ++++++++++++++++++- .../sglang/slurm_command_gen_strategy.py | 11 +++++----- .../vllm/slurm_command_gen_strategy.py | 6 ++++-- tests/ref_data/sglang-disagg-2nodes.sbatch | 2 +- tests/ref_data/sglang-disagg.sbatch | 2 +- tests/ref_data/sglang.sbatch | 2 +- tests/ref_data/vllm-disagg-2nodes.sbatch | 8 ++++---- tests/ref_data/vllm-disagg.sbatch | 8 ++++---- tests/ref_data/vllm.sbatch | 4 ++-- .../sglang/test_command_gen_strategy_slurm.py | 7 ++++--- tests/workloads/test_llm_serving.py | 16 ++++++++------- .../vllm/test_command_gen_strategy_slurm.py | 14 ++++++++++--- 12 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/cloudai/workloads/common/llm_serving.py b/src/cloudai/workloads/common/llm_serving.py index f7b2618f0..213421f56 100644 --- a/src/cloudai/workloads/common/llm_serving.py +++ b/src/cloudai/workloads/common/llm_serving.py @@ -118,7 +118,12 @@ class LLMServingCmdArgs(CmdArgs, Generic[LLMServingArgsT]): docker_image_url: str model: str - port: int = Field(default=8000, ge=1, le=65535) + port: int = Field(default=8300, ge=1, le=65535) + host: str = Field(default="0.0.0.0", description="Host/interface for serve or router processes to bind to.") + bench_host: str | None = Field( + default=None, + description="Hostname used by the benchmark client. 
Defaults to the allocated node hostname.", + ) serve_wait_seconds: int = 300 prefill: LLMServingArgsT | None = Field(default=None) decode: LLMServingArgsT @@ -363,6 +368,19 @@ def disaggregated_role_host(self, role: str) -> str: return "${DECODE_NODE}" raise ValueError(f"Unknown disaggregated role: {role}") + @property + def bind_host(self) -> str: + return self.tdef.cmd_args.host + + @property + def bench_host(self) -> str: + configured_host = self.tdef.cmd_args.bench_host + if configured_host: + return configured_host + if self.is_disaggregated: + return "${PREFILL_NODE}" + return "${NODE}" + def generate_disaggregated_node_setup(self) -> str: if not self.is_disaggregated: return "" diff --git a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py index 4b2333f1a..fd2548b18 100644 --- a/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/sglang/slurm_command_gen_strategy.py @@ -40,7 +40,6 @@ def workload_name(self) -> str: def get_serve_commands(self) -> list[list[str]]: cmd_args = self.tdef.cmd_args - bind_host = "0.0.0.0" base_cmd = ["python3", "-m", cmd_args.serve_module, "--model-path", cmd_args.model] if not cmd_args.prefill: @@ -48,7 +47,7 @@ def get_serve_commands(self) -> list[list[str]]: [ *base_cmd, "--host", - bind_host, + self.bind_host, "--port", str(self.serve_port), *cmd_args.decode.serve_args, @@ -57,8 +56,8 @@ def get_serve_commands(self) -> list[list[str]]: commands: list[list[str]] = [] for host, port, mode, args in [ - (bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)), - (bind_host, self.decode_port, "decode", cmd_args.decode), + (self.bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)), + (self.bind_host, self.decode_port, "decode", cmd_args.decode), ]: commands.append( [ @@ -88,7 +87,7 @@ def get_helper_command(self) -> list[str]: "--decode", 
f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}", "--host", - "0.0.0.0", + self.bind_host, "--port", str(self.serve_port), ] @@ -103,7 +102,7 @@ def get_bench_command(self) -> list[str]: "-m", self.tdef.cmd_args.bench_module, f"--backend {bench_args.backend}", - f"--base-url http://127.0.0.1:{self.serve_port}", + f"--base-url http://{self.bench_host}:{self.serve_port}", f"--model {self.tdef.cmd_args.model}", f"--dataset-name {bench_args.dataset_name}", f"--num-prompts {bench_args.num_prompts}", diff --git a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py index f55d085a9..1fb40d83d 100644 --- a/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py +++ b/src/cloudai/workloads/vllm/slurm_command_gen_strategy.py @@ -45,7 +45,7 @@ def get_serve_commands(self) -> list[list[str]]: tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test) cmd_args: VllmCmdArgs = tdef.cmd_args - base_cmd = ["vllm", "serve", cmd_args.model] + base_cmd = ["vllm", "serve", cmd_args.model, "--host", self.bind_host] if not tdef.cmd_args.prefill: return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)]] @@ -88,6 +88,8 @@ def get_helper_command(self) -> list[str]: return [ "python3", self.tdef.cmd_args.proxy_script, + "--host", + self.bind_host, "--port", str(self.serve_port), "--prefiller-hosts", @@ -109,7 +111,7 @@ def get_bench_command(self) -> list[str]: "bench", "serve", f"--model {self.tdef.cmd_args.model}", - f"--base-url http://127.0.0.1:{self.serve_port}", + f"--base-url http://{self.bench_host}:{self.serve_port}", f"--random-input-len {bench_args.random_input_len}", f"--random-output-len {bench_args.random_output_len}", f"--max-concurrency {bench_args.max_concurrency}", diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch index adca0b32f..f00f2ed63 100644 --- a/tests/ref_data/sglang-disagg-2nodes.sbatch +++ 
b/tests/ref_data/sglang-disagg-2nodes.sbatch @@ -77,4 +77,4 @@ HELPER_PID=$! echo "Running benchmark..." srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated + python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch index 2f6e6133b..a821a126d 100644 --- a/tests/ref_data/sglang-disagg.sbatch +++ b/tests/ref_data/sglang-disagg.sbatch @@ -73,4 +73,4 @@ HELPER_PID=$! echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated + python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang.sbatch b/tests/ref_data/sglang.sbatch index 415cd8b9e..f979d3d6f 100644 --- a/tests/ref_data/sglang.sbatch +++ b/tests/ref_data/sglang.sbatch @@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details + python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index 009cec246..60c7d99e3 100644 --- a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -60,12 +60,12 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting vLLM instances..." 
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & + env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & + env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." @@ -75,10 +75,10 @@ wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 echo "Starting router..." 
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ - python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & HELPER_PID=$! echo "Running benchmark..." srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index 32c835842..f309a32ed 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -56,12 +56,12 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting vLLM instances..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & + env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ - env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & + env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." @@ -71,10 +71,10 @@ wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 echo "Starting router..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ - python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & HELPER_PID=$! echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index 733a49f51..de8c74ddf 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -41,7 +41,7 @@ wait_for_health() { echo "Starting vLLM instances..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-serve.log \ - vllm serve Qwen/Qwen3-0.6B --port 8000 & + vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000 & SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) @@ -51,4 +51,4 @@ wait_for_health "http://${NODE}:8000/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/workloads/sglang/test_command_gen_strategy_slurm.py b/tests/workloads/sglang/test_command_gen_strategy_slurm.py index 00a835ba0..18c9d739b 100644 --- a/tests/workloads/sglang/test_command_gen_strategy_slurm.py +++ b/tests/workloads/sglang/test_command_gen_strategy_slurm.py @@ -94,7 +94,7 @@ def test_get_sglang_serve_commands_aggregated(sglang_cmd_gen_strategy: SglangSlu "--model-path", cmd_args.model, "--host", - "0.0.0.0", + cmd_args.host, "--port", str(cmd_args.port), ] @@ -132,6 +132,7 @@ def 
test_get_sglang_bench_command_writes_jsonl( command = sglang_cmd_gen_strategy.get_bench_command() output_file_args = [part for part in command if part.startswith("--output-file ")] assert len(output_file_args) == 1 + assert f"--base-url http://${{NODE}}:{sglang_cmd_gen_strategy.test_run.test.cmd_args.port}" in command assert output_file_args[0].endswith(f"/{SGLANG_BENCH_JSONL_FILE}") @@ -150,7 +151,7 @@ def test_gen_srun_command_contains_expected_flow(sglang_disagg_tr: TestRun, slur assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command assert "--prefill http://${PREFILL_NODE}:8100" in srun_command assert "--decode http://${DECODE_NODE}:8200" in srun_command - assert "--base-url http://127.0.0.1:8000" in srun_command + assert "--base-url http://${PREFILL_NODE}:8000" in srun_command assert f"--output={strategy.test_run.output_path.absolute()}/{SGLANG_BENCH_LOG_FILE}" in srun_command @@ -171,7 +172,7 @@ def test_gen_srun_command_contains_expected_two_node_flow( assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command assert "--prefill http://${PREFILL_NODE}:8100" in srun_command assert "--decode http://${DECODE_NODE}:8200" in srun_command - assert "--base-url http://127.0.0.1:8000" in srun_command + assert "--base-url http://${PREFILL_NODE}:8000" in srun_command def test_disagg_more_than_two_nodes_is_rejected(sglang_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: diff --git a/tests/workloads/test_llm_serving.py b/tests/workloads/test_llm_serving.py index aba270858..e2a7e219d 100644 --- a/tests/workloads/test_llm_serving.py +++ b/tests/workloads/test_llm_serving.py @@ -259,11 +259,13 @@ def test_two_node_disagg_uses_shared_gpu_ids_and_role_hosts(self, slurm_system: strategy = FakeLLMSlurmStrategy(slurm_system, tr) assert strategy.workload_slug == "fake-llm" - assert strategy.serve_port == 8000 + assert strategy.serve_port == 8300 assert strategy.prefill_gpu_ids == [0, 1, 2, 3] assert strategy.decode_gpu_ids == [0, 1, 
2, 3] - assert strategy.prefill_port == 8100 - assert strategy.decode_port == 8200 + assert strategy.prefill_port == 8400 + assert strategy.decode_port == 8500 + assert strategy.bind_host == "0.0.0.0" + assert strategy.bench_host == "${PREFILL_NODE}" assert strategy.disaggregated_role_host("prefill") == "${PREFILL_NODE}" assert strategy.disaggregated_role_host("decode") == "${DECODE_NODE}" assert strategy.prefill_log_file == "fake-llm-prefill.log" @@ -285,16 +287,16 @@ def test_single_node_disagg_wait_block_uses_role_hosts(self, slurm_system: Slurm strategy.generate_wait_for_health_block( "Fake LLM", [ - "http://${PREFILL_NODE}:8100/health", - "http://${DECODE_NODE}:8200/health", + "http://${PREFILL_NODE}:8400/health", + "http://${DECODE_NODE}:8500/health", ], host_setup="", host_display="$PREFILL_NODE and $DECODE_NODE", ) == """\ echo "Waiting for Fake LLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1""" +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1""" ) assert "DECODE_NODE=${NODES[1]:-${PREFILL_NODE}}" in strategy.generate_disaggregated_node_setup() diff --git a/tests/workloads/vllm/test_command_gen_strategy_slurm.py b/tests/workloads/vllm/test_command_gen_strategy_slurm.py index deb1b1fb0..2f3810ab4 100644 --- a/tests/workloads/vllm/test_command_gen_strategy_slurm.py +++ b/tests/workloads/vllm/test_command_gen_strategy_slurm.py @@ -159,7 +159,7 @@ def test_get_vllm_bench_command(self, vllm_cmd_gen_strategy: VllmSlurmCommandGen "bench", "serve", f"--model {cmd_args.model}", - f"--base-url http://127.0.0.1:{cmd_args.port}", + f"--base-url http://${{NODE}}:{cmd_args.port}", f"--random-input-len {bench_args.random_input_len}", f"--random-output-len {bench_args.random_output_len}", f"--max-concurrency {bench_args.max_concurrency}", @@ -193,7 +193,7 @@ def 
test_get_vllm_serve_commands_single_gpu(self, vllm_cmd_gen_strategy: VllmSlu commands = vllm_cmd_gen_strategy.get_serve_commands() assert len(commands) == 1 - assert commands[0] == ["vllm", "serve", cmd_args.model, "--port", str(cmd_args.port)] + assert commands[0] == ["vllm", "serve", cmd_args.model, "--host", cmd_args.host, "--port", str(cmd_args.port)] def test_get_vllm_serve_commands_convert_boolean_flags( self, vllm: VllmTestDefinition, vllm_tr: TestRun, slurm_system: SlurmSystem @@ -208,6 +208,8 @@ def test_get_vllm_serve_commands_convert_boolean_flags( "vllm", "serve", vllm.cmd_args.model, + "--host", + vllm.cmd_args.host, "--enable-expert-parallel", "--port", str(vllm.cmd_args.port), @@ -304,6 +306,8 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur "vllm", "serve", cmd_args.model, + "--host", + cmd_args.host, "--port", str(cmd_args.port + 100), "--kv-transfer-config", @@ -313,6 +317,8 @@ def test_get_vllm_serve_commands_returns_two(self, vllm_disagg_tr: TestRun, slur "vllm", "serve", cmd_args.model, + "--host", + cmd_args.host, "--port", str(cmd_args.port + 200), "--kv-transfer-config", @@ -329,6 +335,8 @@ def test_get_helper_command(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSy assert command == [ "python3", cmd_args.proxy_script, + "--host", + cmd_args.host, "--port", str(cmd_args.port), "--prefiller-hosts", @@ -441,7 +449,7 @@ def test_gen_srun_command_disagg_two_nodes_flow( assert 'wait_for_health "http://${DECODE_NODE}:8200/health"' in srun_command assert "--prefiller-hosts ${PREFILL_NODE}" in srun_command assert "--decoder-hosts ${DECODE_NODE}" in srun_command - assert "--base-url http://127.0.0.1:8000" in srun_command + assert "--base-url http://${PREFILL_NODE}:8000" in srun_command def test_disagg_more_than_two_nodes_is_rejected(self, vllm_disagg_tr: TestRun, slurm_system: SlurmSystem) -> None: vllm_disagg_tr.num_nodes = 3 From 6f3cc1991301f276aa371ca815d28df3f78597d1 Mon Sep 17 00:00:00 2001 From: Ivan 
Podkidyshev Date: Tue, 21 Apr 2026 15:09:13 +0200 Subject: [PATCH 2/2] update tests to use default port --- tests/ref_data/sglang-disagg-2nodes.sbatch | 12 ++++++------ tests/ref_data/sglang-disagg.sbatch | 12 ++++++------ tests/ref_data/sglang.sbatch | 6 +++--- tests/ref_data/vllm-disagg-2nodes.sbatch | 12 ++++++------ tests/ref_data/vllm-disagg.sbatch | 12 ++++++------ tests/ref_data/vllm.sbatch | 6 +++--- tests/test_acceptance.py | 6 ------ 7 files changed, 30 insertions(+), 36 deletions(-) diff --git a/tests/ref_data/sglang-disagg-2nodes.sbatch b/tests/ref_data/sglang-disagg-2nodes.sbatch index f00f2ed63..d5f2b72b1 100644 --- a/tests/ref_data/sglang-disagg-2nodes.sbatch +++ b/tests/ref_data/sglang-disagg-2nodes.sbatch @@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting SGLang instances..." srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & + env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! 
srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl & + env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ - python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 & + python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! echo "Running benchmark..." 
srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated + python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang-disagg.sbatch b/tests/ref_data/sglang-disagg.sbatch index a821a126d..5469c972b 100644 --- a/tests/ref_data/sglang-disagg.sbatch +++ b/tests/ref_data/sglang-disagg.sbatch @@ -52,25 +52,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting SGLang instances..." 
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & + env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl & PREFILL_PID=$! srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-decode.log \ - env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl & + env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl & DECODE_PID=$! echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." 
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-router.log \ - python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 & + python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 & HELPER_PID=$! echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated + python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated diff --git a/tests/ref_data/sglang.sbatch b/tests/ref_data/sglang.sbatch index f979d3d6f..5219284e7 100644 --- a/tests/ref_data/sglang.sbatch +++ b/tests/ref_data/sglang.sbatch @@ -41,14 +41,14 @@ wait_for_health() { echo "Starting SGLang 
instances..." srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-serve.log \ - env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 & + env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 & SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for SGLang on $NODE to be ready..." -wait_for_health "http://${NODE}:8000/health" || exit 1 +wait_for_health "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/sglang-bench.log \ - python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details + python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details diff --git a/tests/ref_data/vllm-disagg-2nodes.sbatch b/tests/ref_data/vllm-disagg-2nodes.sbatch index 60c7d99e3..5a9ccdf4a 100644 --- 
a/tests/ref_data/vllm-disagg-2nodes.sbatch +++ b/tests/ref_data/vllm-disagg-2nodes.sbatch @@ -60,25 +60,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting vLLM instances..." srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & + env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! 
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ - env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & + env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." -wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." 
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ - python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! echo "Running benchmark..." srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm-disagg.sbatch b/tests/ref_data/vllm-disagg.sbatch index f309a32ed..1d53a16fe 100644 --- a/tests/ref_data/vllm-disagg.sbatch +++ b/tests/ref_data/vllm-disagg.sbatch @@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE" echo "Starting vLLM 
instances..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-prefill.log \ - env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & + env CUDA_VISIBLE_DEVICES="0,1" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' & PREFILL_PID=$! srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-decode.log \ - env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & + env CUDA_VISIBLE_DEVICES="2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' & DECODE_PID=$! echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..." 
-wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1 -wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1 +wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1 +wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1 echo "Starting router..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-router.log \ - python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 & + python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 & HELPER_PID=$! echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/ref_data/vllm.sbatch b/tests/ref_data/vllm.sbatch index de8c74ddf..f19956c6f 100644 --- a/tests/ref_data/vllm.sbatch +++ b/tests/ref_data/vllm.sbatch @@ -41,14 +41,14 @@ wait_for_health() { echo "Starting vLLM instances..." srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-serve.log \ - vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000 & + vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8300 & SERVE_PID=$! NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1) echo "Waiting for vLLM on $NODE to be ready..." -wait_for_health "http://${NODE}:8000/health" || exit 1 +wait_for_health "http://${NODE}:8300/health" || exit 1 echo "Running benchmark..." 
srun --export=ALL --mpi=pmix -N1 --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \ --output=__OUTPUT_DIR__/output/vllm-bench.log \ - vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result + vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result diff --git a/tests/test_acceptance.py b/tests/test_acceptance.py index 0dabd46d0..1eea029a5 100644 --- a/tests/test_acceptance.py +++ b/tests/test_acceptance.py @@ -587,7 +587,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=VllmCmdArgs( docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", - port=8000, ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), @@ -602,7 +601,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=SglangCmdArgs( docker_image_url="docker.io/lmsysorg/sglang:dev", model="Qwen/Qwen3-8B", - port=8000, ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0"}, ), @@ -617,7 +615,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=SglangCmdArgs( docker_image_url="docker.io/lmsysorg/sglang:dev", model="Qwen/Qwen3-8B", - port=8000, prefill=SglangArgs(), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, @@ -633,7 +630,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=SglangCmdArgs( docker_image_url="docker.io/lmsysorg/sglang:dev", model="Qwen/Qwen3-8B", - 
port=8000, prefill=SglangArgs(), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, @@ -649,7 +645,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=VllmCmdArgs( docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", - port=8000, prefill=VllmArgs(), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"}, @@ -665,7 +660,6 @@ def test_req(request, slurm_system: SlurmSystem, partial_tr: partial[TestRun]) - cmd_args=VllmCmdArgs( docker_image_url="nvcr.io/nvidia/vllm:latest", model="Qwen/Qwen3-0.6B", - port=8000, prefill=VllmArgs(), ), extra_env_vars={"CUDA_VISIBLE_DEVICES": "0,1,2,3"},