Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion src/cloudai/workloads/common/llm_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,12 @@ class LLMServingCmdArgs(CmdArgs, Generic[LLMServingArgsT]):

docker_image_url: str
model: str
port: int = Field(default=8000, ge=1, le=65535)
port: int = Field(default=8300, ge=1, le=65535)
host: str = Field(default="0.0.0.0", description="Host/interface for serve or router processes to bind to.")
bench_host: str | None = Field(
default=None,
description="Hostname used by the benchmark client. Defaults to the allocated node hostname.",
)
serve_wait_seconds: int = 300
prefill: LLMServingArgsT | None = Field(default=None)
decode: LLMServingArgsT
Expand Down Expand Up @@ -363,6 +368,19 @@ def disaggregated_role_host(self, role: str) -> str:
return "${DECODE_NODE}"
raise ValueError(f"Unknown disaggregated role: {role}")

@property
def bind_host(self) -> str:
    """Return the host/interface that serve or router processes bind to.

    The value is read straight from the test definition's command
    arguments (``cmd_args.host``); no fallback is applied here.
    """
    cmd_args = self.tdef.cmd_args
    return cmd_args.host

@property
def bench_host(self) -> str:
    """Return the hostname the benchmark client should connect to.

    An explicitly configured ``cmd_args.bench_host`` wins whenever it is
    truthy (``None`` and the empty string both count as "not set").
    Otherwise a shell variable reference is returned, to be expanded
    inside the generated script: ``${PREFILL_NODE}`` for disaggregated
    runs and ``${NODE}`` for single-instance runs.
    """
    override = self.tdef.cmd_args.bench_host
    if not override:
        # NOTE(review): presumably the router/proxy is launched on the
        # prefill node in disaggregated mode — confirm against the
        # helper command's node placement.
        return "${PREFILL_NODE}" if self.is_disaggregated else "${NODE}"
    return override

def generate_disaggregated_node_setup(self) -> str:
if not self.is_disaggregated:
return ""
Expand Down
11 changes: 5 additions & 6 deletions src/cloudai/workloads/sglang/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,14 @@ def workload_name(self) -> str:

def get_serve_commands(self) -> list[list[str]]:
cmd_args = self.tdef.cmd_args
bind_host = "0.0.0.0"

base_cmd = ["python3", "-m", cmd_args.serve_module, "--model-path", cmd_args.model]
if not cmd_args.prefill:
return [
[
*base_cmd,
"--host",
bind_host,
self.bind_host,
"--port",
str(self.serve_port),
*cmd_args.decode.serve_args,
Expand All @@ -57,8 +56,8 @@ def get_serve_commands(self) -> list[list[str]]:

commands: list[list[str]] = []
for host, port, mode, args in [
(bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
(bind_host, self.decode_port, "decode", cmd_args.decode),
(self.bind_host, self.prefill_port, "prefill", cast(SglangArgs, cmd_args.prefill)),
(self.bind_host, self.decode_port, "decode", cmd_args.decode),
]:
commands.append(
[
Expand Down Expand Up @@ -88,7 +87,7 @@ def get_helper_command(self) -> list[str]:
"--decode",
f"http://{self.disaggregated_role_host('decode')}:{self.decode_port}",
"--host",
"0.0.0.0",
self.bind_host,
"--port",
str(self.serve_port),
]
Expand All @@ -103,7 +102,7 @@ def get_bench_command(self) -> list[str]:
"-m",
self.tdef.cmd_args.bench_module,
f"--backend {bench_args.backend}",
f"--base-url http://127.0.0.1:{self.serve_port}",
f"--base-url http://{self.bench_host}:{self.serve_port}",
f"--model {self.tdef.cmd_args.model}",
f"--dataset-name {bench_args.dataset_name}",
f"--num-prompts {bench_args.num_prompts}",
Expand Down
6 changes: 4 additions & 2 deletions src/cloudai/workloads/vllm/slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def get_serve_commands(self) -> list[list[str]]:
tdef: VllmTestDefinition = cast(VllmTestDefinition, self.test_run.test)
cmd_args: VllmCmdArgs = tdef.cmd_args

base_cmd = ["vllm", "serve", cmd_args.model]
base_cmd = ["vllm", "serve", cmd_args.model, "--host", self.bind_host]
if not tdef.cmd_args.prefill:
return [[*base_cmd, *tdef.cmd_args.decode.serve_args, "--port", str(self.serve_port)]]

Expand Down Expand Up @@ -88,6 +88,8 @@ def get_helper_command(self) -> list[str]:
return [
"python3",
self.tdef.cmd_args.proxy_script,
"--host",
self.bind_host,
"--port",
str(self.serve_port),
"--prefiller-hosts",
Expand All @@ -109,7 +111,7 @@ def get_bench_command(self) -> list[str]:
"bench",
"serve",
f"--model {self.tdef.cmd_args.model}",
f"--base-url http://127.0.0.1:{self.serve_port}",
f"--base-url http://{self.bench_host}:{self.serve_port}",
f"--random-input-len {bench_args.random_input_len}",
f"--random-output-len {bench_args.random_output_len}",
f"--max-concurrency {bench_args.max_concurrency}",
Expand Down
12 changes: 6 additions & 6 deletions tests/ref_data/sglang-disagg-2nodes.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
echo "Starting SGLang instances..."
srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/sglang-prefill.log \
env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
PREFILL_PID=$!

srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
--output=__OUTPUT_DIR__/output/sglang-decode.log \
env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
env CUDA_VISIBLE_DEVICES="0,1,2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
DECODE_PID=$!

echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1

echo "Starting router..."
srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/sglang-router.log \
python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
HELPER_PID=$!

echo "Running benchmark..."
srun --export=ALL --mpi=pmix --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/sglang-bench.log \
python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
12 changes: 6 additions & 6 deletions tests/ref_data/sglang-disagg.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -52,25 +52,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
echo "Starting SGLang instances..."
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-prefill.log \
env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8100 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
env CUDA_VISIBLE_DEVICES="0,1" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8400 --disaggregation-mode prefill --disaggregation-transfer-backend nixl &
PREFILL_PID=$!

srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-decode.log \
env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8200 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
env CUDA_VISIBLE_DEVICES="2,3" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8500 --disaggregation-mode decode --disaggregation-transfer-backend nixl &
DECODE_PID=$!

echo "Waiting for SGLang on $PREFILL_NODE and $DECODE_NODE to be ready..."
wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1

echo "Starting router..."
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-router.log \
python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8100 --decode http://${DECODE_NODE}:8200 --host 0.0.0.0 --port 8000 &
python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://${PREFILL_NODE}:8400 --decode http://${DECODE_NODE}:8500 --host 0.0.0.0 --port 8300 &
HELPER_PID=$!

echo "Running benchmark..."
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-bench.log \
python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
python3 -m sglang.bench_serving --backend sglang --base-url http://${PREFILL_NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details --pd-separated
6 changes: 3 additions & 3 deletions tests/ref_data/sglang.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ wait_for_health() {
echo "Starting SGLang instances..."
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-serve.log \
env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8000 &
env CUDA_VISIBLE_DEVICES="0" python3 -m sglang.launch_server --model-path Qwen/Qwen3-8B --host 0.0.0.0 --port 8300 &
SERVE_PID=$!

NODE=$(scontrol show hostname $SLURM_JOB_NODELIST | head -n 1)
echo "Waiting for SGLang on $NODE to be ready..."
wait_for_health "http://${NODE}:8000/health" || exit 1
wait_for_health "http://${NODE}:8300/health" || exit 1

echo "Running benchmark..."
srun --export=ALL --mpi=pmix -N1 --container-image=docker.io/lmsysorg/sglang:dev --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 \
--output=__OUTPUT_DIR__/output/sglang-bench.log \
python3 -m sglang.bench_serving --backend sglang --base-url http://127.0.0.1:8000 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
python3 -m sglang.bench_serving --backend sglang --base-url http://${NODE}:8300 --model Qwen/Qwen3-8B --dataset-name random --num-prompts 30 --max-concurrency 16 --random-input 16 --random-output 128 --warmup-requests 2 --random-range-ratio 1.0 --output-file __OUTPUT_DIR__/output/sglang-bench.jsonl --output-details
12 changes: 6 additions & 6 deletions tests/ref_data/vllm-disagg-2nodes.sbatch
Original file line number Diff line number Diff line change
Expand Up @@ -60,25 +60,25 @@ echo "Node roles: prefill=$PREFILL_NODE decode=$DECODE_NODE"
echo "Starting vLLM instances..."
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/vllm-prefill.log \
env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8100 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${PREFILL_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$PREFILL_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8400 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}' &
PREFILL_PID=$!

srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=1 -N1 \
--output=__OUTPUT_DIR__/output/vllm-decode.log \
env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --port 8200 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
env CUDA_VISIBLE_DEVICES="0,1,2,3" VLLM_NIXL_SIDE_CHANNEL_HOST="${DECODE_NODE}" VLLM_NIXL_SIDE_CHANNEL_PORT="$DECODE_NIXL_PORT" vllm serve Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8500 --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}' &
DECODE_PID=$!

echo "Waiting for vLLM on $PREFILL_NODE and $DECODE_NODE to be ready..."
wait_for_health "http://${PREFILL_NODE}:8100/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8200/health" || exit 1
wait_for_health "http://${PREFILL_NODE}:8400/health" || exit 1
wait_for_health "http://${DECODE_NODE}:8500/health" || exit 1

echo "Starting router..."
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/vllm-router.log \
python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --port 8000 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8100 --decoder-hosts ${DECODE_NODE} --decoder-ports 8200 &
python3 /opt/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py --host 0.0.0.0 --port 8300 --prefiller-hosts ${PREFILL_NODE} --prefiller-ports 8400 --decoder-hosts ${DECODE_NODE} --decoder-ports 8500 &
HELPER_PID=$!

echo "Running benchmark..."
srun --export=ALL --mpi=pmix --container-image=nvcr.io/nvidia/vllm:latest --container-mounts=__OUTPUT_DIR__/output:/cloudai_run_results,__OUTPUT_DIR__/install:/cloudai_install,__OUTPUT_DIR__/output,__INSTALL_DIR__/huggingface:/root/.cache/huggingface --overlap --ntasks-per-node=1 --ntasks=1 --relative=0 -N1 \
--output=__OUTPUT_DIR__/output/vllm-bench.log \
vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://127.0.0.1:8000 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
vllm bench serve --model Qwen/Qwen3-0.6B --base-url http://${PREFILL_NODE}:8300 --random-input-len 16 --random-output-len 128 --max-concurrency 16 --num-prompts 30 --result-dir __OUTPUT_DIR__/output --result-filename vllm-bench.json --save-result
Loading
Loading