Skip to content

Commit e92ee4f

Browse files
tianyuz-nv, wanqian-nv, zongfeijing, Kefeng-Duan
authored
[None][feat] Add DWDP (Distributed Weight Data Parallelism) support for MoE inference (#12136)
Signed-off-by: tianyuz-nv <tianyuz@nvidia.com> Signed-off-by: Tianyu Zhang <tianyuz@nvidia.com> Signed-off-by: Kefeng-Duan <176893526+Kefeng-Duan@users.noreply.github.com> Co-authored-by: wanqian-nv <221923321+wanqian-nv@users.noreply.github.com> Co-authored-by: zongfeijing <20381269+zongfeijing@users.noreply.github.com> Co-authored-by: Kefeng-Duan <176893526+Kefeng-Duan@users.noreply.github.com>
1 parent b581ac4 commit e92ee4f

File tree

23 files changed

+4013
-578
lines changed

23 files changed

+4013
-578
lines changed

cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ torch::Tensor moeA2AInitializeOp(torch::Tensor const& workspace, int64_t epRank,
144144

145145
// Synchronize among ranks
146146
cudaDeviceSynchronize();
147-
tensorrt_llm::mpi::MpiComm::world().barrier();
147+
tensorrt_llm::mpi::MpiComm::session().barrier();
148148

149149
return metainfo;
150150
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
#!/bin/bash
# Launch a disaggregated-serving benchmark inside a Slurm allocation:
#  1. start the enroot/pyxis container on all nodes,
#  2. install TensorRT-LLM (pre-built wheel / source build / container preinstall),
#  3. substitute <nodeN_placeholder> tokens in the generated config/command files
#     with the real hostnames of this allocation,
#  4. run the server start commands, then the client benchmark commands.
# Any failure cancels the whole Slurm job via cleanup_on_failure.
set -euo pipefail

# Default every named argument so that, under `set -u`, an omitted optional
# flag takes the documented fallback paths below instead of aborting with an
# "unbound variable" error.
benchmark_mode=""
trtllm_repo=""
work_dir=""
full_logdir=""
container_name=""
container_mount=""
container_image=""
build_wheel=""
cuda_architectures=""
trtllm_wheel_path=""

# Parse named arguments
while [[ $# -gt 0 ]]; do
    case $1 in
        # Benchmark Configuration
        --benchmark-mode) benchmark_mode="$2"; shift 2 ;;

        # Environment and paths
        --trtllm-repo) trtllm_repo="$2"; shift 2 ;;
        --work-dir) work_dir="$2"; shift 2 ;;
        --full-logdir) full_logdir="$2"; shift 2 ;;
        --container-name) container_name="$2"; shift 2 ;;
        --container-mount) container_mount="$2"; shift 2 ;;
        --container-image) container_image="$2"; shift 2 ;;
        --build-wheel) build_wheel="$2"; shift 2 ;;
        --cuda-architectures) cuda_architectures="$2"; shift 2 ;;
        --trtllm-wheel-path) trtllm_wheel_path="$2"; shift 2 ;;
        *)
            echo "Unknown argument: $1"
            exit 1
            ;;
    esac
done

# Print all parsed arguments
echo "Parsed arguments:"
echo
echo "Benchmark Configuration:"
echo " benchmark_mode: ${benchmark_mode}"
echo
echo "Environment Configuration:"
echo " trtllm_repo: ${trtllm_repo}"
echo " work_dir: ${work_dir}"
echo " full_logdir: ${full_logdir}"
echo " container_mount: ${container_mount}"
echo " container_image: ${container_image}"
echo " build_wheel: ${build_wheel}"
echo " cuda_architectures: ${cuda_architectures}"
echo " trtllm_wheel_path: ${trtllm_wheel_path}"

# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode
if [ "${benchmark_mode}" = "gen_only_no_context" ]; then
    export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
    echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
fi

# Log the error, cancel the whole Slurm job (tearing down servers/workers),
# and exit non-zero.
cleanup_on_failure() {
    echo "Error: $1"
    scancel "${SLURM_JOB_ID}"
    exit 1
}

# Copy $1 to $3, then replace each <nodeN_placeholder> token with the N-th
# entry of the comma-separated node list in $2.
replace_placeholder() {
    # `local` keeps these from clobbering the globals of the same names
    # (notably all_nodes_str, which is also a script-level variable).
    local file_path="$1"
    local all_nodes_str="$2"
    local new_file_path="$3"
    local node_array current_val placeholder i
    cp "$file_path" "$new_file_path"
    IFS=',' read -r -a node_array <<< "$all_nodes_str"
    for i in "${!node_array[@]}"; do
        current_val="${node_array[$i]}"
        placeholder="<node${i}_placeholder>"

        # Use sed to replace the placeholder with the value in-place
        sed -i "s|$placeholder|$current_val|g" "${new_file_path}"
        echo "Replaced $placeholder with $current_val in ${new_file_path}"
    done
}

# Snapshot the submitting environment for debugging.
env > "${full_logdir}/environment.txt"

# Start container
echo "Starting container..."
if ! srun -l --container-image=${container_image} \
        --container-name=${container_name} \
        --container-mounts=${container_mount} \
        --mpi=pmix \
        echo "Container up." &> "${full_logdir}/1_container_launch.log"; then
    cleanup_on_failure "Failed to start container. Check ${full_logdir}/1_container_launch.log"
fi

# Install TensorRT-LLM. Precedence: explicit wheel path > source checkout >
# whatever the container image already ships.
if [ -n "${trtllm_wheel_path}" ]; then
    # Install from pre-built wheel if path is provided
    echo "Installing TensorRT-LLM from wheel: ${trtllm_wheel_path}..."
    if ! srun --container-name=${container_name} \
            --container-mounts=${container_mount} --no-container-mount-home \
            --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
            bash -c "pip install ${trtllm_wheel_path}[devel]" \
            &> "${full_logdir}/2_install.log"; then
        cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM wheel installation completed successfully"
elif [ -d "${trtllm_repo}" ]; then
    # Build and install from repository if no wheel path provided
    echo "Installing TensorRT-LLM from ${trtllm_repo}..."
    TRT_LLM_GIT_COMMIT=$(git -C ${trtllm_repo} rev-parse --short HEAD 2>/dev/null || echo "unknown")
    echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"

    if [ "${build_wheel}" = "true" ]; then
        # Build once on a single node; the editable install below is per-node.
        echo "Building TensorRT-LLM wheel on one node..."
        build_command="python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt --benchmarks --use_ccache --clean"
        if [ -n "${cuda_architectures:-}" ]; then
            build_command="${build_command} --cuda_architectures \"${cuda_architectures}\""
        fi
        if ! srun --container-name=${container_name} \
                --container-mounts=${container_mount} \
                --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
                bash -c "cd ${trtllm_repo} && ${build_command}" \
                &> "${full_logdir}/2_build.log"; then
            cleanup_on_failure "TensorRT-LLM build failed. Check ${full_logdir}/2_build.log for details"
        fi
        echo "TensorRT-LLM build completed successfully"
    fi

    echo "Installing TensorRT-LLM..."
    if ! srun --container-name=${container_name} \
            --container-mounts=${container_mount} --no-container-mount-home \
            --mpi=pmix --overlap -N $SLURM_NNODES --ntasks-per-node=1 \
            bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
            &> "${full_logdir}/2_install.log"; then
        cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM installation completed successfully"
else
    echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
    # get_env file is in the same directory as this script
    get_env_file=${work_dir}/get_env.py
    if ! srun --container-name=${container_name} \
            --container-mounts=${container_mount} --no-container-mount-home \
            --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
            bash -c "python ${get_env_file} -e ${full_logdir}/env_vars.json" \
            &> "${full_logdir}/2_get_env.log"; then
        cleanup_on_failure "Failed to get TensorRT-LLM environment variables. Check ${full_logdir}/2_get_env.log for details"
    fi
    echo "TensorRT-LLM environment variables saved to ${full_logdir}/env_vars.json"
fi

# Get node lists and replace the placeholder with the actual node names
echo "SLURM_NODELIST: ${SLURM_NODELIST}"
all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
all_nodes_str=$(IFS=','; echo "${all_nodes[*]}")
echo "all_nodes_str: ${all_nodes_str}"

start_server_cmds_base_file=${full_logdir}/start_server_cmds_base.sh
start_server_cmds_file=${full_logdir}/start_server_cmds.sh
replace_placeholder "${start_server_cmds_base_file}" "${all_nodes_str}" "${start_server_cmds_file}"
server_config_base_file=${full_logdir}/server_config_base.yaml
server_config_file=${full_logdir}/server_config.yaml
replace_placeholder "${server_config_base_file}" "${all_nodes_str}" "${server_config_file}"
mpi_worker_config_base_file=${full_logdir}/mpi_worker_config_base.yaml
mpi_worker_config_file=${full_logdir}/mpi_worker_config.yaml
# The MPI worker config only exists for MPI-based deployments.
if [ -f "${mpi_worker_config_base_file}" ]; then
    replace_placeholder "${mpi_worker_config_base_file}" "${all_nodes_str}" "${mpi_worker_config_file}"
fi
client_cmds_base_file=${full_logdir}/client_cmds_base.sh
client_cmds_file=${full_logdir}/client_cmds.sh
replace_placeholder "${client_cmds_base_file}" "${all_nodes_str}" "${client_cmds_file}"

# start the servers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
# Redirect into the loop (instead of `cat | while`): avoids the pipeline
# subshell, and `read -r` keeps backslashes in the commands intact.
echo "Starting worker commands from ${start_server_cmds_file}..."
while IFS= read -r cmd; do
    # Skip ctx worker commands if in gen-only mode
    # CTX appears as argument to start_worker.sh and in log filename
    if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then
        echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
        continue
    fi
    echo "Executing command: ${cmd}"
    eval "${cmd}"
done < "${start_server_cmds_file}"
echo "Server is ready!"

# Start client commands. The file is read on fd 3 so the eval'd commands can
# still consume stdin without stealing the remaining command lines.
echo "Starting client commands from ${client_cmds_file}..."
while IFS= read -r cmd <&3; do
    echo "Starting client command: ${cmd}"
    # Test the eval directly: under `set -e` a bare `eval` followed by a
    # `$?` check would abort the script before the check ever ran, so the
    # scancel in cleanup_on_failure would never happen.
    if ! eval "${cmd}"; then
        cleanup_on_failure "Command failed: ${cmd}."
    fi
done 3< "${client_cmds_file}"

echo "Job completed successfully, total runtime: $SECONDS seconds"

# try to kill the server and workers
scancel "${SLURM_JOB_ID}"
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
#! /bin/bash
# Per-rank launcher for a disaggregated MPI worker. Derives the CTX/GEN role
# from SLURM_PROCID, exports the role-specific environment variables, and
# optionally wraps trtllm-serve in numactl memory binding and/or an Nsight
# Systems profiling session.
set -uex

# Positional arguments (all required).
config_file=${1}
numa_bind=${2}
log_dir=${3}
enable_nsys=${4}
ctx_profile_range=${5}
gen_profile_range=${6}
num_ctx_gpus=${7}
ctx_worker_env_var=${8}
gen_worker_env_var=${9}

# Let UCX auto-select transports/devices rather than inherit job-level picks.
unset UCX_NET_DEVICES
unset UCX_TLS

echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname)"

# Ranks in [0, num_ctx_gpus) are context workers; all higher ranks are
# generation workers.
if [ "${SLURM_PROCID}" -ge "${num_ctx_gpus}" ]; then
    role="GEN"
    role_env=${gen_worker_env_var}
    range=${gen_profile_range}
else
    role="CTX"
    role_env=${ctx_worker_env_var}
    range=${ctx_profile_range}
fi

echo "worker_role: ${role}, profile_range: ${range}"

# role_env is a whitespace-separated list of KEY=VALUE pairs; the unquoted
# expansion splits it into one export per pair.
for kv in ${role_env}; do
    export "${kv}"
    echo "Exported: ${kv}"
done

# Optional NUMA memory binding prefix.
if [ "${numa_bind}" = "true" ]; then
    mem_bind="numactl -m 0,1"
    echo "numactl -m 0,1 - Only allocate memory from nodes on GB200/GB300 NVL72"
else
    mem_bind=""
    echo "Not binding memory. If on GB200/GB300 NVL72, use \"numactl -m 0,1\" to only allocate memory from nodes."
fi

echo "config_file: ${config_file}"

# Optional nsys profiling prefix; capture is gated by cudaProfilerApi and the
# TLLM_PROFILE_START_STOP iteration range.
profiler=""
if [ "${enable_nsys}" = "true" ]; then
    nsys_out=${log_dir}/nsys_worker_proc_${role}_${SLURM_PROCID}
    export TLLM_PROFILE_RECORD_GC=1
    export TLLM_NVTX_DEBUG=1
    export NSYS_MPI_STORE_TEAMS_PER_RANK=1
    export TLLM_PROFILE_START_STOP=${range}
    echo "nsys is enabled on ${role} ranks, TLLM_PROFILE_START_STOP=${range}"
    profiler="nsys profile -o ${nsys_out} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
else
    echo "nsys is not enabled, start normal flow"
fi

# Unquoted prefixes expand to nothing when disabled.
${profiler} ${mem_bind} trtllm-serve disaggregated_mpi_worker -c ${config_file}

examples/disaggregated/slurm/benchmark/submit.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,13 @@ def assign_servers(
105105
server_allocations[server_type][i] = server_allocation
106106
port += 1
107107

108-
assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
109-
gpus_per_node)
108+
# Keep the allocation order aligned with disagg_utils, which builds
109+
# server_configs as ctx_cfgs + gen_cfgs and assigns rank offsets in that
110+
# same order during split_world_comm().
110111
assign_servers(allocations, "CTX", num_ctx_servers, ctx_world_size,
111112
gpus_per_node)
113+
assign_servers(allocations, "GEN", num_gen_servers, gen_world_size,
114+
gpus_per_node)
112115

113116
return allocations
114117

@@ -506,17 +509,13 @@ def submit_job(config, log_dir, dry_run):
506509
}
507510
}
508511

509-
# Generate start worker commands with placeholder hostnames
510512
for server_type in allocations.keys():
511513
server_cfg = server_configs[server_type]
512514

513515
for server_id in allocations[server_type].keys():
514516
allocation = allocations[server_type][server_id]
515-
# Get GPU IDs for this server from allocation
516-
# When multi-node, all nodes have same device list, so use first node [0]
517517
gpu_ids = list(allocation["nodes"].values())[0]
518518

519-
# Build environment for this worker
520519
cuda_devices = ','.join(map(str, gpu_ids))
521520
worker_env = build_worker_environment(
522521
worker_config=worker_config,
@@ -529,7 +528,6 @@ def submit_job(config, log_dir, dry_run):
529528
)
530529
export_str = format_export_string(worker_env)
531530

532-
# Use script_dir for start_worker.sh
533531
cmd = [
534532
"srun -l",
535533
f"--nodelist {','.join(allocation['nodes'].keys())}",

0 commit comments

Comments (0)