|
#!/bin/bash
# Launch script for a TensorRT-LLM disaggregated-serving benchmark under Slurm.
# Flow: parse named arguments -> start a container on every node via srun
# (pyxis-style --container-* flags) -> install TensorRT-LLM (pre-built wheel,
# source checkout, or the container's own install) -> substitute node names
# into generated command/config templates -> run server commands, then client
# (benchmark) commands -> scancel the job to tear everything down.
# Expects to run inside a Slurm allocation (reads SLURM_JOB_ID, SLURM_NNODES,
# SLURM_NODELIST).
set -euo pipefail
| 3 | + |
# Parse named arguments.
# Every destination variable is initialized to empty first: the script runs
# under "set -u", so an omitted option would otherwise abort the script at the
# first ${var} expansion instead of producing a readable error or empty value.
benchmark_mode=""
trtllm_repo=""
work_dir=""
full_logdir=""
container_name=""
container_mount=""
container_image=""
build_wheel=""
cuda_architectures=""
trtllm_wheel_path=""

while [[ $# -gt 0 ]]; do
    case "$1" in
        # Benchmark Configuration
        --benchmark-mode) benchmark_mode="$2"; shift 2 ;;

        # Environment and paths
        --trtllm-repo) trtllm_repo="$2"; shift 2 ;;
        --work-dir) work_dir="$2"; shift 2 ;;
        --full-logdir) full_logdir="$2"; shift 2 ;;
        --container-name) container_name="$2"; shift 2 ;;
        --container-mount) container_mount="$2"; shift 2 ;;
        --container-image) container_image="$2"; shift 2 ;;
        --build-wheel) build_wheel="$2"; shift 2 ;;
        --cuda-architectures) cuda_architectures="$2"; shift 2 ;;
        --trtllm-wheel-path) trtllm_wheel_path="$2"; shift 2 ;;
        *)
            # Diagnostics go to stderr so they are not mixed into parsed output.
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done
| 26 | + |
# Print all parsed arguments into the job log.  The ${var:-} guards keep this
# section safe under "set -u" even when an option was omitted on the command
# line (the value then prints as empty instead of aborting the script).
echo "Parsed arguments:"
echo
echo "Benchmark Configuration:"
echo "  benchmark_mode: ${benchmark_mode:-}"
echo
echo "Environment Configuration:"
echo "  trtllm_repo: ${trtllm_repo:-}"
echo "  work_dir: ${work_dir:-}"
echo "  full_logdir: ${full_logdir:-}"
# container_name was parsed but previously never echoed; include it so the
# log reflects every accepted option.
echo "  container_name: ${container_name:-}"
echo "  container_mount: ${container_mount:-}"
echo "  container_image: ${container_image:-}"
echo "  build_wheel: ${build_wheel:-}"
echo "  cuda_architectures: ${cuda_architectures:-}"
echo "  trtllm_wheel_path: ${trtllm_wheel_path:-}"

# Set TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode.
# Downstream, this flag makes the server-start loop skip CTX worker commands.
if [ "${benchmark_mode:-}" = "gen_only_no_context" ]; then
    export TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1
    echo "Setting TRTLLM_DISAGG_BENCHMARK_GEN_ONLY=1 for gen_only_no_context mode"
fi
| 48 | + |
#######################################
# Report a fatal error and cancel the whole Slurm job so that no server or
# worker steps are left running.
# Globals:   SLURM_JOB_ID (read) - job to cancel; may be unset outside Slurm.
# Arguments: $1 - human-readable error message.
# Outputs:   error message on stderr.
# Exits:     always exits 1.
#######################################
cleanup_on_failure() {
    echo "Error: $1" >&2
    # Guard both the variable and the tool so the function still fails cleanly
    # when invoked outside a Slurm allocation (e.g. local testing).
    if [ -n "${SLURM_JOB_ID:-}" ] && command -v scancel >/dev/null 2>&1; then
        scancel "${SLURM_JOB_ID}"
    fi
    exit 1
}
| 55 | + |
#######################################
# Copy a template file and substitute node-name placeholders in the copy.
# Markers of the form <nodeK_placeholder> (K = 0, 1, ...) are replaced by the
# K-th entry of a comma-separated node list.
# Arguments:
#   $1 - template file containing <nodeK_placeholder> markers
#   $2 - comma-separated node names, e.g. "node0,node1"
#   $3 - output file path (template is copied there, then edited in place)
# Outputs: one "Replaced ..." line per substitution on stdout.
#######################################
replace_placeholder() {
    # 'local' keeps these from clobbering globals (the original leaked
    # file_path, i, etc. into the script's scope).
    local file_path="$1"
    local all_nodes_str="$2"
    local new_file_path="$3"
    local -a node_array
    local i current_val placeholder

    cp "$file_path" "$new_file_path"
    IFS=',' read -r -a node_array <<< "$all_nodes_str"
    for i in "${!node_array[@]}"; do
        current_val="${node_array[$i]}"
        placeholder="<node${i}_placeholder>"

        # Use sed to replace the placeholder with the value in-place.
        # NOTE: values must not contain the '|' sed delimiter — hostnames
        # never do in practice.
        sed -i "s|$placeholder|$current_val|g" "${new_file_path}"
        echo "Replaced $placeholder with $current_val in ${new_file_path}"
    done
}
| 71 | + |
# Record the full job environment for later debugging.
env > "${full_logdir}/environment.txt"

# Start the container on every node (pyxis-style srun flags).  The trivial
# echo forces the image to be pulled and the named container to be created so
# later --container-name steps can reuse it.  All expansions are quoted so
# paths with spaces cannot word-split.
echo "Starting container..."
if ! srun -l --container-image="${container_image}" \
        --container-name="${container_name}" \
        --container-mounts="${container_mount}" \
        --mpi=pmix \
        echo "Container up." &> "${full_logdir}/1_container_launch.log"; then
    cleanup_on_failure "Failed to start container. Check ${full_logdir}/1_container_launch.log"
fi
| 83 | + |
# ---------------------------------------------------------------------------
# Install TensorRT-LLM inside the container, preferring (in this order):
#   1. a pre-built wheel          (--trtllm-wheel-path)
#   2. a source checkout          (--trtllm-repo), optionally building first
#   3. whatever the container image already ships (just record its env)
# ---------------------------------------------------------------------------
if [ -n "${trtllm_wheel_path}" ]; then
    # Install from pre-built wheel if path is provided.
    echo "Installing TensorRT-LLM from wheel: ${trtllm_wheel_path}..."
    if ! srun --container-name="${container_name}" \
            --container-mounts="${container_mount}" --no-container-mount-home \
            --mpi=pmix --overlap -N "$SLURM_NNODES" --ntasks-per-node=1 \
            bash -c "pip install ${trtllm_wheel_path}[devel]" \
            &> "${full_logdir}/2_install.log"; then
        cleanup_on_failure "TensorRT-LLM wheel installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM wheel installation completed successfully"
elif [ -d "${trtllm_repo}" ]; then
    # Build and install from repository if no wheel path provided.
    echo "Installing TensorRT-LLM from ${trtllm_repo}..."
    TRT_LLM_GIT_COMMIT=$(git -C "${trtllm_repo}" rev-parse --short HEAD 2>/dev/null || echo "unknown")
    echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"

    if [ "${build_wheel}" = "true" ]; then
        # Build on a single node only; the install step below reuses the build
        # artifacts through the shared filesystem mount.
        echo "Building TensorRT-LLM wheel on one node..."
        build_command="python3 ./scripts/build_wheel.py --trt_root /usr/local/tensorrt --benchmarks --use_ccache --clean"
        if [ -n "${cuda_architectures:-}" ]; then
            build_command="${build_command} --cuda_architectures \"${cuda_architectures}\""
        fi
        if ! srun --container-name="${container_name}" \
                --container-mounts="${container_mount}" \
                --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
                bash -c "cd ${trtllm_repo} && ${build_command}" \
                &> "${full_logdir}/2_build.log"; then
            cleanup_on_failure "TensorRT-LLM build failed. Check ${full_logdir}/2_build.log for details"
        fi
        echo "TensorRT-LLM build completed successfully"
    fi

    # Editable install on every node so all workers import the same checkout.
    echo "Installing TensorRT-LLM..."
    if ! srun --container-name="${container_name}" \
            --container-mounts="${container_mount}" --no-container-mount-home \
            --mpi=pmix --overlap -N "$SLURM_NNODES" --ntasks-per-node=1 \
            bash -c "cd ${trtllm_repo} && pip install -e .[devel]" \
            &> "${full_logdir}/2_install.log"; then
        cleanup_on_failure "TensorRT-LLM installation failed. Check ${full_logdir}/2_install.log for details"
    fi
    echo "TensorRT-LLM installation completed successfully"
else
    echo "trtllm_wheel_path and trtllm_repo are not provided, will use the installed TensorRT-LLM from the container"
    # get_env file is in the same directory as this script.
    get_env_file=${work_dir}/get_env.py
    if ! srun --container-name="${container_name}" \
            --container-mounts="${container_mount}" --no-container-mount-home \
            --mpi=pmix --overlap -N 1 --ntasks-per-node=1 \
            bash -c "python ${get_env_file} -e ${full_logdir}/env_vars.json" \
            &> "${full_logdir}/2_get_env.log"; then
        cleanup_on_failure "Failed to get TensorRT-LLM environment variables. Check ${full_logdir}/2_get_env.log for details"
    fi
    echo "TensorRT-LLM environment variables saved to ${full_logdir}/env_vars.json"
fi
| 140 | + |
# Expand SLURM_NODELIST into a sorted, comma-separated hostname list and
# substitute the names for the <nodeK_placeholder> markers in the generated
# command/config templates inside ${full_logdir}.
echo "SLURM_NODELIST: ${SLURM_NODELIST}"
# mapfile avoids the unquoted word-splitting/globbing of all_nodes=($(...)).
mapfile -t all_nodes < <(scontrol show hostname "$SLURM_NODELIST" | sort)
all_nodes_str=$(IFS=','; echo "${all_nodes[*]}")
echo "all_nodes_str: ${all_nodes_str}"

start_server_cmds_base_file=${full_logdir}/start_server_cmds_base.sh
start_server_cmds_file=${full_logdir}/start_server_cmds.sh
replace_placeholder "${start_server_cmds_base_file}" "${all_nodes_str}" "${start_server_cmds_file}"

server_config_base_file=${full_logdir}/server_config_base.yaml
server_config_file=${full_logdir}/server_config.yaml
replace_placeholder "${server_config_base_file}" "${all_nodes_str}" "${server_config_file}"

# The MPI worker config template only exists for some benchmark setups.
mpi_worker_config_base_file=${full_logdir}/mpi_worker_config_base.yaml
mpi_worker_config_file=${full_logdir}/mpi_worker_config.yaml
if [ -f "${mpi_worker_config_base_file}" ]; then
    replace_placeholder "${mpi_worker_config_base_file}" "${all_nodes_str}" "${mpi_worker_config_file}"
fi

client_cmds_base_file=${full_logdir}/client_cmds_base.sh
client_cmds_file=${full_logdir}/client_cmds.sh
replace_placeholder "${client_cmds_base_file}" "${all_nodes_str}" "${client_cmds_file}"
| 161 | + |
# Start the servers (skip ctx workers if TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set).
# Read directly from the file (no "cat | while" subshell); IFS= and -r
# preserve whitespace and backslashes in the commands, and the || [ -n ... ]
# clause still executes a final line that lacks a trailing newline.
echo "Starting worker commands from ${start_server_cmds_file}..."
while IFS= read -r cmd || [ -n "$cmd" ]; do
    # Skip ctx worker commands if in gen-only mode.
    # "CTX" appears as an argument to start_worker.sh and in the log filename.
    if [ "${TRTLLM_DISAGG_BENCHMARK_GEN_ONLY:-0}" = "1" ] && [[ "$cmd" == *"start_worker.sh CTX"* ]]; then
        echo "Skipping ctx worker command (TRTLLM_DISAGG_BENCHMARK_GEN_ONLY is set): ${cmd}"
        continue
    fi
    echo "Executing command: ${cmd}"
    eval "${cmd}"
done < "${start_server_cmds_file}"
echo "Server is ready!"
| 175 | + |
# Run the client (benchmark) commands sequentially.  FD 3 feeds the loop so
# that commands which read stdin cannot swallow the remaining command lines.
echo "Starting client commands from ${client_cmds_file}..."
while IFS= read -r cmd <&3 || [ -n "$cmd" ]; do
    echo "Starting client command: ${cmd}"
    # Test the status in the same command: under "set -e" a bare failing eval
    # would terminate the script before a separate "$?" check could run, so
    # the old 'if [ $? -ne 0 ]' branch was unreachable.
    if ! eval "${cmd}"; then
        cleanup_on_failure "Command failed: ${cmd}."
    fi
done 3< "${client_cmds_file}"

echo "Job completed successfully, total runtime: $SECONDS seconds"

# Cancel the job to tear down the still-running server and worker steps.
scancel "${SLURM_JOB_ID}"
0 commit comments