98 changes: 47 additions & 51 deletions jenkins/L0_Test.groovy
@@ -114,7 +114,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
def remote = [
ip : randomLoginNode,
host : randomLoginNode,
port : cluster.sshPort,
port : cluster.sshPort?:22,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
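The `cluster.sshPort ?: 22` change uses Groovy's Elvis operator so clusters that do not define an SSH port fall back to the standard port 22. For illustration only, roughly the same defaulting in shell parameter-expansion form (variable names here are hypothetical):

    # Fall back to port 22 when SSH_PORT is unset or empty
    SSH_PORT="${SSH_PORT:-22}"
    sshpass -p "$PASSWORD" ssh -p "$SSH_PORT" "$USER@$LOGIN_NODE" hostname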
@@ -127,8 +127,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
@@ -419,7 +418,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
// The slurm_run.sh will add the slurm job id in that file.
script: Utils.sshUserCmd(
remote,
"\"test -f ${jobWorkspace}/slurm_job_id.txt && cat ${jobWorkspace}/slurm_job_id.txt || true\""
"\"cat ${jobWorkspace}/slurm_job_id.txt || true\""
),
returnStdout: true
).trim()
@@ -440,19 +439,23 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){

Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")

def cleanupCommands = [
"rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${jobWorkspace} || true",
].join(" && ")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
remote,
"\"rm -rf ${jobWorkspace} || true\""
"\"${cleanupCommands}\""
)
)

Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
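Because every entry in cleanupCommands ends with `|| true` and the entries are joined with " && ", the remote shell attempts both removals and the SSH command still exits 0 even if a path is already gone. The string sent over SSH expands to roughly the following (the job ID and workspace are placeholders):

    rm -rf /lustre/fs1/portfolios/coreai/projects/coreai_tensorrt_ci/users/svc_tensorrt/containers/container-<slurmJobID>.sqsh || true && rm -rf <jobWorkspace> || true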

// Methods to run slurm job with Jenkins Agent
// Methods to run Slurm job with Jenkins Agent
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
@@ -537,12 +540,10 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,

def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)

Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)

Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-${entrypoint}", numRetries: 3)

Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")

Utils.copyFileToRemoteHost(pipeline, remote, jenkinsSetupPath, "/home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint}", true)

Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

// Specific for OCI machines
@@ -606,7 +607,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
// Wait 10 minutes to check status of the node again
sleep(time: 10, unit: 'MINUTES')
// Avoid the node being stuck in the held state.
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
if (counter % 3 == 0) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"scontrol release ${slurmJobID} || true\""), numRetries: 3)
}
counter++
}
}
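Guarding the release with `counter % 3 == 0` means `scontrol release` is issued on roughly every third pass of the 10-minute wait loop, about once every 30 minutes, instead of on every pass. A minimal shell sketch of the same throttled-release idea (the real loop is Groovy and also re-checks the job's state each pass):

    counter=0
    while [ -n "$(squeue -h -j "$SLURM_JOB_ID" 2>/dev/null)" ]; do
        sleep 600                                     # wait 10 minutes between checks
        if [ $((counter % 3)) -eq 0 ]; then
            scontrol release "$SLURM_JOB_ID" || true  # nudge the job out of a held state
        fi
        counter=$((counter + 1))
    done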
@@ -684,7 +687,6 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
}
}
// End of Methods to run slurm job with Jenkins Agent

def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
{
@@ -716,6 +718,7 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
})
}
}
// End of Methods to run Slurm job with Jenkins Agent

def getNodeArgs(int nodeCount, int gpuCount) {
int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
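getNodeArgs rounds the per-node GPU count up, so an uneven split such as 8 GPUs across 3 nodes requests 3 GPUs per node rather than 2. The same ceiling division written as integer shell arithmetic, purely as an illustration:

    # gpus_per_node = ceil(gpu_count / node_count)
    gpu_count=8
    node_count=3
    gpus_per_node=$(( (gpu_count + node_count - 1) / node_count ))
    echo "$gpus_per_node"   # prints 3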
Expand Down Expand Up @@ -802,8 +805,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")

def slurmOutputFile = null

try {
// Run ssh command to start node in desired cluster via SLURM
withCredentials([
@@ -830,16 +831,15 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def resourcePathNode = "/tmp"
def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/slurm_launch.sh"
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/slurm_exec.sh"
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

@@ -851,15 +851,12 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"

// Upload slurm_run_sh to Frontend node
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"

Utils.exec(pipeline, script: "echo \"Script to trigger slurm job: \" && cat ${scriptRunLocalPath}")
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptRunLocalPath,
scriptRunNode,
scriptRunPathNode,
true
)

@@ -995,22 +992,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
export pytestCommand="$pytestCommand"
export coverageConfigFile="$coverageConfigFile"
export NVIDIA_IMEX_CHANNELS=0
export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))
[ -z "\${NVIDIA_VISIBLE_DEVICES:-}" ] && export NVIDIA_VISIBLE_DEVICES=\$(seq -s, 0 \$((\$(nvidia-smi --query-gpu=count -i 0 --format=noheader)-1)))

${srunPrologue}

chmod +x $scriptRunNode
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunNode}
srun --kill-on-bad-exit=1 ${srunArgs.join(" ")} ${scriptRunPathNode}
""".replaceAll("(?m)^\\s*", "")
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptLaunchPathLocal,
scriptLaunchPathNode,
true
)
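In the launch script above, the guarded form `[ -z "${NVIDIA_VISIBLE_DEVICES:-}" ] && export ...` only derives the device list from nvidia-smi when nothing (for example the scheduler or container runtime) has already set NVIDIA_VISIBLE_DEVICES. Unrolled, that line behaves like this sketch; on a 4-GPU node it yields "0,1,2,3":

    # Keep an externally provided NVIDIA_VISIBLE_DEVICES; otherwise enumerate every GPU on the node
    if [ -z "${NVIDIA_VISIBLE_DEVICES:-}" ]; then
        gpu_count=$(nvidia-smi --query-gpu=count -i 0 --format=noheader)
        export NVIDIA_VISIBLE_DEVICES=$(seq -s, 0 $((gpu_count - 1)))
    fi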

def scriptExec = """
touch ${outputPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
@@ -1035,6 +1032,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@@ -1050,7 +1048,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
script: Utils.sshUserCmd(
remote,
scriptExecPathNode
)
),
numRetries: 3
)
}
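slurm_exec.sh, whose middle section is collapsed in this diff, submits the launch script with sbatch and extracts the numeric job ID from sbatch's "Submitted batch job <id>" line via awk '{print $4}'. Assuming the elided part simply tails the output file and polls the queue, the overall submit-and-wait pattern looks roughly like this sketch (variable names are placeholders):

    # Sketch: submit the batch script, then wait for it to leave the queue
    touch "$OUTPUT_PATH"
    job_id=$(sbatch "$LAUNCH_SCRIPT" | awk '{print $4}')
    echo "Submitted Slurm job $job_id"
    tail -f "$OUTPUT_PATH" &                      # stream the job log while waiting
    tail_pid=$!
    while [ -n "$(squeue -h -j "$job_id" 2>/dev/null)" ]; do
        sleep 60
    done
    kill "$tail_pid" 2>/dev/null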

@@ -2568,8 +2567,8 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
docker.image(image).pull()
}
// We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
// The timeout here is to avoid the Slurm job being stuck.
timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
// Minus 10 minutes to avoid the Slurm job being stopped earlier.
timeout(time: SlurmConfig.DEFAULT_TIMEOUT - 10, unit: 'MINUTES') {
docker.image(image).inside(dockerArgs) {
runner()
}
@@ -2589,7 +2588,9 @@ def runInEnrootOnNode(label)
{
return {
runner -> node(label) {
timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT, unit: 'MINUTES') {
// We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT_SHORT minutes (240) timeout
// Minus 10 minutes to avoid the Slurm job being stopped earlier.
timeout(time: SlurmConfig.DEFAULT_TIMEOUT_SHORT - 10, unit: 'MINUTES') {
runner()
}
}
@@ -2628,8 +2629,6 @@ def launchTestJobs(pipeline, testFilter)
// may break the mapping functionality.

x86TestConfigs = [
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"A10-PyTorch-1": ["a10", "l0_a10", 1, 2],
"A10-PyTorch-2": ["a10", "l0_a10", 2, 2],
@@ -2739,7 +2738,9 @@ def launchTestJobs(pipeline, testFilter)
fullSet = parallelJobs.keySet()

x86SlurmTestConfigs = [
"DGX_H100-2_GPUs-PyTorch-Others-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-2_GPUs-PyTorch-Ray-1": ["dgx-h100-x2-oci", "l0_dgx_h100", 1, 1, 2],
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-GptOss-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
@@ -2777,28 +2778,23 @@ def launchTestJobs(pipeline, testFilter)
fullSet += SBSATestConfigs.keySet()

SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
// Disable GB300 stages because the nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
"GB200-4_GPUs-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_gpus", 1, 1, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-trtllm", "l0_gb300_multi_gpus", 1, 1, 4],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()

// multiNodesSBSAConfigs = [
// Each stage test 1 testcase with 8 GPUs and 2 nodes.
// Disable GB200 multi-node testing in L0 pre-merge until related issues is resolved (https://nvbugs/5485182, https://nvbugs/5437384)
// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-trtllm", "l0_gb200_multi_nodes", 1, 5, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-trtllm", "l0_gb200_multi_nodes", 2, 5, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-trtllm", "l0_gb200_multi_nodes", 3, 5, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-trtllm", "l0_gb200_multi_nodes", 4, 5, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-trtllm", "l0_gb200_multi_nodes", 5, 5, 8, 2],
// ]
multiNodesSBSAConfigs = [:]
def numMultiNodeTests = 3
multiNodesSBSAConfigs += (1..numMultiNodeTests).collectEntries { i ->
["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-oci-trtllm", "l0_gb200_multi_nodes", i, numMultiNodeTests, 8, 2]]
}
multiNodesSBSAConfigs = [
// Each testcase uses 8 GPUs and 2 nodes.
// https://nvbugs/5598863 (uncorrectable NVLink error detected during execution) may not occur on OCI machines.
"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
]
fullSet += multiNodesSBSAConfigs.keySet()

if (env.targetArch == AARCH64_TRIPLE) {
16 changes: 11 additions & 5 deletions jenkins/scripts/slurm_run.sh
@@ -29,10 +29,13 @@ set_value_in_command() {
echo "$result"
}

if [ $SLURM_LOCALID -eq 0 ]; then
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
# Only the first process will save the job ID
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
fi
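Switching the guard from SLURM_LOCALID to SLURM_PROCID changes the writer from "the first task on each node" to "the single rank-0 task of the whole job", so a multi-node run no longer has several tasks racing to write slurm_job_id.txt. The difference between the two variables, as a small sketch:

    # In a 2-node job with 4 tasks per node:
    #   SLURM_PROCID  ranges over 0..7 across the whole job (unique per task)
    #   SLURM_LOCALID ranges over 0..3 on each node (repeats on every node)
    if [ "$SLURM_PROCID" -eq 0 ]; then
        echo "runs in exactly one task job-wide"
    fi
    if [ "$SLURM_LOCALID" -eq 0 ]; then
        echo "runs in one task on every node"
    fi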

if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
which python3
@@ -55,7 +58,6 @@ else
done
fi


llmapiLaunchScript="$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
chmod +x $llmapiLaunchScript
cd $llmSrcNode/tests/integration/defs
@@ -64,10 +66,14 @@ cd $llmSrcNode/tests/integration/defs
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
if [ $SLURM_LOCALID -eq 0 ]; then
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")

# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
fi
pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytestCommand")
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10

containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
@@ -44,7 +44,8 @@ def result(self):

DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async'])

DEFAULT_TEST_TIMEOUT = 1800
# TODO: Change back to 1800 when the disaggregated serving test slowdown issue is resolved.
DEFAULT_TEST_TIMEOUT = 3600
DEFAULT_SERVER_WAITING_TIMEOUT = 3600

