From 2c81b4734cabb103c93401ac5fad6c5b9277b6ea Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 18:27:30 -0400 Subject: [PATCH 1/6] Update test.sh --- .github/workflows/phoenix/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index 60b9920f51..e6c9a03350 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -9,13 +9,13 @@ export TMPDIR=$currentdir n_test_threads=8 build_opts="" -if [ "$job_device" = "gpu" ]; then +if [ "$device" = "gpu" ]; then build_opts="--gpu" fi ./mfc.sh test --dry-run -j $n_test_threads $build_opts -if [ "$job_device" = "gpu" ]; then +if [ "$device" = "gpu" ]; then gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 device_opts="-g $gpu_ids" From c8f17807754e4e4de848e5563d125dba2e3d03ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 20:41:32 -0400 Subject: [PATCH 2/6] fix --- .github/workflows/bench.yml | 8 ++++ .github/workflows/phoenix/bench.sh | 28 ------------ .github/workflows/phoenix/submit-bench.sh | 44 +++++++++++++++--- .github/workflows/phoenix/submit.sh | 56 ++++++++++++++++++++--- .github/workflows/phoenix/test.sh | 30 ------------ .github/workflows/test.yml | 2 +- 6 files changed, 95 insertions(+), 73 deletions(-) delete mode 100644 .github/workflows/phoenix/bench.sh delete mode 100644 .github/workflows/phoenix/test.sh diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index cadfe220f3..4c1574c52d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -79,11 +79,19 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) + if: matrix.cluster == 'frontier' run: | (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & wait %1 && wait %2 + - name: Bench (Master v. PR) + if: matrix.cluster == 'phoenix' + run: | + (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) & + (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) & + wait %1 && wait %2 + - name: Generate & Post Comment run: | (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g) diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh deleted file mode 100644 index a0e93f9052..0000000000 --- a/.github/workflows/phoenix/bench.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -n_ranks=12 - -echo "My benchmarking device is:" $device -if [ "$device" = "gpu" ]; then - n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node - gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1 - device_opts="--gpu -g $gpu_ids" -fi - -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build -currentdir=$tmpbuild/run-$(( RANDOM % 900 )) -mkdir -p $tmpbuild -mkdir -p $currentdir - -export TMPDIR=$currentdir - -if [ "$device" = "gpu" ]; then - ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks -else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks -fi - -sleep 10 -rm -rf "$currentdir" || true - -unset TMPDIR diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index cb2c772681..c8ba962835 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -2,16 +2,14 @@ set -euo pipefail usage() { - echo "Usage: $0 [script.sh] [cpu|gpu]" + echo "Usage: $0 [cpu|gpu]" exit 1 } -[[ $# -eq 2 ]] || usage +[[ $# -eq 1 ]] || usage -sbatch_script="$1" - -device="$2" -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" +device="$1" +job_slug="bench-$1" # read the body of the user script sbatch_body=$(<"$sbatch_script") @@ -65,7 +63,39 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}' . ./mfc.sh load -c p -m $device # user script contents - ${sbatch_body} + n_ranks=12 + + echo "My benchmarking device is:" $device + if [ "$device" = "gpu" ]; then + echo "Set device opts for GPU cases." + n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node + gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1 + device_opts="--gpu -g $gpu_ids" + fi + + tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 900 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + + export TMPDIR=$currentdir + + if [ "$device" = "gpu" ]; then + echo "running GPU benchmarks" + ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks + elif [ "$device" = "cpu" ]; then + echo "running CPU benchmarks" + ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks + else + echo "didn't find a device" + echo "device is" $device + exit 1 + fi + + sleep 10 + rm -rf "$currentdir" || true + + unset TMPDIR EOT ) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index eb404e96e0..56118e568c 100644 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -2,16 +2,14 @@ set -euo pipefail usage() { - echo "Usage: $0 [script.sh] [cpu|gpu]" + echo "Usage: $0 [cpu|gpu]" exit 1 } -[[ $# -eq 2 ]] || usage +[[ $# -eq 1 ]] || usage -sbatch_script="$1" -device="$2" - -job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" +device="$1" +job_slug="test-$1" # read the body of the user script sbatch_body=$(<"$sbatch_script") @@ -49,6 +47,12 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}' ${sbatch_common_opts} ${sbatch_device_opts} + export job_slug="${job_slug}" + export device="${device}" + + echo "Job slug is:" $job_slug + echo "Device is:" $device + set -e -x cd "\$SLURM_SUBMIT_DIR" @@ -58,7 +62,45 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}' . ./mfc.sh load -c p -m $device # user script contents - ${sbatch_body} + tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 900 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + export TMPDIR=$currentdir + + n_test_threads=8 + + build_opts="" + if [ "$device" = "gpu" ]; then + build_opts="--gpu" + fi + echo "build_opts =" $build_opts + + if [[ "$device" == "cpu" ]]; then + echo "CPU BUILD" + elif [[ "$device" == "gpu" ]]; then + echo "GPU BUILD" + else + exit 1 + fi + + exit 1 + + ./mfc.sh test --dry-run -j $n_test_threads $build_opts + + if [ "$device" = "gpu" ]; then + gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node + gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 + device_opts="-g $gpu_ids" + n_test_threads=`expr $gpu_count \* 2` + fi + + ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix + + sleep 10 + rm -rf "$currentdir" || true + + unset TMPDIR EOT ) diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh deleted file mode 100644 index e6c9a03350..0000000000 --- a/.github/workflows/phoenix/test.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build -currentdir=$tmpbuild/run-$(( RANDOM % 900 )) -mkdir -p $tmpbuild -mkdir -p $currentdir -export TMPDIR=$currentdir - -n_test_threads=8 - -build_opts="" -if [ "$device" = "gpu" ]; then - build_opts="--gpu" -fi - -./mfc.sh test --dry-run -j $n_test_threads $build_opts - -if [ "$device" = "gpu" ]; then - gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node - gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 - device_opts="-g $gpu_ids" - n_test_threads=`expr $gpu_count \* 2` -fi - -./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix - -sleep 10 -rm -rf "$currentdir" || true - -unset TMPDIR diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7eecc105c8..3610b03380 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -111,7 +111,7 @@ jobs: - name: Build & Test if: matrix.lbl == 'gt' - run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} + run: bash .github/workflows/phoenix/submit.sh ${{ matrix.device }} - name: Build if: matrix.lbl == 'frontier' From 39d817a280ab94c3156579f345bdc839dafece4b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 20:46:55 -0400 Subject: [PATCH 3/6] fix again --- .github/workflows/phoenix/submit-bench.sh | 2 +- .github/workflows/phoenix/submit.sh | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index c8ba962835..3bfed9f9f4 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -16,7 +16,7 @@ sbatch_body=$(<"$sbatch_script") # common SBATCH directives sbatch_common_opts="\ -#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name +#SBATCH -J MFC-benchmark-$device # job name #SBATCH --account=gts-sbryngelson3 # account #SBATCH -N1 # nodes #SBATCH -t 02:00:00 # walltime diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 56118e568c..3294b578fd 100644 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -11,12 +11,9 @@ usage() { device="$1" job_slug="test-$1" -# read the body of the user script -sbatch_body=$(<"$sbatch_script") - # common SBATCH directives sbatch_common_opts="\ -#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name +#SBATCH -J MFC-test-$device # job name #SBATCH --account=gts-sbryngelson3 # account #SBATCH -N1 # nodes #SBATCH -t 03:00:00 # walltime From 230bf0588ce2317e9e9dd0e1b56a1b1d0fdea7da Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 20:54:32 -0400 Subject: [PATCH 4/6] blah --- .github/workflows/phoenix/submit.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 3294b578fd..0a2d4bb6f7 100644 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -59,17 +59,17 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}' . ./mfc.sh load -c p -m $device # user script contents - tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build - currentdir=$tmpbuild/run-$(( RANDOM % 900 )) + export tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build + export currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild mkdir -p $currentdir export TMPDIR=$currentdir n_test_threads=8 - build_opts="" + export build_opts="" if [ "$device" = "gpu" ]; then - build_opts="--gpu" + export build_opts="--gpu" fi echo "build_opts =" $build_opts @@ -86,10 +86,10 @@ JOBID=$(sbatch <<-EOT | awk '{print $4}' ./mfc.sh test --dry-run -j $n_test_threads $build_opts if [ "$device" = "gpu" ]; then - gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node - gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 - device_opts="-g $gpu_ids" - n_test_threads=`expr $gpu_count \* 2` + export gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node + export gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 + export device_opts="-g $gpu_ids" + export n_test_threads=`expr $gpu_count \* 2` fi ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix From a3c741a0800006388717d5be66607cdc416d8af8 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 21:07:19 -0400 Subject: [PATCH 5/6] fix again --- .github/workflows/phoenix/submit.sh | 157 +++++++++++++++------------- 1 file changed, 83 insertions(+), 74 deletions(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 0a2d4bb6f7..23fdb2b472 100644 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -2,102 +2,109 @@ set -euo pipefail usage() { - echo "Usage: $0 [cpu|gpu]" - exit 1 + echo "Usage: $0 [cpu|gpu]" + exit 1 } [[ $# -eq 1 ]] || usage device="$1" -job_slug="test-$1" - -# common SBATCH directives -sbatch_common_opts="\ -#SBATCH -J MFC-test-$device # job name -#SBATCH --account=gts-sbryngelson3 # account -#SBATCH -N1 # nodes -#SBATCH -t 03:00:00 # walltime -#SBATCH -q embers # QOS -#SBATCH -o $job_slug.out # stdout+stderr -#SBATCH --mem-per-cpu=2G # default mem (overridden below) -" - -# CPU vs GPU overrides +job_slug="test-$device" + +# Build sbatch arguments (use CLI args instead of #SBATCH lines) +sbatch_args=( + -J "MFC-test-$device" + --account=gts-sbryngelson3 + -N 1 + -t 03:00:00 + -q embers + -o "${job_slug}.out" + --mem-per-cpu=2G + # Export variables for the job environment + --export=ALL,job_slug="$job_slug",device="$device" +) + if [[ "$device" == "cpu" ]]; then - sbatch_device_opts="\ -#SBATCH -p cpu-small -#SBATCH --ntasks-per-node=24 -" + sbatch_args+=( + -p cpu-small + --ntasks-per-node=24 + ) elif [[ "$device" == "gpu" ]]; then - sbatch_device_opts="\ -#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s -#SBATCH --ntasks-per-node=4 -#SBATCH -G2 -" + sbatch_args+=( + -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s + --ntasks-per-node=4 + -G 2 + ) else usage fi # submit and capture the JobID -JOBID=$(sbatch <<-EOT | awk '{print $4}' - #!/usr/bin/env bash - ${sbatch_common_opts} - ${sbatch_device_opts} - - export job_slug="${job_slug}" - export device="${device}" - - echo "Job slug is:" $job_slug - echo "Device is:" $device - - set -e -x - - cd "\$SLURM_SUBMIT_DIR" - echo "Running in \$(pwd):" +JOBID=$( + sbatch "${sbatch_args[@]}" <<'EOT' | awk '{print $4}' +#!/usr/bin/env bash +set -euo pipefail +set -x - # load your modules & env - . ./mfc.sh load -c p -m $device +echo "Job slug is: $job_slug" +echo "Device is: $device" - # user script contents - export tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build - export currentdir=$tmpbuild/run-$(( RANDOM % 900 )) - mkdir -p $tmpbuild - mkdir -p $currentdir - export TMPDIR=$currentdir +cd "$SLURM_SUBMIT_DIR" +echo "Running in $(pwd)" - n_test_threads=8 +# load your modules & env +. ./mfc.sh load -c p -m "$device" - export build_opts="" - if [ "$device" = "gpu" ]; then - export build_opts="--gpu" - fi - echo "build_opts =" $build_opts +# user script contents +tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build +mkdir -p "$tmpbuild" +currentdir="$tmpbuild/run-$(( RANDOM % 900 ))" +mkdir -p "$currentdir" +export TMPDIR="$currentdir" - if [[ "$device" == "cpu" ]]; then - echo "CPU BUILD" - elif [[ "$device" == "gpu" ]]; then - echo "GPU BUILD" - else - exit 1 - fi +n_test_threads=8 +build_opts="" +if [[ "$device" == "gpu" ]]; then + build_opts="--gpu" +fi +echo "build_opts = $build_opts" - exit 1 +if [[ "$device" == "cpu" ]]; then + echo "CPU BUILD" +elif [[ "$device" == "gpu" ]]; then + echo "GPU BUILD" +else + echo "Unknown device: $device" >&2 + exit 1 +fi - ./mfc.sh test --dry-run -j $n_test_threads $build_opts +# Dry run (kept from your original) +./mfc.sh test --dry-run -j "$n_test_threads" $build_opts - if [ "$device" = "gpu" ]; then - export gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node - export gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 - export device_opts="-g $gpu_ids" - export n_test_threads=`expr $gpu_count \* 2` - fi +# GPU-specific runtime options +device_opts="" +if [[ "$device" == "gpu" ]]; then + if command -v nvidia-smi >/dev/null 2>&1; then + gpu_count=$(nvidia-smi -L | wc -l) + else + gpu_count=0 + fi - ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix + if [[ "$gpu_count" -gt 0 ]]; then + gpu_ids=$(seq -s ' ' 0 $(( gpu_count - 1 ))) + device_opts="-g $gpu_ids" + n_test_threads=$(( gpu_count * 2 )) + else + echo "No GPUs detected; continuing without -g list" + device_opts="" + fi +fi - sleep 10 - rm -rf "$currentdir" || true +./mfc.sh test --max-attempts 3 -a -j "$n_test_threads" ${device_opts:-} -- -c phoenix - unset TMPDIR +sleep 10 +rm -rf "$currentdir" || true +unset TMPDIR EOT ) @@ -134,6 +141,8 @@ while :; do done # Now retrieve the exit code and exit with it -EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1) +# (small grace period in case accounting lags) +sleep 2 +EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1 || echo 1) echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE" exit "$EXIT_CODE" From d81a0fd4ed64597067b53add2c24f7e98e6626e6 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 8 Aug 2025 21:16:20 -0400 Subject: [PATCH 6/6] back to old way --- .github/workflows/bench.yml | 8 - .github/workflows/phoenix/bench.sh | 27 ++++ .github/workflows/phoenix/submit-bench.sh | 151 +++++-------------- .github/workflows/phoenix/submit.sh | 176 ++++++---------------- .github/workflows/phoenix/test.sh | 20 +++ .github/workflows/test.yml | 2 +- 6 files changed, 133 insertions(+), 251 deletions(-) create mode 100644 .github/workflows/phoenix/bench.sh create mode 100644 .github/workflows/phoenix/test.sh diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 4c1574c52d..cadfe220f3 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -79,19 +79,11 @@ jobs: wait %1 && wait %2 - name: Bench (Master v. PR) - if: matrix.cluster == 'frontier' run: | (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh .github/workflows/${{ matrix.cluster }}/bench.sh ${{ matrix.device }}) & wait %1 && wait %2 - - name: Bench (Master v. PR) - if: matrix.cluster == 'phoenix' - run: | - (cd pr && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) & - (cd master && bash .github/workflows/${{ matrix.cluster }}/submit-bench.sh ${{ matrix.device }}) & - wait %1 && wait %2 - - name: Generate & Post Comment run: | (cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g) diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh new file mode 100644 index 0000000000..f58ef44721 --- /dev/null +++ b/.github/workflows/phoenix/bench.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +n_ranks=12 + +if [ "$job_device" = "gpu" ]; then + n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node + gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1 + device_opts="--gpu -g $gpu_ids" +fi + +tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build +currentdir=$tmpbuild/run-$(( RANDOM % 900 )) +mkdir -p $tmpbuild +mkdir -p $currentdir + +export TMPDIR=$currentdir + +if [ "$job_device" = "gpu" ]; then + ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks +else + ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks +fi + +sleep 10 +rm -rf "$currentdir" || true + +unset TMPDIR diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh index 3bfed9f9f4..e8b6dd3484 100644 --- a/.github/workflows/phoenix/submit-bench.sh +++ b/.github/workflows/phoenix/submit-bench.sh @@ -1,137 +1,64 @@ -#!/usr/bin/env bash -set -euo pipefail +#!/bin/bash + +set -e usage() { - echo "Usage: $0 [cpu|gpu]" - exit 1 + echo "Usage: $0 [script.sh] [cpu|gpu]" } -[[ $# -eq 1 ]] || usage - -device="$1" -job_slug="bench-$1" - -# read the body of the user script -sbatch_body=$(<"$sbatch_script") - -# common SBATCH directives -sbatch_common_opts="\ -#SBATCH -J MFC-benchmark-$device # job name -#SBATCH --account=gts-sbryngelson3 # account -#SBATCH -N1 # nodes -#SBATCH -t 02:00:00 # walltime -#SBATCH -q embers # QOS -#SBATCH -o $job_slug.out # stdout+stderr -#SBATCH --mem-per-cpu=2G # default mem (overridden below) -" +if [ ! -z "$1" ]; then + sbatch_script_contents=`cat $1` +else + usage + exit 1 +fi -# CPU vs GPU overrides -if [[ "$device" == "cpu" ]]; then - sbatch_device_opts="\ +sbatch_cpu_opts="\ #SBATCH -p cpu-small # partition #SBATCH --ntasks-per-node=24 # Number of cores per node required #SBATCH --mem-per-cpu=2G # Memory per core\ " -elif [[ "$device" == "gpu" ]]; then - sbatch_device_opts="\ + +sbatch_gpu_opts="\ #SBATCH -CL40S #SBATCH --ntasks-per-node=4 # Number of cores per node required #SBATCH -G2\ " + +if [ "$2" = "cpu" ]; then + sbatch_device_opts="$sbatch_cpu_opts" +elif [ "$2" = "gpu" ]; then + sbatch_device_opts="$sbatch_gpu_opts" else - usage + usage + exit 1 fi -# submit and capture the JobID -JOBID=$(sbatch <<-EOT | awk '{print $4}' - #!/usr/bin/env bash - ${sbatch_common_opts} - ${sbatch_device_opts} - - export job_slug="${job_slug}" - export device="${device}" - - echo "Job slug is:" $job_slug - echo "Device is:" $device - - set -e -x - - cd "\$SLURM_SUBMIT_DIR" - echo "Running in \$(pwd):" - - # load your modules & env - . ./mfc.sh load -c p -m $device +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" - # user script contents - n_ranks=12 +sbatch </dev/null 2>&1 || :' EXIT - -# ────────── Poll until SLURM job finishes ────────── -while :; do - # Try sacct first - STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1) - - # Fallback to squeue if sacct is empty - if [[ -z "$STATE" ]]; then - STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "") - fi - - # If it’s one of SLURM’s terminal states, break immediately - case "$STATE" in - COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED) - echo "Completed: SLURM job $JOBID reached terminal state: $STATE" - break - ;; - "") - echo "Completed: SLURM job $JOBID no longer in queue; assuming finished" - break - ;; - *) - echo "Waiting: SLURM job $JOBID state: $STATE" - sleep 10 - ;; - esac -done -# Now retrieve the exit code and exit with it -EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1) -echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE" -exit "$EXIT_CODE" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 23fdb2b472..6700e38c50 100644 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -1,148 +1,64 @@ -#!/usr/bin/env bash -set -euo pipefail +#!/bin/bash + +set -e usage() { - echo "Usage: $0 [cpu|gpu]" - exit 1 + echo "Usage: $0 [script.sh] [cpu|gpu]" } -[[ $# -eq 1 ]] || usage - -device="$1" -job_slug="test-$device" - -# Build sbatch arguments (use CLI args instead of #SBATCH lines) -sbatch_args=( - -J "MFC-test-$device" - --account=gts-sbryngelson3 - -N 1 - -t 03:00:00 - -q embers - -o "${job_slug}.out" - --mem-per-cpu=2G - # Export variables for the job environment - --export=ALL,job_slug="$job_slug",device="$device" -) - -if [[ "$device" == "cpu" ]]; then - sbatch_args+=( - -p cpu-small - --ntasks-per-node=24 - ) -elif [[ "$device" == "gpu" ]]; then - sbatch_args+=( - -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s - --ntasks-per-node=4 - -G 2 - ) +if [ ! -z "$1" ]; then + sbatch_script_contents=`cat $1` else - usage -fi - -# submit and capture the JobID -JOBID=$( - sbatch "${sbatch_args[@]}" <<'EOT' | awk '{print $4}' -#!/usr/bin/env bash -set -euo pipefail -set -x - -echo "Job slug is: $job_slug" -echo "Device is: $device" - -cd "$SLURM_SUBMIT_DIR" -echo "Running in $(pwd)" - -# load your modules & env -. ./mfc.sh load -c p -m "$device" - -# user script contents -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build -mkdir -p "$tmpbuild" -currentdir="$tmpbuild/run-$(( RANDOM % 900 ))" -mkdir -p "$currentdir" -export TMPDIR="$currentdir" - -n_test_threads=8 -build_opts="" -if [[ "$device" == "gpu" ]]; then - build_opts="--gpu" + usage + exit 1 fi -echo "build_opts = $build_opts" -if [[ "$device" == "cpu" ]]; then - echo "CPU BUILD" -elif [[ "$device" == "gpu" ]]; then - echo "GPU BUILD" +sbatch_cpu_opts="\ +#SBATCH -p cpu-small # partition +#SBATCH --ntasks-per-node=24 # Number of cores per node required +#SBATCH --mem-per-cpu=2G # Memory per core\ +" + +sbatch_gpu_opts="\ +#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s +#SBATCH --ntasks-per-node=4 # Number of cores per node required +#SBATCH -G2\ +" + +if [ "$2" = "cpu" ]; then + sbatch_device_opts="$sbatch_cpu_opts" +elif [ "$2" = "gpu" ]; then + sbatch_device_opts="$sbatch_gpu_opts" else - echo "Unknown device: $device" >&2 - exit 1 + usage + exit 1 fi -# Dry run (kept from your original) -./mfc.sh test --dry-run -j "$n_test_threads" $build_opts +job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" -# GPU-specific runtime options -device_opts="" -if [[ "$device" == "gpu" ]]; then - if command -v nvidia-smi >/dev/null 2>&1; then - gpu_count=$(nvidia-smi -L | wc -l) - else - gpu_count=0 - fi - - if [[ "$gpu_count" -gt 0 ]]; then - gpu_ids=$(seq -s ' ' 0 $(( gpu_count - 1 ))) - device_opts="-g $gpu_ids" - n_test_threads=$(( gpu_count * 2 )) - else - echo "No GPUs detected; continuing without -g list" - device_opts="" - fi -fi +sbatch </dev/null 2>&1 || :' EXIT +job_slug="$job_slug" +job_device="$2" -# ────────── Poll until SLURM job finishes ────────── -while :; do - # Try sacct first - STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1) +. ./mfc.sh load -c p -m $2 - # Fallback to squeue if sacct is empty - if [[ -z "$STATE" ]]; then - STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "") - fi +$sbatch_script_contents - # If it’s one of SLURM’s terminal states, break immediately - case "$STATE" in - COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED) - echo "Completed: SLURM job $JOBID reached terminal state: $STATE" - break - ;; - "") - echo "Completed: SLURM job $JOBID no longer in queue; assuming finished" - break - ;; - *) - echo "Waiting: SLURM job $JOBID state: $STATE" - sleep 10 - ;; - esac -done +EOT -# Now retrieve the exit code and exit with it -# (small grace period in case accounting lags) -sleep 2 -EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1 || echo 1) -echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE" -exit "$EXIT_CODE" diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh new file mode 100644 index 0000000000..74d1d1265a --- /dev/null +++ b/.github/workflows/phoenix/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +build_opts="" +if [ "$job_device" = "gpu" ]; then + build_opts="--gpu" +fi + +./mfc.sh test --dry-run -j 8 $build_opts + +n_test_threads=8 + +if [ "$job_device" = "gpu" ]; then + gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node + gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1))) # 0,1,2,...,gpu_count-1 + device_opts="-g $gpu_ids" + n_test_threads=`expr $gpu_count \* 2` +fi + +./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix + diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3610b03380..7eecc105c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -111,7 +111,7 @@ jobs: - name: Build & Test if: matrix.lbl == 'gt' - run: bash .github/workflows/phoenix/submit.sh ${{ matrix.device }} + run: bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} - name: Build if: matrix.lbl == 'frontier'