diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh
index a0e93f9052..f58ef44721 100644
--- a/.github/workflows/phoenix/bench.sh
+++ b/.github/workflows/phoenix/bench.sh
@@ -2,8 +2,7 @@
 
 n_ranks=12
 
-echo "My benchmarking device is:" $device
-if [ "$device" = "gpu" ]; then
+if [ "$job_device" = "gpu" ]; then
     n_ranks=$(nvidia-smi -L | wc -l)        # number of GPUs on node
     gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
     device_opts="--gpu -g $gpu_ids"
@@ -16,7 +15,7 @@ mkdir -p $currentdir
 
 export TMPDIR=$currentdir
 
-if [ "$device" = "gpu" ]; then
+if [ "$job_device" = "gpu" ]; then
     ./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
 else
     ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
diff --git a/.github/workflows/phoenix/submit-bench.sh b/.github/workflows/phoenix/submit-bench.sh
index cb2c772681..e8b6dd3484 100644
--- a/.github/workflows/phoenix/submit-bench.sh
+++ b/.github/workflows/phoenix/submit-bench.sh
@@ -1,107 +1,64 @@
-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
-    exit 1
 }
 
-[[ $# -eq 2 ]] || usage
-
-sbatch_script="$1"
-
-device="$2"
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
-
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J shb-${sbatch_script%%.sh}-$device  # job name
-#SBATCH --account=gts-sbryngelson3            # account
-#SBATCH -N1                                   # nodes
-#SBATCH -t 02:00:00                           # walltime
-#SBATCH -q embers                             # QOS
-#SBATCH -o $job_slug.out                      # stdout+stderr
-#SBATCH --mem-per-cpu=2G                      # default mem (overridden below)
-"
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
+sbatch_cpu_opts="\
 #SBATCH -p cpu-small                    # partition
 #SBATCH --ntasks-per-node=24            # Number of cores per node required
 #SBATCH --mem-per-cpu=2G                # Memory per core\
 "
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+
+sbatch_gpu_opts="\
 #SBATCH -CL40S
 #SBATCH --ntasks-per-node=4             # Number of cores per node required
 #SBATCH -G2\
 "
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-	#!/usr/bin/env bash
-	${sbatch_common_opts}
-	${sbatch_device_opts}
-
-	export job_slug="${job_slug}"
-	export device="${device}"
-
-	echo "Job slug is:" $job_slug
-	echo "Device is:" $device
-
-	set -e -x
-
-	cd "\$SLURM_SUBMIT_DIR"
-	echo "Running in \$(pwd):"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
 
-	# load your modules & env
-	. ./mfc.sh load -c p -m $device
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug                 # Job name
+#SBATCH --account=gts-sbryngelson3      # charge account
+#SBATCH -N1                             # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 02:00:00                     # Duration of the job
+#SBATCH -q embers                       # QOS name
+#SBATCH -o$job_slug.out                 # Combined output and error messages file
+#SBATCH -W                              # Do not exit until the submitted job terminates
+
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in \$(pwd):"
 
-	$sbatch_body
-EOT
-)
-
-trap 'scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
+job_slug="$job_slug"
+job_device="$2"
 
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+. ./mfc.sh load -c p -m $2
 
-    # Fallback to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
+$sbatch_script_contents
 
-    # If it’s one of SLURM’s terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
+EOT
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"
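Note: the restored wrapper relies on "#SBATCH -W" (sbatch's --wait flag) in the generated batch script rather than the deleted trap/sacct polling loop. A minimal sketch of the behavior this assumes, using a hypothetical job.sbatch:

    #!/bin/bash
    # --wait makes sbatch block until the job terminates and then exit
    # with the job's own exit code, so a failed job fails this wrapper.
    sbatch --wait job.sbatch
    echo "sbatch exited with $?"

One consequence of the revert is that nothing cancels the SLURM job if the submitting runner dies early; the removed trap handled that case.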
diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh
index eb404e96e0..6700e38c50 100644
--- a/.github/workflows/phoenix/submit.sh
+++ b/.github/workflows/phoenix/submit.sh
@@ -1,100 +1,64 @@
-#!/usr/bin/env bash
-set -euo pipefail
+#!/bin/bash
+
+set -e
 
 usage() {
     echo "Usage: $0 [script.sh] [cpu|gpu]"
-    exit 1
 }
 
-[[ $# -eq 2 ]] || usage
-
-sbatch_script="$1"
-device="$2"
-
-job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
-
-# read the body of the user script
-sbatch_body=$(<"$sbatch_script")
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
 
-# common SBATCH directives
-sbatch_common_opts="\
-#SBATCH -J shb-${sbatch_script%%.sh}-$device  # job name
-#SBATCH --account=gts-sbryngelson3            # account
-#SBATCH -N1                                   # nodes
-#SBATCH -t 03:00:00                           # walltime
-#SBATCH -q embers                             # QOS
-#SBATCH -o $job_slug.out                      # stdout+stderr
-#SBATCH --mem-per-cpu=2G                      # default mem (overridden below)
+sbatch_cpu_opts="\
+#SBATCH -p cpu-small                    # partition
+#SBATCH --ntasks-per-node=24            # Number of cores per node required
+#SBATCH --mem-per-cpu=2G                # Memory per core\
 "
 
-# CPU vs GPU overrides
-if [[ "$device" == "cpu" ]]; then
-    sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-"
-elif [[ "$device" == "gpu" ]]; then
-    sbatch_device_opts="\
+sbatch_gpu_opts="\
 #SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
-#SBATCH --ntasks-per-node=4
-#SBATCH -G2
+#SBATCH --ntasks-per-node=4             # Number of cores per node required
+#SBATCH -G2\
 "
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="$sbatch_cpu_opts"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="$sbatch_gpu_opts"
 else
-    usage
+    usage
+    exit 1
 fi
 
-# submit and capture the JobID
-JOBID=$(sbatch <<-EOT | awk '{print $4}'
-	#!/usr/bin/env bash
-	${sbatch_common_opts}
-	${sbatch_device_opts}
-
-	set -e -x
-
-	cd "\$SLURM_SUBMIT_DIR"
-	echo "Running in \$(pwd):"
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"
 
-	# load your modules & env
-	. ./mfc.sh load -c p -m $device
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -Jshb-$job_slug                 # Job name
+#SBATCH --account=gts-sbryngelson3      # charge account
+#SBATCH -N1                             # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 03:00:00                     # Duration of the job
+#SBATCH -q embers                       # QOS name
+#SBATCH -o$job_slug.out                 # Combined output and error messages file
+#SBATCH -W                              # Do not exit until the submitted job terminates
+
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in \$(pwd):"
 
-	$sbatch_body
-EOT
-)
-
-trap 'scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
+job_slug="$job_slug"
+job_device="$2"
 
-# ────────── Poll until SLURM job finishes ──────────
-while :; do
-    # Try sacct first
-    STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
+. ./mfc.sh load -c p -m $2
 
-    # Fallback to squeue if sacct is empty
-    if [[ -z "$STATE" ]]; then
-        STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
-    fi
+$sbatch_script_contents
 
-    # If it’s one of SLURM’s terminal states, break immediately
-    case "$STATE" in
-        COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
-            echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
-            break
-            ;;
-        "")
-            echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
-            break
-            ;;
-        *)
-            echo "Waiting: SLURM job $JOBID state: $STATE"
-            sleep 10
-            ;;
-    esac
-done
+EOT
 
-# Now retrieve the exit code and exit with it
-EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
-echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
-exit "$EXIT_CODE"
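Note on the heredoc in both submit scripts: because the EOT delimiter is unquoted, $job_slug, $sbatch_device_opts, and $2 are expanded by the submitting shell, while the escaped \$SLURM_SUBMIT_DIR and \$(pwd) survive into the generated batch script and are evaluated only when the job runs. A self-contained sketch of the same pattern (illustrative only, not from the patch):

    #!/bin/bash
    # Unquoted delimiter: $outer expands while generating the script;
    # the escaped \$HOSTNAME expands only when the inner script runs.
    outer="expanded at submit time"
    bash <<EOT
    echo "outer: $outer"
    echo "inner host: \$HOSTNAME"
    EOT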
diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh
index 60b9920f51..74d1d1265a 100644
--- a/.github/workflows/phoenix/test.sh
+++ b/.github/workflows/phoenix/test.sh
@@ -1,19 +1,13 @@
 #!/bin/bash
 
-tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
-currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
-mkdir -p $tmpbuild
-mkdir -p $currentdir
-export TMPDIR=$currentdir
-
-n_test_threads=8
-
 build_opts=""
 if [ "$job_device" = "gpu" ]; then
     build_opts="--gpu"
 fi
 
-./mfc.sh test --dry-run -j $n_test_threads $build_opts
+./mfc.sh test --dry-run -j 8 $build_opts
+
+n_test_threads=8
 
 if [ "$job_device" = "gpu" ]; then
     gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
@@ -24,7 +18,3 @@ fi
 
 ./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
 
-sleep 10
-rm -rf "$currentdir" || true
-
-unset TMPDIR
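Note: the context elided between test.sh's two hunks presumably builds $device_opts and $n_test_threads for the GPU case; the visible gpu_count line matches the pattern bench.sh uses, sketched here for reference (everything past the gpu_count line is an assumption based on bench.sh, not the patch):

    gpu_count=$(nvidia-smi -L | wc -l)         # number of GPUs on node
    gpu_ids=$(seq -s ' ' 0 $(($gpu_count-1)))  # 0 1 2 ... gpu_count-1
    device_opts="--gpu -g $gpu_ids"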