Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

n_ranks=12

echo "My benchmarking device is:" $device
if [ "$device" = "gpu" ]; then
if [ "$job_device" = "gpu" ]; then
n_ranks=$(nvidia-smi -L | wc -l) # number of GPUs on node
gpu_ids=$(seq -s ' ' 0 $(($n_ranks-1))) # 0,1,2,...,gpu_count-1
device_opts="--gpu -g $gpu_ids"
Expand All @@ -16,7 +15,7 @@ mkdir -p $currentdir

export TMPDIR=$currentdir

if [ "$device" = "gpu" ]; then
if [ "$job_device" = "gpu" ]; then
./mfc.sh bench --mem 12 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
else
./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks
Expand Down
121 changes: 39 additions & 82 deletions .github/workflows/phoenix/submit-bench.sh
Original file line number Diff line number Diff line change
@@ -1,107 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail
#!/bin/bash

set -e

# Print the expected command-line synopsis and abort.
# Arguments: none (uses $0 as the script name in the message).
# Outputs:   the usage line on stderr, since this is only reached on a bad
#            invocation and diagnostics should not be mixed into stdout.
# Returns:   does not return; exits the shell with status 1.
usage() {
  printf '%s\n' "Usage: $0 [script.sh] [cpu|gpu]" >&2
  exit 1
}

[[ $# -eq 2 ]] || usage

sbatch_script="$1"

device="$2"
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# read the body of the user script
sbatch_body=$(<"$sbatch_script")

# common SBATCH directives
sbatch_common_opts="\
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
#SBATCH --account=gts-sbryngelson3 # account
#SBATCH -N1 # nodes
#SBATCH -t 02:00:00 # walltime
#SBATCH -q embers # QOS
#SBATCH -o $job_slug.out # stdout+stderr
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
"
if [ ! -z "$1" ]; then
Copy link
Preview

Copilot AI Aug 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Use -n instead of ! -z for checking non-empty strings. The condition should be if [ -n "$1" ]; then for better readability and following shell scripting best practices.

Suggested change
if [ ! -z "$1" ]; then
if [ -n "$1" ]; then

Copilot uses AI. Check for mistakes.

sbatch_script_contents=`cat $1`
else
usage
exit 1
fi

# CPU vs GPU overrides
if [[ "$device" == "cpu" ]]; then
sbatch_device_opts="\
sbatch_cpu_opts="\
#SBATCH -p cpu-small # partition
#SBATCH --ntasks-per-node=24 # Number of cores per node required
#SBATCH --mem-per-cpu=2G # Memory per core\
"
elif [[ "$device" == "gpu" ]]; then
sbatch_device_opts="\

sbatch_gpu_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
"

if [ "$2" = "cpu" ]; then
sbatch_device_opts="$sbatch_cpu_opts"
elif [ "$2" = "gpu" ]; then
sbatch_device_opts="$sbatch_gpu_opts"
else
usage
usage
exit 1
fi

# submit and capture the JobID
JOBID=$(sbatch <<-EOT | awk '{print $4}'
#!/usr/bin/env bash
${sbatch_common_opts}
${sbatch_device_opts}
export job_slug="${job_slug}"
export device="${device}"
echo "Job slug is:" $job_slug
echo "Device is:" $device
set -e -x
cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# load your modules & env
. ./mfc.sh load -c p -m $device
sbatch <<EOT
#!/bin/bash
#SBATCH -Jshb-$job_slug # Job name
#SBATCH --account=gts-sbryngelson3 # charge account
#SBATCH -N1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -W # Do not exit until the submitted job terminates.
Copy link
Preview

Copilot AI Aug 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The -W flag makes sbatch wait for job completion, but this removes the ability to handle job failures gracefully. The original polling mechanism provided better error handling and cleanup capabilities.

Suggested change
#SBATCH -W # Do not exit until the submitted job terminates.

Copilot uses AI. Check for mistakes.

# user script contents
${sbatch_body}
EOT
)
set -e
set -x
echo "Submitted: SLURM job $JOBID"
cd "\$SLURM_SUBMIT_DIR"
echo "Running in $(pwd):"
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
job_slug="$job_slug"
job_device="$2"
# ────────── Poll until SLURM job finishes ──────────
while :; do
# Try sacct first
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
. ./mfc.sh load -c p -m $2
# Fallback to squeue if sacct is empty
if [[ -z "$STATE" ]]; then
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
fi
$sbatch_script_contents
# If it’s one of SLURM’s terminal states, break immediately
case "$STATE" in
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
break
;;
"")
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
break
;;
*)
echo "Waiting: SLURM job $JOBID state: $STATE"
sleep 10
;;
esac
done
EOT

# Now retrieve the exit code and exit with it
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
exit "$EXIT_CODE"
122 changes: 43 additions & 79 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
@@ -1,100 +1,64 @@
#!/usr/bin/env bash
set -euo pipefail
#!/bin/bash

set -e

# Report how the script must be invoked, then terminate.
# Arguments: none (the script name is taken from $0).
# Outputs:   one usage line on stderr; nothing is written to stdout, so
#            captured stdout stays free of diagnostics.
# Returns:   never returns; the shell exits with status 1.
usage() {
  printf '%s\n' "Usage: $0 [script.sh] [cpu|gpu]" >&2
  exit 1
}

[[ $# -eq 2 ]] || usage

sbatch_script="$1"
device="$2"

job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# read the body of the user script
sbatch_body=$(<"$sbatch_script")
if [ ! -z "$1" ]; then
Copy link
Preview

Copilot AI Aug 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Use -n instead of ! -z for checking non-empty strings. The condition should be if [ -n "$1" ]; then for better readability and following shell scripting best practices.

Suggested change
if [ ! -z "$1" ]; then
if [ -n "$1" ]; then

Copilot uses AI. Check for mistakes.

sbatch_script_contents=`cat $1`
else
usage
exit 1
fi

# common SBATCH directives
sbatch_common_opts="\
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name
#SBATCH --account=gts-sbryngelson3 # account
#SBATCH -N1 # nodes
#SBATCH -t 03:00:00 # walltime
#SBATCH -q embers # QOS
#SBATCH -o $job_slug.out # stdout+stderr
#SBATCH --mem-per-cpu=2G # default mem (overridden below)
sbatch_cpu_opts="\
#SBATCH -p cpu-small # partition
#SBATCH --ntasks-per-node=24 # Number of cores per node required
#SBATCH --mem-per-cpu=2G # Memory per core\
"

# CPU vs GPU overrides
if [[ "$device" == "cpu" ]]; then
sbatch_device_opts="\
#SBATCH -p cpu-small
#SBATCH --ntasks-per-node=24
"
elif [[ "$device" == "gpu" ]]; then
sbatch_device_opts="\
sbatch_gpu_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4
#SBATCH -G2
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
"

if [ "$2" = "cpu" ]; then
sbatch_device_opts="$sbatch_cpu_opts"
elif [ "$2" = "gpu" ]; then
sbatch_device_opts="$sbatch_gpu_opts"
else
usage
usage
exit 1
fi

# submit and capture the JobID
JOBID=$(sbatch <<-EOT | awk '{print $4}'
#!/usr/bin/env bash
${sbatch_common_opts}
${sbatch_device_opts}
set -e -x
cd "\$SLURM_SUBMIT_DIR"
echo "Running in \$(pwd):"
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2"

# load your modules & env
. ./mfc.sh load -c p -m $device
sbatch <<EOT
#!/bin/bash
#SBATCH -Jshb-$job_slug # Job name
#SBATCH --account=gts-sbryngelson3 # charge account
#SBATCH -N1 # Number of nodes required
$sbatch_device_opts
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins)
#SBATCH -q embers # QOS Name
#SBATCH -o$job_slug.out # Combined output and error messages file
#SBATCH -W # Do not exit until the submitted job terminates.
Copy link
Preview

Copilot AI Aug 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The -W flag makes sbatch wait for job completion, but this removes the ability to handle job failures gracefully. The original polling mechanism provided better error handling and cleanup capabilities.

Suggested change
#SBATCH -W # Do not exit until the submitted job terminates.

Copilot uses AI. Check for mistakes.

# user script contents
${sbatch_body}
EOT
)
set -e
set -x
echo "Submitted: SLURM job $JOBID"
cd "\$SLURM_SUBMIT_DIR"
echo "Running in $(pwd):"
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT
job_slug="$job_slug"
job_device="$2"
# ────────── Poll until SLURM job finishes ──────────
while :; do
# Try sacct first
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1)
. ./mfc.sh load -c p -m $2
# Fallback to squeue if sacct is empty
if [[ -z "$STATE" ]]; then
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "")
fi
$sbatch_script_contents
# If it’s one of SLURM’s terminal states, break immediately
case "$STATE" in
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED)
echo "Completed: SLURM job $JOBID reached terminal state: $STATE"
break
;;
"")
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished"
break
;;
*)
echo "Waiting: SLURM job $JOBID state: $STATE"
sleep 10
;;
esac
done
EOT

# Now retrieve the exit code and exit with it
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1)
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE"
exit "$EXIT_CODE"
16 changes: 3 additions & 13 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
#!/bin/bash

tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build
currentdir=$tmpbuild/run-$(( RANDOM % 900 ))
mkdir -p $tmpbuild
mkdir -p $currentdir
export TMPDIR=$currentdir

n_test_threads=8

build_opts=""
if [ "$job_device" = "gpu" ]; then
build_opts="--gpu"
fi

./mfc.sh test --dry-run -j $n_test_threads $build_opts
./mfc.sh test --dry-run -j 8 $build_opts

n_test_threads=8

Comment on lines +8 to 11
Copy link
Preview

Copilot AI Aug 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable n_test_threads is defined only after the dry-run command on line 8, and that command hardcodes the same value (8) instead of using the variable. This is inconsistent: the thread count is specified in two places. Define the variable first and pass it to both invocations.

Suggested change
./mfc.sh test --dry-run -j 8 $build_opts
n_test_threads=8
n_test_threads=8
./mfc.sh test --dry-run -j $n_test_threads $build_opts

Copilot uses AI. Check for mistakes.

if [ "$job_device" = "gpu" ]; then
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node
Expand All @@ -24,7 +18,3 @@ fi

./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix

sleep 10
rm -rf "$currentdir" || true

unset TMPDIR
Loading