-
Notifications
You must be signed in to change notification settings - Fork 117
Change unknown job device #978
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
@@ -1,107 +1,64 @@ | ||||
#!/usr/bin/env bash | ||||
set -euo pipefail | ||||
#!/bin/bash | ||||
|
||||
set -e | ||||
|
||||
usage() { | ||||
echo "Usage: $0 [script.sh] [cpu|gpu]" | ||||
exit 1 | ||||
} | ||||
|
||||
[[ $# -eq 2 ]] || usage | ||||
|
||||
sbatch_script="$1" | ||||
|
||||
device="$2" | ||||
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" | ||||
|
||||
# read the body of the user script | ||||
sbatch_body=$(<"$sbatch_script") | ||||
|
||||
# common SBATCH directives | ||||
sbatch_common_opts="\ | ||||
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name | ||||
#SBATCH --account=gts-sbryngelson3 # account | ||||
#SBATCH -N1 # nodes | ||||
#SBATCH -t 02:00:00 # walltime | ||||
#SBATCH -q embers # QOS | ||||
#SBATCH -o $job_slug.out # stdout+stderr | ||||
#SBATCH --mem-per-cpu=2G # default mem (overridden below) | ||||
" | ||||
if [ ! -z "$1" ]; then | ||||
sbatch_script_contents=`cat $1` | ||||
else | ||||
usage | ||||
exit 1 | ||||
fi | ||||
|
||||
# CPU vs GPU overrides | ||||
if [[ "$device" == "cpu" ]]; then | ||||
sbatch_device_opts="\ | ||||
sbatch_cpu_opts="\ | ||||
#SBATCH -p cpu-small # partition | ||||
#SBATCH --ntasks-per-node=24 # Number of cores per node required | ||||
#SBATCH --mem-per-cpu=2G # Memory per core\ | ||||
" | ||||
elif [[ "$device" == "gpu" ]]; then | ||||
sbatch_device_opts="\ | ||||
|
||||
sbatch_gpu_opts="\ | ||||
#SBATCH -CL40S | ||||
#SBATCH --ntasks-per-node=4 # Number of cores per node required | ||||
#SBATCH -G2\ | ||||
" | ||||
|
||||
if [ "$2" = "cpu" ]; then | ||||
sbatch_device_opts="$sbatch_cpu_opts" | ||||
elif [ "$2" = "gpu" ]; then | ||||
sbatch_device_opts="$sbatch_gpu_opts" | ||||
else | ||||
usage | ||||
usage | ||||
exit 1 | ||||
fi | ||||
|
||||
# submit and capture the JobID | ||||
JOBID=$(sbatch <<-EOT | awk '{print $4}' | ||||
#!/usr/bin/env bash | ||||
${sbatch_common_opts} | ||||
${sbatch_device_opts} | ||||
export job_slug="${job_slug}" | ||||
export device="${device}" | ||||
echo "Job slug is:" $job_slug | ||||
echo "Device is:" $device | ||||
set -e -x | ||||
cd "\$SLURM_SUBMIT_DIR" | ||||
echo "Running in \$(pwd):" | ||||
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" | ||||
|
||||
# load your modules & env | ||||
. ./mfc.sh load -c p -m $device | ||||
sbatch <<EOT | ||||
#!/bin/bash | ||||
#SBATCH -Jshb-$job_slug # Job name | ||||
#SBATCH --account=gts-sbryngelson3 # charge account | ||||
#SBATCH -N1 # Number of nodes required | ||||
$sbatch_device_opts | ||||
#SBATCH -t 02:00:00 # Duration of the job (Ex: 15 mins) | ||||
#SBATCH -q embers # QOS Name | ||||
#SBATCH -o$job_slug.out # Combined output and error messages file | ||||
#SBATCH -W # Do not exit until the submitted job terminates. | ||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||
# user script contents | ||||
${sbatch_body} | ||||
EOT | ||||
) | ||||
set -e | ||||
set -x | ||||
echo "Submitted: SLURM job $JOBID" | ||||
cd "\$SLURM_SUBMIT_DIR" | ||||
echo "Running in $(pwd):" | ||||
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up | ||||
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT | ||||
job_slug="$job_slug" | ||||
job_device="$2" | ||||
# ────────── Poll until SLURM job finishes ────────── | ||||
while :; do | ||||
# Try sacct first | ||||
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1) | ||||
. ./mfc.sh load -c p -m $2 | ||||
# Fallback to squeue if sacct is empty | ||||
if [[ -z "$STATE" ]]; then | ||||
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "") | ||||
fi | ||||
$sbatch_script_contents | ||||
# If it’s one of SLURM’s terminal states, break immediately | ||||
case "$STATE" in | ||||
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED) | ||||
echo "Completed: SLURM job $JOBID reached terminal state: $STATE" | ||||
break | ||||
;; | ||||
"") | ||||
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished" | ||||
break | ||||
;; | ||||
*) | ||||
echo "Waiting: SLURM job $JOBID state: $STATE" | ||||
sleep 10 | ||||
;; | ||||
esac | ||||
done | ||||
EOT | ||||
|
||||
# Now retrieve the exit code and exit with it | ||||
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1) | ||||
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE" | ||||
exit "$EXIT_CODE" |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,100 +1,64 @@ | ||||||
#!/usr/bin/env bash | ||||||
set -euo pipefail | ||||||
#!/bin/bash | ||||||
|
||||||
set -e | ||||||
|
||||||
usage() { | ||||||
echo "Usage: $0 [script.sh] [cpu|gpu]" | ||||||
exit 1 | ||||||
} | ||||||
|
||||||
[[ $# -eq 2 ]] || usage | ||||||
|
||||||
sbatch_script="$1" | ||||||
device="$2" | ||||||
|
||||||
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" | ||||||
|
||||||
# read the body of the user script | ||||||
sbatch_body=$(<"$sbatch_script") | ||||||
if [ ! -z "$1" ]; then | ||||||
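There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. [nitpick] Use `-n` instead of `! -z` for checking non-empty strings. The condition should be `if [ -n "$1" ]; then` for better readability and following shell scripting best practices.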
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
sbatch_script_contents=`cat $1` | ||||||
else | ||||||
usage | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
# common SBATCH directives | ||||||
sbatch_common_opts="\ | ||||||
#SBATCH -J shb-${sbatch_script%%.sh}-$device # job name | ||||||
#SBATCH --account=gts-sbryngelson3 # account | ||||||
#SBATCH -N1 # nodes | ||||||
#SBATCH -t 03:00:00 # walltime | ||||||
#SBATCH -q embers # QOS | ||||||
#SBATCH -o $job_slug.out # stdout+stderr | ||||||
#SBATCH --mem-per-cpu=2G # default mem (overridden below) | ||||||
sbatch_cpu_opts="\ | ||||||
#SBATCH -p cpu-small # partition | ||||||
#SBATCH --ntasks-per-node=24 # Number of cores per node required | ||||||
#SBATCH --mem-per-cpu=2G # Memory per core\ | ||||||
" | ||||||
|
||||||
# CPU vs GPU overrides | ||||||
if [[ "$device" == "cpu" ]]; then | ||||||
sbatch_device_opts="\ | ||||||
#SBATCH -p cpu-small | ||||||
#SBATCH --ntasks-per-node=24 | ||||||
" | ||||||
elif [[ "$device" == "gpu" ]]; then | ||||||
sbatch_device_opts="\ | ||||||
sbatch_gpu_opts="\ | ||||||
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s | ||||||
#SBATCH --ntasks-per-node=4 | ||||||
#SBATCH -G2 | ||||||
#SBATCH --ntasks-per-node=4 # Number of cores per node required | ||||||
#SBATCH -G2\ | ||||||
" | ||||||
|
||||||
if [ "$2" = "cpu" ]; then | ||||||
sbatch_device_opts="$sbatch_cpu_opts" | ||||||
elif [ "$2" = "gpu" ]; then | ||||||
sbatch_device_opts="$sbatch_gpu_opts" | ||||||
else | ||||||
usage | ||||||
usage | ||||||
exit 1 | ||||||
fi | ||||||
|
||||||
# submit and capture the JobID | ||||||
JOBID=$(sbatch <<-EOT | awk '{print $4}' | ||||||
#!/usr/bin/env bash | ||||||
${sbatch_common_opts} | ||||||
${sbatch_device_opts} | ||||||
set -e -x | ||||||
cd "\$SLURM_SUBMIT_DIR" | ||||||
echo "Running in \$(pwd):" | ||||||
job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2" | ||||||
|
||||||
# load your modules & env | ||||||
. ./mfc.sh load -c p -m $device | ||||||
sbatch <<EOT | ||||||
#!/bin/bash | ||||||
#SBATCH -Jshb-$job_slug # Job name | ||||||
#SBATCH --account=gts-sbryngelson3 # charge account | ||||||
#SBATCH -N1 # Number of nodes required | ||||||
$sbatch_device_opts | ||||||
#SBATCH -t 03:00:00 # Duration of the job (Ex: 15 mins) | ||||||
#SBATCH -q embers # QOS Name | ||||||
#SBATCH -o$job_slug.out # Combined output and error messages file | ||||||
#SBATCH -W # Do not exit until the submitted job terminates. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||
# user script contents | ||||||
${sbatch_body} | ||||||
EOT | ||||||
) | ||||||
set -e | ||||||
set -x | ||||||
echo "Submitted: SLURM job $JOBID" | ||||||
cd "\$SLURM_SUBMIT_DIR" | ||||||
echo "Running in $(pwd):" | ||||||
# if this wrapper is killed/canceled, make sure SLURM job is cleaned up | ||||||
trap '[[ -n "${JOBID:-}" ]] && scancel "$JOBID" >/dev/null 2>&1 || :' EXIT | ||||||
job_slug="$job_slug" | ||||||
job_device="$2" | ||||||
# ────────── Poll until SLURM job finishes ────────── | ||||||
while :; do | ||||||
# Try sacct first | ||||||
STATE=$(sacct -j "$JOBID" --format=State --noheader --parsable2 | head -n1) | ||||||
. ./mfc.sh load -c p -m $2 | ||||||
# Fallback to squeue if sacct is empty | ||||||
if [[ -z "$STATE" ]]; then | ||||||
STATE=$(squeue -j "$JOBID" -h -o "%T" || echo "") | ||||||
fi | ||||||
$sbatch_script_contents | ||||||
# If it’s one of SLURM’s terminal states, break immediately | ||||||
case "$STATE" in | ||||||
COMPLETED|FAILED|CANCELLED|TIMEOUT|PREEMPTED) | ||||||
echo "Completed: SLURM job $JOBID reached terminal state: $STATE" | ||||||
break | ||||||
;; | ||||||
"") | ||||||
echo "Completed: SLURM job $JOBID no longer in queue; assuming finished" | ||||||
break | ||||||
;; | ||||||
*) | ||||||
echo "Waiting: SLURM job $JOBID state: $STATE" | ||||||
sleep 10 | ||||||
;; | ||||||
esac | ||||||
done | ||||||
EOT | ||||||
|
||||||
# Now retrieve the exit code and exit with it | ||||||
EXIT_CODE=$(sacct -j "$JOBID" --noheader --format=ExitCode | head -1 | cut -d: -f1) | ||||||
echo "Completed: SLURM job $JOBID exit code: $EXIT_CODE" | ||||||
exit "$EXIT_CODE" |
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,19 +1,13 @@ | ||||||||||||
#!/bin/bash | ||||||||||||
|
||||||||||||
tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build | ||||||||||||
currentdir=$tmpbuild/run-$(( RANDOM % 900 )) | ||||||||||||
mkdir -p $tmpbuild | ||||||||||||
mkdir -p $currentdir | ||||||||||||
export TMPDIR=$currentdir | ||||||||||||
|
||||||||||||
n_test_threads=8 | ||||||||||||
|
||||||||||||
build_opts="" | ||||||||||||
if [ "$job_device" = "gpu" ]; then | ||||||||||||
build_opts="--gpu" | ||||||||||||
fi | ||||||||||||
|
||||||||||||
./mfc.sh test --dry-run -j $n_test_threads $build_opts | ||||||||||||
./mfc.sh test --dry-run -j 8 $build_opts | ||||||||||||
|
||||||||||||
n_test_threads=8 | ||||||||||||
|
||||||||||||
Comment on lines +8 to 11
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The variable `n_test_threads` is defined after it's used on line 8. This creates a logical inconsistency where the hardcoded value 8 is used before the variable is defined with the same value.
Suggested change
Copilot uses AI. Check for mistakes. Positive FeedbackNegative Feedback |
||||||||||||
if [ "$job_device" = "gpu" ]; then | ||||||||||||
gpu_count=$(nvidia-smi -L | wc -l) # number of GPUs on node | ||||||||||||
|
@@ -24,7 +18,3 @@ fi | |||||||||||
|
||||||||||||
./mfc.sh test --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix | ||||||||||||
|
||||||||||||
sleep 10 | ||||||||||||
rm -rf "$currentdir" || true | ||||||||||||
|
||||||||||||
unset TMPDIR |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] Use `-n` instead of `! -z` for checking non-empty strings. The condition should be `if [ -n "$1" ]; then` for better readability and following shell scripting best practices.
Copilot uses AI. Check for mistakes.