From 9163a11e2cafae71f4be92ce3547bc8440c523f4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:08:43 -0500 Subject: [PATCH 01/14] Fix Frontier benchmark SLURM: use batch+1:59+normal QOS Benchmark jobs were using the extended partition (5:59 walltime, ENG160 account) causing multi-hour queue waits and hitting GHA's 8h wall-clock limit. The actual benchmark runs in ~20 minutes on the node. Switch to batch + 1:59 + --qos=normal (same as the test suite jobs). Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- .github/workflows/frontier/submit.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..5cf9681e33 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 480 + timeout-minutes: 240 steps: - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 16d4f0d73c..8b914db03e 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -45,10 +45,10 @@ fi # Select SBATCH params based on job type if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A ENG160" - sbatch_time="#SBATCH -t 05:59:00" - sbatch_partition="#SBATCH -p extended" - sbatch_extra="" + sbatch_account="#SBATCH -A CFD154" + sbatch_time="#SBATCH -t 01:59:00" + sbatch_partition="#SBATCH -p batch" + sbatch_extra="#SBATCH --qos=normal" else sbatch_account="#SBATCH -A CFD154" sbatch_time="#SBATCH -t 01:59:00" From ffe80ec2e01c5637955c0a21eb8c986ad7e2077c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:13:47 -0500 Subject: [PATCH 02/14] Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 5cf9681e33..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 240 + timeout-minutes: 480 steps: - name: Clone - PR uses: actions/checkout@v4 From cfbc02303fec44b63a51ed6a03f4853c8ce8be8b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:39:45 -0500 Subject: [PATCH 03/14] Remove persistent build cache for self-hosted test runners Replace setup-build-cache.sh symlink mechanism with rm -rf build before each test run on Phoenix and Frontier. Benchmark jobs unaffected. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/frontier/build.sh | 3 +-- .github/workflows/phoenix/test.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 88446ad2a0..6abb0cff8a 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,8 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -# Only set up build cache for test suite, not benchmarks if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" + rm -rf build fi source .github/scripts/retry-build.sh diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index 6816bd9a25..c8a5af2132 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -3,8 +3,7 @@ source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" -# Set up persistent build cache -source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface" +rm -rf build # Build with retry; smoke-test cached binaries to catch architecture mismatches # (SIGILL from binaries compiled on a different compute node). From 574203046c0f324127979718ae1c4932c67c22fc Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:50:47 -0500 Subject: [PATCH 04/14] Remove build cache from benchmark jobs on Phoenix and Frontier --- .github/workflows/frontier/build.sh | 4 +--- .github/workflows/phoenix/bench.sh | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 6abb0cff8a..d21b1ddac4 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,7 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -if [ "$run_bench" != "bench" ]; then - rm -rf build -fi +rm -rf build source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 0eafc485d1..e91ece366b 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -15,6 +15,8 @@ else bench_opts="--mem 1" fi +rm -rf build + source .github/scripts/retry-build.sh RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 From 7edb7c389e5fff6483ea45b5af7324b378ed60fa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:10:18 -0500 Subject: [PATCH 05/14] Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state When the runner process is killed (exit 137) before the SLURM job completes, sacct is used to verify the job's final state. If the SLURM job completed with exit 0:0, the CI step passes regardless of the monitor's exit code. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/phoenix/submit.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 5b7162fef7..c370ec5a3f 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -96,4 +96,20 @@ echo "Submitted batch job $job_id" # Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +monitor_exit=0 +bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." 
+ # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing." + else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi From 773f5adfcec5d9dc68b1fcff91d5eb0c492d6cfa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:28:40 -0500 Subject: [PATCH 06/14] Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh All three submit.sh scripts (phoenix, frontier, frontier_amd symlink) now call a single helper that wraps monitor_slurm_job.sh with sacct fallback: if the monitor is killed before the SLURM job completes, the helper re-checks the job's final state and exits 0 if it succeeded. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/run_monitored_slurm_job.sh | 37 ++++++++++++++++++++++ .github/workflows/frontier/submit.sh | 3 +- .github/workflows/phoenix/submit.sh | 19 +---------- 3 files changed, 39 insertions(+), 20 deletions(-) create mode 100644 .github/scripts/run_monitored_slurm_job.sh diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh new file mode 100644 index 0000000000..905520c45e --- /dev/null +++ b/.github/scripts/run_monitored_slurm_job.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL +# from the runner OS) before the SLURM job completes. 
When the monitor exits +# non-zero, sacct is used to verify the job's actual final state; if the SLURM +# job succeeded we exit 0 so the CI step is not falsely marked as failed. +# +# Usage: run_monitored_slurm_job.sh <job_id> <output_file> + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 <job_id> <output_file>" + exit 1 +fi + +job_id="$1" +output_file="$2" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +monitor_exit=0 +bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." + # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
+ else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 8b914db03e..4b472cd433 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -102,5 +102,4 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index c370ec5a3f..786489d1c4 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -94,22 +94,5 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -monitor_exit=0 -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? - -if [ "$monitor_exit" -ne 0 ]; then - echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." - # Give the SLURM epilog time to finalize if the job just finished - sleep 30 - final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") - echo "Final SLURM state=$final_state exit=$final_exit" - if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then - echo "SLURM job $job_id completed successfully despite monitor failure — continuing." 
- else - echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" - exit 1 - fi -fi +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" From 1311cbe4544ad75818f29e64ecec073248a20080 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 05:03:58 -0500 Subject: [PATCH 07/14] Reduce benchmark steps and switch Frontier bench to batch/normal QOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Cut benchmark time steps from 60-70 to 20 (GPU) / 10 (CPU) — still sufficient for grind time measurement - Unify Frontier SLURM config: bench now uses CFD154/batch/normal like tests instead of ENG160/extended (2hr wall time vs 6hr) - Reduce CI timeout from 8hr to 4hr Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 2 +- .github/workflows/frontier/submit.sh | 15 ++++----------- benchmarks/5eq_rk3_weno3_hllc/case.py | 4 ++-- benchmarks/hypo_hll/case.py | 4 ++-- benchmarks/ibm/case.py | 4 ++-- benchmarks/igr/case.py | 4 ++-- benchmarks/viscous_weno5_sgb_acoustic/case.py | 4 ++-- 7 files changed, 15 insertions(+), 22 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..5cf9681e33 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 480 + timeout-minutes: 240 steps: - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 4b472cd433..c5dc8a41d3 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -44,17 +44,10 @@ else fi # Select SBATCH params based on job type -if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH 
--qos=normal" -else - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -fi +sbatch_account="#SBATCH -A CFD154" +sbatch_time="#SBATCH -t 01:59:00" +sbatch_partition="#SBATCH -p batch" +sbatch_extra="#SBATCH --qos=normal" shard_suffix="" if [ -n "$4" ]; then diff --git a/benchmarks/5eq_rk3_weno3_hllc/case.py b/benchmarks/5eq_rk3_weno3_hllc/case.py index 5ecc327e8f..fa09426ffe 100644 --- a/benchmarks/5eq_rk3_weno3_hllc/case.py +++ b/benchmarks/5eq_rk3_weno3_hllc/case.py @@ -191,8 +191,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 3, "model_eqns": 2, diff --git a/benchmarks/hypo_hll/case.py b/benchmarks/hypo_hll/case.py index 1663a507aa..f8d0928a01 100644 --- a/benchmarks/hypo_hll/case.py +++ b/benchmarks/hypo_hll/case.py @@ -44,8 +44,8 @@ "p": Nz, "dt": 1e-8, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/benchmarks/ibm/case.py b/benchmarks/ibm/case.py index e16cb620b7..303cf7fcaf 100644 --- a/benchmarks/ibm/case.py +++ b/benchmarks/ibm/case.py @@ -48,8 +48,8 @@ "p": Nz, "dt": mydt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not 
None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/igr/case.py b/benchmarks/igr/case.py index 469bff1fa9..4ceed76257 100644 --- a/benchmarks/igr/case.py +++ b/benchmarks/igr/case.py @@ -63,8 +63,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/viscous_weno5_sgb_acoustic/case.py b/benchmarks/viscous_weno5_sgb_acoustic/case.py index 9f1351b0c1..83bdc43e9c 100644 --- a/benchmarks/viscous_weno5_sgb_acoustic/case.py +++ b/benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -94,8 +94,8 @@ "p": Nz, "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, From 644c9e4d27037011518fac5c22cd1d0794ed5c1c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 3 Mar 2026 17:04:20 -0500 Subject: [PATCH 08/14] Cap bench script parallelism at 64 to fix GNR node failures On GNR nodes (192 cores), 
$(nproc) returns 192 which overwhelms MPI daemons and causes SIGTERM (exit 143) during benchmarks. Master lands on a 24-core node and passes while PR lands on GNR and fails, making benchmarks appear broken by the PR. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/bench.sh | 5 ++++- .github/workflows/phoenix/bench.sh | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index b60f8541a2..b896feb17c 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -2,8 +2,11 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes. +n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) + if [ "$job_device" = "gpu" ]; then ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks fi diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index e91ece366b..9a661cb924 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -2,6 +2,10 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes +# (GNR nodes have 192 cores but nproc is too aggressive for build/bench). +n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) + tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild @@ -18,9 +22,9 @@ fi rm -rf build source .github/scripts/retry-build.sh -RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 +RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 -./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks +./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks sleep 10 rm -rf "$currentdir" || true From a02f4b20497a47f4504f051ee28d8a084bb19564 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 2 Mar 2026 20:36:56 -0500 Subject: [PATCH 09/14] Disable AVX-512 FP16 to fix build on Granite Rapids nodes gfortran 12+ with -march=native on Granite Rapids (GNR) CPUs emits vmovw instructions (AVX-512 FP16) that binutils 2.35 cannot assemble, causing LTO link failures. Add -mno-avx512fp16 when the compiler supports it. FP16 is unused in MFC's double-precision computations. Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddb3876724..3c5a80638f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,13 +224,24 @@ endif() if (CMAKE_BUILD_TYPE STREQUAL "Release") # Processor tuning: Check if we can target the host's native CPU's ISA. - CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) - if (SUPPORTS_MARCH_NATIVE) - add_compile_options($<$:-march=native>) - else() - CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) - if (SUPPORTS_MCPU_NATIVE) - add_compile_options($<$:-mcpu=native>) + # Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids) + # can emit instructions the system assembler doesn't support. 
+ if (NOT MFC_GCov) + CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) + if (SUPPORTS_MARCH_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>) + # Disable AVX-512 FP16: gfortran ≥12 emits vmovw instructions on + # Granite Rapids CPUs, but binutils <2.38 cannot assemble them. + # FP16 is unused in MFC's double-precision computations. + CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16) + if (SUPPORTS_MNO_AVX512FP16) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>) + endif() + else() + CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) + if (SUPPORTS_MCPU_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>) + endif() endif() endif() From ba91673f05785a1145f55d82af9758919b60fe23 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 2 Mar 2026 18:09:35 -0500 Subject: [PATCH 10/14] Fix Rich MarkupError crash when build output contains bracket paths Build errors containing [/tmp/...] paths (e.g. LTO linker output) were misinterpreted as Rich markup closing tags, crashing the error display and masking the actual build failure. Wrap raw output in Text() to prevent markup interpretation.
Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/build.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 6430f7ad35..08ff6d7510 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -1,6 +1,7 @@ import os, typing, hashlib, dataclasses, subprocess, re, time, sys, threading, queue from rich.panel import Panel +from rich.text import Text from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TaskProgressColumn from .case import Case @@ -273,14 +274,14 @@ def _show_build_error(result: subprocess.CompletedProcess, stage: str): stdout_text = result.stdout if isinstance(result.stdout, str) else result.stdout.decode('utf-8', errors='replace') stdout_text = stdout_text.strip() if stdout_text: - cons.raw.print(Panel(stdout_text, title="Output", border_style="yellow")) + cons.raw.print(Panel(Text(stdout_text), title="Output", border_style="yellow")) # Show stderr if available if result.stderr: stderr_text = result.stderr if isinstance(result.stderr, str) else result.stderr.decode('utf-8', errors='replace') stderr_text = stderr_text.strip() if stderr_text: - cons.raw.print(Panel(stderr_text, title="Errors", border_style="red")) + cons.raw.print(Panel(Text(stderr_text), title="Errors", border_style="red")) cons.print() From 3e773fffd895174160cb7e02b272e93028f17740 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:07:43 -0500 Subject: [PATCH 11/14] Address bot review comments: sacct -X flag, dead job_type var, stale comment --- .github/scripts/run_monitored_slurm_job.sh | 2 +- .github/workflows/frontier/submit.sh | 7 ------- .github/workflows/phoenix/test.sh | 4 ++-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh index 905520c45e..22141043ad 100644 --- a/.github/scripts/run_monitored_slurm_job.sh +++ 
b/.github/scripts/run_monitored_slurm_job.sh @@ -26,7 +26,7 @@ if [ "$monitor_exit" -ne 0 ]; then # Give the SLURM epilog time to finalize if the job just finished sleep 30 final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") echo "Final SLURM state=$final_state exit=$final_exit" if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then echo "SLURM job $job_id completed successfully despite monitor failure — continuing." diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index c5dc8a41d3..070a03094b 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -25,13 +25,6 @@ else exit 1 fi -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - if [ "$2" = "cpu" ]; then sbatch_device_opts="\ #SBATCH -n 32 # Number of cores required" diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index c8a5af2132..3e8c9caa66 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -5,8 +5,8 @@ build_opts="$gpu_opts" rm -rf build -# Build with retry; smoke-test cached binaries to catch architecture mismatches -# (SIGILL from binaries compiled on a different compute node). +# Build with retry; smoke-test the freshly built syscheck binary to catch +# architecture mismatches (SIGILL from binaries compiled on a different compute node). 
source .github/scripts/retry-build.sh RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \ retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 From fae2e6a08a2971d5f91e50e0063fca08a8f70b70 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:14:25 -0500 Subject: [PATCH 12/14] Fix bench: use PR's submit.sh for master job to get SIGKILL recovery When benchmarking master vs PR, submit_and_monitor_bench.sh was using the master directory's submit.sh for the master bench job. Master's submit.sh calls monitor_slurm_job.sh directly without SIGKILL recovery. When the monitor was killed (exit 137), the master bench YAML was never found. Fix: always use the PR's submit.sh (which calls run_monitored_slurm_job.sh with sacct fallback) for both master and PR bench submissions. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index c081c8692a..9eae6b9ff7 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -17,9 +17,13 @@ cluster="$4" echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Submit and monitor job (submit.sh auto-detects bench mode from script name) -bash .github/workflows/$cluster/submit.sh \ - .github/workflows/$cluster/bench.sh "$device" "$interface" +# Always use the PR's submit.sh so both master and PR builds benefit from the +# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is +# still resolved relative to the current directory (master/ or pr/) so the +# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs +# in the right directory regardless of which submit.sh is invoked. 
+PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh" +bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface" # Verify the YAML output file was created job_slug="bench-$device-$interface" From 3224931537e141cee2c0c977e49bfa2307d6d4ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:21:10 -0500 Subject: [PATCH 13/14] Fix submit_and_monitor_bench.sh: define SCRIPT_DIR before use Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 9eae6b9ff7..e0a6eb7384 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,6 +14,8 @@ device="$2" interface="$3" cluster="$4" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" From 2887def4d0c2fef2a1d202493120247063bc2e18 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 7 Mar 2026 03:59:20 -0500 Subject: [PATCH 14/14] bench: update Phoenix tmpbuild path to project storage --- .github/workflows/phoenix/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 9a661cb924..218cf68a5f 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -6,7 +6,7 @@ source .github/scripts/bench-preamble.sh # (GNR nodes have 192 cores but nproc is too aggressive for build/bench). n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build +tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild mkdir -p $currentdir