From 9163a11e2cafae71f4be92ce3547bc8440c523f4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:08:43 -0500 Subject: [PATCH 01/14] Fix Frontier benchmark SLURM: use batch+1:59+normal QOS Benchmark jobs were using the extended partition (5:59 walltime, ENG160 account) causing multi-hour queue waits and hitting GHA's 8h wall-clock limit. The actual benchmark runs in ~20 minutes on the node. Switch to batch + 1:59 + --qos=normal (same as the test suite jobs). Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- .github/workflows/frontier/submit.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..5cf9681e33 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 480 + timeout-minutes: 240 steps: - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 16d4f0d73c..8b914db03e 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -45,10 +45,10 @@ fi # Select SBATCH params based on job type if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A ENG160" - sbatch_time="#SBATCH -t 05:59:00" - sbatch_partition="#SBATCH -p extended" - sbatch_extra="" + sbatch_account="#SBATCH -A CFD154" + sbatch_time="#SBATCH -t 01:59:00" + sbatch_partition="#SBATCH -p batch" + sbatch_extra="#SBATCH --qos=normal" else sbatch_account="#SBATCH -A CFD154" sbatch_time="#SBATCH -t 01:59:00" From ffe80ec2e01c5637955c0a21eb8c986ad7e2077c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 02:13:47 -0500 Subject: [PATCH 02/14] Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240) Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/bench.yml | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 5cf9681e33..b45fc45e40 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 240 + timeout-minutes: 480 steps: - name: Clone - PR uses: actions/checkout@v4 From cfbc02303fec44b63a51ed6a03f4853c8ce8be8b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:39:45 -0500 Subject: [PATCH 03/14] Remove persistent build cache for self-hosted test runners Replace setup-build-cache.sh symlink mechanism with rm -rf build before each test run on Phoenix and Frontier. Benchmark jobs unaffected. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/frontier/build.sh | 3 +-- .github/workflows/phoenix/test.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 88446ad2a0..6abb0cff8a 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,8 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -# Only set up build cache for test suite, not benchmarks if [ "$run_bench" != "bench" ]; then - source .github/scripts/setup-build-cache.sh "$cluster_name" "$job_device" "$job_interface" + rm -rf build fi source .github/scripts/retry-build.sh diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index 6816bd9a25..c8a5af2132 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -3,8 +3,7 @@ source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" -# Set up persistent build cache -source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface" +rm -rf build # Build with retry; smoke-test cached binaries to catch architecture mismatches # (SIGILL from binaries compiled on a different compute node). From 574203046c0f324127979718ae1c4932c67c22fc Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 11:50:47 -0500 Subject: [PATCH 04/14] Remove build cache from benchmark jobs on Phoenix and Frontier --- .github/workflows/frontier/build.sh | 4 +--- .github/workflows/phoenix/bench.sh | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 6abb0cff8a..d21b1ddac4 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,9 +20,7 @@ build_opts="$gpu_opts" . 
./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -if [ "$run_bench" != "bench" ]; then - rm -rf build -fi +rm -rf build source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 0eafc485d1..e91ece366b 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -15,6 +15,8 @@ else bench_opts="--mem 1" fi +rm -rf build + source .github/scripts/retry-build.sh RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 From 7edb7c389e5fff6483ea45b5af7324b378ed60fa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:10:18 -0500 Subject: [PATCH 05/14] Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state When the runner process is killed (exit 137) before the SLURM job completes, sacct is used to verify the job's final state. If the SLURM job completed with exit 0:0, the CI step passes regardless of the monitor's exit code. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/phoenix/submit.sh | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index 5b7162fef7..c370ec5a3f 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -96,4 +96,20 @@ echo "Submitted batch job $job_id" # Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +monitor_exit=0 +bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." 
+ # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing." + else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi From 773f5adfcec5d9dc68b1fcff91d5eb0c492d6cfa Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 14:28:40 -0500 Subject: [PATCH 06/14] Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh All three submit.sh scripts (phoenix, frontier, frontier_amd symlink) now call a single helper that wraps monitor_slurm_job.sh with sacct fallback: if the monitor is killed before the SLURM job completes, the helper re-checks the job's final state and exits 0 if it succeeded. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/run_monitored_slurm_job.sh | 37 ++++++++++++++++++++++ .github/workflows/frontier/submit.sh | 3 +- .github/workflows/phoenix/submit.sh | 19 +---------- 3 files changed, 39 insertions(+), 20 deletions(-) create mode 100644 .github/scripts/run_monitored_slurm_job.sh diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh new file mode 100644 index 0000000000..905520c45e --- /dev/null +++ b/.github/scripts/run_monitored_slurm_job.sh @@ -0,0 +1,37 @@ +#!/bin/bash +# Run monitor_slurm_job.sh and recover if the monitor is killed (e.g. SIGKILL +# from the runner OS) before the SLURM job completes. 
When the monitor exits +# non-zero, sacct is used to verify the job's actual final state; if the SLURM +# job succeeded we exit 0 so the CI step is not falsely marked as failed. +# +# Usage: run_monitored_slurm_job.sh <job_id> <output_file> + +set -euo pipefail + +if [ $# -ne 2 ]; then + echo "Usage: $0 <job_id> <output_file>" + exit 1 +fi + +job_id="$1" +output_file="$2" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +monitor_exit=0 +bash "$SCRIPT_DIR/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? + +if [ "$monitor_exit" -ne 0 ]; then + echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." + # Give the SLURM epilog time to finalize if the job just finished + sleep 30 + final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") + final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + echo "Final SLURM state=$final_state exit=$final_exit" + if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then + echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
+ else + echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" + exit 1 + fi +fi diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 8b914db03e..4b472cd433 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -102,5 +102,4 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" diff --git a/.github/workflows/phoenix/submit.sh b/.github/workflows/phoenix/submit.sh index c370ec5a3f..786489d1c4 100755 --- a/.github/workflows/phoenix/submit.sh +++ b/.github/workflows/phoenix/submit.sh @@ -94,22 +94,5 @@ fi echo "Submitted batch job $job_id" -# Use resilient monitoring instead of sbatch -W SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -monitor_exit=0 -bash "$SCRIPT_DIR/../../scripts/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$? - -if [ "$monitor_exit" -ne 0 ]; then - echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..." - # Give the SLURM epilog time to finalize if the job just finished - sleep 30 - final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") - echo "Final SLURM state=$final_state exit=$final_exit" - if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then - echo "SLURM job $job_id completed successfully despite monitor failure — continuing." 
- else - echo "ERROR: SLURM job $job_id did not complete successfully (state=$final_state exit=$final_exit)" - exit 1 - fi -fi +bash "$SCRIPT_DIR/../../scripts/run_monitored_slurm_job.sh" "$job_id" "$output_file" From 1311cbe4544ad75818f29e64ecec073248a20080 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 5 Mar 2026 05:03:58 -0500 Subject: [PATCH 07/14] Reduce benchmark steps and switch Frontier bench to batch/normal QOS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Cut benchmark time steps from 60-70 to 20 (GPU) / 10 (CPU) — still sufficient for grind time measurement - Unify Frontier SLURM config: bench now uses CFD154/batch/normal like tests instead of ENG160/extended (2hr wall time vs 6hr) - Reduce CI timeout from 8hr to 4hr Co-Authored-By: Claude Opus 4.6 --- .github/workflows/bench.yml | 2 +- .github/workflows/frontier/submit.sh | 15 ++++----------- benchmarks/5eq_rk3_weno3_hllc/case.py | 4 ++-- benchmarks/hypo_hll/case.py | 4 ++-- benchmarks/ibm/case.py | 4 ++-- benchmarks/igr/case.py | 4 ++-- benchmarks/viscous_weno5_sgb_acoustic/case.py | 4 ++-- 7 files changed, 15 insertions(+), 22 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b45fc45e40..5cf9681e33 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -88,7 +88,7 @@ jobs: runs-on: group: ${{ matrix.group }} labels: ${{ matrix.labels }} - timeout-minutes: 480 + timeout-minutes: 240 steps: - name: Clone - PR uses: actions/checkout@v4 diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index 4b472cd433..c5dc8a41d3 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -44,17 +44,10 @@ else fi # Select SBATCH params based on job type -if [ "$job_type" = "bench" ]; then - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH 
--qos=normal" -else - sbatch_account="#SBATCH -A CFD154" - sbatch_time="#SBATCH -t 01:59:00" - sbatch_partition="#SBATCH -p batch" - sbatch_extra="#SBATCH --qos=normal" -fi +sbatch_account="#SBATCH -A CFD154" +sbatch_time="#SBATCH -t 01:59:00" +sbatch_partition="#SBATCH -p batch" +sbatch_extra="#SBATCH --qos=normal" shard_suffix="" if [ -n "$4" ]; then diff --git a/benchmarks/5eq_rk3_weno3_hllc/case.py b/benchmarks/5eq_rk3_weno3_hllc/case.py index 5ecc327e8f..fa09426ffe 100644 --- a/benchmarks/5eq_rk3_weno3_hllc/case.py +++ b/benchmarks/5eq_rk3_weno3_hllc/case.py @@ -191,8 +191,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 3, "model_eqns": 2, diff --git a/benchmarks/hypo_hll/case.py b/benchmarks/hypo_hll/case.py index 1663a507aa..f8d0928a01 100644 --- a/benchmarks/hypo_hll/case.py +++ b/benchmarks/hypo_hll/case.py @@ -44,8 +44,8 @@ "p": Nz, "dt": 1e-8, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, diff --git a/benchmarks/ibm/case.py b/benchmarks/ibm/case.py index e16cb620b7..303cf7fcaf 100644 --- a/benchmarks/ibm/case.py +++ b/benchmarks/ibm/case.py @@ -48,8 +48,8 @@ "p": Nz, "dt": mydt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not 
None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/igr/case.py b/benchmarks/igr/case.py index 469bff1fa9..4ceed76257 100644 --- a/benchmarks/igr/case.py +++ b/benchmarks/igr/case.py @@ -63,8 +63,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(7 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/benchmarks/viscous_weno5_sgb_acoustic/case.py b/benchmarks/viscous_weno5_sgb_acoustic/case.py index 9f1351b0c1..83bdc43e9c 100644 --- a/benchmarks/viscous_weno5_sgb_acoustic/case.py +++ b/benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -94,8 +94,8 @@ "p": Nz, "dt": dt, "t_step_start": 0, - "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), - "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(6 * (5 * size + 5)), + "t_step_stop": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), + "t_step_save": ARGS["steps"] if ARGS["steps"] is not None else int(2 * (5 * size + 5)), # Simulation Algorithm Parameters "num_patches": 2, "model_eqns": 2, From 644c9e4d27037011518fac5c22cd1d0794ed5c1c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 3 Mar 2026 17:04:20 -0500 Subject: [PATCH 08/14] Cap bench script parallelism at 64 to fix GNR node failures On GNR nodes (192 cores), 
$(nproc) returns 192 which overwhelms MPI daemons and causes SIGTERM (exit 143) during benchmarks. Master lands on a 24-core node and passes while PR lands on GNR and fails, making benchmarks appear broken by the PR. Co-Authored-By: Claude Opus 4.6 --- .github/workflows/frontier/bench.sh | 5 ++++- .github/workflows/phoenix/bench.sh | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/frontier/bench.sh b/.github/workflows/frontier/bench.sh index b60f8541a2..b896feb17c 100644 --- a/.github/workflows/frontier/bench.sh +++ b/.github/workflows/frontier/bench.sh @@ -2,8 +2,11 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes. +n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) + if [ "$job_device" = "gpu" ]; then ./mfc.sh bench --mem 4 -j $n_ranks -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks else - ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks + ./mfc.sh bench --mem 1 -j $n_jobs -o "$job_slug.yaml" -- -c $job_cluster $device_opts -n $n_ranks fi diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index e91ece366b..9a661cb924 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -2,6 +2,10 @@ source .github/scripts/bench-preamble.sh +# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes +# (GNR nodes have 192 cores but nproc is too aggressive for build/bench). +n_jobs=$(( $(nproc) > 64 ? 
64 : $(nproc) )) + tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild @@ -18,9 +22,9 @@ fi rm -rf build source .github/scripts/retry-build.sh -RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $(nproc) $build_opts || exit 1 +RETRY_CLEAN_CMD="./mfc.sh clean" retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 -./mfc.sh bench $bench_opts -j $(nproc) -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks +./mfc.sh bench $bench_opts -j $n_jobs -o "$job_slug.yaml" -- -c phoenix-bench $device_opts -n $n_ranks sleep 10 rm -rf "$currentdir" || true From a02f4b20497a47f4504f051ee28d8a084bb19564 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 2 Mar 2026 20:36:56 -0500 Subject: [PATCH 09/14] Disable AVX-512 FP16 to fix build on Granite Rapids nodes gfortran 12+ with -march=native on Granite Rapids (GNR) CPUs emits vmovw instructions (AVX-512 FP16) that binutils 2.35 cannot assemble, causing LTO link failures. Add -mno-avx512fp16 when the compiler supports it. FP16 is unused in MFC's double-precision computations. Co-Authored-By: Claude Opus 4.6 --- CMakeLists.txt | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddb3876724..3c5a80638f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -224,13 +224,24 @@ endif() if (CMAKE_BUILD_TYPE STREQUAL "Release") # Processor tuning: Check if we can target the host's native CPU's ISA. - CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) - if (SUPPORTS_MARCH_NATIVE) - add_compile_options($<$:-march=native>) - else() - CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) - if (SUPPORTS_MCPU_NATIVE) - add_compile_options($<$:-mcpu=native>) + # Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids) + # can emit instructions the system assembler doesn't support. 
+ if (NOT MFC_GCov) + CHECK_FORTRAN_COMPILER_FLAG("-march=native" SUPPORTS_MARCH_NATIVE) + if (SUPPORTS_MARCH_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-march=native>) + # Disable AVX-512 FP16: gfortran ≥12 emits vmovw instructions on + # Granite Rapids CPUs, but binutils <2.38 cannot assemble them. + # FP16 is unused in MFC's double-precision computations. + CHECK_FORTRAN_COMPILER_FLAG("-mno-avx512fp16" SUPPORTS_MNO_AVX512FP16) + if (SUPPORTS_MNO_AVX512FP16) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mno-avx512fp16>) + endif() + else() + CHECK_FORTRAN_COMPILER_FLAG("-mcpu=native" SUPPORTS_MCPU_NATIVE) + if (SUPPORTS_MCPU_NATIVE) + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-mcpu=native>) + endif() endif() endif() From ba91673f05785a1145f55d82af9758919b60fe23 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 2 Mar 2026 18:09:35 -0500 Subject: [PATCH 10/14] Fix Rich MarkupError crash when build output contains bracket paths Build errors containing [/tmp/...] paths (e.g. LTO linker output) were misinterpreted as Rich markup closing tags, crashing the error display and masking the actual build failure. Wrap raw output in Text() to prevent markup interpretation.
Co-Authored-By: Claude Opus 4.6 --- toolchain/mfc/build.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 6430f7ad35..08ff6d7510 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -1,6 +1,7 @@ import os, typing, hashlib, dataclasses, subprocess, re, time, sys, threading, queue from rich.panel import Panel +from rich.text import Text from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn, TaskProgressColumn from .case import Case @@ -273,14 +274,14 @@ def _show_build_error(result: subprocess.CompletedProcess, stage: str): stdout_text = result.stdout if isinstance(result.stdout, str) else result.stdout.decode('utf-8', errors='replace') stdout_text = stdout_text.strip() if stdout_text: - cons.raw.print(Panel(stdout_text, title="Output", border_style="yellow")) + cons.raw.print(Panel(Text(stdout_text), title="Output", border_style="yellow")) # Show stderr if available if result.stderr: stderr_text = result.stderr if isinstance(result.stderr, str) else result.stderr.decode('utf-8', errors='replace') stderr_text = stderr_text.strip() if stderr_text: - cons.raw.print(Panel(stderr_text, title="Errors", border_style="red")) + cons.raw.print(Panel(Text(stderr_text), title="Errors", border_style="red")) cons.print() From 3e773fffd895174160cb7e02b272e93028f17740 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:07:43 -0500 Subject: [PATCH 11/14] Address bot review comments: sacct -X flag, dead job_type var, stale comment --- .github/scripts/run_monitored_slurm_job.sh | 2 +- .github/workflows/frontier/submit.sh | 7 ------- .github/workflows/phoenix/test.sh | 4 ++-- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/.github/scripts/run_monitored_slurm_job.sh b/.github/scripts/run_monitored_slurm_job.sh index 905520c45e..22141043ad 100644 --- a/.github/scripts/run_monitored_slurm_job.sh +++ 
b/.github/scripts/run_monitored_slurm_job.sh @@ -26,7 +26,7 @@ if [ "$monitor_exit" -ne 0 ]; then # Give the SLURM epilog time to finalize if the job just finished sleep 30 final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN") - final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") + final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "") echo "Final SLURM state=$final_state exit=$final_exit" if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then echo "SLURM job $job_id completed successfully despite monitor failure — continuing." diff --git a/.github/workflows/frontier/submit.sh b/.github/workflows/frontier/submit.sh index c5dc8a41d3..070a03094b 100644 --- a/.github/workflows/frontier/submit.sh +++ b/.github/workflows/frontier/submit.sh @@ -25,13 +25,6 @@ else exit 1 fi -# Detect job type from submitted script basename -script_basename="$(basename "$1" .sh)" -case "$script_basename" in - bench*) job_type="bench" ;; - *) job_type="test" ;; -esac - if [ "$2" = "cpu" ]; then sbatch_device_opts="\ #SBATCH -n 32 # Number of cores required" diff --git a/.github/workflows/phoenix/test.sh b/.github/workflows/phoenix/test.sh index c8a5af2132..3e8c9caa66 100644 --- a/.github/workflows/phoenix/test.sh +++ b/.github/workflows/phoenix/test.sh @@ -5,8 +5,8 @@ build_opts="$gpu_opts" rm -rf build -# Build with retry; smoke-test cached binaries to catch architecture mismatches -# (SIGILL from binaries compiled on a different compute node). +# Build with retry; smoke-test the freshly built syscheck binary to catch +# architecture mismatches (SIGILL from binaries compiled on a different compute node). 
source .github/scripts/retry-build.sh RETRY_VALIDATE_CMD='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' \ retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 From fae2e6a08a2971d5f91e50e0063fca08a8f70b70 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:14:25 -0500 Subject: [PATCH 12/14] Fix bench: use PR's submit.sh for master job to get SIGKILL recovery When benchmarking master vs PR, submit_and_monitor_bench.sh was using the master directory's submit.sh for the master bench job. Master's submit.sh calls monitor_slurm_job.sh directly without SIGKILL recovery. When the monitor was killed (exit 137), the master bench YAML was never found. Fix: always use the PR's submit.sh (which calls run_monitored_slurm_job.sh with sacct fallback) for both master and PR bench submissions. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index c081c8692a..9eae6b9ff7 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -17,9 +17,13 @@ cluster="$4" echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" -# Submit and monitor job (submit.sh auto-detects bench mode from script name) -bash .github/workflows/$cluster/submit.sh \ - .github/workflows/$cluster/bench.sh "$device" "$interface" +# Always use the PR's submit.sh so both master and PR builds benefit from the +# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is +# still resolved relative to the current directory (master/ or pr/) so the +# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs +# in the right directory regardless of which submit.sh is invoked. 
+PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh" +bash "$PR_SUBMIT" .github/workflows/$cluster/bench.sh "$device" "$interface" # Verify the YAML output file was created job_slug="bench-$device-$interface" From 3224931537e141cee2c0c977e49bfa2307d6d4ab Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Fri, 6 Mar 2026 15:21:10 -0500 Subject: [PATCH 13/14] Fix submit_and_monitor_bench.sh: define SCRIPT_DIR before use Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit_and_monitor_bench.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/submit_and_monitor_bench.sh b/.github/scripts/submit_and_monitor_bench.sh index 9eae6b9ff7..e0a6eb7384 100755 --- a/.github/scripts/submit_and_monitor_bench.sh +++ b/.github/scripts/submit_and_monitor_bench.sh @@ -14,6 +14,8 @@ device="$2" interface="$3" cluster="$4" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + echo "[$dir] Submitting benchmark for $device-$interface on $cluster..." cd "$dir" From 2887def4d0c2fef2a1d202493120247063bc2e18 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Sat, 7 Mar 2026 03:59:20 -0500 Subject: [PATCH 14/14] bench: update Phoenix tmpbuild path to project storage --- .github/workflows/phoenix/bench.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/phoenix/bench.sh b/.github/workflows/phoenix/bench.sh index 9a661cb924..218cf68a5f 100644 --- a/.github/workflows/phoenix/bench.sh +++ b/.github/workflows/phoenix/bench.sh @@ -6,7 +6,7 @@ source .github/scripts/bench-preamble.sh # (GNR nodes have 192 cores but nproc is too aggressive for build/bench). n_jobs=$(( $(nproc) > 64 ? 64 : $(nproc) )) -tmpbuild=/storage/scratch1/6/sbryngelson3/mytmp_build +tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build currentdir=$tmpbuild/run-$(( RANDOM % 900 )) mkdir -p $tmpbuild mkdir -p $currentdir