From 2b48871d13e1e6929014111c5910b120f9fbac07 Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Wed, 8 Nov 2023 20:34:31 -0600 Subject: [PATCH 1/6] Use UPP modules for postanl PR #2042 added a hack to checkout a UPP version that works on Orion, but `upp.x` now fails due to a missing library. This changes the postanl job to load the UPP modules instead of the normal runtime ones. Resolves #2041 --- jobs/rocoto/post.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/jobs/rocoto/post.sh b/jobs/rocoto/post.sh index c91b9d4679..af463bf78f 100755 --- a/jobs/rocoto/post.sh +++ b/jobs/rocoto/post.sh @@ -8,9 +8,26 @@ source "${HOMEgfs}/ush/preamble.sh" ############################################################### # Source FV3GFS workflow modules -. ${HOMEgfs}/ush/load_fv3gfs_modules.sh -status=$? -[[ ${status} -ne 0 ]] && exit ${status} +# . ${HOMEgfs}/ush/load_fv3gfs_modules.sh +# status=$? +# [[ ${status} -ne 0 ]] && exit ${status} +# Temporarily load modules from UPP +source "${HOMEgfs}/ush/detect_machine.sh" +source "${HOMEgfs}/ush/module-setup.sh" +module use "${HOMEgfs}/sorc/ufs_model.fd/FV3/upp/modulefiles" +module load orion +if [[ "${MACHINE_ID}" = "wcoss2" ]]; then + module load prod_util + module load cray-pals + module load cfp +else + module load prod-util + export UTILROOT=${prod_util_ROOT} +fi +module load grib-util +module load wgrib2 +export WGRIB2=wgrib2 +# End hack export job="post" export jobid="${job}.$$" From 4d89193cf6f6d3e251c6683d2f159414f4b3c677 Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Wed, 8 Nov 2023 20:37:34 -0600 Subject: [PATCH 2/6] Fix analysis job on Orion A recent slurm change on Orion requires `srun` to explicitly specify the number of CPUs per task rather than inheriting it from the scheduler. This should be done for all jobs, but only the analysis job is handled here (see #2044 for the others). 
Resolves #1996 Refs #2044 --- env/ORION.env | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/env/ORION.env b/env/ORION.env index 400963d3c8..9505006bac 100755 --- a/env/ORION.env +++ b/env/ORION.env @@ -132,20 +132,20 @@ elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then export NTHREADS_GSI=${nth_anal:-${nth_max}} [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} - export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}}" + export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task ${NTHREADS_GSI}" export NTHREADS_CALCINC=${nth_calcinc:-1} [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} \$ncmd" + export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task ${NTHREADS_CALCINC}" export NTHREADS_CYCLE=${nth_cycle:-12} [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} npe_cycle=${ntiles:-6} - export APRUN_CYCLE="${launcher} -n ${npe_cycle}" + export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task ${NTHREADS_CYCLE}" export NTHREADS_GAUSFCANL=1 npe_gausfcanl=${npe_gausfcanl:-1} - export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl}" + export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task ${NTHREADS_GAUSFCANL}" elif [[ "${step}" = "sfcanl" ]]; then nth_max=$((npe_node_max / npe_node_sfcanl)) From ab64fd6c202eb945a8910d5de85ffd0c7bf8eba5 Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Wed, 8 Nov 2023 20:40:34 -0600 Subject: [PATCH 3/6] Restore undefined var error checking after module-setup `module-setup.sh` was turning on error checking, but not turning it off again at the end. Now the script will run `set_strict` if it is defined (as during a workflow job) and otherwise turns off undefined variable checking at the end. This will stop `gw_setup.sh` from breaking tab-completion in an interactive shell. 
--- ush/module-setup.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ush/module-setup.sh b/ush/module-setup.sh index 4279a99dd4..3f2dd9dcf9 100755 --- a/ush/module-setup.sh +++ b/ush/module-setup.sh @@ -117,3 +117,11 @@ elif [[ ${MACHINE_ID} = "noaacloud" ]]; then else echo WARNING: UNKNOWN PLATFORM 1>&2 fi + +# If this function exists in the environment, run it; else do not +ftype=$(type -t set_strict) +if [[ "${ftype}" == "function" ]]; then + set_strict +else + set +u +fi From d8868cdb90a1edadfa0bc7d3897cffc6e129b5cc Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Wed, 8 Nov 2023 21:05:24 -0600 Subject: [PATCH 4/6] Add = to cpu-per-task option --- env/ORION.env | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/env/ORION.env b/env/ORION.env index 9505006bac..e20945bc07 100755 --- a/env/ORION.env +++ b/env/ORION.env @@ -132,20 +132,20 @@ elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then export NTHREADS_GSI=${nth_anal:-${nth_max}} [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} - export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task ${NTHREADS_GSI}" + export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task=${NTHREADS_GSI}" export NTHREADS_CALCINC=${nth_calcinc:-1} [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task ${NTHREADS_CALCINC}" + export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task=${NTHREADS_CALCINC}" export NTHREADS_CYCLE=${nth_cycle:-12} [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} npe_cycle=${ntiles:-6} - export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task ${NTHREADS_CYCLE}" + export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task=${NTHREADS_CYCLE}" export NTHREADS_GAUSFCANL=1 npe_gausfcanl=${npe_gausfcanl:-1} - export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task 
${NTHREADS_GAUSFCANL}" + export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task=${NTHREADS_GAUSFCANL}" elif [[ "${step}" = "sfcanl" ]]; then nth_max=$((npe_node_max / npe_node_sfcanl)) From 27fa9badb293fe2dfef5f1a47ab521f87847cab1 Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Thu, 9 Nov 2023 00:04:11 -0600 Subject: [PATCH 5/6] Fix hard-coded machine name in post.sh --- jobs/rocoto/post.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/jobs/rocoto/post.sh b/jobs/rocoto/post.sh index af463bf78f..dee1db6510 100755 --- a/jobs/rocoto/post.sh +++ b/jobs/rocoto/post.sh @@ -15,14 +15,15 @@ source "${HOMEgfs}/ush/preamble.sh" source "${HOMEgfs}/ush/detect_machine.sh" source "${HOMEgfs}/ush/module-setup.sh" module use "${HOMEgfs}/sorc/ufs_model.fd/FV3/upp/modulefiles" -module load orion +module load "${MACHINE_ID}" if [[ "${MACHINE_ID}" = "wcoss2" ]]; then module load prod_util module load cray-pals module load cfp else module load prod-util - export UTILROOT=${prod_util_ROOT} + # shellcheck disable=SC2154 + export UTILROOT="${prod_util_ROOT}" fi module load grib-util module load wgrib2 From befc4f0b5d252be4ff54729e248b51fe927a9fbe Mon Sep 17 00:00:00 2001 From: "Walter.Kolczynski" Date: Thu, 9 Nov 2023 01:50:42 -0600 Subject: [PATCH 6/6] Change prod util module name --- jobs/rocoto/post.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jobs/rocoto/post.sh b/jobs/rocoto/post.sh index dee1db6510..ca8bd7aa24 100755 --- a/jobs/rocoto/post.sh +++ b/jobs/rocoto/post.sh @@ -16,12 +16,11 @@ source "${HOMEgfs}/ush/detect_machine.sh" source "${HOMEgfs}/ush/module-setup.sh" module use "${HOMEgfs}/sorc/ufs_model.fd/FV3/upp/modulefiles" module load "${MACHINE_ID}" +module load prod_util if [[ "${MACHINE_ID}" = "wcoss2" ]]; then - module load prod_util module load cray-pals module load cfp else - module load prod-util # shellcheck disable=SC2154 export UTILROOT="${prod_util_ROOT}" fi