From 81dd2d9855091315611106a5afbdaeaa15e6e4ae Mon Sep 17 00:00:00 2001 From: Walter Kolczynski - NOAA Date: Thu, 9 Nov 2023 17:19:57 +0000 Subject: [PATCH] Fix analysis and post on Orion (#2046) Fixes a couple of things to get the workflow to work on Orion again: First, while #2042 allowed post to be built, the `upp.x` executable would fail due to a missing library. Now post jobs (temporarily) use the UPP modules (plus others needed for runtime) instead of the normal workflow runtime module. This should actually fix #2041. This hack can likely be removed once we move to spack-stack. Second, the launcher commands for the analysis jobs on Orion are updated to specify the number of CPUs per task. This was necessary as a recent Slurm update on Orion discontinued reading that value from the scheduler. Other jobs will be handled later (see #2044). Additionally, a change is made to `module-setup.sh` that will eliminate the annoyance of `gw_setup.sh` breaking tab completion because undefined variable checking was turned on for the script but not turned back off. 
Resolves #1996 Resolves #2041 Refs #2044 --- env/ORION.env | 8 ++++---- jobs/rocoto/post.sh | 23 ++++++++++++++++++++--- ush/module-setup.sh | 8 ++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/env/ORION.env b/env/ORION.env index 400963d3c8..e20945bc07 100755 --- a/env/ORION.env +++ b/env/ORION.env @@ -132,20 +132,20 @@ elif [[ "${step}" = "anal" ]] || [[ "${step}" = "analcalc" ]]; then export NTHREADS_GSI=${nth_anal:-${nth_max}} [[ ${NTHREADS_GSI} -gt ${nth_max} ]] && export NTHREADS_GSI=${nth_max} - export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}}" + export APRUN_GSI="${launcher} -n ${npe_gsi:-${npe_anal}} --cpus-per-task=${NTHREADS_GSI}" export NTHREADS_CALCINC=${nth_calcinc:-1} [[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max} - export APRUN_CALCINC="${launcher} \$ncmd" + export APRUN_CALCINC="${launcher} \$ncmd --cpus-per-task=${NTHREADS_CALCINC}" export NTHREADS_CYCLE=${nth_cycle:-12} [[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max} npe_cycle=${ntiles:-6} - export APRUN_CYCLE="${launcher} -n ${npe_cycle}" + export APRUN_CYCLE="${launcher} -n ${npe_cycle} --cpus-per-task=${NTHREADS_CYCLE}" export NTHREADS_GAUSFCANL=1 npe_gausfcanl=${npe_gausfcanl:-1} - export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl}" + export APRUN_GAUSFCANL="${launcher} -n ${npe_gausfcanl} --cpus-per-task=${NTHREADS_GAUSFCANL}" elif [[ "${step}" = "sfcanl" ]]; then nth_max=$((npe_node_max / npe_node_sfcanl)) diff --git a/jobs/rocoto/post.sh b/jobs/rocoto/post.sh index c91b9d4679..ca8bd7aa24 100755 --- a/jobs/rocoto/post.sh +++ b/jobs/rocoto/post.sh @@ -8,9 +8,26 @@ source "${HOMEgfs}/ush/preamble.sh" ############################################################### # Source FV3GFS workflow modules -. ${HOMEgfs}/ush/load_fv3gfs_modules.sh -status=$? -[[ ${status} -ne 0 ]] && exit ${status} +# . ${HOMEgfs}/ush/load_fv3gfs_modules.sh +# status=$? 
+# [[ ${status} -ne 0 ]] && exit ${status} +# Temporarily load modules from UPP +source "${HOMEgfs}/ush/detect_machine.sh" +source "${HOMEgfs}/ush/module-setup.sh" +module use "${HOMEgfs}/sorc/ufs_model.fd/FV3/upp/modulefiles" +module load "${MACHINE_ID}" +module load prod_util +if [[ "${MACHINE_ID}" = "wcoss2" ]]; then + module load cray-pals + module load cfp +else + # shellcheck disable=SC2154 + export UTILROOT="${prod_util_ROOT}" +fi +module load grib-util +module load wgrib2 +export WGRIB2=wgrib2 +# End hack export job="post" export jobid="${job}.$$" diff --git a/ush/module-setup.sh b/ush/module-setup.sh index 4279a99dd4..3f2dd9dcf9 100755 --- a/ush/module-setup.sh +++ b/ush/module-setup.sh @@ -117,3 +117,11 @@ elif [[ ${MACHINE_ID} = "noaacloud" ]]; then else echo WARNING: UNKNOWN PLATFORM 1>&2 fi + +# If this function exists in the environment, run it; else do not +ftype=$(type -t set_strict) +if [[ "${ftype}" == "function" ]]; then + set_strict +else + set +u +fi