#!/bin/bash
# Slurm epilog: collect GPU stats, append job accounting to the job's
# stdout file, and clean up the per-job scratch space under /tmp.
# (bash, not sh: the script uses [[ ]] and >& redirection.)

# Give the job's final writes to stdout a moment to flush.
sleep 5

SLURM_HOME=/opt/slurm
PATH=$SLURM_HOME:$PATH

OUTFILE=/tmp/${SLURM_JOB_ID}/output_epilog.txt
touch "${OUTFILE}"

# Resolve the job's StdOut path from scontrol, then its directory.
STDOUT=$(scontrol show jobid ${SLURM_JOB_ID} | grep StdOut | tail -n 1 | awk 'BEGIN{FS="="}{print $2}')
STDOUT_PATH=$(dirname "${STDOUT}")

### This part checks the GPU performance ###
# The prolog writes "gpu" into output_prolog.txt for GPU jobs; only then
# do we stop the DCGM stats session and dump its report.
result=$(grep "gpu" /tmp/${SLURM_JOB_ID}/output_prolog.txt)
if [ -n "$result" ]; then
    gpu_report=/tmp/${SLURM_JOB_ID}/gpu-stats-${SLURM_JOB_ID}-${HOSTNAME}.out
    dcgmi stats -x ${SLURM_JOB_ID}                          # stop recording for this job
    dcgmi stats -v -j ${SLURM_JOB_ID} | tee "${gpu_report}" # verbose job stats
fi
### End of GPU performance measurement ###

echo "--------------------------------------------------------------" >> ${OUTFILE}
echo "Begin Slurm Epilogue $(date) $(date +%s)" >> ${OUTFILE}
jobinfo ${SLURM_JOB_ID} >> ${OUTFILE}          # site-local accounting summary
if [ -n "$result" ]; then
    ### report GPU model ###
    gpu_model=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | tail -1)
    gpu_measurement "${gpu_model}" "${gpu_report}" >> ${OUTFILE}   # site-local helper
fi
echo "End Slurm Epilogue $(date) $(date +%s)" >> ${OUTFILE}
echo "--------------------------------------------------------------" >> ${OUTFILE}

### Make sure only one instance is operating on the standard output file ###
# On multi-node jobs the epilog runs on every node; lockfile(1) with no
# retries (-r 0) lets exactly one instance rewrite the stdout file. The
# losers leave a non-empty _log file and only clean up their scratch space.
cd "${STDOUT_PATH}"
/opt/slurm/lockfile -r 0 ${SLURM_JOB_ID}.lock >& ${SLURM_JOB_ID}_log
if [[ ! -s ${SLURM_JOB_ID}_log ]]; then
    if [[ -f "$gpu_report" ]]; then
        cp "${gpu_report}" /opt/slurm/tmp
    fi
    # Sandwich the job's own output between the prolog and epilog reports.
    TEMP_FILE=${SLURM_JOB_NAME}_tmp.o${SLURM_JOB_ID}
    cp "${STDOUT}" "${TEMP_FILE}"
    truncate -s 0 "${STDOUT}"
    cat /tmp/${SLURM_JOB_ID}/output_prolog.txt >> "${STDOUT}"
    cat "${TEMP_FILE}" >> "${STDOUT}"
    cat ${OUTFILE} >> "${STDOUT}"
    rm -f "${TEMP_FILE}"
    sleep 5
    rm -f ${SLURM_JOB_ID}.lock ${SLURM_JOB_ID}_log
    rm -rf /tmp/${SLURM_JOB_ID}
else
    rm -rf /tmp/${SLURM_JOB_ID}
fi
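
# ---------------------------------------------------------------------
# Deployment note (a sketch, assuming a standard Slurm + DCGM setup;
# the exact paths are examples, not part of this script): this runs as
# the per-node Epilog, wired up in slurm.conf, e.g.:
#
#   Epilog=/opt/slurm/epilog.sh
#
# The dcgmi calls above only return data if the matching prolog started
# a DCGM stats session for the job, for example with:
#
#   dcgmi stats -e                      # enable job-stats watching (once)
#   dcgmi stats -s ${SLURM_JOB_ID}      # start recording for this job
#
# `jobinfo` and `gpu_measurement` are site-local helpers assumed to be
# on PATH; they are not part of Slurm or DCGM.
# ---------------------------------------------------------------------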