Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

error handling logic for inference runs #320

Merged
merged 4 commits into from May 26, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
29 changes: 23 additions & 6 deletions batch/inference_runner.sh
Expand Up @@ -9,6 +9,13 @@ set -x
# JOB_NAME the name of the job
# S3_RESULTS_PATH location in S3 to store the results

# Check to see if we should bail on this run because of accumulated errors in other runs.
# Every line that `aws s3 ls` prints for $S3_RESULTS_PATH/failures/ is counted as one
# previous failure; the run is abandoned once more than 10 have piled up.
# Expansions are quoted so a path containing spaces or glob characters is passed
# to the AWS CLI as a single argument.
failure_count=$(aws s3 ls "$S3_RESULTS_PATH/failures/" | wc -l)
if [ "$failure_count" -gt 10 ]; then
  echo "Failing run because total number of previous child job failures is $failure_count"
  exit 1
fi

# setup the python environment
HOME=/home/app
PYENV_ROOT=$HOME/.pyenv
Expand Down Expand Up @@ -41,23 +48,34 @@ if [ -n "$S3_LAST_JOB_OUTPUT" ]; then
done
fi

#######################################
# Report a failed step and terminate the job.
#
# On attempt number 3 the job name and the message are appended to ./errorfile,
# the file is copied to $S3_RESULTS_PATH/failures/$AWS_BATCH_JOB_ARRAY_INDEX and
# the script exits 0. On any other attempt the message is printed and the
# script exits 1.
# NOTE(review): exiting 1 presumably lets the batch scheduler retry the job, and
# 3 presumably matches the configured retry limit -- confirm against the job
# definition.
#
# Globals:   AWS_BATCH_JOB_ATTEMPT, AWS_BATCH_JOB_ARRAY_INDEX, JOB_NAME,
#            S3_RESULTS_PATH (all read)
# Arguments: $1 - human-readable description of the failure
# Outputs:   the message on stdout (non-final attempts); a warning on stderr
#            if the failure record could not be uploaded
# Returns:   never returns; exits 0 on attempt 3, otherwise 1
#######################################
error_handler() {
  local msg=$1
  # Default to 0 so an unset attempt counter takes the "exit 1" branch without
  # a test(1) syntax error.
  if [ "${AWS_BATCH_JOB_ATTEMPT:-0}" -eq 3 ]; then
    # Quoted so the record keeps the message verbatim (no word splitting or
    # glob expansion).
    printf '%s\n' "$JOB_NAME" "$msg" >> errorfile
    if ! aws s3 cp errorfile "$S3_RESULTS_PATH/failures/$AWS_BATCH_JOB_ARRAY_INDEX"; then
      # Exit status is intentionally left at 0 below; just make the lost
      # record visible in the job log.
      printf 'warning: could not upload failure record for %s\n' "$JOB_NAME" >&2
    fi
    exit 0
  else
    printf '%s\n' "$msg"
    exit 1
  fi
}

# Pick up stuff that changed
# TODO(jwills): maybe move this to a prep script?
Rscript COVIDScenarioPipeline/local_install.R
local_install_ret=$?

# Hand any failure to error_handler, which terminates the script and chooses
# the exit code. The bare `echo` + `exit 1` that used to precede this call
# made it unreachable and has been dropped.
if [ "$local_install_ret" -ne 0 ]; then
  error_handler "Error code returned from running local_install.R: $local_install_ret"
fi


# Build and install the Python package. The subshell keeps the `cd` from
# changing the working directory of the rest of the script.
(cd COVIDScenarioPipeline && python setup.py build install)
python_install_ret=$?

if [ "$python_install_ret" -ne 0 ]; then
  # The backticks are escaped on purpose: unescaped backticks inside double
  # quotes are command substitution, which would re-run
  # `python setup.py install` while building the error message.
  # error_handler terminates the script; the bare `echo` + `exit 1` that used
  # to precede this call made it unreachable and has been dropped.
  error_handler "Error code returned from running \`python setup.py install\`: $python_install_ret"
fi

echo "State of directory before we start"
Expand All @@ -80,8 +98,7 @@ Rscript COVIDScenarioPipeline/R/scripts/full_filter.R -p COVIDScenarioPipeline -

# Exit status of the command that ran immediately before this line
# (per the message below, the full_filter.R invocation).
dvc_ret=$?
if [ "$dvc_ret" -ne 0 ]; then
  # error_handler terminates the script; the bare `echo` + `exit 1` that used
  # to precede this call made it unreachable and has been dropped.
  error_handler "Error code returned from full_filter.R: $dvc_ret"
fi

for output in "${DVC_OUTPUTS_ARRAY[@]}"
Expand Down