Skip to content
53 changes: 26 additions & 27 deletions .github/actions/submit-delete-k8s-job/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,59 +20,58 @@ runs:
# Submit the job manifest, wait for the job to be created, unsuspended and its
# pods Ready, stream its logs, then read the job's parallelism.
# NOTE(review): each `${{ inputs.* }}` command below is followed by an
# equivalent `${INPUT_*}` command, as in a rendered diff with removed and added
# lines interleaved. Presumably only one of each pair exists in the real
# action.yml — confirm before editing.
# Timeouts handed to the `kubectl wait --timeout` calls below.
TIMEOUT_JOB_CREATION=60s
TIMEOUT_JOB_WAIT=14400s
TIMEOUT_JOB_START=600s
# Copy the action inputs into shell variables so later commands can use
# ${INPUT_*} instead of the `${{ }}` expression.
# NOTE(review): the right-hand sides are unquoted template expressions;
# presumably an input containing whitespace or shell metacharacters would not
# be assigned as a single word — consider passing inputs via `env:`.
INPUT_JOB_NAME=${{ inputs.job-name }}
INPUT_JOB_CONFIG_FILE=${{ inputs.job-config-file }}

echo "Submit K8s job"
kubectl apply -f "${{ inputs.job-config-file }}"
kubectl get event | grep ${{ inputs.job-name }}
kubectl apply -f "${INPUT_JOB_CONFIG_FILE}"
# Show only the events that mention the job name. grep exits non-zero when
# nothing matches; whether that aborts the step depends on shell options that
# are not visible here.
kubectl get event | grep ${INPUT_JOB_NAME}
# Wait for job to be created
kubectl wait --for=create job/${{ inputs.job-name }} --timeout=$TIMEOUT_JOB_CREATION

kubectl wait --for=create job/${INPUT_JOB_NAME} --timeout=$TIMEOUT_JOB_CREATION
# Wait for job to be unsuspended
kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${{ inputs.job-name }} --timeout=$TIMEOUT_JOB_WAIT

kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${INPUT_JOB_NAME} --timeout=$TIMEOUT_JOB_WAIT
# Wait for pods to be running
# Pods are selected by the batch.kubernetes.io/job-name label. As shown, both
# --selector lines sit in the same continued command (see the note at the top).
kubectl wait --for=condition=Ready \
--selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} \
--selector=batch.kubernetes.io/job-name=${INPUT_JOB_NAME} \
--timeout=$TIMEOUT_JOB_START pod

# Stream logs
# --follow keeps this command attached while it streams the job's logs.
kubectl logs --all-containers=true --all-pods=true --follow job/${{ inputs.job-name }}
kubectl logs --all-containers=true --all-pods=true --follow job/${INPUT_JOB_NAME}

# Detect job parallelism
parallelism=$(kubectl get job/"${{ inputs.job-name }}" -o jsonpath='{.spec.parallelism}')
parallelism=$(kubectl get job/${INPUT_JOB_NAME} -o jsonpath='{.spec.parallelism}')
# if parallelism is not set, use default value of 1
# The echo runs before the default is applied, so it may print an empty value.
echo "Parallelism ${parallelism}"
if [ -z "${parallelism}" ]; then
echo "No parallelism specified, defaulting to 1"
parallelism=1
fi

# Poll the job's failed/succeeded counters until failures + successes reaches
# ${parallelism}.
# NOTE(review): two loop variants appear interleaved here, as in a rendered
# diff: a `read`-from-here-document form (this header, the two defaults right
# below it, and the `done <<EOF ... EOF` tail) and a `while true` form that
# re-runs kubectl on every pass. Presumably only one exists in the real file —
# confirm before editing.
while IFS=: read -r failures successes; do
failures="${failures:-0}"
successes="${successes:-0}"
while true; do
# The jsonpath template prints "<failed>:<succeeded>"; the `:-0` defaults below
# cover a side that comes back empty.
job_status_counts=$(kubectl get job/${INPUT_JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}')

# Split on ':' into $1/$2. `set --` replaces the positional parameters, and IFS
# is not restored anywhere in the visible lines, so later unquoted expansions
# (e.g. `kubectl describe ${pods}`) split on ':' instead of whitespace.
# NOTE(review): consider saving and restoring IFS around the split.
IFS=:
set -- $job_status_counts
# An empty or missing field counts as 0.
failures=${1:-0}
successes=${2:-0}

total=$((failures + successes))

echo "status: failures=${failures}, successes=${successes}, total=${total}, parallelism=${parallelism}"

if [ $total -lt $parallelism ]; then
# neither "failed" nor "succeeded", so wait
sleep 1
elif [ $total -eq $parallelism ]; then
# we have total=parallelism => either X successes or X failures
# In any case, the job is done
break
else
# Log here
echo "Unexpected number of completed pods ${total} with parallelism ${parallelism}"
exit 255
# As shown, the next two lines follow `exit 255` and cannot run; presumably
# they belong to the "still waiting" branch of the `while true` variant —
# confirm.
sleep 2
continue
fi
# The here-document below is expanded once, so the `read` form sees a single
# status snapshot rather than fresh values on each iteration.
done <<EOF
$(kubectl get job/"${{ inputs.job-name }}" -o 'jsonpath={.status.failed}:{.status.succeeded}')
EOF
break
done


# If job indicates a failure try to print out the info
if [ "${failures:-0}" -gt 0 ]; then
echo "Job ${{ inputs.job-name }} has $failures failures"
echo "Job ${INPUT_JOB_NAME} has $failures failures"
# this is for batch jobs only
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${{ inputs.job-name }} -o name)
pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${INPUT_JOB_NAME} -o name)
if [ -n "${pods}" ]; then
kubectl describe ${pods}
fi
Expand Down
Loading