# Imports and Initialisation

In [8]:
import boto3
from datetime import datetime, timezone

# AWS Batch client used by the functions defined below
batch = boto3.client('batch')

# Run configuration
# Only jobs created at or after this UTC instant (11 July 2025) are considered.
START_DATE = datetime(year=2025, month=7, day=11, tzinfo=timezone.utc)
# Job queue name(s) to scan; replace with actual job queue(s).
JOB_QUEUES = [
    'prod-xlsx-to-pdf-queue',
]
# Status value handed to list_jobs when searching for jobs to re-run.
JOB_STATUS = 'FAILED'

# Functions

In [10]:
def get_failed_jobs(job_queue, start_time):
    """
    Fetch the IDs of jobs in ``JOB_STATUS`` created at or after a start time.

    Pages through ``batch.list_jobs`` for ``job_queue`` (100 results per page)
    and keeps every job whose ``createdAt`` timestamp is at or after
    ``start_time``.

    Args:
        job_queue: Job queue identifier passed to ``list_jobs`` as ``jobQueue``.
        start_time: Timezone-aware ``datetime``; jobs created before it are
            skipped. It is compared against UTC-aware datetimes, so a naive
            value would raise ``TypeError``.

    Returns:
        list: Matching job IDs, in the order the API returned them.
    """
    failed_jobs = []
    request = {
        'jobQueue': job_queue,
        'jobStatus': JOB_STATUS,
        'maxResults': 100,
    }

    while True:
        response = batch.list_jobs(**request)

        for job in response['jobSummaryList']:
            # createdAt is divided by 1000 because it is in epoch milliseconds.
            created_at = datetime.fromtimestamp(job['createdAt'] / 1000, tz=timezone.utc)
            if created_at >= start_time:
                failed_jobs.append(job['jobId'])

        next_token = response.get('nextToken')
        if not next_token:
            break
        # Only send nextToken once the service has handed one back; the first
        # request must not carry an empty-string token.
        request['nextToken'] = next_token

    return failed_jobs

def rerun_job(job_id):
    """
    Re-submit a job with the same definition, queue, and effective settings.

    Looks the job up with ``batch.describe_jobs`` and submits a new job named
    ``<original name>-retry`` to the same queue with the same job definition.
    The container command, environment, and resource requirements reported for
    the original job are sent back as container overrides, and the job's
    ``parameters``, ``timeout`` and ``retryStrategy`` are carried over, so
    submit-time overrides of the original are not lost.

    Args:
        job_id: ID of the job to re-run.

    Returns:
        The ``jobId`` of the newly submitted job.

    Raises:
        IndexError: If ``describe_jobs`` returns no record for ``job_id``.
    """
    jobs = batch.describe_jobs(jobs=[job_id])['jobs']
    if not jobs:
        # Same exception type as indexing an empty list, but with context.
        raise IndexError(f"No job details returned for job {job_id}")
    job_detail = jobs[0]

    # Rebuild containerOverrides from what the original job reported.
    container = job_detail.get('container') or {}
    container_overrides = {}
    if container.get('command'):
        container_overrides['command'] = container['command']
    # The SubmitJob API reference documents variable names starting with
    # "AWS_BATCH" as reserved, so any such entries are not passed back.
    environment = [
        var for var in (container.get('environment') or [])
        if not var.get('name', '').startswith('AWS_BATCH')
    ]
    if environment:
        container_overrides['environment'] = environment
    if container.get('resourceRequirements'):
        container_overrides['resourceRequirements'] = container['resourceRequirements']

    # Copy so the describe_jobs response is not mutated; default to 1 attempt.
    retry_strategy = dict(job_detail.get('retryStrategy') or {})
    retry_strategy.setdefault('attempts', 1)

    # Job names are documented as at most 128 characters; trim the original
    # name so that appending the suffix cannot push it over the limit.
    suffix = '-retry'
    max_name_length = 128
    job_name = job_detail['jobName'][:max_name_length - len(suffix)] + suffix

    submit_args = {
        'jobName': job_name,
        'jobQueue': job_detail['jobQueue'],
        'jobDefinition': job_detail['jobDefinition'],
        'retryStrategy': retry_strategy,
    }
    if container_overrides:
        submit_args['containerOverrides'] = container_overrides
    if job_detail.get('parameters'):
        submit_args['parameters'] = job_detail['parameters']
    if job_detail.get('timeout'):
        submit_args['timeout'] = job_detail['timeout']

    new_job = batch.submit_job(**submit_args)
    print(f"Re-submitted job: {new_job['jobId']} (original: {job_id})")
    return new_job['jobId']


# Main Function

In [11]:
def main():
    """
    Re-submit every matching job found in each configured queue.

    For each queue in ``JOB_QUEUES``, collects the job IDs returned by
    ``get_failed_jobs`` for ``START_DATE``, prints how many were found, and
    attempts to re-run each one. A failure to re-run one job is printed and
    does not stop the remaining jobs from being processed.
    """
    for queue in JOB_QUEUES:
        print(f"Checking failed jobs in queue: {queue}")

        failed_jobs = get_failed_jobs(queue, START_DATE)
        print(f"Found {len(failed_jobs)} failed jobs since {START_DATE.date()}")

        for job_id in failed_jobs:
            _rerun_and_report(job_id)


def _rerun_and_report(job_id):
    """Re-run a single job, printing (rather than raising) any error."""
    try:
        rerun_job(job_id)
    except Exception as e:
        print(f"Failed to rerun job {job_id}: {e}")

# Entry point: call main() only when this code is executed directly
# (i.e. __name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()

Checking failed jobs in queue: prod-xlsx-to-pdf-queue
Found 17 failed jobs since 2025-07-11
Re-submitted job: bc1cada0-54f3-4ff4-9dd0-57af670f6731 (original: 01a718ec-810f-4950-82ff-ac45f89b35e6)
Re-submitted job: 7c6771ab-e577-467c-a2df-e34e4ce48c8f (original: 65fb4240-a2d6-4ebd-9785-890ecb8574e4)
Re-submitted job: a68ee268-5675-4c34-b2f1-ca673a14c9da (original: 156c2c17-753e-41db-9973-89925d10a76a)
Re-submitted job: 3cf668c1-dc21-4a86-9479-6d198714e26c (original: 3f74c0df-efe8-421a-9bd1-ac01a21620b9)
Re-submitted job: 646ea2f0-6043-40a4-ae7b-49238a6168a9 (original: 85e03e58-26c8-415d-897c-a29274f7c27f)
Re-submitted job: 42d313cb-0746-4e96-be32-ff0bd7637e81 (original: a833d64b-4e15-4d6f-a404-f1a7e94004fc)
Re-submitted job: 9ed95965-88f1-437f-8d7a-4ee49add6642 (original: 947d1bf8-f5a9-4c5e-b8d2-14578bc1ae8e)
Re-submitted job: 7bd6d267-870b-4471-b7ac-1bbefd4af560 (original: d77b7a21-aadc-4c86-aed3-c22085d76995)
Re-submitted job: 702e0d8a-2af2-464a-91cf-3edfbf476d73 (original: f2623e2a-9