## covid-19-au-health-aged-care.ipynb

Downloads pdf and Word docx files from the sub-pages below a targeted page into a local directory (datadir). Only downloads a file if the local copy does not already exist.  
Reads the text from each local pdf file, extracting key fields. 
Reads the local Word docx files (dated after a specified date), extracting all table data.

Writes the collected data out as an Excel file:
- sheet: Treatments - data on treatments scraped from pdf files
- sheet: National Snapshot - data from 1st table in Word docx files
- sheet: Active Outbreak Summary - data from 2nd table in Word docx files
- sheet: Workforce Resources - data from 3rd table in Word docx files
- sheet: Vaccinations - data from 4th table in Word docx files
- sheet: Regulatory Activities - data from 5th table in Word docx files
- sheet: Active Outbreaks - data from 6th table in Word docx files

In all sheets, the source_file_name and source_file_date (derived from the file name) are added as columns. Rows are sorted by source_file_date (descending).

In the sheets sourced from Word docx tables, the column headers are not promoted so the columns are numbered from 0 instead. This is to avoid a sparse table if column headings change among files.  The column headers will appear repeated for each source file, which any downstream analysis can filter out.

In [1]:

from bs4 import BeautifulSoup
import datetime
import dateutil # pip install python-dateutil
from docx import Document
import pymupdf
import os
import pandas
import random
import re
import requests
import time

# Scheme and host of the site; hrefs scraped from its pages are appended to this.
site = "https://www.health.gov.au"
# Collection page from which the crawl starts.
main_url = "https://www.health.gov.au/resources/collections/covid-19-outbreaks-in-australian-residential-aged-care-facilities"
# Only links whose URL contains one of these substrings are followed.
links_to_check = ["2025"]

# Local directory holding the downloaded files; the Excel output is written there too.
datadir = 'c:/dev/covid-19-au-vaccinations/health-aged-care/'
output_filename = f"{datadir}health-aged-care.xlsx"

In [2]:
def download_pdf(pdf_url, local_dir, timeout=60):
    """Download the file at pdf_url into local_dir unless it is already there.

    Despite the name, nothing here is PDF-specific: the response body is
    saved as raw bytes, so the function works for Word docx files as well.

    Args:
        pdf_url: Full URL of the file. The part after the last "/" is used
            as the local file name.
        local_dir: Existing directory in which to store the file.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status. Nothing is written to
            disk in that case.
    """
    filename = os.path.basename(pdf_url)
    local_path = os.path.join(local_dir, filename)

    # Nothing to do when a local copy exists
    if os.path.exists(local_path):
        print(f"{filename} already exists in {local_dir}")
        return

    # sleep for a random time (2.0 to 4.9 seconds) before downloading
    time.sleep(2 + (random.randrange(0, 30) / 10))
    response = requests.get(pdf_url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses: saving an error page under the target
    # name would stop the real file from ever being fetched, because an
    # existing local file is never downloaded again.
    response.raise_for_status()

    with open(local_path, "wb") as file:
        file.write(response.content)
    print(f"Downloaded {filename} to {local_dir}")


In [3]:
def extract_data_from_pdf(pdf_file):
    """Read a PDF file and extract the antiviral treatment figures from its text.

    Args:
        pdf_file: Path of the PDF file, opened with pymupdf.

    Returns:
        A tuple (end_date, lagevrio_treatment_courses, lagevrio_prescriptions,
        paxlovid_prescriptions).
        end_date is "DD Month YYYY" text with single spaces, or '' when the
        phrase is not found. Each count is the matched number as a string
        (any thousands separators are kept), or the integer 0 when its phrase
        is not found.
    """
    # Only the text is needed, so the document is closed before searching.
    # Pages are joined with a form feed (chr(12)).
    with pymupdf.open(pdf_file) as doc:
        text = chr(12).join([page.get_text() for page in doc])

    lagevrio_treatment_courses = 0
    lagevrio_prescriptions = 0
    paxlovid_prescriptions = 0
    end_date = ''

    # A number with any count of comma-separated groups. Allowing only one
    # group would make the search skip the leading digits of e.g. 1,234,567.
    number = r"(\d+(?:,\d+)*)"

    # search for: and up to DD MMMM YYYY (allowing for extra spaces around the month)
    match = re.search(r'Lagevrio.*?up\s+to\s+(\d+)(.*?)(\d{4})', text, re.DOTALL)
    if match:
        # The month group carries its surrounding whitespace (possibly line
        # breaks), so split it into words and re-join with single spaces.
        end_date = ' '.join([match.group(1), *match.group(2).split(), match.group(3)])

    # search for: deployed NNN treatment courses of Lagevrio
    match = re.search(r"deployed\s*" + number + r"\s*treatment\s+courses\s+of\s+Lagevrio", text, re.DOTALL)
    if match:
        lagevrio_treatment_courses = match.group(1)

    # search for: NNN prescriptions for Lagevrio
    match = re.search(number + r"\s*prescriptions\s+for\s+Lagevrio", text, re.DOTALL)
    if match:
        lagevrio_prescriptions = match.group(1)

    # search for: further NNN prescriptions for Paxlovid
    match = re.search(r"further\s*" + number + r"\s*prescriptions\s+for\s+Paxlovid", text, re.DOTALL)
    if match:
        paxlovid_prescriptions = match.group(1)

    return end_date, lagevrio_treatment_courses, lagevrio_prescriptions, paxlovid_prescriptions

#### Get the main source page, make a list of links to check

In [4]:
# Fetch the main collection page.
response = requests.get(main_url, timeout=60)
# Stop on an HTTP error instead of parsing an error page and silently carrying on with no links.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Find and extract the links whose text (wrapped in a span tag) contains
# "COVID-19 outbreaks in Australian residential aged care"
soup_page_select = "a:has(span:-soup-contains('COVID-19 outbreaks in Australian residential aged care'))"
# Keep only anchors that carry an href (link.get returns None otherwise, which
# would break the substring tests applied to these links later).
links_year_pages = [link.get("href") for link in soup.select(soup_page_select) if link.get("href")]
len(links_year_pages)

6

#### Get each sub-page and download its pdf and Word docx files 

In [5]:
# Walk the year pages, then the report pages linked from each of them, and
# download the PDF and Word file of every report page. A link qualifies when
# its URL contains any of the links_to_check substrings (recent years).
link_files_to_download = ['PDF', 'Word']
request_timeout = 60  # seconds to wait for a page before giving up

for each_link_year_page in links_year_pages:
    # skip anchors without an href and year pages that are not of interest
    if not each_link_year_page or not any(sub_string in each_link_year_page for sub_string in links_to_check):
        continue

    # sleep for a random time before getting the year page
    time.sleep(1 + (random.randrange(0, 20) / 10))
    year_page_url = site + each_link_year_page
    year_page_response = requests.get(year_page_url, timeout=request_timeout)
    if not year_page_response.ok:
        print(f"Skipping {year_page_url}: HTTP status {year_page_response.status_code}")
        continue
    year_page_soup = BeautifulSoup(year_page_response.content, "html.parser")

    links = [link.get("href") for link in year_page_soup.select(soup_page_select)]

    for each_link in links:
        # same qualification test as for the year pages
        if not each_link or not any(sub_string in each_link for sub_string in links_to_check):
            continue

        # sleep for a random time before getting the sub-page
        time.sleep(1 + (random.randrange(0, 20) / 10))
        sub_page_url = site + each_link
        sub_page_response = requests.get(sub_page_url, timeout=request_timeout)
        if not sub_page_response.ok:
            print(f"Skipping {sub_page_url}: HTTP status {sub_page_response.status_code}")
            continue
        sub_page_soup = BeautifulSoup(sub_page_response.content, "html.parser")

        # Find and extract the links to files
        for each_link_file_to_download in link_files_to_download:
            soup_select = "a:has(span:-soup-contains('" + each_link_file_to_download + "'))"
            file_links = [
                file_link.get("href")
                for file_link in sub_page_soup.select(soup_select)
                if file_link.get("href")
            ]

            # A page may lack one of the file types; indexing [0] would raise IndexError.
            if not file_links:
                print(f"No {each_link_file_to_download} link found on {sub_page_url}")
                continue

            # download the first matching file link to the local directory (if it doesn't already exist)
            try:
                download_pdf(site + file_links[0], datadir)
            except requests.RequestException as download_error:
                # one failed download should not stop the remaining files
                print(f"Failed to download {file_links[0]}: {download_error}")

Downloaded covid-19-outbreaks-in-australian-residential-aged-care-homes-12-december-2025_0.pdf to c:/dev/covid-19-au-vaccinations/health-aged-care/
Downloaded covid-19-outbreaks-in-australian-residential-aged-care-homes-12-december-2025_0.docx to c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-homes-5-december-2025_0.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-homes-5-december-2025_0.docx already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-homes-28-november-2025_0.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-homes-28-november-2025_0.docx already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-homes-21-november-2025_0.pdf already exi

#### Process local files

In [15]:
# "import dateutil" on its own does not guarantee that the parser submodule is loaded.
import dateutil.parser

start_source_file_date_for_table_output = datetime.date(2024, 4, 12) # process files dated 12 April 2024 onwards (earlier files had different formats)
# Date given to files whose name does not end in a recognisable date.
fallback_source_file_date = datetime.date(2000, 1, 1)

treatment_columns = ['source_file_name', 'source_file_date', 'end_date', 'lagevrio_courses', 'lagevrio_prescriptions', 'paxlovid_prescriptions']
# Number of Word tables collected per file (one output dataframe each).
number_of_tables = 6

# Rows and per-table frames are collected in plain lists and turned into
# dataframes once after the loop, instead of growing dataframes row by row.
treatment_rows = []
table_frames = [[] for _ in range(number_of_tables)]

# sorted() makes the processing order independent of the file system
for file in sorted(os.listdir(datadir)):
    filename = os.fsdecode(file)

    # try to derive the source_file_date from the end of the file name,
    # first dropping a trailing _0, _1, ... suffix (any number of digits)
    filename_for_source_file_date = re.sub(r'_\d+$', '', filename.split('.')[0])
    source_file_date_str = ' '.join(filename_for_source_file_date.split('-')[-3:])
    try:
        # parse with a datetime default and reduce to a date, so the value
        # compares cleanly with start_source_file_date_for_table_output
        source_file_date = dateutil.parser.parse(source_file_date_str, default=datetime.datetime(2000, 1, 1)).date()
    except (ValueError, OverflowError):
        # the name does not end in a date (dateutil's ParserError is a ValueError)
        source_file_date = fallback_source_file_date

    # gather the search results of every local pdf file into a row for output
    if filename.endswith('.pdf'):
        pdf_file = os.path.join(datadir, filename)
        end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions = extract_data_from_pdf(pdf_file)
        treatment_rows.append([filename, source_file_date, end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions])

    # gather the tables of every recent enough local docx file into dataframes for output
    if filename.endswith('.docx') and source_file_date >= start_source_file_date_for_table_output:
        # Load the Word document
        doc = Document(os.path.join(datadir, filename))
        if len(doc.tables) > number_of_tables:
            # only the first number_of_tables tables have an output slot
            print(f"{filename}: ignoring {len(doc.tables) - number_of_tables} table(s) after the first {number_of_tables}")

        for table_counter, table in enumerate(doc.tables[:number_of_tables]):
            # One list of cell texts per table row, line breaks flattened to spaces
            cell_rows = [[cell.text.replace('\n', ' ') for cell in row.cells] for row in table.rows]
            # Pad every row with empty strings to a common width, so rows
            # with fewer (or more) cells than the column count still fit
            width = max([len(table.columns)] + [len(cell_row) for cell_row in cell_rows])
            cell_rows = [cell_row + [''] * (width - len(cell_row)) for cell_row in cell_rows]

            # Column headers are not promoted: columns stay numbered from 0
            table_df = pandas.DataFrame(cell_rows)
            table_df['source_file_name'] = filename
            table_df['source_file_date'] = source_file_date
            table_frames[table_counter].append(table_df)

output_treatments_df = pandas.DataFrame(treatment_rows, columns=treatment_columns)

# One dataframe per Word table position; each file's rows keep their own
# 0-based index. A position that no file supplied stays an empty dataframe.
dfs = [pandas.concat(frames) if frames else pandas.DataFrame() for frames in table_frames]



#### Gather result dataframes and write out to Excel sheets.

In [16]:

# sort each df by source_file_date desc; rows of one file keep their original order
output_treatments_df = output_treatments_df.sort_values(['source_file_date'], ascending=[False]).reset_index(drop=True)
for df_index, each_df in enumerate(dfs):
    if 'source_file_date' not in each_df.columns:
        # no Word file supplied this table, so the frame is empty and there is nothing to sort
        continue
    each_df.index.name = 'row_index_per_file'
    dfs[df_index] = each_df.sort_values(['source_file_date', 'row_index_per_file'], ascending=[False, True]).reset_index()

# write all the output...dfs to an Excel file with Sheet names
# The treatments go first, so dfs lines up with sheet_names
# (dfs[0] is Treatments, dfs[1] is National Snapshot, and so on).
dfs.insert(0, output_treatments_df)

sheet_names = ["Treatments", "National Snapshot", "Active Outbreak Summary", "Workforce Resources", "Vaccinations","Regulatory Activities","Active Outbreaks"]
# The context manager saves and closes the workbook even if writing a sheet fails.
with pandas.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    for df_index, frame in enumerate(dfs):
        frame.to_excel(writer, sheet_name=sheet_names[df_index])


#### Debug dataframe outputs

In [17]:
output_treatments_df # Treatments: one row per local pdf file, sorted by source_file_date (descending)

Unnamed: 0,source_file_name,source_file_date,end_date,lagevrio_courses,lagevrio_prescriptions,paxlovid_prescriptions
0,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,30 November 2025,48269,176202,28610
1,covid-19-outbreaks-in-australian-residential-a...,2025-12-05,31 October 2025,48269,175333,28350
2,covid-19-outbreaks-in-australian-residential-a...,2025-11-28,31 October 2025,48269,175333,28350
3,covid-19-outbreaks-in-australian-residential-a...,2025-11-21,31 October 2025,48269,175333,28350
4,covid-19-outbreaks-in-australian-residential-a...,2025-11-14,31 October 2025,48269,175333,28350
...,...,...,...,...,...,...
200,covid-19-outbreaks-in-australian-residential-a...,2022-01-28,,0,0,0
201,covid-19-outbreaks-in-australian-residential-a...,2022-01-21,,0,0,0
202,covid-19-outbreaks-in-australian-residential-a...,2022-01-14,,0,0,0
203,covid-19-outbreaks-in-australian-residential-a...,2022-01-07,,0,0,0


In [18]:
dfs[1] # National Snapshot (1st table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,3,4,source_file_name,source_file_date
0,0,Category,Active,Change in active (7 days),Cumulative Total,Cumulative increase (7 days),covid-19-outbreaks-in-australian-residential-a...,2025-12-12
1,1,Outbreaks,35,+5,27419,+23,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
2,2,RACHs affected,35,+5,2924,0,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
3,3,Resident cases,138,+24,261219,+117,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
4,4,Resident deaths,,,7604,+3,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
...,...,...,...,...,...,...,...,...
517,1,Outbreaks,209,33,19979,106,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
518,2,RACHs affected,209,33,2867,1,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
519,3,Resident cases,821,-78,190588,707,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
520,4,Resident deaths,,,6398,10,covid-19-outbreaks-in-australian-residential-a...,2024-04-12


In [19]:
dfs[2] # Active Outbreak Summary (2nd table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,6,7,8,9,source_file_name,source_file_date
0,0,,ACT,NSW,NT,QLD,SA,TAS,VIC,WA,Total,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
1,1,Total RACHs with outbreaks,0,4,0,6,6,1,14,4,35,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
2,2,Total number of active resident cases,0,17,0,25,16,4,60,16,138,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
3,3,Total number of active staff cases,0,9,0,4,7,1,10,6,37,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
4,4,Total outbreaks opened in previous 7 days,0,2,0,4,6,0,8,3,23,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,1,Total RACHs with outbreaks,0,64,1,35,26,7,52,24,209,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
518,2,Total number of active resident cases,0,249,7,121,142,29,194,79,821,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
519,3,Total number of active staff cases,0,106,1,55,60,8,65,34,329,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
520,4,Total outbreaks opened in previous 7 days,0,29,1,15,16,6,27,12,106,covid-19-outbreaks-in-australian-residential-a...,2024-04-12


In [20]:
dfs[3] # Workforce Resources (3rd table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,3,source_file_name,source_file_date,4
0,0,Workforce provider,Total shifts,Previous 7 days,Previous 7 days,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,
1,1,Aspen Medical8,34108,0,Randstad9,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,
2,2,1642,0,Previous contracted providers10,167011,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,
3,3,,Total,202761,0,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,
4,4,,,,,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,
...,...,...,...,...,...,...,...,...
555,4,,Aspen Medical,28843,26,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,26
556,5,HealthX,32836,44,44,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,Randstad
557,6,5,5,5,Torrens,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,4056
558,7,,,,,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,


In [21]:
dfs[4] # Vaccinations (4th table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,source_file_name,source_file_date
0,0,Jurisdiction,75+ with booster in last 6 months,75+ with booster in last 6 months,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
1,1,Jurisdiction,Residents vaccinated,% of residents vaccinated,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
2,2,National,77925,44.8%,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
3,3,NSW,25523,45.5%,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
4,4,VIC,20782,45.7%,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
...,...,...,...,...,...,...
952,6,QLD,15.2k,40.5%,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
953,7,SA,7.4k,46.5%,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
954,8,TAS,2.4k,54.8%,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
955,9,VIC,20.8k,43.7%,covid-19-outbreaks-in-australian-residential-a...,2024-04-12


In [22]:
dfs[5] # Regulatory Activities (5th table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,6,source_file_name,source_file_date,7,8
0,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24,2024-25,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,2025-26 (to 6 Nov,Total
1,1,Site visits,318,3452,1732,3814,2840,1515,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,125,13796
2,2,Non-site activities,3704,8396,6665,1413,487,929,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,151,21745
3,3,Total activities,4022,11848,8397,5227,3327,2444,covid-19-outbreaks-in-australian-residential-a...,2025-12-12,276,35541
4,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24,2024-25,covid-19-outbreaks-in-australian-residential-a...,2025-12-05,2025-26 (to 6 Nov,Total
...,...,...,...,...,...,...,...,...,...,...,...,...
343,3,Total activities,4027,11833,8397,5227,2821,32305,covid-19-outbreaks-in-australian-residential-a...,2024-04-19,,
344,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24 (to 11 Apr),Total,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,,
345,1,Site visits,318,3452,1732,3814,2334,11650,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,,
346,2,Non-site activities,3709,8381,6665,1413,431,20599,covid-19-outbreaks-in-australian-residential-a...,2024-04-12,,


In [23]:
dfs[6] # Active Outbreaks (6th table in each Word docx file)

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,source_file_name,source_file_date
0,0,Service Name,State,Resident Deaths,Resident Cases,Staff Cases,Total Cases,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
1,1,CASS Residential Aged Care Facility,New South Wales,0,<6,0,<6,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
2,2,Eloura,New South Wales,0,<6,<6,7,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
3,3,Hamlyn Terrace Care Community,New South Wales,0,10,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
4,4,Narrandera Homestead Care Community,New South Wales,<6,12,12,24,covid-19-outbreaks-in-australian-residential-a...,2025-12-12
...,...,...,...,...,...,...,...,...,...
15165,205,St George's Care Centre,Western Australia,0,16,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
15166,206,The Queenslea,Western Australia,0,<6,0,<6,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
15167,207,Treeby Parklands Care Community,Western Australia,0,<6,<6,<6,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
15168,208,Waratah Lodge,Western Australia,0,<6,<6,<6,covid-19-outbreaks-in-australian-residential-a...,2024-04-12
