## covid-19-au-health-aged-care.ipynb

Downloads pdf and Word docx files from the sub-pages below a targetted page into a local directory (datadir). Only downloads if local file does not exist.  
Reads the text from each local pdf file, extracting key fields. 
Reads the local Word docx files (dated after a specified date), extracting all table data.

Writes the collected data out as an Excel file:
- sheet: Treatments - data on treatments scraped from pdf files
- sheet: National Snapshot - data from 1st table in Word docx files
- sheet: Active Outbreak Summary - data from 2nd table in Word docx files
- sheet: Workforce Resources - data from 3rd table in Word docx files
- sheet: Vaccinations - data from 4th table in Word docx files
- sheet: Regulatory Activities - data from 5th table in Word docx files
- sheet: Active Outbreaks - data from 6th table in Word docx files

In all sheets, the source_file_name and source_file_date (derived from the file name) are added as columns. Rows are sorted by source_file_date (descending).

In the sheets sourced from Word docx tables, the column headers are not promoted so the columns are numbered from 0 instead. This is to avoid a sparse table if column headings change among files.  The column headers will appear repeated for each source file, which any downstream analysis can filter out.

In [6]:

from bs4 import BeautifulSoup
import datetime
import dateutil # pip install python-dateutil
from docx import Document
import pymupdf
import os
import pandas
import random
import re
import requests
import time

site = "https://www.health.gov.au"
main_url = "https://www.health.gov.au/resources/collections/covid-19-outbreaks-in-australian-residential-aged-care-facilities"

datadir = 'c:/dev/covid-19-au-vaccinations/health-aged-care/'
output_filename = datadir + "health-aged-care.xlsx"

In [7]:
def download_pdf(pdf_url, local_dir):
    # Get the filename from the URL
    filename = os.path.basename(pdf_url)
    local_path = os.path.join(local_dir, filename)

    # Check if the file already exists in the local directory
    if not os.path.exists(local_path):
        # sleep for a random time before downloading 
        time.sleep(2 + ( random.randrange( 0, 30) / 10 ) )
        # Download the PDF file
        response = requests.get(pdf_url)

        # Save the PDF file to the local directory
        with open(local_path, "wb") as file:
            file.write(response.content)
        print(f"Downloaded {filename} to {local_dir}")
    else:
        print(f"{filename} already exists in {local_dir}")


In [8]:
def extract_data_from_pdf(pdf_file):

# open a pdf file, search for the key fields and return them
    
    with pymupdf.open(pdf_file) as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])

        lagevrio_treatment_courses = 0
        lagevrio_prescriptions = 0
        paxlovid_prescriptions = 0
        end_date = ''

        # search for: and up to DD MMMM YYYY (allowing for extra spaces around the month)
        pattern = r'Lagevrio.*?up\s+to\s+(\d+)(.*?)(\d{4})'
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # assign result, removing excess whitespace around the month portion
            end_date = match.group(1) + ' ' + match.group(2) + ' ' + match.group(3)
            # print(f"DEBUG: End dates: {end_date}")

        # search for: deployed NNN treatment courses of Lagevrio
        pattern = r"deployed\s*(\d+(?:,\d+)?)\s*treatment\s+courses\s+of\s+Lagevrio"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # assign result
            lagevrio_treatment_courses = match.group(1)
            # print(f"DEBUG: Text: {text}")
            # print(f"DEBUG: Lagevrio treatment courses: {lagevrio_treatment_courses}")

        # search for: NNN prescriptions for Lagevrio
        pattern = r"(\d+(?:,\d+)?)\s*prescriptions\s+for\s+Lagevrio"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # assign result
            lagevrio_prescriptions = match.group(1)
            # print(f"DEBUG: Lagevrio prescriptions: {lagevrio_prescriptions}")

        # search for: NNN prescriptions for Paxlovid 
        pattern = r"further\s*(\d+(?:,\d+)?)\s*prescriptions\s+for\s+Paxlovid"
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # assign result 
            paxlovid_prescriptions = match.group(1)
            # print(f"DEBUG: Paxlovid prescriptions: {paxlovid_prescriptions}")

        return end_date, lagevrio_treatment_courses, lagevrio_prescriptions, paxlovid_prescriptions

#### Get the main source page, make a list of links to check

In [9]:
response = requests.get(main_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find and extract the links with text (wrapped in a span tag) containing with "COVID-19 outbreaks in Australian residential aged care facilities"
links = [link.get("href") for link in soup.select("a:has(span:-soup-contains('COVID-19 outbreaks in Australian residential aged care'))")]

#### Get each sub-page and download it's pdf and Word docx files 

In [10]:
# browse through the list of links. See if the link URL partially matches the links_to_check list.
links_to_check = ["2024"]
link_files_to_download = ['PDF', 'Word']
for each_link_candidate in links:
    each_link_list = [each_link_candidate for sub_string in links_to_check if(sub_string in each_link_candidate)]
    if len(each_link_list) > 0:
        # sleep for a random time before getting the sub-page 
        time.sleep(1 + ( random.randrange( 0, 20) / 10 ) )
        # get and process the sub-page from each qualifying link
        each_link = each_link_list[0]
        sub_page_url = site + each_link
        sub_page_response = requests.get(sub_page_url)
        sub_page_soup = BeautifulSoup(sub_page_response.content, "html.parser")

        # Find and extract the links to files
        for each_link_file_to_download in link_files_to_download:
            soup_select = "a:has(span:-soup-contains('" + each_link_file_to_download + "'))"
            file_links = [file_links.get("href") for file_links in sub_page_soup.select(soup_select)]

            # downloading the first pdf file link to the local directory (if it doesnt already exist)
            download_pdf(site + file_links[0], datadir)

Downloaded covid-19-outbreaks-in-australian-residential-aged-care-homes-18-october-2024_0.pdf to c:/dev/covid-19-au-vaccinations/health-aged-care/
Downloaded covid-19-outbreaks-in-australian-residential-aged-care-homes-18-october-2024_0.docx to c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-11-october-2024.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-11-october-2024.docx already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-4-october-2024.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-4-october-2024.docx already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-27-september-2024.pdf a

#### Process local files

In [11]:
start_source_file_date_for_table_output = datetime.date(2024, 4, 1) # process files dated 1 April 2024 onwards (March 2024 files show a format change)

output_treatments_df = pandas.DataFrame(columns=['source_file_name', 'source_file_date', 'end_date', 'lagevrio_courses', 'lagevrio_prescriptions', 'paxlovid_prescriptions'])
  
# Initialize an empty list to store dataframes from Word tables
dfs = [pandas.DataFrame() for _ in range(6)]

for file in os.listdir(datadir):
    filename = os.fsdecode(file)
    try:
        source_file_date_str = ' '.join((filename.split('.')[0]).split('-')[-3:])
        source_file_date = dateutil.parser.parse(source_file_date_str, default=datetime.date(2000, 1, 1))
    except:
        source_file_date = datetime.date(2000, 1, 1)
    
    # browse through all the local pdf files, gathering the search results into a dataframe for output
    if filename.endswith('.pdf'):
        pdf_file = datadir + filename
        end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions  = extract_data_from_pdf(pdf_file)
        # print(f"DEBUG: End date: {end_date}")
        # print(f"DEBUG: Lagevrio treatment courses: {lagevrio_courses}")
        # print(f"DEBUG: Lagevrio prescriptions: {lagevrio_prescriptions}")
        # print(f"DEBUG: Paxlovid prescriptions: {paxlovid_prescriptions}")

        # construct the output row and add it to the dataframe
        output_row = [filename, source_file_date, end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions]
        output_treatments_df.loc[len(output_treatments_df.index)] = output_row

    # browse through all the local docx files, gathering the tables into dataframes for output
    if filename.endswith('.docx') and source_file_date >= start_source_file_date_for_table_output:
        # Load the Word document
        doc = Document(datadir + filename)
        table_counter = -1

        # Iterate through each table in the document
        for table in doc.tables:
            table_counter = table_counter + 1

            # Create a DataFrame structure with empty strings, sized by the number of rows and columns in the table
            df = [['' for _ in range(len(table.columns))] for _ in range(len(table.rows))]
            
            # Iterate through each row in the current table
            for i, row in enumerate(table.rows):
                # Iterate through each cell in the current row
                for j, cell in enumerate(row.cells):
                    # If the cell has text, store it in the corresponding DataFrame position
                    if cell.text:
                        df[i][j] = cell.text.replace('\n',' ')
            
            # Convert the list of lists (df) to a pandas DataFrame and add it to the tables list
            table_df = pandas.DataFrame(df)
            table_df['source_file_name'] = filename
            table_df['source_file_date'] = source_file_date
            dfs[table_counter] = pandas.concat([dfs[table_counter], table_df])



#### Gather result dataframes and write out to Excel sheets.

In [12]:

# sort each df by source_file_date desc
output_treatments_df = output_treatments_df.sort_values(['source_file_date'], ascending=[False]).reset_index(drop=True)
for df_index, each_df in enumerate(dfs.copy()):
   each_df.index.name = 'row_index_per_file'
   dfs[df_index] = each_df.sort_values(['source_file_date', 'row_index_per_file'], ascending=[False, True]).reset_index()

# write all the output...dfs to an Excel file with Sheet names
dfs.insert(0,output_treatments_df)

writer = pandas.ExcelWriter(output_filename, engine='xlsxwriter')
sheet_names = ["Treatments", "National Snapshot", "Active Outbreak Summary", "Workforce Resources", "Vaccinations","Regulatory Activities","Active Outbreaks"]
for df_index, frame in enumerate(dfs):
   frame.to_excel(writer, sheet_name = sheet_names[df_index])
writer.close()


#### Debug dataframe outputs

In [13]:
output_treatments_df

Unnamed: 0,source_file_name,source_file_date,end_date,lagevrio_courses,lagevrio_prescriptions,paxlovid_prescriptions
0,covid-19-outbreaks-in-australian-residential-a...,2024-10-11,30 September 2024,48269,138624,18329
1,covid-19-outbreaks-in-australian-residential-a...,2024-10-04,31 August 2024,48269,136524,17798
2,covid-19-outbreaks-in-australian-residential-a...,2024-09-27,31 August 2024,48269,136524,17798
3,covid-19-outbreaks-in-australian-residential-a...,2024-09-20,31 August 2024,48269,136524,17798
4,covid-19-outbreaks-in-australian-residential-a...,2024-09-13,31 August 2024,48269,136524,17798
...,...,...,...,...,...,...
141,covid-19-outbreaks-in-australian-residential-a...,2000-01-01,12 March 2023,48269,54830,3129
142,covid-19-outbreaks-in-australian-residential-a...,2000-01-01,,48138,0,0
143,covid-19-outbreaks-in-australian-residential-a...,2000-01-01,12 February 2023,48269,52810,2860
144,covid-19-outbreaks-in-australian-residential-a...,2000-01-01,21 August 2022,48269,24204,575


In [14]:
dfs[1] # National Snapshot

Unnamed: 0,row_index_per_file,0,1,2,3,4,source_file_name,source_file_date
0,0,Category,Active,Change in active (7 days),Cumulative Total,Cumulative increase (7 days),covid-19-outbreaks-in-australian-residential-a...,2024-10-11
1,1,Outbreaks,121,11,23152,53,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
2,2,RACHs affected,121,11,2898,0,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
3,3,Resident cases,525,-50,225457,460,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
4,4,Resident deaths,,,7019,7,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
...,...,...,...,...,...,...,...,...
127,1,Outbreaks,222,11,20187,109,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
128,2,RACHs affected,222,11,2869,1,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
129,3,Resident cases,1205,57,192568,1064,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
130,4,Resident deaths,,,6443,34,covid-19-outbreaks-in-australian-residential-a...,2024-04-26


In [15]:
dfs[2] # Active Outbreaks

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,6,7,8,9,source_file_name,source_file_date
0,0,,ACT,NSW,NT,QLD,SA,TAS,VIC,WA,Total,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
1,1,Total RACHs with outbreaks,0,36,0,10,9,3,56,7,121,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
2,2,Total number of active resident cases,0,148,0,70,38,24,226,19,525,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
3,3,Total number of active staff cases,0,44,0,24,10,15,82,9,184,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
4,4,Total outbreaks opened in previous 7 days,0,11,0,0,5,2,31,4,53,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,1,Total RACHs with outbreaks,6,58,1,28,26,6,70,27,222,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
128,2,Total number of active resident cases,26,363,3,116,116,65,345,171,1205,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
129,3,Total number of active staff cases,9,125,3,49,39,23,154,66,468,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
130,4,Total outbreaks opened in previous 7 days,5,25,0,15,11,1,36,16,109,covid-19-outbreaks-in-australian-residential-a...,2024-04-26


In [16]:
dfs[3] # Workforce Resources

Unnamed: 0,row_index_per_file,0,1,2,3,source_file_name,source_file_date
0,0,Workforce provider,Total shifts,Previous 7 days,Previous 7 days,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
1,1,Aspen Medical8,32106,81,HealthX9,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
2,2,36768,126,Randstad10,878,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
3,3,0,Previous contracted providers11,128744,,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
4,4,Total,198496,207,,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
...,...,...,...,...,...,...,...
169,4,Healthcare Australia (Workforce Surge),15777,,Healthcare Australia (NACER),covid-19-outbreaks-in-australian-residential-a...,2024-04-26
170,5,2295,,Mable,2711,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
171,6,,Torrens,4056,,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
172,7,,,,,covid-19-outbreaks-in-australian-residential-a...,2024-04-26


In [17]:
dfs[4] # Vaccinations

Unnamed: 0,row_index_per_file,0,1,2,source_file_name,source_file_date
0,0,Jurisdiction,75+ with booster in last 6 months,75+ with booster in last 6 months,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
1,1,Jurisdiction,Residents vaccinated,% of residents vaccinated,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
2,2,National,86426,52.4%,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
3,3,NSW,27602,51.9%,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
4,4,VIC,22920,53.7%,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
...,...,...,...,...,...,...
237,6,QLD,15.2k,40.5%,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
238,7,SA,7.4k,46.5%,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
239,8,TAS,2.4k,54.8%,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
240,9,VIC,20.8k,43.7%,covid-19-outbreaks-in-australian-residential-a...,2024-04-26


In [18]:
dfs[5] # Regulatory Activities

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,6,source_file_name,source_file_date,7
0,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24,2024-25 (to 10 Oct 2024),covid-19-outbreaks-in-australian-residential-a...,2024-10-11,Total
1,1,Site visits,318,3452,1732,3814,2840,729,covid-19-outbreaks-in-australian-residential-a...,2024-10-11,12885
2,2,Non-site activities,3704,8396,6665,1413,487,168,covid-19-outbreaks-in-australian-residential-a...,2024-10-11,20833
3,3,Total activities,4022,11848,8397,5227,3327,887,covid-19-outbreaks-in-australian-residential-a...,2024-10-11,33718
4,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24,2024-25 (to 3 Oct 2024),covid-19-outbreaks-in-australian-residential-a...,2024-10-04,Total
...,...,...,...,...,...,...,...,...,...,...,...
83,3,Total activities,4027,11833,8397,5227,2958,32442,covid-19-outbreaks-in-australian-residential-a...,2024-05-10,
84,0,Regulatory Activities,2019-20 (1 Mar - 30 Jun 2020),2020-21,2021-22,2022-23,2023-24 (to 24 Apr),Total,covid-19-outbreaks-in-australian-residential-a...,2024-04-26,
85,1,Site visits,318,3452,1732,3814,2418,11734,covid-19-outbreaks-in-australian-residential-a...,2024-04-26,
86,2,Non-site activities,3709,8381,6665,1413,447,20615,covid-19-outbreaks-in-australian-residential-a...,2024-04-26,


In [19]:
dfs[6] # Active Outbreaks

Unnamed: 0,row_index_per_file,0,1,2,3,4,5,source_file_name,source_file_date
0,0,Service Name,State,Resident Deaths,Resident Cases,Staff Cases,Total Cases,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
1,1,Anglican Care Jesmond Grove,New South Wales,<6,23,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
2,2,Anglicare Donington Court,New South Wales,0,<6,<6,<6,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
3,3,Anglicare Elizabeth Lodge,New South Wales,0,9,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
4,4,Anglicare Mary Andrews Village,New South Wales,0,<6,0,<6,covid-19-outbreaks-in-australian-residential-a...,2024-10-11
...,...,...,...,...,...,...,...,...,...
5919,218,Residency by Dillons Narrogin,Western Australia,<6,9,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
5920,219,Rosewood Care West Perth,Western Australia,<6,59,20,79,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
5921,220,The Queenslea,Western Australia,0,15,<6,n/p,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
5922,221,Treeby Parklands Care Community,Western Australia,0,34,20,54,covid-19-outbreaks-in-australian-residential-a...,2024-04-26
