## covid-19-au-health-aged-care.ipynb

Downloads PDF files from the sub-pages below a targeted page into a local directory (datadir). Only downloads a file if the local copy does not already exist.  
Reads the text from each local pdf file, extracting key fields. Writes those fields out as an Excel file.

In [1]:
from bs4 import BeautifulSoup
import fitz # pip install PyMuPDF
import os
import pandas
import random
import re
import requests
import time

# Base URL; prepended to the relative hrefs scraped from the pages
site = "https://www.health.gov.au"
# Page that is fetched first; its links lead to the sub-pages holding the PDFs
main_url = "https://www.health.gov.au/resources/collections/covid-19-outbreaks-in-australian-residential-aged-care-facilities"

# Local directory where the PDFs are stored and read back from.
# Keep the trailing '/': file paths are built from it by plain string concatenation.
# NOTE(review): hardcoded absolute Windows path - adjust per machine
datadir = 'c:/dev/covid-19-au-vaccinations/health-aged-care/'
# Excel workbook that receives the extracted fields (written into datadir)
output_filename = datadir + "health-aged-care.xlsx"

In [2]:
def download_pdf(pdf_url, local_dir, timeout=60):
    """Download ``pdf_url`` into ``local_dir`` unless the file is already there.

    The local file name is the last path component of ``pdf_url``.

    Parameters
    ----------
    pdf_url : str
        Absolute URL of the PDF file.
    local_dir : str
        Directory that receives the file; created if it does not exist.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` so a stalled server
        cannot hang the notebook forever.

    Returns
    -------
    None
        Progress is reported with ``print``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written in
        that case, so a later run will retry the download.
    """
    # Get the filename from the URL
    filename = os.path.basename(pdf_url)
    local_path = os.path.join(local_dir, filename)

    # Nothing to do if the file already exists in the local directory
    if os.path.exists(local_path):
        print(f"{filename} already exists in {local_dir}")
        return

    # sleep for a random time (2.0 - 4.9 seconds) before downloading
    time.sleep(2 + (random.randrange(0, 30) / 10))

    # Download the PDF file; refuse to store an HTTP error page as a ".pdf",
    # because the exists-check above would then never fetch the real file
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    # Write to a temporary name and rename afterwards, so an interrupted write
    # never leaves a truncated file that looks like a finished download
    partial_path = local_path + ".part"
    with open(partial_path, "wb") as file:
        file.write(response.content)
    os.replace(partial_path, local_path)
    print(f"Downloaded {filename} to {local_dir}")


In [3]:
# Get and process the main page
response = requests.get(main_url, timeout=60)
# Fail loudly on an HTTP error; parsing an error page would silently produce an empty list of links
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Find and extract the links whose text (wrapped in a span tag) contains "COVID-19 outbreaks in Australian residential aged care facilities".
# Anchors without an href are skipped so that the list holds strings only.
links = [link.get("href") for link in soup.select("a:has(span:-soup-contains('COVID-19 outbreaks in Australian residential aged care facilities'))") if link.get("href")]

In [4]:
# Browse through the list of links and process those whose URL contains one of the entries of links_to_check.
links_to_check = ["2022", "2023","2024"]
for each_link_candidate in links:
    # skip anchors without an href as well as links outside the years of interest
    if not each_link_candidate or not any(sub_string in each_link_candidate for sub_string in links_to_check):
        continue

    # sleep for a random time (1.0 - 2.9 seconds) before getting the sub-page
    time.sleep(1 + ( random.randrange( 0, 20) / 10 ) )

    # get and process the sub-page from each qualifying link
    each_link = each_link_candidate
    sub_page_url = site + each_link
    sub_page_response = requests.get(sub_page_url, timeout=60)
    if not sub_page_response.ok:
        # one broken sub-page should not stop the remaining downloads
        print(f"Skipping {sub_page_url}: HTTP status {sub_page_response.status_code}")
        continue
    sub_page_soup = BeautifulSoup(sub_page_response.content, "html.parser")

    # Find and extract the links to PDF files
    pdf_links = [pdf_link.get("href") for pdf_link in sub_page_soup.select("a:has(span:-soup-contains('PDF'))") if pdf_link.get("href")]
    if not pdf_links:
        # previously this case ended the whole loop with an IndexError
        print(f"No PDF link found on {sub_page_url}")
        continue

    # download the first pdf file link to the local directory (if it doesn't already exist)
    download_pdf(site + pdf_links[0], datadir)

Downloaded covid-19-outbreaks-in-australian-residential-aged-care-facilities-12-april-2024_0.pdf to c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-5-april-2024_0.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-28-march-2024.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-22-march-2024.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-15-march-2024.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-8-march-2024_0.pdf already exists in c:/dev/covid-19-au-vaccinations/health-aged-care/
covid-19-outbreaks-in-australian-residential-aged-care-facilities-1-march-2024.pdf alrea

In [5]:
def extract_data_from_pdf(pdf_file):
    """Open a PDF file, search its text for the key fields and return them.

    Parameters
    ----------
    pdf_file : str
        Path of the PDF file, handed to ``fitz.open``.

    Returns
    -------
    tuple
        ``(end_date, lagevrio_treatment_courses, lagevrio_prescriptions,
        paxlovid_prescriptions)``.

        ``end_date`` is the day number, the text following it and a four-digit
        year, separated by single spaces (e.g. ``"31 October 2023"``), or ``''``
        when the phrase is not found.

        Each of the three counts is the matched digit string exactly as it
        appears in the text (comma separators are kept), or the integer ``0``
        when the phrase is not found.
    """
    with fitz.open(pdf_file) as doc:  # open document
        # pages are joined with a form feed character (chr(12))
        text = chr(12).join([page.get_text() for page in doc])

    # A count: digits followed by any number of comma-separated digit groups,
    # so values such as 1,048,269 are captured in full
    count = r"(\d+(?:,\d+)*)"

    def first_count(pattern):
        """Return group 1 of the first match of ``pattern`` in the text, or 0."""
        match = re.search(pattern, text, re.DOTALL)
        return match.group(1) if match else 0

    end_date = ''

    # search for: Lagevrio ... up to DD MMMM YYYY (allowing for extra spaces around the month)
    pattern = r'Lagevrio.*?up\s+to\s+(\d+)(.*?)(\d{4})'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        # split()/join collapses every run of whitespace (including line breaks
        # inside the PDF text) into a single space
        end_date = ' '.join(' '.join(match.groups()).split())

    # search for: deployed NNN treatment courses of Lagevrio
    lagevrio_treatment_courses = first_count(
        r"deployed\s*" + count + r"\s*treatment\s+courses\s+of\s+Lagevrio")

    # search for: NNN prescriptions for Lagevrio
    lagevrio_prescriptions = first_count(
        count + r"\s*prescriptions\s+for\s+Lagevrio")

    # search for: further NNN prescriptions for Paxlovid
    paxlovid_prescriptions = first_count(
        r"further\s*" + count + r"\s*prescriptions\s+for\s+Paxlovid")

    return end_date, lagevrio_treatment_courses, lagevrio_prescriptions, paxlovid_prescriptions

In [6]:
# Browse through all the local pdf files, gathering the search results into a dataframe for output

output_columns = ['source_file_name', 'end_date', 'lagevrio_courses', 'lagevrio_prescriptions', 'paxlovid_prescriptions']
output_rows = []

# os.listdir() returns entries in arbitrary order; sorting makes the row order reproducible between runs
for file in sorted(os.listdir(datadir)):
    filename = os.fsdecode(file)
    if not filename.endswith('.pdf'):
        continue

    pdf_file = os.path.join(datadir, filename)
    end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions = extract_data_from_pdf(pdf_file)

    # collect the output row; the dataframe is built once after the loop
    # instead of being grown one row at a time
    output_rows.append([filename, end_date, lagevrio_courses, lagevrio_prescriptions, paxlovid_prescriptions])

output_df = pandas.DataFrame(output_rows, columns=output_columns)
output_df.to_excel(output_filename, index=False)

output_df

Unnamed: 0,source_file_name,end_date,lagevrio_courses,lagevrio_prescriptions,paxlovid_prescriptions
0,covid-19-outbreaks-in-australian-residential-a...,,48134,0,0
1,covid-19-outbreaks-in-australian-residential-a...,31 October 2023,48269,82412,7250
2,covid-19-outbreaks-in-australian-residential-a...,11 January 2024,48269,92911,8841
3,covid-19-outbreaks-in-australian-residential-a...,26 June 2022,48269,9010,157
4,covid-19-outbreaks-in-australian-residential-a...,31 January 2024,48269,99877,9759
...,...,...,...,...,...
113,covid-19-outbreaks-in-australian-residential-a...,3 September 2023,48269,76739,6319
114,covid-19-outbreaks-in-australian-residential-a...,4 December 2022,48269,37200,1145
115,covid-19-outbreaks-in-australian-residential-a...,31 January 2024,48269,99877,9759
116,covid-19-outbreaks-in-australian-residential-a...,4 June 2023,48269,68909,5231
