## SGP-ID-Bulletins.ipynb

Shredder for: Singapore Communicable Diseases Agency (CDA) weekly infectious disease bulletin.

Assumes the PDF files have already been downloaded from the targeted page (https://www.cda.gov.sg/resources/weekly-infectious-diseases-bulletin-2025) into a local directory (datadir). 
Reads the text from each local pdf file, extracting key fields. 

Writes the collected data out as an Excel file:
- sheet: SGP-ID-Bulletins - one row per bulletin file, holding the extracted ARI and SARS-CoV-2 / COVID-19 fields


In [1]:

import datetime
import dateutil # pip install python-dateutil
import pymupdf
import os
import pandas
import re
import time

# local directory holding the downloaded bulletin pdf files
# (keep the trailing slash: the output path below is formed by plain concatenation)
datadir = 'c:/dev/covid-19-world-vaccinations/SGP/'
# full path of the Excel workbook, placed alongside the pdf files
output_filename = f"{datadir}SGP-ID-Bulletins.xlsx"

In [None]:
def _to_int(number_text):
    """Convert a captured number such as '1,234' to an int."""
    # the search patterns allow a thousands separator, which int() rejects
    return int(number_text.replace(',', ''))


def _parse_bulletin_text(text):
    """Search the bulletin text for the key fields.

    Returns the 8-tuple documented on extract_data_from_pdf. A field whose
    pattern is not found keeps its default (0 for numbers, '' for strings).
    """
    e_week = 0
    e_week_dates = ''
    e_week_end_date = ''
    ARI_average_daily = 0
    Adult_SARS_CoV_2_pct = 0
    Paediatric_SARS_CoV_2_pct = 0
    ARI_samples_week = 0
    ARI_samples_COVID_19_pct = 0

    # search for: ARI indicators for E-week NNN (DATES)
    pattern = r'ARI\s+indicators\s+for\s+E-week\s*(\d+(?:,\d+)?)\s*\((.*?)\)'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        e_week = _to_int(match.group(1))
        e_week_dates = match.group(2)
        # the range separator may be a hyphen, an en dash or an em dash;
        # leave the end date empty when no separator is present
        date_parts = re.split(r'[-\u2013\u2014]', e_week_dates)
        if len(date_parts) > 1:
            e_week_end_date = date_parts[1].strip()

    # search for: polyclinics for ARI was NNN
    pattern = r"polyclinics\s+for\s+ARI\s+was\s+(\d+(?:,\d+)?)\s+"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        ARI_average_daily = _to_int(match.group(1))

    # search for: Adult samples ... SARS-COV-2 (NNN
    # non-greedy gap: with DOTALL a greedy '.*' would skip ahead to the LAST
    # "SARS-CoV-2 (" in the whole document instead of the first one that
    # follows "Adult samples"
    pattern = r"Adult\s+samples.*?SARS-CoV-2\s+\((\d+(?:,\d+)?)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        Adult_SARS_CoV_2_pct = _to_int(match.group(1))

    # search for: Paediatric samples ... SARS-COV-2 (NNN
    pattern = r"Paediatric\s+samples.*?SARS-CoV-2\s+\((\d+(?:,\d+)?)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        Paediatric_SARS_CoV_2_pct = _to_int(match.group(1))

    # search for:  COVID-19 among ARI samples (n= NNN) in the community was NNN
    pattern = r"COVID-19\s+among\s+ARI\s+samples\s+\(n=\s+(\d+(?:,\d+)?).*?in\s+the\s+community\s+was\s+(\d+(?:,\d+)?)"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        ARI_samples_week = _to_int(match.group(1))
        ARI_samples_COVID_19_pct = _to_int(match.group(2))

    return e_week , e_week_dates , e_week_end_date, ARI_average_daily , Adult_SARS_CoV_2_pct , Paediatric_SARS_CoV_2_pct , ARI_samples_week , ARI_samples_COVID_19_pct


def extract_data_from_pdf(pdf_file):
    """Open a pdf file, search its text for the key fields and return them.

    Args:
        pdf_file: path of the pdf file, passed to pymupdf.open.

    Returns:
        A tuple of (e_week, e_week_dates, e_week_end_date, ARI_average_daily,
        Adult_SARS_CoV_2_pct, Paediatric_SARS_CoV_2_pct, ARI_samples_week,
        ARI_samples_COVID_19_pct). Numeric fields are ints; e_week_dates and
        e_week_end_date are strings. Fields not found in the text are 0 / ''.
    """
    with pymupdf.open(pdf_file) as doc:  # open document
        # join the page texts with a form feed (chr(12)) between pages
        text = chr(12).join([page.get_text() for page in doc])

    # the document is only needed for text extraction; parse after closing it
    return _parse_bulletin_text(text)

#### Process local files

In [6]:
# browse through all the local pdf files, gathering the search results into rows for output
output_columns = ['source_file_name', 'e_week', 'e_week_dates', 'e_week_end_date', 'ARI_average_daily', 'Adult_SARS_CoV_2_pct', 'Paediatric_SARS_CoV_2_pct', 'ARI_samples_week', 'ARI_samples_COVID_19_pct']
output_rows = []

# sorted() makes the processing order independent of the OS directory order
for file in sorted(os.listdir(datadir)):
    filename = os.fsdecode(file)

    # skip anything that is not a pdf (extension compared case-insensitively)
    if not filename.lower().endswith('.pdf'):
        continue

    pdf_file = os.path.join(datadir, filename)

    # one output row per file: the file name followed by the extracted fields
    output_rows.append([filename, *extract_data_from_pdf(pdf_file)])

# build the dataframe once from the collected rows, rather than enlarging it
# row by row (which re-allocates on each insert and leaves columns as object dtype)
output_data_df = pandas.DataFrame(output_rows, columns=output_columns)

# sort df by e_week; a stable sort keeps files with equal e_week in file-name order
output_data_df = output_data_df.sort_values(['e_week'], kind='mergesort').reset_index(drop=True)

output_data_df

AttributeError: 'str' object has no attribute 'trim'

#### Gather result dataframes and write out to Excel sheets.

In [4]:

# write the output_data_df to an Excel file with Sheet name
my_sheet_name = "SGP-ID-Bulletins"

# the context manager closes the writer even if to_excel raises,
# so the file handle is not left open on failure
with pandas.ExcelWriter(output_filename, engine='xlsxwriter') as writer:
    output_data_df.to_excel(writer, sheet_name=my_sheet_name)
