## Extract data tables from PDFs

In [8]:
import pdfplumber
import pandas as pd
import os
import re

In [9]:
# Path to the folder containing the PDFs to process; every PDF found in it
# is read, and the resulting Excel workbooks are written back into it.
folder_path = "Resources/CAL_FireStats"


In [10]:
# Function to extract and format the table number from the caption
def extract_table_number(caption, default_name):
    """Return a label such as ``Table_3`` taken from a caption string.

    Args:
        caption: Text expected to contain a label like "Table 3". The word
            "Table" is matched case-insensitively. May be empty or None.
        default_name: Value returned when no label is found in ``caption``.

    Returns:
        The matched word (original letter case preserved) and number joined
        by a single underscore, or ``default_name`` when there is no match.
    """
    if not caption:
        return default_name

    match = re.search(r'(Table)\s+(\d+)', caption, re.IGNORECASE)
    if match is None:
        return default_name

    # Join the two captured parts directly instead of replacing spaces:
    # ``\s+`` also matches tabs, newlines and runs of spaces, which a plain
    # replace(" ", "_") would leave in place or turn into several underscores.
    return f"{match.group(1)}_{match.group(2)}"


In [11]:
# Loop through all PDFs in the folder and write each one's tables to an
# Excel workbook (one sheet per extracted table) next to the source PDF.
for filename in os.listdir(folder_path):
    # Compare the extension case-insensitively so "*.PDF" files are not skipped
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(folder_path, filename)

    # Maps sheet name -> DataFrame for every table found in this PDF
    tables_dict = {}

    # Open the PDF
    with pdfplumber.open(pdf_path) as pdf:
        # Page numbers used in generated names are 1-based
        for page_number, page in enumerate(pdf.pages, start=1):
            # Extract table from the page; pages without one are skipped
            # before any text extraction is done for them
            table = page.extract_table()
            if not table:
                continue

            # Use the first line of the page text as the table caption
            text = page.extract_text()
            default_name = f"table_page_{page_number}"
            caption = text.split('\n')[0] if text else default_name
            table_name = extract_table_number(caption, default_name)

            # Use the first row as the header
            df = pd.DataFrame(table[1:], columns=table[0])

            # Drop columns where all values are None
            df.dropna(axis=1, how='all', inplace=True)

            # Ensure unique sheet names by appending the page number if necessary
            if table_name in tables_dict:
                table_name = f"{table_name}_{page_number}"

            tables_dict[table_name] = df

    # A workbook needs at least one sheet: with the openpyxl engine, saving
    # an ExcelWriter that received no sheets raises, which would abort the
    # whole run. Skip PDFs that yielded no tables instead.
    if not tables_dict:
        print(f"Skipped {filename}, no tables found")
        continue

    # Create an Excel writer object
    excel_filename = os.path.splitext(filename)[0] + ".xlsx"
    excel_path = os.path.join(folder_path, excel_filename)
    with pd.ExcelWriter(excel_path) as writer:
        for table_name, df in tables_dict.items():
            df.to_excel(writer, sheet_name=table_name, index=False)

    print(f"Processed {filename}, saved as {excel_filename}")

Processed 2015-wildfire-activity-stats.pdf, saved as 2015-wildfire-activity-stats.xlsx
Processed 2010-wildfire-activity-stats.pdf, saved as 2010-wildfire-activity-stats.xlsx
Processed 2022-redbook---wildfire-activity-statistics.pdf, saved as 2022-redbook---wildfire-activity-statistics.xlsx
Processed 2020-wildfire-activity-stats.pdf, saved as 2020-wildfire-activity-stats.xlsx
Processed 2009-wildfire-activity-stats.pdf, saved as 2009-wildfire-activity-stats.xlsx
Processed 2011-wildfire-activity-stats.pdf, saved as 2011-wildfire-activity-stats.xlsx
Processed 2014-wildfire-activity-stats.pdf, saved as 2014-wildfire-activity-stats.xlsx
Processed ca-wildfires-and-acres-for-all-jurisdictions.pdf, saved as ca-wildfires-and-acres-for-all-jurisdictions.xlsx
Processed 2021-wildfire-activity-stats.pdf, saved as 2021-wildfire-activity-stats.xlsx
Processed 2008-wildfire-activity-stats.pdf, saved as 2008-wildfire-activity-stats.xlsx
Processed 2018-wildfire-activity-stats.pdf, saved as 2018-wildfire-a