In [None]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2014 PDF  
pdf_path = 'Annual_Report_2014_508.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (similar to previous years)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2014 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting the column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the found table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table found:")  
    print(incidents_table.head())  
      
    # Clean column names (replace newlines with underscore)  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw data to JSON  
    raw_json = incidents_table.to_dict(orient='records')  
    raw_filename = 'incidents_2014.json'  
    with open(raw_filename, 'w') as f:  
        json.dump(raw_json, f, indent=4)  
    print("\nSaved raw data to " + raw_filename)  
      
    # Create a mapping for column names to standardize them  
    column_mapping = {}  
    for col in incidents_table.columns:  
        col_lower = col.lower()  
        if 'name' in col_lower:  
            column_mapping[col] = 'Name'  
        elif 'type' in col_lower:  
            column_mapping[col] = 'Inc_Type'  
        elif 'gacc' in col_lower:  
            column_mapping[col] = 'GACC'  
        elif 'state' in col_lower:  
            column_mapping[col] = 'State'  
        elif 'start' in col_lower and 'date' in col_lower:  
            column_mapping[col] = 'Start_Date'  
        elif ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:  
            column_mapping[col] = 'Contain_Control_Date'  
        elif 'size' in col_lower or 'acres' in col_lower:  
            column_mapping[col] = 'Size_Acres'  
        elif 'cause' in col_lower:  
            column_mapping[col] = 'Cause'  
        elif 'cost' in col_lower:  
            column_mapping[col] = 'Cost'  
      
    print("\nProposed column mapping:")  
    print(column_mapping)  
      
    # Apply the mapping to create a structured table  
    structured_table = incidents_table.rename(columns=column_mapping)  
      
    # Add any missing columns with None values  
    required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date',   
                       'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']  
    for col in required_columns:  
        if col not in structured_table.columns:  
            structured_table[col] = None  
      
    # Convert to list of dictionaries for JSON  
    structured_data = structured_table.to_dict(orient='records')  
      
    # Create the structured JSON  
    structured_json = {  
        "significant_incidents": structured_data  
    }  
      
    # Save the structured data to JSON  
    structured_filename = 'structured_incidents_data_2014.json'  
    with open(structured_filename, 'w') as f:  
        json.dump(structured_json, f, indent=4)  
    print("\nSaved structured data to " + structured_filename)  
    print("Sample incident:")  
    print(json.dumps(structured_data[0], indent=2))  
      
    # Create dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension  
    inc_types = structured_table['Inc_Type'].dropna().unique()  
    inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension (using Start_Date values)  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2014.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("\nCreated dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
      
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
      
    # If we can't find the table automatically, let's try to look for text that might indicate where it is  
    for page_num in range(start_page, end_page + 1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
  
print("\nProcess completed.")  

In [None]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2015 PDF  
pdf_path = 'annual_report_2015_508.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (similar to previous years)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2015 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting the column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the found table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table found:")  
    print(incidents_table.head())  
      
    # Clean column names (replace newlines with underscore and strip whitespace)  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw table data to JSON  
    raw_json = incidents_table.to_dict(orient='records')  
    raw_filename = 'incidents_2015.json'  
    with open(raw_filename, 'w') as f:  
        json.dump(raw_json, f, indent=4)  
    print("\nSaved raw table data to " + raw_filename)  
      
    # Create a mapping for column names to standardize them  
    column_mapping = {}  
      
    # Inspect the actual columns to create the mapping  
    print("\nActual columns in the 2015 table:")  
    print(incidents_table.columns.tolist())  
      
    # Try to automatically map columns based on keywords  
    for col in incidents_table.columns:  
        col_lower = col.lower()  
        if 'name' in col_lower:  
            column_mapping[col] = 'Name'  
        elif 'type' in col_lower:  
            column_mapping[col] = 'Inc_Type'  
        elif 'gacc' in col_lower:  
            column_mapping[col] = 'GACC'  
        elif 'state' in col_lower:  
            column_mapping[col] = 'State'  
        elif 'start' in col_lower and 'date' in col_lower:  
            column_mapping[col] = 'Start_Date'  
        elif ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:  
            column_mapping[col] = 'Contain_Control_Date'  
        elif 'size' in col_lower or 'acres' in col_lower:  
            column_mapping[col] = 'Size_Acres'  
        elif 'cause' in col_lower:  
            column_mapping[col] = 'Cause'  
        elif 'cost' in col_lower:  
            column_mapping[col] = 'Cost'  
      
    print("\nProposed column mapping:")  
    print(column_mapping)  
      
    # Apply the mapping if it's complete enough  
    if len(column_mapping) >= 5:  # Assuming we need at least 5 key columns  
        structured_table = incidents_table.rename(columns=column_mapping)  
          
        # For any missing columns in our mapping, add them with None values  
        required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date',   
                           'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']  
          
        for col in required_columns:  
            if col not in structured_table.columns:  
                structured_table[col] = None  
                print("Added missing column: " + col)  
          
        # Convert to list of dictionaries for JSON  
        structured_data = structured_table.to_dict(orient='records')  
          
        # Create the structured data format  
        structured_json = {  
            "significant_incidents": structured_data  
        }  
          
        # Save to JSON  
        structured_filename = 'structured_incidents_data_2015.json'  
        with open(structured_filename, 'w') as f:  
            json.dump(structured_json, f, indent=4)  
          
        print("\nSaved structured data to " + structured_filename)  
        print("Sample incident:")  
        print(json.dumps(structured_data[0], indent=2))  
          
        # Create dimension tables  
        # State dimension  
        states = structured_table['State'].dropna().unique()  
        state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
          
        # GACC dimension  
        gaccs = structured_table['GACC'].dropna().unique()  
        gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
          
        # Incident Type dimension  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
          
        # Cause dimension  
        causes = structured_table['Cause'].dropna().unique()  
        cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
          
        # Time dimension (using Start_Date values)  
        dates = structured_table['Start_Date'].dropna().unique()  
        time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
          
        dimension_tables = {  
            'state_dimension': state_dimension,  
            'gacc_dimension': gacc_dimension,  
            'inc_type_dimension': inc_type_dimension,  
            'cause_dimension': cause_dimension,  
            'time_dimension': time_dimension  
        }  
          
        dimension_filename = 'dimension_tables_2015.json'  
        with open(dimension_filename, 'w') as f:  
            json.dump(dimension_tables, f, indent=4)  
        print("\nCreated dimension tables and saved to " + dimension_filename)  
        print("- State dimension: " + str(len(state_dimension)) + " states")  
        print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
        print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
        print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
        print("- Time dimension: " + str(len(time_dimension)) + " dates")  
    else:  
        print("\nCould not create a complete column mapping. Manual mapping needed.")  
          
        # Let's create a more detailed view of the table to help with manual mapping  
        print("\nDetailed view of the table:")  
        print(incidents_table.head(5))  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
      
    # If we can't find the table automatically, let's try to look for text that might indicate where it is  
    for page_num in range(start_page, end_page + 1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
  
print("\nProcess completed.")  

In [9]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2016 PDF  
pdf_path = 'annual_report_2016_508.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (similar to previous years)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2016 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting the column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print(f"Table {i+1} columns: {column_str}")  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the identified table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("Significant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: strip spaces and newline  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON (filename: incidents_2016.json)  
    raw_filename = 'incidents_2016.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Mapping for structured data. Adjust mapping if needed.  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'inc' in std:  
            mapping[col] = 'Inc_Type'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'last' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
      
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply mapping to get structured data  
    structured_table = incidents_table.rename(columns=mapping)  
      
    # Save structured data to JSON file  
    structured_filename = 'structured_incidents_data_2016.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension (if present, otherwise create empty)  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension using Start_Date column  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2016.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: inspect pages for text hints  
    for page_num in range(start_page, end_page+1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
  
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2016 PDF...
Found 4 tables on pages: [10, 11, 13]
Table 1 columns: international:, through the nifc-ciffc agreement the u.s. provided two heavy air, col2
Table 2 columns: through the nifc-ciffc agreement canada provided two air tanker groups to the u.s., col1
Table 3 columns: name, gacc, state, start
date, last
report
date, size in
acres, cause*, estimated
cost

Table 3 on page 13 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*', 'Estimated\nCost']
Significant incidents table extracted:
             Name GACC State Start\nDate Last\nReport\nDate Size In\nAcres  \
0  Anderson Creek   SA    OK        3/23                4/4        367,740   
1         Pioneer   GB    ID        7/18              10/27        188,404   
2        Range 12   NW    WA        7/30                8/8        176,600   
3       Soberanes   SO    CA     

In [6]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2017 PDF  
pdf_path = 'annual_report_2017_508_0.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (similar to previous years)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2017 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting the column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print(f"Table {i+1} columns: {column_str}")  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the identified table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("Significant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: strip spaces and newline  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON (filename: incidents_2017.json)  
    raw_filename = 'incidents_2017.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Mapping for structured data. Adjust mapping if needed.  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'inc' in std:  
            mapping[col] = 'Inc_Type'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'last' in std or 'control' in std or 'contain' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std or 'size' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
      
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply mapping to get structured data  
    structured_table = incidents_table.rename(columns=mapping)  
      
    # Save structured data to JSON file  
    structured_filename = 'structured_incidents_data_2017.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension (check if exists, otherwise create empty)  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension using Start_Date column  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2017.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: inspect pages for text hints  
    for page_num in range(start_page, end_page+1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
              
    # If we still can't find it, let's expand our search to more pages  
    print("\nExpanding search to more pages...")  
    for page_num in range(0, doc.page_count):  
        if page_num < start_page or page_num > end_page:  # Only check pages we haven't checked yet  
            page = doc[page_num]  
            text = page.get_text()  
            if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
                print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
                print("First 200 characters of the page:")  
                print(text[:200])  
                  
                # Try to extract tables from this page  
                tables = extract_tables_from_page(page)  
                if tables:  
                    print(f"Found {len(tables)} tables on page {page_num+1}")  
                    for i, table in enumerate(tables):  
                        print(f"Table {i+1} columns: {', '.join(table.columns.tolist())}")  
  
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2017 PDF...
Found 4 tables on pages: [10, 11, 12, 14]
Table 1 columns: col0, through the nifc-ciffc agreement canada provided
Table 2 columns: name, gacc, state, start
date, last
report
date, size in
acres, cause*, estimated cost

Table 2 on page 11 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*', 'Estimated Cost']
Significant incidents table extracted:
                   Name GACC State Start\nDate  Last\nReport\nDate  \
0  NW Oklahoma\nComplex   SA    OK         3/7                3/24   
1              Perryton   SA    TX         3/6                3/13   
2                Thomas   SO    CA        12/4  Active\ninto\n2018   
3    Lodgepole\nComplex   NR    MT        7/20                8/11   
4         Roosters Comb   GB    NV         7/9                7/24   

  Size In\nAcres Cause* Estimated Cost  
0        779,292  

In [10]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2018 PDF  
pdf_path = 'annual_report_ 2018_508.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (similar to previous years)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2018 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # Use 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Process the identified table  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: strip spaces and replace newlines  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw table as JSON  
    raw_filename = 'incidents_2018.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Mapping for structured data  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'inc' in std:  
            mapping[col] = 'Inc_Type'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'last' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Create structured data  
    structured_table = incidents_table.rename(columns=mapping)  
      
    # Save structured data as JSON  
    structured_filename = 'structured_incidents_data_2018.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
      
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension, if available  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension using Start_Date  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2018.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: inspect pages for text hints and extra tables  
    for page_num in range(start_page, end_page+1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
      
    print("\nExpanding search to additional pages...")  
    for page_num in range(0, doc.page_count):  
        if page_num < start_page or page_num > end_page:  
            page = doc[page_num]  
            text = page.get_text()  
            if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
                print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
                print("First 200 characters of the page:")  
                print(text[:200])  
                # Attempt to extract tables from this additional page  
                tables = extract_tables_from_page(page)  
                if tables:  
                    print("Found " + str(len(tables)) + " tables on page " + str(page_num+1))  
                    for i, table in enumerate(tables):  
                        print("Table " + str(i+1) + " columns: " + ', '.join(table.columns.tolist()))  
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2018 PDF...
Found 2 tables on pages: [9, 10]
Table 1 columns: name, gacc, state, start
date, contain or last
report date, size
(acres), cause*, estimated
cost

Table 1 on page 9 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Contain or Last\nReport Date', 'Size\n(acres)', 'Cause*', 'Estimated\nCost']

Significant incidents table extracted:
                Name GACC State Start\nDate Contain or Last\nReport Date  \
0  Mendocino Complex   NO    CA        7/27                         9/18   
1             Martin   GB    NV         7/5                         7/21   
2               Rhea   SA    OK        4/12                         4/26   
3    South Sugarloaf   GB    NV        8/17                        10/10   
4               Carr   NO    CA        7/23                         8/30   

  Size\n(acres) Cause* Estimated\nCost  
0       459,123      U     220,000,000  
1 

In [12]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2019 PDF  
pdf_path = 'annual_report_2019_508.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (pages 8 to 16)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2019 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the specified pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # Use 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting table headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Process the identified table  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: Remove extra spaces and newline characters  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON  
    raw_filename = 'incidents_2019.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Mapping for structured data based on probable column names in the incidents table  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'inc' in std:  
            mapping[col] = 'Inc_Type'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'last' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
      
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply column mapping to generate structured data  
    structured_table = incidents_table.rename(columns=mapping)  
      
    # Save structured data to JSON  
    structured_filename = 'structured_incidents_data_2019.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
  
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
  
    # Incident Type dimension (if exists)  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
  
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
  
    # Time dimension using Start_Date column  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
  
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
  
    dimension_filename = 'dimension_tables_2019.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: Inspect additional pages for text hints  
    for page_num in range(start_page, end_page+1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
      
    print("\nExpanding search to additional pages...")  
    for page_num in range(0, doc.page_count):  
        if page_num < start_page or page_num > end_page:  
            page = doc[page_num]  
            text = page.get_text()  
            if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
                print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
                print("First 200 characters of the page:")  
                print(text[:200])  
                # Attempt to extract tables from this page  
                tables = extract_tables_from_page(page)  
                if tables:  
                    print("Found " + str(len(tables)) + " tables on page " + str(page_num+1))  
                    for i, table in enumerate(tables):  
                        print("Table " + str(i+1) + " columns: " + ', '.join(table.columns.tolist()))  
                          
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2019 PDF...
Found 2 tables on pages: [10, 11]
Table 1 columns: name, gacc, state, start
date, contain or last
report date, size
(acres), cause*, estimated
cost

Table 1 on page 10 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Contain or Last\nReport Date', 'Size\n(acres)', 'Cause*', 'Estimated\nCost']

Significant incidents table extracted:
             Name GACC State Start\nDate Contain or Last\nReport Date  \
0  Old Grouch Top   AK    AK         6/5                          8/1   
1     Frozen Calf   AK    AK        6/24                         7/11   
2      Hess Creek   AK    AK        6/21                          8/1   
3       Swan Lake   AK    AK         6/5                         10/2   
4   Bearnose Hill   AK    AK        6/29                         7/11   

  Size\n(acres) Cause* Estimated\nCost  
0       307,969      L         $61,000  
1       240,543   

In [13]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2020 PDF  
pdf_path = 'annual_report_2020.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (pages 8 to 16)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2020 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the given pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # Use 1-indexed page numbers  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting table headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the identified table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: remove surrounding spaces and replace newline with underscore  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON (filename: incidents_2020.json)  
    raw_filename = 'incidents_2020.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Create a column mapping for structured data based on column names we expect  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'inc' in std:  
            mapping[col] = 'Inc_Type'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'last' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply the mapping to create a structured table  
    structured_table = incidents_table.rename(columns=mapping)  
      
    # Save the structured table as JSON (filename: structured_incidents_data_2020.json)  
    structured_filename = 'structured_incidents_data_2020.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension (if available)  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension using Start_Date  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2020.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: inspect pages for text hints and additional tables  
    for page_num in range(start_page, end_page+1):  
        page = doc[page_num]  
        text = page.get_text()  
        if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
            print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
            print("First 200 characters of the page:")  
            print(text[:200])  
      
    print("\nExpanding search to additional pages...")  
    for page_num in range(0, doc.page_count):  
        if page_num < start_page or page_num > end_page:  
            page = doc[page_num]  
            text = page.get_text()  
            if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
                print("\nPage " + str(page_num+1) + " appears to contain relevant text for fires/incidents.")  
                print("First 200 characters of the page:")  
                print(text[:200])  
                # Attempt to extract tables from this page  
                tables = extract_tables_from_page(page)  
                if tables:  
                    print("Found " + str(len(tables)) + " tables on page " + str(page_num+1))  
                    for i, table in enumerate(tables):  
                        print("Table " + str(i+1) + " columns: " + ', '.join(table.columns.tolist()))  
  
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2020 PDF...
Found 2 tables on pages: [10, 11]
Table 1 columns: name, gacc, state, start
date, contain or last
report date, size
(acres), cause*, estimated
cost

Table 1 on page 10 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Contain or Last\nReport Date', 'Size\n(acres)', 'Cause*', 'Estimated\nCost']

Significant incidents table extracted:
                     Name GACC State Start\nDate Contain or Last\nReport Date  \
0          August Complex   NO    CA        8/17                        11/11   
1  SCU Lightning\nComplex   NO    CA        8/16                         9/14   
2             SHF Elkhorn   NO    CA        8/29                          9/9   
3                   Creek   SO    CA         9/4                        12/17   
4  LNU Lightning\nComplex   NO    CA        8/17                         10/1   

  Size\n(acres) Cause* Estimated\nCost  
0     1,032

In [14]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Load the 2021 PDF  
pdf_path = 'annual_report_2021.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (pages 8 to 16)  
start_page = 7   # 0-indexed (page 8)  
end_page = 15    # 0-indexed (page 16)  
  
print("Searching for significant incidents table in pages " + str(start_page+1) + " to " + str(end_page+1) + " of the 2021 PDF...")  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from the specified pages  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # 1-indexed page numbers  
  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by inspecting table headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is not None:  
    # Use the identified table for processing  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table extracted:")  
    print(incidents_table.head())  
      
    # Clean column names: remove newlines and excess spaces  
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON (filename: incidents_2021.json)  
    raw_filename = 'incidents_2021.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("Raw incidents table saved to " + raw_filename)  
      
    # Mapping for structured data  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'contain' in std or 'last' in std:  
            mapping[col] = 'Contain_or_Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply mapping to get structured data  
    structured_table = incidents_table.rename(columns=mapping)  
      
    structured_filename = 'structured_incidents_data_2021.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident Type dimension, if available  
    if 'Inc_Type' in structured_table.columns:  
        inc_types = structured_table['Inc_Type'].dropna().unique()  
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]  
    else:  
        inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension using Start_Date column  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2021.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("Created dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
else:  
    print("\nCould not identify the significant incidents table in the given page range.")  
    # Fallback: Expand search to additional pages if needed  
    for page_num in range(0, doc.page_count):  
        if page_num < start_page or page_num > end_page:  
            page = doc[page_num]  
            text = page.get_text()  
            if "significant" in text.lower() and ("fire" in text.lower() or "incident" in text.lower()):  
                print("\nPage " + str(page_num+1) + " contains text about significant fires/incidents.")  
                print("First 200 characters of the page:")  
                print(text[:200])  
                # Try to extract tables from the page  
                tables = extract_tables_from_page(page)  
                if tables:  
                    print("Found " + str(len(tables)) + " tables on page " + str(page_num+1))  
                    for i, table in enumerate(tables):  
                        print("Table " + str(i+1) + " columns: " + ', '.join(table.columns.tolist()))  
print("\nProcess completed.")  

Searching for significant incidents table in pages 8 to 16 of the 2021 PDF...
Found 2 tables on pages: [11, 12]
Table 1 columns: name, gacc, state, start
date, contain or last
report date, size
(acres), cause*, estimated
cost

Table 1 on page 11 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Contain or Last\nReport Date', 'Size\n(acres)', 'Cause*', 'Estimated\nCost']

Significant incidents table extracted:
            Name GACC State Start\nDate Contain or Last\nReport Date  \
0          Dixie   NO    CA        7/13                        10/23   
1        Bootleg   NW    OR         7/6                         8/13   
2       Monument   NO    CA        7/31                        10/25   
3         Caldor   NO    CA        8/14                        10/20   
4  River Complex   NO    CA        7/30                        10/24   

  Size\n(acres) Cause* Estimated\nCost  
0       963,309      U    $637,428,216  
1       413,717      L  

In [17]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# -------------------------------  
# Load and process annual_report.2022.pdf  
# -------------------------------  
pdf_path = 'annual_report.2022.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (pages 8 to 16, i.e., indices 7 to 15)  
start_page = 7   # page 8 (0-indexed)  
end_page = 15    # page 16 (0-indexed)  
  
print("Processing annual_report.2022.pdf; searching pages " + str(start_page+1) + " to " + str(end_page+1))  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from specified page range  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # saving 1-indexed page numbers  
  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    # Check for typical keywords indicative of the incidents table  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is None:  
    print("\nSignificant incidents table not found in expected pages. Exiting processing.")  
else:  
    # Process the identified table  
    incidents_table = all_tables[significant_table_index]  
    print("\nSignificant incidents table extracted (raw preview):")  
    print(incidents_table.head())  
      
    # -------------------------------  
    # Step 1: Clean the table columns  
    # -------------------------------  
    incidents_table.columns = [col.strip().replace('\n','_') for col in incidents_table.columns]  
      
    # Save the raw incidents table as JSON  
    raw_filename = 'incidents_2022.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("\nRaw incidents table saved to " + raw_filename)  
      
    # -------------------------------  
    # Step 2: Mapping column names to standardized names  
    # -------------------------------  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ','_').replace('\n','_').replace('(','').replace(')','')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'report' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply mapping for standardized (structured) data  
    structured_table = incidents_table.rename(columns=mapping)  
    structured_filename = 'structured_incidents_data_2022.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("\nStructured incidents data saved to " + structured_filename)  
      
    # -------------------------------  
    # Step 3: Generate Dimension Tables  
    # -------------------------------  
    # Create state dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Assuming Inc_Type is not available, so skip incident type dimension  
    inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension from Start_Date (as string; further transformation may be applied if needed)  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2022.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("\nCreated dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
      
print("\nProcess completed for annual_report.2022.pdf")  

Processing annual_report.2022.pdf; searching pages 8 to 16
Found 2 tables on pages: [9, 10]
Table 1 columns: name, gacc, state, start
date, last
report
date, size in
acres, cause*, estimated cost

Table 1 on page 9 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*', 'Estimated Cost']

Significant incidents table extracted (raw preview):
               Name GACC State Start\nDate Last\nReport\nDate Size In\nAcres  \
0      Lime Complex   AK    AK        6/15               7/26        865,625   
1      Hermits Peak   SW    NM         4/7              10/20        341,735   
2             Black   SW    NM        5/13              11/10        325,136   
3  Paradise Complex   AK    AK         7/7               8/11        275,703   
4       Tatlawiksuk   AK    AK         6/5               7/22        229,439   

  Cause* Estimated Cost  
0      U    $12,726,992  
1      H   $330,100,293  
2      

In [18]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# -------------------------------  
# Load and process annual_report_2023_508_0.pdf  
# -------------------------------  
pdf_path = 'annual_report_2023_508_0.pdf'  
doc = fitz.open(pdf_path)  
  
# Define expected page range (pages 8 to 16, i.e., indices 7 to 15)  
start_page = 7   # page 8 (0-indexed)  
end_page = 15    # page 16 (0-indexed)  
  
print("Processing annual_report_2023_508_0.pdf; searching pages " + str(start_page+1) + " to " + str(end_page+1))  
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):  
    tables = page.find_tables()  
    if tables and tables.tables:  
        return [table.to_pandas() for table in tables.tables]  
    return []  
  
all_tables = []  
table_pages = []  
  
# Extract tables from specified page range  
for page_num in range(start_page, end_page+1):  
    page = doc[page_num]  
    tables = extract_tables_from_page(page)  
    if tables:  
        all_tables.extend(tables)  
        table_pages.append(page_num+1)  # saving 1-indexed page numbers  
  
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))  
  
# Identify the significant incidents table by column headers  
significant_table_index = None  
for i, table in enumerate(all_tables):  
    columns = table.columns.tolist()  
    column_str = ', '.join(columns).lower()  
    print("Table " + str(i+1) + " columns: " + column_str)  
    # Check for typical keywords indicative of the incidents table  
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):  
        significant_table_index = i  
        print("\nTable " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")  
        print("Columns:", columns)  
        break  
  
if significant_table_index is None:  
    print("\nSignificant incidents table not found in expected pages. Exiting processing.")  
else:  
    # Process the identified table  
    incidents_table = all_tables[significant_table_index]  
      
    # Clean column names: strip spaces and newlines  
    incidents_table.columns = [col.strip().replace('\\n', '_') for col in incidents_table.columns]  
      
    # Save the raw extracted table as JSON  
    raw_filename = 'incidents_2023.json'  
    incidents_table.to_json(raw_filename, orient='records', indent=4)  
    print("\nRaw incidents table saved to " + raw_filename)  
      
    # Apply mapping to standardize column names  
    mapping = {}  
    for col in incidents_table.columns:  
        std = col.lower().replace(' ', '_').replace('\\n', '_').replace('(', '').replace(')', '')  
        if 'name' in std:  
            mapping[col] = 'Name'  
        elif 'gacc' in std:  
            mapping[col] = 'GACC'  
        elif 'state' in std:  
            mapping[col] = 'State'  
        elif 'start' in std:  
            mapping[col] = 'Start_Date'  
        elif 'report' in std:  
            mapping[col] = 'Last_Report_Date'  
        elif 'acre' in std:  
            mapping[col] = 'Size_Acres'  
        elif 'cause' in std:  
            mapping[col] = 'Cause'  
        elif 'cost' in std:  
            mapping[col] = 'Estimated_Cost'  
        else:  
            mapping[col] = col  
              
    print("\nProposed column mapping:")  
    print(mapping)  
      
    # Apply mapping for structured data  
    structured_table = incidents_table.rename(columns=mapping)  
    structured_filename = 'structured_incidents_data_2023.json'  
    structured_table.to_json(structured_filename, orient='records', indent=4)  
    print("Structured incidents data saved to " + structured_filename)  
      
    # Generate dimension tables  
    # State dimension  
    states = structured_table['State'].dropna().unique()  
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]  
      
    # GACC dimension  
    gaccs = structured_table['GACC'].dropna().unique()  
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]  
      
    # Incident type dimension (if available; not assumed here)  
    inc_type_dimension = []  
      
    # Cause dimension  
    causes = structured_table['Cause'].dropna().unique()  
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]  
      
    # Time dimension from Start_Date (as string; further transformation can be done if needed)  
    dates = structured_table['Start_Date'].dropna().unique()  
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]  
      
    dimension_tables = {  
        'state_dimension': state_dimension,  
        'gacc_dimension': gacc_dimension,  
        'inc_type_dimension': inc_type_dimension,  
        'cause_dimension': cause_dimension,  
        'time_dimension': time_dimension  
    }  
      
    dimension_filename = 'dimension_tables_2023.json'  
    with open(dimension_filename, 'w') as f:  
        json.dump(dimension_tables, f, indent=4)  
    print("\nCreated dimension tables and saved to " + dimension_filename)  
    print("- State dimension: " + str(len(state_dimension)) + " states")  
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")  
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")  
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")  
    print("- Time dimension: " + str(len(time_dimension)) + " dates")  
      
print("\nProcess completed for annual_report_2023_508_0.pdf")  

Processing annual_report_2023_508_0.pdf; searching pages 8 to 16
Found 6 tables on pages: [9, 10, 15, 16]
Table 1 columns: gacc, single
residences, mixed
commercial-
residential, multiple
residences, nonresidential
commercial
property, other
minor
structures, total
Table 2 columns: name, gacc, state, start
date, last
report
date, size in
acres, cause*

Table 2 on page 10 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*']

Raw incidents table saved to incidents_2023.json

Proposed column mapping:
{'Name': 'Name', 'GACC': 'GACC', 'State': 'State', 'Start\nDate': 'Start_Date', 'Last\nReport\nDate': 'Last_Report_Date', 'Size In\nAcres': 'Size_Acres', 'Cause*': 'Cause'}
Structured incidents data saved to structured_incidents_data_2023.json

Created dimension tables and saved to dimension_tables_2023.json
- State dimension: 5 states
- GACC dimension: 6 GACCs
- Incident Type dimension: 0 types
- Ca

In [21]:
import fitz  # PyMuPDF
import pandas as pd
import json
import re

# -------------------------------
# Load and process annual_report_2024.pdf
# -------------------------------
pdf_path = 'annual_report_2024.pdf'
doc = fitz.open(pdf_path)

print("Processing annual_report_2024.pdf; searching all " + str(doc.page_count) + " pages")

# Function to extract tables from a page using PyMuPDF's table extraction
def extract_tables_from_page(page):
    tables = page.find_tables()
    if tables and tables.tables:
        return [table.to_pandas() for table in tables.tables]
    return []

all_tables = []
table_pages = []

# Extract tables from all pages
for page_num in range(doc.page_count):
    page = doc[page_num]
    tables = extract_tables_from_page(page)
    if tables:
        for table in tables:
            all_tables.append(table)
            table_pages.append(page_num+1)  # saving 1-indexed page numbers

print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))

# Identify the significant incidents table by column headers
significant_table_index = None
for i, table in enumerate(all_tables):
    columns = table.columns.tolist()
    column_str = ', '.join(columns).lower()
    print("Table " + str(i+1) + " on page " + str(table_pages[i]) + " columns: " + column_str)
    
    # Check for typical keywords indicative of the incidents table
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):
        significant_table_index = i
        print("\
Table " + str(i+1) + " on page " + str(table_pages[i]) + " appears to be the significant incidents table.")
        print("Columns:", columns)
        break

# If not found by column names, try to find by looking at the content
if significant_table_index is None:
    print("\
Searching for tables with fire names or incident data...")
    for i, table in enumerate(all_tables):
        # Check if the table has at least 3 columns and more than 2 rows
        if len(table.columns) >= 3 and len(table) > 2:
            # Convert to string to check content
            table_str = table.to_string().lower()
            # Look for common fire names or patterns
            if any(keyword in table_str for keyword in ['fire', 'complex', 'wildfire', 'incident']):
                significant_table_index = i
                print("\
Table " + str(i+1) + " on page " + str(table_pages[i]) + " may contain fire incident data.")
                print("Sample content:")
                print(table.head(3))
                break

if significant_table_index is None:
    print("\
No table identified as significant incidents table. Printing all table headers for review:")
    for i, table in enumerate(all_tables):
        print("\
Table " + str(i+1) + " on page " + str(table_pages[i]) + ":")
        print(table.head(2))
    
    print("\
Significant incidents table not found in the document. Exiting processing.")
    print("\
Process completed for annual_report_2024.pdf")
else:
    # Process the identified table
    incidents_table = all_tables[significant_table_index]
    print("\
Significant incidents table extracted:")
    print(incidents_table.head())
    
    # Clean column names: strip spaces and newlines
    incidents_table.columns = [col.strip().replace('\
', '_') for col in incidents_table.columns]
    
    # Save the raw extracted table as JSON
    raw_filename = 'incidents_2024.json'
    incidents_table.to_json(raw_filename, orient='records', indent=4)
    print("Raw incidents table saved to " + raw_filename)
    
    # Apply mapping to standardize column names
    mapping = {}
    for col in incidents_table.columns:
        std = col.lower().replace(' ', '_').replace('\
', '_').replace('(', '').replace(')', '')
        if 'name' in std:
            mapping[col] = 'Name'
        elif 'gacc' in std:
            mapping[col] = 'GACC'
        elif 'state' in std:
            mapping[col] = 'State'
        elif 'start' in std:
            mapping[col] = 'Start_Date'
        elif 'report' in std:
            mapping[col] = 'Last_Report_Date'
        elif 'acre' in std:
            mapping[col] = 'Size_Acres'
        elif 'cause' in std:
            mapping[col] = 'Cause'
        elif 'cost' in std:
            mapping[col] = 'Estimated_Cost'
        else:
            mapping[col] = col
    
    print("\
Proposed column mapping:")
    print(mapping)
    
    # Apply mapping for structured data
    structured_table = incidents_table.rename(columns=mapping)
    structured_filename = 'structured_incidents_data_2024.json'
    structured_table.to_json(structured_filename, orient='records', indent=4)
    print("Structured incidents data saved to " + structured_filename)
    
    # Generate dimension tables
    # State dimension
    states = structured_table['State'].dropna().unique() if 'State' in structured_table.columns else []
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]
    
    # GACC dimension
    gaccs = structured_table['GACC'].dropna().unique() if 'GACC' in structured_table.columns else []
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]
    
    # Incident type dimension (if available; not assumed here)
    inc_type_dimension = []
    
    # Cause dimension
    causes = structured_table['Cause'].dropna().unique() if 'Cause' in structured_table.columns else []
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]
    
    # Time dimension from Start_Date (as string; further transformation can be done if needed)
    dates = structured_table['Start_Date'].dropna().unique() if 'Start_Date' in structured_table.columns else []
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]
    
    dimension_tables = {
        'state_dimension': state_dimension,
        'gacc_dimension': gacc_dimension,
        'inc_type_dimension': inc_type_dimension,
        'cause_dimension': cause_dimension,
        'time_dimension': time_dimension
    }
    
    dimension_filename = 'dimension_tables_2024.json'
    with open(dimension_filename, 'w') as f:
        json.dump(dimension_tables, f, indent=4)
    print("\
Created dimension tables and saved to " + dimension_filename)
    print("- State dimension: " + str(len(state_dimension)) + " states")
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")
    print("- Time dimension: " + str(len(time_dimension)) + " dates")
    
    print("\
Process completed for annual_report_2024.pdf")

Processing annual_report_2024.pdf; searching all 57 pages
Found 68 tables on pages: [13, 17, 18, 18, 19, 19, 20, 22, 22, 23, 41, 41, 45, 45, 45, 45, 45, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 48, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 50, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54]
Table 1 on page 13 columns: gacc, single
residences, mixed
commercial-
residential, multiple
residences, nonresidential
commercial
property, other
minor
structures, total
Table 2 on page 17 columns: name, gacc, state, start
date, last
report
date, size in
acres, cause*
Table 2 on page 17 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*']
Significant incidents table extracted:
               Name GACC State Start\nDate Last\nReport\nDate Size In\nAcres  \
0       Betty's Way   RM    NE        2/26               3/11         69,810   
1  Smoke