In [1]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Open the 2011 annual report with PyMuPDF.
pdf_path = 'annual_report_2011_508.pdf'
doc = fitz.open(pdf_path)

# The search window is hard-coded rather than derived from the PDF's table of
# contents: earlier reports placed the significant incidents table near
# page 9, so pages 8-16 are scanned. Both bounds are 0-indexed and inclusive.
start_page = 7   # 0-indexed => printed page 8
end_page = 15    # 0-indexed => printed page 16

print(f"Searching for significant incidents table in pages {start_page + 1} to {end_page + 1} of the 2011 PDF...")
  
def extract_tables_from_page(page):
    """Return every table detected on *page* as a list of DataFrames.

    Calls ``page.find_tables()`` and converts each entry of the result's
    ``tables`` attribute with ``to_pandas()``. An empty list is returned when
    the finder result is falsy or reports no tables.
    """
    finder = page.find_tables()
    if not finder or not finder.tables:
        return []
    frames = []
    for found in finder.tables:
        frames.append(found.to_pandas())
    return frames
  
# Collect every table in the configured page window.
#   all_tables          one DataFrame per extracted table
#   table_pages         1-indexed pages that yielded at least one table
#                       (one entry per page, used for the summary line)
#   table_source_pages  parallel to all_tables: the 1-indexed page each table
#                       came from. Needed because a page may hold several
#                       tables, so a table index is not a valid index into
#                       table_pages.
all_tables = []
table_pages = []
table_source_pages = []

# Clamp the upper bound so a document shorter than the expected window does
# not raise IndexError when a page past the end is requested.
last_page = min(end_page, len(doc) - 1)

for page_num in range(start_page, last_page + 1):
    page = doc[page_num]
    tables = extract_tables_from_page(page)
    if tables:
        all_tables.extend(tables)
        table_pages.append(page_num + 1)  # 1-indexed
        table_source_pages.extend([page_num + 1] * len(tables))

print(f"Found {len(all_tables)} tables on pages: {table_pages}")

# Identify the table likely containing the significant incidents data by
# looking for tell-tale words in the header row; the first match wins.
significant_table_index = None
for i, table in enumerate(all_tables):
    columns = table.columns.tolist()
    # str() guards against non-string column labels, which would break join().
    column_str = ', '.join(str(column) for column in columns).lower()
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):
        significant_table_index = i
        print(f"\nTable {i + 1} on page {table_source_pages[i]} appears to be the significant incidents table.")
        print("Columns:", columns)
        break
  
# Turn the identified table into three JSON artefacts:
#   structured_incidents_data_2011.json  rows with normalised column names
#   untouched_incidents_2011.json        rows exactly as extracted
#   dimension_tables_2011.json           lookup tables for state, GACC,
#                                        incident type, cause and start date
if significant_table_index is not None:
    # Work on a copy so the DataFrame kept in all_tables retains its original
    # column labels.
    incidents_table = all_tables[significant_table_index].copy()
    print("\nSignificant incidents table found:")
    print(incidents_table.head())

    # Snapshot the records BEFORE any column cleaning so the "untouched" file
    # really is the direct output of table extraction.
    untouched_json = incidents_table.to_dict(orient='records')

    # Clean column names (replace newlines with underscores, strip spaces)
    incidents_table.columns = [str(col).strip().replace('\n', '_') for col in incidents_table.columns]

    # Map extracted column names onto the common schema by keyword.
    # The order of the checks matters: the first matching keyword decides.
    column_mapping = {}
    for col in incidents_table.columns:
        col_lower = col.lower()
        if 'name' in col_lower:
            target = 'Name'
        elif 'type' in col_lower:
            target = 'Inc_Type'
        elif 'gacc' in col_lower:
            target = 'GACC'
        elif 'state' in col_lower:
            target = 'State'
        elif 'start' in col_lower and 'date' in col_lower:
            target = 'Start_Date'
        elif ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:
            target = 'Contain_Control_Date'
        elif 'size' in col_lower or 'acres' in col_lower:
            target = 'Size_Acres'
        elif 'cause' in col_lower:
            target = 'Cause'
        elif 'cost' in col_lower:
            target = 'Cost'
        else:
            target = None
        # Only the first source column claims a target; a second claimant
        # would create duplicate column names and break the lookups below.
        if target is not None and target not in column_mapping.values():
            column_mapping[col] = target

    print("\nProposed column mapping:")
    print(column_mapping)

    # Rename columns using the mapping
    incidents_table_mapped = incidents_table.rename(columns=column_mapping)

    # Ensure that all required columns exist, add missing ones as None
    required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date',
                        'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']
    for col in required_columns:
        if col not in incidents_table_mapped.columns:
            incidents_table_mapped[col] = None
            print("Added missing column:", col)

    # Create structured JSON
    incidents_2011 = incidents_table_mapped.to_dict(orient='records')
    structured_data_2011 = {"significant_incidents": incidents_2011}

    structured_filename = 'structured_incidents_data_2011.json'
    with open(structured_filename, 'w') as f:
        json.dump(structured_data_2011, f, indent=4)
    print("\nSaved structured JSON data to " + structured_filename)
    print("Sample incident (structured):")
    if incidents_2011:
        print(json.dumps(incidents_2011[0], indent=2))
    else:
        # A header-only table has no first record to show.
        print("(table has no rows)")

    # Write the untouched JSON file from the snapshot taken above
    untouched_filename = 'untouched_incidents_2011.json'
    with open(untouched_filename, 'w') as f:
        json.dump(untouched_json, f, indent=4)
    print("\nSaved untouched JSON data to " + untouched_filename)

    # Create dimension tables: each is the sorted distinct non-null values of
    # one column, numbered from 1.
    # State dimension
    states = incidents_table_mapped['State'].dropna().unique()
    state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]

    # GACC dimension
    gaccs = incidents_table_mapped['GACC'].dropna().unique()
    gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]

    # Incident Type dimension
    inc_types = incidents_table_mapped['Inc_Type'].dropna().unique()
    inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]

    # Cause dimension
    causes = incidents_table_mapped['Cause'].dropna().unique()
    cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]

    # Time dimension using Start_Date values.
    # The extracted dates are strings such as '29-May-11'; sorting them as
    # text would put '29-May-11' before '9-Apr-11'. Sort chronologically
    # instead; values that do not match the day-month-year pattern sort last,
    # in text order.
    dates = incidents_table_mapped['Start_Date'].dropna().unique()
    parsed_dates = {
        date: pd.to_datetime(str(date), format='%d-%b-%y', errors='coerce')
        for date in dates
    }
    ordered_dates = sorted(
        dates,
        key=lambda date: (
            pd.isna(parsed_dates[date]),
            pd.Timestamp.max if pd.isna(parsed_dates[date]) else parsed_dates[date],
            str(date),
        ),
    )
    time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(ordered_dates)]

    dimension_tables = {
        'state_dimension': state_dimension,
        'gacc_dimension': gacc_dimension,
        'inc_type_dimension': inc_type_dimension,
        'cause_dimension': cause_dimension,
        'time_dimension': time_dimension
    }

    dimension_filename = 'dimension_tables_2011.json'
    with open(dimension_filename, 'w') as f:
        json.dump(dimension_tables, f, indent=4)

    print("\nCreated dimension tables and saved to " + dimension_filename)
    print("- State dimension: " + str(len(state_dimension)) + " states")
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")
    print("- Time dimension: " + str(len(time_dimension)) + " dates")

else:
    print("\nCould not identify a significant incidents table in the given page range.")

print("\nProcess completed.")

Searching for significant incidents table in pages 8 to 16 of the 2011 PDF...
Found 2 tables on pages: [12, 14]

Table 1 on page 12 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start Date', 'Contain or\nControl\nDate', 'Size\n(Acres)', 'Cause', 'Estimated\nCost']

Significant incidents table found:
            Name GACC State Start Date Contain or\nControl\nDate  \
0         Wallow   SW    AZ  29-May-11                  8-Jul-11   
1     Rock House   SA    TX   9-Apr-11                 12-May-11   
2  Honey Prairie   SA    GA  30-Apr-11                 28-Dec-11   
3    Horseshoe 2   SW    AZ   8-May-11                 20-Jul-11   
4    Deaton Cole   SA    TX  25-Apr-11                 11-May-11   

  Size\n(Acres) Cause Estimated\nCost  
0       538,049     U    $109,000,000  
1       314,444     H      $8,399,072  
2       309,200     L     $53,420,000  
3       222,954     H     $52,000,000  
4       175,000     U              NR  

Proposed col