In [1]:
import fitz  # PyMuPDF  
import pandas as pd  
import json  
import re  
  
# Open the 2013 annual report with PyMuPDF.
pdf_path = 'Annual_Report_2013_508.pdf'
doc = fitz.open(pdf_path)

# Inclusive, 0-indexed window of pages to scan (pages 8 through 16 as a
# reader would number them); chosen to be similar to previous years' reports.
start_page, end_page = 7, 15

# The banner reports the window using 1-indexed page numbers.
print(f"Searching for significant incidents table in pages {start_page + 1} to {end_page + 1} of the 2013 PDF...")
  
# Function to extract tables from a page using PyMuPDF's table extraction  
def extract_tables_from_page(page):
    """Return the tables detected on ``page`` as a list of DataFrames.

    ``page.find_tables()`` is called once; every entry of the resulting
    finder's ``tables`` list is converted with ``to_pandas()``.  An empty
    list is returned when the finder is falsy or reports no tables.
    """
    finder = page.find_tables()
    if not finder or not finder.tables:
        return []

    frames = []
    for detected in finder.tables:
        frames.append(detected.to_pandas())
    return frames
  
# Every table extracted from the scanned pages, in page order.
all_tables = []
# 1-indexed numbers of the pages that yielded at least one table
# (one entry per page, used for the summary line below).
table_pages = []
# 1-indexed source page of each entry in all_tables (one entry per table).
# table_pages cannot be indexed by table position: a page holding several
# tables contributes several tables but only one page number.
table_page_numbers = []

# Clamp the window to the document's last page so that a PDF shorter than
# the expected range does not make doc[page_num] fail.
last_page = min(end_page, len(doc) - 1)

# Extract tables from the given pages
for page_num in range(start_page, last_page + 1):
    page = doc[page_num]
    tables = extract_tables_from_page(page)
    if tables:
        all_tables.extend(tables)
        table_pages.append(page_num + 1)  # 1-indexed page numbers
        table_page_numbers.extend([page_num + 1] * len(tables))
print("Found " + str(len(all_tables)) + " tables on pages: " + str(table_pages))

# Identify the significant incidents table by inspecting the column headers:
# the first table whose headers mention both "name" and "state", or both
# "fire" and "acres", is selected.
significant_table_index = None
for i, table in enumerate(all_tables):
    columns = table.columns.tolist()
    # str() keeps the join safe if a header is not a plain string.
    column_str = ', '.join(str(col) for col in columns).lower()
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):
        significant_table_index = i
        print("\nTable " + str(i+1) + " on page " + str(table_page_numbers[i]) + " appears to be the significant incidents table.")
        print("Columns:", columns)
        break
  
def _standard_column_name(col):
    """Return the standardized name for a raw header, or None if no rule matches.

    Rules are substring tests on the lower-cased header and are tried in a
    fixed order, so the first matching rule decides the result.
    """
    col_lower = col.lower()
    if 'name' in col_lower:
        return 'Name'
    if 'type' in col_lower:
        return 'Inc_Type'
    if 'gacc' in col_lower:
        return 'GACC'
    if 'state' in col_lower:
        return 'State'
    if 'start' in col_lower and 'date' in col_lower:
        return 'Start_Date'
    if ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:
        return 'Contain_Control_Date'
    if 'size' in col_lower or 'acres' in col_lower:
        return 'Size_Acres'
    if 'cause' in col_lower:
        return 'Cause'
    if 'cost' in col_lower:
        return 'Cost'
    return None


def _build_column_mapping(columns):
    """Map raw headers to standardized names, at most one header per name.

    The first header matching a rule claims the standardized name; a later
    header matching the same rule is left unmapped (it keeps its original
    label) so the rename does not create two columns with the same label.
    """
    mapping = {}
    claimed = set()
    for col in columns:
        target = _standard_column_name(col)
        if target is None or target in claimed:
            continue
        mapping[col] = target
        claimed.add(target)
    return mapping


def _start_date_sort_key(value):
    """Sort key ordering 'M/D' (or 'M/D/Y') strings chronologically.

    Plain string sorting would place '10/24' before '5/31'.  Values that do
    not parse as slash-separated integers sort after the parsed ones, in
    string order.
    """
    text = str(value).strip()
    try:
        numbers = [int(part) for part in text.split('/')]
    except ValueError:
        numbers = []
    if len(numbers) == 2:
        month, day = numbers
        return (0, 0, month, day, text)
    if len(numbers) == 3:
        month, day, year = numbers
        return (0, year, month, day, text)
    return (1, 0, 0, 0, text)


def _build_dimension(values, id_key, value_key, sort_key=None):
    """Return [{id_key: 1, value_key: v1}, ...] for ``values`` in sorted order.

    IDs are consecutive integers starting at 1, assigned after sorting.
    """
    ordered = sorted(values, key=sort_key)
    return [{id_key: i, value_key: value} for i, value in enumerate(ordered, start=1)]


if significant_table_index is not None:
    # Use the found table for processing
    incidents_table = all_tables[significant_table_index]
    print("\nSignificant incidents table found:")
    print(incidents_table.head())

    # Clean up column names by stripping whitespace and replacing newline characters with underscores
    incidents_table.columns = [col.strip().replace('\n', '_') for col in incidents_table.columns]

    # Save the raw table as JSON (call this incidents_2013.json)
    raw_data = incidents_table.to_dict(orient='records')
    raw_filename = 'incidents_2013.json'
    with open(raw_filename, 'w', encoding='utf-8') as f:
        json.dump(raw_data, f, indent=4)
    print("\nSaved raw extracted data to " + raw_filename)

    # Standardize the column names (rules live in _standard_column_name;
    # adjust them based on the actual column names)
    column_mapping = _build_column_mapping(incidents_table.columns)
    print("\nProposed column mapping:")
    print(column_mapping)

    # Apply the mapping to create the structured incidents data
    structured_table = incidents_table.rename(columns=column_mapping)

    # For any required key columns not present post-mapping, add them as None
    required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']
    for col in required_columns:
        if col not in structured_table.columns:
            structured_table[col] = None
            print("Added missing column: " + col)

    structured_data = structured_table.to_dict(orient='records')
    structured_filename = 'structured_incidents_data_2013.json'
    with open(structured_filename, 'w', encoding='utf-8') as f:
        json.dump(structured_data, f, indent=4)
    print("\nSaved structured data to " + structured_filename)
    # A detected table can have headers but no rows; only show a sample
    # when there is one instead of failing on structured_data[0].
    if structured_data:
        print("Sample structured incident:")
        print(json.dumps(structured_data[0], indent=2))
    else:
        print("No incident rows were extracted; no sample to show.")

    # Create dimension tables: distinct non-null values of each column,
    # sorted, with 1-based surrogate IDs.
    # State dimension
    states = structured_table['State'].dropna().unique()
    state_dimension = _build_dimension(states, 'State_ID', 'State')

    # GACC dimension
    gaccs = structured_table['GACC'].dropna().unique()
    gacc_dimension = _build_dimension(gaccs, 'GACC_ID', 'GACC')

    # Incident Type dimension
    inc_types = structured_table['Inc_Type'].dropna().unique()
    inc_type_dimension = _build_dimension(inc_types, 'Inc_Type_ID', 'Inc_Type')

    # Cause dimension
    causes = structured_table['Cause'].dropna().unique()
    cause_dimension = _build_dimension(causes, 'Cause_ID', 'Cause')

    # Time dimension (using Start_Date values), ordered chronologically
    # rather than as plain strings.
    dates = structured_table['Start_Date'].dropna().unique()
    time_dimension = _build_dimension(dates, 'Date_ID', 'Date', sort_key=_start_date_sort_key)

    dimension_tables = {
        'state_dimension': state_dimension,
        'gacc_dimension': gacc_dimension,
        'inc_type_dimension': inc_type_dimension,
        'cause_dimension': cause_dimension,
        'time_dimension': time_dimension,
    }

    dimension_filename = 'dimension_tables_2013.json'
    with open(dimension_filename, 'w', encoding='utf-8') as f:
        json.dump(dimension_tables, f, indent=4)
    print("\nCreated dimension tables and saved to " + dimension_filename)
    print("- State dimension: " + str(len(state_dimension)) + " states")
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")
    print("- Time dimension: " + str(len(time_dimension)) + " dates")

else:
    print("\nCould not identify the significant incidents table in the given page range.")

print("\nProcess completed.")

Searching for significant incidents table in pages 8 to 16 of the 2013 PDF...
Found 2 tables on pages: [12, 15]

Table 1 on page 12 appears to be the significant incidents table.
Columns: ['Name', 'GACC', 'State', 'Start\nDate', 'Last\nReport\nDate', 'Size In\nAcres', 'Cause*', 'Estimated\nCost']

Significant incidents table found:
           Name GACC State Start\nDate Last\nReport\nDate Size In\nAcres  \
0           Rim   SO    CA        8/17              10/24        257,314   
1    Lime Hills   AK    AK        5/31               8/29        201,808   
2   Moore Creek   AK    AK         6/2               8/29        157,747   
3  Pony Complex   EB    ID         8/9               8/19        149,384   
4        Silver   SW    NM         6/7               9/10        138,546   

  Cause* Estimated\nCost  
0      U    $127,350,000  
1      L      $2,883,457  
2      L        $371,499  
3      L      $4,000,000  
4      L     $14,300,000  

Saved raw extracted data to incidents_2013.jso