In [1]:
# Let's examine the PDF to find the significant incidents table
import fitz  # PyMuPDF
import pandas as pd
import json
import re

# Load the PDF
pdf_path = 'annual_report_2010_508.pdf'
doc = fitz.open(pdf_path)

# Based on the TOC, the significant fire activity is on page 8.
# Scan from there through page 16 at most, but never past the last page of
# the document: asking for doc[n] beyond the end raises an error.
start_page = 7  # 0-indexed, so page 8 is index 7
end_page = min(15, len(doc) - 1)  # last 0-indexed page to inspect

print(f"Searching for significant incidents table in pages {start_page+1} to {end_page+1}...")

# Function to extract tables from a page
def extract_tables_from_page(page):
    """Return every table detected on ``page``, converted with ``to_pandas()``.

    ``page.find_tables()`` is called once; each entry of the finder's
    ``tables`` attribute is converted via its ``to_pandas()`` method. An
    empty list is returned when the finder is falsy or holds no tables.
    """
    finder = page.find_tables()
    if not finder:
        return []
    detected = finder.tables
    if not detected:
        return []
    return [entry.to_pandas() for entry in detected]

# Search for tables in the specified page range.
# all_tables and table_pages are parallel lists (one entry per table), so
# table_pages[i] is the 1-indexed page of all_tables[i] even when a single
# page yields several tables.
all_tables = []
table_pages = []

for page_num in range(start_page, end_page + 1):
    page = doc[page_num]
    tables = extract_tables_from_page(page)
    if tables:
        all_tables.extend(tables)
        table_pages.extend([page_num + 1] * len(tables))  # 1-indexed for reporting

# De-duplicate so a page holding several tables is listed only once
print(f"Found {len(all_tables)} tables on pages: {sorted(set(table_pages))}")

# Preview each table so the significant incidents table can be recognised
for i, table in enumerate(all_tables):
    print(f"\nTable {i+1} on page {table_pages[i]} has shape {table.shape}:")
    print(table.head(2))  # Show first 2 rows to identify the table

# Pick the first table whose header looks like the incidents table
# (columns such as Name, Type, GACC, State, ...).
significant_table_index = None
for i, table in enumerate(all_tables):
    columns = table.columns.tolist()
    column_str = ', '.join(str(col) for col in columns).lower()

    # Keywords that indicate this is the incidents table
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):
        significant_table_index = i
        print(f"\nTable {i+1} on page {table_pages[i]} appears to be the significant incidents table.")
        print("Columns:", columns)
        break

# If we found the table, let's process it
if significant_table_index is not None:
    incidents_table = all_tables[significant_table_index]
    print("\nSignificant incidents table found:")
    print(incidents_table.head())

    # Clean up column names: multi-line PDF headers come through with embedded
    # line breaks. The escape must be a literal '\n'; a backslash followed by
    # a real line break inside the quotes is a line continuation that yields
    # the empty string, and replace('', '_') would put '_' between every
    # character of each name.
    incidents_table.columns = [str(col).strip().replace('\n', '_') for col in incidents_table.columns]

    # Save the table (with cleaned headers) to CSV
    csv_filename = 'significant_incidents_2010.csv'
    incidents_table.to_csv(csv_filename, index=False)
    print(f"\nSaved raw table to {csv_filename}")

    # Create the structured JSON format (similar to 2008/2009).
    # First, map column names to match the 2008/2009 format.
    column_mapping = {}

    # Show the actual columns so the automatic mapping can be checked
    print("\nActual columns in the 2010 table:")
    print(incidents_table.columns.tolist())

    # Map columns by keyword. The order of the checks matters: the first
    # matching keyword wins for each column.
    for col in incidents_table.columns:
        col_lower = col.lower()
        if 'name' in col_lower:
            column_mapping[col] = 'Name'
        elif 'type' in col_lower:
            column_mapping[col] = 'Inc_Type'
        elif 'gacc' in col_lower:
            column_mapping[col] = 'GACC'
        elif 'state' in col_lower:
            column_mapping[col] = 'State'
        elif 'start' in col_lower and 'date' in col_lower:
            column_mapping[col] = 'Start_Date'
        elif ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:
            column_mapping[col] = 'Contain_Control_Date'
        elif 'size' in col_lower or 'acres' in col_lower:
            column_mapping[col] = 'Size_Acres'
        elif 'cause' in col_lower:
            column_mapping[col] = 'Cause'
        elif 'cost' in col_lower:
            column_mapping[col] = 'Cost'

    print("\nProposed column mapping:")
    print(column_mapping)

    # Apply the mapping if it's complete
    if len(column_mapping) >= 5:  # Assuming we need at least 5 key columns
        incidents_table_mapped = incidents_table.rename(columns=column_mapping)

        # For any missing columns in our mapping, add them with None values
        required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date',
                            'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']

        for col in required_columns:
            if col not in incidents_table_mapped.columns:
                incidents_table_mapped[col] = None
                print(f"Added missing column: {col}")

        # Convert to list of dictionaries
        incidents_2010 = incidents_table_mapped.to_dict(orient='records')

        # Create the structured data
        structured_data_2010 = {
            "significant_incidents": incidents_2010
        }

        # Save to JSON
        json_filename = 'structured_incidents_data_2010.json'
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(structured_data_2010, f, indent=4)

        print(f"\nSaved structured data to {json_filename}")
        print("Sample incident:")
        # Guard the sample: a header-only table produces no records
        if incidents_2010:
            print(json.dumps(incidents_2010[0], indent=2))
        else:
            print("(table has no rows)")

        # Also dump the unmapped table. NOTE: its headers were already cleaned
        # in place above, so the keys are the cleaned names, not the PDF's.
        untouched_json = incidents_table.to_dict(orient='records')
        untouched_filename = 'untouched_incidents_2010.json'
        with open(untouched_filename, 'w', encoding='utf-8') as f:
            json.dump(untouched_json, f, indent=4)

        print(f"\nSaved untouched data to {untouched_filename}")

        # Create dimension tables: one row per distinct non-null value,
        # with IDs assigned from 1 in sorted order.
        # State dimension
        states = incidents_table_mapped['State'].dropna().unique()
        state_dimension = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]

        # GACC dimension
        gaccs = incidents_table_mapped['GACC'].dropna().unique()
        gacc_dimension = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]

        # Incident Type dimension
        inc_types = incidents_table_mapped['Inc_Type'].dropna().unique()
        inc_type_dimension = [{'Inc_Type_ID': i+1, 'Inc_Type': inc_type} for i, inc_type in enumerate(sorted(inc_types))]

        # Cause dimension
        causes = incidents_table_mapped['Cause'].dropna().unique()
        cause_dimension = [{'Cause_ID': i+1, 'Cause': cause} for i, cause in enumerate(sorted(causes))]

        # Time dimension (simplified): the dates are sorted as plain strings,
        # so Date_IDs follow text order rather than calendar order.
        dates = incidents_table_mapped['Start_Date'].dropna().unique()
        time_dimension = [{'Date_ID': i+1, 'Date': date} for i, date in enumerate(sorted(dates))]

        # Create the dimension tables JSON
        dimension_tables = {
            'state_dimension': state_dimension,
            'gacc_dimension': gacc_dimension,
            'inc_type_dimension': inc_type_dimension,
            'cause_dimension': cause_dimension,
            'time_dimension': time_dimension
        }

        # Save the dimension tables
        dimension_filename = 'dimension_tables_2010.json'
        with open(dimension_filename, 'w', encoding='utf-8') as f:
            json.dump(dimension_tables, f, indent=4)

        print(f"\nCreated {dimension_filename} with the following dimensions:")
        print(f"- State dimension: {len(state_dimension)} states")
        print(f"- GACC dimension: {len(gacc_dimension)} GACCs")
        print(f"- Incident Type dimension: {len(inc_type_dimension)} types")
        print(f"- Cause dimension: {len(cause_dimension)} causes")
        print(f"- Time dimension: {len(time_dimension)} dates")
    else:
        print("\nCould not create a complete column mapping. Manual mapping needed.")

        # Show more of the table to help with manual mapping
        print("\nDetailed view of the table:")
        print(incidents_table.head(5))
else:
    print("\nCould not identify the significant incidents table.")

    # Fall back to a text search: report pages whose text mentions
    # significant fires/incidents so the table can be located by hand.
    for page_num in range(start_page, end_page + 1):
        page = doc[page_num]
        text = page.get_text()
        text_lower = text.lower()
        if "significant" in text_lower and ("fire" in text_lower or "incident" in text_lower):
            print(f"\nPage {page_num+1} contains text about significant fires/incidents.")
            print("First 500 characters of the page:")
            print(text[:500])

print("\nProcess completed.")

Searching for significant incidents table in pages 8 to 16...
Found 2 tables on pages: [9, 11]
Table 1 on page 9 has shape (9, 9):
         Name Inc.\nType GACC State Start Date Contain or\nControl\nDate  \
0  Long Butte         WF   EB    ID  21-Aug-10                  3-Sep-10   
1      Toklat         WF   AK    AK  16-May-10                  4-Jun-10   

  Size\n(Acres) Cause Estimated\nCost  
0       306,113     L     $ 4,225,000  
1       171,727     L      $2,109,186  
Table 2 on page 11 has shape (1, 11):
    AK  NW  NO  SO  NR  EB  WB  SW  RM   EA   SA
0  11%  4%  2%  4%  4%  9%  1%  7%  6%  11%  42%
Table 1 on page 9 appears to be the significant incidents table.
Columns: ['Name', 'Inc.\nType', 'GACC', 'State', 'Start Date', 'Contain or\nControl\nDate', 'Size\n(Acres)', 'Cause', 'Estimated\nCost']
Significant incidents table found:
             Name Inc.\nType GACC State Start Date Contain or\nControl\nDate  \
0      Long Butte         WF   EB    ID  21-Aug-10                 

In [2]:
# Load the PDF
pdf_path = 'annual_report_2010_508.pdf'
doc = fitz.open(pdf_path)

# Based on the TOC, we expect the significant incidents table on page 8 or nearby.
# We'll check pages 8 to 16 (0-indexed: pages 7 to 15), but never past the
# last page of the document: asking for doc[n] beyond the end raises an error.
start_page = 7  # page 8 (0-indexed)
end_page = min(15, len(doc) - 1)  # page 16, or the last page if the PDF is shorter

print(f"Searching for significant incidents table in pages {start_page+1} to {end_page+1}...")
  
# Function to extract tables from a page  
def extract_tables_from_page(page):
    """Return every table detected on ``page``, converted with ``to_pandas()``.

    ``page.find_tables()`` is called once; each entry of the finder's
    ``tables`` attribute is converted via its ``to_pandas()`` method. An
    empty list is returned when the finder is falsy or holds no tables.
    """
    finder = page.find_tables()
    if not finder:
        return []
    detected = finder.tables
    if not detected:
        return []
    return [entry.to_pandas() for entry in detected]
  
# Search for tables in the specified page range.
# all_tables and table_pages are parallel lists (one entry per table), so
# table_pages[i] is the 1-indexed page of all_tables[i] even when a single
# page yields several tables.
all_tables = []
table_pages = []

for page_num in range(start_page, end_page + 1):
    page = doc[page_num]
    tables = extract_tables_from_page(page)
    if tables:
        all_tables.extend(tables)
        table_pages.extend([page_num + 1] * len(tables))  # 1-indexed

# De-duplicate so a page holding several tables is listed only once
print(f"Found {len(all_tables)} tables on pages: {sorted(set(table_pages))}")

# Examine tables to find the one with significant incidents
significant_table_index = None
for i, table in enumerate(all_tables):
    columns = table.columns.tolist()
    column_str = ', '.join(str(col) for col in columns).lower()

    # Look for keywords like name, state, fire, or acres to decide if it is the incidents table
    if ('name' in column_str and 'state' in column_str) or ('fire' in column_str and 'acres' in column_str):
        significant_table_index = i
        print(f"\nTable {i+1} on page {table_pages[i]} appears to be the significant incidents table.")
        print("Columns:", columns)
        break

if significant_table_index is not None:
    incidents_table = all_tables[significant_table_index]
    print("\nSignificant incidents table found:")
    print(incidents_table.head())

    # Clean up column names: multi-line PDF headers contain embedded line
    # breaks, which are turned into underscores. This renames in place.
    incidents_table.columns = [str(col).strip().replace('\n', '_') for col in incidents_table.columns]

    # Save the table (with cleaned headers) as CSV for reference
    csv_filename = 'significant_incidents_2010.csv'
    incidents_table.to_csv(csv_filename, index=False)
    print(f"\nSaved raw table to {csv_filename}")

    # Map columns to match the expected structure (similar to 2008/2009)
    column_mapping = {}
    print("\nActual columns in the 2010 table:")
    print(incidents_table.columns.tolist())

    # Map based on keywords. The order of the checks matters: the first
    # matching keyword wins for each column.
    for col in incidents_table.columns:
        col_lower = col.lower()
        if 'name' in col_lower:
            column_mapping[col] = 'Name'
        elif 'type' in col_lower:
            column_mapping[col] = 'Inc_Type'
        elif 'gacc' in col_lower:
            column_mapping[col] = 'GACC'
        elif 'state' in col_lower:
            column_mapping[col] = 'State'
        elif 'start' in col_lower and 'date' in col_lower:
            column_mapping[col] = 'Start_Date'
        elif ('contain' in col_lower or 'control' in col_lower) and 'date' in col_lower:
            column_mapping[col] = 'Contain_Control_Date'
        elif 'size' in col_lower or 'acres' in col_lower:
            column_mapping[col] = 'Size_Acres'
        elif 'cause' in col_lower:
            column_mapping[col] = 'Cause'
        elif 'cost' in col_lower:
            column_mapping[col] = 'Cost'

    print("\nProposed column mapping:")
    print(column_mapping)

    # Apply mapping if sufficient key columns exist
    if len(column_mapping) >= 5:
        incidents_table_mapped = incidents_table.rename(columns=column_mapping)

        # Ensure required columns exist; add missing columns as None
        required_columns = ['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date',
                            'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost']
        for col in required_columns:
            if col not in incidents_table_mapped.columns:
                incidents_table_mapped[col] = None
                print(f"Added missing column: {col}")

        # Convert the mapped DataFrame to a list of dictionaries
        incidents_2010 = incidents_table_mapped.to_dict(orient='records')

        # Create the structured data JSON
        structured_data_2010 = {
            "significant_incidents": incidents_2010
        }

        # Save the structured JSON
        json_filename = 'structured_incidents_data_2010.json'
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(structured_data_2010, f, indent=4)

        print(f"\nSaved structured data to {json_filename}")
        print("Sample incident:")
        # Guard the sample: a header-only table produces no records
        if incidents_2010:
            print(json.dumps(incidents_2010[0], indent=2))
        else:
            print("(table has no rows)")
    else:
        print("\nCould not create a complete column mapping. Manual mapping needed.")
else:
    print("\nCould not identify the significant incidents table.")

Searching for significant incidents table in pages 8 to 16...
Found 2 tables on pages: [9, 11]

Table 1 on page 9 appears to be the significant incidents table.
Columns: ['Name', 'Inc.\nType', 'GACC', 'State', 'Start Date', 'Contain or\nControl\nDate', 'Size\n(Acres)', 'Cause', 'Estimated\nCost']

Significant incidents table found:
             Name Inc.\nType GACC State Start Date Contain or\nControl\nDate  \
0      Long Butte         WF   EB    ID  21-Aug-10                  3-Sep-10   
1          Toklat         WF   AK    AK  16-May-10                  4-Jun-10   
2       Jefferson         WF   EB    ID  13-Jul-10                 17-Jul-10   
3  Turquoise Lake         WF   AK    AK  18-May-10                  7-Sep-10   
4    Big Mountain         WF   AK    AK   2-Jun-10                  2-Sep-10   

  Size\n(Acres) Cause Estimated\nCost  
0       306,113     L     $ 4,225,000  
1       171,727     L      $2,109,186  
2       109,436     H       $ 700,819  
3        91,885     H    

In [3]:
# Continuing from the previous code: when incidents_table exists in this
# namespace, dump it to JSON as a list of row records without column mapping.
# NOTE: the keys are whatever incidents_table's column names are at this point.
if 'incidents_table' in locals():
    untouched_filename = 'untouched_incidents_2010.json'
    untouched_json = incidents_table.to_dict(orient='records')
    with open(untouched_filename, 'w') as f:
        f.write(json.dumps(untouched_json, indent=4))

    print("\nSaved untouched data to " + untouched_filename)


Saved untouched data to untouched_incidents_2010.json


In [4]:
# Build the dimension tables from the mapped table (incidents_table_mapped),
# when it exists in this namespace. Each dimension holds one row per distinct
# non-null value of its column, with IDs numbered from 1 in sorted order.
if 'incidents_table_mapped' in locals():
    # Distinct non-null values of each dimension column
    states = incidents_table_mapped['State'].dropna().unique()
    gaccs = incidents_table_mapped['GACC'].dropna().unique()
    inc_types = incidents_table_mapped['Inc_Type'].dropna().unique()
    causes = incidents_table_mapped['Cause'].dropna().unique()
    # Start_Date values are used as-is, so they sort as plain strings
    dates = incidents_table_mapped['Start_Date'].dropna().unique()

    # One {ID, value} record per distinct value
    state_dimension = [
        {'State_ID': key, 'State': value}
        for key, value in enumerate(sorted(states), start=1)
    ]
    gacc_dimension = [
        {'GACC_ID': key, 'GACC': value}
        for key, value in enumerate(sorted(gaccs), start=1)
    ]
    inc_type_dimension = [
        {'Inc_Type_ID': key, 'Inc_Type': value}
        for key, value in enumerate(sorted(inc_types), start=1)
    ]
    cause_dimension = [
        {'Cause_ID': key, 'Cause': value}
        for key, value in enumerate(sorted(causes), start=1)
    ]
    time_dimension = [
        {'Date_ID': key, 'Date': value}
        for key, value in enumerate(sorted(dates), start=1)
    ]

    # Bundle all dimensions under one top-level object
    dimension_tables = {
        'state_dimension': state_dimension,
        'gacc_dimension': gacc_dimension,
        'inc_type_dimension': inc_type_dimension,
        'cause_dimension': cause_dimension,
        'time_dimension': time_dimension
    }

    # Write the bundle to disk as JSON
    dimension_filename = 'dimension_tables_2010.json'
    with open(dimension_filename, 'w') as f:
        f.write(json.dumps(dimension_tables, indent=4))

    # Summarise how many entries each dimension received
    print("\nCreated " + dimension_filename + " with the following dimensions:")
    print("- State dimension: " + str(len(state_dimension)) + " states")
    print("- GACC dimension: " + str(len(gacc_dimension)) + " GACCs")
    print("- Incident Type dimension: " + str(len(inc_type_dimension)) + " types")
    print("- Cause dimension: " + str(len(cause_dimension)) + " causes")
    print("- Time dimension: " + str(len(time_dimension)) + " dates")


Created dimension_tables_2010.json with the following dimensions:
- State dimension: 3 states
- GACC dimension: 2 GACCs
- Incident Type dimension: 1 types
- Cause dimension: 2 causes
- Time dimension: 9 dates
