In [3]:
# Import necessary libraries  
import PyPDF2
import re  
import json  
import pandas as pd  
from datetime import datetime  

In [22]:
# Convert PDF to text  
def extract_text_from_pdf(pdf_path):  
    text = ""  
    with open(pdf_path, 'rb') as file:  
        pdf_reader = PyPDF2.PdfReader(file)  
        for page_num in range(len(pdf_reader.pages)):  
            page = pdf_reader.pages[page_num]  
            text += page.extract_text() + "\n\n"  
    return text  
  
# Save text to file  
def save_text_to_file(text, output_path):  
    with open(output_path, 'w', encoding='utf-8') as file:  
        file.write(text)  
    print(f"Text saved to {output_path}")  
  
# Extract text from PDF  
pdf_path = 'annual_report_2009_508.pdf'  
output_text_path = 'annual_report_2009_text.txt'  
  
print("Extracting text from PDF...")  
extracted_text = extract_text_from_pdf(pdf_path)  
save_text_to_file(extracted_text, output_text_path)  
  
# Step 2: Process the text to extract incident data  
print("\nProcessing text to extract incident data...")  
  
# Read the text file  
with open(output_text_path, 'r', encoding='utf-8') as file:  
    text_content = file.read()  
  
# Look for the significant incidents section  
# This pattern might need adjustment based on the actual format  
incidents_section_pattern = r"(?:Significant\s+Incidents|Large\s+Fires|Major\s+Incidents).*?(?=\n\n\w+|\Z)"  
incidents_section_match = re.search(incidents_section_pattern, text_content, re.DOTALL | re.IGNORECASE)  
  
if incidents_section_match:  
    incidents_section = incidents_section_match.group(0)  
    print("Found incidents section. Length:", len(incidents_section))  
    print("First 500 characters:", incidents_section[:500])  
else:  
    print("Incidents section not found. Looking for table patterns directly.")  
    incidents_section = text_content  
  
# Look for table patterns in the incidents section or full text  
# This pattern looks for lines that might be table rows with fire data  
table_pattern = r"([A-Za-z\s\-\.&]+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{4})\s+(\d{1,2}/\d{1,2}/\d{4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
table_matches = re.findall(table_pattern, incidents_section)  
  
if not table_matches:  
    print("No table matches found with the specific pattern. Trying alternative pattern.")  
    # Try an alternative pattern that's more flexible  
    table_pattern = r"([A-Za-z\s\-\.&]+?)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
    table_matches = re.findall(table_pattern, incidents_section)  
  
# If still no matches, try a more general approach  
if not table_matches:  
    print("Still no matches. Trying line-by-line approach.")  
    lines = incidents_section.split('\n')  
    table_matches = []  
      
    # Look for lines that might contain fire data  
    for line in lines:  
        # This pattern looks for lines with dates and numbers that might be fire data  
        if re.search(r'\d{1,2}/\d{1,2}/\d{2,4}.*\d{1,2}/\d{1,2}/\d{2,4}.*\d{1,3}(?:,\d{3})*', line):  
            # Try to parse the line into components  
            parts = re.split(r'\s{2,}', line.strip())  
            if len(parts) >= 8:  # Assuming at least 8 columns  
                # Extract the components  
                name = parts[0].strip()  
                inc_type = parts[1].strip() if len(parts) > 1 else ""  
                gacc = parts[2].strip() if len(parts) > 2 else ""  
                state = parts[3].strip() if len(parts) > 3 else ""  
                start_date = parts[4].strip() if len(parts) > 4 else ""  
                contain_date = parts[5].strip() if len(parts) > 5 else ""  
                size_acres = parts[6].strip() if len(parts) > 6 else ""  
                cause = parts[7].strip() if len(parts) > 7 else ""  
                  
                table_matches.append((name, inc_type, gacc, state, start_date, contain_date, size_acres, cause))  
  
# Step 3: Structure the extracted data  
significant_incidents = []  
  
if table_matches:  
    for match in table_matches:  
        name, inc_type, gacc, state, start_date, contain_date, size_acres, cause = match  
          
        # Clean up the data  
        name = name.strip()  
        size_acres = size_acres.replace(',', '')  # Remove commas from numbers  
          
        # Create a dictionary for this incident  
        incident = {  
            'Name': name,  
            'Inc_Type': inc_type,  
            'GACC': gacc,  
            'State': state,  
            'Start_Date': start_date,  
            'Contain_Control_Date': contain_date,  
            'Size_Acres': size_acres,  
            'Cause': cause,  
            'Cause_Description': 'Human-caused' if cause == 'H' else 'Lightning' if cause == 'L' else 'Unknown'  
        }  
          
        significant_incidents.append(incident)  
      
    # Step 4: Create dimension tables for the star schema  
    # State dimension  
    states = list(set(incident['State'] for incident in significant_incidents))  
    state_dim = [{'State_Code': state, 'Region': 'Unknown'} for state in states]  
      
    # GACC dimension  
    gaccs = list(set(incident['GACC'] for incident in significant_incidents))  
    gacc_dim = [{'GACC_Code': gacc, 'GACC_Name': 'Unknown'} for gacc in gaccs]  
      
    # Time dimension  
    time_dim = []  
    for i, incident in enumerate(significant_incidents):  
        date_str = incident.get('Start_Date', 'Unknown')  
          
        # Try to parse the date  
        if date_str != 'Unknown':  
            try:  
                # Handle different date formats  
                if '/' in date_str:  
                    # Format: MM/DD/YYYY or MM/DD/YY  
                    parts = date_str.split('/')  
                    if len(parts[2]) == 2:  # Two-digit year  
                        parts[2] = '20' + parts[2]  # Assume 2000s  
                    date_obj = datetime(int(parts[2]), int(parts[0]), int(parts[1]))  
                    month = date_obj.month  
                    year = date_obj.year  
                    formatted_date = date_obj.strftime('%Y-%m-%d')  
                else:  
                    # Try standard datetime parsing  
                    date_obj = datetime.strptime(date_str, '%Y-%m-%d')  
                    month = date_obj.month  
                    year = date_obj.year  
                    formatted_date = date_str  
            except ValueError:  
                month = 0  
                year = 0  
                formatted_date = date_str  
        else:  
            month = 0  
            year = 0  
            formatted_date = date_str  
          
        time_dim.append({  
            'Date_ID': i + 1,  
            'Date': formatted_date,  
            'Month': month,  
            'Year': year  
        })  
      
    # Step 5: Save the structured data to JSON files  
    structured_data = {  
        'significant_incidents': significant_incidents  
    }  
      
    dimensions = {  
        'state_dimension': state_dim,  
        'gacc_dimension': gacc_dim,  
        'time_dimension': time_dim  
    }  
      
    with open('structured_incidents_data_2009.json', 'w') as f:  
        json.dump(structured_data, f, indent=4)  
      
    with open('dimension_tables_2009.json', 'w') as f:  
        json.dump(dimensions, f, indent=4)  
      
    print("Data successfully extracted and saved to JSON files.")  
    print(f"Found {len(significant_incidents)} significant incidents.")  
    print(f"Created dimension tables: States ({len(state_dim)}), GACCs ({len(gacc_dim)}), Time ({len(time_dim)}).")  
      
    # Display a sample of the extracted data  
    print("\nSample of extracted incidents (first 5):")  
    for i, incident in enumerate(significant_incidents[:5]):  
        print(f"{i+1}. {incident['Name']} - {incident['State']} - {incident['Start_Date']} - {incident['Size_Acres']} acres")  
else:  
    print("No data to save.")  
  
# Optional: If you want to see the data in a DataFrame format  
if significant_incidents:  
    df = pd.DataFrame(significant_incidents)  
    print("\nDataFrame view of the first 5 incidents:")  
    print(df.head())  

Extracting text from PDF...
Text saved to annual_report_2009_text.txt

Processing text to extract incident data...
Found incidents section. Length: 32267
First 500 characters: large fires were 
rapidly contained. By the end of February, the Southern Area was very active with numerous large fires mostly in Texas, Oklahoma, and Florida. The Southern Area had 7,424 fires (133 percent  of normal) during January and Februar y, which burned 136,020 acres (118 percent  of 
normal) for the same time period.  
 
Spring (March –  May)  
 
Spring was warmer than normal in Alaska as well as the Southwest and Northeast quarters of 
the country. After a dry winter, the Southeast ex
No table matches found with the specific pattern. Trying alternative pattern.
Still no matches. Trying line-by-line approach.
No data to save.


In [23]:
# Read the text file  
with open('annual_report_2009_text.txt', 'r', encoding='utf-8') as file:  
    text_content = file.read()  
  
# Step 2: Extract the significant incidents table  
# We'll look for patterns that indicate the start and end of the table  
# This might need adjustment based on the actual format in the text file  
  
# First, let's try to find the section with significant incidents  
# Look for a section header like "Significant Incidents" or similar  
incidents_section_pattern = r"(?:Significant\s+Incidents|Large\s+Fires|Major\s+Incidents).*?(?=\n\n\w+|\Z)"  
incidents_section_match = re.search(incidents_section_pattern, text_content, re.DOTALL | re.IGNORECASE)  
  
if incidents_section_match:  
    incidents_section = incidents_section_match.group(0)  
    print("Found incidents section. Length:", len(incidents_section))  
    print("First 500 characters:", incidents_section[:500])  
else:  
    print("Incidents section not found. Looking for table patterns directly.")  
    incidents_section = text_content  
  
# Look for table patterns in the incidents section or full text  
# This pattern looks for lines that might be table rows with fire data  
# Adjust the pattern based on the actual format in your text  
table_pattern = r"([A-Za-z\s\-\.&]+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{4})\s+(\d{1,2}/\d{1,2}/\d{4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
table_matches = re.findall(table_pattern, incidents_section)  
  
if not table_matches:  
    print("No table matches found with the specific pattern. Trying alternative pattern.")  
    # Try an alternative pattern that's more flexible  
    table_pattern = r"([A-Za-z\s\-\.&]+?)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
    table_matches = re.findall(table_pattern, incidents_section)  
  
# If still no matches, try a more general approach  
if not table_matches:  
    print("Still no matches. Trying line-by-line approach.")  
    lines = incidents_section.split('\n')  
    table_data = []  
      
    # Look for lines that might contain fire data  
    for line in lines:  
        # Skip empty lines and headers  
        if not line.strip() or re.match(r'^(Name|Type|GACC|State|Date|Size|Cause)', line, re.IGNORECASE):  
            continue  
          
        # Check if line contains date patterns and numbers that might be acres  
        if re.search(r'\d{1,2}/\d{1,2}/\d{2,4}', line) and re.search(r'\d{1,3}(?:,\d{3})*', line):  
            # Split the line by multiple spaces to separate columns  
            columns = re.split(r'\s{2,}', line.strip())  
            if len(columns) >= 7:  # Assuming at least 7 columns for a valid row  
                table_data.append(columns)  
      
    if table_data:  
        print(f"Found {len(table_data)} potential incident rows using line-by-line approach.")  
        # Convert to the format expected by the rest of the code  
        table_matches = []  
        for row in table_data:  
            # Ensure we have enough columns, pad with empty strings if needed  
            while len(row) < 8:  
                row.append("")  
            # Extract the columns we need  
            name = row[0]  
            inc_type = row[1] if len(row) > 1 else ""  
            gacc = row[2] if len(row) > 2 else ""  
            state = row[3] if len(row) > 3 else ""  
            start_date = row[4] if len(row) > 4 else ""  
            contain_date = row[5] if len(row) > 5 else ""  
            size_acres = row[6] if len(row) > 6 else ""  
            cause = row[7] if len(row) > 7 else ""  
              
            table_matches.append((name, inc_type, gacc, state, start_date, contain_date, size_acres, cause))  
  
# Step 3: Convert the extracted data to a structured format  
significant_incidents = []  
  
if table_matches:  
    print(f"Found {len(table_matches)} incident rows.")  
    for match in table_matches:  
        name, inc_type, gacc, state, start_date, contain_date, size_acres, cause = match  
          
        # Clean up the data  
        name = name.strip()  
        inc_type = inc_type.strip()  
        gacc = gacc.strip()  
        state = state.strip()  
        start_date = start_date.strip()  
        contain_date = contain_date.strip()  
        size_acres = size_acres.strip().replace(',', '')  # Remove commas from numbers  
        cause = cause.strip()  
          
        # Map cause codes to descriptions  
        cause_description = {  
            'H': 'Human-caused',  
            'L': 'Lightning',  
            'U': 'Unknown',  
            'NR': 'Not Reported'  
        }.get(cause, 'Unknown')  
          
        # Create a dictionary for this incident  
        incident = {  
            'Name': name,  
            'Inc_Type': inc_type,  
            'GACC': gacc,  
            'State': state,  
            'Start_Date': start_date,  
            'Contain_Control_Date': contain_date,  
            'Size_Acres': size_acres,  
            'Cause': cause,  
            'Cause_Description': cause_description  
        }  
          
        significant_incidents.append(incident)  
else:  
    print("No incident data found. Check the text file format and adjust the extraction patterns.")  
  
# Step 4: Create the star schema structure  
# Fact table is already created as significant_incidents  
  
# Create dimension tables  
if significant_incidents:  
    # State dimension  
    states = list(set(incident['State'] for incident in significant_incidents)) 
    state_dim = [{'State_Code': state, 'Region': 'Unknown'} for state in states]  
      
    # GACC dimension  
    gaccs = list(set(incident['GACC'] for incident in significant_incidents))  
    gacc_dim = [{'GACC_Code': gacc, 'GACC_Name': 'Unknown'} for gacc in gaccs]  
      
    # Time dimension  
    time_dim = []  
    for i, incident in enumerate(significant_incidents):  
        date_str = incident.get('Start_Date', 'Unknown')  
          
        # Try to parse the date  
        try:  
            # First try MM/DD/YYYY format  
            date_obj = datetime.strptime(date_str, '%m/%d/%Y')  
            month = date_obj.month  
            year = date_obj.year  
            formatted_date = date_obj.strftime('%Y-%m-%d')  
        except ValueError:  
            try:  
                # Then try MM/DD/YY format  
                date_obj = datetime.strptime(date_str, '%m/%d/%y')  
                month = date_obj.month  
                year = date_obj.year  
                formatted_date = date_obj.strftime('%Y-%m-%d')  
            except ValueError:  
                month = 0  
                year = 0  
                formatted_date = date_str  
          
        time_dim.append({  
            'Date_ID': i + 1,  
            'Date': formatted_date,  
            'Month': month,  
            'Year': year  
        })  
      
    # Step 5: Save the structured data to JSON files  
    structured_data = {  
        'significant_incidents': significant_incidents  
    }  
      
    dimensions = {  
        'state_dimension': state_dim,  
        'gacc_dimension': gacc_dim,  
        'time_dimension': time_dim  
    }  
      
    with open('structured_incidents_data_2009.json', 'w') as f:  
        json.dump(structured_data, f, indent=4)  
      
    with open('dimension_tables_2009.json', 'w') as f:  
        json.dump(dimensions, f, indent=4)  
      
    print("Data successfully extracted and saved to JSON files.")  
    print(f"Found {len(significant_incidents)} significant incidents.")  
    print(f"Created dimension tables: States ({len(state_dim)}), GACCs ({len(gacc_dim)}), Time ({len(time_dim)}).")  
      
    # Display a sample of the extracted data  
    print("\nSample of extracted incidents (first 5):")  
    for i, incident in enumerate(significant_incidents[:5]):  
        print(f"{i+1}. {incident['Name']} - {incident['State']} - {incident['Start_Date']} - {incident['Size_Acres']} acres")  
else:  
    print("No data to save.")  
  
# Optional: If you want to see the data in a DataFrame format  
if significant_incidents:  
    df = pd.DataFrame(significant_incidents)  
    print("\nDataFrame view of the first 5 incidents:")  
    print(df.head())  

Found incidents section. Length: 32267
First 500 characters: large fires were 
rapidly contained. By the end of February, the Southern Area was very active with numerous large fires mostly in Texas, Oklahoma, and Florida. The Southern Area had 7,424 fires (133 percent  of normal) during January and Februar y, which burned 136,020 acres (118 percent  of 
normal) for the same time period.  
 
Spring (March –  May)  
 
Spring was warmer than normal in Alaska as well as the Southwest and Northeast quarters of 
the country. After a dry winter, the Southeast ex
No table matches found with the specific pattern. Trying alternative pattern.
Still no matches. Trying line-by-line approach.
No incident data found. Check the text file format and adjust the extraction patterns.
No data to save.


In [2]:
# Import necessary libraries  
import re  
import json  
import pandas as pd  
from datetime import datetime  
  
# Step 1: Read the text file  
with open('annual_report_2009_text.txt', 'r', encoding='utf-8') as file:  
    text_content = file.read()  
  
# Step 2: Look for the significant incidents table  
# We'll search for patterns that might indicate the table of significant incidents  
print("Searching for significant incidents table...")  
  
# Try to find a table with fire data  
# This pattern looks for lines with fire name, type, location, dates, and acreage  
table_pattern = r"([A-Za-z0-9\s\-\.&,()]+?)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
table_matches = re.findall(table_pattern, text_content)  
  
# If no matches found, try a more flexible pattern  
if not table_matches:  
    print("No matches found with specific pattern. Trying alternative pattern...")  
    # More flexible pattern  
    table_pattern = r"([A-Za-z0-9\s\-\.&,()]+?)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,3}(?:,\d{3})*)"  
    table_matches = re.findall(table_pattern, text_content)  
  
# If still no matches, try to find the table by looking for a section header  
if not table_matches:  
    print("Still no matches. Looking for section headers...")  
    # Look for section headers that might indicate the start of the table  
    section_headers = ["Significant Incidents", "Large Fire Activity", "Major Incidents"]  
      
    for header in section_headers:  
        header_pattern = f"{header}.*?(?=\n\n\w+|\Z)"  
        section_match = re.search(header_pattern, text_content, re.DOTALL | re.IGNORECASE)  
          
        if section_match:  
            section_text = section_match.group(0)  
            print(f"Found section: {header}")  
              
            # Try to extract table rows from this section  
            table_pattern = r"([A-Za-z0-9\s\-\.&,()]+?)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,2}/\d{1,2}/\d{2,4})\s+(\d{1,3}(?:,\d{3})*)\s+([A-Z])"  
            table_matches = re.findall(table_pattern, section_text)  
              
            if table_matches:  
                break  
  
# Step 3: Process the extracted data  
if table_matches:  
    print(f"Found {len(table_matches)} potential incident records.")  
      
    # Create a list to store the structured incident data  
    significant_incidents = []  
      
    # Process each match  
    for match in table_matches:  
        # Clean up the fire name (remove leading/trailing spaces, replace multiple spaces with single space)  
        name = re.sub(r'\s+', ' ', match[0].strip())  
          
        # Create a dictionary for this incident  
        incident = {  
            'Name': name,  
            'Inc_Type': match[1],  
            'GACC': match[2],  
            'State': match[3],  
            'Start_Date': match[4],  
            'Contain_Control_Date': match[5],  
            'Size_Acres': match[6],  
            'Cause': match[7] if len(match) > 7 else "Unknown"  
        }  
          
        # Add cause description based on cause code  
        if incident['Cause'] == 'H':  
            incident['Cause_Description'] = 'Human-caused'  
        elif incident['Cause'] == 'L':  
            incident['Cause_Description'] = 'Lightning-caused'  
        else:  
            incident['Cause_Description'] = 'Unknown'  
          
        significant_incidents.append(incident)  
      
    # Step 4: Create dimension tables for the star schema  
      
    # State dimension  
    states = list(set(incident['State'] for incident in significant_incidents))  
    state_dim = [{'State_Code': state, 'Region': 'Unknown'} for state in states]  
      
    # GACC dimension  
    gaccs = list(set(incident['GACC'] for incident in significant_incidents))  
    gacc_dim = [{'GACC_Code': gacc, 'GACC_Name': 'Unknown'} for gacc in gaccs]  
      
    # Time dimension  
    time_dim = []  
    for i, incident in enumerate(significant_incidents):  
        date_str = incident.get('Start_Date', 'Unknown')  
          
        # Try to parse the date  
        try:  
            # Handle different date formats (MM/DD/YYYY or MM/DD/YY)  
            if len(date_str.split('/')[2]) == 2:  
                # Assume 20xx for two-digit years  
                date_str = f"{date_str.split('/')[0]}/{date_str.split('/')[1]}/20{date_str.split('/')[2]}"  
              
            date_obj = datetime.strptime(date_str, '%m/%d/%Y')  
            month = date_obj.month  
            year = date_obj.year  
            formatted_date = date_obj.strftime('%Y-%m-%d')  
        except ValueError:  
            month = 0  
            year = 0  
            formatted_date = date_str  
          
        time_dim.append({  
            'Date_ID': i + 1,  
            'Date': formatted_date,  
            'Month': month,  
            'Year': year  
        })  
      
    # Step 5: Save the structured data to JSON files  
    structured_data = {  
        'significant_incidents': significant_incidents  
    }  
      
    dimensions = {  
        'state_dimension': state_dim,  
        'gacc_dimension': gacc_dim,  
        'time_dimension': time_dim  
    }  
      
    with open('structured_incidents_data_2009.json', 'w') as f:  
        json.dump(structured_data, f, indent=4)  
      
    with open('dimension_tables_2009.json', 'w') as f:  
        json.dump(dimensions, f, indent=4)  
      
    print("Data successfully extracted and saved to JSON files:")  
    print("- structured_incidents_data_2009.json")  
    print("- dimension_tables_2009.json")  
    print(f"Found {len(significant_incidents)} significant incidents.")  
    print(f"Created dimension tables: States ({len(state_dim)}), GACCs ({len(gacc_dim)}), Time ({len(time_dim)}).")  
      
    # Display a sample of the extracted data  
    print("\nSample of extracted incidents (first 5):")  
    for i, incident in enumerate(significant_incidents[:5]):  
        print(f"{i+1}. {incident['Name']} - {incident['State']} - {incident['Start_Date']} - {incident['Size_Acres']} acres")  
      
    # Create a DataFrame for easier viewing  
    df = pd.DataFrame(significant_incidents)  
    print("\nDataFrame view of the first 5 incidents:")  
    print(df.head())  
else:  
    print("No incident data found in the text file.")  
    print("You may need to manually inspect the text file and adjust the pattern matching approach.")  

Searching for significant incidents table...


  header_pattern = f"{header}.*?(?=\n\n\w+|\Z)"


No matches found with specific pattern. Trying alternative pattern...
Still no matches. Looking for section headers...
Found section: Large Fire Activity
No incident data found in the text file.
You may need to manually inspect the text file and adjust the pattern matching approach.


In [5]:
%pip install tabula-py PyMuPDF pandas

Collecting tabula-py
  Using cached tabula_py-2.10.0-py3-none-any.whl.metadata (7.6 kB)
Downloading tabula_py-2.10.0-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
    --------------------------------------- 0.2/12.0 MB 4.7 MB/s eta 0:00:03
   - -------------------------------------- 0.4/12.0 MB 4.9 MB/s eta 0:00:03
   -- ------------------------------------- 0.6/12.0 MB 4.8 MB/s eta 0:00:03
   --- ------------------------------------ 1.0/12.0 MB 4.7 MB/s eta 0:00:03
   ---- ----------------------------------- 1.4/12.0 MB 5.4 MB/s eta 0:00:02
   ----- ---------------------------------- 1.7/12.0 MB 5.3 MB/s eta 0:00:02
   ------ --------------------------------- 2.0/12.0 MB 5.2 MB/s eta 0:00:02
   ------- -------------------------------- 2.2/12.0 MB 5.1 MB/s eta 0:00:02
   -------- ------------------------------- 2.5/12.0 MB 5.0 MB/s eta 0:00:02
   --------- ------------------------------ 2.8/12.0 MB 5.0 MB/s eta 0:00:02
   ---------- --

In [16]:
# Import necessary libraries
import fitz  # PyMuPDF
import re
import json
import pandas as pd
from datetime import datetime

# Step 1: Extract text from the PDF using PyMuPDF
pdf_path = 'annual_report_2009_508.pdf'
text_content = ""

print("Extracting text from PDF...")
doc = fitz.open(pdf_path)
for page in doc:
    text_content += page.get_text()

# Save the extracted text to a file for reference
with open('annual_report_2009_extracted.txt', 'w', encoding='utf-8') as f:
    f.write(text_content)

print("Text extraction complete. Saved to 'annual_report_2009_extracted.txt'")

# Step 2: Look for sections that might contain significant incidents data
print("\
Searching for sections with incident data...")

# Let's look for section headers that might indicate significant incidents
section_headers = [
    "Significant Incidents", 
    "Large Fire Activity", 
    "Major Incidents",
    "Significant Wildland Fires"
]

# Search for these headers in the text
found_sections = []
for header in section_headers:
    matches = re.finditer(r'(?i)' + re.escape(header), text_content)
    for match in matches:
        start_pos = match.start()
        # Get some context (500 characters) around the match
        context_start = max(0, start_pos - 100)
        context_end = min(len(text_content), start_pos + 500)
        context = text_content[context_start:context_end]
        found_sections.append((header, start_pos, context))

print(f"Found {len(found_sections)} potential sections with incident data.")
for i, (header, pos, context) in enumerate(found_sections):
    print(f"\
Section {i+1}: {header} (position {pos})")
    print(f"Context: {context[:200]}...")  # Show first 200 chars of context

# Step 3: Let's look for patterns that might indicate incident data
# Common patterns in wildfire incident reports include:
# - Fire names followed by acreage
# - Dates in MM/DD/YYYY format
# - State abbreviations

# Look for date patterns
date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
dates = re.findall(date_pattern, text_content)
print(f"\
Found {len(dates)} date patterns in the text.")
print("Sample dates:", dates[:10])

# Look for state abbreviations (2 uppercase letters)
state_pattern = r'\([A-Z]{2})\'
states = re.findall(state_pattern, text_content)
print(f"\
Found {len(states)} potential state abbreviations.")
print("Sample states:", states[:20])

# Look for acreage patterns (numbers followed by "acres")
acreage_pattern = r'(\d{1,3}(?:,\d{3})*)\s*acres'
acreages = re.findall(acreage_pattern, text_content, re.IGNORECASE)
print(f"\
Found {len(acreages)} acreage mentions.")
print("Sample acreages:", acreages[:10])

# Step 4: Let's try to find tables or structured data
# Look for patterns that might indicate table rows with fire data
# This is a more complex pattern that tries to match fire name, location, date, and acreage
fire_pattern = r'([A-Za-z\s\-\.&]+)\s+(?:Fire|Complex)\s+(?:in\s+)?([A-Z]{2})\s+.*?(\d{1,2}/\d{1,2}/\d{4}).*?(\d{1,3}(?:,\d{3})*)\s*acres'
fire_matches = re.findall(fire_pattern, text_content, re.IGNORECASE)

print(f"\
Found {len(fire_matches)} potential fire incident entries.")
print("Sample entries:")
for i, match in enumerate(fire_matches[:5]):
    print(f"{i+1}. {match}")

# Let's also try to find paragraphs that mention fires
fire_paragraphs = []
paragraphs = re.split(r'\
\s*\
', text_content)
for para in paragraphs:
    if re.search(r'(?:Fire|Complex|Wildfire)', para, re.IGNORECASE) and re.search(date_pattern, para):
        fire_paragraphs.append(para)

print(f"\
Found {len(fire_paragraphs)} paragraphs mentioning fires with dates.")
print("Sample paragraphs:")
for i, para in enumerate(fire_paragraphs[:3]):
    print(f"\
Paragraph {i+1}: {para[:200]}...")

# Step 5: Let's try to extract structured data from these paragraphs
# We'll look for patterns like:
# - Fire name (often ends with "Fire" or "Complex")
# - State abbreviation
# - Start date
# - Containment date (if available)
# - Acreage

structured_incidents = []

for para in fire_paragraphs:
    # Try to extract fire name
    name_match = re.search(r'([A-Za-z\s\-\.&]+?)(?:Fire|Complex)', para, re.IGNORECASE)
    if name_match:
        name = name_match.group(1).strip()
    else:
        continue  # Skip if no fire name found
    
    # Try to extract state
    state_match = re.search(state_pattern, para)
    state = state_match.group(1) if state_match else "Unknown"
    
    # Try to extract dates
    dates = re.findall(date_pattern, para)
    start_date = dates[0] if dates else "Unknown"
    contain_date = dates[1] if len(dates) > 1 else "Unknown"
    
    # Try to extract acreage
    acreage_match = re.search(acreage_pattern, para, re.IGNORECASE)
    acres = acreage_match.group(1).replace(',', '') if acreage_match else "0"
    
    # Try to extract GACC (Geographic Area Coordination Center)
    gacc_match = re.search(r'\(EA|SA|SW|RM|GB|NW|NO|SO|NR|AK)\', para)
    gacc = gacc_match.group(1) if gacc_match else "Unknown"
    
    # Try to extract cause
    cause_match = re.search(r'caused by\s+([A-Za-z\s]+)', para, re.IGNORECASE)
    cause = cause_match.group(1).strip() if cause_match else "Unknown"
    
    # Add to structured incidents
    structured_incidents.append({
        'Name': name,
        'State': state,
        'GACC': gacc,
        'Start_Date': start_date,
        'Contain_Date': contain_date,
        'Size_Acres': acres,
        'Cause': cause
    })

print(f"\
Extracted {len(structured_incidents)} structured incident records.")
print("Sample structured incidents:")
for i, incident in enumerate(structured_incidents[:5]):
    print(f"{i+1}. {incident}")

# Step 6: Create dimension tables for the star schema
# State Dimension
states = set(incident['State'] for incident in structured_incidents)
state_dim = [{'State_ID': i+1, 'State': state} for i, state in enumerate(sorted(states))]

# GACC Dimension
gaccs = set(incident['GACC'] for incident in structured_incidents)
gacc_dim = [{'GACC_ID': i+1, 'GACC': gacc} for i, gacc in enumerate(sorted(gaccs))]

# Time Dimension
dates = set(incident['Start_Date'] for incident in structured_incidents if incident['Start_Date'] != "Unknown")
time_dim = []
for i, date_str in enumerate(sorted(dates)):
    try:
        date_obj = datetime.strptime(date_str, '%m/%d/%Y')
        time_dim.append({
            'Date_ID': i+1,
            'Date': date_obj.strftime('%Y-%m-%d'),
            'Month': date_obj.month,
            'Year': date_obj.year
        })
    except ValueError:
        pass  # Skip invalid dates

# Step 7: Create the fact table with dimension keys
fact_table = []
state_id_map = {state['State']: state['State_ID'] for state in state_dim}
gacc_id_map = {gacc['GACC']: gacc['GACC_ID'] for gacc in gacc_dim}
date_id_map = {time['Date']: time['Date_ID'] for time in time_dim}

for incident in structured_incidents:
    # Convert Start_Date to the format used in time_dim
    start_date_formatted = None
    if incident['Start_Date'] != "Unknown":
        try:
            date_obj = datetime.strptime(incident['Start_Date'], '%m/%d/%Y')
            start_date_formatted = date_obj.strftime('%Y-%m-%d')
        except ValueError:
            pass
    
    fact_table.append({
        'Incident_ID': len(fact_table) + 1,
        'Name': incident['Name'],
        'State_ID': state_id_map.get(incident['State'], None),
        'GACC_ID': gacc_id_map.get(incident['GACC'], None),
        'Date_ID': date_id_map.get(start_date_formatted, None) if start_date_formatted else None,
        'Start_Date': incident['Start_Date'],
        'Contain_Date': incident['Contain_Date'],
        'Size_Acres': incident['Size_Acres'],
        'Cause': incident['Cause']
    })

# Step 8: Save the structured data to JSON files
structured_data = {
    'significant_incidents': fact_table
}

dimensions = {
    'state_dimension': state_dim,
    'gacc_dimension': gacc_dim,
    'time_dimension': time_dim
}

with open('structured_incidents_data_2009.json', 'w') as f:
    json.dump(structured_data, f, indent=4)

with open('dimension_tables_2009.json', 'w') as f:
    json.dump(dimensions, f, indent=4)

print("\
Data successfully extracted and saved to JSON files:")
print("- structured_incidents_data_2009.json")
print("- dimension_tables_2009.json")
print(f"Created dimension tables: States ({len(state_dim)}), GACCs ({len(gacc_dim)}), Time ({len(time_dim)}).")

# Create a DataFrame for easier viewing
if structured_incidents:
    df = pd.DataFrame(structured_incidents)
    print("\
DataFrame view of the first 5 incidents:")
    print(df.head())

Extracting text from PDF...
Text extraction complete. Saved to 'annual_report_2009_extracted.txt'
Searching for sections with incident data...
Found 2 potential sections with incident data.
Section 1: Large Fire Activity (position 9902)
Context: the 90th percentile or historic maximums in several locations, 
especially in Texas. By late April, large fire activity began to taper off in Texas and pick up in 
Florida and the Eastern, Southwest, ...
Section 2: Large Fire Activity (position 16190)
Context: Colorado. Much of the West saw frequent mixed wet and dry lightning storms, yet initial attack 
and large fire activity continued to be below normal for the most part. Nationally, the 
preparedness le...
Found 0 date patterns in the text.
Sample dates: []
Found 0 potential state abbreviations.
Sample states: []
Found 23 acreage mentions.
Sample acreages: ['40,000', '1,000', '136,020', '897,496', '1,400,185', '109,988', '8', '2,934,455', '90,000', '160,000']
Found 0 potential fire incident

In [23]:
# Let's examine the PDF structure more closely
import fitz
import pandas as pd
import json
import re
from collections import defaultdict

pdf_path = 'annual_report_2009_508.pdf'
doc = fitz.open(pdf_path)

# Let's look at the table of contents or bookmarks to find relevant sections
toc = doc.get_toc()
print("Table of Contents:")
for item in toc:
    level, title, page = item
    print(f"{'  ' * (level-1)}{title} (Page {page})")

# Let's also check for any tables in the document
print("\
Searching for tables in the document...")
tables_found = []

for page_num in range(len(doc)):
    page = doc[page_num]
    
    # Check if this page might contain a table
    text = page.get_text()
    
    # Look for patterns that might indicate tables
    # Tables often have multiple lines with similar structure
    lines = text.split(' ')
    
    # Count lines with similar patterns (e.g., lines with multiple spaces between words)
    pattern_count = defaultdict(int)
    for line in lines:
        # Create a simplified pattern of the line (spaces vs non-spaces)
        pattern = re.sub(r'\S+', 'W', line)  # Replace word characters with 'W'
        pattern = re.sub(r'\s+', ' ', pattern)  # Normalize spaces
        pattern_count[pattern] += 1
    
    # If we have multiple lines with the same pattern, it might be a table
    potential_tables = [pattern for pattern, count in pattern_count.items() if count >= 3 and 'W W' in pattern]
    
    if potential_tables:
        tables_found.append((page_num + 1, len(potential_tables), text[:200]))

print(f"Found potential tables on {len(tables_found)} pages.")
for page_num, num_tables, sample_text in tables_found[:10]:  # Show first 10
    print(f"\
Page {page_num}: {num_tables} potential tables")
    print(f"Sample text: {sample_text}...")

# Let's look at specific pages that might contain significant incident data
# Based on the TOC or common report structure, we might want to check specific pages
# For example, pages with "Significant Incidents" or "Large Fires" in the title

# Let's check a few specific pages that might contain incident data
# We'll look at pages 10-20 as an example (adjust based on TOC findings)
target_pages = range(10, 21)
print("\
Examining specific pages for incident data...")

for page_num in target_pages:
    if page_num < len(doc):
        page = doc[page_num]
        text = page.get_text()
        
        # Check if this page might contain incident data
        if re.search(r'(Fire|Incident|Wildfire|Acres|Significant)', text, re.IGNORECASE):
            print(f"\
Page {page_num+1} might contain incident data:")
            print(text[:300] + "...")  # Print first 300 chars

# Let's also try to extract any tables directly using PyMuPDF's table extraction
print("\
Attempting to extract tables directly...")

for page_num in range(len(doc)):
    page = doc[page_num]
    
    # Extract tables from the page
    tables = page.find_tables()
    
    if tables and tables.tables:
        print(f"\
Found {len(tables.tables)} tables on page {page_num+1}")
        
        for i, table in enumerate(tables.tables):
            df = table.to_pandas()
            print(f"Table {i+1} on page {page_num+1}:")
            print(df.head())
            
            # Check if this table might contain incident data
            # Look for columns that might indicate fire incidents
            columns = [str(col).lower() for col in df.columns]
            if any(keyword in ' '.join(columns) for keyword in ['fire', 'incident', 'acre', 'date']):
                print(f"This table likely contains incident data!")
                
                # Save this table to a CSV file
                csv_filename = f"extracted_table_page_{page_num+1}_table_{i+1}.csv"
                df.to_csv(csv_filename, index=False)
                print(f"Saved to {csv_filename}")

# Let's also try to find specific sections in the text that might contain incident data
print("\
Searching for specific sections with incident data...")

# Common section titles for incident data
section_titles = [
    "Significant Wildland Fires",
    "Large Fire Activity",
    "Significant Incidents",
    "Major Incidents",
    "Appendix"  # Appendices often contain tables with incident data
]

# Search for these section titles in the text
for page_num in range(len(doc)):
    page = doc[page_num]
    text = page.get_text()
    
    for title in section_titles:
        if title in text:
            print(f"\
Found '{title}' on page {page_num+1}")
            
            # Get the position of the title
            title_pos = text.find(title)
            
            # Get the text following the title (up to 500 chars)
            section_text = text[title_pos:title_pos+500]
            print(f"Section text: {section_text}...")

# Close the document
doc.close()

print("\
Analysis complete. Check the extracted tables and sections for incident data.")

Table of Contents:
Identifier Legend (Page 3)
Preface (Page 4)
2009 Fire Season Summary (Page 5)
National Fire Activity Synopsis (Page 12)
Fires and Complexes Over 40,000 Acres in 2009 (Page 13)
Significant Fire Activity (Page 14)
Wildfires Reported to NICC (Page 16)
National Preparedness Levels (Page 31)
  National Preparedness Level Summary (Page 32)
Incident Management Team Mobilizations (Page 33)
Department of Defense Mobilization (Page 39)
Crew Mobilization (Page 39)
Engine Mobilization (Page 42)
Overhead Mobilization (Page 45)
Helicopter Mobilization (Page 47)
Fixed Wing Aircraft Mobilization (Page 51)
Large Transportation Aircraft (Page 55)
Light Cargo and Passenger Flights by Requesting Agency and Geographic Area (Page 57)
Equipment Services Mobilization (Page 58)
Radio and Weather Equipment Mobilization (Page 60)
Average Worst Summary (Page 62)
NICC Benchmarks (Page 63)
Acronyms and Terminology (Page 64)
National Report of Wildland Fires and Acres Burned by State (Page 65)
Sea

In [24]:
import json  
import pandas as pd  
  
# Step 1: Read the 2009 CSV file  
df_2009 = pd.read_csv('extracted_table_page_13_table_1.csv')  
print("Loaded 2009 CSV data with", len(df_2009), "incidents.")  
  
# Step 2: Rename the columns to match the 2008 JSON structure  
column_mapping = {  
    'Name': 'Name',  
    'Inc.\nType': 'Inc_Type',  
    'GACC': 'GACC',  
    'State': 'State',  
    'Start Date': 'Start_Date',  
    'Contain or\nControl Date': 'Contain_Control_Date',  
    'Size\n(Acres)': 'Size_Acres',  
    'Cause': 'Cause',  
    'Estimated\nCost': 'Cost'  
}  
  
df_2009_cleaned = df_2009.rename(columns=column_mapping)  
  
# Step 3: Convert the DataFrame into a list of incident dictionaries  
incidents_2009 = df_2009_cleaned.to_dict(orient='records')  
  
# Create new structured JSON data  
structured_data_2009 = {  
    "significant_incidents": incidents_2009  
}  
  
# Step 4: Output the JSON into the notebook  
json_output = json.dumps(structured_data_2009, indent=4)  
print("Structured JSON data for the 2009 incidents:")  
print(json_output)  
  
# Step 5: Save the JSON data into a new file  
with open('structured_incidents_data_2009.json', 'w') as f:  
    json.dump(structured_data_2009, f, indent=4)  
  
print("Saved the structured data to structured_incidents_data_2009.json")  

Loaded 2009 CSV data with 27 incidents.
Structured JSON data for the 2009 incidents:
{
    "significant_incidents": [
        {
            "Name": "Chester",
            "Inc_Type": "WF",
            "GACC": "SA",
            "State": "OK",
            "Start_Date": "10-Jul-09",
            "Contain_Control_Date": "17-Jul-09",
            "Size_Acres": "41,497",
            "Cause": "U",
            "Cost": "$250,000"
        },
        {
            "Name": "Bluff Creek",
            "Inc_Type": "WF",
            "GACC": "AK",
            "State": "AK",
            "Start_Date": "26-Jul-09",
            "Contain_Control_Date": "10-Nov-09",
            "Size_Acres": "41,756",
            "Cause": "L",
            "Cost": "NR"
        },
        {
            "Name": "Big Pole",
            "Inc_Type": "WF",
            "GACC": "EB",
            "State": "UT",
            "Start_Date": "6-Aug-09",
            "Contain_Control_Date": "9-Nov-09",
            "Size_Acres": "44,345",
     