In [3]:
import pandas as pd  
import json  
import re  
from datetime import datetime  
import os  
from tqdm import tqdm  
  
def clean_incident_data(incident):  
    """Clean and standardize a single incident record with improved date handling"""  
    cleaned = {}  
      
    # Copy all fields with standardized names  
    for key, value in incident.items():  
        # Standardize key names (remove dots, and replace spaces with underscores)  
        clean_key = key.replace('.', '').replace(' ', '_')  
        cleaned[clean_key] = value  
      
    # Clean date fields - convert to proper datetime objects  
    date_fields = ['Start_Date', 'Contain_Control_Date']  
    for date_field in date_fields:  
        if date_field in cleaned and cleaned[date_field]:  
            date_value = cleaned[date_field]  
            try:  
                # Try different date formats:  
                if re.match(r'\d{1,2}-[A-Za-z]{3}-\d{2}', date_value):  # e.g., "10-Jul-09"  
                    date_obj = datetime.strptime(date_value, '%d-%b-%y')  
                elif re.match(r'\d{1,2}/\d{1,2}/\d{4}', date_value):  # e.g., "2/25/2008"  
                    date_obj = datetime.strptime(date_value, '%m/%d/%Y')  
                else:  
                    # Format unknown; set to None  
                    cleaned[date_field] = None  
                    continue  
                cleaned[date_field] = date_obj  
            except Exception as e:  
                print("Error parsing date " + date_value + ": " + str(e))  
                cleaned[date_field] = None  
  
    # Clean numeric fields  
    # Size_Acres  
    if 'Size_Acres' in cleaned and cleaned['Size_Acres']:  
        try:  
            size_str = str(cleaned['Size_Acres']).replace(',', '')  
            cleaned['Size_Acres'] = float(size_str)  
        except:  
            cleaned['Size_Acres'] = None  
  
    # Cost field  
    if 'Cost' in cleaned and cleaned['Cost']:  
        try:  
            # Remove $, commas, spaces and convert to float  
            cost_str = str(cleaned['Cost']).replace('$', '').replace(',', '').replace(' ', '')  
            if cost_str.lower() not in ['nr', 'n/a', 'unknown']:  
                cleaned['Cost'] = float(cost_str)  
            else:  
                cleaned['Cost'] = None  
        except:  
            cleaned['Cost'] = None  
  
    return cleaned  
  
# Set the path to the folder containing structured incident files.  
folder_path = './structurejson'  
structured_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.startswith('structured_incidents_data')]  
  
print("Found " + str(len(structured_files)) + " structured incident files in " + folder_path)  
  
all_incidents = []  
file_counts = {}  
  
for file_path in tqdm(structured_files):  
    file_name = os.path.basename(file_path)  
    try:  
        with open(file_path, 'r') as f:  
            data = json.load(f)  
          
        if 'significant_incidents' in data:  
            incidents = data['significant_incidents']  
            cleaned_incidents = [clean_incident_data(incident) for incident in incidents]  
              
            # Add source file info  
            for incident in cleaned_incidents:  
                incident['source_file'] = file_name  
            all_incidents.extend(cleaned_incidents)  
            file_counts[file_name] = len(cleaned_incidents)  
        else:  
            print("Warning: No 'significant_incidents' key in " + file_name)  
    except Exception as e:  
        print("Error processing " + file_name + ": " + str(e))  
  
# Convert to DataFrame  
incidents_df = pd.DataFrame(all_incidents)  
  
# Display summary  
print("\nTotal incidents collected:", len(incidents_df))  
print("\nIncidents per file:")  
for file_name, count in file_counts.items():  
    print(file_name + ": " + str(count) + " incidents")  
  
# Save the combined dataset  
incidents_df.to_csv('combined_incidents.csv', index=False)  
print("\nSaved combined dataset to 'combined_incidents.csv'")  
  
# Verify the CSV file was created and check its contents  
if os.path.exists('combined_incidents.csv'):  
    df_from_csv = pd.read_csv('combined_incidents.csv')  
    print("\nVerified CSV file exists with " + str(len(df_from_csv)) + " rows and " + str(len(df_from_csv.columns)) + " columns")  
    print("\nCSV file columns:")  
    print(df_from_csv.columns.tolist())  
    print("\nFirst few rows of the CSV file:")  
    print(df_from_csv.head())  
else:  
    print("\nError: CSV file was not created")  

Found 17 structured incident files in ./structurejson


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 34.13it/s]







Total incidents collected: 206

Incidents per file:
structured_incidents_data.json: 23 incidents
structured_incidents_data_2009.json: 27 incidents
structured_incidents_data_2010.json: 9 incidents
structured_incidents_data_2011.json: 41 incidents
structured_incidents_data_2012.json: 51 incidents
structured_incidents_data_2014.json: 9 incidents
structured_incidents_data_2015.json: 46 incidents

Saved combined dataset to 'combined_incidents.csv'

Verified CSV file exists with 206 rows and 11 columns

CSV file columns:
['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost', 'source_file', 'Last_Report_Date']

First few rows of the CSV file:
                      Name Inc_Type GACC State  Start_Date  \
0               Glass Fire       WF   SA    TX  2008-02-25   
1          Klamath Theater       WF   NO    CA  2008-06-21   
2            Basin Complex       WF   SO    CA  2008-06-21   
3  Iron & Alps \nComplexes       WF   NO    CA  2008-06

In [5]:
# Load the combined_incidents.csv file
import pandas as pd
import re

# Load the CSV file
df = pd.read_csv('combined_incidents.csv')

# Check for rows with names containing asterisks or unusual patterns
print("Checking for names with asterisks or unusual patterns:")
for idx, name in enumerate(df['Name']):
    if isinstance(name, str) and ('*' in name or re.match(r'^\d+$', name)):
        print(f"Row {idx}: {name}")

# Fill the missing name with a placeholder
df['Name'] = df['Name'].fillna('Unknown Incident')

# Check if the placeholder was applied correctly
print("\
After filling missing name:")
print(df.iloc[12][['Name', 'State', 'GACC', 'Start_Date', 'Size_Acres']])

# Check if there are any remaining missing names
print(f"\
Remaining missing names: {df['Name'].isnull().sum()}")

# Save the updated dataframe
df.to_csv('combined_incidents_cleaned.csv', index=False)
print("\
Saved updated dataframe to 'combined_incidents_cleaned.csv'")

# Display a sample of the cleaned data
print("\
Sample of cleaned data:")
print(df.sample(5)[['Name', 'State', 'GACC', 'Start_Date', 'Size_Acres', 'Cause']])

Checking for names with asterisks or unusual patterns:
Row 38: Rex Creek *
Row 46: Little Black One **
Row 47: Crazy Mountain
Complex **
Row 48: Minto Flats South *
Row 49: Railbelt Complex *
After filling missing name:
Name          Unknown Incident
State                       TX
GACC                        SA
Start_Date          2008-03-14
Size_Acres             67500.0
Name: 12, dtype: object
Remaining missing names: 0
Saved updated dataframe to 'combined_incidents_cleaned.csv'
Sample of cleaned data:
            Name State GACC  Start_Date  Size_Acres Cause
30          Cato    NM   SW  2009-06-10     55080.0     L
65       Wildcat    TX   SA  2011-04-11    159308.0     L
66   Las Conchas    NM   SW  2011-06-26    156593.0     H
39  Wood River 1    AK   AK  2009-07-12    125382.0     L
8        Complex    CA   NO  2008-06-21     82186.0     L


In [7]:
import pandas as pd

# Load the cleaned CSV file
df = pd.read_csv('combined_incidents_cleaned.csv')

# Define a function to clean the name field by removing asterisks and trailing numbers
import re

def clean_name(name):
    if pd.isnull(name):
        return name
    # Remove asterisks
    name_cleaned = re.sub(r'\*+', '', name)
    # Remove trailing numbers (e.g., ' Wood River 1' becomes ' Wood River')
    name_cleaned = re.sub(r'\s+\d+$', '', name_cleaned)
    # Strip extra whitespace
    return name_cleaned.strip()

# Apply cleaning function to Name column
df['Name'] = df['Name'].apply(clean_name)

# Save the updated dataframe to a new CSV file
df.to_csv('combined_incidents_cleaned_no_special_names.csv', index=False)

# Show a sample of the cleaned names
print('Sample of cleaned names:')
print(df['Name'].sample(10))

print('\
Saved updated dataframe to combined_incidents_cleaned_no_special_names.csv')

Sample of cleaned names:
24            Bluff Creek
49       Railbelt Complex
164            North Star
126    Powell SBW Complex
168      Okanogan Complex
137                Seeley
96           Matador West
121              Wellnitz
80                Prairie
158          July Complex
Name: Name, dtype: object
Saved updated dataframe to combined_incidents_cleaned_no_special_names.csv


In [27]:
import os
import json
import re
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# Check if the directory exists
structurejson_dir = 'structurejson'
if not os.path.exists(structurejson_dir):
    print(f"Directory '{structurejson_dir}' does not exist.")
    # Check if it exists in the current directory structure
    if os.path.exists('Wildfire/Fire report/structurejson'):
        structurejson_dir = 'Wildfire/Fire report/structurejson'
        print(f"Using directory '{structurejson_dir}' instead.")
    else:
        print("Could not find the structurejson directory.")
        exit()

# List all structured_incidents_data files from 2012 to 2024
print(f"Looking for structured_incidents_data files in {structurejson_dir}...")
structured_files = []
for year in range(2012, 2025):
    file_name = f"structured_incidents_data_{year}.json"
    file_path = os.path.join(structurejson_dir, file_name)
    if os.path.exists(file_path):
        structured_files.append((file_path, year))

print(f"Found {len(structured_files)} structured_incidents_data files from 2012 to 2024.")

# Function to convert date format from "9/4" to "04-Sep-12" format
def convert_date_format(date_str, year):
    if not date_str or pd.isna(date_str) or (isinstance(date_str, str) and date_str.strip() == ""):
        return None
    
    # If it already has the desired format (DD-MMM-YY), return as is
    if isinstance(date_str, str) and re.match(r'\d{1,2}-[A-Za-z]{3}-\d{2}', date_str):
        return date_str
    
    try:
        # Handle M/D format (e.g., "9/4")
        if isinstance(date_str, str) and re.match(r'\d{1,2}/\d{1,2}', date_str):
            month, day = map(int, date_str.split('/'))
            # Create a datetime object
            date_obj = datetime(year, month, day)
            # Format as "DD-MMM-YY"
            return date_obj.strftime("%d-%b-%y")
        
        # Handle other formats if needed
        # For now, return as is if format is not recognized
        return date_str
    except:
        # If conversion fails, return the original string
        return date_str

# Process each file
for file_path, year in tqdm(structured_files):
    print(f"\
Processing {file_path}...")
    
    # Create a backup if it doesn't exist
    backup_path = file_path + '.bak'
    if not os.path.exists(backup_path):
        import shutil
        shutil.copy2(file_path, backup_path)
        print(f"Created backup: {backup_path}")
    
    # Load the JSON data
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Track changes
    changes_made = 0
    
    # Process the data based on its structure
    if isinstance(data, list):
        # If data is a list of records
        for record in data:
            if isinstance(record, dict):
                # Update Start_Date
                if 'Start_Date' in record:
                    old_date = record['Start_Date']
                    new_date = convert_date_format(old_date, year)
                    if old_date != new_date:
                        record['Start_Date'] = new_date
                        changes_made += 1
                
                # Update Control_Date or Contain_Control_Date
                for date_field in ['Control_Date', 'Contain_Control_Date']:
                    if date_field in record:
                        old_date = record[date_field]
                        new_date = convert_date_format(old_date, year)
                        if old_date != new_date:
                            record[date_field] = new_date
                            changes_made += 1
    
    elif isinstance(data, dict):
        # If data is a dictionary with nested records
        for key, value in data.items():
            if isinstance(value, list):
                for record in value:
                    if isinstance(record, dict):
                        # Update Start_Date
                        if 'Start_Date' in record:
                            old_date = record['Start_Date']
                            new_date = convert_date_format(old_date, year)
                            if old_date != new_date:
                                record['Start_Date'] = new_date
                                changes_made += 1
                        
                        # Update Control_Date or Contain_Control_Date
                        for date_field in ['Control_Date', 'Contain_Control_Date']:
                            if date_field in record:
                                old_date = record[date_field]
                                new_date = convert_date_format(old_date, year)
                                if old_date != new_date:
                                    record[date_field] = new_date
                                    changes_made += 1
    
    # Save the updated data
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=2)
    
    print(f"Made {changes_made} date format changes in {file_path}")

# Verify changes in a sample file
if structured_files:
    sample_file_path, sample_year = structured_files[0]
    print(f"\
Verifying changes in {sample_file_path}:")
    
    with open(sample_file_path, 'r') as f:
        sample_data = json.load(f)
    
    # Display sample records
    if isinstance(sample_data, list) and sample_data:
        print("\
Sample records:")
        for i, record in enumerate(sample_data[:3]):  # Show first 3 records
            if 'Start_Date' in record:
                print(f"Record {i+1} Start_Date: {record['Start_Date']}")
            if 'Name' in record:
                print(f"Record {i+1} Name: {record['Name']}")
    
    elif isinstance(sample_data, dict):
        print("\
Sample records:")
        for key, value in list(sample_data.items())[:1]:  # Show first key
            print(f"Key: {key}")
            if isinstance(value, list) and value:
                for i, record in enumerate(value[:3]):  # Show first 3 records
                    if isinstance(record, dict):
                        if 'Start_Date' in record:
                            print(f"  Record {i+1} Start_Date: {record['Start_Date']}")
                        if 'Name' in record:
                            print(f"  Record {i+1} Name: {record['Name']}")

print("\
Date format conversion completed.")

Looking for structured_incidents_data files in structurejson...
Found 13 structured_incidents_data files from 2012 to 2024.


100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 97.96it/s]

Processing structurejson\structured_incidents_data_2012.json...
Created backup: structurejson\structured_incidents_data_2012.json.bak
Made 102 date format changes in structurejson\structured_incidents_data_2012.json
Processing structurejson\structured_incidents_data_2013.json...
Created backup: structurejson\structured_incidents_data_2013.json.bak
Made 20 date format changes in structurejson\structured_incidents_data_2013.json
Processing structurejson\structured_incidents_data_2014.json...
Created backup: structurejson\structured_incidents_data_2014.json.bak
Made 9 date format changes in structurejson\structured_incidents_data_2014.json
Processing structurejson\structured_incidents_data_2015.json...
Created backup: structurejson\structured_incidents_data_2015.json.bak
Made 46 date format changes in structurejson\structured_incidents_data_2015.json
Processing structurejson\structured_incidents_data_2016.json...
Created backup: structurejson\structured_incidents_data_2016.json.bak
Made 1




In [41]:
import os
import json
import pandas as pd
import re
from tqdm import tqdm

# Set the directory where the JSON files are located (just /structurejson)
target_dir = "structurejson"

# List of file names provided (only the structurejson files)
file_names = [
    "structured_incidents_data_2014.json",
    "structured_incidents_data.json",
    "structured_incidents_data_2017.json",
    "structured_incidents_data_2013.json",
    "structured_incidents_data_2022.json",
    "structured_incidents_data_2010.json",
    "structured_incidents_data_2016.json",
    "structured_incidents_data_2012.json",
    "structured_incidents_data_2020.json",
    "structured_incidents_data_2018.json",
    "structured_incidents_data_2023.json",
    "structured_incidents_data_2015.json",
    "structured_incidents_data_2021.json",
    "structured_incidents_data_2024.json",
    "structured_incidents_data_2019.json",
    "structured_incidents_data_2011.json",
    "structured_incidents_data_2009.json"
]

# Remove duplicates if any
file_names = list(dict.fromkeys(file_names))

# Prepare a list to hold all incidents
all_incidents = []

# Regex to extract a 4-digit year from file names if available
year_pattern = re.compile(r'structured_incidents_data_(\d{4})\.json')

# Process each file from the /structurejson folder
for file in tqdm(file_names, desc="Processing files"):
    file_path = os.path.join(target_dir, file)
    if not os.path.exists(file_path):
        print("File not found:", file_path)
        continue
    try:
        with open(file_path, "r") as f:
            data = json.load(f)
    except Exception as e:
        print("Error loading", file_path, ":", e)
        continue

    # Determine year from filename if available
    m = year_pattern.search(file)
    if m:
        year = int(m.group(1))
    else:
        year = None

    # Extract incident records
    incidents = []
    if isinstance(data, list):
        incidents = data
    elif isinstance(data, dict):
        # Extract list from any key that is a list
        for key, value in data.items():
            if isinstance(value, list):
                incidents = value
                break

    # Add year information to each incident (if available)
    for incident in incidents:
        if isinstance(incident, dict) and year is not None:
            incident["Year"] = year

    all_incidents.extend(incidents)

print("\
Total incidents collected:", len(all_incidents))

# Convert the combined list of incidents into a DataFrame
print("\
Converting to DataFrame...")
df = pd.DataFrame(all_incidents)

print("\
Columns in the DataFrame:", df.columns.tolist())
print("\
Sample of the combined data:")
print(df.head())

# Define output directory as 'Wildfire/Fire report'
output_dir = os.path.join("Wildfire", "Fire report")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_csv = os.path.join(output_dir, "combined_incidents.csv")
print("\
Saving the CSV to", output_csv, "...")
df.to_csv(output_csv, index=False)
print("CSV file created:", output_csv)

Processing files: 100%|██████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 2840.99it/s]

Total incidents collected: 470
Converting to DataFrame...
Columns in the DataFrame: ['Name', 'GACC', 'State', 'Start_Date', 'Last_Report_Date', 'Size_Acres', 'Cause', 'Cost', 'Inc_Type', 'Contain_Control_Date', 'Year', 'Estimated_Cost', 'Contain_or_Last_Report_Date', 'Start Date', 'Last Report Date', 'Size In Acres']
Sample of the combined data:
                 Name GACC State Start_Date Last_Report_Date Size_Acres Cause  \
0     Buzzard Complex   NW    OR  14-Jul-14             9/11    395,747     L   
1     Carlton Complex   NW    WA  14-Jul-14             8/28    256,108     L   
2         Funny River   AK    AK  19-May-14             8/14    195,858     H   
3  Happy Camp Complex   NO    CA  14-Aug-14             12/4    134,056     L   
4                King   NO    CA  13-Sep-14             10/9     97,717     H   

           Cost Inc_Type Contain_Control_Date    Year Estimated_Cost  \
0   $11,062,411     None                 None  2014.0            NaN   
1   $68,800,000     N


