In [3]:
import pandas as pd  
import json  
import re  
from datetime import datetime  
import os  
from tqdm import tqdm  
  
def clean_incident_data(incident):  
    """Clean and standardize a single incident record with improved date handling"""  
    cleaned = {}  
      
    # Copy all fields with standardized names  
    for key, value in incident.items():  
        # Standardize key names (remove dots, and replace spaces with underscores)  
        clean_key = key.replace('.', '').replace(' ', '_')  
        cleaned[clean_key] = value  
      
    # Clean date fields - convert to proper datetime objects  
    date_fields = ['Start_Date', 'Contain_Control_Date']  
    for date_field in date_fields:  
        if date_field in cleaned and cleaned[date_field]:  
            date_value = cleaned[date_field]  
            try:  
                # Try different date formats:  
                if re.match(r'\d{1,2}-[A-Za-z]{3}-\d{2}', date_value):  # e.g., "10-Jul-09"  
                    date_obj = datetime.strptime(date_value, '%d-%b-%y')  
                elif re.match(r'\d{1,2}/\d{1,2}/\d{4}', date_value):  # e.g., "2/25/2008"  
                    date_obj = datetime.strptime(date_value, '%m/%d/%Y')  
                else:  
                    # Format unknown; set to None  
                    cleaned[date_field] = None  
                    continue  
                cleaned[date_field] = date_obj  
            except Exception as e:  
                print("Error parsing date " + date_value + ": " + str(e))  
                cleaned[date_field] = None  
  
    # Clean numeric fields  
    # Size_Acres  
    if 'Size_Acres' in cleaned and cleaned['Size_Acres']:  
        try:  
            size_str = str(cleaned['Size_Acres']).replace(',', '')  
            cleaned['Size_Acres'] = float(size_str)  
        except:  
            cleaned['Size_Acres'] = None  
  
    # Cost field  
    if 'Cost' in cleaned and cleaned['Cost']:  
        try:  
            # Remove $, commas, spaces and convert to float  
            cost_str = str(cleaned['Cost']).replace('$', '').replace(',', '').replace(' ', '')  
            if cost_str.lower() not in ['nr', 'n/a', 'unknown']:  
                cleaned['Cost'] = float(cost_str)  
            else:  
                cleaned['Cost'] = None  
        except:  
            cleaned['Cost'] = None  
  
    return cleaned  
  
# Set the path to the folder containing structured incident files.  
folder_path = './structurejson'  
structured_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.startswith('structured_incidents_data')]  
  
print("Found " + str(len(structured_files)) + " structured incident files in " + folder_path)  
  
all_incidents = []  
file_counts = {}  
  
for file_path in tqdm(structured_files):  
    file_name = os.path.basename(file_path)  
    try:  
        with open(file_path, 'r') as f:  
            data = json.load(f)  
          
        if 'significant_incidents' in data:  
            incidents = data['significant_incidents']  
            cleaned_incidents = [clean_incident_data(incident) for incident in incidents]  
              
            # Add source file info  
            for incident in cleaned_incidents:  
                incident['source_file'] = file_name  
            all_incidents.extend(cleaned_incidents)  
            file_counts[file_name] = len(cleaned_incidents)  
        else:  
            print("Warning: No 'significant_incidents' key in " + file_name)  
    except Exception as e:  
        print("Error processing " + file_name + ": " + str(e))  
  
# Convert to DataFrame  
incidents_df = pd.DataFrame(all_incidents)  
  
# Display summary  
print("\nTotal incidents collected:", len(incidents_df))  
print("\nIncidents per file:")  
for file_name, count in file_counts.items():  
    print(file_name + ": " + str(count) + " incidents")  
  
# Save the combined dataset  
incidents_df.to_csv('combined_incidents.csv', index=False)  
print("\nSaved combined dataset to 'combined_incidents.csv'")  
  
# Verify the CSV file was created and check its contents  
if os.path.exists('combined_incidents.csv'):  
    df_from_csv = pd.read_csv('combined_incidents.csv')  
    print("\nVerified CSV file exists with " + str(len(df_from_csv)) + " rows and " + str(len(df_from_csv.columns)) + " columns")  
    print("\nCSV file columns:")  
    print(df_from_csv.columns.tolist())  
    print("\nFirst few rows of the CSV file:")  
    print(df_from_csv.head())  
else:  
    print("\nError: CSV file was not created")  

Found 17 structured incident files in ./structurejson


100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 34.13it/s]







Total incidents collected: 206

Incidents per file:
structured_incidents_data.json: 23 incidents
structured_incidents_data_2009.json: 27 incidents
structured_incidents_data_2010.json: 9 incidents
structured_incidents_data_2011.json: 41 incidents
structured_incidents_data_2012.json: 51 incidents
structured_incidents_data_2014.json: 9 incidents
structured_incidents_data_2015.json: 46 incidents

Saved combined dataset to 'combined_incidents.csv'

Verified CSV file exists with 206 rows and 11 columns

CSV file columns:
['Name', 'Inc_Type', 'GACC', 'State', 'Start_Date', 'Contain_Control_Date', 'Size_Acres', 'Cause', 'Cost', 'source_file', 'Last_Report_Date']

First few rows of the CSV file:
                      Name Inc_Type GACC State  Start_Date  \
0               Glass Fire       WF   SA    TX  2008-02-25   
1          Klamath Theater       WF   NO    CA  2008-06-21   
2            Basin Complex       WF   SO    CA  2008-06-21   
3  Iron & Alps \nComplexes       WF   NO    CA  2008-06

In [5]:
# Load the combined_incidents.csv file
import pandas as pd
import re

# Load the CSV file
df = pd.read_csv('combined_incidents.csv')

# Check for rows with names containing asterisks or unusual patterns
print("Checking for names with asterisks or unusual patterns:")
for idx, name in enumerate(df['Name']):
    if isinstance(name, str) and ('*' in name or re.match(r'^\d+$', name)):
        print(f"Row {idx}: {name}")

# Fill the missing name with a placeholder
df['Name'] = df['Name'].fillna('Unknown Incident')

# Check if the placeholder was applied correctly
print("\
After filling missing name:")
print(df.iloc[12][['Name', 'State', 'GACC', 'Start_Date', 'Size_Acres']])

# Check if there are any remaining missing names
print(f"\
Remaining missing names: {df['Name'].isnull().sum()}")

# Save the updated dataframe
df.to_csv('combined_incidents_cleaned.csv', index=False)
print("\
Saved updated dataframe to 'combined_incidents_cleaned.csv'")

# Display a sample of the cleaned data
print("\
Sample of cleaned data:")
print(df.sample(5)[['Name', 'State', 'GACC', 'Start_Date', 'Size_Acres', 'Cause']])

Checking for names with asterisks or unusual patterns:
Row 38: Rex Creek *
Row 46: Little Black One **
Row 47: Crazy Mountain
Complex **
Row 48: Minto Flats South *
Row 49: Railbelt Complex *
After filling missing name:
Name          Unknown Incident
State                       TX
GACC                        SA
Start_Date          2008-03-14
Size_Acres             67500.0
Name: 12, dtype: object
Remaining missing names: 0
Saved updated dataframe to 'combined_incidents_cleaned.csv'
Sample of cleaned data:
            Name State GACC  Start_Date  Size_Acres Cause
30          Cato    NM   SW  2009-06-10     55080.0     L
65       Wildcat    TX   SA  2011-04-11    159308.0     L
66   Las Conchas    NM   SW  2011-06-26    156593.0     H
39  Wood River 1    AK   AK  2009-07-12    125382.0     L
8        Complex    CA   NO  2008-06-21     82186.0     L


In [7]:
import pandas as pd

# Load the cleaned CSV file
df = pd.read_csv('combined_incidents_cleaned.csv')

# Define a function to clean the name field by removing asterisks and trailing numbers
import re

def clean_name(name):
    if pd.isnull(name):
        return name
    # Remove asterisks
    name_cleaned = re.sub(r'\*+', '', name)
    # Remove trailing numbers (e.g., ' Wood River 1' becomes ' Wood River')
    name_cleaned = re.sub(r'\s+\d+$', '', name_cleaned)
    # Strip extra whitespace
    return name_cleaned.strip()

# Apply cleaning function to Name column
df['Name'] = df['Name'].apply(clean_name)

# Save the updated dataframe to a new CSV file
df.to_csv('combined_incidents_cleaned_no_special_names.csv', index=False)

# Show a sample of the cleaned names
print('Sample of cleaned names:')
print(df['Name'].sample(10))

print('\
Saved updated dataframe to combined_incidents_cleaned_no_special_names.csv')

Sample of cleaned names:
24            Bluff Creek
49       Railbelt Complex
164            North Star
126    Powell SBW Complex
168      Okanogan Complex
137                Seeley
96           Matador West
121              Wellnitz
80                Prairie
158          July Complex
Name: Name, dtype: object
Saved updated dataframe to combined_incidents_cleaned_no_special_names.csv


In [19]:
import os
import json
import re
import pandas as pd
from tqdm import tqdm

# First, let's find where the structured_incidents_data files are located
print("Looking for structured_incidents_data files in the current directory...")

# List all structured_incidents_data files in the current directory
structured_files = [f for f in os.listdir('.') if f.startswith('structured_incidents_data_') and f.endswith('.json')]
print(f"Found {len(structured_files)} structured_incidents_data files in the current directory.")

# Function to fix partial dates by adding the year
def fix_date(date_str, year):
    if not date_str or pd.isna(date_str) or (isinstance(date_str, str) and date_str.strip() == ""):
        return None
    
    # If it's already a full date (contains a year), return as is
    if isinstance(date_str, str) and re.search(r'\d{4}', date_str):
        return date_str
    
    # Handle various date formats
    if isinstance(date_str, str) and re.match(r'\d{1,2}/\d{1,2}', date_str):  # Format: M/D
        return f"{date_str}/{year}"
    elif isinstance(date_str, str) and re.match(r'\d{1,2}-[A-Za-z]{3}', date_str):  # Format: D-Mon
        return f"{date_str}-{year}"
    elif isinstance(date_str, str) and re.match(r'[A-Za-z]+', date_str):  # Format: Month name only
        return f"{date_str} {year}"
    else:
        # If format is unknown, return as is
        return date_str

# Process structured files from 2012 onward
structured_pattern = re.compile(r'^structured_incidents_data_(\d{4})\.json$')
structured_files_2012_on = []

for f in structured_files:
    m = structured_pattern.match(f)
    if m and int(m.group(1)) >= 2012:
        structured_files_2012_on.append(f)

print(f"Processing {len(structured_files_2012_on)} structured incident files from 2012 onward")

# Process each structured file
for file_name in tqdm(structured_files_2012_on):
    year = int(structured_pattern.match(file_name).group(1))
    file_path = os.path.join('.', file_name)
    
    # Load the file
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Check if data is a list or dict
    if isinstance(data, list):
        # Process each record
        for record in data:
            if isinstance(record, dict):
                # Fix date fields
                if 'Start_Date' in record:
                    record['Start_Date'] = fix_date(record['Start_Date'], year)
                
                # Handle different field names for control date
                control_date_fields = ['Contain_Control_Date', 'Control_Date']
                for field in control_date_fields:
                    if field in record:
                        record[field] = fix_date(record[field], year)
    elif isinstance(data, dict):
        # If it's a dict, look for lists of records
        for key, value in data.items():
            if isinstance(value, list):
                for record in value:
                    if isinstance(record, dict):
                        # Fix date fields
                        if 'Start_Date' in record:
                            record['Start_Date'] = fix_date(record['Start_Date'], year)
                        
                        # Handle different field names for control date
                        control_date_fields = ['Contain_Control_Date', 'Control_Date']
                        for field in control_date_fields:
                            if field in record:
                                record[field] = fix_date(record[field], year)
    
    # Create a backup of the original file if it doesn't already exist
    backup_file = file_path + '.bak'
    if not os.path.exists(backup_file):
        os.rename(file_path, backup_file)
        # Save the updated data back to the original file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"Processed structured file {file_name} (backup saved as {backup_file})")
    else:
        # If backup already exists, just overwrite the original file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"Processed structured file {file_name} (backup already existed)")

# Now let's do the same for the incident files
incident_pattern = re.compile(r'^incidents_(\d{4})\.json$')
incident_files = [f for f in os.listdir('.') if incident_pattern.match(f)]
incident_files_2012_on = [f for f in incident_files if int(incident_pattern.match(f).group(1)) >= 2012]

print(f"\
Processing {len(incident_files_2012_on)} incident files from 2012 onward")

# Process each incident file
for file_name in tqdm(incident_files_2012_on):
    year = int(incident_pattern.match(file_name).group(1))
    file_path = os.path.join('.', file_name)
    
    # Load the file
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    # Process each record
    for record in data:
        # Fix date fields
        if 'Start_Date' in record:
            record['Start_Date'] = fix_date(record['Start_Date'], year)
        
        # Handle different field names for control date
        control_date_fields = ['Contain/_Control Date', 'Contain_Control_Date', 'Control_Date']
        for field in control_date_fields:
            if field in record:
                record[field] = fix_date(record[field], year)
    
    # Create a backup of the original file if it doesn't already exist
    backup_file = file_path + '.bak'
    if not os.path.exists(backup_file):
        os.rename(file_path, backup_file)
        # Save the updated data back to the original file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"Processed {len(data)} records in {file_name} (backup saved as {backup_file})")
    else:
        # If backup already exists, just overwrite the original file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=2)
        print(f"Processed {len(data)} records in {file_name} (backup already existed)")

print("\
All files have been processed. Original files were backed up with .bak extension.")

# Let's verify the changes by examining a few records from each file type
print("\
Verifying changes in structured files:")
for file_name in structured_files_2012_on[:2]:  # Check first 2 files
    with open(file_name, 'r') as f:
        data = json.load(f)
    
    if isinstance(data, list) and len(data) > 0:
        print(f"\
Sample from {file_name}:")
        for i, record in enumerate(data[:3]):  # Show first 3 records
            if 'Start_Date' in record:
                print(f"Record {i+1} Start_Date: {record['Start_Date']}")
            if 'Name' in record:
                print(f"Record {i+1} Name: {record['Name']}")
    elif isinstance(data, dict):
        print(f"\
Sample from {file_name} (dictionary format):")
        for key, value in list(data.items())[:2]:  # Show first 2 keys
            print(f"Key: {key}")
            if isinstance(value, list) and len(value) > 0:
                for i, record in enumerate(value[:2]):  # Show first 2 records
                    if isinstance(record, dict):
                        if 'Start_Date' in record:
                            print(f"  Record {i+1} Start_Date: {record['Start_Date']}")
                        if 'Name' in record:
                            print(f"  Record {i+1} Name: {record['Name']}")

print("\
Verifying changes in incident files:")
for file_name in incident_files_2012_on[:2]:  # Check first 2 files
    with open(file_name, 'r') as f:
        data = json.load(f)
    
    if len(data) > 0:
        print(f"\
Sample from {file_name}:")
        for i, record in enumerate(data[:3]):  # Show first 3 records
            if 'Start_Date' in record:
                print(f"Record {i+1} Start_Date: {record['Start_Date']}")
            if 'Name' in record:
                print(f"Record {i+1} Name: {record['Name']}")

print("done")

Looking for structured_incidents_data files in the current directory...
Found 0 structured_incidents_data files in the current directory.
Processing 0 structured incident files from 2012 onward


0it [00:00, ?it/s]


Processing 0 incident files from 2012 onward


0it [00:00, ?it/s]

All files have been processed. Original files were backed up with .bak extension.
Verifying changes in structured files:
Verifying changes in incident files:
done





Processing files in the root directory...


0it [00:00, ?it/s]

Verifying changes in a sample file:
done



