# Converting Fire Stat data for use with Maps

In [25]:
import pandas as pd
import fiona

# Open the geopackage file directly using Fiona to access the attribute data without geometry
with fiona.open('Resources/California_County_Boundaries.gpkg') as county_file:
    county_attributes = [feature['properties'] for feature in county_file]

# Convert the attributes to a DataFrame
county_attributes_df = pd.DataFrame(county_attributes)

# Rename the COUNTY_NAME to County for the join
county_attributes_df = county_attributes_df.rename(columns={'COUNTY_NAME': 'County'})

# Perform the join on attributes only
merged_df = county_attributes_df[['County']].merge(df, on='County')

# Display the merged DataFrame
merged_df.head()




Unnamed: 0,County,Fire Name,Start,Contained,Acres,Strux_Destr,Strux_Dmgd,Deaths_FF,Deaths_Civil,Duration
0,Alameda,TESLA,2015-08-19,2015-08-25,2850,1,0,0,0,7.0
1,Alameda,GRANT,2018-07-08,2018-07-09,480,1,0,0,0,2.0
2,Alpine,MOKELUMNE,2016-08-19,2016-09-20,655,0,0,0,0,33.0
3,Amador,BUTTE,2015-09-09,2015-10-23,70868,965,0,0,2,45.0
4,Amador,IRISH,2018-07-06,2018-07-09,825,1,1,0,0,4.0


In [26]:
# Load the fire data CSV file
file_path = r'Outputs/fires_2015_2019_cleaned.csv'
df = pd.read_csv(file_path)

# Clean the 'Acres' column by removing commas and quotes, then convert to integer
df['Acres'] = df['Acres'].replace({',': '', '"': ''}, regex=True).astype(int)

In [27]:
# Get the unique county names from both datasets
fire_counties = df['County'].unique()
county_attributes_counties = county_attributes_df['County'].unique()

# Compare the county names to identify discrepancies
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes
missing_in_county_attributes = fire_counties_set - county_attributes_counties_set

# Counties in the county attributes not in the fire data
missing_in_fire_data = county_attributes_counties_set - fire_counties_set

missing_in_county_attributes, missing_in_fire_data


({'ALAMEDA',
  'ALPINE',
  'AMADOR',
  'BUTTE',
  'CALAVERAS',
  'COLUSA',
  'COLUSA, GLENN,\nLAKE, MENDOCINO',
  'COLUSA, LAKE,\nMENDOCINO',
  'CONTRA COSTA',
  'DEL NORTE',
  'EL DORADO',
  'FRESNO',
  'GLENN',
  'HUMBOLDT',
  'INYO',
  'JACKSON (OR)',
  'KERN',
  'KINGS',
  'LAKE',
  'LASSEN',
  'LOS ANGELES',
  'MADERA',
  'MARIN',
  'MARIPOSA',
  'MENDOCINO',
  'MERCED',
  'MODOC',
  'MONO',
  'MONTEREY',
  'NAPA',
  'NEVADA',
  'ORANGE',
  'PLACER',
  'PLUMAS',
  'RIVERSIDE',
  'SACRAMENTO',
  'SAN BENITO',
  'SAN BERNARDINO',
  'SAN DIEGO',
  'SAN JOAQUIN',
  'SAN LUIS OBISPO',
  'SANTA BARBARA',
  'SANTA CLARA',
  'SANTA CRUZ',
  'SHASTA',
  'SISKIYOU',
  'SOLANO',
  'SONOMA',
  'STANISLAUS',
  'SUTTER',
  'TAHEMA',
  'TEHAMA',
  'TOULUMNE',
  'TRINITY',
  'TULARE',
  'TUOLOMNE',
  'TUOLUMNE',
  'VENTURA',
  'VENTURA/SANTA\nBARBARA',
  'WASHOE',
  'WASHOE (NV)',
  'YOLO',
  'YUBA'},
 {'Alameda',
  'Alpine',
  'Amador',
  'Butte',
  'Calaveras',
  'Colusa',
  'Contra Costa',
  '

In [28]:
# Get the unique county names from both datasets
fire_counties = df['County'].unique()
county_attributes_counties = county_attributes_df['County'].unique()

# Compare the county names to identify discrepancies
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes
missing_in_county_attributes = fire_counties_set - county_attributes_counties_set

# Counties in the county attributes not in the fire data
missing_in_fire_data = county_attributes_counties_set - fire_counties_set

missing_in_county_attributes, missing_in_fire_data


({'ALAMEDA',
  'ALPINE',
  'AMADOR',
  'BUTTE',
  'CALAVERAS',
  'COLUSA',
  'COLUSA, GLENN,\nLAKE, MENDOCINO',
  'COLUSA, LAKE,\nMENDOCINO',
  'CONTRA COSTA',
  'DEL NORTE',
  'EL DORADO',
  'FRESNO',
  'GLENN',
  'HUMBOLDT',
  'INYO',
  'JACKSON (OR)',
  'KERN',
  'KINGS',
  'LAKE',
  'LASSEN',
  'LOS ANGELES',
  'MADERA',
  'MARIN',
  'MARIPOSA',
  'MENDOCINO',
  'MERCED',
  'MODOC',
  'MONO',
  'MONTEREY',
  'NAPA',
  'NEVADA',
  'ORANGE',
  'PLACER',
  'PLUMAS',
  'RIVERSIDE',
  'SACRAMENTO',
  'SAN BENITO',
  'SAN BERNARDINO',
  'SAN DIEGO',
  'SAN JOAQUIN',
  'SAN LUIS OBISPO',
  'SANTA BARBARA',
  'SANTA CLARA',
  'SANTA CRUZ',
  'SHASTA',
  'SISKIYOU',
  'SOLANO',
  'SONOMA',
  'STANISLAUS',
  'SUTTER',
  'TAHEMA',
  'TEHAMA',
  'TOULUMNE',
  'TRINITY',
  'TULARE',
  'TUOLOMNE',
  'TUOLUMNE',
  'VENTURA',
  'VENTURA/SANTA\nBARBARA',
  'WASHOE',
  'WASHOE (NV)',
  'YOLO',
  'YUBA'},
 {'Alameda',
  'Alpine',
  'Amador',
  'Butte',
  'Calaveras',
  'Colusa',
  'Contra Costa',
  '

In [29]:
# Function to standardize county names
def standardize_county_names(county_name):
    county_name = county_name.title().replace('\n', ', ')
    return county_name

# Apply the standardization to the fire data counties
df['County'] = df['County'].apply(standardize_county_names)

# Identify multi-county entries
multi_county_entries = df[df['County'].str.contains(', ')]['County'].unique()

# Recheck which counties are still mismatched
fire_counties = df['County'].unique()
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes after standardization
missing_in_county_attributes_after = fire_counties_set - county_attributes_counties_set

multi_county_entries, missing_in_county_attributes_after

(array(['Ventura/Santa, Barbara', 'Colusa, Glenn,, Lake, Mendocino',
        'Colusa, Lake,, Mendocino'], dtype=object),
 {'Colusa, Glenn,, Lake, Mendocino',
  'Colusa, Lake,, Mendocino',
  'Jackson (Or)',
  'Tahema',
  'Toulumne',
  'Tuolomne',
  'Ventura/Santa, Barbara',
  'Washoe',
  'Washoe (Nv)'})

In [30]:


# Correcting spelling errors in the County column
corrections = {
    'Tahema': 'Tehama',
    'Toulumne': 'Tuolumne',
    'Tuolomne': 'Tuolumne',
    'Jackson (Or)': 'Jackson (OR)',
    'Washoe': 'Washoe (NV)',
    'Washoe (Nv)': 'Washoe (NV)'
}
df['County'] = df['County'].replace(corrections)

# Extracting and splitting multi-county entries
multi_county_df = df[df['County'].str.contains(', ')].copy()

# Split the 'County' column into multiple rows
multi_county_expanded = multi_county_df['County'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True)
multi_county_df = multi_county_df.drop(columns=['County']).join(multi_county_expanded.rename('County'))

# Save the multi-county entries to a new CSV file
multi_county_file_path = 'Outputs/multi_county_fires.csv'
multi_county_df.to_csv(multi_county_file_path, index=False)

# Remove multi-county entries from the original dataset
df = df[~df['County'].str.contains(', ')]

# Save the dataset with the multi-county entries removed
cleaned_file_path = 'Outputs/fires_cleaned_single_county.csv'
df.to_csv(cleaned_file_path, index=False)

# Provide the path to the saved file
cleaned_file_path


# Display first few rows of the cleaned multi-county DataFrame and save file path
multi_county_df.head(), multi_county_file_path


(                      Fire Name       Start   Contained   Acres  Strux_Destr  \
 243                      THOMAS  2017-12-04  2018-01-12  281893            0   
 243                      THOMAS  2017-12-04  2018-01-12  281893            0   
 287  RANCH - MENDOCINO\nCOMPLEX  2018-07-27  2018-08-17  410203          246   
 287  RANCH - MENDOCINO\nCOMPLEX  2018-07-27  2018-08-17  410203          246   
 287  RANCH - MENDOCINO\nCOMPLEX  2018-07-27  2018-08-17  410203          246   
 
      Strux_Dmgd  Deaths_FF  Deaths_Civil  Duration         County  
 243         274          1             1      40.0  Ventura/Santa  
 243         274          1             1      40.0        Barbara  
 287          27          1             0      22.0         Colusa  
 287          27          1             0      22.0         Glenn,  
 287          27          1             0      22.0           Lake  ,
 'Outputs/multi_county_fires.csv')

Checking if code worked as wanted

In [31]:
# Load the cleaned single-county and multi-county datasets
single_county_df = pd.read_csv('Outputs/fires_cleaned_single_county.csv')
multi_county_df = pd.read_csv('Outputs/multi_county_fires.csv')

# Check for fire names that are in both datasets
common_fire_names = single_county_df[single_county_df['Fire Name'].isin(multi_county_df['Fire Name'])]['Fire Name'].unique()

common_fire_names


array([], dtype=object)

In [32]:
# Sort the single-county dataset by 'County' name
single_county_df_sorted = single_county_df.sort_values(by='County')

# Group by 'County' and calculate the sum for 'Acres' through 'Duration' columns
summary_totals = single_county_df_sorted.groupby('County')[['Acres', 'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']].sum()

# Export the summary totals to a CSV file
summary_totals_file_path = 'Outputs/summary_totals_by_county.csv'
summary_totals.to_csv(summary_totals_file_path)


summary_totals



Unnamed: 0_level_0,Acres,Strux_Destr,Strux_Dmgd,Deaths_FF,Deaths_Civil,Duration
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alameda,3330,2,0,0,0,9.0
Alpine,655,0,0,0,0,33.0
Amador,71693,966,1,0,2,49.0
Butte,184162,247,768,0,85,258.0
Calaveras,1101,3,1,0,0,29.0
Colusa,2220,4,0,0,0,374.0
Contra Costa,340,0,0,0,0,5.0
Del Norte,59247,0,0,0,0,208.0
El Dorado,4177,0,0,0,0,64.0
Fresno,186058,19,1,0,0,238.0
