# Converting Fire Stat data for use with Maps

In [2]:
import pandas as pd
import fiona

# Open the geopackage file directly using Fiona to access the attribute data without geometry
with fiona.open('Resources/California_County_Boundaries.gpkg') as county_file:
    county_attributes = [feature['properties'] for feature in county_file]

# Convert the attributes to a DataFrame
county_attributes_df = pd.DataFrame(county_attributes)

# Rename the COUNTY_NAME to County for the join
county_attributes_df = county_attributes_df.rename(columns={'COUNTY_NAME': 'County'})

# Perform the join on attributes only
merged_df = county_attributes_df[['County']].merge(county_attributes_df, on='County')

# Display the merged DataFrame
merged_df.head()




Unnamed: 0,County,COUNTY_ABBREV,COUNTY_CODE,COUNTY_FIPS,COUNTY_NUM,GlobalID,ISLAND
0,Alameda,ALA,1,1,1,{E6F92268-D2DD-4CFB-8B79-5B4B2F07C559},
1,Alpine,ALP,2,3,2,{870479B2-480A-494B-8352-AD60578839C1},
2,Amador,AMA,3,5,3,{4F45B3A6-BE10-461C-8945-6B2AAA7119F6},
3,Butte,BUT,4,7,4,{44FBA680-AECC-4E04-A499-29D69AFFBD4A},
4,Calaveras,CAL,5,9,5,{D11EF739-4A1E-414E-BFD1-E7DCD56CD61E},


In [3]:
# Load the fire data CSV file
file_path = r'Outputs/fires_2015_2019_cleaned.csv'
df = pd.read_csv(file_path)

# Clean the 'Acres' column by removing commas and quotes, then convert to integer
df['Acres'] = df['Acres'].replace({',': '', '"': ''}, regex=True).astype(int)

In [4]:
# Get the unique county names from both datasets
fire_counties = df['County'].unique()
county_attributes_counties = county_attributes_df['County'].unique()

# Compare the county names to identify discrepancies
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes
missing_in_county_attributes = fire_counties_set - county_attributes_counties_set

# Counties in the county attributes not in the fire data
missing_in_fire_data = county_attributes_counties_set - fire_counties_set

missing_in_county_attributes, missing_in_fire_data


({'ALAMEDA',
  'ALPINE',
  'AMADOR',
  'BUTTE',
  'CALAVERAS',
  'COLUSA',
  'COLUSA, GLENN,\nLAKE, MENDOCINO',
  'COLUSA, LAKE,\nMENDOCINO',
  'CONTRA COSTA',
  'DEL NORTE',
  'EL DORADO',
  'FRESNO',
  'GLENN',
  'HUMBOLDT',
  'INYO',
  'JACKSON (OR)',
  'KERN',
  'KINGS',
  'LAKE',
  'LASSEN',
  'LOS ANGELES',
  'MADERA',
  'MARIN',
  'MARIPOSA',
  'MENDOCINO',
  'MERCED',
  'MODOC',
  'MONO',
  'MONTEREY',
  'NAPA',
  'NEVADA',
  'ORANGE',
  'PLACER',
  'PLUMAS',
  'RIVERSIDE',
  'SACRAMENTO',
  'SAN BENITO',
  'SAN BERNARDINO',
  'SAN DIEGO',
  'SAN JOAQUIN',
  'SAN LUIS OBISPO',
  'SANTA BARBARA',
  'SANTA CLARA',
  'SANTA CRUZ',
  'SHASTA',
  'SISKIYOU',
  'SOLANO',
  'SONOMA',
  'STANISLAUS',
  'SUTTER',
  'TAHEMA',
  'TEHAMA',
  'TOULUMNE',
  'TRINITY',
  'TULARE',
  'TUOLOMNE',
  'TUOLUMNE',
  'VENTURA',
  'VENTURA/SANTA\nBARBARA',
  'WASHOE',
  'WASHOE (NV)',
  'YOLO',
  'YUBA'},
 {'Alameda',
  'Alpine',
  'Amador',
  'Butte',
  'Calaveras',
  'Colusa',
  'Contra Costa',
  '

In [5]:
# Get the unique county names from both datasets
fire_counties = df['County'].unique()
county_attributes_counties = county_attributes_df['County'].unique()

# Compare the county names to identify discrepancies
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes
missing_in_county_attributes = fire_counties_set - county_attributes_counties_set

# Counties in the county attributes not in the fire data
missing_in_fire_data = county_attributes_counties_set - fire_counties_set

missing_in_county_attributes, missing_in_fire_data


({'ALAMEDA',
  'ALPINE',
  'AMADOR',
  'BUTTE',
  'CALAVERAS',
  'COLUSA',
  'COLUSA, GLENN,\nLAKE, MENDOCINO',
  'COLUSA, LAKE,\nMENDOCINO',
  'CONTRA COSTA',
  'DEL NORTE',
  'EL DORADO',
  'FRESNO',
  'GLENN',
  'HUMBOLDT',
  'INYO',
  'JACKSON (OR)',
  'KERN',
  'KINGS',
  'LAKE',
  'LASSEN',
  'LOS ANGELES',
  'MADERA',
  'MARIN',
  'MARIPOSA',
  'MENDOCINO',
  'MERCED',
  'MODOC',
  'MONO',
  'MONTEREY',
  'NAPA',
  'NEVADA',
  'ORANGE',
  'PLACER',
  'PLUMAS',
  'RIVERSIDE',
  'SACRAMENTO',
  'SAN BENITO',
  'SAN BERNARDINO',
  'SAN DIEGO',
  'SAN JOAQUIN',
  'SAN LUIS OBISPO',
  'SANTA BARBARA',
  'SANTA CLARA',
  'SANTA CRUZ',
  'SHASTA',
  'SISKIYOU',
  'SOLANO',
  'SONOMA',
  'STANISLAUS',
  'SUTTER',
  'TAHEMA',
  'TEHAMA',
  'TOULUMNE',
  'TRINITY',
  'TULARE',
  'TUOLOMNE',
  'TUOLUMNE',
  'VENTURA',
  'VENTURA/SANTA\nBARBARA',
  'WASHOE',
  'WASHOE (NV)',
  'YOLO',
  'YUBA'},
 {'Alameda',
  'Alpine',
  'Amador',
  'Butte',
  'Calaveras',
  'Colusa',
  'Contra Costa',
  '

In [6]:
# Function to standardize county names
def standardize_county_names(county_name):
    county_name = county_name.title().replace('\n', ', ')
    return county_name

# Apply the standardization to the fire data counties
df['County'] = df['County'].apply(standardize_county_names)

# Identify multi-county entries
multi_county_entries = df[df['County'].str.contains(', ')]['County'].unique()

# Recheck which counties are still mismatched
fire_counties = df['County'].unique()
fire_counties_set = set(fire_counties)
county_attributes_counties_set = set(county_attributes_counties)

# Counties in the fire data not in the county attributes after standardization
missing_in_county_attributes_after = fire_counties_set - county_attributes_counties_set

multi_county_entries, missing_in_county_attributes_after

(array(['Ventura/Santa, Barbara', 'Colusa, Glenn,, Lake, Mendocino',
        'Colusa, Lake,, Mendocino'], dtype=object),
 {'Colusa, Glenn,, Lake, Mendocino',
  'Colusa, Lake,, Mendocino',
  'Jackson (Or)',
  'Tahema',
  'Toulumne',
  'Tuolomne',
  'Ventura/Santa, Barbara',
  'Washoe',
  'Washoe (Nv)'})

In [7]:


# Correcting spelling errors in the County column
corrections = {
    'Tahema': 'Tehama',
    'Toulumne': 'Tuolumne',
    'Tuolomne': 'Tuolumne',
    'Jackson (Or)': 'Jackson (OR)',
    'Washoe': 'Washoe (NV)',
    'Washoe (Nv)': 'Washoe (NV)'
}
df['County'] = df['County'].replace(corrections)

# Extracting and splitting multi-county entries
multi_county_df = df[df['County'].str.contains(', ')].copy()

# Split the 'County' column into multiple rows
multi_county_expanded = multi_county_df['County'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True)
multi_county_df = multi_county_df.drop(columns=['County']).join(multi_county_expanded.rename('County'))

# Save the multi-county entries to a new CSV file
multi_county_file_path = 'Outputs/multi_county_fires.csv'
multi_county_df.to_csv(multi_county_file_path, index=False)

# Remove multi-county entries from the original dataset
df = df[~df['County'].str.contains(', ')]

# Save the dataset with the multi-county entries removed
cleaned_file_path = 'Outputs/fires_cleaned_single_county.csv'
df.to_csv(cleaned_file_path, index=False)

# Provide the path to the saved file
cleaned_file_path


# Display first few rows of the cleaned multi-county DataFrame and save file path
multi_county_df.head(), multi_county_file_path


(                      Fire Name      Start  Contained   Acres  Strux_Destr  \
 243                      THOMAS  12/4/2017  1/12/2018  281893            0   
 243                      THOMAS  12/4/2017  1/12/2018  281893            0   
 290  RANCH - MENDOCINO\nCOMPLEX  7/27/2018  8/17/2018  410203          246   
 290  RANCH - MENDOCINO\nCOMPLEX  7/27/2018  8/17/2018  410203          246   
 290  RANCH - MENDOCINO\nCOMPLEX  7/27/2018  8/17/2018  410203          246   
 
      Strux_Dmgd  Deaths_FF  Deaths_Civil Duration  Duration_Days  \
 243         274          1             1  40 days             40   
 243         274          1             1  40 days             40   
 290          27          1             0  22 days             22   
 290          27          1             0  22 days             22   
 290          27          1             0  22 days             22   
 
             County  
 243  Ventura/Santa  
 243        Barbara  
 290         Colusa  
 290         Glenn, 

Checking if code worked as wanted

In [8]:
# Load the cleaned single-county and multi-county datasets
single_county_df = pd.read_csv('Outputs/fires_cleaned_single_county.csv')
multi_county_df = pd.read_csv('Outputs/multi_county_fires.csv')

# Check for fire names that are in both datasets
common_fire_names = single_county_df[single_county_df['Fire Name'].isin(multi_county_df['Fire Name'])]['Fire Name'].unique()

common_fire_names


array([], dtype=object)

In [16]:
# Load the cleaned single-county and multi-county datasets
single_county_df = pd.read_csv('Outputs/fires_cleaned_single_county.csv')

# Extract the year from the 'Start' column and add it as a new column
single_county_df['Year'] = pd.to_datetime(single_county_df['Start']).dt.year

# Display the updated DataFrame
single_county_df.head()


Unnamed: 0,County,Fire Name,Start,Contained,Acres,Strux_Destr,Strux_Dmgd,Deaths_FF,Deaths_Civil,Duration,Year
0,Inyo,ROUND,2015-02-06,2015-02-13,7000,43,5,0,0,8.0,2015
1,Mono,VAN DYKE,2015-02-06,2015-02-10,509,0,0,0,0,5.0,2015
2,Riverside,HIGHWAY,2015-04-18,2015-04-24,1049,0,0,0,0,7.0,2015
3,San Diego,CARL,2015-04-28,2015-04-29,4000,0,0,0,0,2.0,2015
4,San Diego,MORTAR,2015-04-28,2015-04-29,800,0,0,0,0,2.0,2015


In [17]:
# Sort the single-county dataset by 'Year' and 'County' name
single_county_df_sorted = single_county_df.sort_values(by=['Year', 'County'])

# Group by 'Year' and 'County' and calculate the sum for 'Acres' through 'Duration' columns
summary_totals = single_county_df_sorted.groupby(['Year', 'County'])[['Acres', 'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']].sum()

# Convert the 'Start' and 'Contained' columns to datetime
single_county_df['Start'] = pd.to_datetime(single_county_df['Start'])
single_county_df['Contained'] = pd.to_datetime(single_county_df['Contained'])

# Function to generate a set of active days for a fire
def get_active_days(row):
    return pd.date_range(start=row['Start'], end=row['Contained'])

# Apply the function to each row to generate the active days
single_county_df['Active_Days'] = single_county_df.apply(get_active_days, axis=1)

# Group by 'Year' and 'County' and union all active days
def union_active_days(group):
    all_days = set().union(*group['Active_Days'])
    return len(all_days)

# Apply the union function to get the count of unique fire days
unique_fire_days = single_county_df.groupby(['Year', 'County']).apply(union_active_days)

# Add this count to your summary totals
summary_totals['Unique_Fire_Days'] = unique_fire_days

# Display the updated summary totals
summary_totals.head()

# Count the number of fires per year and county
fire_counts = single_county_df_sorted.groupby(['Year', 'County']).size()

# Add the count of fires to the summary totals
summary_totals['Total_Fires'] = fire_counts

# Display the summary
summary_totals.head()

# Export the summary totals to a CSV file
summary_totals_file_path = 'Outputs/summary_totals_by_county.csv'
summary_totals.to_csv(summary_totals_file_path)

# Display the final summary
summary_totals




  unique_fire_days = single_county_df.groupby(['Year', 'County']).apply(union_active_days)


Unnamed: 0_level_0,Unnamed: 1_level_0,Acres,Strux_Destr,Strux_Dmgd,Deaths_FF,Deaths_Civil,Duration,Unique_Fire_Days,Total_Fires
Year,County,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015,Alameda,2850,1,0,0,0,7.0,7,1
2015,Amador,70868,965,0,0,2,45.0,45,1
2015,Butte,2300,16,0,0,0,32.0,32,3
2015,Del Norte,37870,0,0,0,0,138.0,77,2
2015,Fresno,151623,4,0,0,0,124.0,124,1
...,...,...,...,...,...,...,...,...,...
2019,Trinity,1749,0,0,0,0,69.0,69,2
2019,Tulare,3753,0,0,0,0,151.0,102,4
2019,Tuolumne,731,0,0,0,0,112.0,112,1
2019,Ventura,12374,5,1,0,0,17.0,10,2
