In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", message="Could not infer format, so each element will be parsed individually")


In [2]:
# PART 1:  EXTRACT DATA FOR 2011 - 2013, 2016, and 2018 - 2019 FIRES 

# Define path to the source files
file_2011 = 'Resources/CAL_FireStats/2011-wildfire-activity-stats.xlsx'
file_2012 = 'Resources/CAL_FireStats/2012-wildfire-activity-stats.xlsx'
file_2013 = 'Resources/CAL_FireStats/2013-wildfire-activity-stats.xlsx'
file_2016 = 'Resources/CAL_FireStats/2016-wildfire-activity-stats.xlsx'
file_2018 = 'Resources/CAL_FireStats/2018-wildfire-activity-stats.xlsx'
file_2019 = 'Resources/CAL_FireStats/2019-wildfire-activity-stats.xlsx'

In [3]:
# Define the sheets to be read (all these files contain needed information on the same pages)
sheets = ['table_page_13', 'table_page_14', 'table_page_15', 'table_page_16']

In [4]:
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                      'Dest.', 'Dam.', 'Fire', 'Civil']

In [5]:
# Define a function to load and process the data from the Excel files
def load_wildfire_data(file_path, sheets, columns_to_extract):
    # Initiate an empty Data Frame
    extracted_data = pd.DataFrame()
    # Set a 'for' loop to go through identified path, sheets, and columns
    # skip the 1st row as it does not contain iseful information
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        # Add the data to an existing collection of data, Index=True to re-number the rows after adding a new data
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Dest.'] = pd.to_numeric(extracted_data['Dest.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Dam.'] = pd.to_numeric(extracted_data['Dam.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd', 
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                     'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']]
    # Return the processed data as a DataFrame so it can be used outside the function
    return extracted_data


In [8]:
# Load data for 2016, 2018, 2019 years
fires_2011 = load_wildfire_data(file_2011, sheets, columns_to_extract)
fires_2012 = load_wildfire_data(file_2012, sheets, columns_to_extract)
fires_2013 = load_wildfire_data(file_2013, sheets, columns_to_extract)
fires_2016 = load_wildfire_data(file_2016, sheets, columns_to_extract)
fires_2018 = load_wildfire_data(file_2018, sheets, columns_to_extract)
fires_2019 = load_wildfire_data(file_2019, sheets, columns_to_extract)

In [9]:
# Combine the data from 2016, 2018, and 2019 into one DataFrame
fires_2011_2012_2013_2016_2018_2019_data = pd.concat([fires_2011, fires_2012, fires_2013, fires_2016, fires_2018, fires_2019], ignore_index=True)

In [12]:
# Display the first two rows of the 2011 and combined dataframes to check the results
print("2011 Fire Data:")
print(fires_2011.head(2))
print("--------------------------------------")
print("\nCombined Fire Data (2011-2013,2016, 2018-2019):")
print(fires_2011_2012_2013_2016_2018_2019_data.head(3))

2011 Fire Data:
  County  Fire Name      Start  Contained Acres  Strux_Destr  Strux_Dmgd  \
0   INYO  WINTERTON 2011-03-09 2011-03-09   395            0           0   
1   INYO     CENTER 2011-03-18 2011-03-23   850           19           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       1.0  
1          0             0       6.0  
--------------------------------------

Combined Fire Data (2011-2013,2016, 2018-2019):
     County  Fire Name      Start  Contained Acres  Strux_Destr  Strux_Dmgd  \
0      INYO  WINTERTON 2011-03-09 2011-03-09   395            0           0   
1      INYO     CENTER 2011-03-18 2011-03-23   850           19           0   
2  MONTEREY       METZ 2011-05-12 2011-05-14   832            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       1.0  
1          0             0       6.0  
2          0             0       3.0  


In [13]:
# Save Data Frames to 'Outputs' folder
fires_2011.to_csv('Outputs/2011_wildfire_data.csv', index=False)
fires_2012.to_csv('Outputs/2012_wildfire_data.csv', index=False)
fires_2013.to_csv('Outputs/2013_wildfire_data.csv', index=False)
fires_2016.to_csv('Outputs/2016_wildfire_data.csv', index=False)
fires_2018.to_csv('Outputs/2018_wildfire_data.csv', index=False)
fires_2019.to_csv('Outputs/2019_wildfire_data.csv', index=False)
fires_2011_2012_2013_2016_2018_2019_data.to_csv('Outputs/2011_2012_2013_2016_2018_2019_wildfire_data.csv', index=False)

In [14]:
#  PART 2: EXTRACT DATA FOR 2017 FIRES:

# Define path to the source file of fires 2017
file_2017 = 'Resources/CAL_FireStats/2017-wildfire-activity-stats.xlsx'
# Define the sheets to be read thta contain needed information (different number of pages than 2016, 2018-2019 files)
sheets = ['table_page_13', 'table_page_14', 'table_page_15', 'table_page_16', 'table_page_17', 'table_page_18']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                      'Dest.', 'Dam.', 'Fire', 'Civil']
# Function to load and process the data from the given Excel file
def load_wildfire_data(file_path, sheets, columns_to_extract):
    extracted_data = pd.DataFrame()
    
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Dest.'] = pd.to_numeric(extracted_data['Dest.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Dam.'] = pd.to_numeric(extracted_data['Dam.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd', 
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                     'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']]
    
    return extracted_data
# Load data for 2017
fires_2017 = load_wildfire_data(file_2017, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2017 Fire Data:")
print(fires_2017.head(2))
# Save Data Frames to 'Outputs' folder
fires_2017.to_csv('Outputs/2017_wildfire_data.csv', index=False)


2017 Fire Data:
   County  Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0  FRESNO      JAYNE 2017-04-20 2017-04-21  4,532            0           0   
1  FRESNO  EL DORADO 2017-04-28 2017-04-28    750            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       2.0  
1          0             0       1.0  


In [15]:
#  PART 3: EXTRACT DATA FOR 2014 - 2015 FIRES

# Define path to the source file of fires 2014 - 2015
file_2014 = 'Resources/CAL_FireStats/2014-wildfire-activity-stats.xlsx'
file_2015 = 'Resources/CAL_FireStats/2015-wildfire-activity-stats.xlsx'
# Define the sheets to be read thta contain needed information (different number of pages than 2016, 2018-2019 files)
sheets = ['table_page_13', 'table_page_14', 'table_page_15']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                      'Dest.', 'Dam.', 'Fire', 'Civil']
# Function to load and process the data from the given Excel file
def load_wildfire_data(file_path, sheets, columns_to_extract):
    extracted_data = pd.DataFrame()
    
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Dest.'] = pd.to_numeric(extracted_data['Dest.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Dam.'] = pd.to_numeric(extracted_data['Dam.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd', 
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                     'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']]
    
    return extracted_data
# Load data for 2014 - 2015
fires_2014 = load_wildfire_data(file_2014, sheets, columns_to_extract)
fires_2015 = load_wildfire_data(file_2015, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2014 Fire Data:")
print(fires_2014.head(2))
print("--------------------------------------------")
print("2015 Fire Data:")
print(fires_2015.head(2))
# Save Data Frames to 'Outputs' folder
fires_2014.to_csv('Outputs/2014_wildfire_data.csv', index=False)
fires_2015.to_csv('Outputs/2015_wildfire_data.csv', index=False)

2014 Fire Data:
      County Fire Name      Start  Contained Acres  Strux_Destr  Strux_Dmgd  \
0   HUMBOLDT       RED 2014-01-04 2014-01-12   333            0           0   
1  RIVERSIDE    PIERCE 2014-03-15 2014-03-16   350            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       9.0  
1          0             0       2.0  
--------------------------------------------
2015 Fire Data:
      County Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0       INYO     ROUND 2015-02-06 2015-02-13  7,000           43           5   
1  RIVERSIDE   HIGHWAY 2015-04-18 2015-04-24  1,049            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       8.0  
1          0             0       7.0  


In [17]:
#  PART 4: EXTRACT DATA FOR 2010 FIRES:

# Define path to the source file of fires 2010
file_2010 = 'Resources/CAL_FireStats/2010-wildfire-activity-stats.xlsx'
# Define the sheets to be read thta contain needed information (different number of pages than 2016, 2018-2019 files)
sheets = ['table_page_11', 'table_page_12']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                      'Dest.', 'Dam.', 'Fire', 'Civil']
# Function to load and process the data from the given Excel file
def load_wildfire_data(file_path, sheets, columns_to_extract):
    extracted_data = pd.DataFrame()
    
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Dest.'] = pd.to_numeric(extracted_data['Dest.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Dam.'] = pd.to_numeric(extracted_data['Dam.'], errors='coerce').fillna(0).astype(int)
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd', 
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                     'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']]
    
    return extracted_data
# Load data for 2010
fires_2010 = load_wildfire_data(file_2010, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2010 Fire Data:")
print(fires_2010.head(2))
# Save Data Frames to 'Outputs' folder
fires_2010.to_csv('Outputs/2010_wildfire_data.csv', index=False)

2010 Fire Data:
      County Fire Name      Start  Contained Acres  Strux_Destr  Strux_Dmgd  \
0  Riverside    PEDLEY 2010-05-12 2010-05-13   850            0           0   
1       Kern    METZEN 2010-05-15 2010-05-15   360            1           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       2.0  
1          0             0       1.0  


In [26]:
#  PART 5: EXTRACT DATA FOR 2009 FIRES:

# Define path to the source file of fires 2009
file_2009 = 'Resources/CAL_FireStats/2009-wildfire-activity-stats.xlsx'
# Define the sheets to be read thta contain needed information (different number of pages than 2016, 2018-2019 files)
sheets = ['table_page_15', 'table_page_16', 'table_page_17', 'table_page_18']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                        'Fire', 'Civil']
# Function to load and process the data from the given Excel file
def load_wildfire_data(file_path, sheets, columns_to_extract):
    extracted_data = pd.DataFrame()
    
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres',
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                    'Deaths_FF', 'Deaths_Civil', 'Duration']]
    
    return extracted_data
# Load data for 2009
fires_2009 = load_wildfire_data(file_2009, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2009 Fire Data:")
print(fires_2009.head(2))
# Save Data Frames to 'Outputs' folder
fires_2009.to_csv('Outputs/2009_wildfire_data.csv', index=False)

2009 Fire Data:
           County Fire Name      Start  Contained Acres  Deaths_FF  \
0  San Bernardino      Fort 2009-02-05 2009-02-07   945          0   
1      Stanislaus   Mustang 2009-05-13 2009-05-16   570          0   

   Deaths_Civil  Duration  
0             0       3.0  
1             0       4.0  


In [28]:
#  PART 6: EXTRACT DATA FOR 2008 FIRES:

# Define path to the source file of fires 2008
file_2008 = 'Resources/CAL_FireStats/2008-wildfire-activity-stats.xlsx'
# Define the sheets to be read thta contain needed information (different number of pages than 2016, 2018-2019 files)
sheets = ['table_page_15', 'table_page_16', 'table_page_17', 'table_page_18', 'table_page_19', 'table_page_20', 'table_page_21', 'table_page_22']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                        'Fire', 'Civil']
# Function to load and process the data from the given Excel file
def load_wildfire_data(file_path, sheets, columns_to_extract):
    extracted_data = pd.DataFrame()
    
    for sheet_name in sheets:
        data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        extracted_data = pd.concat([extracted_data, data], ignore_index=True)
    
    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    extracted_data['Start'] = pd.to_datetime(extracted_data['Start'])
    extracted_data['Cont.'] = pd.to_datetime(extracted_data['Cont.'])
    
    # 'TRANSFORM': Convert 'Fire', and 'Civil' columns to integers and replace NaN values with zero
    extracted_data['Fire'] = pd.to_numeric(extracted_data['Fire'], errors='coerce').fillna(0).astype(int)
    extracted_data['Civil'] = pd.to_numeric(extracted_data['Civil'], errors='coerce').fillna(0).astype(int)
    
    # Rename columns
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres',
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})
    
    # Create a new column 'Duration' that calculates the number of days between 'Start' and 'Contained'
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 
                                    'Deaths_FF', 'Deaths_Civil', 'Duration']]
    
    return extracted_data
# Load data for 2008
fires_2008 = load_wildfire_data(file_2008, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2008 Fire Data:")
print(fires_2008.head(2))
# Save Data Frames to 'Outputs' folder
fires_2008.to_csv('Outputs/2009_wildfire_data.csv', index=False)

2008 Fire Data:
           County Fire Name      Start  Contained  Acres  Deaths_FF  \
0  San Bernardino     Bluff 2008-03-16 2008-03-20    680          0   
1    Tehama-Glenn   Colyear 2008-05-06 2008-05-09  1,331          0   

   Deaths_Civil  Duration  
0             0       5.0  
1             0       4.0  


In [29]:
#  PART 7:  Combine the data from 2008 to 2019

fires_2008_2019= pd.concat([fires_2008, fires_2009, fires_2010, fires_2014, fires_2015, fires_2017, fires_2011_2012_2013_2016_2018_2019_data], ignore_index=True)
                           
# Save Data Frames to 'Outputs' folder
fires_2008_2019.to_csv('Outputs/2008_2019_wildfire_data.csv', index=False)


In [30]:
# PART 8: FINAL CLEAN UP OF THE COMBINED DATA

# Sort the final data by the start and then contained dates (in case of multiple fires started on the same date)
fires_2008_2019_sorted = fires_2008_2019.sort_values(by=['Start', 'Contained'], ascending=True)

# Remove rows where 'County' column is empty (NaN or empty strings) to remove summary lines from the data
fires_2008_2019_filtered = fires_2008_2019_sorted.dropna(subset=['County']) # Remove Nan values
fires_2008_2019_filtered = fires_2008_2019_filtered[fires_2008_2019_filtered['County'].str.strip() != ''] #Remove empty lines

# Identify where 'Start' is after 'Cont.'
incorrect_dates = fires_2008_2019_filtered[fires_2008_2019_filtered['Start'] > fires_2008_2019_filtered['Contained']]

# Display the rows where 'Start' is after 'Cont.'
if not incorrect_dates.empty:
    print("Rows where 'Start' is after 'Cont.':")
    print(incorrect_dates)
else:
        print("No rows found where 'Start' is after 'Cont.'")


Rows where 'Start' is after 'Cont.':
       County Fire Name      Start  Contained  Acres  Deaths_FF  Deaths_Civil  \
259  Tuolumne     SLOPE 2010-07-25 2010-01-23  1,711          0             0   
798    TEHAMA      DALE 2028-07-09 2018-07-09    856          0             0   

     Duration  Strux_Destr  Strux_Dmgd  
259    -182.0          0.0         0.0  
798   -3652.0          0.0         0.0  


In [31]:
# The records  above indicate incorrect dates (fire can't start after 'contained' date.)
# Correcting dates based on verified historical records:

# Correcting record 798 to '2017/07/09' and record 259 to '2010-08-23'
fires_2008_2019_filtered.at[798, 'Start'] = '2017/07/09'
fires_2008_2019_filtered.at[259, 'Contained'] = '2010-08-23'

# Ensure that the changes are in datetime format
fires_2008_2019_filtered['Start'] = pd.to_datetime(fires_2008_2019_filtered['Start'], errors='coerce')
fires_2008_2019_filtered['Contained'] = pd.to_datetime(fires_2008_2019_filtered['Contained'], errors='coerce')

# Optional: Check if the corrections were successful
print(fires_2008_2019_filtered.loc[[259, 798], ['Start', 'Contained']])

         Start  Contained
259 2010-01-23 2010-08-23
798 2017-07-09 2018-07-09


In [32]:
# Sort the cleaned data by 'Start' and 'Contained' dates
fires_2008_2019_cleaned = fires_2008_2019_filtered.sort_values(by=['Start', 'Contained'], ascending=True)

# Calculate the duration of the fire (assuming 'Contained' is the end date and 'Start' is the start date)
fires_2008_2019_cleaned['Duration'] = fires_2008_2019_cleaned['Contained'] - fires_2008_2019_cleaned['Start']

# To add one day to the duration to include the start date fully, you can do this:
fires_2008_2019_cleaned['Duration'] = fires_2008_2019_cleaned['Duration'] + pd.Timedelta(days=1)

# Display the rows where 'Duration' is negative
negative_duration = fires_2008_2019_cleaned[fires_2008_2019_cleaned['Duration'] < pd.Timedelta(0)]
if not negative_duration.empty:
    print("Duration is incorrect:")
    print(negative_duration)
else:
    print("No errors were found in fire duration count.")

# Print the cleaned data to verify no errors
print(fires_2008_2019_cleaned[['Start', 'Contained', 'Duration']].tail())

No errors were found in fire duration count.
         Start  Contained Duration
889 2019-11-25 2019-11-25   1 days
917 2019-11-25 2019-12-13  19 days
99         NaT        NaT      NaT
187        NaT        NaT      NaT
251        NaT        NaT      NaT


In [36]:
# Display above shows summary lines appeared on our combined data

# Remove rows where 'Start', 'Contained', or 'Duration' columns have NaT values
fires_2008_2019_cleaned = fires_2008_2019_cleaned.dropna(subset=['Start', 'Contained', 'Duration'])

# Check if any such rows remain
remaining_nat_records = fires_2008_2019_cleaned[fires_2008_2019_cleaned[['Start', 'Contained', 'Duration']].isna().any(axis=1)]
print(f"Remaining rows with NaT in 'Start', 'Contained', or 'Duration': {len(remaining_nat_records)}")

# Print the cleaned data to verify all was corrected
print(fires_2008_2019_cleaned[['Start', 'Contained', 'Duration']].tail())

Remaining rows with NaT in 'Start', 'Contained', or 'Duration': 0
         Start  Contained Duration
885 2019-10-31 2019-11-03   4 days
887 2019-10-31 2019-11-06   7 days
888 2019-11-03 2019-11-06   4 days
889 2019-11-25 2019-11-25   1 days
917 2019-11-25 2019-12-13  19 days


In [38]:
# Verifying if there are any other errors in the data (like suspiciously long fires)

# Define a timedelta representing 150 days
threshold_duration = pd.Timedelta(days=150)
# Find a d display if any fires lasted longer than 150 days according to our data
long_durations = fires_2008_2019_cleaned[fires_2008_2019_cleaned['Duration'] > threshold_duration]
print("Suspiciously long durations:")
print(long_durations)

Suspiciously long durations:
       County Fire Name      Start  Contained   Acres  Deaths_FF  \
146  MONTEREY     CHALK 2008-09-25 2028-10-29  16,269          0   
259  Tuolumne     SLOPE 2010-01-23 2010-08-23   1,711          0   
645  MARIPOSA   CASCADE 2012-06-16 2012-11-26   1,705          0   
798    TEHAMA      DALE 2017-07-09 2018-07-09     856          0   
860    COLUSA      SAND 2019-06-08 2020-06-15   2,220          0   

     Deaths_Civil  Duration  Strux_Destr  Strux_Dmgd  
146             0 7340 days          NaN         NaN  
259             0  213 days          0.0         0.0  
645             0  164 days          0.0         0.0  
798             0  366 days          0.0         0.0  
860             0  374 days          4.0         0.0  


In [40]:
# According to Google SAND fired was contained on 2019-06-15, 1 year earlier than our data shows.
# Correct the year in the 'Contained' column to 2019
fires_2008_2019_cleaned.loc[340, 'Contained'] = fires_2008_2019_cleaned.loc[340, 'Contained'].replace(year=2019)
# Verify the updated record
print("Corrected record:")
print(fires_2008_2019_cleaned.loc[340])

Corrected record:
County                       MERCED
Fire Name                     CREEK
Start           2015-07-31 00:00:00
Contained       2019-07-31 00:00:00
Acres                         1,450
Deaths_FF                         0
Deaths_Civil                      0
Duration            1 days 00:00:00
Strux_Destr                     0.0
Strux_Dmgd                      0.0
Name: 340, dtype: object


In [41]:
# Convert the 'Duration' column to timedelta if it is not already
fires_2008_2019_cleaned['Duration'] = pd.to_timedelta(fires_2008_2019_cleaned['Duration'], errors='coerce')

# Check the dtype to verify the conversion
print(fires_2008_2019_cleaned['Duration'].dtype)

# Recalculate the 'Duration' for the corrected record
duration = fires_2008_2019_cleaned.loc[340, 'Contained'] - fires_2008_2019_cleaned.loc[340, 'Start'] + pd.Timedelta(days=1)

# Update the 'Duration' column with the recalculated duration
fires_2008_2019_cleaned.at[340, 'Duration'] = duration

# Print the updated record to verify the recalculated 'Duration'
print("Updated record with recalculated Duration:")
print(fires_2008_2019_cleaned.loc[340])
print(fires_2008_2019_cleaned['Duration'].dtype)

timedelta64[ns]
Updated record with recalculated Duration:
County                       MERCED
Fire Name                     CREEK
Start           2015-07-31 00:00:00
Contained       2019-07-31 00:00:00
Acres                         1,450
Deaths_FF                         0
Deaths_Civil                      0
Duration         1462 days 00:00:00
Strux_Destr                     0.0
Strux_Dmgd                      0.0
Name: 340, dtype: object
timedelta64[ns]


In [42]:
# Convert 'Duration' to timedelta format
fires_2008_2019_cleaned['Duration'] = pd.to_timedelta(fires_2008_2019_cleaned['Duration'])

# Calculate the duration in days and convert it to an integer
fires_2008_2019_cleaned['Duration_Days'] = fires_2008_2019_cleaned['Duration'].dt.days

# If you want to format the 'Duration' as a string like 'X days HH:MM:SS'
# fires_2015_2019_cleaned['Duration_Str'] = fires_2015_2019_cleaned['Duration'].apply(
    #lambda x: f"{x.days} days {x.components.hours:02}:{x.components.minutes:02}:{x.components.seconds:02}"
#)

# Print the cleaned data to check results
print(fires_2008_2019_cleaned.head())

             County              Fire Name      Start  Contained  Acres  \
100            LAKE  CONTROL BURN, GEYSERS 2008-02-13 2008-02-13    400   
0    San Bernardino                  Bluff 2008-03-16 2008-03-20    680   
101        MARIPOSA              WAWONA NW 2008-04-09 2008-04-19  1,130   
102     LOS ANGELES            SANTA ANITA 2008-04-26 2008-05-02    584   
103       RIVERSIDE                 APACHE 2008-04-29 2008-05-04    769   

     Deaths_FF  Deaths_Civil Duration  Strux_Destr  Strux_Dmgd  Duration_Days  
100          0             0   1 days          NaN         NaN              1  
0            0             0   5 days          NaN         NaN              5  
101          0             0  11 days          NaN         NaN             11  
102          0             0   7 days          NaN         NaN              7  
103          0             0   6 days          NaN         NaN              6  


In [48]:
# Check if any county are misspelled 
# List of valid county names
valid_counties = [
    'ALAMEDA', 'ALPINE', 'AMADOR', 'BUTTE', 'CALAVERAS', 'COLUSA', 'CONTRA COSTA', 
    'DEL NORTE', 'EL DORADO', 'FRESNO', 'GLENN', 'HUMBOLDT', 'IMPERIAL', 'INYO', 
    'KERN', 'KINGS', 'LAKE', 'LASSEN', 'LOS ANGELES', 'MADERA', 'MARIN', 'MARIPOSA', 
    'MENDOCINO', 'MERCED', 'MODOC', 'MONO', 'MONTEREY', 'NAPA', 'NEVADA', 'ORANGE', 
    'PLACER', 'PLUMAS', 'RIVERSIDE', 'SACRAMENTO', 'SAN BENITO', 'SAN BERNARDINO', 
    'SAN DIEGO', 'SAN FRANCISCO', 'SAN JOAQUIN', 'SAN LUIS OBISPO', 'SAN MATEO', 
    'SANTA BARBARA', 'SANTA CLARA', 'SANTA CRUZ', 'SHASTA', 'SIERRA', 'SISKIYOU', 
    'SOLANO', 'SONOMA', 'STANISLAUS', 'SUTTER', 'TEHAMA', 'TRINITY', 'TULARE', 
    'TUOLUMNE', 'VENTURA', 'YOLO', 'YUBA', 'JACKSON (OR)', 'WASHOE (NV)'
]
# Convert the 'County' column to uppercase to ensure matching
fires_2008_2019_cleaned['County'] = fires_2008_2019_cleaned['County'].str.upper()

# Find counties that are not in the valid counties list
invalid_counties = fires_2008_2019_cleaned[~fires_2008_2019_cleaned['County'].isin(valid_counties)]

# Display the rows with invalid counties
print("Counties that do not match the valid list:")
print(invalid_counties[['County']])
# Saving incorrect list of misspelled counties for review
invalid_counties.to_csv('Outputs/invalid_counties.csv', index = False)

Counties that do not match the valid list:
               County
1        TEHAMA-GLENN
2        FRESNO-KINGS
3        FRESNO-KINGS
6    SONOMA-LAKE-NAPA
9    MARDERA-MARIPOSA
..                ...
600      LAKE/ COLUSA
599       LAKE/COLUSA
604   TEHAMA/\nSHASTA
467            WASHOE
493            WASHOE

[71 rows x 1 columns]


In [57]:
# 71 rows were not identified correctly (please see the output above)

# Define a function to clean up the 'County' column
def clean_county_name(county_name):
    # Remove text after '-', ',', and '/' and strip any extra whitespace
    return county_name.split('-')[0].split(',')[0].split('/')[0].split('\\')[0].split('\\n')[0].strip()

# Apply the cleaning function to the 'County' column
fires_2008_2019_cleaned['County'] = fires_2008_2019_cleaned['County'].apply(clean_county_name)

In [61]:
# Continue with your corrections dictionary for specific misspellings
corrections = {
    'TAHEMA': 'TEHAMA',
    'ELDORADO': 'EL DORADO',
    'TEHEMA': 'TEHAMA',
    'TOULUMNE': 'TUOLUMNE',
    'WASHOE': 'WASHOE (NV)',
    'VANDENBURG AFB': 'SANTA BARBARA',
    'MARDERA': 'MADERA',
    'VENTURA/SANTA\nBARBARA': 'VENTURA',
    'COLUSA, GLENN,\nLAKE, MENDOCINO': 'COLUSA',
    'COLUSA, LAKE,\nMENDOCINO': 'COLUSA',
    'SANTA\nBARBARA': 'SANTA BARBARA',
    'SAN LUIS\nOBISPO': 'SAN LUIS OBISPO',
    'SAN\nBERNARDINO': 'SAN BERNARDINO',
    'VANDENBURG AFB': 'SANTA BARBARA'
}

# Apply the corrections to the 'County' column
fires_2008_2019_cleaned['County'] = fires_2008_2019_cleaned['County'].replace(corrections)


Invalid counties found:
[]


In [62]:
# Find counties that are not in the valid counties list
invalid_counties_test = fires_2008_2019_cleaned[~fires_2008_2019_cleaned['County'].isin(valid_counties)]

# Display the rows with invalid counties
print("Counties that do not match the valid list:")
print(invalid_counties_test[['County']])

Counties that do not match the valid list:
Empty DataFrame
Columns: [County]
Index: []


In [63]:
# Save the cleaned data to the "Outputs" folder
fires_2008_2019_cleaned.to_csv('Outputs/fires_2008_2019_cleaned.csv', index=False)

# Print the cleaned data to check
print(fires_2008_2019_cleaned.tail(5))

            County  Fire Name      Start  Contained  Acres  Deaths_FF  \
885      RIVERSIDE       HILL 2019-10-31 2019-11-03    628          0   
887        VENTURA      MARIA 2019-10-31 2019-11-06  9,999          0   
888         TEHAMA      RANCH 2019-11-03 2019-11-06  2,534          0   
889         PLACER  FOOTHILLS 2019-11-25 2019-11-25    308          0   
917  SANTA BARBARA       CAVE 2019-11-25 2019-12-13  3,126          0   

     Deaths_Civil Duration  Strux_Destr  Strux_Dmgd  Duration_Days  
885             0   4 days          0.0         0.0              4  
887             0   7 days          4.0         0.0              7  
888             0   4 days          0.0         1.0              4  
889             0   1 days          0.0         0.0              1  
917             0  19 days          0.0         1.0             19  
