In [1]:
import pandas as pd
import warnings
# Silence only the "Could not infer format ..." warning (matched by its message text).
# NOTE(review): presumably raised by the pd.to_datetime calls applied to the 'Start' and
# 'Cont.' columns further down — confirm before relying on it.
warnings.filterwarnings("ignore", message="Could not infer format, so each element will be parsed individually")


In [2]:
# PART 1: EXTRACT DATA FOR 2016, 2018, AND 2019 FIRES

# Paths to the yearly wildfire-activity workbooks (relative to the notebook's working directory).
# These three years are handled together because they keep the needed tables on the same sheets.
file_2016 = 'Resources/CAL_FireStats/2016-wildfire-activity-stats.xlsx'
file_2018 = 'Resources/CAL_FireStats/2018-wildfire-activity-stats.xlsx'
file_2019 = 'Resources/CAL_FireStats/2019-wildfire-activity-stats.xlsx'

In [3]:
# Sheets to read from each workbook. The 2016, 2018 and 2019 files all keep the
# needed tables on these same four pages.
sheets = ['table_page_13', 'table_page_14', 'table_page_15', 'table_page_16']

In [4]:
# Source column headers to extract from every sheet. load_wildfire_data() renames several
# of them afterwards (e.g. 'Cont.' -> 'Contained', 'Total' -> 'Acres').
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total', 
                      'Dest.', 'Dam.', 'Fire', 'Civil']

In [5]:
# Define a function to load and process the data from the Excel files
def load_wildfire_data(file_path, sheets, columns_to_extract):
    """Load, combine and tidy the fire tables stored on several sheets of one workbook.

    Parameters
    ----------
    file_path : str or path-like
        Excel workbook to read.
    sheets : iterable of str
        Names of the worksheets that hold the tables; they are stacked in the given order.
    columns_to_extract : list of str
        Source column headers to keep. Must include 'County', 'Fire Name', 'Start',
        'Cont.', 'Total', 'Dest.', 'Dam.', 'Fire' and 'Civil'.

    Returns
    -------
    pandas.DataFrame
        One row per source row with the columns 'County', 'Fire Name', 'Start',
        'Contained', 'Acres', 'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF',
        'Deaths_Civil' and 'Duration' (in that order), re-indexed from 0.

    Raises
    ------
    ValueError
        If `sheets` is empty, so there is nothing to read.
    """
    # Read every sheet first and concatenate once: growing a DataFrame inside the loop
    # re-copies all previous rows on each pass and starts from an empty frame, which
    # recent pandas versions warn about.
    # skiprows=1 skips the first row of each sheet, which does not contain useful information.
    sheet_frames = [
        pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, usecols=columns_to_extract)
        for sheet_name in sheets
    ]
    if not sheet_frames:
        raise ValueError("`sheets` must name at least one worksheet to read.")

    # ignore_index=True re-numbers the rows 0..n-1 across all sheets
    extracted_data = pd.concat(sheet_frames, ignore_index=True)

    # 'TRANSFORM': Convert 'Start' and 'Cont.' columns to datetime format
    for date_column in ('Start', 'Cont.'):
        extracted_data[date_column] = pd.to_datetime(extracted_data[date_column])

    # 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers;
    # anything that is missing or not numeric becomes zero
    for count_column in ('Dest.', 'Dam.', 'Fire', 'Civil'):
        extracted_data[count_column] = (
            pd.to_numeric(extracted_data[count_column], errors='coerce').fillna(0).astype(int)
        )

    # Rename columns to their final names.
    # NOTE: 'Total' (-> 'Acres') is passed through unchanged, so it keeps whatever
    # formatting the workbook uses (e.g. text with thousands separators such as "3,876").
    extracted_data = extracted_data.rename(columns={'Cont.': "Contained", 'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd',
                                                    'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})

    # 'Duration' is the number of days between 'Start' and 'Contained', counting both
    # the start day and the containment day (hence the + 1)
    extracted_data['Duration'] = (extracted_data['Contained'] - extracted_data['Start']).dt.days + 1

    # Reorder columns
    extracted_data = extracted_data[['County', 'Fire Name', 'Start', 'Contained', 'Acres',
                                     'Strux_Destr', 'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration']]
    # Return the processed data as a DataFrame so it can be used outside the function
    return extracted_data


In [6]:
# Run the loader once per workbook (2016, 2018, 2019 — in that order), always with the
# same sheet list and column list, and bind each result to its own per-year name
fires_2016, fires_2018, fires_2019 = (
    load_wildfire_data(workbook_path, sheets, columns_to_extract)
    for workbook_path in (file_2016, file_2018, file_2019)
)

In [7]:
# Stack the 2016, 2018 and 2019 frames (in that order) into a single DataFrame
# and re-number the rows from 0
fires_2016_2018_2019_data = pd.concat(
    objs=(fires_2016, fires_2018, fires_2019),
    ignore_index=True,
)

In [8]:
# Preview the top rows of each frame to check the results: two rows for 2016,
# three rows for 2018, 2019 and the combined frame. A dashed line separates the previews.
print("2016 Fire Data:", fires_2016.head(2), "--------------------------------------", sep="\n")
print("\n2018 Fire Data:", fires_2018.head(3), "--------------------------------------", sep="\n")
print("\n2019 Fire Data:", fires_2019.head(3), "--------------------------------------", sep="\n")
print("\nCombined Fire Data (2016, 2018, 2019):", fires_2016_2018_2019_data.head(3), sep="\n")

2016 Fire Data:
      County Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0   MONTEREY      METZ 2016-05-22 2016-05-25  3,876            0           0   
1  SAN DIEGO  BORDER 3 2016-06-19 2016-07-01  7,609           16           3   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       4.0  
1          0             2      13.0  
--------------------------------------

2018 Fire Data:
      County  Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0       INYO     MOFFAT 2018-04-19 2018-04-24  1,065            0           0   
1     MERCED       NEES 2018-05-02 2018-05-02  1,756            0           0   
2  RIVERSIDE  PATTERSON 2018-05-17 2018-05-18  1,261            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       6.0  
1          0             0       1.0  
2          0             0       2.0  
--------------------------------------

2019 Fire Data:
            County Fire Name     

In [9]:
# Save the per-year frames and the combined frame as CSV files in the 'Outputs' folder.
# index=False keeps the DataFrame row labels out of the files.
# NOTE(review): assumes the 'Outputs' directory already exists — confirm.
fires_2016.to_csv('Outputs/2016_wildfire_data.csv', index=False)
fires_2018.to_csv('Outputs/2018_wildfire_data.csv', index=False)
fires_2019.to_csv('Outputs/2019_wildfire_data.csv', index=False)
fires_2016_2018_2019_data.to_csv('Outputs/2016_2018_2019_wildfire_data.csv', index=False)

In [10]:
#  PART 2: EXTRACT DATA FOR 2017 FIRES

# Define path to the source file of fires 2017
file_2017 = 'Resources/CAL_FireStats/2017-wildfire-activity-stats.xlsx'
# Define the sheets to be read that contain the needed information
# (the 2017 workbook uses a different number of pages than the 2016, 2018 and 2019 files).
# NOTE: this re-binds `sheets`, so it now holds the 2017 sheet list.
sheets = ['table_page_13', 'table_page_14', 'table_page_15', 'table_page_16', 'table_page_17', 'table_page_18']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total',
                      'Dest.', 'Dam.', 'Fire', 'Civil']
# Load data for 2017, reusing load_wildfire_data() defined earlier in the notebook
# (PART 1) rather than re-defining an identical copy of it in this cell
fires_2017 = load_wildfire_data(file_2017, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2017 Fire Data:")
print(fires_2017.head(2))
# Save the data frame to the 'Outputs' folder
fires_2017.to_csv('Outputs/2017_wildfire_data.csv', index=False)


2017 Fire Data:
   County  Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0  FRESNO      JAYNE 2017-04-20 2017-04-21  4,532            0           0   
1  FRESNO  EL DORADO 2017-04-28 2017-04-28    750            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       2.0  
1          0             0       1.0  


In [11]:
#  PART 3: EXTRACT DATA FOR 2015 FIRES

# Define path to the source file of fires 2015
file_2015 = 'Resources/CAL_FireStats/2015-wildfire-activity-stats.xlsx'
# Define the sheets to be read that contain the needed information
# (the 2015 workbook uses a different number of pages than the 2016, 2018 and 2019 files).
# NOTE: this re-binds `sheets`, so it now holds the 2015 sheet list.
sheets = ['table_page_13', 'table_page_14', 'table_page_15']
# Define the columns to be extracted
columns_to_extract = ['County', 'Fire Name', 'Start', 'Cont.', 'Total',
                      'Dest.', 'Dam.', 'Fire', 'Civil']
# Load data for 2015, reusing load_wildfire_data() defined earlier in the notebook
# (PART 1) rather than re-defining an identical copy of it in this cell
fires_2015 = load_wildfire_data(file_2015, sheets, columns_to_extract)
# Display the first two rows of the data frame
print("2015 Fire Data:")
print(fires_2015.head(2))
# Save the data frame to the 'Outputs' folder
fires_2015.to_csv('Outputs/2015_wildfire_data.csv', index=False)

2015 Fire Data:
      County Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0       INYO     ROUND 2015-02-06 2015-02-13  7,000           43           5   
1  RIVERSIDE   HIGHWAY 2015-04-18 2015-04-24  1,049            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0       8.0  
1          0             0       7.0  


In [12]:
#  PART 4: Combine the data from 2016, 2018, and 2019 with the data from 2015 and 2017

# Stacking order is 2015, then 2017, then the 2016/2018/2019 block; rows are re-numbered from 0
fires_2015_2019 = pd.concat(
    objs=(fires_2015, fires_2017, fires_2016_2018_2019_data),
    ignore_index=True,
)

# Write the combined 2015-2019 frame to the 'Outputs' folder
fires_2015_2019.to_csv('Outputs/2015_2019_wildfire_data.csv', index=False)


In [13]:
# PART 5: FINAL CLEAN UP OF THE COMBINED DATA

# Sort the final data by the start and then contained dates (in case of multiple fires started on the same date)
fires_2015_2019_sorted = fires_2015_2019.sort_values(by=['Start', 'Contained'], ascending=True)

# Remove rows where 'County' column is empty (NaN or empty strings) to remove summary lines from the data
fires_2015_2019_filtered = fires_2015_2019_sorted.dropna(subset=['County'])  # Remove NaN values
# .copy() makes the filtered frame independent of the sorted one, so the in-place
# date correction below modifies a real frame rather than a slice (avoids SettingWithCopyWarning)
fires_2015_2019_filtered = fires_2015_2019_filtered[fires_2015_2019_filtered['County'].str.strip() != ''].copy()  # Remove empty lines

# Identify rows where 'Start' is after 'Contained' (computed once, reused for report and fix)
start_after_contained = fires_2015_2019_filtered['Start'] > fires_2015_2019_filtered['Contained']
incorrect_dates = fires_2015_2019_filtered[start_after_contained]

# Display the rows where 'Start' is after 'Contained'
if not incorrect_dates.empty:
    print("Rows where 'Start' is after 'Cont.':")
    print(incorrect_dates)
else:
    print("No rows found where 'Start' is after 'Cont.'")

# Set 'Start' equal to 'Contained' on those rows, as it looks like a simple mistype in the year.
# NOTE(review): this copies the whole containment date, not just its year — it is only exact
# when the month and day of both dates already agree, as in the row reported above.
# 'Duration' is not refreshed here; it still holds the value computed at load time.
fires_2015_2019_filtered.loc[start_after_contained, 'Start'] = fires_2015_2019_filtered['Contained']


Rows where 'Start' is after 'Cont.':
     County Fire Name      Start  Contained Acres  Strux_Destr  Strux_Dmgd  \
278  TEHAMA      DALE 2028-07-09 2018-07-09   856            0           0   

     Deaths_FF  Deaths_Civil  Duration  
278          0             0   -3652.0  


In [14]:
# Re-sort after the date fix so the rows are ordered by 'Start', then 'Contained'
fires_2015_2019_cleaned = fires_2015_2019_filtered.sort_values(by=['Start', 'Contained'], ascending=True)

# Recompute 'Duration' as a timedelta: time from 'Start' to 'Contained', plus one day
# so that the start date is counted in full
fires_2015_2019_cleaned['Duration'] = (
    fires_2015_2019_cleaned['Contained'] - fires_2015_2019_cleaned['Start'] + pd.Timedelta(days=1)
)

# Report any rows whose duration is still negative
negative_duration = fires_2015_2019_cleaned[fires_2015_2019_cleaned['Duration'] < pd.Timedelta(0)]
if negative_duration.empty:
    print("No errors were found in fire duration count.")
else:
    print("Duration is incorrect:")
    print(negative_duration)

# Show the last rows of the date/duration columns as a quick visual check
print(fires_2015_2019_cleaned[['Start', 'Contained', 'Duration']].tail())

No errors were found in fire duration count.
         Start  Contained Duration
365 2019-10-31 2019-11-03   4 days
367 2019-10-31 2019-11-06   7 days
368 2019-11-03 2019-11-06   4 days
369 2019-11-25 2019-11-25   1 days
397 2019-11-25 2019-12-13  19 days


In [15]:
# Check for any other errors in the data (like suspiciously long fires)

# Durations above 150 days are treated as suspicious
threshold_duration = pd.Timedelta(days=150)
# Find and display any fires that lasted longer than 150 days according to our data
long_durations = fires_2015_2019_cleaned.loc[fires_2015_2019_cleaned['Duration'].gt(threshold_duration)]
print("Suspiciously long durations:")
print(long_durations)



Suspiciously long durations:
     County Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
340  COLUSA      SAND 2019-06-08 2020-06-15  2,220            4           0   

     Deaths_FF  Deaths_Civil Duration  
340          0             0 374 days  


In [16]:
# According to Google, the SAND fire was contained on 2019-06-15, one year earlier than our data shows.
# Locate the record by its identifying fields instead of a hard-coded row label: row labels
# depend on the order and size of the concatenated source tables, so a fixed label could
# silently point at a different fire if any source file changes.
sand_fire_mask = (
    fires_2015_2019_cleaned['County'].str.strip().eq('COLUSA')
    & fires_2015_2019_cleaned['Fire Name'].str.strip().eq('SAND')
    & fires_2015_2019_cleaned['Start'].eq(pd.Timestamp('2019-06-08'))
)
sand_fire_labels = fires_2015_2019_cleaned.index[sand_fire_mask.to_numpy()]
# Fail loudly rather than patch zero rows or several rows
assert len(sand_fire_labels) == 1, (
    f"Expected exactly one COLUSA / SAND fire starting on 2019-06-08, found {len(sand_fire_labels)}"
)
sand_fire_label = sand_fire_labels[0]
# Correct the year in the 'Contained' column to 2019 (a no-op if it has already been corrected)
fires_2015_2019_cleaned.loc[sand_fire_label, 'Contained'] = (
    fires_2015_2019_cleaned.loc[sand_fire_label, 'Contained'].replace(year=2019)
)
# Verify the updated record ('Duration' is not refreshed by this cell)
print("Corrected record:")
print(fires_2015_2019_cleaned.loc[sand_fire_label])

Corrected record:
County                       COLUSA
Fire Name                      SAND
Start           2019-06-08 00:00:00
Contained       2019-06-15 00:00:00
Acres                         2,220
Strux_Destr                       4
Strux_Dmgd                        0
Deaths_FF                         0
Deaths_Civil                      0
Duration          374 days 00:00:00
Name: 340, dtype: object


In [17]:
# Recalculate 'Duration' from the (now corrected) dates for every row, using the same rule
# as before: 'Contained' minus 'Start', plus one day. Recomputing the whole column keeps it
# consistent with the dates without relying on a hard-coded row label, and the result is
# already a timedelta, so no separate conversion step is needed.
fires_2015_2019_cleaned['Duration'] = (
    fires_2015_2019_cleaned['Contained'] - fires_2015_2019_cleaned['Start'] + pd.Timedelta(days=1)
)

# Check the dtype of the recalculated column
print(fires_2015_2019_cleaned['Duration'].dtype)

# Print the record(s) previously flagged in `long_durations` to verify the recalculated 'Duration'
print("Updated record with recalculated Duration:")
for flagged_label in long_durations.index:
    print(fires_2015_2019_cleaned.loc[flagged_label])
print(fires_2015_2019_cleaned['Duration'].dtype)

timedelta64[ns]
Updated record with recalculated Duration:
County                       COLUSA
Fire Name                      SAND
Start           2019-06-08 00:00:00
Contained       2019-06-15 00:00:00
Acres                         2,220
Strux_Destr                       4
Strux_Dmgd                        0
Deaths_FF                         0
Deaths_Civil                      0
Duration            8 days 00:00:00
Name: 340, dtype: object
timedelta64[ns]


In [18]:
# Make sure 'Duration' is stored as a timedelta so the .dt accessor can be used on it
fires_2015_2019_cleaned['Duration'] = fires_2015_2019_cleaned['Duration'].pipe(pd.to_timedelta)

# Add the day count of each duration as its own column
fires_2015_2019_cleaned['Duration_Days'] = fires_2015_2019_cleaned['Duration'].dt.days

# Print the first rows of the cleaned data to check the results
print(fires_2015_2019_cleaned.head())

       County Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
26       MONO  VAN DYKE 2015-02-06 2015-02-10    509            0           0   
0        INYO     ROUND 2015-02-06 2015-02-13  7,000           43           5   
1   RIVERSIDE   HIGHWAY 2015-04-18 2015-04-24  1,049            0           0   
27  SAN DIEGO      CARL 2015-04-28 2015-04-29  4,000            0           0   
28  SAN DIEGO    MORTAR 2015-04-28 2015-04-29    800            0           0   

    Deaths_FF  Deaths_Civil Duration  Duration_Days  
26          0             0   5 days              5  
0           0             0   8 days              8  
1           0             0   7 days              7  
27          0             0   2 days              2  
28          0             0   2 days              2  


In [19]:
# Save the cleaned 2015-2019 data to the "Outputs" folder (row labels are not written)
fires_2015_2019_cleaned.to_csv('Outputs/fires_2015_2019_cleaned.csv', index=False)

# Print the last five rows of the cleaned data as a final check
print(fires_2015_2019_cleaned.tail(5))

            County  Fire Name      Start  Contained  Acres  Strux_Destr  \
365      RIVERSIDE       HILL 2019-10-31 2019-11-03    628            0   
367        VENTURA      MARIA 2019-10-31 2019-11-06  9,999            4   
368         TEHAMA      RANCH 2019-11-03 2019-11-06  2,534            0   
369         PLACER  FOOTHILLS 2019-11-25 2019-11-25    308            0   
397  SANTA BARBARA       CAVE 2019-11-25 2019-12-13  3,126            0   

     Strux_Dmgd  Deaths_FF  Deaths_Civil Duration  Duration_Days  
365           0          0             0   4 days              4  
367           0          0             0   7 days              7  
368           1          0             0   4 days              4  
369           0          0             0   1 days              1  
397           1          0             0  19 days             19  
