In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", message="Could not infer format, so each element will be parsed individually")

In [2]:
# Load the Excel file
file_path = 'Resources/Consolidated_Incidents_by_Year.xlsx'
xl = pd.ExcelFile(file_path)

# Initialize an empty DataFrame to store the extracted data
final_data = pd.DataFrame()

# Loop through each sheet in the Excel file
for sheet_name in xl.sheet_names:
    # Skip the first two rows and import columns 2, 3, 4, and 7
    data = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=1, 
                         usecols=[1, 2, 3, 4, 6, 9, 10, 11 , 12])
    
    # Append the data to the final DataFrame
    final_data = pd.concat([final_data, data], ignore_index=True)


# 'TRANSFORM': Convert 'Start' and 'Contained' columns to datetime format
final_data['Start'] = pd.to_datetime(final_data['Start'])
final_data['Contained'] = pd.to_datetime(final_data['Contained'])

# 'TRANSFORM': Convert 'Dest.', 'Dam.', 'Fire', and 'Civil' columns to integers and replace NaN values with zero
final_data['Dest.'] = pd.to_numeric(final_data['Dest.'], errors='coerce').fillna(0).astype(int)
final_data['Dam.'] = pd.to_numeric(final_data['Dam.'], errors='coerce').fillna(0).astype(int)
final_data['Fire'] = pd.to_numeric(final_data['Fire'], errors='coerce').fillna(0).astype(int)
final_data['Civil'] = pd.to_numeric(final_data['Civil'], errors='coerce').fillna(0).astype(int)

# Rename columns
final_data = final_data.rename(columns={'Total': 'Acres', 'Dest.': 'Strux_Destr', 'Dam.': 'Strux_Dmgd', 'Fire': 'Deaths_FF', 'Civil': 'Deaths_Civil'})

# Calculate the duration in days
final_data['Duration'] = (final_data['Contained'] - final_data['Start']).dt.days+1

# Get the last column name
last_column = final_data.columns[-1]

# Move the 'Duration' column to after Start and Contained?


# Display the extracted data
print(final_data.head(5))
final_data.info()

final_data.to_csv('Outputs/fires_2020_2022.csv', index=False)

xl.close()


     County  Fire Name      Start  Contained  Acres  Strux_Destr  Strux_Dmgd  \
0  MONTEREY   COLORADO 2022-01-21 2022-02-02    687            1           1   
1      INYO    AIRPORT 2022-02-16 2022-02-26  4,136            0           0   
2    MADERA     VALLEY 2022-04-13 2022-04-13    680            0           0   
3    MERCED      SANDY 2022-05-15 2022-05-15    457            0           1   
4      KERN  EDMONSTON 2022-05-19 2022-05-24    682            0           0   

   Deaths_FF  Deaths_Civil  Duration  
0          0             0        13  
1          0             0        11  
2          0             0         1  
3          0             0         1  
4          0             0         6  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206 entries, 0 to 205
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   County        206 non-null    object        
 1   Fire Name     206 non-nul