In [21]:
# Improt dependencies
import pandas as pd
import ast # process trees of the Python abstract syntax grammar

In [22]:
# Load the cleaned data from csv
file_path = 'Outputs/fires_2008_2022_cleaned.csv'
data = pd.read_csv(file_path)

In [23]:
# Remove commas and convert to numeric
data['Acres'] = data['Acres'].str.replace(',', '').astype(float)

In [24]:
# Ensure 'Deaths_FF' and 'Deaths_Civil' are numeric to calculate total deaths
data['Deaths_FF'] = pd.to_numeric(data['Deaths_FF'], errors='coerce')
data['Deaths_Civil'] = pd.to_numeric(data['Deaths_Civil'], errors='coerce')

# Calculate total deaths
data['Total Deaths'] = data['Deaths_FF'] + data['Deaths_Civil']

# Display the results
print(data[['Fire Name', 'Total Deaths']])

                  Fire Name  Total Deaths
0     CONTROL BURN, GEYSERS             0
1                     Bluff             0
2                 WAWONA NW             0
3               SANTA ANITA             0
4                    APACHE             0
...                     ...           ...
1092               MOUNTAIN             0
1093               FAIRVIEW             2
1094               MOSQUITO             0
1095                   FORK             0
1096                 BARNES             0

[1097 rows x 2 columns]


In [25]:
## Convert 'Start' and 'Contained" to datetime format to avoid errors
data['Start'] = pd.to_datetime(data['Start'], errors='coerce')
data['Contained'] = pd.to_datetime(data['Contained'], errors='coerce')

In [26]:
# Create a new column for the 'Year' and 'Month'
data['Year'] = data['Start'].dt.year
data['Month'] = data['Start'].dt.month
# Print data 3 first lines to verify correctness
print(data[['Start', 'Contained', 'Year', 'Month']].head(3))

       Start  Contained  Year  Month
0 2008-02-13 2008-02-13  2008      2
1 2008-03-16 2008-03-20  2008      3
2 2008-04-09 2008-04-19  2008      4


In [27]:
# Create a copy to prevent altering the original data in case of any errors
Corrected_Duration_Long_Fires = data.copy()

In [28]:
# Invoke a custom function:
def calculate_duration(row):
    # Initialize an empty dictionary to hold the duration for each month
    duration_per_month = {}

    start_date = row['Start'] # set start dat
    end_date = row['Contained'] # set end date
    
    # Calculate duration for the first month
    # Offsets here helps to end the count at the end of the month
    end_of_start_month = start_date + pd.offsets.MonthEnd(0)
    if end_date <= end_of_start_month: #set conditions
        # If the end date is in the same month
        first_month_duration = (end_date - start_date).days + 1
        duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = first_month_duration
    else: #set contiotion if the 1st is not met
        first_month_duration = (end_of_start_month - start_date).days + 1
        duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = first_month_duration

        # Handle months between 'Start' and 'Contained'
        # Offsets here helps to start the count at the begining of the month
        current_month = start_date + pd.offsets.MonthBegin(1)
        while current_month <= end_date:
            end_of_current_month = current_month + pd.offsets.MonthEnd(0)
            if end_date <= end_of_current_month:
                duration_in_current_month = (end_date - current_month).days + 1
                duration_per_month[f'{current_month.year}-{current_month.month:02d}'] = duration_in_current_month
                break
            else:
                duration_in_current_month = (end_of_current_month - current_month).days + 1
                duration_per_month[f'{current_month.year}-{current_month.month:02d}'] = duration_in_current_month
                current_month = current_month + pd.offsets.MonthBegin(1)

    return duration_per_month # used return function for correct count in the loop

# Apply the function to calculate the duration per month for each row
Corrected_Duration_Long_Fires['Duration_Per_Month'] = data.apply(calculate_duration, axis=1)

# Display the result to check
print(Corrected_Duration_Long_Fires[['Start', 'Contained', 'Duration_Per_Month']])
Corrected_Duration_Long_Fires.to_csv('Outputs/Corrected_Duration_Long_Fires.csv', index=False)

          Start  Contained              Duration_Per_Month
0    2008-02-13 2008-02-13                  {'2008-02': 1}
1    2008-03-16 2008-03-20                  {'2008-03': 5}
2    2008-04-09 2008-04-19                 {'2008-04': 11}
3    2008-04-26 2008-05-02    {'2008-04': 5, '2008-05': 2}
4    2008-04-29 2008-05-04    {'2008-04': 2, '2008-05': 4}
...         ...        ...                             ...
1092 2022-09-02 2022-10-30  {'2022-09': 29, '2022-10': 30}
1093 2022-09-05 2022-09-22                 {'2022-09': 18}
1094 2022-09-06 2022-10-22  {'2022-09': 25, '2022-10': 22}
1095 2022-09-07 2022-09-14                  {'2022-09': 8}
1096 2022-09-07 2022-09-28                 {'2022-09': 22}

[1097 rows x 3 columns]


In [29]:
# Display columns in our data to use correct names of indexes
Corrected_Duration_Long_Fires.columns

Index(['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Deaths_FF',
       'Deaths_Civil', 'Duration', 'Strux_Destr', 'Strux_Dmgd', 'Total Deaths',
       'Year', 'Month', 'Duration_Per_Month'],
      dtype='object')

In [30]:
# Function to split rows where 'Duration_Per_Month' has multiple entries
# This is needed for fires that started in one month and continued into the following month(s)
def split_rows(row, group_counts):
    durations = row['Duration_Per_Month']
    if len(durations) > 1:
        rows = []
        for month, duration in durations.items():
            new_row = row.copy()
            new_row['Month'] = int(month.split('-')[1])  # Set the correct month
            new_row['Duration'] = duration  # Set the correct duration
            new_row['Year'] = int(month.split('-')[0])  # Set the correct year
            new_row['Duration_Per_Month'] = {month: duration}  # Keep only the current month in the dictionary
            
            # Create a unique key for checking how many times this Fire Name, County, and Year combo appears
            key = (new_row['Fire Name'], new_row['County'], new_row['Year'])
            
            # Check if the key exists in group_counts dictionary
            if key in group_counts and group_counts[key] > 0:
                # Set values to 0 for subsequent occurrences
                new_row['Acres'] = 0
                new_row['Deaths_FF'] = 0
                new_row['Deaths_Civil'] = 0
                new_row['Strux_Destr'] = 0
                new_row['Strux_Dmgd'] = 0
                new_row['Total Deaths'] = 0
                
                # Decrement the count for this key
                group_counts[key] -= 1
            
            rows.append(new_row)
        return rows
    else:
        # Only return the row itself if it doesn't need splitting
        return [row]

# Create a dictionary to track the count of each 'Fire Name', 'County', and 'Year' combination
group_counts = Corrected_Duration_Long_Fires.groupby(['Fire Name', 'County', 'Year']).size().to_dict()

# Apply the split_rows function to each row and pass in the group_counts dictionary
expanded_rows = Corrected_Duration_Long_Fires.apply(lambda row: split_rows(row, group_counts), axis=1).tolist()
expanded_rows = [item for sublist in expanded_rows for item in sublist]  # Flatten the list of lists

# Convert the list of rows into a DataFrame
updated_corrected_duration = pd.DataFrame(expanded_rows)

# Convert the 'Duration_Per_Month' dictionary to a string to make it hashable
updated_corrected_duration['Duration_Per_Month'] = updated_corrected_duration['Duration_Per_Month'].apply(lambda x: str(x))

# Remove any duplicate rows to prevent data alteration
updated_corrected_duration = updated_corrected_duration.drop_duplicates()

# Reset the index to make sure the DataFrame is well-formed
updated_corrected_duration = updated_corrected_duration.reset_index(drop=True)

# Display the resulting DataFrame
print(updated_corrected_duration.head(3))

# Save the result to a CSV file
updated_corrected_duration.to_csv('Outputs/updated_corrected_duration.csv', index=False)
print(updated_corrected_duration['Duration_Per_Month'].dtype)

        County    Fire Name      Start  Contained   Acres  Deaths_FF  \
0        MODOC       BARNES 2022-09-07 2022-09-28  5843.0          0   
1  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02     0.0          0   
2  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02   584.0          0   

   Deaths_Civil  Duration  Strux_Destr  Strux_Dmgd  Total Deaths  Year  Month  \
0             0        22            2           0             0  2022      9   
1             0         5            0           0             0  2008      4   
2             0         2            0           0             0  2008      5   

  Duration_Per_Month  
0    {'2022-09': 22}  
1     {'2008-04': 5}  
2     {'2008-05': 2}  
object


In [31]:
# Create a copy of the DataFrame to prevent modifying the original data in case of any errors
df_duration = updated_corrected_duration.copy()

# Convert strings of dictionaries to actual dictionaries
# As a first step to extract data to get the duration of each fire in each month
def convert_to_dict(duration_str): # defining function to convert
    try:
        return ast.literal_eval(duration_str) # convert to strings
    except (ValueError, SyntaxError): # set how to handle errors
        return {} # if conversion fails, then return empty value
    
# Apply this function to each element
df_duration['Duration_Per_Month'] = df_duration['Duration_Per_Month'].apply(convert_to_dict)

# Function to extract the numeric value from the dictionary in Duration_Per_Month
def extract_duration(duration_dict):
    return list(duration_dict.values())[0] if duration_dict else 0 #if distionary is empty ruturn '0' value

# Apply the extraction function to each dictionary
df_duration['Duration'] = df_duration['Duration_Per_Month'].apply(extract_duration)

# Display the updated DataFrame
print(df_duration[['Fire Name', 'Start', 'Contained', 'Month', 'Duration_Per_Month', 'Duration', 'Deaths_FF', 'Deaths_Civil']])

       Fire Name      Start  Contained  Month Duration_Per_Month  Duration  \
0         BARNES 2022-09-07 2022-09-28      9    {'2022-09': 22}        22   
1    SANTA ANITA 2008-04-26 2008-05-02      4     {'2008-04': 5}         5   
2    SANTA ANITA 2008-04-26 2008-05-02      5     {'2008-05': 2}         2   
3         APACHE 2008-04-29 2008-05-04      4     {'2008-04': 2}         2   
4         APACHE 2008-04-29 2008-05-04      5     {'2008-05': 4}         4   
..           ...        ...        ...    ...                ...       ...   
946        ROUTE 2022-08-31 2022-09-06      9     {'2022-09': 6}         6   
947     MOUNTAIN 2022-09-02 2022-10-30      9    {'2022-09': 29}        29   
948     MOUNTAIN 2022-09-02 2022-10-30     10    {'2022-10': 30}        30   
949     MOSQUITO 2022-09-06 2022-10-22      9    {'2022-09': 25}        25   
950     MOSQUITO 2022-09-06 2022-10-22     10    {'2022-10': 22}        22   

     Deaths_FF  Deaths_Civil  
0            0             0  
1

In [32]:
# Ensure 'Total Deaths'are numeric to avoid errors
df_duration['Total Deaths'] = pd.to_numeric(df_duration['Total Deaths'], errors='coerce')

# Convert 'Duration' to integer
df_duration['Duration'] = df_duration['Duration'].astype(int)

# Display the result
print(df_duration[['Duration', 'Total Deaths']])

     Duration  Total Deaths
0          22             0
1           5             0
2           2             0
3           2             0
4           4             0
..        ...           ...
946         6             0
947        29             0
948        30             0
949        25             0
950        22             0

[951 rows x 2 columns]


In [33]:
# Our data only specify the dates of each fire, total acres and deaths.
# This code ensures that we don't get our values duplicated on addition rows we previously created
 
# Select only the columns needed
df_duration = df_duration[['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Duration', 'Total Deaths', 'Year', 'Month']]

# Identify rows with duplicate 'Fire Name', 'Start' combinations
duplicate_groups = df_duration[df_duration.duplicated(subset=['Fire Name', 'Start'], keep=False)]

# DataFrame with unique rows (no duplicates)
unique_rows = df_duration.drop_duplicates(subset=['Fire Name', 'Start'], keep=False)

# Function to sum the values and assign to the first row, then set the rest to 0
def assign_to_first_row(group):
    # Sort the group by 'Month' or any other column if needed
    group = group.sort_values(by='Month')
    
    # Sum the values across the group
    total_deaths = group['Total Deaths'].sum()
    total_acres = group['Acres'].sum()
    
    # Assign the summed values to the first row
    group.iloc[0, group.columns.get_loc('Total Deaths')] = total_deaths
    group.iloc[0, group.columns.get_loc('Acres')] = total_acres
    
    # Set the rest of the rows to 0
    if len(group) > 1:
        group.iloc[1:, group.columns.get_loc('Total Deaths')] = 0
        group.iloc[1:, group.columns.get_loc('Acres')] = 0
    
    return group

# Apply the function to each group
duplicate_rows_with_corrections = duplicate_groups.groupby(['Fire Name', 'Start'], group_keys=False).apply(assign_to_first_row)

# Combine unique rows and corrected duplicate rows
combined_df = pd.concat([unique_rows, duplicate_rows_with_corrections])

# Reset the index to make sure the DataFrame is well-formed
combined_df = combined_df.reset_index(drop=True)

# Sort the combined DataFrame by the 'Start' date
combined_df = combined_df.sort_values(by='Start')

# Save the final DataFrame to a CSV file named 'combined_monthly_stats.csv' in the 'Outputs' folder
combined_df.to_csv('Outputs/combined_monthly_stats.csv', index=False)

# Display the final DataFrame
print("Combined DataFrame:")
print(combined_df.head())

Combined DataFrame:
          County    Fire Name      Start  Contained  Acres  Duration  \
739  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02    0.0         2   
738  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02  584.0         5   
17     RIVERSIDE       APACHE 2008-04-29 2008-05-04  769.0         2   
18     RIVERSIDE       APACHE 2008-04-29 2008-05-04    0.0         4   
840  SANTA CLARA       Summit 2008-05-22 2008-06-15    0.0        15   

     Total Deaths  Year  Month  
739             0  2008      5  
738             0  2008      4  
17              0  2008      4  
18              0  2008      5  
840             0  2008      6  


In [34]:
# Calculate the statistics for each year
# for yearly stats the original 'data' file was used as we do not need to devide duration by the month end

# Exclude the first row from the DataFrame as this is header
data_without_first_row = data.iloc[1:]

# Calculate the statistics grouped by year
yearly_stats_data = data.groupby('Year').agg(
    total_fires=('Fire Name', 'size'),  # 'size' counts the number of rows
    total_acres=('Acres', 'sum'),
    median_yearly_duration=('Duration', 'median'),  # Median fire duration
    mode_yearly_duration=('Duration', lambda x: x.mode()[0] if not x.mode().empty else None),  # Mode fire duration
    total_deaths=('Total Deaths', 'sum')
).reset_index()

# Convert 'median_yearly_duration' to integers for visualization simplicity
yearly_stats_data['median_yearly_duration'] = yearly_stats_data['median_yearly_duration'].astype(int)

# Display the yearly statistics DataFrame
print(yearly_stats_data)

    Year  total_fires  total_acres  median_yearly_duration  \
0   2008          154    1360379.0                       8   
1   2009           67     422400.0                       7   
2   2010           43      96745.0                       4   
3   2011           62     174361.0                       4   
4   2012           72     730733.0                       5   
5   2013           54     546298.0                       6   
6   2014           52     535318.0                       9   
7   2015           55     773415.0                       8   
8   2016           69     542952.0                      11   
9   2017          125    1418515.0                       9   
10  2018           77    1532601.0                       8   
11  2019           61     267306.0                       6   
12  2020          113    4178768.0                       9   
13  2021           65    2502181.0                      11   
14  2022           28     177266.0                       6   

    mod

In [35]:
# Display columns in my df (duration is taken into account month-end)
print(combined_df.columns)

Index(['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Duration',
       'Total Deaths', 'Year', 'Month'],
      dtype='object')


In [36]:

# Exclude the first row from the DataFrame as this is header not to count the header
# Fires names are not unque and were reused more than once, that is why size option was more suitable
df_without_first_row = combined_df.iloc[1:]

# Calculate statistics by Month of each Year
monthly_stats = df_without_first_row.groupby(['Year', 'Month']).agg(
    total_fires=('Fire Name', 'size'),
    total_acres=('Acres', 'sum'),
    median_monthly_duration=('Duration', 'median'),  # median was used due to a very wide range in fire durations
    total_deaths=('Total Deaths', 'sum')
).reset_index()

# Convert 'avg_monthly_duration' to integers for viewers simplicity during visulization
monthly_stats['median_monthly_duration'] = monthly_stats['median_monthly_duration'].astype(int)

# Display the first few rows of the monthly_stats DataFrame
print(monthly_stats.head(3))

   Year  Month  total_fires  total_acres  median_monthly_duration  \
0  2008      4            2       1353.0                        3   
1  2008      5            3      34870.0                        4   
2  2008      6           57    1879198.0                       10   

   total_deaths  
0             0  
1             0  
2            33  


In [37]:
# Merge yearly statistical data with information about financial losses: 

# Read the Excel file wiht monetary losses information from "Outputs" folder
file_path = 'Outputs\merged_summary_dollar_data_8.25.24.csv'
df = pd.read_csv(file_path)
# Calculate totals for each year
totals_by_year = df.groupby('Year')['Tot_Damage'].sum().reset_index()
# Merge both datas on year
yearly_stats = pd.merge(yearly_stats_data, totals_by_year, on='Year', how='left')
# Display the merged DataFrame to verify results
print(yearly_stats.tail(3))

    Year  total_fires  total_acres  median_yearly_duration  \
12  2020          113    4178768.0                       9   
13  2021           65    2502181.0                      11   
14  2022           28     177266.0                       6   

    mode_yearly_duration  total_deaths    Tot_Damage  
12                     1            33  4.725059e+09  
13                     2             0  5.141048e+08  
14                     6             4  3.133646e+08  


In [38]:
# Replace NaN values with 0 in the merged DataFrame
yearly_stats = yearly_stats.fillna(0)

In [39]:
# Save the processed data to be used in HTML
yearly_stats.to_csv('Outputs/yearly_stats.csv', index=False)
monthly_stats.to_csv('Outputs/monthly_stats.csv', index=False)

#Print resullts
print(f"Yearly stats saved to Outputs/yearly_stats.csv") # Display the location and the file name where the data was saved
print(f"Monthly stats saved to Outputs/monthly_stats.csv")  


Yearly stats saved to Outputs/yearly_stats.csv
Monthly stats saved to Outputs/monthly_stats.csv
