In [1]:
# Improt dependencies
import pandas as pd
import ast # process trees of the Python abstract syntax grammar

In [2]:
# Load the cleaned data from csv
file_path = 'Outputs/fires_2008_2022_cleaned.csv'
data = pd.read_csv(file_path)

In [3]:
# Remove commas and convert to numeric
data['Acres'] = data['Acres'].str.replace(',', '').astype(float)

In [4]:
## Convert 'Start' and 'Contained" to datetime format to avoid errors
data['Start'] = pd.to_datetime(data['Start'], errors='coerce')
data['Contained'] = pd.to_datetime(data['Contained'], errors='coerce')

In [5]:
# Create a new column for the 'Year' and 'Month'
data['Year'] = data['Start'].dt.year
data['Month'] = data['Start'].dt.month
# Print data 3 first lines to verify correctness
print(data[['Start', 'Contained', 'Year', 'Month']].head(3))

       Start  Contained  Year  Month
0 2008-02-13 2008-02-13  2008      2
1 2008-03-16 2008-03-20  2008      3
2 2008-04-09 2008-04-19  2008      4


In [6]:
# Create a copy to prevent altering the original data in case of any errors
Corrected_Duration_Long_Fires = data.copy()

In [7]:
# Invoke a custom function:
def calculate_duration(row):
    # Initialize an empty dictionary to hold the duration for each month
    duration_per_month = {}

    start_date = row['Start'] # set start dat
    end_date = row['Contained'] # set end date
    
    # Calculate duration for the first month
    # Offsets here helps to end the count at the end of the month
    end_of_start_month = start_date + pd.offsets.MonthEnd(0)
    if end_date <= end_of_start_month:
        # If the end date is in the same month
        first_month_duration = (end_date - start_date).days + 1
        duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = first_month_duration
    else:
        first_month_duration = (end_of_start_month - start_date).days + 1
        duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = first_month_duration

        # Handle months between 'Start' and 'Contained'
        # Offsets here helps to start the count at the begining of the month
        current_month = start_date + pd.offsets.MonthBegin(1)
        while current_month <= end_date:
            end_of_current_month = current_month + pd.offsets.MonthEnd(0)
            if end_date <= end_of_current_month:
                duration_in_current_month = (end_date - current_month).days + 1
                duration_per_month[f'{current_month.year}-{current_month.month:02d}'] = duration_in_current_month
                break
            else:
                duration_in_current_month = (end_of_current_month - current_month).days + 1
                duration_per_month[f'{current_month.year}-{current_month.month:02d}'] = duration_in_current_month
                current_month = current_month + pd.offsets.MonthBegin(1)

    return duration_per_month # used return function for correct count in the loop

# Apply the function to calculate the duration per month for each row
Corrected_Duration_Long_Fires['Duration_Per_Month'] = data.apply(calculate_duration, axis=1)

# Display the result to check
print(Corrected_Duration_Long_Fires[['Start', 'Contained', 'Duration_Per_Month']])
Corrected_Duration_Long_Fires.to_csv('Outputs/Corrected_Duration_Long_Fires.csv', index=False)

          Start  Contained              Duration_Per_Month
0    2008-02-13 2008-02-13                  {'2008-02': 1}
1    2008-03-16 2008-03-20                  {'2008-03': 5}
2    2008-04-09 2008-04-19                 {'2008-04': 11}
3    2008-04-26 2008-05-02    {'2008-04': 5, '2008-05': 2}
4    2008-04-29 2008-05-04    {'2008-04': 2, '2008-05': 4}
...         ...        ...                             ...
1093 2022-09-02 2022-10-30  {'2022-09': 29, '2022-10': 30}
1094 2022-09-05 2022-09-22                 {'2022-09': 18}
1095 2022-09-06 2022-10-22  {'2022-09': 25, '2022-10': 22}
1096 2022-09-07 2022-09-14                  {'2022-09': 8}
1097 2022-09-07 2022-09-28                 {'2022-09': 22}

[1098 rows x 3 columns]


In [8]:
# Display columns in our data to use correct names of indexes
Corrected_Duration_Long_Fires.columns

Index(['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Deaths_FF',
       'Deaths_Civil', 'Duration', 'Strux_Destr', 'Strux_Dmgd', 'Year',
       'Month', 'Duration_Per_Month'],
      dtype='object')

In [9]:
# Function to split rows where 'Duration_Per_Month' has multiple entries
# That is needed for fires that started in one month and contunues in the folowing month(s)
def split_rows(row):
    durations = row['Duration_Per_Month']
    if len(durations) > 1:
        rows = []
        for month, duration in durations.items():
            new_row = row.copy()
            new_row['Month'] = int(month.split('-')[1])  # Set the correct month
            new_row['Duration'] = duration  # Set the correct duration
            new_row['Year'] = int(month.split('-')[0])  # Set the correct year
            new_row['Duration_Per_Month'] = {month: duration}  # Keep only the current month in the dictionary
            rows.append(new_row)
        return rows
    else:
        # Only return the row itself if it doesn't need splitting
        return [row]

# Apply the split_rows function to each row and concatenate the lists into a new DataFrame
expanded_rows = Corrected_Duration_Long_Fires.apply(split_rows, axis=1).tolist()
expanded_rows = [item for sublist in expanded_rows for item in sublist]  # Flatten the list of lists

# Convert the list of rows into a DataFrame
updated_corrected_duration = pd.DataFrame(expanded_rows)

# Convert the 'Duration_Per_Month' dictionary to a string to make it hashable
updated_corrected_duration['Duration_Per_Month'] = updated_corrected_duration['Duration_Per_Month'].apply(lambda x: str(x))

# Remove any duplicate rows to prevent data alteration
updated_corrected_duration = updated_corrected_duration.drop_duplicates()

# Reset the index to make sure the DataFrame is well-formed
updated_corrected_duration = updated_corrected_duration.reset_index(drop=True)

# Display the resulting DataFrame
print(updated_corrected_duration.head(3))

# Save the result to a CSV file
updated_corrected_duration.to_csv('Outputs/updated_corrected_duration.csv', index=False)
print(updated_corrected_duration['Duration_Per_Month'].dtype)

        County    Fire Name      Start  Contained   Acres  Deaths_FF  \
0        MODOC       BARNES 2022-09-07 2022-09-28  5843.0          0   
1  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02   584.0          0   
2  LOS ANGELES  SANTA ANITA 2008-04-26 2008-05-02   584.0          0   

   Deaths_Civil  Duration  Strux_Destr  Strux_Dmgd  Year  Month  \
0             0        22            2           0  2022      9   
1             0         5            0           0  2008      4   
2             0         2            0           0  2008      5   

  Duration_Per_Month  
0    {'2022-09': 22}  
1     {'2008-04': 5}  
2     {'2008-05': 2}  
object


In [10]:
# Create a copy of the DataFrame to prevent modifying the original data in case of any errors
df = updated_corrected_duration.copy()

# Convert strings of dictionaries to actual dictionaries
# As a first step to extract data to get the duration of each fire in each month
def convert_to_dict(duration_str): # defining function to convert
    try:
        return ast.literal_eval(duration_str) # convert to strings
    except (ValueError, SyntaxError): # set how to handle errors
        return {} # if conversion fails, then return empty value
    
# Apply this function to each element
df['Duration_Per_Month'] = df['Duration_Per_Month'].apply(convert_to_dict)

# Function to extract the numeric value from the dictionary in Duration_Per_Month
def extract_duration(duration_dict):
    return list(duration_dict.values())[0] if duration_dict else 0 #if distionary is empty ruturn '0' value

# Apply the extraction function to each dictionary
df['Duration'] = df['Duration_Per_Month'].apply(extract_duration)

# Display the updated DataFrame
print(df[['Fire Name', 'Start', 'Contained', 'Month', 'Duration_Per_Month', 'Duration', 'Deaths_FF', 'Deaths_Civil']])

       Fire Name      Start  Contained  Month Duration_Per_Month  Duration  \
0         BARNES 2022-09-07 2022-09-28      9    {'2022-09': 22}        22   
1    SANTA ANITA 2008-04-26 2008-05-02      4     {'2008-04': 5}         5   
2    SANTA ANITA 2008-04-26 2008-05-02      5     {'2008-05': 2}         2   
3         APACHE 2008-04-29 2008-05-04      4     {'2008-04': 2}         2   
4         APACHE 2008-04-29 2008-05-04      5     {'2008-05': 4}         4   
..           ...        ...        ...    ...                ...       ...   
946        ROUTE 2022-08-31 2022-09-06      9     {'2022-09': 6}         6   
947     MOUNTAIN 2022-09-02 2022-10-30      9    {'2022-09': 29}        29   
948     MOUNTAIN 2022-09-02 2022-10-30     10    {'2022-10': 30}        30   
949     MOSQUITO 2022-09-06 2022-10-22      9    {'2022-09': 25}        25   
950     MOSQUITO 2022-09-06 2022-10-22     10    {'2022-10': 22}        22   

     Deaths_FF  Deaths_Civil  
0            0             0  
1

In [11]:
# Ensure 'Deaths_FF' and 'Deaths_Civil' are numeric
df['Deaths_FF'] = pd.to_numeric(df['Deaths_FF'], errors='coerce')
df['Deaths_Civil'] = pd.to_numeric(df['Deaths_Civil'], errors='coerce')

# Convert 'Duration' to integer
df['Duration'] = df['Duration'].astype(int)

# Calculate total deaths
df['Total Deaths'] = df['Deaths_FF'] + df['Deaths_Civil']

# Display the result
print(df[['Duration', 'Total Deaths']])

     Duration  Total Deaths
0          22             0
1           5             0
2           2             0
3           2             0
4           4             0
..        ...           ...
946         6             0
947        29             0
948        30             0
949        25             0
950        22             0

[951 rows x 2 columns]


In [12]:
# Calculate the statistics for each year
yearly_stats_df = df.groupby('Year').agg(
    total_fires=('Fire Name', 'count'),
    total_acres=('Acres', 'sum'),
    average_yearly_duration=('Duration', 'mean'),
    total_deaths=('Total Deaths', 'sum')
).reset_index()
# Convert 'avg_yearly_duration' to integers
yearly_stats_df ['average_yearly_duration'] = yearly_stats_df['average_yearly_duration'].astype(int)
print(yearly_stats_df.head(3))

   Year  total_fires  total_acres  average_yearly_duration  total_deaths
0  2008          150    3135375.0                       14            44
1  2009           44     607371.0                       11             6
2  2010           29     134862.0                       15             0


In [13]:
# Display columns in my data
print(df.columns)

Index(['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Deaths_FF',
       'Deaths_Civil', 'Duration', 'Strux_Destr', 'Strux_Dmgd', 'Year',
       'Month', 'Duration_Per_Month', 'Total Deaths'],
      dtype='object')


In [14]:
# Calculate statistics by Month of each Year
monthly_stats = df.groupby(['Year', 'Month']).agg(
    total_fires=('Fire Name', 'count'),
    total_acres=('Acres', 'sum'),
    avg_monthly_duration=('Duration', 'mean'),  # Calculate average duration
    total_deaths=('Total Deaths', 'sum')
).reset_index()

# Convert 'avg_monthly_duration' to integers
monthly_stats['avg_monthly_duration'] = monthly_stats['avg_monthly_duration'].astype(int)

# Display the first few rows of the monthly_stats DataFrame
print(monthly_stats.head(3))

   Year  Month  total_fires  total_acres  avg_monthly_duration  total_deaths
0  2008      4            2       1353.0                     3             0
1  2008      5            4      20923.0                     5             0
2  2008      6           57    1007829.0                    10            11


In [15]:
# Merge yearly statistical data with information about financial losses: 

# Read the Excel file wiht monetary losses information from "Outputs" folder
file_path = 'Outputs\merged_summary_dollar_data.csv'
df = pd.read_csv(file_path)
# Calculate totals for each year
totals_by_year = df.groupby('Year')['Tot_Damage'].sum().reset_index()
# Merge both datas on year
yearly_stats = pd.merge(yearly_stats_df, totals_by_year, on='Year', how='left')
# Display the merged DataFrame to verify results
print(yearly_stats.tail(3))

    Year  total_fires  total_acres  average_yearly_duration  total_deaths  \
12  2020          125   12994819.0                       20            99   
13  2021           98    9696610.0                       16             0   
14  2022           22     269089.0                       10             0   

      Tot_Damage  
12  4.725059e+09  
13  5.194675e+08  
14           NaN  


In [16]:
# Replace NaN values with 0 in the merged DataFrame
yearly_stats = yearly_stats.fillna(0)

In [17]:
# Save the processed data to be used in HTML
yearly_stats.to_csv('Outputs/yearly_stats.csv', index=False)
monthly_stats.to_csv('Outputs/monthly_stats.csv', index=False)

#Print resullts
print(f"Yearly stats saved to Outputs/yearly_stats.csv") # Display the location and the file name where the data was saved
print(f"Monthly stats saved to Outputs/monthly_stats.csv")  


Yearly stats saved to Outputs/yearly_stats.csv
Monthly stats saved to Outputs/monthly_stats.csv
