In [1]:
import pandas as pd

In [2]:
# Load the cleaned data
file_path = 'Outputs/fires_2015_2019_cleaned.csv'
data = pd.read_csv(file_path)

In [3]:
# Remove commas and convert to numeric
data['Acres'] = data['Acres'].str.replace(',', '').astype(float)

In [4]:
## Convert 'Start' and 'Contained" to datetime format to avoid errors
data['Start'] = pd.to_datetime(data['Start'], errors='coerce')
data['Contained'] = pd.to_datetime(data['Contained'], errors='coerce')

In [5]:
# Create a new column for the 'Year' and 'Month'
data['Year'] = data['Start'].dt.year
data['Month'] = data['Start'].dt.month
# Print data 3 first lines to verify correctness
print(data[['Start', 'Contained', 'Year', 'Month']].head(3))

       Start  Contained  Year  Month
0 2015-02-06 2015-02-10  2015      2
1 2015-02-06 2015-02-13  2015      2
2 2015-04-18 2015-04-24  2015      4


In [7]:
def calculate_duration(row):
    # Initialize a dictionary to hold the duration for each month
    duration_per_month = {}

    start_date = row['Start']
    end_date = row['Contained']
    
    # Calculate duration for the first month
    end_of_start_month = start_date + pd.offsets.MonthEnd(0)
    first_month_duration = (end_of_start_month - start_date).days + 1

    # Assign the duration to the start month
    duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = first_month_duration

    # If 'Contained' is in the same month as 'Start'
    if start_date.month == end_date.month and start_date.year == end_date.year:
        duration_per_month[f'{start_date.year}-{start_date.month:02d}'] = (end_date - start_date).days + 1
    else:
        # Handle months between 'Start' and 'Contained'
        current_month = start_date + pd.offsets.MonthBegin(1)
        while current_month < end_date:
            # Full month duration
            days_in_month = current_month + pd.offsets.MonthEnd(0)
            duration_per_month[f'{current_month.year}-{current_month.month:02d}'] = (days_in_month - current_month).days + 1
            current_month = current_month + pd.offsets.MonthBegin(1)

        # Calculate duration for the last month
        last_month_duration = (end_date - (end_date - pd.offsets.MonthBegin(0))).days + 1
        duration_per_month[f'{end_date.year}-{end_date.month:02d}'] = last_month_duration

    return duration_per_month

# Apply the function to calculate the duration per month for each row
data['Duration_Per_Month'] = data.apply(calculate_duration, axis=1)

# Display the result to check
print(data[['Start', 'Contained', 'Duration_Per_Month']])

         Start  Contained              Duration_Per_Month
0   2015-02-06 2015-02-10                  {'2015-02': 5}
1   2015-02-06 2015-02-13                  {'2015-02': 8}
2   2015-04-18 2015-04-24                  {'2015-04': 7}
3   2015-04-28 2015-04-29                  {'2015-04': 2}
4   2015-04-28 2015-04-29                  {'2015-04': 2}
..         ...        ...                             ...
383 2019-10-31 2019-11-03  {'2019-10': 1, '2019-11': -27}
384 2019-10-31 2019-11-06  {'2019-10': 1, '2019-11': -24}
385 2019-11-03 2019-11-06                  {'2019-11': 4}
386 2019-11-25 2019-11-25                  {'2019-11': 1}
387 2019-11-25 2019-12-13  {'2019-11': 6, '2019-12': -18}

[388 rows x 3 columns]


In [8]:
# Function to identify rows where any month has a duration greater than or equal to 30 days
# to ensure it is assigned to the correct month
def has_long_duration(duration_dict):
    return any(duration >= 30 for duration in duration_dict.values())

# Apply the function to create a boolean mask
long_duration_mask = data['Duration_Per_Month'].apply(has_long_duration)

# Filter the rows based on the mask
long_durations = data[long_duration_mask]

# Display the results
if not long_durations.empty:
    print("Rows with duration in any month more or equal to 30 days:")
    print(long_durations[['Start', 'Contained', 'Duration_Per_Month']])
else:
    print("No durations found that exceed 30 days in any month.")

Rows with duration in any month more or equal to 30 days:
         Start  Contained                                 Duration_Per_Month
18  2015-07-19 2015-09-05     {'2015-07': 13, '2015-08': 31, '2015-09': -25}
29  2015-07-30 2015-09-03      {'2015-07': 2, '2015-08': 31, '2015-09': -27}
30  2015-07-30 2015-10-15  {'2015-07': 2, '2015-08': 31, '2015-09': 30, '...
32  2015-07-31 2015-10-15  {'2015-07': 1, '2015-08': 31, '2015-09': 30, '...
33  2015-07-31 2015-12-01  {'2015-07': 1, '2015-08': 31, '2015-09': 30, '...
34  2015-08-01 2015-09-30                      {'2015-08': 31, '2015-09': 0}
35  2015-08-01 2015-10-01       {'2015-08': 31, '2015-09': 30, '2015-10': 1}
63  2016-06-07 2016-10-02  {'2016-06': 24, '2016-07': 31, '2016-08': 31, ...
71  2016-06-26 2016-10-31  {'2016-06': 5, '2016-07': 31, '2016-08': 31, '...
73  2016-06-28 2016-08-14      {'2016-06': 3, '2016-07': 31, '2016-08': -17}
85  2016-07-22 2016-10-13  {'2016-07': 10, '2016-08': 31, '2016-09': 30, ...
119 2016-09-26 201

In [9]:
# Convert to numeric in case there are any non-numeric values in the new columns
data['Deaths_FF'] = pd.to_numeric(data['Deaths_FF'], errors='coerce')
data['Deaths_Civil'] = pd.to_numeric(data['Deaths_Civil'], errors='coerce')
data['Duration_Per_Month'] = pd.to_numeric(data['Duration_Per_Month'], errors='coerce')

#Drop Nan values and converting them to integers in 'Duration_Per_Month' column
data['Duration_Per_Month'] = pd.to_numeric(data['Duration_Per_Month'], errors='coerce').fillna(0).astype(int)


# Now calculate total deaths
data['Total Deaths'] = data['Deaths_FF'] + data['Deaths_Civil']
print(data[['Duration_Per_Month', 'Total Deaths']])

     Duration_Per_Month  Total Deaths
0                     0             0
1                     0             0
2                     0             0
3                     0             0
4                     0             0
..                  ...           ...
383                   0             0
384                   0             0
385                   0             0
386                   0             0
387                   0             0

[388 rows x 2 columns]


In [10]:
# Calculate the statistics for each year
yearly_stats = data.groupby('Year').agg(
    total_fires=('Fire Name', 'count'),
    total_acres=('Acres', 'sum'),
    average_duration=('Duration_Per_Month', 'mean'),
    total_deaths=('Total Deaths', 'sum')
).reset_index()
print(yearly_stats.head(3))

   Year  total_fires  total_acres  average_duration  total_deaths
0  2015           55     773415.0               0.0             7
1  2016           70     546828.0               0.0             8
2  2017          124    1417659.0               0.0            47


In [11]:
# Display columns in my data
print(data.columns)

Index(['County', 'Fire Name', 'Start', 'Contained', 'Acres', 'Strux_Destr',
       'Strux_Dmgd', 'Deaths_FF', 'Deaths_Civil', 'Duration', 'Year', 'Month',
       'Duration_Per_Month', 'Total Deaths'],
      dtype='object')


In [12]:
# Calculate statistics by Month of each Year
monthly_stats = data.groupby(['Year', 'Month']).agg(
    total_fires=('Fire Name', 'count'),
    total_acres=('Acres', 'sum'),
    avg_duration=('Duration_Per_Month', 'mean'),  # Calculate average duration
    total_deaths=('Total Deaths', 'sum')
).reset_index()

# Display the first few rows of the monthly_stats DataFrame
print(monthly_stats.head(3))

   Year  Month  total_fires  total_acres  avg_duration  total_deaths
0  2015      2            2       7509.0           0.0             0
1  2015      4            4       7874.0           0.0             0
2  2015      5            1        692.0           0.0             0


In [13]:
# Save the processed data to be used in HTML
yearly_stats.to_csv('Outputs/yearly_stats.csv', index=False)
monthly_stats.to_csv('Outputs/monthly_stats.csv', index=False)

print(f"Yearly stats saved to Outputs/yearly_stats.csv") # Display the location and the file name where the data was saved
print(f"Monthly stats saved to Outputs/monthly_stats.csv")  


Yearly stats saved to Outputs/yearly_stats.csv
Monthly stats saved to Outputs/monthly_stats.csv
