# Exercise 7-1: Prepare the Forest Fires data

**Name:**  Leah Nicholson <br/>
**Class:** Data Visualization DEV228 <br/>
**Date:** 5/24/2025 <br/>
**Description:** Preparation of DataFrame "Forest Fires" for analysis <br/>

## Read the data

In [4]:
import pandas as pd

In [5]:
# NOTE(review): read_pickle unpickles arbitrary Python objects, so only load
# .pkl files that come from a trusted source.
fires_by_month = pd.read_pickle('fires_by_month.pkl')

In [6]:
# display the first 5 rows
fires_by_month.head()

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count
0,AK,1992,5,4202.0,135.0,14
1,AK,1992,6,86401.0,417.0,23
2,AK,1992,7,48516.7,500.0,26
3,AK,1992,8,3305.0,92.0,4
4,AK,1992,9,20.0,1.0,1


## Add and modify columns

In [8]:
# add a column for the mean number of acres_burned per day (for each row)
# Note: this vectorized division does not raise when days_burning is 0; such rows come out as inf
fires_by_month['mean_acres_burned_daily'] = fires_by_month['acres_burned'].div(
    fires_by_month['days_burning'])

In [9]:
fires_by_month.head()

# Sanity check:  acres_burned / days_burning = 4202.0 / 135.0 = 31.1259259 [checks out]

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily
0,AK,1992,5,4202.0,135.0,14,31.125926
1,AK,1992,6,86401.0,417.0,23,207.196643
2,AK,1992,7,48516.7,500.0,26,97.0334
3,AK,1992,8,3305.0,92.0,4,35.923913
4,AK,1992,9,20.0,1.0,1,20.0


In [10]:
# add a second per-day mean column, this time computed row by row with a lambda expression.
# The if-else returns 0 for rows where days_burning is 0 instead of dividing by zero.

fires_by_month['mean_acres_burned_daily_lambda'] = fires_by_month.apply(
    lambda row: 0 if row['days_burning'] == 0 else row['acres_burned'] / row['days_burning'],
    axis=1,
)

# How the statement above works:
# fires_by_month['mean_acres_burned_daily_lambda'] = ...    Creates the new column
# fires_by_month.apply(..., axis=1)    Calls the lambda once per row (axis=1 means row-wise)
# 0 if row['days_burning'] == 0    Gives 0 when the denominator is zero
# else row['acres_burned'] / row['days_burning']    Otherwise returns the quotient for that row

In [11]:
fires_by_month.head()

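# Sanity check:  the first five rows should match [mean_acres_burned_daily == mean_acres_burned_daily_lambda] [checks out!]
# Note: the two columns differ wherever days_burning is 0 - the plain division gives inf, while the lambda gives 0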

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
0,AK,1992,5,4202.0,135.0,14,31.125926,31.125926
1,AK,1992,6,86401.0,417.0,23,207.196643,207.196643
2,AK,1992,7,48516.7,500.0,26,97.0334,97.0334
3,AK,1992,8,3305.0,92.0,4,35.923913,35.923913
4,AK,1992,9,20.0,1.0,1,20.0,20.0


In [12]:
# write a function to convert the fire_month column from an int value to a string value such as 'Jan', 'Feb', etc.
import datetime

def month_to_string(month_int):
    """Convert a month number to its abbreviated month name.

    Parameters
    ----------
    month_int : int or float
        Month number. Whole-number floats such as 5.0 are accepted as well as
        ints, since a numeric column may hold floats rather than ints.

    Returns
    -------
    str or None
        The abbreviated month name (e.g. 'May' for 5) when the value is a
        whole number from 1 to 12; None for out-of-range, fractional, or NaN
        values.
    """
    # NaN fails both comparisons, so it is rejected here along with out-of-range values
    if not 1 <= month_int <= 12:
        return None

    month = int(month_int)
    if month != month_int:
        return None   # a fractional value such as 5.5 is not a valid month

    # datetime.date() needs a complete date, so the year and day are arbitrary placeholders
    return datetime.date(2025, month, 1).strftime('%b')   # %b = abbreviated month name (locale-dependent)


# datetime.date() requires year, month, and day 
# So, an arbitrary value of 2025 (year) and 1 (day) are in the function to meet criteria of a date


In [13]:
# apply the function to the fire_month column
# Note: this overwrites fire_month in place; re-running the cell would pass the already-converted strings to month_to_string
fires_by_month['fire_month'] = fires_by_month['fire_month'].apply(month_to_string)

# apply() is called on a Series here, so month_to_string receives one value at a time;
# Series.apply has no axis parameter (axis is only needed with DataFrame.apply)

In [14]:
# Check work 
fires_by_month.head()

# Sanity check:  5 is May, 6 is Jun, 7 is Jul, 8 is Aug, 9 is Sep [checks out!]

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
0,AK,1992,May,4202.0,135.0,14,31.125926,31.125926
1,AK,1992,Jun,86401.0,417.0,23,207.196643,207.196643
2,AK,1992,Jul,48516.7,500.0,26,97.0334,97.0334
3,AK,1992,Aug,3305.0,92.0,4,35.923913,35.923913
4,AK,1992,Sep,20.0,1.0,1,20.0,20.0


## Work with indexes

In [16]:
# set an index on the state, fire_year, and fire_month columns (passing a list of columns creates a multi-index)
fires_by_month = fires_by_month.set_index(['state', 'fire_year', 'fire_month'])
fires_by_month.head(15)
# The multi-index worked: state, fire_year, and fire_month now appear as hierarchical index levels on the left

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
state,fire_year,fire_month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AK,1992,May,4202.0,135.0,14,31.125926,31.125926
AK,1992,Jun,86401.0,417.0,23,207.196643,207.196643
AK,1992,Jul,48516.7,500.0,26,97.0334,97.0334
AK,1992,Aug,3305.0,92.0,4,35.923913,35.923913
AK,1992,Sep,20.0,1.0,1,20.0,20.0
AK,1993,Apr,113.0,7.0,3,16.142857,16.142857
AK,1993,May,4152.0,112.0,9,37.071429,37.071429
AK,1993,Jun,197740.2,1211.0,59,163.286705,163.286705
AK,1993,Jul,483493.3,1983.0,68,243.819112,243.819112
AK,1993,Aug,1132.0,60.0,5,18.866667,18.866667


In [17]:
# unstack the fire_month column and store the resulting DataFrame in a different variable

fires_by_month_wide = fires_by_month.unstack('fire_month')
fires_by_month_wide.head(10)

# Sanity check:  many NaNs appear, but the long-format DataFrame above has no rows for those months
# unstack() fills every missing (state, fire_year, fire_month) combination with NaN
# A likely reason is that no fires were recorded in those months - plausible, since acres_burned peaks in Jun/Jul/Aug
# Note: the month columns appear in alphabetical order (Apr, Aug, Dec, ...) rather than calendar order, because the month names are strings


Unnamed: 0_level_0,Unnamed: 1_level_0,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,acres_burned,...,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda,mean_acres_burned_daily_lambda
Unnamed: 0_level_1,fire_month,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,...,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep
state,fire_year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
AK,1992,,3305.0,,,,48516.7,86401.0,,4202.0,,...,,,,97.0334,207.196643,,31.125926,,,20.0
AK,1993,113.0,1132.0,,,,483493.3,197740.2,,4152.0,,...,,,,243.819112,163.286705,,37.071429,,,
AK,1994,,24139.0,,,,50510.0,184428.0,200.0,1127.7,,...,,,,102.246964,126.580645,0.0,49.030435,,333.333333,12.5
AK,1995,810.6,,,,,27856.0,5622.0,,9394.0,,...,,,,87.05,20.078571,,52.188889,,14.0,5.0
AK,1996,190.8,10.0,,,,56143.0,446273.4,,92540.0,,...,,,,186.521595,279.09531,,451.414634,,812.5,
AK,1997,14113.0,4930.0,,,,404660.8,1601457.0,300.0,357.2,,...,,,,345.568574,1593.489552,0.0,59.533333,,,
AK,1998,454.2,66.0,,,,33937.2,31701.0,,54445.0,,...,,,,308.52,304.817308,,274.974747,,,
AK,1999,503.79999,,,,,90037.2,914305.4,,714.0,,...,,,,107.571326,639.374406,,102.0,,,1.681818
AK,2000,30.0,30.0,12891.0,,,3571.0,738484.9,,1154.5,,...,1841.571429,,,20.061798,483.61814,,88.807692,,,
AK,2001,,,,,,534.0,213918.9,,1629.3,,...,,,,7.026316,140.182765,,90.516667,,0.0,92.435897


In [18]:
# reset the index for the fires_by_month DataFrame

fires_by_month = fires_by_month.reset_index()    # reset_index() returns a new DataFrame, so the result must be assigned back

# Checked that the multi-index was gone  [checks out!]
# Checked that the last row is labeled 9299  [checks out!]
fires_by_month.tail()

# Note: re-running this cell adds extra columns, because each reset_index() call stores
# the previous index as a column in the DataFrame. To fix, restart the kernel and run all cells.

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
9295,WY,2015,Aug,4646.3,150.0,25,30.975333,30.975333
9296,WY,2015,Sep,5977.0,126.0,25,47.436508,47.436508
9297,WY,2015,Oct,10337.8,15.0,6,689.186667,689.186667
9298,WY,2015,Nov,509.3,3.0,4,169.766667,169.766667
9299,WY,2015,Dec,72.0,0.0,2,inf,0.0


## Combine data

In [20]:
# create new fire data
# NOTE(review): 'June' is spelled out here, while the existing rows use the three-letter
# abbreviations produced by month_to_string (e.g. 'Jun') - confirm which form is intended
new_fire = pd.DataFrame(data=[['CA',2021,'June',1000,100,1,10,10]], columns=fires_by_month.columns)
new_fire.head()

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
0,CA,2021,June,1000,100,1,10,10


In [21]:
# add the new fire data to the end of the original DataFrame

fires_by_month = pd.concat([fires_by_month, new_fire])   
# ignore_index=True is left out on purpose, so the new row keeps its own index label (0);
# the index is rebuilt with reset_index(drop = True) in a later cell

In [22]:
# display the last five rows of the original DataFrame
fires_by_month.tail()

# Sanity check:  new_fire should be appended to the bottom of fires_by_month, i.e. ['CA',2021,'June',1000,100,1,10,10] is the last row [checks out!]

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
9296,WY,2015,Sep,5977.0,126.0,25,47.436508,47.436508
9297,WY,2015,Oct,10337.8,15.0,6,689.186667,689.186667
9298,WY,2015,Nov,509.3,3.0,4,169.766667,169.766667
9299,WY,2015,Dec,72.0,0.0,2,inf,0.0
0,CA,2021,June,1000.0,100.0,1,10.0,10.0


In [23]:
# reset the index for the original DataFrame
# drop=True discards the old index labels instead of keeping them as a new column
fires_by_month = fires_by_month.reset_index(drop=True)


In [24]:
# display the last five rows of the DataFrame again
fires_by_month.tail()

# Sanity check:  Last entry is now 9300 instead of 0 [checks out!]

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
9296,WY,2015,Sep,5977.0,126.0,25,47.436508,47.436508
9297,WY,2015,Oct,10337.8,15.0,6,689.186667,689.186667
9298,WY,2015,Nov,509.3,3.0,4,169.766667,169.766667
9299,WY,2015,Dec,72.0,0.0,2,inf,0.0
9300,CA,2021,June,1000.0,100.0,1,10.0,10.0


## Fix the SettingWithCopyWarning

In [26]:
# the cell that originally caused the SettingWithCopyWarning, now fixed
# .copy() makes fires_ak an independent DataFrame, which removes the SettingWithCopyWarning
fires_ak = fires_by_month.query('state == "AK"').copy()
# the column name was changed from mean_acres_per_day to mean_acres_burned_daily to match this notebook's column
fires_ak['mean_acres_burned_daily'] = fires_ak['mean_acres_burned_daily'].round()
fires_ak.head()

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
0,AK,1992,May,4202.0,135.0,14,31.0,31.125926
1,AK,1992,Jun,86401.0,417.0,23,207.0,207.196643
2,AK,1992,Jul,48516.7,500.0,26,97.0,97.0334
3,AK,1992,Aug,3305.0,92.0,4,36.0,35.923913
4,AK,1992,Sep,20.0,1.0,1,20.0,20.0


In [27]:
fires_by_month.head()
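# The original DataFrame still holds the unrounded values, so modifying fires_ak did not alter it.
# The explicit .copy() is still worth keeping: it makes fires_ak independent of fires_by_month and silences the SettingWithCopyWarning.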

Unnamed: 0,state,fire_year,fire_month,acres_burned,days_burning,fire_count,mean_acres_burned_daily,mean_acres_burned_daily_lambda
0,AK,1992,May,4202.0,135.0,14,31.125926,31.125926
1,AK,1992,Jun,86401.0,417.0,23,207.196643,207.196643
2,AK,1992,Jul,48516.7,500.0,26,97.0334,97.0334
3,AK,1992,Aug,3305.0,92.0,4,35.923913,35.923913
4,AK,1992,Sep,20.0,1.0,1,20.0,20.0
