In [102]:
# Provisional machine learning model looking the accuracy of predicting forest fires in Alberta, CA
# Segment 1 Deliverables 

In [206]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import tensorflow as tf

In [207]:
# Read CSVs (using raw files from https://wildfire.alberta.ca/resources/historical-data/historical-wildfire-database.aspx until ERD is set up)
fire_df = pd.read_csv("fires_2006to2018.csv")
fire_df


Unnamed: 0,fire_number,fire_name,fire_year,calendar_year,assessment_datetime,assessment_hectares,current_size,size_class,fire_location_latitude,fire_location_longitude,...,fuel_type,other_fuel_type,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,CWF001,,2006,2006,2006-04-02 16:00:00,0.20,0.20,B,51.152933,-115.034600,...,O1b,,2006-04-02 16:00:00,0.20,2006-04-02 16:00:00,0.20,,,2006-04-03 18:00:00,0.20
1,CWF002,,2006,2006,2006-04-03 16:45:00,0.01,0.01,A,51.157633,-115.002133,...,O1b,,2006-04-03 16:45:00,0.01,2006-04-03 16:45:00,0.01,,,2006-04-03 16:50:00,0.01
2,CWF003,,2006,2006,2006-04-08 20:05:00,0.01,0.01,A,51.194400,-114.516167,...,,Campfire,2006-04-08 20:05:00,0.01,2006-04-08 20:05:00,0.01,,,2006-04-09 20:30:00,0.01
3,CWF004,,2006,2006,2006-04-13 18:20:00,0.75,0.75,B,51.125617,-114.841683,...,O1a,,2006-04-13 18:20:00,0.75,2006-04-13 18:20:00,0.75,,,2006-04-13 20:00:00,0.75
4,CWF005,,2006,2006,2006-04-14 17:25:00,0.01,0.01,A,50.409833,-114.478967,...,O1a,,2006-04-14 17:25:00,0.01,2006-04-14 17:25:00,0.01,,,2006-04-14 17:40:00,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,WWF051,,2018,2018,2018-07-29 17:39:00,0.01,0.01,A,54.626783,-115.598067,...,C2,,2018-07-29 18:40:00,0.01,2018-07-29 20:20:00,0.01,,,2018-07-29 20:53:00,0.01
19840,WWF052,,2018,2018,2018-08-20 12:09:00,0.01,0.01,A,54.350250,-115.083683,...,,Duff,2018-08-20 12:09:00,0.01,2018-08-20 14:05:00,0.01,2018-08-20 14:15:00,0.01,2018-08-21 14:09:00,0.01
19841,WWF053,,2018,2018,2018-10-22 14:45:00,0.01,0.01,A,54.022550,-115.668667,...,D1,,2018-10-22 14:45:00,0.01,2018-10-22 14:46:00,0.01,2018-10-22 15:30:00,0.01,2018-10-23 15:54:00,0.01
19842,WWF054,,2018,2018,2018-10-23 15:45:00,0.01,0.01,A,54.023100,-115.669533,...,,Abandoned Campfire,2018-10-23 15:45:00,0.01,2018-10-23 15:45:00,0.01,,,2018-10-23 16:00:00,0.01


In [208]:
# Clean data by removing columns containing less useful data 
clean_fire_df = fire_df.drop(columns=["fire_number", "assessment_hectares", "assessment_datetime", "fire_start_date", "bh_fs_date", "uc_fs_date", "fire_name", "fire_year", "to_fs_date", "to_hectares", "ex_fs_date", "ex_hectares", "industry_identifier_desc", "initial_action_by", "current_size", "size_class", "fire_origin"])
clean_fire_df

Unnamed: 0,calendar_year,fire_location_latitude,fire_location_longitude,general_cause_desc,responsible_group_desc,activity_class,true_cause,permit_detail_desc,det_agent_type,det_agent,...,start_for_fire_date,fire_fighting_start_date,fire_fighting_start_size,fire_type,fire_position_on_slope,weather_conditions_over_fire,fuel_type,other_fuel_type,bh_hectares,uc_hectares
0,2006,51.152933,-115.034600,Resident,Resident,Unclassified,Unsafe Fire,,UNP,310,...,2006-04-02 14:45:00,,,Surface,Flat,Clear,O1b,,0.20,0.20
1,2006,51.157633,-115.002133,Undetermined,,,,,UNP,310,...,2006-04-03 15:50:00,,,Surface,Flat,Cloudy,O1b,,0.01,0.01
2,2006,51.194400,-114.516167,Recreation,Hikers,Cooking,Abandoned Fire,,UNP,310,...,2006-04-08 19:30:00,2006-04-08 20:05:00,0.01,Ground,Bottom,Cloudy,,Campfire,0.01,0.01
3,2006,51.125617,-114.841683,Resident,Resident,Smoking,Burning Substance,,UNP,PUB,...,2006-04-13 17:52:00,,,Surface,Bottom,Cloudy,O1a,,0.75,0.75
4,2006,50.409833,-114.478967,Resident,Resident,Cooking,Unsafe Fire,,UNP,PUB,...,2006-04-14 15:31:00,,,Surface,Flat,Cloudy,O1a,,0.01,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,2018,54.626783,-115.598067,Lightning,,,,,LKT,IM,...,2018-07-29 17:08:00,2018-07-29 17:50:00,0.01,Surface,Flat,Clear,C2,,0.01,0.01
19840,2018,54.350250,-115.083683,Resident,,Debris Disposal,Unsafe Fire,Burning Without Permit,UNP,GOVT,...,2018-08-20 11:49:00,,,Ground,Flat,Clear,,Duff,0.01,0.01
19841,2018,54.022550,-115.668667,Resident,,Debris Disposal,Unsafe Fire,Burning Without Permit,AIR,HAC,...,2018-10-22 13:06:00,,,Surface,Upper 1/3,Clear,D1,,0.01,0.01
19842,2018,54.023100,-115.669533,Resident,,Cooking and Warming,Unsafe Fire,,UNP,LFS,...,2018-10-23 15:15:00,2018-10-23 15:45:00,0.01,Surface,Flat,Clear,,Abandoned Campfire,0.01,0.01


In [209]:
# Review data types
clean_fire_df.dtypes

calendar_year                     int64
fire_location_latitude          float64
fire_location_longitude         float64
general_cause_desc               object
responsible_group_desc           object
activity_class                   object
true_cause                       object
permit_detail_desc               object
det_agent_type                   object
det_agent                        object
discovered_date                  object
reported_date                    object
start_for_fire_date              object
fire_fighting_start_date         object
fire_fighting_start_size        float64
fire_type                        object
fire_position_on_slope           object
weather_conditions_over_fire     object
fuel_type                        object
other_fuel_type                  object
bh_hectares                     float64
uc_hectares                     float64
dtype: object

In [210]:
#Convert float to obj
clean_fire_df["bh_hectares"] = clean_fire_df["bh_hectares"].astype(str)
print()




In [211]:
print(clean_fire_df.dtypes)

calendar_year                     int64
fire_location_latitude          float64
fire_location_longitude         float64
general_cause_desc               object
responsible_group_desc           object
activity_class                   object
true_cause                       object
permit_detail_desc               object
det_agent_type                   object
det_agent                        object
discovered_date                  object
reported_date                    object
start_for_fire_date              object
fire_fighting_start_date         object
fire_fighting_start_size        float64
fire_type                        object
fire_position_on_slope           object
weather_conditions_over_fire     object
fuel_type                        object
other_fuel_type                  object
bh_hectares                      object
uc_hectares                     float64
dtype: object


In [212]:
# Review Null Values
clean_fire_df.isnull().sum()

calendar_year                       0
fire_location_latitude              0
fire_location_longitude             0
general_cause_desc                  0
responsible_group_desc          11454
activity_class                   7721
true_cause                       8990
permit_detail_desc              18766
det_agent_type                      1
det_agent                           1
discovered_date                  2665
reported_date                       1
start_for_fire_date                 0
fire_fighting_start_date         5279
fire_fighting_start_size         5279
fire_type                        2253
fire_position_on_slope           2344
weather_conditions_over_fire     2349
fuel_type                        6040
other_fuel_type                 16075
bh_hectares                         0
uc_hectares                         0
dtype: int64

In [213]:
# Total null values in data
clean_fire_df.isnull().sum().sum()

89218

In [214]:
# Replace Null Values 
final_df = clean_fire_df.fillna(value = "_")
final_df

Unnamed: 0,calendar_year,fire_location_latitude,fire_location_longitude,general_cause_desc,responsible_group_desc,activity_class,true_cause,permit_detail_desc,det_agent_type,det_agent,...,start_for_fire_date,fire_fighting_start_date,fire_fighting_start_size,fire_type,fire_position_on_slope,weather_conditions_over_fire,fuel_type,other_fuel_type,bh_hectares,uc_hectares
0,2006,51.152933,-115.034600,Resident,Resident,Unclassified,Unsafe Fire,_,UNP,310,...,2006-04-02 14:45:00,_,_,Surface,Flat,Clear,O1b,_,0.2,0.20
1,2006,51.157633,-115.002133,Undetermined,_,_,_,_,UNP,310,...,2006-04-03 15:50:00,_,_,Surface,Flat,Cloudy,O1b,_,0.01,0.01
2,2006,51.194400,-114.516167,Recreation,Hikers,Cooking,Abandoned Fire,_,UNP,310,...,2006-04-08 19:30:00,2006-04-08 20:05:00,0.01,Ground,Bottom,Cloudy,_,Campfire,0.01,0.01
3,2006,51.125617,-114.841683,Resident,Resident,Smoking,Burning Substance,_,UNP,PUB,...,2006-04-13 17:52:00,_,_,Surface,Bottom,Cloudy,O1a,_,0.75,0.75
4,2006,50.409833,-114.478967,Resident,Resident,Cooking,Unsafe Fire,_,UNP,PUB,...,2006-04-14 15:31:00,_,_,Surface,Flat,Cloudy,O1a,_,0.01,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,2018,54.626783,-115.598067,Lightning,_,_,_,_,LKT,IM,...,2018-07-29 17:08:00,2018-07-29 17:50:00,0.01,Surface,Flat,Clear,C2,_,0.01,0.01
19840,2018,54.350250,-115.083683,Resident,_,Debris Disposal,Unsafe Fire,Burning Without Permit,UNP,GOVT,...,2018-08-20 11:49:00,_,_,Ground,Flat,Clear,_,Duff,0.01,0.01
19841,2018,54.022550,-115.668667,Resident,_,Debris Disposal,Unsafe Fire,Burning Without Permit,AIR,HAC,...,2018-10-22 13:06:00,_,_,Surface,Upper 1/3,Clear,D1,_,0.01,0.01
19842,2018,54.023100,-115.669533,Resident,_,Cooking and Warming,Unsafe Fire,_,UNP,LFS,...,2018-10-23 15:15:00,2018-10-23 15:45:00,0.01,Surface,Flat,Clear,_,Abandoned Campfire,0.01,0.01


In [215]:
# Generate catergorical variable list 
df_cat = final_df.dtypes[final_df.dtypes == "object"].index.tolist()
df_cat

['general_cause_desc',
 'responsible_group_desc',
 'activity_class',
 'true_cause',
 'permit_detail_desc',
 'det_agent_type',
 'det_agent',
 'discovered_date',
 'reported_date',
 'start_for_fire_date',
 'fire_fighting_start_date',
 'fire_fighting_start_size',
 'fire_type',
 'fire_position_on_slope',
 'weather_conditions_over_fire',
 'fuel_type',
 'other_fuel_type',
 'bh_hectares']

In [216]:
# Check number of unique values in each column
final_df[df_cat].nunique()

general_cause_desc                 15
responsible_group_desc             33
activity_class                     35
true_cause                         22
permit_detail_desc                  3
det_agent_type                      5
det_agent                         148
discovered_date                 16387
reported_date                   18904
start_for_fire_date             18940
fire_fighting_start_date        13856
fire_fighting_start_size          275
fire_type                           6
fire_position_on_slope              6
weather_conditions_over_fire        6
fuel_type                          15
other_fuel_type                  1102
bh_hectares                       676
dtype: int64

In [217]:
# Decide on features and label: 
# Features are date of the fire, latitude, longitude, cause of fire, who detected the fire, was there a permit for the fire, type of fire, weather conditions, fire position, fuel type 
# Output labels is the date & size of the fire being under controlled i.e. how big the fire became & how long it took to be under controlled
# The model will aim to calculate these parameters input and provide the probability in predicting the size of potential forest fires

# Output labels 
y = final_df["bh_hectares"]

# Features data 
X = final_df.drop(columns=["bh_hectares"])



In [218]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [219]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data 
X_trained_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'Resident'