In [566]:
# Provisional machine learning model looking the accuracy of predicting forest fires in Alberta, CA
# Segment 1 Deliverables 

In [567]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import pandas as pd
import tensorflow as tf
import numpy as np

In [568]:
# Read CSVs (using raw files from https://wildfire.alberta.ca/resources/historical-data/historical-wildfire-database.aspx until ERD is set up)
fire_df = pd.read_csv("fires_2006to2018.csv")
fire_df


Unnamed: 0,fire_number,fire_name,fire_year,calendar_year,assessment_datetime,assessment_hectares,current_size,size_class,fire_location_latitude,fire_location_longitude,...,fuel_type,other_fuel_type,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,CWF001,,2006,2006,2006-04-02 16:00:00,0.20,0.20,B,51.152933,-115.034600,...,O1b,,2006-04-02 16:00:00,0.20,2006-04-02 16:00:00,0.20,,,2006-04-03 18:00:00,0.20
1,CWF002,,2006,2006,2006-04-03 16:45:00,0.01,0.01,A,51.157633,-115.002133,...,O1b,,2006-04-03 16:45:00,0.01,2006-04-03 16:45:00,0.01,,,2006-04-03 16:50:00,0.01
2,CWF003,,2006,2006,2006-04-08 20:05:00,0.01,0.01,A,51.194400,-114.516167,...,,Campfire,2006-04-08 20:05:00,0.01,2006-04-08 20:05:00,0.01,,,2006-04-09 20:30:00,0.01
3,CWF004,,2006,2006,2006-04-13 18:20:00,0.75,0.75,B,51.125617,-114.841683,...,O1a,,2006-04-13 18:20:00,0.75,2006-04-13 18:20:00,0.75,,,2006-04-13 20:00:00,0.75
4,CWF005,,2006,2006,2006-04-14 17:25:00,0.01,0.01,A,50.409833,-114.478967,...,O1a,,2006-04-14 17:25:00,0.01,2006-04-14 17:25:00,0.01,,,2006-04-14 17:40:00,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,WWF051,,2018,2018,2018-07-29 17:39:00,0.01,0.01,A,54.626783,-115.598067,...,C2,,2018-07-29 18:40:00,0.01,2018-07-29 20:20:00,0.01,,,2018-07-29 20:53:00,0.01
19840,WWF052,,2018,2018,2018-08-20 12:09:00,0.01,0.01,A,54.350250,-115.083683,...,,Duff,2018-08-20 12:09:00,0.01,2018-08-20 14:05:00,0.01,2018-08-20 14:15:00,0.01,2018-08-21 14:09:00,0.01
19841,WWF053,,2018,2018,2018-10-22 14:45:00,0.01,0.01,A,54.022550,-115.668667,...,D1,,2018-10-22 14:45:00,0.01,2018-10-22 14:46:00,0.01,2018-10-22 15:30:00,0.01,2018-10-23 15:54:00,0.01
19842,WWF054,,2018,2018,2018-10-23 15:45:00,0.01,0.01,A,54.023100,-115.669533,...,,Abandoned Campfire,2018-10-23 15:45:00,0.01,2018-10-23 15:45:00,0.01,,,2018-10-23 16:00:00,0.01


In [569]:
# Clean data by removing columns containing less useful data 
# Note that dates require reformatting for final modelling (currently removed for simplicity)
clean_fire_df = fire_df.drop(columns=["fire_number", "assessment_hectares", "assessment_datetime", "fire_start_date", "bh_fs_date", "uc_fs_date", "fire_name", "fire_year", "to_fs_date", "to_hectares", "ex_fs_date", "ex_hectares", "industry_identifier_desc", "initial_action_by", "current_size", "size_class", "fire_origin", "permit_detail_desc", "det_agent_type", "det_agent", "fire_fighting_start_date", "fire_fighting_start_size", "responsible_group_desc", "reported_date", "start_for_fire_date", "discovered_date"])
clean_fire_df

Unnamed: 0,calendar_year,fire_location_latitude,fire_location_longitude,general_cause_desc,activity_class,true_cause,fire_type,fire_position_on_slope,weather_conditions_over_fire,fuel_type,other_fuel_type,bh_hectares,uc_hectares
0,2006,51.152933,-115.034600,Resident,Unclassified,Unsafe Fire,Surface,Flat,Clear,O1b,,0.20,0.20
1,2006,51.157633,-115.002133,Undetermined,,,Surface,Flat,Cloudy,O1b,,0.01,0.01
2,2006,51.194400,-114.516167,Recreation,Cooking,Abandoned Fire,Ground,Bottom,Cloudy,,Campfire,0.01,0.01
3,2006,51.125617,-114.841683,Resident,Smoking,Burning Substance,Surface,Bottom,Cloudy,O1a,,0.75,0.75
4,2006,50.409833,-114.478967,Resident,Cooking,Unsafe Fire,Surface,Flat,Cloudy,O1a,,0.01,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,2018,54.626783,-115.598067,Lightning,,,Surface,Flat,Clear,C2,,0.01,0.01
19840,2018,54.350250,-115.083683,Resident,Debris Disposal,Unsafe Fire,Ground,Flat,Clear,,Duff,0.01,0.01
19841,2018,54.022550,-115.668667,Resident,Debris Disposal,Unsafe Fire,Surface,Upper 1/3,Clear,D1,,0.01,0.01
19842,2018,54.023100,-115.669533,Resident,Cooking and Warming,Unsafe Fire,Surface,Flat,Clear,,Abandoned Campfire,0.01,0.01


In [570]:
# Review data types
clean_fire_df.dtypes

calendar_year                     int64
fire_location_latitude          float64
fire_location_longitude         float64
general_cause_desc               object
activity_class                   object
true_cause                       object
fire_type                        object
fire_position_on_slope           object
weather_conditions_over_fire     object
fuel_type                        object
other_fuel_type                  object
bh_hectares                     float64
uc_hectares                     float64
dtype: object

In [571]:
# Convert data types
clean_fire_df = clean_fire_df.astype({'calendar_year': 'float64'})

In [572]:
#Convert float to obj
#clean_fire_df["bh_hectares"] = clean_fire_df["bh_hectares"].astype(str)

In [586]:
# Check if changing data types worked
print(clean_fire_df.dtypes)

calendar_year                   float64
fire_location_latitude          float64
fire_location_longitude         float64
general_cause_desc               object
activity_class                   object
true_cause                       object
fire_type                        object
fire_position_on_slope           object
weather_conditions_over_fire     object
fuel_type                        object
other_fuel_type                  object
bh_hectares                     float64
uc_hectares                     float64
dtype: object


In [587]:
# Review Null Values
clean_fire_df.isnull().sum()

calendar_year                       0
fire_location_latitude              0
fire_location_longitude             0
general_cause_desc                  0
activity_class                   7721
true_cause                       8990
fire_type                        2253
fire_position_on_slope           2344
weather_conditions_over_fire     2349
fuel_type                        6040
other_fuel_type                 16075
bh_hectares                         0
uc_hectares                         0
dtype: int64

In [588]:
# Total null values in data
clean_fire_df.isnull().sum().sum()

45772

In [599]:
# Replace Null Values with blank
final_df = clean_fire_df.fillna(value = "_")
final_df

Unnamed: 0,calendar_year,fire_location_latitude,fire_location_longitude,general_cause_desc,activity_class,true_cause,fire_type,fire_position_on_slope,weather_conditions_over_fire,fuel_type,other_fuel_type,bh_hectares,uc_hectares
0,2006.0,51.152933,-115.034600,Resident,Unclassified,Unsafe Fire,Surface,Flat,Clear,O1b,_,0.20,0.20
1,2006.0,51.157633,-115.002133,Undetermined,_,_,Surface,Flat,Cloudy,O1b,_,0.01,0.01
2,2006.0,51.194400,-114.516167,Recreation,Cooking,Abandoned Fire,Ground,Bottom,Cloudy,_,Campfire,0.01,0.01
3,2006.0,51.125617,-114.841683,Resident,Smoking,Burning Substance,Surface,Bottom,Cloudy,O1a,_,0.75,0.75
4,2006.0,50.409833,-114.478967,Resident,Cooking,Unsafe Fire,Surface,Flat,Cloudy,O1a,_,0.01,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,2018.0,54.626783,-115.598067,Lightning,_,_,Surface,Flat,Clear,C2,_,0.01,0.01
19840,2018.0,54.350250,-115.083683,Resident,Debris Disposal,Unsafe Fire,Ground,Flat,Clear,_,Duff,0.01,0.01
19841,2018.0,54.022550,-115.668667,Resident,Debris Disposal,Unsafe Fire,Surface,Upper 1/3,Clear,D1,_,0.01,0.01
19842,2018.0,54.023100,-115.669533,Resident,Cooking and Warming,Unsafe Fire,Surface,Flat,Clear,_,Abandoned Campfire,0.01,0.01


In [600]:
# Join fuel type columns
final_df ['all_fuel_type'] = final_df['fuel_type'] + final_df['other_fuel_type']
final_df


Unnamed: 0,calendar_year,fire_location_latitude,fire_location_longitude,general_cause_desc,activity_class,true_cause,fire_type,fire_position_on_slope,weather_conditions_over_fire,fuel_type,other_fuel_type,bh_hectares,uc_hectares,all_fuel_type
0,2006.0,51.152933,-115.034600,Resident,Unclassified,Unsafe Fire,Surface,Flat,Clear,O1b,_,0.20,0.20,O1b_
1,2006.0,51.157633,-115.002133,Undetermined,_,_,Surface,Flat,Cloudy,O1b,_,0.01,0.01,O1b_
2,2006.0,51.194400,-114.516167,Recreation,Cooking,Abandoned Fire,Ground,Bottom,Cloudy,_,Campfire,0.01,0.01,_Campfire
3,2006.0,51.125617,-114.841683,Resident,Smoking,Burning Substance,Surface,Bottom,Cloudy,O1a,_,0.75,0.75,O1a_
4,2006.0,50.409833,-114.478967,Resident,Cooking,Unsafe Fire,Surface,Flat,Cloudy,O1a,_,0.01,0.01,O1a_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,2018.0,54.626783,-115.598067,Lightning,_,_,Surface,Flat,Clear,C2,_,0.01,0.01,C2_
19840,2018.0,54.350250,-115.083683,Resident,Debris Disposal,Unsafe Fire,Ground,Flat,Clear,_,Duff,0.01,0.01,_Duff
19841,2018.0,54.022550,-115.668667,Resident,Debris Disposal,Unsafe Fire,Surface,Upper 1/3,Clear,D1,_,0.01,0.01,D1_
19842,2018.0,54.023100,-115.669533,Resident,Cooking and Warming,Unsafe Fire,Surface,Flat,Clear,_,Abandoned Campfire,0.01,0.01,_Abandoned Campfire


In [601]:
# Drop fuel type & other fuel type columns
final_df.drop(['fuel_type', 'other_fuel_type'], axis=1, inplace=True)

In [602]:
# Generate catergorical variable list 
df_cat = final_df.dtypes[final_df.dtypes == "object"].index.tolist()
df_cat

['general_cause_desc',
 'activity_class',
 'true_cause',
 'fire_type',
 'fire_position_on_slope',
 'weather_conditions_over_fire',
 'all_fuel_type']

In [603]:
# Check number of unique values in each column
final_df[df_cat].nunique()

general_cause_desc                15
activity_class                    35
true_cause                        22
fire_type                          6
fire_position_on_slope             6
weather_conditions_over_fire       6
all_fuel_type                   1121
dtype: int64

In [604]:
# Print out the fuel type value counts
fuel_counts = final_df.all_fuel_type.value_counts()
fuel_counts

C2_               5173
O1a_              2874
__                2278
M2_               1774
O1b_              1380
                  ... 
_grass/brush         1
_Furniture           1
_School bus          1
_Dry Log Decks       1
_ground fire         1
Name: all_fuel_type, Length: 1121, dtype: int64

In [605]:
# Determine which values to replace 
replace_fueltypes = list(fuel_counts[fuel_counts < 100].index)

# Replace in DataFrame
for fueltypes in replace_fueltypes:
    final_df.all_fuel_type = final_df.all_fuel_type.replace(fueltypes, "Other")

# Check to make sure binning was successful
final_df.all_fuel_type.value_counts

<bound method IndexOpsMixin.value_counts of 0             O1b_
1             O1b_
2        _Campfire
3             O1a_
4             O1a_
           ...    
19839          C2_
19840        Other
19841          D1_
19842        Other
19843          M1_
Name: all_fuel_type, Length: 19844, dtype: object>

In [606]:
# Print out the fuel type value counts
fuel_counts = final_df.all_fuel_type.value_counts()
fuel_counts

C2_                    5173
O1a_                   2874
Other                  2749
__                     2278
M2_                    1774
O1b_                   1380
C3_                     538
M1_                     477
_Campfire               473
C1_                     446
D1_                     350
S1_                     331
S2_                     329
_Abandoned campfire     306
_Garbage                142
C4_                     112
_campfire               112
Name: all_fuel_type, dtype: int64

In [607]:
# Check number of unique values in each column
final_df[df_cat].nunique()

general_cause_desc              15
activity_class                  35
true_cause                      22
fire_type                        6
fire_position_on_slope           6
weather_conditions_over_fire     6
all_fuel_type                   17
dtype: int64

In [None]:
# Need to review other cat values and keep binning if over 10 values

In [608]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit the encoder & produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(final_df[df_cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names([df_cat])
encode_df.head()

ValueError: input_features should have length equal to number of features (7), got 1

In [None]:
# Merge the two Dataframes together and drop all fuel types column
## final_df.merge(encode_df, left_index=True, right_index=True).drop('all_fuel_type',1)

In [None]:
# Decide on features and label: 
# Features are date of the fire, latitude, longitude, cause of fire, type of fire, weather conditions, fire position, fuel type 
# Output labels is the date & size of the fire being under controlled i.e. how big the fire became
# The model will aim to calculate these parameters input and provide the probability in predicting the size of potential forest fires

# Output labels 
y = final_df["bh_hectares"]

# Features data 
X = final_df.drop(columns=["bh_hectares"])



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data 
X_trained_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'Lightning'