In [341]:
# Provisional machine learning model looking the accuracy of predicting forest fires in Alberta, CA
# Segment 1 Deliverables 

In [342]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

In [343]:
# Read CSVs (using raw files from https://wildfire.alberta.ca/resources/historical-data/historical-wildfire-database.aspx until ERD is set up)
fire_df = pd.read_csv("fires_2006to2018.csv")
fire_df


Unnamed: 0,fire_number,fire_name,fire_year,calendar_year,assessment_datetime,assessment_hectares,current_size,size_class,fire_location_latitude,fire_location_longitude,...,fuel_type,other_fuel_type,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,CWF001,,2006,2006,2006-04-02 16:00:00,0.20,0.20,B,51.152933,-115.034600,...,O1b,,2006-04-02 16:00:00,0.20,2006-04-02 16:00:00,0.20,,,2006-04-03 18:00:00,0.20
1,CWF002,,2006,2006,2006-04-03 16:45:00,0.01,0.01,A,51.157633,-115.002133,...,O1b,,2006-04-03 16:45:00,0.01,2006-04-03 16:45:00,0.01,,,2006-04-03 16:50:00,0.01
2,CWF003,,2006,2006,2006-04-08 20:05:00,0.01,0.01,A,51.194400,-114.516167,...,,Campfire,2006-04-08 20:05:00,0.01,2006-04-08 20:05:00,0.01,,,2006-04-09 20:30:00,0.01
3,CWF004,,2006,2006,2006-04-13 18:20:00,0.75,0.75,B,51.125617,-114.841683,...,O1a,,2006-04-13 18:20:00,0.75,2006-04-13 18:20:00,0.75,,,2006-04-13 20:00:00,0.75
4,CWF005,,2006,2006,2006-04-14 17:25:00,0.01,0.01,A,50.409833,-114.478967,...,O1a,,2006-04-14 17:25:00,0.01,2006-04-14 17:25:00,0.01,,,2006-04-14 17:40:00,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19839,WWF051,,2018,2018,2018-07-29 17:39:00,0.01,0.01,A,54.626783,-115.598067,...,C2,,2018-07-29 18:40:00,0.01,2018-07-29 20:20:00,0.01,,,2018-07-29 20:53:00,0.01
19840,WWF052,,2018,2018,2018-08-20 12:09:00,0.01,0.01,A,54.350250,-115.083683,...,,Duff,2018-08-20 12:09:00,0.01,2018-08-20 14:05:00,0.01,2018-08-20 14:15:00,0.01,2018-08-21 14:09:00,0.01
19841,WWF053,,2018,2018,2018-10-22 14:45:00,0.01,0.01,A,54.022550,-115.668667,...,D1,,2018-10-22 14:45:00,0.01,2018-10-22 14:46:00,0.01,2018-10-22 15:30:00,0.01,2018-10-23 15:54:00,0.01
19842,WWF054,,2018,2018,2018-10-23 15:45:00,0.01,0.01,A,54.023100,-115.669533,...,,Abandoned Campfire,2018-10-23 15:45:00,0.01,2018-10-23 15:45:00,0.01,,,2018-10-23 16:00:00,0.01


In [344]:
# Clean data by creating DataFrame with selected columns
clean_fire_df = fire_df[['start_for_fire_date', 'discovered_date', 'fire_fighting_start_size','bh_hectares' ]].copy()
clean_fire_df

Unnamed: 0,start_for_fire_date,discovered_date,fire_fighting_start_size,bh_hectares
0,2006-04-02 14:45:00,2006-04-02 14:27:00,,0.20
1,2006-04-03 15:50:00,,,0.01
2,2006-04-08 19:30:00,,0.01,0.01
3,2006-04-13 17:52:00,2006-04-13 17:33:00,,0.75
4,2006-04-14 15:31:00,2006-04-14 15:17:00,,0.01
...,...,...,...,...
19839,2018-07-29 17:08:00,2018-07-29 16:58:00,0.01,0.01
19840,2018-08-20 11:49:00,,,0.01
19841,2018-10-22 13:06:00,2018-10-22 13:06:00,,0.01
19842,2018-10-23 15:15:00,2018-10-23 15:15:00,0.01,0.01


In [345]:
# Review data types
clean_fire_df.dtypes

start_for_fire_date          object
discovered_date              object
fire_fighting_start_size    float64
bh_hectares                 float64
dtype: object

In [346]:
# Convert objects to dates for the 2 columns
clean_fire_df[['start_for_fire_date', 'discovered_date']] = clean_fire_df[['start_for_fire_date', 'discovered_date']].apply(pd.to_datetime)

In [347]:
# Check that the changed worked
clean_fire_df.dtypes

start_for_fire_date         datetime64[ns]
discovered_date             datetime64[ns]
fire_fighting_start_size           float64
bh_hectares                        float64
dtype: object

In [348]:
# Review Null Values
clean_fire_df.isnull().sum()

start_for_fire_date            0
discovered_date             2665
fire_fighting_start_size    5279
bh_hectares                    0
dtype: int64

In [349]:
# Total null values in data
clean_fire_df.isnull().sum().sum()

7944

In [350]:
# Replace Null Values in size columns with 0
clean_fire_df['fire_fighting_start_size'].fillna(value=0, inplace=True)


In [351]:
# Replace Null Values in discovered  date with start for fire date values
clean_fire_df['discovered_date'].fillna(clean_fire_df['start_for_fire_date'], inplace=True)

In [352]:
# Review Null Values
clean_fire_df.isnull().sum()

start_for_fire_date         0
discovered_date             0
fire_fighting_start_size    0
bh_hectares                 0
dtype: int64

In [353]:
# Create an hours between column by subtracting start for fire date to discovered date so data can be scaled
clean_fire_df['hours'] = clean_fire_df['start_for_fire_date'].sub(clean_fire_df['discovered_date'], axis=0).dt.total_seconds()
clean_fire_df

Unnamed: 0,start_for_fire_date,discovered_date,fire_fighting_start_size,bh_hectares,hours
0,2006-04-02 14:45:00,2006-04-02 14:27:00,0.00,0.20,1080.0
1,2006-04-03 15:50:00,2006-04-03 15:50:00,0.00,0.01,0.0
2,2006-04-08 19:30:00,2006-04-08 19:30:00,0.01,0.01,0.0
3,2006-04-13 17:52:00,2006-04-13 17:33:00,0.00,0.75,1140.0
4,2006-04-14 15:31:00,2006-04-14 15:17:00,0.00,0.01,840.0
...,...,...,...,...,...
19839,2018-07-29 17:08:00,2018-07-29 16:58:00,0.01,0.01,600.0
19840,2018-08-20 11:49:00,2018-08-20 11:49:00,0.00,0.01,0.0
19841,2018-10-22 13:06:00,2018-10-22 13:06:00,0.00,0.01,0.0
19842,2018-10-23 15:15:00,2018-10-23 15:15:00,0.01,0.01,0.0


In [354]:
# Drop date columns for scaling 
final_df = clean_fire_df[['hours', 'fire_fighting_start_size', 'bh_hectares']].copy()
final_df

Unnamed: 0,hours,fire_fighting_start_size,bh_hectares
0,1080.0,0.00,0.20
1,0.0,0.00,0.01
2,0.0,0.01,0.01
3,1140.0,0.00,0.75
4,840.0,0.00,0.01
...,...,...,...
19839,600.0,0.01,0.01
19840,0.0,0.00,0.01
19841,0.0,0.00,0.01
19842,0.0,0.01,0.01


In [355]:
# Check all data types are the same for scaling
final_df.dtypes

hours                       float64
fire_fighting_start_size    float64
bh_hectares                 float64
dtype: object

In [356]:
# Decide on features and label: 
#"user-input" for how many days before a fire is predicted, we estimate the size before and the fire size is being held i.e. predicting change in starting fire size and discovered size by the number of days 

# Split our preprocessed data into our features and target arrays
# Output labels 
y = final_df["bh_hectares"]

# Features data 
X = final_df.drop(columns=["bh_hectares"])

# Split the preprocessed data into a training & test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [357]:
# The shape of X is 39688 samples, with a single feature (column)
X.shape

(19844, 2)

In [358]:
# Check y shape samples
y.shape

(19844,)

In [359]:
# Define the linear regression model
model = LinearRegression()


In [360]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data 
X_trained_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [361]:
# Train the model
model.fit(X_train, y_train)


LinearRegression()

In [362]:
# Evaluate the model
y_pred = model.predict(X_test_scaled)
y_pred

array([125.85138547, 125.84935738, 125.84969547, ..., 125.84969541,
       125.84935723, 125.85273739])

In [363]:
# Retrieving the model intercept and slope 
print(model.coef_)
print(model.intercept_)

[-3.17564407e-05  2.96492937e+00]
125.97131328637674


In [364]:
# Evaluation model in dataframe 
final_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
final_df

Unnamed: 0,Actual,Predicted
14607,0.20,125.851385
5910,0.01,125.849357
855,0.01,125.849695
5143,0.01,125.849695
4454,1.00,125.883158
...,...,...
3735,0.01,125.849683
4719,1.25,125.859498
8417,0.01,125.849695
2773,0.50,125.849357
