In [1]:
import pandas as pd
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix
from sklearn.utils import resample
from xgboost import XGBClassifier

import pickle


In [2]:
df = pd.read_csv('../data/california_wildfires.csv')

In [3]:
# Create dummy variables for the county column
counties = pd.get_dummies(df.county, drop_first = True)
# Drop county column along with unnecessary columns (Unnamed columns, year, and acres burned)
df2 = df.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1', 'county', 'year', 'acres_burned'], axis = 1)

In [4]:
# Feature engineer month column from the date column
df2['month'] = pd.DatetimeIndex(df2['date']).month
# Drop the date column
df2.drop(columns = ['date'], axis = 1, inplace = True)

In [5]:
# Create dummy variables for the months
month = pd.get_dummies(df2.month, prefix = 'month_', drop_first = True)
# Drop the month column
df2.drop(columns = 'month', axis =1, inplace = True)

In [6]:
# Combine the original dataframe with the dummy variables
df2 = pd.concat([df2, counties, month], axis = 1)

In [7]:
# Split dataset into target variable and features
y = df2.fire_started
X = df2.drop(columns = ['fire_started'], axis = 1)

In [8]:
list(X.columns)

['Alfalfa & Hay_acres',
 'Alfalfa & Hay_percentage',
 'Almonds_acres',
 'Almonds_percentage',
 'Barren_acres',
 'Barren_percentage',
 'Corn_acres',
 'Corn_percentage',
 'Cotton_acres',
 'Cotton_percentage',
 'Deciduous Forest_acres',
 'Deciduous Forest_percentage',
 'Evergreen Forest_acres',
 'Evergreen Forest_percentage',
 'Fallow_acres',
 'Fallow_percentage',
 'Fruit Trees_acres',
 'Fruit Trees_percentage',
 'Grain Crops_acres',
 'Grain Crops_percentage',
 'Grapes_acres',
 'Grapes_percentage',
 'Grassland_acres',
 'Grassland_percentage',
 'High Intensity Developed_acres',
 'High Intensity Developed_percentage',
 'Low Intensity Developed_acres',
 'Low Intensity Developed_percentage',
 'Mixed Forest_acres',
 'Mixed Forest_percentage',
 'Other Ocean/Mexico_acres',
 'Other Ocean/Mexico_percentage',
 'Other Tree Crops_acres',
 'Other Tree Crops_percentage',
 'Other_acres',
 'Other_percentage',
 'Rice_acres',
 'Rice_percentage',
 'Shrubland_acres',
 'Shrubland_percentage',
 'Tomatoes_acres

In [9]:
df_features = X[list(X.columns)]

In [10]:
poly_2 = PolynomialFeatures(degree=2, include_bias=False)
poly2_data = poly_2.fit_transform(df_features)
poly2_columns = poly_2.get_feature_names(df_features.columns)
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns)
df_poly2.head()

Unnamed: 0,Alfalfa & Hay_acres,Alfalfa & Hay_percentage,Almonds_acres,Almonds_percentage,Barren_acres,Barren_percentage,Corn_acres,Corn_percentage,Cotton_acres,Cotton_percentage,Deciduous Forest_acres,Deciduous Forest_percentage,Evergreen Forest_acres,Evergreen Forest_percentage,Fallow_acres,Fallow_percentage,Fruit Trees_acres,Fruit Trees_percentage,Grain Crops_acres,Grain Crops_percentage,Grapes_acres,Grapes_percentage,Grassland_acres,Grassland_percentage,High Intensity Developed_acres,High Intensity Developed_percentage,Low Intensity Developed_acres,Low Intensity Developed_percentage,Mixed Forest_acres,Mixed Forest_percentage,Other Ocean/Mexico_acres,Other Ocean/Mexico_percentage,Other Tree Crops_acres,Other Tree Crops_percentage,Other_acres,Other_percentage,Rice_acres,Rice_percentage,Shrubland_acres,Shrubland_percentage,Tomatoes_acres,Tomatoes_percentage,Vegs & Fruits_acres,Vegs & Fruits_percentage,Walnuts_acres,Walnuts_percentage,Water_acres,Water_percentage,Wetlands_acres,Wetlands_percentage,Winter Wheat_acres,Winter Wheat_percentage,max_elevation,min_elevation,Avg Air Temp (F)_Weekly,Avg Rel Hum (%)_Weekly,Avg Wind Speed (mph)_Weekly,Dew Point (F)_Weekly,Max Air Temp (F)_Weekly,Max Rel Hum (%)_Weekly,Min Air Temp (F)_Weekly,Min Rel Hum (%)_Weekly,Precip (in)_Weekly,Avg Air Temp (F)_month,Avg Rel Hum (%)_month,Avg Wind Speed (mph)_month,Dew Point (F)_month,Max Air Temp (F)_month,Max Rel Hum (%)_month,Min Air Temp (F)_month,Min Rel Hum (%)_month,Precip (in)_month,Population,county_acres,pop_density,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,El Dorado,Fresno,Glenn,Humboldt,Imperial,Inyo,Kern,Kings,Lake,Lassen,Los Angeles,Madera,Marin,Mariposa,Mendocino,Merced,Modoc,Mono,Monterey,Napa,Nevada,Orange,Placer,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Diego,San Francisco,San Joaquin,San Luis Obispo,San Mateo,Santa Barbara,Santa Clara,Santa Cruz,Shasta,Sierra,Siskiyou,Solano,Sonoma,Stanislaus,Sutter,Tehama,Trinity,Tulare,Tuolumne,Ventura,Yolo,Yuba,month__2,month__3,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,Alfalfa & Hay_acres^2,Alfalfa & Hay_acres Alfalfa & Hay_percentage,Alfalfa & Hay_acres Almonds_acres,Alfalfa & Hay_acres Almonds_percentage,Alfalfa & Hay_acres Barren_acres,Alfalfa & Hay_acres Barren_percentage,Alfalfa & Hay_acres Corn_acres,...,Trinity Ventura,Trinity Yolo,Trinity Yuba,Trinity month__2,Trinity month__3,Trinity month__4,Trinity month__5,Trinity month__6,Trinity month__7,Trinity month__8,Trinity month__9,Trinity month__10,Trinity month__11,Trinity month__12,Tulare^2,Tulare Tuolumne,Tulare Ventura,Tulare Yolo,Tulare Yuba,Tulare month__2,Tulare month__3,Tulare month__4,Tulare month__5,Tulare month__6,Tulare month__7,Tulare month__8,Tulare month__9,Tulare month__10,Tulare month__11,Tulare month__12,Tuolumne^2,Tuolumne Ventura,Tuolumne Yolo,Tuolumne Yuba,Tuolumne month__2,Tuolumne month__3,Tuolumne month__4,Tuolumne month__5,Tuolumne month__6,Tuolumne month__7,Tuolumne month__8,Tuolumne month__9,Tuolumne month__10,Tuolumne month__11,Tuolumne month__12,Ventura^2,Ventura Yolo,Ventura Yuba,Ventura month__2,Ventura month__3,Ventura month__4,Ventura month__5,Ventura month__6,Ventura month__7,Ventura month__8,Ventura month__9,Ventura month__10,Ventura month__11,Ventura month__12,Yolo^2,Yolo Yuba,Yolo month__2,Yolo month__3,Yolo month__4,Yolo month__5,Yolo month__6,Yolo month__7,Yolo month__8,Yolo month__9,Yolo month__10,Yolo month__11,Yolo month__12,Yuba^2,Yuba month__2,Yuba month__3,Yuba month__4,Yuba month__5,Yuba month__6,Yuba month__7,Yuba month__8,Yuba month__9,Yuba month__10,Yuba month__11,Yuba month__12,month__2^2,month__2 month__3,month__2 month__4,month__2 month__5,month__2 month__6,month__2 month__7,month__2 month__8,month__2 month__9,month__2 month__10,month__2 month__11,month__2 month__12,month__3^2,month__3 month__4,month__3 month__5,month__3 month__6,month__3 month__7,month__3 month__8,month__3 month__9,month__3 month__10,month__3 month__11,month__3 month__12,month__4^2,month__4 month__5,month__4 month__6,month__4 month__7,month__4 month__8,month__4 month__9,month__4 month__10,month__4 month__11,month__4 month__12,month__5^2,month__5 month__6,month__5 month__7,month__5 month__8,month__5 month__9,month__5 month__10,month__5 month__11,month__5 month__12,month__6^2,month__6 month__7,month__6 month__8,month__6 month__9,month__6 month__10,month__6 month__11,month__6 month__12,month__7^2,month__7 month__8,month__7 month__9,month__7 month__10,month__7 month__11,month__7 month__12,month__8^2,month__8 month__9,month__8 month__10,month__8 month__11,month__8 month__12,month__9^2,month__9 month__10,month__9 month__11,month__9 month__12,month__10^2,month__10 month__11,month__10 month__12,month__11^2,month__11 month__12,month__12^2
0,1102.856805,0.300074,4.225505,0.00115,194.595625,0.052947,4.670295,0.001271,0.0,0.0,5.33748,0.001452,7838.756565,2.132827,1536.74945,0.41813,1.77916,0.000484,991.214515,0.269697,3722.44751,1.012831,153671.38668,41.812059,28431.42159,7.735834,39470.886995,10.739534,74885.956375,20.375531,0.0,0.0,8.673405,0.00236,0.0,0.0,0.88958,0.000242,30958.051185,8.423298,4.670295,0.001271,164.12751,0.044657,4.670295,0.001271,19403.51896,5.279454,4497.494085,1.223712,624.48516,0.169915,1242.0,-42.0,44.214286,82.785714,2.392857,39.321429,54.157143,96.5,35.771429,60.785714,0.095714,45.506897,78.189655,2.915517,38.932759,55.896552,95.448276,35.725862,55.810345,0.130172,1567167.0,528000.0,2.968119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1216293.0,330.938124,4660.127,1.267962,214611.1,58.392994,5150.667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,189.03575,0.04008,0.0,0.0,15482.472715,3.28265,0.0,0.0,0.0,0.0,194.595625,0.041259,195088.00753,41.363269,0.44479,9.4e-05,0.222395,4.7e-05,0.0,0.0,0.0,0.0,5644.82989,1.196837,121.42767,0.025746,3192.480225,0.676881,0.667185,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247783.390805,52.535935,0.0,0.0,0.0,0.0,0.0,0.0,2650.50361,0.561969,1297.45243,0.275091,0.0,0.0,3556.0,1442.0,29.657143,76.514286,3.228571,21.328571,34.428571,91.857143,22.857143,55.428571,0.0,30.789655,68.162069,4.968966,19.6,39.344828,86.0,22.758621,46.344828,0.0,1164.0,465280.0,0.002502,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35734.51,7.576562,0.0,0.0,2926741.0,620.538233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1326.80857,0.41429,16.679625,0.005208,1873.01069,0.58484,242.632945,0.075761,0.0,0.0,17190.911105,5.367789,114386.866695,35.71681,168.13062,0.052498,12.00933,0.00375,120.0933,0.037499,2587.34343,0.807887,112912.61024,35.25648,440.119705,0.137425,8263.975805,2.580391,1727.11957,0.539286,0.0,0.0,1.33437,0.000417,0.0,0.0,1.111975,0.000347,52457.865415,16.379744,0.0,0.0,1.77916,0.000556,122.094855,0.038124,5822.74589,1.818128,105.86002,0.033054,479.48362,0.149717,3121.0,43.0,34.114286,83.571429,3.157143,29.585714,40.071429,96.0,27.757143,66.571429,0.141429,34.289655,76.724138,3.606897,27.410345,41.2,93.172414,27.768966,58.310345,0.155517,37304.0,384640.0,0.096984,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1760421.0,549.68393,22130.67,6.910207,2485127.0,775.970174,321927.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3777.15668,0.374865,46196.556585,4.584787,1869.45237,0.185534,2023.34971,0.200808,9.118195,0.000905,33181.556395,3.293111,408193.790775,40.511281,56434.51041,5.600855,10563.31771,1.048359,2628.041715,0.260821,247.525635,0.024566,170758.216925,16.946936,4421.65739,0.438828,25520.048645,2.532743,165.684275,0.016443,0.0,0.0,790.39183,0.078443,0.0,0.0,105624.2813,10.482705,55372.129495,5.495419,94.29548,0.009358,469.475845,0.046593,42057.340845,4.17399,21360.372565,2.119915,11589.89303,1.150241,4257.08509,0.422495,2192.0,-1.0,40.985714,81.285714,3.142857,35.557143,50.114286,91.285714,32.171429,62.857143,0.117143,42.389655,77.448276,3.848276,35.586207,52.455172,88.965517,33.365517,58.862069,0.175517,222185.0,1065600.0,0.208507,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14266910.0,1415.922813,174491600.0,17317.459639,7061215.0,700.791755,7642509.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.802485,0.005011,28.46656,0.004485,218.39189,0.034409,0.88958,0.00014,0.0,0.0,34882.878145,5.495994,255438.00431,40.245698,28.688955,0.00452,12.231725,0.001927,2.22395,0.00035,522.18346,0.082273,207502.763615,32.69323,465.027945,0.073268,12257.745215,1.931277,3351.270255,0.528012,0.0,0.0,0.88958,0.00014,0.0,0.0,0.0,0.0,106091.088405,16.715249,0.0,0.0,0.667185,0.000105,425.66403,0.067066,13178.68291,2.076376,245.07929,0.038614,11.786935,0.001857,3522.0,787.0,41.928571,93.014286,5.657143,39.0,50.142857,100.0,35.571429,74.142857,0.0,42.931034,87.017241,6.268966,37.196552,52.827586,97.551724,34.344828,61.275862,0.0,45424.0,663040.0,0.068509,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1011.398,0.159351,905.3073,0.142636,6945.405,1.094288,28.29085,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
X_train, X_test, y_train, y_test = train_test_split(df_poly2, y, random_state = 0)

In [13]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

selector = SelectKBest(f_classif, k=100)

selector.fit(X_train, y_train)

  f = msb / msw


SelectKBest(k=100)

In [14]:
selected_columns = X_train.columns[selector.get_support()]
removed_columns = X_train.columns[~selector.get_support()]

In [15]:
list(selected_columns)

['Avg Air Temp (F)_Weekly',
 'Dew Point (F)_Weekly',
 'Max Air Temp (F)_Weekly',
 'Min Air Temp (F)_Weekly',
 'Avg Air Temp (F)_month',
 'Dew Point (F)_month',
 'Max Air Temp (F)_month',
 'Min Air Temp (F)_month',
 'month__7',
 'Grassland_acres Min Air Temp (F)_Weekly',
 'Grassland_acres month__7',
 'Low Intensity Developed_acres Avg Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Max Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Min Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Avg Air Temp (F)_month',
 'Low Intensity Developed_acres Max Air Temp (F)_month',
 'Low Intensity Developed_acres Min Air Temp (F)_month',
 'Low Intensity Developed_acres month__7',
 'Shrubland_percentage Avg Air Temp (F)_Weekly',
 'Shrubland_percentage Dew Point (F)_Weekly',
 'Shrubland_percentage Min Air Temp (F)_Weekly',
 'Shrubland_percentage Avg Air Temp (F)_month',
 'Shrubland_percentage Dew Point (F)_month',
 'Shrubland_percentage Min Air Temp (F)_month',
 'Shrubland_percentage mont

In [16]:
# Concatenate the X_train and y_train back into one training dataframe
training = pd.concat([X_train, y_train], axis=1)

In [17]:
# Split the target variable by class into two dataframes
no_fire = training[training.fire_started == 0] # 0 = No Wildfire
fire = training[training.fire_started == 1] # 1 = Wildfire

In [18]:
# Resample the minority class (wildfire)
fire_upsampled = resample(fire,
                          replace=True, # sample with replacement
                          n_samples=no_fire.shape[0], # match number in majority class
                          random_state=42) # reproducible result

In [19]:
# Create a new dataframe combining the target classes
resampled_df = pd.concat([no_fire, fire_upsampled])

In [20]:
# Double-check the different target classes
resampled_df.fire_started.value_counts()

1.0    12812
0.0    12812
Name: fire_started, dtype: int64

In [21]:
y_train = resampled_df.fire_started
X_train = resampled_df.drop(columns = ['fire_started'], axis = 1)

# Modeling

## Logistic Regression

### Base Model

In [22]:
# Instantiate a logistic regression model
logreg = LogisticRegression(random_state = 0) # random state for consistant results
# Train model on resampled training data
logreg.fit(X_train[selected_columns], y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=0)

In [23]:
# Use the model to predict the target variable on the training dataset
y_hat_train = logreg.predict(X_train[selected_columns])
# Use the model to predict the target variable on the test dataset
y_hat_test = logreg.predict(X_test[selected_columns])

In [24]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, y_hat_train), 'Test', f1_score(y_test, y_hat_test))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, y_hat_train), 'Test', accuracy_score(y_test, y_hat_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, y_hat_train), 'Test', recall_score(y_test, y_hat_test))

F1 Score: Training 0.6468699839486357 Test 0.1438188494492044
Accuracy Score: Training 0.5707149547299407 Test 0.3835646618197841
Recall Score: Training 0.7863721511083359 Test 0.8103448275862069


In [25]:
confusion_matrix(y_test, y_hat_test)

array([[1506, 2743],
       [  55,  235]])

### Model - Scaled Data

In [26]:
# Insantiate the StandardScaler()
ss = StandardScaler()
# Fit the feature training data
ss.fit(X_train[selected_columns])

# Transform both the training and test features
X_train_scaled = ss.transform(X_train[selected_columns])
X_test_scaled = ss.transform(X_test[selected_columns])

In [27]:
# Instantiate a new logistic regression model
logreg1 = LogisticRegression(solver = 'liblinear')
# Fit the data to the new scaled data
logreg1.fit(X_train_scaled, y_train)

LogisticRegression(solver='liblinear')

In [28]:
# Use model to predict target variable on the training dataset
y_hat_train1 = logreg1.predict(X_train_scaled)
# Use model to predict target variable on the test dataset
y_hat_test1 = logreg1.predict(X_test_scaled)

In [29]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, y_hat_train1), 'Test', f1_score(y_test, y_hat_test1))
# Print the accuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, y_hat_train1), 'Test', accuracy_score(y_test, y_hat_test1))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, y_hat_train1), 'Test', recall_score(y_test, y_hat_test1))

F1 Score: Training 0.7512939665268805 Test 0.25905598243688255
Accuracy Score: Training 0.7430924133624727 Test 0.7025776602775942
Recall Score: Training 0.7760693100218545 Test 0.8137931034482758


In [30]:
confusion_matrix(y_test, y_hat_test1)

array([[2953, 1296],
       [  54,  236]])

In [31]:
scores = cross_val_score(logreg1, X_train_scaled, y_train, cv=10)
print(scores.mean())

0.7423113191599787


## K Nearest Neighbor (KNN)

### Base Model

In [32]:
knn = KNeighborsClassifier()

In [33]:
# Fit knn model using the scaled data from the previous scaled logistic model
knn.fit(X_train_scaled, y_train)

KNeighborsClassifier()

In [34]:
# Predict target variable for both the train and test datasets.
knn_train = knn.predict(X_train_scaled)
knn_test = knn.predict(X_test_scaled)

In [35]:
# Print the f1 score metric on both the training and test predictions to check for overfitting
print('F1 Score: Training', f1_score(y_train, knn_train), 'Test', f1_score(y_test, knn_test))
# Print the sccuracy score metric on both the training and test predictions to check for overfitting
print('Accuracy Score: Training', accuracy_score(y_train, knn_train), 'Test', accuracy_score(y_test, knn_test))
# Print the recall score metric on both the training and test predictions to check for overfitting
print('Recall Score: Training', recall_score(y_train, knn_train), 'Test', recall_score(y_test, knn_test))

F1 Score: Training 0.9560123866731337 Test 0.25708502024291496
Accuracy Score: Training 0.9539884483296909 Test 0.8382903723287067
Recall Score: Training 1.0 Test 0.4379310344827586


In [36]:
confusion_matrix(y_test, knn_test)

array([[3678,  571],
       [ 163,  127]])

### Tuning Number of Nearest Neighbors

In [37]:
# Create a function to find the max recall score and return the score along with the k value
def max_value(l):
    max_val = max(l)
    max_idx = l.index(max_val)
    return max_idx, max_val

In [38]:
# Create an empty list for recall scores
k_scores = []
# Choose a range of k values to test
k_range = list(range(1, 21))
# Iterate through the different k values
for k in k_range:
    # Instantiate new knn model with k nearest neighbors
    knn = KNeighborsClassifier(n_neighbors = k)
    # Fit knn model on scaled training data
    knn.fit(X_train_scaled, y_train)
    # Use model to predict target variable on testing set
    y_pred = knn.predict(X_test_scaled)
    # Find the recall score
    recall = recall_score(y_test, y_pred)
    # Append recall score to list of recall scorees
    k_scores.append(recall)

# Find max recall score
idx, val = max_value(k_scores)
# Print max recall score and it corresponding k value
print(idx + 1, val)

19 0.6862068965517242


In [39]:
knn = KNeighborsClassifier(n_neighbors = 19)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)
knn_train1 = knn.predict(X_train_scaled)
print('F1 Score: Training', f1_score(y_train, knn_train1), 'Test', f1_score(y_test, y_pred))
print('Accuracy Score: Training', accuracy_score(y_train, knn_train1), 'Test', accuracy_score(y_test, y_pred))
print('Recall Score: Training', recall_score(y_train, knn_train1), 'Test', recall_score(y_test, y_pred))

F1 Score: Training 0.8805031446540881 Test 0.2275586049170955
Accuracy Score: Training 0.8650483921323759 Test 0.7023573474333553
Recall Score: Training 0.9943802684982829 Test 0.6862068965517242


In [40]:
confusion_matrix(y_test, y_pred)

array([[2989, 1260],
       [  91,  199]])

In [41]:
scores_knn = cross_val_score(knn, X_train_scaled, y_train, cv=10)
print(scores_knn.mean())

0.841828360902448


## Decision Tree

### Base Model

In [42]:
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train[selected_columns], y_train)

DecisionTreeClassifier(random_state=0)

In [43]:
dt_train = dt.predict(X_train[selected_columns])
dt_test = dt.predict(X_test[selected_columns])

print('F1 Score: Training', f1_score(y_train, dt_train), 'Test', f1_score(y_test, dt_test))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train), 'Test', accuracy_score(y_test, dt_test))
print('Recall Score: Training', recall_score(y_train, dt_train), 'Test', recall_score(y_test, dt_test))

F1 Score: Training 1.0 Test 0.1898066783831283
Accuracy Score: Training 1.0 Test 0.8984357788059044
Recall Score: Training 1.0 Test 0.18620689655172415


In [44]:
confusion_matrix(y_test, dt_test)

array([[4024,  225],
       [ 236,   54]])

### Training Parameters - GridSearchCV

In [45]:
parameters = {'max_depth' : range(10, 15, 1), 'max_features' : range(55, 75, 1), 'min_samples_split' : range(10, 20, 1)}

In [46]:
dtg = DecisionTreeClassifier(random_state = 0)
grid_model = GridSearchCV(dtg, parameters, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

grid_model.fit(X_train[selected_columns], y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   30.4s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   53.9s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done 10000 out of 1000

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(10, 15),
                         'max_features': range(55, 75),
                         'min_samples_split': range(10, 20)},
             scoring='f1', verbose=1)

In [47]:
print(grid_model.best_score_)
print(grid_model.best_params_)
print(grid_model.best_estimator_)

0.9182986641199241
{'max_depth': 14, 'max_features': 55, 'min_samples_split': 13}
DecisionTreeClassifier(max_depth=14, max_features=55, min_samples_split=13,
                       random_state=0)


In [48]:
dt_train2 = grid_model.best_estimator_.predict(X_train[selected_columns])
dt_test2 = grid_model.best_estimator_.predict(X_test[selected_columns])

print('F1 Score: Training', f1_score(y_train, dt_train2), 'Test', f1_score(y_test, dt_test2))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train2), 'Test', accuracy_score(y_test, dt_test2))
print('Recall Score: Training', recall_score(y_train, dt_train2), 'Test', recall_score(y_test, dt_test2))

F1 Score: Training 0.926453488372093 Test 0.2733118971061093
Accuracy Score: Training 0.9210115516703091 Test 0.8008371888081075
Recall Score: Training 0.9950046831095848 Test 0.5862068965517241


In [49]:
parameters2 = {'max_depth' : range(10, 12, 1), 'max_features' : range(55, 75, 1), 'min_samples_split' : range(10, 20, 1)}

In [50]:
dtg2 = DecisionTreeClassifier(random_state = 0)
dtg_model2 = GridSearchCV(dtg2, parameters2, cv = 10, scoring = 'f1', verbose = 1, n_jobs = -1)

dtg_model2.fit(X_train[selected_columns], y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  4.8min finished


GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(10, 12),
                         'max_features': range(55, 75),
                         'min_samples_split': range(10, 20)},
             scoring='f1', verbose=1)

In [51]:
print(dtg_model2.best_score_)
print(dtg_model2.best_params_)
print(dtg_model2.best_estimator_)

0.9012191226496503
{'max_depth': 11, 'max_features': 63, 'min_samples_split': 11}
DecisionTreeClassifier(max_depth=11, max_features=63, min_samples_split=11,
                       random_state=0)


In [52]:
dt_train3 = dtg_model2.best_estimator_.predict(X_train[selected_columns])
dt_test3 = dtg_model2.best_estimator_.predict(X_test[selected_columns])

print('F1 Score: Training', f1_score(y_train, dt_train3), 'Test', f1_score(y_test, dt_test3))
print('Accuracy Score: Training', accuracy_score(y_train, dt_train3), 'Test', accuracy_score(y_test, dt_test3))
print('Recall Score: Training', recall_score(y_train, dt_train3), 'Test', recall_score(y_test, dt_test3))

F1 Score: Training 0.9087257468708532 Test 0.27266338721011946
Accuracy Score: Training 0.9006790508897908 Test 0.7719762062128223
Recall Score: Training 0.9888385888229785 Test 0.6689655172413793


In [53]:
confusion_matrix(y_test, dt_test3)

array([[3310,  939],
       [  96,  194]])