In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 200)
%matplotlib inline


In [2]:
# Load the raw wildfire dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/california_wildfires.csv')

In [3]:
# One-hot encode the county column; drop_first omits the first category
# so the remaining indicator columns are not perfectly collinear.
counties = pd.get_dummies(df['county'], drop_first=True)

In [4]:
# Remove columns that are not used as model inputs: the leftover 'Unnamed' index
# columns, the raw county label (encoded separately as dummies), year, and acres burned.
df2 = df.drop(
    columns=[
        'Unnamed: 0',
        'Unnamed: 0.1',
        'county',
        'year',
        'acres_burned',
    ]
)

In [5]:
# Derive the calendar month from the date column, keeping df2's row index so the
# dummy columns line up with df2 when they are concatenated later.
month_number = pd.Series(
    np.asarray(pd.DatetimeIndex(df2['date']).month),
    index=df2.index,
    name='month',
)

# One-hot encode the month. The prefix 'month_' combined with get_dummies' default
# '_' separator yields column names such as 'month__7' (double underscore); the
# hand-picked interaction names later in the notebook use that spelling.
month = pd.get_dummies(month_number, prefix='month_', drop_first=True)

# The raw date is no longer needed once the month has been extracted.
df2 = df2.drop(columns=['date'])

In [6]:
# Append the county and month indicator columns to the numeric features.
df2 = pd.concat([df2, counties, month], axis='columns')

# Separate the label (fire_started) from the predictors.
y = df2['fire_started']
X = df2.drop(columns='fire_started')

In [7]:
# Show every predictor name as a plain Python list.
X.columns.tolist()

['Alfalfa & Hay_acres',
 'Alfalfa & Hay_percentage',
 'Almonds_acres',
 'Almonds_percentage',
 'Barren_acres',
 'Barren_percentage',
 'Corn_acres',
 'Corn_percentage',
 'Cotton_acres',
 'Cotton_percentage',
 'Deciduous Forest_acres',
 'Deciduous Forest_percentage',
 'Evergreen Forest_acres',
 'Evergreen Forest_percentage',
 'Fallow_acres',
 'Fallow_percentage',
 'Fruit Trees_acres',
 'Fruit Trees_percentage',
 'Grain Crops_acres',
 'Grain Crops_percentage',
 'Grapes_acres',
 'Grapes_percentage',
 'Grassland_acres',
 'Grassland_percentage',
 'High Intensity Developed_acres',
 'High Intensity Developed_percentage',
 'Low Intensity Developed_acres',
 'Low Intensity Developed_percentage',
 'Mixed Forest_acres',
 'Mixed Forest_percentage',
 'Other Ocean/Mexico_acres',
 'Other Ocean/Mexico_percentage',
 'Other Tree Crops_acres',
 'Other Tree Crops_percentage',
 'Other_acres',
 'Other_percentage',
 'Rice_acres',
 'Rice_percentage',
 'Shrubland_acres',
 'Shrubland_percentage',
 'Tomatoes_acres

In [8]:
# Take a copy of all predictor columns as the input to the polynomial expansion.
df_features = X.loc[:, X.columns.tolist()]

In [9]:
# Expand the predictors with every pairwise interaction term (degree 2, no squared
# terms, no constant column). The original features are kept as the leading columns.
poly_2 = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly2_data = poly_2.fit_transform(df_features)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# when running on a scikit-learn version that predates the new one.
if hasattr(poly_2, 'get_feature_names_out'):
    poly2_columns = poly_2.get_feature_names_out(df_features.columns)
else:
    poly2_columns = poly_2.get_feature_names(df_features.columns)

# Reuse the source row index: fit_transform returns a bare array, and later cells
# concatenate feature rows with y along the index, so the labels must stay aligned.
df_poly2 = pd.DataFrame(poly2_data, columns=poly2_columns, index=df_features.index)
df_poly2.head()

Unnamed: 0,Alfalfa & Hay_acres,Alfalfa & Hay_percentage,Almonds_acres,Almonds_percentage,Barren_acres,Barren_percentage,Corn_acres,Corn_percentage,Cotton_acres,Cotton_percentage,Deciduous Forest_acres,Deciduous Forest_percentage,Evergreen Forest_acres,Evergreen Forest_percentage,Fallow_acres,Fallow_percentage,Fruit Trees_acres,Fruit Trees_percentage,Grain Crops_acres,Grain Crops_percentage,Grapes_acres,Grapes_percentage,Grassland_acres,Grassland_percentage,High Intensity Developed_acres,High Intensity Developed_percentage,Low Intensity Developed_acres,Low Intensity Developed_percentage,Mixed Forest_acres,Mixed Forest_percentage,Other Ocean/Mexico_acres,Other Ocean/Mexico_percentage,Other Tree Crops_acres,Other Tree Crops_percentage,Other_acres,Other_percentage,Rice_acres,Rice_percentage,Shrubland_acres,Shrubland_percentage,Tomatoes_acres,Tomatoes_percentage,Vegs & Fruits_acres,Vegs & Fruits_percentage,Walnuts_acres,Walnuts_percentage,Water_acres,Water_percentage,Wetlands_acres,Wetlands_percentage,Winter Wheat_acres,Winter Wheat_percentage,max_elevation,min_elevation,Avg Air Temp (F)_Weekly,Avg Rel Hum (%)_Weekly,Avg Wind Speed (mph)_Weekly,Dew Point (F)_Weekly,Max Air Temp (F)_Weekly,Max Rel Hum (%)_Weekly,Min Air Temp (F)_Weekly,Min Rel Hum (%)_Weekly,Precip (in)_Weekly,Avg Air Temp (F)_month,Avg Rel Hum (%)_month,Avg Wind Speed (mph)_month,Dew Point (F)_month,Max Air Temp (F)_month,Max Rel Hum (%)_month,Min Air Temp (F)_month,Min Rel Hum (%)_month,Precip (in)_month,Population,county_acres,pop_density,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,El Dorado,Fresno,Glenn,Humboldt,Imperial,Inyo,Kern,Kings,Lake,Lassen,Los Angeles,Madera,Marin,Mariposa,Mendocino,Merced,Modoc,Mono,Monterey,Napa,Nevada,Orange,Placer,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Diego,San Francisco,San Joaquin,San Luis Obispo,San Mateo,Santa Barbara,Santa Clara,Santa 
Cruz,Shasta,Sierra,Siskiyou,Solano,Sonoma,Stanislaus,Sutter,Tehama,Trinity,Tulare,Tuolumne,Ventura,Yolo,Yuba,month__2,month__3,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,Alfalfa & Hay_acres Alfalfa & Hay_percentage,Alfalfa & Hay_acres Almonds_acres,Alfalfa & Hay_acres Almonds_percentage,Alfalfa & Hay_acres Barren_acres,Alfalfa & Hay_acres Barren_percentage,Alfalfa & Hay_acres Corn_acres,Alfalfa & Hay_acres Corn_percentage,...,Tehama Ventura,Tehama Yolo,Tehama Yuba,Tehama month__2,Tehama month__3,Tehama month__4,Tehama month__5,Tehama month__6,Tehama month__7,Tehama month__8,Tehama month__9,Tehama month__10,Tehama month__11,Tehama month__12,Trinity Tulare,Trinity Tuolumne,Trinity Ventura,Trinity Yolo,Trinity Yuba,Trinity month__2,Trinity month__3,Trinity month__4,Trinity month__5,Trinity month__6,Trinity month__7,Trinity month__8,Trinity month__9,Trinity month__10,Trinity month__11,Trinity month__12,Tulare Tuolumne,Tulare Ventura,Tulare Yolo,Tulare Yuba,Tulare month__2,Tulare month__3,Tulare month__4,Tulare month__5,Tulare month__6,Tulare month__7,Tulare month__8,Tulare month__9,Tulare month__10,Tulare month__11,Tulare month__12,Tuolumne Ventura,Tuolumne Yolo,Tuolumne Yuba,Tuolumne month__2,Tuolumne month__3,Tuolumne month__4,Tuolumne month__5,Tuolumne month__6,Tuolumne month__7,Tuolumne month__8,Tuolumne month__9,Tuolumne month__10,Tuolumne month__11,Tuolumne month__12,Ventura Yolo,Ventura Yuba,Ventura month__2,Ventura month__3,Ventura month__4,Ventura month__5,Ventura month__6,Ventura month__7,Ventura month__8,Ventura month__9,Ventura month__10,Ventura month__11,Ventura month__12,Yolo Yuba,Yolo month__2,Yolo month__3,Yolo month__4,Yolo month__5,Yolo month__6,Yolo month__7,Yolo month__8,Yolo month__9,Yolo month__10,Yolo month__11,Yolo month__12,Yuba month__2,Yuba month__3,Yuba month__4,Yuba month__5,Yuba month__6,Yuba month__7,Yuba month__8,Yuba month__9,Yuba month__10,Yuba month__11,Yuba month__12,month__2 month__3,month__2 
month__4,month__2 month__5,month__2 month__6,month__2 month__7,month__2 month__8,month__2 month__9,month__2 month__10,month__2 month__11,month__2 month__12,month__3 month__4,month__3 month__5,month__3 month__6,month__3 month__7,month__3 month__8,month__3 month__9,month__3 month__10,month__3 month__11,month__3 month__12,month__4 month__5,month__4 month__6,month__4 month__7,month__4 month__8,month__4 month__9,month__4 month__10,month__4 month__11,month__4 month__12,month__5 month__6,month__5 month__7,month__5 month__8,month__5 month__9,month__5 month__10,month__5 month__11,month__5 month__12,month__6 month__7,month__6 month__8,month__6 month__9,month__6 month__10,month__6 month__11,month__6 month__12,month__7 month__8,month__7 month__9,month__7 month__10,month__7 month__11,month__7 month__12,month__8 month__9,month__8 month__10,month__8 month__11,month__8 month__12,month__9 month__10,month__9 month__11,month__9 month__12,month__10 month__11,month__10 month__12,month__11 month__12
0,1102.856805,0.300074,4.225505,0.00115,194.595625,0.052947,4.670295,0.001271,0.0,0.0,5.33748,0.001452,7838.756565,2.132827,1536.74945,0.41813,1.77916,0.000484,991.214515,0.269697,3722.44751,1.012831,153671.38668,41.812059,28431.42159,7.735834,39470.886995,10.739534,74885.956375,20.375531,0.0,0.0,8.673405,0.00236,0.0,0.0,0.88958,0.000242,30958.051185,8.423298,4.670295,0.001271,164.12751,0.044657,4.670295,0.001271,19403.51896,5.279454,4497.494085,1.223712,624.48516,0.169915,1242.0,-42.0,44.214286,82.785714,2.392857,39.321429,54.157143,96.5,35.771429,60.785714,0.095714,45.506897,78.189655,2.915517,38.932759,55.896552,95.448276,35.725862,55.810345,0.130172,1567167.0,528000.0,2.968119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,330.938124,4660.127,1.267962,214611.1,58.392994,5150.667,1.401432,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,189.03575,0.04008,0.0,0.0,15482.472715,3.28265,0.0,0.0,0.0,0.0,194.595625,0.041259,195088.00753,41.363269,0.44479,9.4e-05,0.222395,4.7e-05,0.0,0.0,0.0,0.0,5644.82989,1.196837,121.42767,0.025746,3192.480225,0.676881,0.667185,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247783.390805,52.535935,0.0,0.0,0.0,0.0,0.0,0.0,2650.50361,0.561969,1297.45243,0.275091,0.0,0.0,3556.0,1442.0,29.657143,76.514286,3.228571,21.328571,34.428571,91.857143,22.857143,55.428571,0.0,30.789655,68.162069,4.968966,19.6,39.344828,86.0,22.758621,46.344828,0.0,1164.0,465280.0,0.002502,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.576562,0.0,0.0,2926741.0,620.538233,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1326.80857,0.41429,16.679625,0.005208,1873.01069,0.58484,242.632945,0.075761,0.0,0.0,17190.911105,5.367789,114386.866695,35.71681,168.13062,0.052498,12.00933,0.00375,120.0933,0.037499,2587.34343,0.807887,112912.61024,35.25648,440.119705,0.137425,8263.975805,2.580391,1727.11957,0.539286,0.0,0.0,1.33437,0.000417,0.0,0.0,1.111975,0.000347,52457.865415,16.379744,0.0,0.0,1.77916,0.000556,122.094855,0.038124,5822.74589,1.818128,105.86002,0.033054,479.48362,0.149717,3121.0,43.0,34.114286,83.571429,3.157143,29.585714,40.071429,96.0,27.757143,66.571429,0.141429,34.289655,76.724138,3.606897,27.410345,41.2,93.172414,27.768966,58.310345,0.155517,37304.0,384640.0,0.096984,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,549.68393,22130.67,6.910207,2485127.0,775.970174,321927.5,100.520477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3777.15668,0.374865,46196.556585,4.584787,1869.45237,0.185534,2023.34971,0.200808,9.118195,0.000905,33181.556395,3.293111,408193.790775,40.511281,56434.51041,5.600855,10563.31771,1.048359,2628.041715,0.260821,247.525635,0.024566,170758.216925,16.946936,4421.65739,0.438828,25520.048645,2.532743,165.684275,0.016443,0.0,0.0,790.39183,0.078443,0.0,0.0,105624.2813,10.482705,55372.129495,5.495419,94.29548,0.009358,469.475845,0.046593,42057.340845,4.17399,21360.372565,2.119915,11589.89303,1.150241,4257.08509,0.422495,2192.0,-1.0,40.985714,81.285714,3.142857,35.557143,50.114286,91.285714,32.171429,62.857143,0.117143,42.389655,77.448276,3.848276,35.586207,52.455172,88.965517,33.365517,58.862069,0.175517,222185.0,1065600.0,0.208507,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1415.922813,174491600.0,17317.459639,7061215.0,700.791755,7642509.0,758.48244,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,31.802485,0.005011,28.46656,0.004485,218.39189,0.034409,0.88958,0.00014,0.0,0.0,34882.878145,5.495994,255438.00431,40.245698,28.688955,0.00452,12.231725,0.001927,2.22395,0.00035,522.18346,0.082273,207502.763615,32.69323,465.027945,0.073268,12257.745215,1.931277,3351.270255,0.528012,0.0,0.0,0.88958,0.00014,0.0,0.0,0.0,0.0,106091.088405,16.715249,0.0,0.0,0.667185,0.000105,425.66403,0.067066,13178.68291,2.076376,245.07929,0.038614,11.786935,0.001857,3522.0,787.0,41.928571,93.014286,5.657143,39.0,50.142857,100.0,35.571429,74.142857,0.0,42.931034,87.017241,6.268966,37.196552,52.827586,97.551724,34.344828,61.275862,0.0,45424.0,663040.0,0.068509,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159351,905.3073,0.142636,6945.405,1.094288,28.29085,0.004457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Hold out a test set from the full interaction-feature matrix (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    df_poly2,
    y,
    random_state=0,
)

In [11]:
# Univariate screening: score every column with the ANOVA F-test against the target
# and keep the 100 highest-scoring ones. Fitted on the training split only.
# SelectKBest and f_classif are imported in the notebook's top import cell.
#
# NOTE(review): the recorded output for this cell shows a divide warning from the
# F-statistic (`f = msb / msw`). Presumably some interaction columns are constant
# (e.g. the product of two mutually exclusive dummy columns is always 0) -- confirm,
# and consider dropping constant columns before scoring.
selector = SelectKBest(score_func=f_classif, k=100)

selector.fit(X_train, y_train)

  f = msb / msw


SelectKBest(k=100)

In [12]:
# Boolean mask over X_train's columns: True where SelectKBest kept the feature.
support_mask = selector.get_support()
selected_columns, removed_columns = (
    X_train.columns[support_mask],
    X_train.columns[~support_mask],
)

In [13]:
# Show the names of the features kept by the selector.
selected_columns.tolist()

['Avg Air Temp (F)_Weekly',
 'Dew Point (F)_Weekly',
 'Max Air Temp (F)_Weekly',
 'Min Air Temp (F)_Weekly',
 'Avg Air Temp (F)_month',
 'Dew Point (F)_month',
 'Max Air Temp (F)_month',
 'Min Air Temp (F)_month',
 'month__7',
 'Grassland_acres Avg Air Temp (F)_Weekly',
 'Grassland_acres Max Air Temp (F)_Weekly',
 'Grassland_acres Min Air Temp (F)_Weekly',
 'Grassland_acres Avg Air Temp (F)_month',
 'Grassland_acres Max Air Temp (F)_month',
 'Grassland_acres Min Air Temp (F)_month',
 'Grassland_acres month__7',
 'Low Intensity Developed_acres Avg Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Dew Point (F)_Weekly',
 'Low Intensity Developed_acres Max Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Min Air Temp (F)_Weekly',
 'Low Intensity Developed_acres Avg Air Temp (F)_month',
 'Low Intensity Developed_acres Dew Point (F)_month',
 'Low Intensity Developed_acres Max Air Temp (F)_month',
 'Low Intensity Developed_acres Min Air Temp (F)_month',
 'Low Intensity Developed_acr

In [14]:
# Hand-picked interaction columns to add on top of the base predictors.
# Each entry is a df_poly2 column name: two base feature names joined by a space.
# Order matters -- it determines the column order of the frame built from this list.
# NOTE(review): presumably picked from the SelectKBest results -- confirm.
chosen_features = [
    # Forest / fallow land cover
    'Deciduous Forest_acres month__7',
    'Deciduous Forest_acres month__8',
    'Evergreen Forest_acres Population',
    'Evergreen Forest_acres month__7',
    'Evergreen Forest_acres month__8',
    'Evergreen Forest_percentage month__7',
    'Fallow_acres month__7',
    # Grassland
    'Grassland_acres Low Intensity Developed_acres',
    'Grassland_acres Shrubland_acres',
    'Grassland_acres Shrubland_percentage',
    'Grassland_acres max_elevation',
    'Grassland_acres Avg Air Temp (F)_Weekly',
    'Grassland_acres Avg Wind Speed (mph)_Weekly',
    'Grassland_acres Dew Point (F)_Weekly',
    'Grassland_acres Max Air Temp (F)_Weekly',
    'Grassland_acres Min Air Temp (F)_Weekly',
    'Grassland_acres Avg Air Temp (F)_month',
    'Grassland_acres Avg Wind Speed (mph)_month',
    'Grassland_acres Dew Point (F)_month',
    'Grassland_acres Max Air Temp (F)_month',
    'Grassland_acres Min Air Temp (F)_month',
    'Grassland_acres month__6',
    'Grassland_acres month__7',
    'Grassland_percentage Shrubland_acres',
    'Grassland_percentage month__7',
    # Low-intensity developed land
    'Low Intensity Developed_acres Shrubland_percentage',
    'Low Intensity Developed_acres max_elevation',
    'Low Intensity Developed_acres Avg Air Temp (F)_Weekly',
    'Low Intensity Developed_acres Avg Wind Speed (mph)_Weekly',
    'Low Intensity Developed_acres Dew Point (F)_Weekly',
    'Low Intensity Developed_acres Max Air Temp (F)_Weekly',
    'Low Intensity Developed_acres Max Rel Hum (%)_Weekly',
    'Low Intensity Developed_acres Min Air Temp (F)_Weekly',
    'Low Intensity Developed_acres Avg Air Temp (F)_month',
    'Low Intensity Developed_acres Avg Wind Speed (mph)_month',
    'Low Intensity Developed_acres Dew Point (F)_month',
    'Low Intensity Developed_acres Max Air Temp (F)_month',
    'Low Intensity Developed_acres Max Rel Hum (%)_month',
    'Low Intensity Developed_acres Min Air Temp (F)_month',
    'Low Intensity Developed_acres county_acres',
    'Low Intensity Developed_acres month__6',
    'Low Intensity Developed_acres month__7',
    'Low Intensity Developed_acres month__8',
    'Low Intensity Developed_percentage Shrubland_acres',
    'Low Intensity Developed_percentage county_acres',
    # Mixed forest
    'Mixed Forest_acres month__7',
    # Shrubland
    'Shrubland_acres Avg Air Temp (F)_Weekly',
    'Shrubland_acres Dew Point (F)_Weekly',
    'Shrubland_acres Min Air Temp (F)_Weekly',
    'Shrubland_acres Avg Air Temp (F)_month',
    'Shrubland_acres Dew Point (F)_month',
    'Shrubland_acres Min Air Temp (F)_month',
    'Shrubland_acres month__7',
    'Shrubland_percentage Avg Air Temp (F)_Weekly',
    'Shrubland_percentage Dew Point (F)_Weekly',
    'Shrubland_percentage Max Air Temp (F)_Weekly',
    'Shrubland_percentage Min Air Temp (F)_Weekly',
    'Shrubland_percentage Avg Air Temp (F)_month',
    'Shrubland_percentage Dew Point (F)_month',
    'Shrubland_percentage Max Air Temp (F)_month',
    'Shrubland_percentage Min Air Temp (F)_month',
    'Shrubland_percentage month__7',
    'Shrubland_percentage month__8',
]

In [15]:
# Final feature set: every base predictor followed by the hand-picked interactions.
total_features = [*X.columns, *chosen_features]
total_features

['Alfalfa & Hay_acres',
 'Alfalfa & Hay_percentage',
 'Almonds_acres',
 'Almonds_percentage',
 'Barren_acres',
 'Barren_percentage',
 'Corn_acres',
 'Corn_percentage',
 'Cotton_acres',
 'Cotton_percentage',
 'Deciduous Forest_acres',
 'Deciduous Forest_percentage',
 'Evergreen Forest_acres',
 'Evergreen Forest_percentage',
 'Fallow_acres',
 'Fallow_percentage',
 'Fruit Trees_acres',
 'Fruit Trees_percentage',
 'Grain Crops_acres',
 'Grain Crops_percentage',
 'Grapes_acres',
 'Grapes_percentage',
 'Grassland_acres',
 'Grassland_percentage',
 'High Intensity Developed_acres',
 'High Intensity Developed_percentage',
 'Low Intensity Developed_acres',
 'Low Intensity Developed_percentage',
 'Mixed Forest_acres',
 'Mixed Forest_percentage',
 'Other Ocean/Mexico_acres',
 'Other Ocean/Mexico_percentage',
 'Other Tree Crops_acres',
 'Other Tree Crops_percentage',
 'Other_acres',
 'Other_percentage',
 'Rice_acres',
 'Rice_percentage',
 'Shrubland_acres',
 'Shrubland_percentage',
 'Tomatoes_acres

In [16]:
# Restrict the full interaction matrix to the final feature set.
poly2_df = df_poly2.loc[:, total_features]
poly2_df.head()

Unnamed: 0,Alfalfa & Hay_acres,Alfalfa & Hay_percentage,Almonds_acres,Almonds_percentage,Barren_acres,Barren_percentage,Corn_acres,Corn_percentage,Cotton_acres,Cotton_percentage,Deciduous Forest_acres,Deciduous Forest_percentage,Evergreen Forest_acres,Evergreen Forest_percentage,Fallow_acres,Fallow_percentage,Fruit Trees_acres,Fruit Trees_percentage,Grain Crops_acres,Grain Crops_percentage,Grapes_acres,Grapes_percentage,Grassland_acres,Grassland_percentage,High Intensity Developed_acres,High Intensity Developed_percentage,Low Intensity Developed_acres,Low Intensity Developed_percentage,Mixed Forest_acres,Mixed Forest_percentage,Other Ocean/Mexico_acres,Other Ocean/Mexico_percentage,Other Tree Crops_acres,Other Tree Crops_percentage,Other_acres,Other_percentage,Rice_acres,Rice_percentage,Shrubland_acres,Shrubland_percentage,Tomatoes_acres,Tomatoes_percentage,Vegs & Fruits_acres,Vegs & Fruits_percentage,Walnuts_acres,Walnuts_percentage,Water_acres,Water_percentage,Wetlands_acres,Wetlands_percentage,Winter Wheat_acres,Winter Wheat_percentage,max_elevation,min_elevation,Avg Air Temp (F)_Weekly,Avg Rel Hum (%)_Weekly,Avg Wind Speed (mph)_Weekly,Dew Point (F)_Weekly,Max Air Temp (F)_Weekly,Max Rel Hum (%)_Weekly,Min Air Temp (F)_Weekly,Min Rel Hum (%)_Weekly,Precip (in)_Weekly,Avg Air Temp (F)_month,Avg Rel Hum (%)_month,Avg Wind Speed (mph)_month,Dew Point (F)_month,Max Air Temp (F)_month,Max Rel Hum (%)_month,Min Air Temp (F)_month,Min Rel Hum (%)_month,Precip (in)_month,Population,county_acres,pop_density,Alpine,Amador,Butte,Calaveras,Colusa,Contra Costa,Del Norte,El Dorado,Fresno,Glenn,Humboldt,Imperial,Inyo,Kern,Kings,Lake,Lassen,Los Angeles,Madera,Marin,Mariposa,Mendocino,Merced,Modoc,Mono,Monterey,Napa,Nevada,Orange,Placer,Plumas,Riverside,Sacramento,San Benito,San Bernardino,San Diego,San Francisco,San Joaquin,San Luis Obispo,San Mateo,Santa Barbara,Santa Clara,Santa 
Cruz,Shasta,Sierra,Siskiyou,Solano,Sonoma,Stanislaus,Sutter,Tehama,Trinity,Tulare,Tuolumne,Ventura,Yolo,Yuba,month__2,month__3,month__4,month__5,month__6,month__7,month__8,month__9,month__10,month__11,month__12,Deciduous Forest_acres month__7,Deciduous Forest_acres month__8,Evergreen Forest_acres Population,Evergreen Forest_acres month__7,Evergreen Forest_acres month__8,Evergreen Forest_percentage month__7,Fallow_acres month__7,Grassland_acres Low Intensity Developed_acres,Grassland_acres Shrubland_acres,Grassland_acres Shrubland_percentage,Grassland_acres max_elevation,Grassland_acres Avg Air Temp (F)_Weekly,Grassland_acres Avg Wind Speed (mph)_Weekly,Grassland_acres Dew Point (F)_Weekly,Grassland_acres Max Air Temp (F)_Weekly,Grassland_acres Min Air Temp (F)_Weekly,Grassland_acres Avg Air Temp (F)_month,Grassland_acres Avg Wind Speed (mph)_month,Grassland_acres Dew Point (F)_month,Grassland_acres Max Air Temp (F)_month,Grassland_acres Min Air Temp (F)_month,Grassland_acres month__6,Grassland_acres month__7,Grassland_percentage Shrubland_acres,Grassland_percentage month__7,Low Intensity Developed_acres Shrubland_percentage,Low Intensity Developed_acres max_elevation,Low Intensity Developed_acres Avg Air Temp (F)_Weekly,Low Intensity Developed_acres Avg Wind Speed (mph)_Weekly,Low Intensity Developed_acres Dew Point (F)_Weekly,Low Intensity Developed_acres Max Air Temp (F)_Weekly,Low Intensity Developed_acres Max Rel Hum (%)_Weekly,Low Intensity Developed_acres Min Air Temp (F)_Weekly,Low Intensity Developed_acres Avg Air Temp (F)_month,Low Intensity Developed_acres Avg Wind Speed (mph)_month,Low Intensity Developed_acres Dew Point (F)_month,Low Intensity Developed_acres Max Air Temp (F)_month,Low Intensity Developed_acres Max Rel Hum (%)_month,Low Intensity Developed_acres Min Air Temp (F)_month,Low Intensity Developed_acres county_acres,Low Intensity Developed_acres month__6,Low Intensity Developed_acres month__7,Low Intensity Developed_acres month__8,Low 
Intensity Developed_percentage Shrubland_acres,Low Intensity Developed_percentage county_acres,Mixed Forest_acres month__7,Shrubland_acres Avg Air Temp (F)_Weekly,Shrubland_acres Dew Point (F)_Weekly,Shrubland_acres Min Air Temp (F)_Weekly,Shrubland_acres Avg Air Temp (F)_month,Shrubland_acres Dew Point (F)_month,Shrubland_acres Min Air Temp (F)_month,Shrubland_acres month__7,Shrubland_percentage Avg Air Temp (F)_Weekly,Shrubland_percentage Dew Point (F)_Weekly,Shrubland_percentage Max Air Temp (F)_Weekly,Shrubland_percentage Min Air Temp (F)_Weekly,Shrubland_percentage Avg Air Temp (F)_month,Shrubland_percentage Dew Point (F)_month,Shrubland_percentage Max Air Temp (F)_month,Shrubland_percentage Min Air Temp (F)_month,Shrubland_percentage month__7,Shrubland_percentage month__8
0,1102.856805,0.300074,4.225505,0.00115,194.595625,0.052947,4.670295,0.001271,0.0,0.0,5.33748,0.001452,7838.756565,2.132827,1536.74945,0.41813,1.77916,0.000484,991.214515,0.269697,3722.44751,1.012831,153671.38668,41.812059,28431.42159,7.735834,39470.886995,10.739534,74885.956375,20.375531,0.0,0.0,8.673405,0.00236,0.0,0.0,0.88958,0.000242,30958.051185,8.423298,4.670295,0.001271,164.12751,0.044657,4.670295,0.001271,19403.51896,5.279454,4497.494085,1.223712,624.48516,0.169915,1242.0,-42.0,44.214286,82.785714,2.392857,39.321429,54.157143,96.5,35.771429,60.785714,0.095714,45.506897,78.189655,2.915517,38.932759,55.896552,95.448276,35.725862,55.810345,0.130172,1567167.0,528000.0,2.968119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12284640000.0,0.0,0.0,0.0,0.0,6065546000.0,4757367000.0,1294420.0,190859900.0,6794471.0,367713.7,6042578.0,8322403.0,5497045.0,6993108.0,448031.6,5982851.0,8589701.0,5490043.0,0.0,0.0,1294420.0,0.0,332475.039702,49022840.0,1745177.0,94448.193881,1552052.0,2137630.0,3808941.0,1411930.0,1796198.0,115078.051566,1536711.0,2206286.0,3767428.0,1410131.0,20840630000.0,0.0,0.0,0.0,332475.039702,5670474.0,0.0,1368788.0,1217315.0,1107414.0,1408805.0,1205282.0,1106003.0,0.0,372.4301,331.216107,456.181748,301.313399,383.318146,327.942224,470.833307,300.929579,0.0,0.0
1,189.03575,0.04008,0.0,0.0,15482.472715,3.28265,0.0,0.0,0.0,0.0,194.595625,0.041259,195088.00753,41.363269,0.44479,9.4e-05,0.222395,4.7e-05,0.0,0.0,0.0,0.0,5644.82989,1.196837,121.42767,0.025746,3192.480225,0.676881,0.667185,0.000141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247783.390805,52.535935,0.0,0.0,0.0,0.0,0.0,0.0,2650.50361,0.561969,1297.45243,0.275091,0.0,0.0,3556.0,1442.0,29.657143,76.514286,3.228571,21.328571,34.428571,91.857143,22.857143,55.428571,0.0,30.789655,68.162069,4.968966,19.6,39.344828,86.0,22.758621,46.344828,0.0,1164.0,465280.0,0.002502,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,227082400.0,0.0,0.0,0.0,0.0,18021010.0,1398695000.0,296556.4,20073020.0,167409.5,18224.74,120396.2,194343.4,129024.7,173802.4,28048.97,110638.7,222094.9,128468.5,0.0,0.0,296556.4,0.0,167719.934542,11352460.0,94679.84,10307.150441,68091.04,109912.5,293252.1,72970.98,98295.37,15863.324152,62572.61,125607.6,274553.3,72656.45,1485397000.0,0.0,0.0,0.0,167719.934542,314939.3,0.0,7348547.0,5284866.0,5663620.0,7629165.0,4856554.0,5639208.0,0.0,1558.065738,1120.516449,1808.737201,1200.821378,1617.563332,1029.704332,2067.017316,1195.645424,0.0,0.0
2,1326.80857,0.41429,16.679625,0.005208,1873.01069,0.58484,242.632945,0.075761,0.0,0.0,17190.911105,5.367789,114386.866695,35.71681,168.13062,0.052498,12.00933,0.00375,120.0933,0.037499,2587.34343,0.807887,112912.61024,35.25648,440.119705,0.137425,8263.975805,2.580391,1727.11957,0.539286,0.0,0.0,1.33437,0.000417,0.0,0.0,1.111975,0.000347,52457.865415,16.379744,0.0,0.0,1.77916,0.000556,122.094855,0.038124,5822.74589,1.818128,105.86002,0.033054,479.48362,0.149717,3121.0,43.0,34.114286,83.571429,3.157143,29.585714,40.071429,96.0,27.757143,66.571429,0.141429,34.289655,76.724138,3.606897,27.410345,41.2,93.172414,27.768966,58.310345,0.155517,37304.0,384640.0,0.096984,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4267088000.0,0.0,0.0,0.0,0.0,933107100.0,5923155000.0,1849480.0,352400300.0,3851933.0,356481.2,3340600.0,4524570.0,3134131.0,3871734.0,407264.1,3094974.0,4652000.0,3135466.0,0.0,0.0,1849480.0,0.0,135361.811055,25791870.0,281919.6,26090.552184,244495.6,331149.3,793341.7,229384.4,283368.9,29807.305835,226518.4,340475.8,769974.6,229482.1,3178656000.0,0.0,0.0,0.0,135361.811055,992521.6,0.0,1789563.0,1552003.0,1456080.0,1798762.0,1437888.0,1456701.0,0.0,558.783279,484.606437,656.359756,454.654904,561.655786,448.974441,674.845467,454.848556,0.0,0.0
3,3777.15668,0.374865,46196.556585,4.584787,1869.45237,0.185534,2023.34971,0.200808,9.118195,0.000905,33181.556395,3.293111,408193.790775,40.511281,56434.51041,5.600855,10563.31771,1.048359,2628.041715,0.260821,247.525635,0.024566,170758.216925,16.946936,4421.65739,0.438828,25520.048645,2.532743,165.684275,0.016443,0.0,0.0,790.39183,0.078443,0.0,0.0,105624.2813,10.482705,55372.129495,5.495419,94.29548,0.009358,469.475845,0.046593,42057.340845,4.17399,21360.372565,2.119915,11589.89303,1.150241,4257.08509,0.422495,2192.0,-1.0,40.985714,81.285714,3.142857,35.557143,50.114286,91.285714,32.171429,62.857143,0.117143,42.389655,77.448276,3.848276,35.586207,52.455172,88.965517,33.365517,58.862069,0.175517,222185.0,1065600.0,0.208507,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,90694540000.0,0.0,0.0,0.0,0.0,4357758000.0,9455246000.0,938387.9,374302000.0,6998647.0,536668.7,6071674.0,8557426.0,5493536.0,7238382.0,657124.7,6076637.0,8957152.0,5697436.0,0.0,0.0,938387.9,0.0,140243.358184,55939950.0,1045957.0,80205.86717,907420.0,1278919.0,2329616.0,821016.4,1081786.0,98208.187199,908161.7,1338659.0,2270404.0,851489.6,27194160000.0,0.0,0.0,0.0,140243.358184,2698891.0,0.0,2269466.0,1968875.0,1781401.0,2347205.0,1970484.0,1847520.0,0.0,225.23367,195.401396,275.398994,176.795477,232.948913,195.561115,288.263147,183.357495,0.0,0.0
4,31.802485,0.005011,28.46656,0.004485,218.39189,0.034409,0.88958,0.00014,0.0,0.0,34882.878145,5.495994,255438.00431,40.245698,28.688955,0.00452,12.231725,0.001927,2.22395,0.00035,522.18346,0.082273,207502.763615,32.69323,465.027945,0.073268,12257.745215,1.931277,3351.270255,0.528012,0.0,0.0,0.88958,0.00014,0.0,0.0,0.0,0.0,106091.088405,16.715249,0.0,0.0,0.667185,0.000105,425.66403,0.067066,13178.68291,2.076376,245.07929,0.038614,11.786935,0.001857,3522.0,787.0,41.928571,93.014286,5.657143,39.0,50.142857,100.0,35.571429,74.142857,0.0,42.931034,87.017241,6.268966,37.196552,52.827586,97.551724,34.344828,61.275862,0.0,45424.0,663040.0,0.068509,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11603020000.0,0.0,0.0,0.0,0.0,2543516000.0,22014190000.0,3468460.0,730824700.0,8700294.0,1173873.0,8092608.0,10404780.0,7381170.0,8908308.0,1300828.0,7718387.0,10961870.0,7126647.0,0.0,0.0,3468460.0,0.0,204891.264092,43171780.0,513949.7,69343.815788,478052.1,614638.4,1225775.0,436025.5,526237.7,76843.382072,455945.9,647547.1,1195764.0,420990.1,8127375000.0,0.0,0.0,0.0,204891.264092,1280514.0,0.0,4448248.0,4137552.0,3773812.0,4554600.0,3946223.0,3643680.0,0.0,700.846514,651.894713,838.150345,594.585288,717.602933,621.749626,883.02626,574.082347,0.0,0.0


In [20]:
# Hold out a test set; no test_size is passed, so scikit-learn's default proportion applies.
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    poly2_df,
    y,
    random_state=0,
)

In [22]:
# Re-attach the target to the training features so rows can be resampled together
training = pd.concat(objs=[X_train, y_train], axis='columns')

In [23]:
# Partition the training rows by target class
no_fire = training.loc[training['fire_started'].eq(0)]  # 0 = No Wildfire
fire = training.loc[training['fire_started'].eq(1)]  # 1 = Wildfire

In [24]:
# Upsample the minority class (wildfire) until it matches the majority class row count
fire_upsampled = resample(
    fire,
    replace=True,  # draw with replacement, so wildfire rows are duplicated
    n_samples=len(no_fire),  # one sampled row per majority-class row
    random_state=42,  # fixed seed for a reproducible sample
)

In [25]:
# Stack the majority rows and the upsampled minority rows into one balanced frame
resampled_df = pd.concat(objs=[no_fire, fire_upsampled], axis=0)

In [26]:
# Sanity check: both target classes should now have the same number of rows
resampled_df['fire_started'].value_counts()

1.0    12812
0.0    12812
Name: fire_started, dtype: int64

In [27]:
# Rebuild the training target and features from the balanced frame
# (these replace the X_train / y_train produced by the original split)
y_train = resampled_df['fire_started']
X_train = resampled_df.drop(columns='fire_started')

## Logistic Models

### Base Model

In [28]:
# Baseline logistic regression; the seed is fixed for consistent results
logreg = LogisticRegression(random_state=0)
# Fit on the resampled (class-balanced) training data, which is not yet scaled
logreg.fit(X=X_train, y=y_train)

LogisticRegression(random_state=0)

In [29]:
# Predict the target on the training set, then on the held-out test set
y_hat_train, y_hat_test = map(logreg.predict, (X_train, X_test))

In [30]:
# Report F1, accuracy and recall on training vs. test predictions to check for overfitting
for heading, metric in (
    ('F1 Score: Training', f1_score),
    ('Accuracy Score: Training', accuracy_score),
    ('Recall Score: Training', recall_score),
):
    print(heading, metric(y_train, y_hat_train), 'Test', metric(y_test, y_hat_test))

F1 Score: Training 0.6666666666666666 Test 0.12010768275005178
Accuracy Score: Training 0.5 Test 0.06389072482925755
Recall Score: Training 1.0 Test 1.0


In [31]:
# Confusion matrix for the baseline model on the test set (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=y_hat_test)

array([[   0, 4249],
       [   0,  290]])

### Model - Scaled Data

In [32]:
# Standardize the features. The scaler learns its statistics from the training
# features only and the same transformation is then applied to the test features.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [33]:
# Second logistic regression, this time using the liblinear solver
logreg1 = LogisticRegression(solver='liblinear')
# Train it on the standardized training features
logreg1.fit(X=X_train_scaled, y=y_train)

LogisticRegression(solver='liblinear')

In [34]:
# Predict the target on the scaled training set, then on the scaled test set
y_hat_train1, y_hat_test1 = map(logreg1.predict, (X_train_scaled, X_test_scaled))

In [35]:
# Report F1, accuracy and recall on training vs. test predictions to check for overfitting
for heading, metric in (
    ('F1 Score: Training', f1_score),
    ('Accuracy Score: Training', accuracy_score),
    ('Recall Score: Training', recall_score),
):
    print(heading, metric(y_train, y_hat_train1), 'Test', metric(y_test, y_hat_test1))

F1 Score: Training 0.8223723277909738 Test 0.29129886506935687
Accuracy Score: Training 0.8132219793943178 Test 0.7523683630755673
Recall Score: Training 0.864736184826725 Test 0.7965517241379311


In [36]:
# Confusion matrix for the scaled logistic model on the test set (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=y_hat_test1)

array([[3184, 1065],
       [  59,  231]])

In [37]:
# 10-fold cross-validation of the scaled logistic model on the training data.
# NOTE(review): the folds are cut from the upsampled rows, so copies of the same
# wildfire row can land in both the fitting and validation parts of a fold; the
# mean score is therefore likely optimistic compared with the test set.
scores = cross_val_score(estimator=logreg1, X=X_train_scaled, y=y_train, cv=10)
print(scores.mean())

0.8080708685999616


## K Nearest Neighbor (KNN)

### Base Model

In [38]:
# Baseline KNN classifier; no arguments are passed, so every hyperparameter stays at the library default
knn = KNeighborsClassifier()

In [39]:
# Train the KNN model on the standardized features produced for the scaled logistic model
knn.fit(X=X_train_scaled, y=y_train)

KNeighborsClassifier()

In [40]:
# Predict the target for the scaled training set, then for the scaled test set
knn_train, knn_test = map(knn.predict, (X_train_scaled, X_test_scaled))

In [41]:
# Report F1, accuracy and recall on training vs. test predictions to check for overfitting
for heading, metric in (
    ('F1 Score: Training', f1_score),
    ('Accuracy Score: Training', accuracy_score),
    ('Recall Score: Training', recall_score),
):
    print(heading, metric(y_train, knn_train), 'Test', metric(y_test, knn_test))

F1 Score: Training 0.9567262815965352 Test 0.288911495422177
Accuracy Score: Training 0.9547689665938183 Test 0.8460013218770654
Recall Score: Training 1.0 Test 0.4896551724137931


In [42]:
# Confusion matrix for the baseline KNN model on the test set (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=knn_test)

array([[3698,  551],
       [ 148,  142]])

### Tuning Number of Nearest Neighbors

In [43]:
def max_value(l):
    """Locate the largest item of a list.

    Parameters
    ----------
    l : list
        Non-empty list of mutually comparable values (here: recall scores).

    Returns
    -------
    tuple
        ``(index, value)`` where ``value`` is the largest item and ``index`` is the
        zero-based position of its first occurrence.

    Raises
    ------
    ValueError
        If ``l`` is empty.
    """
    # max() keeps the first of several equal maxima, so ties resolve to the lowest index
    position, largest = max(enumerate(l), key=lambda pair: pair[1])
    return position, largest

In [44]:
# Candidate numbers of neighbors, and the test-set recall obtained with each one
k_range = list(range(1, 21))
k_scores = []
# NOTE(review): k is chosen by recall on the test set, so the test metrics reported
# for the tuned model are not an unbiased estimate; a separate validation split
# would be needed for that.
for k in k_range:
    # Build and train a KNN model with k neighbors on the scaled training data
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    # Predict the target on the scaled test set
    y_pred = knn.predict(X_test_scaled)
    # Score the predictions and add the recall to the list of recall scores
    recall = recall_score(y_true=y_test, y_pred=y_pred)
    k_scores.append(recall)

# Position and value of the best recall score
idx, val = max_value(k_scores)
# Print the k value that produced the best recall, followed by that recall
print(k_range[idx], val)

19 0.7551724137931034


In [45]:
# Refit KNN with the k selected by the preceding search. The value is read from the
# search result (k_range[idx]) rather than typed in as a literal, so this cell cannot
# drift out of sync with the search if the candidate range or the data changes.
knn = KNeighborsClassifier(n_neighbors = k_range[idx])
knn.fit(X_train_scaled, y_train)
# Predictions of the tuned model on the test and training sets
y_pred = knn.predict(X_test_scaled)
knn_train1 = knn.predict(X_train_scaled)
# Compare training and test metrics to check for overfitting
print('F1 Score: Training', f1_score(y_train, knn_train1), 'Test', f1_score(y_test, y_pred))
print('Accuracy Score: Training', accuracy_score(y_train, knn_train1), 'Test', accuracy_score(y_test, y_pred))
print('Recall Score: Training', recall_score(y_train, knn_train1), 'Test', recall_score(y_test, y_pred))

F1 Score: Training 0.8837338134478222 Test 0.25186889016676245
Accuracy Score: Training 0.8689509834530128 Test 0.7133729896452963
Recall Score: Training 0.9960974086793631 Test 0.7551724137931034


In [46]:
# Confusion matrix for the tuned KNN model on the test set (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[3019, 1230],
       [  71,  219]])

In [47]:
# 10-fold cross-validation of the tuned KNN model on the training data.
# NOTE(review): the folds are cut from the upsampled rows, so copies of the same
# wildfire row can land in both the fitting and validation parts of a fold; the
# mean score is therefore likely optimistic compared with the test set.
scores_knn = cross_val_score(estimator=knn, X=X_train_scaled, y=y_train, cv=10)
print(scores_knn.mean())

0.8478385436416817


## Decision Tree

### Base Model

In [50]:
# Baseline decision tree with a fixed seed, trained on the resampled (unscaled) training data
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

In [51]:
# Predictions of the baseline tree on the training set, then on the test set
dt_train, dt_test = map(dt.predict, (X_train, X_test))

# Report F1, accuracy and recall on training vs. test predictions to check for overfitting
for heading, metric in (
    ('F1 Score: Training', f1_score),
    ('Accuracy Score: Training', accuracy_score),
    ('Recall Score: Training', recall_score),
):
    print(heading, metric(y_train, dt_train), 'Test', metric(y_test, dt_test))

F1 Score: Training 1.0 Test 0.24561403508771928
Accuracy Score: Training 1.0 Test 0.9052654769773077
Recall Score: Training 1.0 Test 0.2413793103448276


In [52]:
# Confusion matrix for the baseline decision tree on the test set (rows = true class, columns = predicted class)
confusion_matrix(y_true=y_test, y_pred=dt_test)

array([[4039,  210],
       [ 220,   70]])

### Training Parameters - GridSearchCV

In [53]:
# Hyperparameter grid for the decision tree search: 5 x 20 x 10 = 1000 combinations
parameters = {
    'max_depth': range(10, 15),
    'max_features': range(55, 75),
    'min_samples_split': range(10, 20),
}

In [54]:
# Exhaustive grid search over the decision tree hyperparameters, scored by F1 with
# 10-fold cross-validation and run on all available cores.
# NOTE(review): the folds are cut from the upsampled training rows, so duplicated
# wildfire rows can appear on both sides of a fold and inflate the CV score.
dtg = DecisionTreeClassifier(random_state=0)
grid_model = GridSearchCV(
    estimator=dtg,
    param_grid=parameters,
    scoring='f1',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

grid_model.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:   52.7s
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 10000 out of 1000

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(random_state=0), n_jobs=-1,
             param_grid={'max_depth': range(10, 15),
                         'max_features': range(55, 75),
                         'min_samples_split': range(10, 20)},
             scoring='f1', verbose=1)

In [55]:
# Best cross-validated score, the parameters that achieved it, and the refitted estimator
print(
    grid_model.best_score_,
    grid_model.best_params_,
    grid_model.best_estimator_,
    sep='\n',
)

0.9284293620407518
{'max_depth': 14, 'max_features': 59, 'min_samples_split': 12}
DecisionTreeClassifier(max_depth=14, max_features=59, min_samples_split=12,
                       random_state=0)


In [56]:
# Predictions of the best tree found by the grid search on the training set, then on the test set
dt_train2, dt_test2 = map(grid_model.best_estimator_.predict, (X_train, X_test))

# Report F1, accuracy and recall on training vs. test predictions to check for overfitting
for heading, metric in (
    ('F1 Score: Training', f1_score),
    ('Accuracy Score: Training', accuracy_score),
    ('Recall Score: Training', recall_score),
):
    print(heading, metric(y_train, dt_train2), 'Test', metric(y_test, dt_test2))

F1 Score: Training 0.9250541516245487 Test 0.2765121759622938
Accuracy Score: Training 0.9189822041835779 Test 0.7970918704560476
Recall Score: Training 1.0 Test 0.6068965517241379
