# Importing libraries
### Done by Gabriel, Colby, Heejun

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import gc

# import preprocessing classes
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# feature selection
from sklearn.feature_selection import chi2, mutual_info_regression
from sklearn.feature_selection import SelectKBest, chi2

# import models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# import train test split
from sklearn.model_selection import train_test_split

# feature importance
from sklearn.inspection import permutation_importance

# import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# functions to save data
from joblib import dump, load

# read data
## mount google drive
from google.colab import drive
drive.mount("/content/drive/")
%cd '/content/drive/MyDrive/DS440/dataset/Data/'
!pwd

## read in files
dir = '/content/drive/MyDrive/DS440/dataset/Data/'
cnx = sqlite3.connect(dir + 'wildfire.sqlite')
cursor = cnx.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='Fires';")
tables = cursor.fetchall()
Fires = pd.read_sql_query("SELECT * FROM 'Fires'", cnx).set_index('FOD_ID')
gc.collect()
# Drop unnecessary column
Fires.drop(columns=['FPA_ID','SOURCE_SYSTEM_TYPE','SOURCE_SYSTEM','NWCG_REPORTING_AGENCY','NWCG_REPORTING_UNIT_ID',
                    'NWCG_REPORTING_UNIT_NAME','SOURCE_REPORTING_UNIT','SOURCE_REPORTING_UNIT_NAME','LOCAL_FIRE_REPORT_ID',
                    'LOCAL_INCIDENT_ID','FIRE_CODE','FIRE_NAME','ICS_209_PLUS_INCIDENT_JOIN_ID','ICS_209_PLUS_COMPLEX_JOIN_ID',
                    'MTBS_ID','MTBS_FIRE_NAME','COMPLEX_NAME','OWNER_DESCR','NWCG_CAUSE_CLASSIFICATION',
                    'NWCG_CAUSE_AGE_CATEGORY','FIRE_SIZE_CLASS','CONT_TIME','CONT_DATE'],inplace=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/DS440/dataset/Data
/content/drive/MyDrive/DS440/dataset/Data


In [None]:
# Preview the first rows of the loaded table after the unused columns were dropped.
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2005,2/2/2005 0:00,33,1300.0,Power generation/transmission/distribution,33.0,0.1,40.036944,-121.005833,CA,63,6063,Plumas County
2,2004,5/12/2004 0:00,133,845.0,Natural,133.0,0.25,38.933056,-120.404444,CA,61,6061,Placer County
3,2004,5/31/2004 0:00,152,1921.0,Debris and open burning,152.0,0.1,38.984167,-120.735556,CA,17,6017,El Dorado County
4,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.913333,CA,3,6003,Alpine County
5,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.933056,CA,3,6003,Alpine County


# Data Preprocessing
## convert all features into usable format
## Use a random sample from the original dataset due to high computational cost
## Take 5% of original data
## Purpose : 
1. To handle big data
2. To check scalability

In [None]:
# Later just remove this line of code when using original dataset
# Keep a 5% random subset (without replacement) so the rest of the notebook
# runs quickly. The seed makes the subset, and therefore every downstream
# number, reproducible across kernel restarts; without it each run drew a
# different sample.
# NOTE: re-running this cell samples the already-sampled frame again, so run
# it only once per kernel session.
Fires = Fires.sample(frac=0.05,replace=False,random_state=0)

In [None]:
# Display the sampled frame (the notebook renderer truncates the middle rows).
Fires

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
191519,1994,7/15/1994 0:00,196,1519.0,Equipment and vehicle use,196.0,0.1,48.369900,-119.401200,WA,,,
1128845,1992,4/7/1992 0:00,98,,Debris and open burning,,1.0,34.958300,-79.238300,NC,,,
300230849,2015,4/30/2015 0:00,120,,Equipment and vehicle use,,15.0,44.620500,-100.000000,SD,Sully,46119,Sully County
201412005,2011,4/27/2011 0:00,117,,Debris and open burning,,1.0,26.875909,-99.285234,TX,Zapata,48505,Zapata County
201619181,2012,7/9/2012 0:00,191,,Misuse of fire by a minor,191.0,1.0,33.209936,-87.064989,AL,Bibb,01007,Bibb County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
663451,2005,4/10/2005 0:00,100,,Debris and open burning,100.0,2.0,32.252220,-94.989440,TX,Smith,48423,Smith County
146980,2001,9/10/2001 0:00,253,1030.0,Natural,253.0,0.1,43.712222,-115.371389,ID,,,
815386,1995,5/29/1995 0:00,149,1302.0,Debris and open burning,151.0,1.0,33.248719,-112.109264,AZ,Maricopa,04013,Maricopa County
445004,2005,2/10/2005 0:00,41,1355.0,Missing data/not specified/undetermined,41.0,12.0,30.423330,-82.243050,FL,Baker,12003,Baker County


# Colby's Coding

In [None]:
# extract month from discovery date
def get_first_element(date_list):
  """Return the item at position 0 of `date_list`.

  Used to pull the month token out of a date string that was split on '/'.
  Works on anything that supports integer indexing.
  """
  head = date_list[0]
  return head

# DISCOVERY_DATE is rendered like "2/2/2005 0:00": cast it to the pandas string
# dtype, split on '/', and keep the first token as the integer month.
discovery_text = Fires['DISCOVERY_DATE'].astype("string")
Fires['DISCOVERY_DATE'] = discovery_text
Fires['DISCOVERY_LIST'] = discovery_text.str.split(pat='/')
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_LIST'].map(get_first_element).astype("int64")

# Integer-encode the general cause. The fitted encoder is kept in `cause_enc`
# so the integer codes can be mapped back to cause names later.
cause_text = Fires['NWCG_GENERAL_CAUSE'].astype("string")
Fires['NWCG_GENERAL_CAUSE'] = cause_text
cause_enc = LabelEncoder()
Fires['NWCG_CAUSE_CLASSIFICATION_ORD'] = cause_enc.fit_transform(cause_text)


# figure out number of days to contain the fire
Fires['DAYS_TO_CONT'] = Fires['CONT_DOY'] - Fires['DISCOVERY_DOY']

def convert_negative_days(day, days_in_year=365):
  """Correct a day-of-year difference that wrapped past December 31st.

  Args:
    day: CONT_DOY - DISCOVERY_DOY for one fire (may be NaN).
    days_in_year: length of the discovery year; 365 by default, pass 366 for
      a leap year.

  Returns:
    `days_in_year + day` when `day` is negative, otherwise `day` unchanged.
    NaN is returned as-is because `NaN < 0` is False.
  """
  if day < 0:
    return days_in_year + day
  return day

# A negative difference means the fire was contained in a later calendar year.
# The number of days left in the discovery year depends on whether that year is
# a leap year; always adding 365 under-counted leap-year rollovers by one day.
# NOTE(review): assumes FIRE_YEAR is the discovery year (it matches
# DISCOVERY_DATE in the displayed rows) and that containment happened in the
# year immediately after - confirm against the dataset documentation.
fire_year = Fires['FIRE_YEAR']
is_leap_year = (fire_year % 4 == 0) & ((fire_year % 100 != 0) | (fire_year % 400 == 0))
year_lengths = np.where(is_leap_year, 366, 365)
Fires['DAYS_TO_CONT'] = [
  convert_negative_days(delta, length)
  for delta, length in zip(Fires['DAYS_TO_CONT'], year_lengths)
]


# encode state categories into integers
Fires['STATE'] = Fires['STATE'].astype("string")
state_enc = LabelEncoder()
Fires['STATE_ORD'] = state_enc.fit_transform(Fires['STATE'])

# what to do about counties? Use fips code (must be an int)
# Missing codes get the sentinel 1000000, which lies outside the range of the
# FIPS codes shown in this notebook. The result is assigned back rather than
# using `Fires[col].fillna(..., inplace=True)`: that form is chained assignment,
# which warns on pandas 2.x and silently does nothing under Copy-on-Write,
# leaving <NA> values that make the int64 cast fail.
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("string").fillna('1000000').astype("int64")

# fill NA with the mean days to containment (the value depends on the sample drawn)
Fires['DAYS_TO_CONT'] = Fires['DAYS_TO_CONT'].fillna(Fires['DAYS_TO_CONT'].mean())


# drop the raw columns that have been replaced by encoded/derived features
Fires.drop(columns=['DISCOVERY_DATE','DISCOVERY_LIST','NWCG_GENERAL_CAUSE',
                    'CONT_DOY','DISCOVERY_DOY','STATE','FIPS_NAME',
                    'COUNTY'],inplace=True)
gc.collect()

19

In [None]:
# Preview the frame after encoding; DISCOVERY_TIME still contains NaN at this point.
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
191519,1994,1519.0,0.1,48.3699,-119.4012,1000000,7,2,0.0,48
1128845,1992,,1.0,34.9583,-79.2383,1000000,4,1,0.884012,27
300230849,2015,,15.0,44.6205,-100.0,46119,4,2,0.884012,42
201412005,2011,,1.0,26.875909,-99.285234,48505,4,1,0.884012,44
201619181,2012,,1.0,33.209936,-87.064989,1007,7,6,0.0,1


# **Code by Heejun Son**
### Need to handle missing values in DISCOVERY_TIME column 
### Need to compare MICE vs KNN vs Replacement with the most frequent number
I did not drop DISCOVERY_TIME column.


In [None]:
# Strategy 1: fill missing DISCOVERY_TIME with the most frequent observed value.
# sample(frac=1, random_state=0) yields a shuffled copy, so `Fires` stays untouched.
Fires_Freq = Fires.sample(frac=1, random_state=0)
most_common_time = Fires_Freq['DISCOVERY_TIME'].mode().iloc[0]
Fires_Freq['DISCOVERY_TIME'] = Fires_Freq['DISCOVERY_TIME'].fillna(most_common_time)
Fires_Freq

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
470453,2006,1400.0,6.17,34.033110,-83.605716,13013,10,1,0.884012,10
1373969,1994,1400.0,0.10,34.463056,-117.581111,1000000,8,7,0.884012,4
1061992,2002,1400.0,10.00,36.545000,-84.145000,1000000,4,1,0.884012,43
201614127,2012,1730.0,1.00,26.636800,-81.675900,12071,6,0,0.000000,9
725998,2007,1400.0,0.50,37.633890,-78.895840,51125,12,1,0.884012,46
...,...,...,...,...,...,...,...,...,...,...
300253779,2015,1400.0,2.00,48.429500,-99.959200,38069,4,1,0.884012,28
300143923,2014,1400.0,0.50,44.083178,-103.545229,46103,4,5,0.884012,42
874734,2006,1400.0,0.01,44.471170,-122.806500,41043,9,8,0.000000,37
400303774,2018,2324.0,0.04,45.359722,-116.360833,16049,8,5,0.884012,13


In [None]:
# Strategy 2: predict missing DISCOVERY_TIME values with a k-nearest-neighbours imputer.
from sklearn.impute import KNNImputer
# Shuffled copy with the same seed as the other strategies, so rows line up.
Fires_KNN = Fires.sample(frac=1, random_state=0)
# Columns fed to the imputer (FIPS_CODE is left out). DISCOVERY_TIME sits at
# position 1, which the next cell relies on when reading the result back.
knn_input_columns = ['FIRE_YEAR', 'DISCOVERY_TIME', 'FIRE_SIZE', 'LATITUDE', 'LONGITUDE',
                     'DISCOVERY_MONTH', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD']
Input = Fires_KNN[knn_input_columns]
imputer = KNNImputer(n_neighbors=5)

In [None]:
# Fit the imputer and fill the gaps; the result is a plain array, so it is
# wrapped in a DataFrame with positional column labels 0..8.
result = imputer.fit_transform(Input)
check = pd.DataFrame(result)
# Column 1 corresponds to DISCOVERY_TIME in `Input`. Selecting it with a list
# keeps it two-dimensional, so this becomes a list of one-element lists.
New_Discovery_Time = check.loc[:, [1]].to_numpy().tolist()

In [None]:
# Attach the imputed times as a temporary column (assigned positionally, so it
# relies on Fires_KNN having the same row order as the imputer input).
# Each entry is a one-element list, which is why the preview shows e.g. [1284.6].
Fires_KNN['Assumption'] = New_Discovery_Time
Fires_KNN.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
470453,2006,,6.17,34.03311,-83.605716,13013,10,1,0.884012,10,[1284.6]
1373969,1994,,0.1,34.463056,-117.581111,1000000,8,7,0.884012,4,[1237.6]
1061992,2002,,10.0,36.545,-84.145,1000000,4,1,0.884012,43,[1206.0]
201614127,2012,1730.0,1.0,26.6368,-81.6759,12071,6,0,0.0,9,[1730.0]
725998,2007,,0.5,37.63389,-78.89584,51125,12,1,0.884012,46,[1234.0]


In [None]:
# Turn the temporary 'Assumption' values into plain floats by going through
# text: render as string, strip the surrounding brackets, parse as float64.
Fires_KNN['Assumption'] = (
  Fires_KNN['Assumption']
  .astype("string")
  .str.strip('[]')
  .astype("float64")
)
# Replace the NaN-containing DISCOVERY_TIME with the imputed column; it ends up
# as the last column of the frame.
Fires_KNN = Fires_KNN.drop(columns=['DISCOVERY_TIME']).rename(columns={'Assumption': 'DISCOVERY_TIME'})
Fires_KNN

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
470453,2006,6.17,34.033110,-83.605716,13013,10,1,0.884012,10,1284.6
1373969,1994,0.10,34.463056,-117.581111,1000000,8,7,0.884012,4,1237.6
1061992,2002,10.00,36.545000,-84.145000,1000000,4,1,0.884012,43,1206.0
201614127,2012,1.00,26.636800,-81.675900,12071,6,0,0.000000,9,1730.0
725998,2007,0.50,37.633890,-78.895840,51125,12,1,0.884012,46,1234.0
...,...,...,...,...,...,...,...,...,...,...
300253779,2015,2.00,48.429500,-99.959200,38069,4,1,0.884012,28,1400.0
300143923,2014,0.50,44.083178,-103.545229,46103,4,5,0.884012,42,1919.2
874734,2006,0.01,44.471170,-122.806500,41043,9,8,0.000000,37,1280.4
400303774,2018,0.04,45.359722,-116.360833,16049,8,5,0.884012,13,2324.0


In [None]:
#Replace NaN with values predicted by Multivariate feature imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Shuffled copy with the same seed as the other strategies, so rows line up.
Fires_MICE = Fires.sample(frac=1,random_state=0)
# Build the imputer input from Fires_MICE. It was previously taken from
# Fires_KNN, whose DISCOVERY_TIME had already been filled by the KNN imputer,
# so IterativeImputer had nothing to impute and the "MICE" values were just a
# copy of the KNN results.
Input = Fires_MICE[['FIRE_YEAR','DISCOVERY_TIME','FIRE_SIZE','LATITUDE','LONGITUDE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD']]
imp_15 = IterativeImputer(max_iter=15, random_state=0)
imp_15.fit(Input)
result_mice = imp_15.transform(Input)
check_mice = pd.DataFrame(result_mice)
# Column 1 corresponds to DISCOVERY_TIME in `Input`; selecting it with a list
# produces a list of one-element lists, which a later cell converts to floats.
New_Discovery_Time_mice = check_mice[[1]].values.tolist()

In [None]:
# Attach the imputed times as a temporary column (assigned positionally, so it
# relies on Fires_MICE having the same row order as the imputer input).
# Each entry is a one-element list, hence the bracketed values in the preview.
Fires_MICE['Assumption'] = New_Discovery_Time_mice
Fires_MICE.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
470453,2006,,6.17,34.03311,-83.605716,13013,10,1,0.884012,10,[1284.6]
1373969,1994,,0.1,34.463056,-117.581111,1000000,8,7,0.884012,4,[1237.6]
1061992,2002,,10.0,36.545,-84.145,1000000,4,1,0.884012,43,[1206.0]
201614127,2012,1730.0,1.0,26.6368,-81.6759,12071,6,0,0.0,9,[1730.0]
725998,2007,,0.5,37.63389,-78.89584,51125,12,1,0.884012,46,[1234.0]


In [None]:
# Inspect column dtypes; 'Assumption' is object because each cell holds a list.
Fires_MICE.dtypes

FIRE_YEAR                          int64
DISCOVERY_TIME                   float64
FIRE_SIZE                        float64
LATITUDE                         float64
LONGITUDE                        float64
FIPS_CODE                          int64
DISCOVERY_MONTH                    int64
NWCG_CAUSE_CLASSIFICATION_ORD      int64
DAYS_TO_CONT                     float64
STATE_ORD                          int64
Assumption                        object
dtype: object

In [None]:
# Turn the temporary 'Assumption' values into plain floats by going through
# text: render as string, strip the surrounding brackets, parse as float64.
Fires_MICE['Assumption'] = (
  Fires_MICE['Assumption']
  .astype("string")
  .str.strip('[]')
  .astype("float64")
)
# Replace the NaN-containing DISCOVERY_TIME with the imputed column; it ends up
# as the last column of the frame.
Fires_MICE = Fires_MICE.drop(columns=['DISCOVERY_TIME']).rename(columns={'Assumption': 'DISCOVERY_TIME'})
Fires_MICE

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
470453,2006,6.17,34.033110,-83.605716,13013,10,1,0.884012,10,1284.6
1373969,1994,0.10,34.463056,-117.581111,1000000,8,7,0.884012,4,1237.6
1061992,2002,10.00,36.545000,-84.145000,1000000,4,1,0.884012,43,1206.0
201614127,2012,1.00,26.636800,-81.675900,12071,6,0,0.000000,9,1730.0
725998,2007,0.50,37.633890,-78.895840,51125,12,1,0.884012,46,1234.0
...,...,...,...,...,...,...,...,...,...,...
300253779,2015,2.00,48.429500,-99.959200,38069,4,1,0.884012,28,1400.0
300143923,2014,0.50,44.083178,-103.545229,46103,4,5,0.884012,42,1919.2
874734,2006,0.01,44.471170,-122.806500,41043,9,8,0.000000,37,1280.4
400303774,2018,0.04,45.359722,-116.360833,16049,8,5,0.884012,13,2324.0


# Create a baseline mse and mae

In [None]:
# Shuffle the FIRE_SIZE values with a fixed seed; the shuffled series serves as
# a "random guess" prediction for the baseline error computed in the next cell.
FIRE_SIZE_PERMUTE = Fires['FIRE_SIZE'].sample(frac=1,random_state=0)

In [None]:
# Baseline errors: shuffled fire sizes compared against the real ones.
baseline_mae = mean_absolute_error(FIRE_SIZE_PERMUTE, Fires['FIRE_SIZE'])
baseline_mse = mean_squared_error(FIRE_SIZE_PERMUTE, Fires['FIRE_SIZE'])
print("MAE: ", baseline_mae)
print("MSE: ", baseline_mse)

MAE:  137.95039381900162
MSE:  7011028.678069244


# **Code by Heejun**
## Feature selection information
Later we need to convert LONGITUDE back to negative values!

Reason for using abs(): there are no positive values in this column.

(The United States has only negative longitude values.)

In [None]:
# The feature-selection step below requires non-negative inputs, so flip the
# sign of LONGITUDE in each of the three imputed frames.
for imputed_frame in (Fires_Freq, Fires_KNN, Fires_MICE):
  imputed_frame['LONGITUDE'] = imputed_frame['LONGITUDE'].abs()

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing

def select_best_features_chi2(frame, label, max_k=8):
  """Run chi-squared SelectKBest for k = 1..max_k on one imputed frame.

  The same analysis used to be copy-pasted once per imputation strategy; this
  helper keeps the three runs identical by construction.

  Args:
    frame: imputed DataFrame containing the nine feature columns and FIRE_SIZE.
    label: short strategy name used as the prefix of the printed lines.
    max_k: largest number of features to select (default 8, as before).

  Returns:
    (X, y, encoder, y_encoded, best_features) where `best_features` maps each
    k to the list of selected column names.
  """
  X = frame[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
  y = frame['FIRE_SIZE']
  # Transform FIRE_SIZE: SelectKBest() cannot handle continuous numbers, so
  # every distinct fire size is label-encoded into an integer class.
  encoder = preprocessing.LabelEncoder()
  y_encoded = encoder.fit_transform(y)

  best_features = dict()
  for k in range(1, max_k + 1):
    selector = SelectKBest(chi2, k=k)
    selector.fit(X, y_encoded)
    chosen = list(X.columns[selector.get_support(indices=True)])
    best_features[k] = chosen
    print("(%s) When k= " % label, k, ",", chosen)
  return X, y, encoder, y_encoded, best_features

# One run per imputation strategy; the module-level result names are unchanged.
X_Freq, y_Freq, Encoder_Freq, Firs_size_encoded_Freq, Best_Features_Freq = select_best_features_chi2(Fires_Freq, "Freq")
X_KNN, y_KNN, Encoder_KNN, Firs_size_encoded_KNN, Best_Features_KNN = select_best_features_chi2(Fires_KNN, "KNN")
X_MICE, y_MICE, Encoder_MICE, Firs_size_encoded_MICE, Best_Features_MICE = select_best_features_chi2(Fires_MICE, "MICE")

(Freq) When k=  1 , ['FIPS_CODE']
(Freq) When k=  2 , ['FIPS_CODE', 'DAYS_TO_CONT']
(Freq) When k=  3 , ['FIPS_CODE', 'DAYS_TO_CONT', 'DISCOVERY_TIME']
(Freq) When k=  4 , ['FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  5 , ['LONGITUDE', 'FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  6 , ['LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  7 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  8 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'DISCOVERY_MONTH', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(KNN) When k=  1 , ['FIPS_CODE']
(KNN) When k=  2 , ['FIPS_CODE', 'DAYS_TO_CONT']
(KNN) When k=  3 , ['FIPS_CODE', 'DAYS_TO_CONT', 'DISCOVERY_TIME']
(KNN) When k=  4 , ['FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(KNN) When k=  5 , [

Features to predict class:
- fire year, general cause, lat, long

Features to predict cause:
- fire year, size class, lat, long

# MODEL

In [None]:
#### build an 80/20 train/test split for each imputation strategy
# The same nine predictors and the same split settings are used for all three
# frames, so the only difference between them is how DISCOVERY_TIME was imputed.
model_features = ['FIRE_YEAR', 'LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'DISCOVERY_MONTH',
                  'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
split_options = dict(test_size=0.2, random_state=0)

# most-frequent imputation
X_Freq = Fires_Freq[model_features]
y_Freq = Fires_Freq['FIRE_SIZE']
X_Freq_train, X_Freq_test, y_Freq_train, y_Freq_test = train_test_split(X_Freq, y_Freq, **split_options)

# KNN imputation
X_KNN = Fires_KNN[model_features]
y_KNN = Fires_KNN['FIRE_SIZE']
X_KNN_train, X_KNN_test, y_KNN_train, y_KNN_test = train_test_split(X_KNN, y_KNN, **split_options)

# MICE imputation
X_MICE = Fires_MICE[model_features]
y_MICE = Fires_MICE['FIRE_SIZE']
X_MICE_train, X_MICE_test, y_MICE_train, y_MICE_test = train_test_split(X_MICE, y_MICE, **split_options)

In [None]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor, SVR] #GaussianNB, 

def fit_and_score_models(models, X_train, y_train, X_test, y_test, label):
  """Fit every model class with default settings and score it on the test split.

  The same loop used to be copy-pasted once per imputation strategy; this
  helper keeps the three runs identical by construction.

  Args:
    models: iterable of estimator classes (not instances).
    X_train, y_train: training features and target.
    X_test, y_test: held-out features and target.
    label: short strategy name used as the prefix of the printed lines.

  Returns:
    dict mapping each model class to (fitted_model, predictions, mse, mae).
  """
  scores = dict()
  for model in models:
    # fit model to data
    print(model)
    regressor = model().fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    # scikit-learn's metric signature is (y_true, y_pred). MSE and MAE are
    # symmetric, so the numbers match the previous swapped call, but the
    # documented order avoids surprises if an asymmetric metric is added.
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    scores[model] = (regressor, predictions, mse, mae)
    print("(%s) MSE = " % label, mse, '\n', "(%s) MAE = " % label, mae)
  return scores

# One run per imputation strategy; the result dict names are unchanged.
model_scores_Freq = fit_and_score_models(model_list, X_Freq_train, y_Freq_train, X_Freq_test, y_Freq_test, "Freq")
model_scores_KNN = fit_and_score_models(model_list, X_KNN_train, y_KNN_train, X_KNN_test, y_KNN_test, "KNN")
model_scores_MICE = fit_and_score_models(model_list, X_MICE_train, y_MICE_train, X_MICE_test, y_MICE_test, "MICE")

<class 'sklearn.linear_model._base.LinearRegression'>
(Freq) MSE =  2286933.786290844 
 (Freq) MAE =  130.32734433140266
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
(Freq) MSE =  3128065.504761847 
 (Freq) MAE =  129.9926680027144
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
(Freq) MSE =  2466894.949344048 
 (Freq) MAE =  114.09054200518905
<class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>
(Freq) MSE =  123685182.04843305 
 (Freq) MAE =  10139.22634530058
<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
(Freq) MSE =  2547966.6933085704 
 (Freq) MAE =  104.73366069286507
<class 'sklearn.svm._classes.SVR'>
(Freq) MSE =  2372324.2424827702 
 (Freq) MAE =  67.84303921225712
<class 'sklearn.linear_model._base.LinearRegression'>
(KNN) MSE =  2286924.2793642487 
 (KNN) MAE =  130.48595908739733
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
(KNN) MSE =  3107429.382585456 
 (KNN) MAE =  129.98218240264623
<class 'sklearn.ensemble._gb.Gr

# Save Models

In [None]:
from joblib import dump, load

In [None]:
# Persist every fitted model and its test-set predictions. The previous loop
# iterated over `model_scores`, a name that is never defined (it raised
# NameError); the results actually live in one dict per imputation strategy.
# The strategy label is added to each file name so the three runs of the same
# model do not overwrite each other.
# NOTE(review): these paths use 'DS 440' (with a space) while the data cell
# reads from 'DS440' - confirm which Drive folder is intended.
for label, scores in (("Freq", model_scores_Freq), ("KNN", model_scores_KNN), ("MICE", model_scores_MICE)):
  for regressor, predictions, _mse, _mae in scores.values():
    # save model object
    dump(regressor,'/content/drive/MyDrive/DS 440/Models/ModelIs%s_%s.joblib' %(str(regressor), label))
    # save predictions
    dump(predictions,'/content/drive/MyDrive/DS 440/Predictions/PredFor%s_%s.joblib' %(str(regressor), label))
  

NameError: ignored

In [None]:
# visualize features - might need more notebooks for this




In [None]:
# develop model using hyperopt



In [None]:
# use permutation importance on each feature



In [None]:
 # put results in a dataframe

 

In [None]:
############### SAVE FOR LATER ###########################
# polynomial features


# determine which ones are useful 
## use mutual information criterion to do this

