In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import gc
# import preprocessing classes
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# feature selection
from sklearn.feature_selection import chi2, mutual_info_regression
from sklearn.feature_selection import SelectKBest, chi2

# import models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# import train test split
from sklearn.model_selection import train_test_split

# feature importance
from sklearn.inspection import permutation_importance

# import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# functions to save data
from joblib import dump, load

# read data
## mount google drive
from google.colab import drive
drive.mount("/content/drive/")
%cd '/content/drive/MyDrive/DS440/dataset/Data/'
!pwd

## read in files
dir = '/content/drive/MyDrive/DS440/dataset/Data/'
cnx = sqlite3.connect(dir + 'wildfire.sqlite')
cursor = cnx.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='Fires';")
tables = cursor.fetchall()
Fires = pd.read_sql_query("SELECT * FROM 'Fires'", cnx).set_index('FOD_ID')
gc.collect()
# Drop unnecessary column
Fires.drop(columns=['FPA_ID','SOURCE_SYSTEM_TYPE','SOURCE_SYSTEM','NWCG_REPORTING_AGENCY','NWCG_REPORTING_UNIT_ID',
                    'NWCG_REPORTING_UNIT_NAME','SOURCE_REPORTING_UNIT','SOURCE_REPORTING_UNIT_NAME','LOCAL_FIRE_REPORT_ID',
                    'LOCAL_INCIDENT_ID','FIRE_CODE','FIRE_NAME','ICS_209_PLUS_INCIDENT_JOIN_ID','ICS_209_PLUS_COMPLEX_JOIN_ID',
                    'MTBS_ID','MTBS_FIRE_NAME','COMPLEX_NAME','OWNER_DESCR','NWCG_CAUSE_CLASSIFICATION',
                    'NWCG_CAUSE_AGE_CATEGORY','CONT_TIME','CONT_DATE'],inplace=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/DS440/dataset/Data
/content/drive/MyDrive/DS440/dataset/Data


In [2]:
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,2005,2/2/2005 0:00,33,1300.0,Power generation/transmission/distribution,33.0,0.1,A,40.036944,-121.005833,CA,63,6063,Plumas County
2,2004,5/12/2004 0:00,133,845.0,Natural,133.0,0.25,A,38.933056,-120.404444,CA,61,6061,Placer County
3,2004,5/31/2004 0:00,152,1921.0,Debris and open burning,152.0,0.1,A,38.984167,-120.735556,CA,17,6017,El Dorado County
4,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,A,38.559167,-119.913333,CA,3,6003,Alpine County
5,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,A,38.559167,-119.933056,CA,3,6003,Alpine County


# Data Preprocessing
## convert all features into usable format
## Use random sample from originial dataset due to high computational cost
## Take 1% from each FIRE_SIZE_CLASS to ensure that fire size distribution is uniform after random sampling
## Purpose : 
1. To handle big data
2. To debug and to see that the code actually works

In [3]:
Fires_A = Fires[Fires['FIRE_SIZE_CLASS'] == 'A'].sample(frac=0.03,replace=False)
Fires_B = Fires[Fires['FIRE_SIZE_CLASS'] == 'B'].sample(frac=0.03,replace=False)
Fires_C = Fires[Fires['FIRE_SIZE_CLASS'] == 'C'].sample(frac=0.03,replace=False)
Fires_D = Fires[Fires['FIRE_SIZE_CLASS'] == 'D'].sample(frac=0.03,replace=False)
Fires_E = Fires[Fires['FIRE_SIZE_CLASS'] == 'E'].sample(frac=0.03,replace=False)
Fires_F = Fires[Fires['FIRE_SIZE_CLASS'] == 'F'].sample(frac=0.03,replace=False)
Fires_G = Fires[Fires['FIRE_SIZE_CLASS'] == 'G'].sample(frac=0.03,replace=False)
Fires_AB = pd.concat([Fires_A, Fires_B], ignore_index=True)
Fires_CD = pd.concat([Fires_C, Fires_D], ignore_index=True)
Fires_EF = pd.concat([Fires_E, Fires_F], ignore_index=True)
Fires_EFG = pd.concat([Fires_EF, Fires_G], ignore_index=True)
Fires_ABCD = pd.concat([Fires_AB, Fires_CD], ignore_index=True)
Fires = pd.concat([Fires_ABCD, Fires_EFG], ignore_index=True)
Fires

Unnamed: 0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
0,2009,7/21/2009 0:00,202,1720.0,Firearms and explosives use,202.0,0.10,A,43.261100,-116.084700,ID,Ada,16001,Ada County
1,1992,1/26/1992 0:00,26,1035.0,Arson/incendiarism,26.0,0.08,A,34.207000,-84.137000,GA,Forsyth,13117,Forsyth County
2,2015,4/21/2015 0:00,111,407.0,Debris and open burning,114.0,0.10,A,31.991400,-111.737500,AZ,,,
3,2002,8/31/2002 0:00,243,1447.0,Arson/incendiarism,243.0,0.10,A,41.066500,-123.684500,CA,,,
4,2008,9/5/2008 0:00,249,1053.0,Missing data/not specified/undetermined,249.0,0.10,A,41.323620,-74.117227,NY,ORANGE,36071,Orange County
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64998,2006,3/8/2006 0:00,67,1233.0,Missing data/not specified/undetermined,69.0,10700.00,G,37.842500,-96.930556,KS,BUTLER,20015,Butler County
64999,2000,5/10/2000 0:00,131,1200.0,Debris and open burning,167.0,15427.00,G,36.319167,-111.984722,AZ,,,
65000,2005,7/22/2005 0:00,203,1230.0,Natural,,8675.00,G,36.833000,-118.671010,CA,,,
65001,1999,11/1/1999 0:00,305,,Missing data/not specified/undetermined,,8927.00,G,35.632799,-102.863489,TX,,,


# Gabriel's Coding

In [4]:
# extract month from discovery date
def get_first_element(date_list):
  return date_list[0]

Fires['DISCOVERY_DATE'] = Fires['DISCOVERY_DATE'].astype("string")
Fires['DISCOVERY_LIST'] = Fires['DISCOVERY_DATE'].str.split(pat='/')
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_LIST'].map(get_first_element)
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_MONTH'].astype("int64")

# encode causes into integers
Fires['NWCG_GENERAL_CAUSE'] = Fires['NWCG_GENERAL_CAUSE'].astype("string")
cause_enc = LabelEncoder()
Fires['NWCG_CAUSE_CLASSIFICATION_ORD'] = cause_enc.fit_transform(Fires['NWCG_GENERAL_CAUSE'])


# figure out number of days to contain the fire
Fires['DAYS_TO_CONT'] = Fires['CONT_DOY'] - Fires['DISCOVERY_DOY']
##### if it was contained the next year, need to do something about that
def convert_negative_days(day):
  if day < 0:
    return 365 + day
  else:
    return day

Fires['DAYS_TO_CONT'] = Fires['DAYS_TO_CONT'].map(convert_negative_days)


# encode state categories into integers
Fires['STATE'] = Fires['STATE'].astype("string")
state_enc = LabelEncoder()
Fires['STATE_ORD'] = state_enc.fit_transform(Fires['STATE'])

# what to do about counties? Use fips code (must be an int)
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("string")
Fires['FIPS_CODE'].fillna('1000000',inplace=True)
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("int64")

# fill NA with mean of day to containment (mean = 0.92)
Fires['DAYS_TO_CONT'].fillna(value=Fires['DAYS_TO_CONT'].mean(),inplace=True)


# drop date and list
Fires.drop(columns=['DISCOVERY_DATE','DISCOVERY_LIST','NWCG_GENERAL_CAUSE',
                    'CONT_DOY','STATE',
                    'COUNTY'],inplace=True)
gc.collect()

30

In [5]:
Fires.head()

Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,FIPS_CODE,FIPS_NAME,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
0,2009,202,1720.0,0.1,A,43.2611,-116.0847,16001,Ada County,7,3,0.0,13
1,1992,26,1035.0,0.08,A,34.207,-84.137,13117,Forsyth County,1,0,0.0,10
2,2015,111,407.0,0.1,A,31.9914,-111.7375,1000000,,4,1,3.0,3
3,2002,243,1447.0,0.1,A,41.0665,-123.6845,1000000,,8,0,0.0,4
4,2008,249,1053.0,0.1,A,41.32362,-74.117227,36071,Orange County,9,5,0.0,34


In [6]:
Fires.isna().sum()

FIRE_YEAR                            0
DISCOVERY_DOY                        0
DISCOVERY_TIME                   22599
FIRE_SIZE                            0
FIRE_SIZE_CLASS                      0
LATITUDE                             0
LONGITUDE                            0
FIPS_CODE                            0
FIPS_NAME                        19857
DISCOVERY_MONTH                      0
NWCG_CAUSE_CLASSIFICATION_ORD        0
DAYS_TO_CONT                         0
STATE_ORD                            0
dtype: int64

#** Code by Heejun Son**
### Need to handle missing values in DISCOVERY_TIME column


In [7]:
Fires['FIPS_NAME'].fillna('1000000',inplace=True)

In [8]:
Fires = Fires[Fires['FIPS_NAME'] != '1000000']
Fires.drop(columns=['FIPS_CODE','DAYS_TO_CONT'],inplace=True)
# encode state categories into integers
FIPS_enc = LabelEncoder()
Fires['FIPS_NAME'] = FIPS_enc.fit_transform(Fires['FIPS_NAME'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [9]:
#Drop all the NaN values
Fires_Drop = Fires.sample(frac=1,random_state=0).dropna()
#Replace NaN with mean
Fires_Mean = Fires.sample(frac=1,random_state=0)
Fires_Mean['DISCOVERY_TIME'] = Fires_Mean[['DISCOVERY_TIME']].fillna(Fires_Mean[['DISCOVERY_TIME']].mean().iloc[0])
#Replace NaN with median
Fires_Med = Fires.sample(frac=1,random_state=0)
Fires_Med['DISCOVERY_TIME'] = Fires_Med[['DISCOVERY_TIME']].fillna(Fires_Med[['DISCOVERY_TIME']].median().iloc[0])
#Replace NaN with mode
Fires_Freq = Fires.sample(frac=1,random_state=0)
Fires_Freq['DISCOVERY_TIME'] = Fires_Freq[['DISCOVERY_TIME']].fillna(Fires_Freq[['DISCOVERY_TIME']].mode().iloc[0])

In [10]:
#Replace NaN with values predicted by KNN
from sklearn.impute import KNNImputer
Fires_KNN = Fires.sample(frac=1,random_state=0)
Input = Fires_KNN[['FIRE_YEAR','DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
imputer = KNNImputer(n_neighbors=10)

In [11]:
result = imputer.fit_transform(Input)
check = pd.DataFrame(result)
New_Discovery_Time = check[[1]].values.tolist()

In [12]:
Fires_KNN['Assumption'] = New_Discovery_Time
# Convert object to float64
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].astype("string")
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].str.strip('[]')
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].astype("float64")
# Drop DISCOVERY_TIME column that contains NaN
Fires_KNN.drop(columns=['DISCOVERY_TIME'],inplace=True)
# Change column name from Assumption to DISCOVERY_TIME
Fires_KNN['DISCOVERY_TIME'] = Fires_KNN['Assumption']
Fires_KNN.drop(columns=['Assumption'],inplace=True)
Fires_KNN

Unnamed: 0,FIRE_YEAR,DISCOVERY_DOY,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,FIPS_NAME,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,STATE_ORD,DISCOVERY_TIME
7862,2017,184,0.10,A,40.030700,-105.043100,149,7,5,5,184.0
50928,2001,190,0.30,B,43.322100,-119.760800,618,7,7,37,190.0
61772,2011,109,21.80,C,33.086575,-83.155433,73,4,5,10,109.0
36350,2006,63,1.00,B,29.532350,-98.349990,124,3,6,44,63.0
40194,2000,111,0.53,B,32.904000,-83.131400,1504,4,1,10,111.0
...,...,...,...,...,...,...,...,...,...,...,...
43848,2005,244,1.00,B,33.565201,-91.955113,430,9,0,2,244.0
30736,2000,192,1.00,B,33.744600,-85.350200,609,7,1,10,192.0
61235,2013,234,96.00,C,37.331700,-114.776900,819,8,7,33,234.0
62621,1998,72,17.00,C,31.158001,-89.765922,868,3,0,25,72.0


In [13]:
#### split data into training and testing
X_drop_train = Fires_Drop[Fires_Drop['FIRE_YEAR'] < 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
X_drop_test = Fires_Drop[Fires_Drop['FIRE_YEAR'] < 2018]['FIRE_SIZE']
y_drop_train = Fires_Drop[Fires_Drop['FIRE_YEAR'] == 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
y_drop_test = Fires_Drop[Fires_Drop['FIRE_YEAR'] == 2018]['FIRE_SIZE']
######################################################################################################################################################################
X_mean_train = Fires_Mean[Fires_Mean['FIRE_YEAR'] < 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
X_mean_test = Fires_Mean[Fires_Mean['FIRE_YEAR'] < 2018]['FIRE_SIZE']
y_mean_train = Fires_Mean[Fires_Mean['FIRE_YEAR'] == 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
y_mean_test = Fires_Mean[Fires_Mean['FIRE_YEAR'] == 2018]['FIRE_SIZE']
######################################################################################################################################################################
X_med_train = Fires_Med[Fires_Med['FIRE_YEAR'] < 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
X_med_test = Fires_Med[Fires_Med['FIRE_YEAR'] < 2018]['FIRE_SIZE']
y_med_train = Fires_Med[Fires_Med['FIRE_YEAR'] == 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
y_med_test = Fires_Med[Fires_Med['FIRE_YEAR'] == 2018]['FIRE_SIZE']
######################################################################################################################################################################
X_mod_train = Fires_Freq[Fires_Freq['FIRE_YEAR'] < 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
X_mod_test = Fires_Freq[Fires_Freq['FIRE_YEAR'] < 2018]['FIRE_SIZE']
y_mod_train = Fires_Freq[Fires_Freq['FIRE_YEAR'] == 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
y_mod_test = Fires_Freq[Fires_Freq['FIRE_YEAR'] == 2018]['FIRE_SIZE']
######################################################################################################################################################################
X_KNN_train = Fires_KNN[Fires_KNN['FIRE_YEAR'] < 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
X_KNN_test = Fires_KNN[Fires_KNN['FIRE_YEAR'] < 2018]['FIRE_SIZE']
y_KNN_train = Fires_KNN[Fires_KNN['FIRE_YEAR'] == 2018][['DISCOVERY_DOY','DISCOVERY_TIME','LATITUDE','LONGITUDE','FIPS_NAME','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','STATE_ORD']]
y_KNN_test = Fires_KNN[Fires_KNN['FIRE_YEAR'] == 2018]['FIRE_SIZE']

- Drop

In [14]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor]
model_scores_drop = dict()    
model_drop_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE'])
for model in model_list:
  regressor_drop = model().fit(X_drop_train,X_drop_test)
  y_drop_pred = regressor_drop.predict(y_drop_train)
  model_drop_mse = mean_squared_error(y_drop_pred,y_drop_test)
  model_drop_mae = mean_absolute_error(y_drop_pred,y_drop_test)
  model_scores_drop[model] = (regressor_drop,y_drop_test,y_drop_pred)
  model_drop_df = model_drop_df.append({'Model':model,'MSE':model_drop_mse,'MAE':model_drop_mae}, ignore_index=True)
  model_drop_df = model_drop_df.sort_values(by='MSE', ascending= True)
model_drop_df

Unnamed: 0,Model,MSE,MAE
0,<class 'sklearn.linear_model._base.LinearRegre...,3059594.0,209.852553
1,<class 'sklearn.ensemble._forest.RandomForestR...,3916447.0,239.294301
2,<class 'sklearn.ensemble._weight_boosting.AdaB...,4258937.0,538.385542
3,<class 'sklearn.ensemble._gb.GradientBoostingR...,4749539.0,213.92015
4,<class 'sklearn.neighbors._regression.KNeighbo...,4864644.0,191.599951


- Mean

In [16]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor]
model_scores_mean = dict()    
model_mean_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE'])
for model in model_list:
  regressor_mean = model().fit(X_mean_train,X_mean_test)
  y_mean_pred = regressor_mean.predict(y_mean_train)
  model_mean_mse = mean_squared_error(y_mean_pred,y_mean_test)
  model_mean_mae = mean_absolute_error(y_mean_pred,y_mean_test)
  model_scores_mean[model] = (regressor_mean,y_mean_test,y_mean_pred)
  model_mean_df = model_mean_df.append({'Model':model,'MSE':model_mean_mse,'MAE':model_mean_mae}, ignore_index=True)
  model_mean_df = model_mean_df.sort_values(by='MSE', ascending= True)
model_mean_df

Unnamed: 0,Model,MSE,MAE
0,<class 'sklearn.linear_model._base.LinearRegre...,2654789.0,175.166572
1,<class 'sklearn.ensemble._weight_boosting.AdaB...,2779662.0,267.994506
2,<class 'sklearn.ensemble._forest.RandomForestR...,3514821.0,224.586555
3,<class 'sklearn.ensemble._gb.GradientBoostingR...,4128713.0,185.880034
4,<class 'sklearn.neighbors._regression.KNeighbo...,4218457.0,168.720639


- Median

In [17]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor]
model_scores_med = dict()    
model_med_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE'])
for model in model_list:
  regressor_med = model().fit(X_med_train,X_med_test)
  y_med_pred = regressor_med.predict(y_med_train)
  model_med_mse = mean_squared_error(y_med_pred,y_med_test)
  model_med_mae = mean_absolute_error(y_med_pred,y_med_test)
  model_scores_med[model] = (regressor_med,y_med_test,y_med_pred)
  model_med_df = model_med_df.append({'Model':model,'MSE':model_med_mse,'MAE':model_med_mae}, ignore_index=True)
  model_med_df = model_med_df.sort_values(by='MSE', ascending= True)
model_med_df

Unnamed: 0,Model,MSE,MAE
0,<class 'sklearn.linear_model._base.LinearRegre...,2654789.0,175.20896
1,<class 'sklearn.ensemble._forest.RandomForestR...,3568948.0,224.752324
2,<class 'sklearn.ensemble._gb.GradientBoostingR...,4128713.0,185.880034
4,<class 'sklearn.neighbors._regression.KNeighbo...,4218463.0,168.809036
3,<class 'sklearn.ensemble._weight_boosting.AdaB...,31195040.0,2481.547955


- Mode

In [18]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor]
model_scores_mod = dict()    
model_mod_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE'])
for model in model_list:
  regressor_mod = model().fit(X_mod_train,X_mod_test)
  y_mod_pred = regressor_mod.predict(y_mod_train)
  model_mod_mse = mean_squared_error(y_mod_pred,y_mod_test)
  model_mod_mae = mean_absolute_error(y_mod_pred,y_mod_test)
  model_scores_mod[model] = (regressor_mod,y_mod_test,y_mod_pred)
  model_mod_df = model_mod_df.append({'Model':model,'MSE':model_mod_mse,'MAE':model_mod_mae}, ignore_index=True)
  model_mod_df = model_mod_df.sort_values(by='MSE', ascending= True)
model_mod_df

Unnamed: 0,Model,MSE,MAE
0,<class 'sklearn.linear_model._base.LinearRegre...,2654775.0,174.800975
1,<class 'sklearn.ensemble._weight_boosting.AdaB...,3504519.0,260.946653
2,<class 'sklearn.ensemble._forest.RandomForestR...,3505489.0,221.250373
3,<class 'sklearn.ensemble._gb.GradientBoostingR...,4132540.0,185.891689
4,<class 'sklearn.neighbors._regression.KNeighbo...,4219190.0,169.856378


- KNN

In [19]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor]
model_scores_KNN = dict()    
model_KNN_df = pd.DataFrame(columns=['Model', 'MSE', 'MAE'])
for model in model_list:
  regressor_KNN = model().fit(X_KNN_train,X_KNN_test)
  y_KNN_pred = regressor_KNN.predict(y_KNN_train)
  model_KNN_mse = mean_squared_error(y_KNN_pred,y_KNN_test)
  model_KNN_mae = mean_absolute_error(y_KNN_pred,y_KNN_test)
  model_scores_KNN[model] = (regressor_KNN,y_KNN_test,y_KNN_pred)
  model_KNN_df = model_KNN_df.append({'Model':model,'MSE':model_KNN_mse,'MAE':model_KNN_mae}, ignore_index=True)
  model_KNN_df = model_KNN_df.sort_values(by='MSE', ascending= True)
model_KNN_df

Unnamed: 0,Model,MSE,MAE
0,<class 'sklearn.linear_model._base.LinearRegre...,2653863.0,173.473494
1,<class 'sklearn.ensemble._forest.RandomForestR...,3461537.0,219.790686
2,<class 'sklearn.ensemble._gb.GradientBoostingR...,4186753.0,188.963618
4,<class 'sklearn.neighbors._regression.KNeighbo...,4334328.0,179.987306
3,<class 'sklearn.ensemble._weight_boosting.AdaB...,5005267.0,501.205976
