# Importing libraries
### Done by Gabriel, Colby, Heejun

In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import gc

# import preprocessing classes
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# feature selection
from sklearn.feature_selection import chi2, mutual_info_regression
from sklearn.feature_selection import SelectKBest, chi2

# import models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# import train test split
from sklearn.model_selection import train_test_split

# feature importance
from sklearn.inspection import permutation_importance

# import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# functions to save data
from joblib import dump, load

# read data
## mount google drive
# Colab-only step: mounts Google Drive under /content/drive/, then switches the
# working directory to the dataset folder and prints it as a sanity check.
# `%cd` and `!pwd` are IPython magics/shell escapes and only run inside a notebook kernel.
from google.colab import drive
drive.mount("/content/drive/")
%cd '/content/drive/MyDrive/DS440/dataset/Data/'
!pwd

## read in files
# NOTE: `dir` shadows the Python builtin; the name is kept so other cells that
# may reference it keep working.
dir = '/content/drive/MyDrive/DS440/dataset/Data/'
cnx = sqlite3.connect(dir + 'wildfire.sqlite')
cursor = cnx.cursor()
# List the tables stored in the database. In sqlite_master the `type` column holds
# the object kind ('table', 'index', 'view', 'trigger'), not the table name, so the
# filter must be type='table' (filtering on 'Fires' always returned no rows).
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
# Load the whole Fires table and index it by its FOD_ID column.
Fires = pd.read_sql_query("SELECT * FROM 'Fires'", cnx).set_index('FOD_ID')
gc.collect()
# Drop identifier/reporting columns and the containment date/time and size-class
# columns that are not used as model inputs.
Fires.drop(columns=['FPA_ID','SOURCE_SYSTEM_TYPE','SOURCE_SYSTEM','NWCG_REPORTING_AGENCY','NWCG_REPORTING_UNIT_ID',
                    'NWCG_REPORTING_UNIT_NAME','SOURCE_REPORTING_UNIT','SOURCE_REPORTING_UNIT_NAME','LOCAL_FIRE_REPORT_ID',
                    'LOCAL_INCIDENT_ID','FIRE_CODE','FIRE_NAME','ICS_209_PLUS_INCIDENT_JOIN_ID','ICS_209_PLUS_COMPLEX_JOIN_ID',
                    'MTBS_ID','MTBS_FIRE_NAME','COMPLEX_NAME','OWNER_DESCR','NWCG_CAUSE_CLASSIFICATION',
                    'NWCG_CAUSE_AGE_CATEGORY','FIRE_SIZE_CLASS','CONT_TIME','CONT_DATE'],inplace=True)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive/DS440/dataset/Data
/content/drive/MyDrive/DS440/dataset/Data


In [None]:
# Preview the first rows of the loaded table after the unused columns were dropped.
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2005,2/2/2005 0:00,33,1300.0,Power generation/transmission/distribution,33.0,0.1,40.036944,-121.005833,CA,63,6063,Plumas County
2,2004,5/12/2004 0:00,133,845.0,Natural,133.0,0.25,38.933056,-120.404444,CA,61,6061,Placer County
3,2004,5/31/2004 0:00,152,1921.0,Debris and open burning,152.0,0.1,38.984167,-120.735556,CA,17,6017,El Dorado County
4,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.913333,CA,3,6003,Alpine County
5,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.933056,CA,3,6003,Alpine County


# Data Preprocessing
## convert all features into usable format
## Use a random sample from the original dataset due to high computational cost
## Take 1% of original data
## Purpose : 
1. To handle big data
2. To debug and to see that the code actually works

In [None]:
# Later just remove this line of code when using original dataset - Heejun Code
# Keep a 1% random sample (without replacement). The seed is fixed, matching the
# random_state=0 used elsewhere in this notebook, so the sampled rows and every
# downstream number are reproducible across kernel restarts.
Fires = Fires.sample(frac=0.01, replace=False, random_state=0)

In [None]:
# Display the sampled frame (the rendered output elides the middle rows).
Fires

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1647705,2001,2/3/2001 0:00,34,1106.0,Debris and open burning,34.0,7.95,31.978500,-83.753600,GA,Crisp,13081,Crisp County
1360733,1997,8/12/1997 0:00,224,,Equipment and vehicle use,,0.20,38.861111,-121.311111,CA,,,
19906582,1995,5/28/1995 0:00,148,1930.0,Debris and open burning,,0.30,31.720080,-88.304490,AL,CW,01023,Choctaw County
1467631,2009,3/18/2009 0:00,77,1625.0,Debris and open burning,77.0,1.00,39.375677,-89.032346,IL,Christian,17021,Christian County
1123871,2000,6/23/2000 0:00,175,,Natural,,4.00,27.680000,-80.850000,FL,Indian River,12061,Indian River County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42885,1992,8/26/1992 0:00,239,1615.0,Recreation and ceremony,239.0,1.00,45.051667,-110.641667,MT,,,
713623,2008,3/17/2008 0:00,77,,Debris and open burning,,0.10,30.936050,-97.786220,TX,Bell,48027,Bell County
12702,2006,8/17/2006 0:00,229,1034.0,Natural,229.0,0.10,45.570833,-105.921389,MT,75,30075,Powder River County
960704,1998,11/1/1998 0:00,305,,Arson/incendiarism,,4.00,34.341400,-85.608100,AL,,,


# Colby's Coding

In [None]:
# extract month from discovery date
def get_first_element(date_list):
  """Return the item stored at position 0 of ``date_list``.

  Applied to the '/'-split discovery date, where position 0 is the month text.
  """
  leading_item = date_list[0]
  return leading_item

# Month of discovery: cast the date to text, split it on '/', and keep the
# first piece as an integer.
discovery_text = Fires['DISCOVERY_DATE'].astype("string")
Fires['DISCOVERY_DATE'] = discovery_text
Fires['DISCOVERY_LIST'] = discovery_text.str.split(pat='/')
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_LIST'].map(get_first_element).astype("int64")

# encode causes into integers
# (the encoded column is derived from NWCG_GENERAL_CAUSE despite its name)
cause_enc = LabelEncoder()
Fires['NWCG_GENERAL_CAUSE'] = Fires['NWCG_GENERAL_CAUSE'].astype("string")
Fires['NWCG_CAUSE_CLASSIFICATION_ORD'] = cause_enc.fit_transform(Fires['NWCG_GENERAL_CAUSE'])


# figure out number of days to contain the fire
# (difference of the two day-of-year columns; NaN where CONT_DOY is missing)
containment_doy = Fires['CONT_DOY']
discovery_doy = Fires['DISCOVERY_DOY']
Fires['DAYS_TO_CONT'] = containment_doy - discovery_doy
##### if it was contained the next year, need to do something about that
def convert_negative_days(day):
  """Shift a negative day difference forward by 365; return anything else unchanged.

  A negative value means the containment day-of-year is smaller than the
  discovery day-of-year. Values for which ``day < 0`` is false (including NaN)
  are returned as-is.
  """
  return 365 + day if day < 0 else day

# Wrap negative durations into the following year (see convert_negative_days).
Fires['DAYS_TO_CONT'] = Fires['DAYS_TO_CONT'].map(convert_negative_days)


# encode state categories into integers
Fires['STATE'] = Fires['STATE'].astype("string")
state_enc = LabelEncoder()
Fires['STATE_ORD'] = state_enc.fit_transform(Fires['STATE'])

# what to do about counties? Use fips code (must be an int)
# Missing codes get the sentinel 1000000. The filled Series is assigned back to the
# frame: calling fillna(..., inplace=True) on a column selection is a chained
# in-place update that pandas warns about and that leaves the frame untouched
# under Copy-on-Write, which would make the int64 cast below fail on NA.
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("string")
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].fillna('1000000')
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("int64")

# fill NA with the mean days to containment (the value depends on the rows in
# the current sample, so it is computed rather than hard-coded)
Fires['DAYS_TO_CONT'] = Fires['DAYS_TO_CONT'].fillna(Fires['DAYS_TO_CONT'].mean())


# drop date and list, plus the raw columns that now have encoded replacements
Fires.drop(columns=['DISCOVERY_DATE','DISCOVERY_LIST','NWCG_GENERAL_CAUSE',
                    'CONT_DOY','DISCOVERY_DOY','STATE','FIPS_NAME',
                    'COUNTY'],inplace=True)
gc.collect()

30

In [None]:
# Preview the frame after feature engineering (encoded cause/state, month, days to containment).
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1647705,2001,1106.0,7.95,31.9785,-83.7536,13081,2,1,0.0,9
1360733,1997,,0.2,38.861111,-121.311111,1000000,8,2,1.009867,4
19906582,1995,1930.0,0.3,31.72008,-88.30449,1023,5,1,1.009867,1
1467631,2009,1625.0,1.0,39.375677,-89.032346,17021,3,1,0.0,13
1123871,2000,,4.0,27.68,-80.85,12061,6,7,1.009867,8


# **Code by Heejun Son**
### Need to handle missing values in DISCOVERY_TIME column 
### Need to find the best hyperparameter (best n_neighbors)
### Need to compare MICE vs KNN vs Replacement with the most frequent number
Since my task is to find the best features using a feature selection technique, I did not drop the DISCOVERY_TIME column. 

(Colby and Gabriel did drop it.)


In [None]:
from sklearn.impute import KNNImputer

# Columns handed to the imputer; DISCOVERY_TIME (position 1) is the one with gaps.
# FIPS_CODE is not part of this selection.
knn_input_columns = [
    'FIRE_YEAR',
    'DISCOVERY_TIME',
    'FIRE_SIZE',
    'LATITUDE',
    'LONGITUDE',
    'DISCOVERY_MONTH',
    'NWCG_CAUSE_CLASSIFICATION_ORD',
    'DAYS_TO_CONT',
    'STATE_ORD',
]
Input = Fires[knn_input_columns]

# Fill each missing entry from its 5 nearest rows; `result` is the imputed array
# with columns in the same order as `knn_input_columns`.
imputer = KNNImputer(n_neighbors=5)
result = imputer.fit_transform(Input)

In [None]:
# Wrap the imputed array in a frame and pull out column 1 (DISCOVERY_TIME).
# Selecting with a list keeps it two-dimensional, so the result is a list of
# one-element lists, e.g. [[1106.0], [1296.8], ...].
check = pd.DataFrame(result)
imputed_time_frame = check[[1]]
New_Discovery_Time = imputed_time_frame.values.tolist()

In [None]:
# Attach the imputed discovery times as a new column, matched to rows by position.
# Each entry of New_Discovery_Time is a one-element list, so the column is object dtype.
Fires['Assumption'] = New_Discovery_Time

In [None]:
# Compare the original DISCOVERY_TIME (with NaN) against the imputed 'Assumption' column.
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1647705,2001,1106.0,7.95,31.9785,-83.7536,13081,2,1,0.0,9,[1106.0]
1360733,1997,,0.2,38.861111,-121.311111,1000000,8,2,1.009867,4,[1296.8]
19906582,1995,1930.0,0.3,31.72008,-88.30449,1023,5,1,1.009867,1,[1930.0]
1467631,2009,1625.0,1.0,39.375677,-89.032346,17021,3,1,0.0,13,[1625.0]
1123871,2000,,4.0,27.68,-80.85,12061,6,7,1.009867,8,[1409.2]


In [None]:
# Inspect column dtypes; 'Assumption' shows up as object because its cells hold lists.
Fires.dtypes

FIRE_YEAR                          int64
DISCOVERY_TIME                   float64
FIRE_SIZE                        float64
LATITUDE                         float64
LONGITUDE                        float64
FIPS_CODE                          int64
DISCOVERY_MONTH                    int64
NWCG_CAUSE_CLASSIFICATION_ORD      int64
DAYS_TO_CONT                     float64
STATE_ORD                          int64
Assumption                        object
dtype: object

In [None]:
# Convert object to float64
# Each cell holds the imputed value wrapped in a one-element list. Unwrap it
# directly instead of formatting the list as text, stripping '[' / ']' and
# re-parsing, which depends on how list cells happen to be rendered as strings.
# Cells that are already plain numbers are passed through unchanged.
Fires['Assumption'] = Fires['Assumption'].map(
    lambda cell: cell[0] if isinstance(cell, (list, tuple)) else cell
).astype("float64")
Fires

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1647705,2001,1106.0,7.95,31.978500,-83.753600,13081,2,1,0.000000,9,1106.0
1360733,1997,,0.20,38.861111,-121.311111,1000000,8,2,1.009867,4,1296.8
19906582,1995,1930.0,0.30,31.720080,-88.304490,1023,5,1,1.009867,1,1930.0
1467631,2009,1625.0,1.00,39.375677,-89.032346,17021,3,1,0.000000,13,1625.0
1123871,2000,,4.00,27.680000,-80.850000,12061,6,7,1.009867,8,1409.2
...,...,...,...,...,...,...,...,...,...,...,...
42885,1992,1615.0,1.00,45.051667,-110.641667,1000000,8,11,0.000000,25,1615.0
713623,2008,,0.10,30.936050,-97.786220,48027,3,1,1.009867,43,1426.0
12702,2006,1034.0,0.10,45.570833,-105.921389,30075,8,7,0.000000,25,1034.0
960704,1998,,4.00,34.341400,-85.608100,1000000,11,0,1.009867,1,1558.0


In [None]:
# Remove the original DISCOVERY_TIME column, which still contains NaN
del Fires['DISCOVERY_TIME']
# Re-create DISCOVERY_TIME from the imputed values, then discard the temporary
# 'Assumption' column (DISCOVERY_TIME ends up as the last column)
Fires['DISCOVERY_TIME'] = Fires['Assumption']
del Fires['Assumption']
Fires

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1647705,2001,7.95,31.978500,-83.753600,13081,2,1,0.000000,9,1106.0
1360733,1997,0.20,38.861111,-121.311111,1000000,8,2,1.009867,4,1296.8
19906582,1995,0.30,31.720080,-88.304490,1023,5,1,1.009867,1,1930.0
1467631,2009,1.00,39.375677,-89.032346,17021,3,1,0.000000,13,1625.0
1123871,2000,4.00,27.680000,-80.850000,12061,6,7,1.009867,8,1409.2
...,...,...,...,...,...,...,...,...,...,...
42885,1992,1.00,45.051667,-110.641667,1000000,8,11,0.000000,25,1615.0
713623,2008,0.10,30.936050,-97.786220,48027,3,1,1.009867,43,1426.0
12702,2006,0.10,45.570833,-105.921389,30075,8,7,0.000000,25,1034.0
960704,1998,4.00,34.341400,-85.608100,1000000,11,0,1.009867,1,1558.0


# Create a baseline mse and mae

In [None]:
# Shuffle every FIRE_SIZE value with a fixed seed; the shuffled values serve as a
# naive "prediction" for the baseline error computed in the next cell.
# NOTE(review): the shuffled Series keeps its original index labels, so the baseline
# relies on the metric functions comparing values by position — confirm.
FIRE_SIZE_PERMUTE = Fires['FIRE_SIZE'].sample(frac=1,random_state=0)

In [None]:
# Baseline errors: shuffled FIRE_SIZE values scored against the real ones.
for label, metric in (("MAE: ", mean_absolute_error), ("MSE: ", mean_squared_error)):
  print(label, metric(FIRE_SIZE_PERMUTE, Fires['FIRE_SIZE']))

MAE:  188.70000729185892
MSE:  14536076.451223897


# **Code by Heejun**
## Feature selection information
Later need to convert LONGITUDE to negative values!!

Reason for using abs() : There are no positive values !!
                        
(The United States has only negative longitude values)

In [None]:
# The feature-selection step below requires non-negative inputs,
# so LONGITUDE is overwritten with its magnitude (sign removed).
Fires['LONGITUDE'] = abs(Fires['LONGITUDE'])
Fires

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1647705,2001,7.95,31.978500,83.753600,13081,2,1,0.000000,9,1106.0
1360733,1997,0.20,38.861111,121.311111,1000000,8,2,1.009867,4,1296.8
19906582,1995,0.30,31.720080,88.304490,1023,5,1,1.009867,1,1930.0
1467631,2009,1.00,39.375677,89.032346,17021,3,1,0.000000,13,1625.0
1123871,2000,4.00,27.680000,80.850000,12061,6,7,1.009867,8,1409.2
...,...,...,...,...,...,...,...,...,...,...
42885,1992,1.00,45.051667,110.641667,1000000,8,11,0.000000,25,1615.0
713623,2008,0.10,30.936050,97.786220,48027,3,1,1.009867,43,1426.0
12702,2006,0.10,45.570833,105.921389,30075,8,7,0.000000,25,1034.0
960704,1998,4.00,34.341400,85.608100,1000000,11,0,1.009867,1,1558.0


In [None]:
# Check that every remaining column is numeric before running feature selection.
Fires.dtypes

FIRE_YEAR                          int64
FIRE_SIZE                        float64
LATITUDE                         float64
LONGITUDE                        float64
FIPS_CODE                          int64
DISCOVERY_MONTH                    int64
NWCG_CAUSE_CLASSIFICATION_ORD      int64
DAYS_TO_CONT                     float64
STATE_ORD                          int64
DISCOVERY_TIME                   float64
dtype: object

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing

# Candidate predictors and the target.
candidate_columns = [
    'FIRE_YEAR',
    'LATITUDE',
    'LONGITUDE',
    'FIPS_CODE',
    'DISCOVERY_MONTH',
    'NWCG_CAUSE_CLASSIFICATION_ORD',
    'DAYS_TO_CONT',
    'STATE_ORD',
    'DISCOVERY_TIME',
]
X = Fires[candidate_columns]
y = Fires['FIRE_SIZE']

# Transform FIRE_SIZE
# The continuous target is label-encoded (one integer class per distinct value)
# so it can be passed to SelectKBest with chi2.
# NOTE(review): chi2 is documented for class labels and non-negative features; with a
# continuous target and unscaled columns the ranking may mostly reflect feature
# magnitude — consider mutual_info_regression; confirm with the author.
Encoder = preprocessing.LabelEncoder()
Firs_size_encoded = Encoder.fit_transform(y)

# For each k, record the names of the k top-scoring columns.
# NOTE(review): k runs from 1 to 8 although X has nine columns — confirm this is intended.
Best_Features = {}
for i in range(1, 9):
  selector = SelectKBest(chi2, k=i).fit(X, Firs_size_encoded)
  chosen_positions = selector.get_support(indices=True)
  Features = list(X.columns[chosen_positions])
  Best_Features[i] = Features
  print("When k= ", i, ",", Features)

When k=  1 , ['FIPS_CODE']
When k=  2 , ['FIPS_CODE', 'DAYS_TO_CONT']
When k=  3 , ['FIPS_CODE', 'DAYS_TO_CONT', 'DISCOVERY_TIME']
When k=  4 , ['FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
When k=  5 , ['LONGITUDE', 'FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
When k=  6 , ['LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
When k=  7 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
When k=  8 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'DISCOVERY_MONTH', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']


Features to predict class:
- fire year, general cause, lat, long

Features to predict cause:
- fire year, size class, lat, long

# MODEL

In [None]:
# Display the feature matrix.
# NOTE(review): the saved output below has no DISCOVERY_TIME column and shows negative
# longitudes, so it appears to come from an earlier run — re-execute to refresh it.
X

Unnamed: 0_level_0,FIRE_YEAR,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
15041739,1993,41.316486,-73.093164,9001,7,0,0.923788,6
1077617,2000,37.171700,-82.288300,1000000,3,1,0.923788,46
834776,1994,33.295833,-79.308333,45043,1,5,0.000000,41
400012744,2017,36.866111,-93.106944,29043,2,5,2.000000,24
983664,1999,33.441400,-93.617200,1000000,8,1,0.923788,2
...,...,...,...,...,...,...,...,...
1076674,1999,37.118300,-82.291700,1000000,3,5,0.923788,46
1079299,2009,44.569440,-124.046390,41041,7,1,0.000000,37
400030402,2016,43.925300,-110.536700,1000000,8,7,8.000000,51
400010162,2017,30.550000,-95.475000,48471,7,1,0.000000,44


In [None]:
#### split data into training and testing sets
# (80% train / 20% test with a fixed seed; no separate validation set is made here)
model_features = [
    'FIRE_YEAR',
    'LATITUDE',
    'LONGITUDE',
    'FIPS_CODE',
    'DISCOVERY_MONTH',
    'NWCG_CAUSE_CLASSIFICATION_ORD',
    'DAYS_TO_CONT',
    'STATE_ORD',
    'DISCOVERY_TIME',
]
X = Fires[model_features]
y = Fires['FIRE_SIZE']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
#### list of models to train
# Candidate ideas: xgboost, linear regression (with and without regularization),
# random forest, naive bayes, nearest neighbors, k-means, svm.
# Only the regressors below are actually fitted, each with default hyperparameters.
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor, SVR] #GaussianNB, 
# Maps model class -> (fitted estimator, test predictions, test MSE, test MAE).
model_scores = dict()
for model in model_list:
  # fit model to data
  print(model)
  estimator = model()
  # Seed the estimators that expose `random_state` (same seed as the split above)
  # so the reported scores do not change from run to run.
  if 'random_state' in estimator.get_params():
    estimator.set_params(random_state=0)
  regressor = estimator.fit(X_train,y_train)
  y_pred = regressor.predict(X_test)
  # Metrics take (y_true, y_pred); both are symmetric, so the values are unchanged,
  # but the conventional order keeps this safe if other metrics are added later.
  model_mse = mean_squared_error(y_test,y_pred)
  model_mae = mean_absolute_error(y_test,y_pred)
  model_scores[model] = (regressor,y_pred,model_mse,model_mae)
  print("MSE = ", model_mse, '\n', "MAE = ", model_mae)

<class 'sklearn.linear_model._base.LinearRegression'>
1105579.7000120282 137.71520786551915
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
1015133.9467494518 98.3887830971235
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
2881449.5629463214 117.79100281368285
<class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>
1779454.5014908398 573.4827039557592
<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
1408524.9035277097 87.38043761098294
<class 'sklearn.svm._classes.SVR'>
1313697.8139151211 55.64356449892774


# Save Models

In [None]:
# visualize features - might need more notebooks for this




In [None]:
# develop model using hyperopt



In [None]:
# use permutation importance on each feature



In [None]:
 # put results in a dataframe

 

In [None]:
############### SAVE FOR LATER ###########################
# polynomial features


# determine which ones are useful 
## use mutual information criterion to do this

