# Importing libraries
### Done by Gabriel, Colby, Heejun

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import gc

# import preprocessing classes
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# feature selection
from sklearn.feature_selection import chi2, mutual_info_regression
from sklearn.feature_selection import SelectKBest, chi2

# import models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# import train test split
from sklearn.model_selection import train_test_split

# feature importance
from sklearn.inspection import permutation_importance

# import metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error

# functions to save data
from joblib import dump, load

# read data
## mount google drive
from google.colab import drive
drive.mount("/content/drive/")
%cd '/content/drive/MyDrive/DS440/dataset/Data/'
!pwd

## read in files
dir = '/content/drive/MyDrive/DS440/dataset/Data/'
cnx = sqlite3.connect(dir + 'wildfire.sqlite')
cursor = cnx.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='Fires';")
tables = cursor.fetchall()
Fires = pd.read_sql_query("SELECT * FROM 'Fires'", cnx).set_index('FOD_ID')
gc.collect()
# Drop unnecessary column
Fires.drop(columns=['FPA_ID','SOURCE_SYSTEM_TYPE','SOURCE_SYSTEM','NWCG_REPORTING_AGENCY','NWCG_REPORTING_UNIT_ID',
                    'NWCG_REPORTING_UNIT_NAME','SOURCE_REPORTING_UNIT','SOURCE_REPORTING_UNIT_NAME','LOCAL_FIRE_REPORT_ID',
                    'LOCAL_INCIDENT_ID','FIRE_CODE','FIRE_NAME','ICS_209_PLUS_INCIDENT_JOIN_ID','ICS_209_PLUS_COMPLEX_JOIN_ID',
                    'MTBS_ID','MTBS_FIRE_NAME','COMPLEX_NAME','OWNER_DESCR','NWCG_CAUSE_CLASSIFICATION',
                    'NWCG_CAUSE_AGE_CATEGORY','FIRE_SIZE_CLASS','CONT_TIME','CONT_DATE'],inplace=True)

Mounted at /content/drive/
/content/drive/MyDrive/DS440/dataset/Data
/content/drive/MyDrive/DS440/dataset/Data


In [2]:
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2005,2/2/2005 0:00,33,1300.0,Power generation/transmission/distribution,33.0,0.1,40.036944,-121.005833,CA,63,6063,Plumas County
2,2004,5/12/2004 0:00,133,845.0,Natural,133.0,0.25,38.933056,-120.404444,CA,61,6061,Placer County
3,2004,5/31/2004 0:00,152,1921.0,Debris and open burning,152.0,0.1,38.984167,-120.735556,CA,17,6017,El Dorado County
4,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.913333,CA,3,6003,Alpine County
5,2004,6/28/2004 0:00,180,1600.0,Natural,185.0,0.1,38.559167,-119.933056,CA,3,6003,Alpine County


In [3]:
#ToCheck = Fires.drop(columns=['FIRE_YEAR','DISCOVERY_DATE','DISCOVERY_DOY','NWCG_GENERAL_CAUSE','CONT_DOY','LATITUDE','LONGITUDE','STATE','COUNTY','FIPS_CODE','FIPS_NAME'])
#NaN = ToCheck[ToCheck['DISCOVERY_TIME'].isna()]
NaN = Fires[Fires['DISCOVERY_TIME'].isna()]
NaN

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
174655,1993,6/17/1993 0:00,168,,Natural,168.0,0.30,66.583000,-145.252600,AK,,,
175122,1993,6/1/1993 0:00,152,,Natural,152.0,0.10,62.032600,-163.485700,AK,,,
180687,1994,5/26/1994 0:00,146,,Missing data/not specified/undetermined,146.0,1.00,44.073600,-99.440400,SD,,,
190717,1994,7/4/1994 0:00,185,,Fireworks,185.0,0.10,44.749800,-121.251200,OR,,,
196902,1994,8/5/1994 0:00,217,,Missing data/not specified/undetermined,217.0,0.10,48.483300,-108.767400,MT,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
400431685,2010,4/10/2010 0:00,100,,Missing data/not specified/undetermined,,60.40,43.036149,-97.441521,SD,Yankton,46135,Yankton County
400431687,2010,7/18/2010 0:00,199,,Natural,,1.30,44.099200,-103.317500,SD,PENNINGTON,46103,Pennington County
400431688,2010,7/20/2010 0:00,201,,Missing data/not specified/undetermined,,0.25,44.617264,-103.385940,SD,Butte,46019,Butte County
400431690,2010,5/24/2010 0:00,144,,Missing data/not specified/undetermined,,40.00,43.414400,-96.694279,SD,Lincoln,46083,Lincoln County


In [4]:
a = Fires.sort_values('FIRE_SIZE')
NaN_a = a[a['DISCOVERY_TIME'].isna()]
NaN_a

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
201816381,2013,12/18/2013 0:00,352,,Missing data/not specified/undetermined,,0.00001,44.997000,-101.233300,SD,Dewey,46041,Dewey County
850766,1999,10/30/1999 0:00,303,,Missing data/not specified/undetermined,,0.00010,43.536440,-96.860460,SD,,,
850630,1999,11/2/1999 0:00,306,,Equipment and vehicle use,,0.00010,44.389590,-99.471440,SD,,,
850801,1999,12/29/1999 0:00,363,,Missing data/not specified/undetermined,,0.00010,43.507650,-96.681020,SD,,,
400116141,2017,10/16/2017 0:00,289,,Equipment and vehicle use,,0.00010,43.875600,-99.737400,SD,Lyman,46085,Lyman County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
403500,2002,5/26/2002 0:00,146,,Natural,248.0,84015.00000,63.618700,-155.820600,AK,,,
553116,2008,8/22/2008 0:00,235,,Natural,,102379.53000,46.237700,-108.341700,MT,Yellowstone,30111,Yellowstone County
201752827,2011,4/14/2011 0:00,104,,Missing data/not specified/undetermined,,106027.00000,32.237396,-102.917336,TX,,,
552247,2006,7/12/2006 0:00,193,,Natural,,121687.00000,45.922800,-107.856100,MT,Big Horn,30003,Big Horn County


# Data Preprocessing
## convert all features into usable format
## Use random sample from originial dataset due to high computational cost
## Take 1% of original data
## Purpose : 
1. To handle big data
2. To debug and to see that the code actually works

In [5]:
# Later just remove this line of code when using original dataset
Fires = Fires.sample(frac=0.01,replace=False)

In [6]:
Fires

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_DATE,DISCOVERY_DOY,DISCOVERY_TIME,NWCG_GENERAL_CAUSE,CONT_DOY,FIRE_SIZE,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1446213,2010,5/1/2010 0:00,121,,Natural,,0.75,41.934768,-75.272540,NY,DELAWARE,36025,Delaware County
726658,2008,11/1/2008 0:00,306,,Arson/incendiarism,,2.00,37.359720,-82.006940,VA,Buchanan,51027,Buchanan County
300258777,2015,8/9/2015 0:00,221,1543.0,Arson/incendiarism,221.0,5.00,33.997170,-94.670170,OK,45,40089,McCurtain County
137246,2001,4/8/2001 0:00,98,1300.0,Debris and open burning,98.0,10.00,45.763889,-112.783333,MT,,,
1805301,1992,11/19/1992 0:00,324,800.0,Debris and open burning,324.0,2.00,37.943935,-81.883902,WV,Logan,54045,Logan County
...,...,...,...,...,...,...,...,...,...,...,...,...,...
646045,2000,8/12/2000 0:00,225,,Natural,225.0,209.00,30.202080,-95.018750,TX,Liberty,48291,Liberty County
219995,1997,6/8/1997 0:00,159,1030.0,Equipment and vehicle use,159.0,0.10,34.730500,-115.969400,CA,,,
31247,2008,6/2/2008 0:00,154,1530.0,Arson/incendiarism,155.0,0.17,34.693889,-81.471111,SC,87,45087,Union County
1040666,1999,3/16/1999 0:00,75,,Debris and open burning,,5.00,32.641700,-81.041700,SC,,,


# Colby's Coding

In [7]:
# extract month from discovery date
def get_first_element(date_list):
  return date_list[0]

Fires['DISCOVERY_DATE'] = Fires['DISCOVERY_DATE'].astype("string")
Fires['DISCOVERY_LIST'] = Fires['DISCOVERY_DATE'].str.split(pat='/')
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_LIST'].map(get_first_element)
Fires['DISCOVERY_MONTH'] = Fires['DISCOVERY_MONTH'].astype("int64")

# encode causes into integers
Fires['NWCG_GENERAL_CAUSE'] = Fires['NWCG_GENERAL_CAUSE'].astype("string")
cause_enc = LabelEncoder()
Fires['NWCG_CAUSE_CLASSIFICATION_ORD'] = cause_enc.fit_transform(Fires['NWCG_GENERAL_CAUSE'])


# figure out number of days to contain the fire
Fires['DAYS_TO_CONT'] = Fires['CONT_DOY'] - Fires['DISCOVERY_DOY']
##### if it was contained the next year, need to do something about that
def convert_negative_days(day):
  if day < 0:
    return 365 + day
  else:
    return day

Fires['DAYS_TO_CONT'] = Fires['DAYS_TO_CONT'].map(convert_negative_days)


# encode state categories into integers
Fires['STATE'] = Fires['STATE'].astype("string")
state_enc = LabelEncoder()
Fires['STATE_ORD'] = state_enc.fit_transform(Fires['STATE'])

# what to do about counties? Use fips code (must be an int)
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("string")
Fires['FIPS_CODE'].fillna('1000000',inplace=True)
Fires['FIPS_CODE'] = Fires['FIPS_CODE'].astype("int64")

# fill NA with mean of day to containment (mean = 0.92)
Fires['DAYS_TO_CONT'].fillna(value=Fires['DAYS_TO_CONT'].mean(),inplace=True)


# drop date and list
Fires.drop(columns=['DISCOVERY_DATE','DISCOVERY_LIST','NWCG_GENERAL_CAUSE',
                    'CONT_DOY','DISCOVERY_DOY','STATE','FIPS_NAME',
                    'COUNTY'],inplace=True)
gc.collect()

30

In [8]:
Fires.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1446213,2010,,0.75,41.934768,-75.27254,36025,5,7,0.881603,33
726658,2008,,2.0,37.35972,-82.00694,51027,11,0,0.881603,45
300258777,2015,1543.0,5.0,33.99717,-94.67017,40089,8,0,0.0,35
137246,2001,1300.0,10.0,45.763889,-112.783333,1000000,4,1,0.0,25
1805301,1992,800.0,2.0,37.943935,-81.883902,54045,11,1,0.0,49


#** Code by Heejun Son**
### Need to handle missing values in DISCOVERY_TIME column 
### Need to compare MICE vs KNN vs Replacement with the most frequent number
I did not drop DISCOVERY_TIME column.


In [9]:
# To see whether there is missing values
print("No. NaN in FIRE_YEAR: ", Fires['FIRE_YEAR'].isna().sum())
print("No. NaN in DISCOVERY_TIME: ", Fires['DISCOVERY_TIME'].isna().sum())
print("No. NaN in FIRE_SIZE: ", Fires['FIRE_SIZE'].isna().sum())
print("No. NaN in LATITUDE: ", Fires['LATITUDE'].isna().sum())
print("No. NaN in LONGITUDE: ", Fires['LONGITUDE'].isna().sum())
print("No. NaN in FIPS_CODE: ", Fires['FIPS_CODE'].isna().sum())
print("No. NaN in DISCOVERY_MONTH: ", Fires['DISCOVERY_MONTH'].isna().sum())
print("No. NaN in NWCG_CAUSE_CLASSIFICATION_ORD: ", Fires['NWCG_CAUSE_CLASSIFICATION_ORD'].isna().sum())
print("No. NaN in DAYS_TO_CONT: ", Fires['DAYS_TO_CONT'].isna().sum())
print("No. NaN in STATE_ORD: ", Fires['STATE_ORD'].isna().sum())

No. NaN in FIRE_YEAR:  0
No. NaN in DISCOVERY_TIME:  7640
No. NaN in FIRE_SIZE:  0
No. NaN in LATITUDE:  0
No. NaN in LONGITUDE:  0
No. NaN in FIPS_CODE:  0
No. NaN in DISCOVERY_MONTH:  0
No. NaN in NWCG_CAUSE_CLASSIFICATION_ORD:  0
No. NaN in DAYS_TO_CONT:  0
No. NaN in STATE_ORD:  0


In [10]:
#Drop all the NaN values
Fires_Drop = Fires.sample(frac=1,random_state=0).dropna()
Fires_Drop

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
153479,2002,1913.0,5.00,34.273056,-117.548056,1000000,6,12,1.000000,4
14627,2006,1230.0,0.10,43.791389,-122.108333,41039,7,7,0.000000,36
300142767,2014,316.0,0.75,37.974254,-122.546380,6041,5,5,0.881603,4
213859,1994,1522.0,5.00,65.599700,-142.219100,1000000,7,7,6.000000,0
376955,2002,1730.0,0.10,35.899264,-119.312916,1000000,4,12,0.000000,4
...,...,...,...,...,...,...,...,...,...,...
19097081,1993,1356.0,0.55,34.549600,-83.451700,13137,4,1,0.000000,9
400041769,2016,1210.0,0.10,41.545700,-74.380300,36071,6,2,0.000000,33
19073913,1994,1915.0,9.35,32.647100,-83.852800,13079,5,1,0.000000,9
1813345,2003,111.0,0.10,40.879319,-72.400995,36103,11,5,0.000000,33


In [11]:
#Replace with the most_Frequent value
Fires_Freq = Fires.sample(frac=1,random_state=0)
Fires_Freq['DISCOVERY_TIME'] = Fires_Freq[['DISCOVERY_TIME']].fillna(Fires_Freq[['DISCOVERY_TIME']].mode().iloc[0])
Fires_Freq

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
153479,2002,1913.0,5.00,34.273056,-117.548056,1000000,6,12,1.000000,4
14627,2006,1230.0,0.10,43.791389,-122.108333,41039,7,7,0.000000,36
836761,1993,1400.0,0.10,32.941667,-80.612500,45029,9,5,0.000000,40
1137933,1994,1400.0,5.00,34.478300,-77.968300,1000000,5,0,0.881603,26
300142767,2014,316.0,0.75,37.974254,-122.546380,6041,5,5,0.881603,4
...,...,...,...,...,...,...,...,...,...,...
400041769,2016,1210.0,0.10,41.545700,-74.380300,36071,6,2,0.000000,33
19073913,1994,1915.0,9.35,32.647100,-83.852800,13079,5,1,0.000000,9
1813345,2003,111.0,0.10,40.879319,-72.400995,36103,11,5,0.000000,33
897367,1994,1435.0,0.10,45.866367,-89.651038,55085,5,0,0.000000,48


In [12]:
#Replace NaN with values predicted by KNN
from sklearn.impute import KNNImputer
Fires_KNN = Fires.sample(frac=1,random_state=0)
Input = Fires_KNN[['FIRE_YEAR','DISCOVERY_TIME','FIRE_SIZE','LATITUDE','LONGITUDE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD']]
imputer = KNNImputer(n_neighbors=5)

In [13]:
result = imputer.fit_transform(Input)
check = pd.DataFrame(result)
New_Discovery_Time = check[[1]].values.tolist()

In [14]:
Fires_KNN['Assumption'] = New_Discovery_Time
Fires_KNN.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
153479,2002,1913.0,5.0,34.273056,-117.548056,1000000,6,12,1.0,4,[1913.0]
14627,2006,1230.0,0.1,43.791389,-122.108333,41039,7,7,0.0,36,[1230.0]
836761,1993,,0.1,32.941667,-80.6125,45029,9,5,0.0,40,[1496.0]
1137933,1994,,5.0,34.4783,-77.9683,1000000,5,0,0.881603,26,[1765.0]
300142767,2014,316.0,0.75,37.974254,-122.54638,6041,5,5,0.881603,4,[316.0]


In [15]:
# Convert object to float64
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].astype("string")
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].str.strip('[]')
Fires_KNN['Assumption'] = Fires_KNN['Assumption'].astype("float64")
# Drop DISCOVERY_TIME column that contains NaN
Fires_KNN.drop(columns=['DISCOVERY_TIME'],inplace=True)
# Change column name from Assumption to DISCOVERY_TIME
Fires_KNN['DISCOVERY_TIME'] = Fires_KNN['Assumption']
Fires_KNN.drop(columns=['Assumption'],inplace=True)
Fires_KNN

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
153479,2002,5.00,34.273056,-117.548056,1000000,6,12,1.000000,4,1913.0
14627,2006,0.10,43.791389,-122.108333,41039,7,7,0.000000,36,1230.0
836761,1993,0.10,32.941667,-80.612500,45029,9,5,0.000000,40,1496.0
1137933,1994,5.00,34.478300,-77.968300,1000000,5,0,0.881603,26,1765.0
300142767,2014,0.75,37.974254,-122.546380,6041,5,5,0.881603,4,316.0
...,...,...,...,...,...,...,...,...,...,...
400041769,2016,0.10,41.545700,-74.380300,36071,6,2,0.000000,33,1210.0
19073913,1994,9.35,32.647100,-83.852800,13079,5,1,0.000000,9,1915.0
1813345,2003,0.10,40.879319,-72.400995,36103,11,5,0.000000,33,111.0
897367,1994,0.10,45.866367,-89.651038,55085,5,0,0.000000,48,1435.0


In [16]:
#Replace NaN with values predicted by Multivariate feature imputation
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
Fires_MICE = Fires.sample(frac=1,random_state=0)
Input = Fires_KNN[['FIRE_YEAR','DISCOVERY_TIME','FIRE_SIZE','LATITUDE','LONGITUDE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD']]
imp_15 = IterativeImputer(max_iter=15, random_state=0)
imp_15.fit(Input)
result_mice = imp_15.transform(Input)
check_mice = pd.DataFrame(result_mice)
New_Discovery_Time_mice = check_mice[[1]].values.tolist()

In [17]:
Fires_MICE['Assumption'] = New_Discovery_Time_mice
Fires_MICE.head()

Unnamed: 0_level_0,FIRE_YEAR,DISCOVERY_TIME,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,Assumption
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
153479,2002,1913.0,5.0,34.273056,-117.548056,1000000,6,12,1.0,4,[1913.0]
14627,2006,1230.0,0.1,43.791389,-122.108333,41039,7,7,0.0,36,[1230.0]
836761,1993,,0.1,32.941667,-80.6125,45029,9,5,0.0,40,[1496.0]
1137933,1994,,5.0,34.4783,-77.9683,1000000,5,0,0.881603,26,[1765.0]
300142767,2014,316.0,0.75,37.974254,-122.54638,6041,5,5,0.881603,4,[316.0]


In [18]:
Fires_MICE.dtypes

FIRE_YEAR                          int64
DISCOVERY_TIME                   float64
FIRE_SIZE                        float64
LATITUDE                         float64
LONGITUDE                        float64
FIPS_CODE                          int64
DISCOVERY_MONTH                    int64
NWCG_CAUSE_CLASSIFICATION_ORD      int64
DAYS_TO_CONT                     float64
STATE_ORD                          int64
Assumption                        object
dtype: object

In [19]:
# Convert object to float64
Fires_MICE['Assumption'] = Fires_MICE['Assumption'].astype("string")
Fires_MICE['Assumption'] = Fires_MICE['Assumption'].str.strip('[]')
Fires_MICE['Assumption'] = Fires_MICE['Assumption'].astype("float64")
# Drop DISCOVERY_TIME column that contains NaN
Fires_MICE.drop(columns=['DISCOVERY_TIME'],inplace=True)
# Change column name from Assumption to DISCOVERY_TIME
Fires_MICE['DISCOVERY_TIME'] = Fires_MICE['Assumption']
Fires_MICE.drop(columns=['Assumption'],inplace=True)
Fires_MICE

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
153479,2002,5.00,34.273056,-117.548056,1000000,6,12,1.000000,4,1913.0
14627,2006,0.10,43.791389,-122.108333,41039,7,7,0.000000,36,1230.0
836761,1993,0.10,32.941667,-80.612500,45029,9,5,0.000000,40,1496.0
1137933,1994,5.00,34.478300,-77.968300,1000000,5,0,0.881603,26,1765.0
300142767,2014,0.75,37.974254,-122.546380,6041,5,5,0.881603,4,316.0
...,...,...,...,...,...,...,...,...,...,...
400041769,2016,0.10,41.545700,-74.380300,36071,6,2,0.000000,33,1210.0
19073913,1994,9.35,32.647100,-83.852800,13079,5,1,0.000000,9,1915.0
1813345,2003,0.10,40.879319,-72.400995,36103,11,5,0.000000,33,111.0
897367,1994,0.10,45.866367,-89.651038,55085,5,0,0.000000,48,1435.0


# Create a baseline mse and mae

In [20]:
FIRE_SIZE_PERMUTE = Fires['FIRE_SIZE'].sample(frac=1,random_state=0)

In [21]:
print("MAE: ",mean_absolute_error(FIRE_SIZE_PERMUTE,Fires['FIRE_SIZE']))
print("MSE: ",mean_squared_error(FIRE_SIZE_PERMUTE,Fires['FIRE_SIZE']))

MAE:  157.81074681488832
MSE:  16982015.826126453


# ** Code by Heejun **
## Feature selection information
Later need to convert LONGITUDE to negative values!!

Reason for using abs() : There are no positive values !!
                        
(United States only have negative lonitude values)

In [22]:
# Convert negative value to positive
# Feature Selection must have non-negative values
Fires_Drop['LONGITUDE'] = Fires_Drop['LONGITUDE'].abs()
Fires_Freq['LONGITUDE'] = Fires_Freq['LONGITUDE'].abs()
Fires_KNN['LONGITUDE'] = Fires_KNN['LONGITUDE'].abs()
Fires_MICE['LONGITUDE'] = Fires_MICE['LONGITUDE'].abs()

In [23]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing
############################################################################################################################################################
X_Drop = Fires_Drop[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_Drop = Fires_Drop['FIRE_SIZE']
# Transform FIRE_SIZE
# SelectKBest() cannot handle countinous numbers
Encoder_Drop = preprocessing.LabelEncoder()
Firs_size_encoded_Drop = Encoder_Drop.fit_transform(y_Drop)

Best_Features_Drop = dict()
for i in range(1,9):
  selector_Drop = SelectKBest(chi2, k=i)
  selector_Drop.fit(X_Drop, Firs_size_encoded_Drop)
  Features_Drop = list(X_Drop.columns[selector_Drop.get_support(indices=True)])
  Best_Features_Drop[i] = Features_Drop
  print("(Drop) When k= ", i, ",", Features_Drop)

############################################################################################################################################################
X_Freq = Fires_Freq[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_Freq = Fires_Freq['FIRE_SIZE']
# Transform FIRE_SIZE
# SelectKBest() cannot handle countinous numbers
Encoder_Freq = preprocessing.LabelEncoder()
Firs_size_encoded_Freq = Encoder_Freq.fit_transform(y_Freq)

Best_Features_Freq = dict()
for i in range(1,9):
  selector_Freq = SelectKBest(chi2, k=i)
  selector_Freq.fit(X_Freq, Firs_size_encoded_Freq)
  Features_Freq = list(X_Freq.columns[selector_Freq.get_support(indices=True)])
  Best_Features_Freq[i] = Features_Freq
  print("(Freq) When k= ", i, ",", Features_Freq)

############################################################################################################################################################
X_KNN = Fires_KNN[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_KNN = Fires_KNN['FIRE_SIZE']
# Transform FIRE_SIZE
# SelectKBest() cannot handle countinous numbers
Encoder_KNN = preprocessing.LabelEncoder()
Firs_size_encoded_KNN = Encoder_KNN.fit_transform(y_KNN)

Best_Features_KNN = dict()
for i in range(1,9):
  selector_KNN = SelectKBest(chi2, k=i)
  selector_KNN.fit(X_KNN, Firs_size_encoded_KNN)
  Features_KNN = list(X_KNN.columns[selector_KNN.get_support(indices=True)])
  Best_Features_KNN[i] = Features_KNN
  print("(KNN) When k= ", i, ",", Features_KNN)
  ############################################################################################################################################################
X_MICE = Fires_MICE[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_MICE = Fires_MICE['FIRE_SIZE']
# Transform FIRE_SIZE
# SelectKBest() cannot handle countinous numbers
Encoder_MICE = preprocessing.LabelEncoder()
Firs_size_encoded_MICE = Encoder_MICE.fit_transform(y_MICE)

Best_Features_MICE = dict()
for i in range(1,9):
  selector_MICE = SelectKBest(chi2, k=i)
  selector_MICE.fit(X_MICE, Firs_size_encoded_MICE)
  Features_MICE = list(X_MICE.columns[selector_MICE.get_support(indices=True)])
  Best_Features_MICE[i] = Features_MICE
  print("(MICE) When k= ", i, ",", Features_MICE)

(Drop) When k=  1 , ['FIPS_CODE']
(Drop) When k=  2 , ['FIPS_CODE', 'DAYS_TO_CONT']
(Drop) When k=  3 , ['FIPS_CODE', 'DAYS_TO_CONT', 'DISCOVERY_TIME']
(Drop) When k=  4 , ['FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Drop) When k=  5 , ['LONGITUDE', 'FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Drop) When k=  6 , ['LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Drop) When k=  7 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Drop) When k=  8 , ['LATITUDE', 'LONGITUDE', 'FIPS_CODE', 'DISCOVERY_MONTH', 'NWCG_CAUSE_CLASSIFICATION_ORD', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  1 , ['FIPS_CODE']
(Freq) When k=  2 , ['FIPS_CODE', 'DAYS_TO_CONT']
(Freq) When k=  3 , ['FIPS_CODE', 'DAYS_TO_CONT', 'DISCOVERY_TIME']
(Freq) When k=  4 , ['FIPS_CODE', 'DAYS_TO_CONT', 'STATE_ORD', 'DISCOVERY_TIME']
(Freq) When k=  

Freatures to predict class:
- fire year, general cause, lat, long

Features to predict cause:
- fire year, size class, lat, long

# With missing values(NaN), models cannot be trained!

Below code will show the error!

In [24]:
#### split data into training, validation, and testing
X = Fires[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y = Fires['FIRE_SIZE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
#### List of models
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor, SVR] #GaussianNB, 
################################################################################################################################               
model_scores = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor = model().fit(X_train,y_train)
  y_pred = regressor.predict(X_test)
  model_mse = mean_squared_error(y_pred,y_test)
  model_mae = mean_absolute_error(y_pred,y_test)
  model_scores[model] = (regressor,y_pred,model_mse,model_mae)
  print("MSE = ", model_mse, '\n', "MAE = ", model_mae)

<class 'sklearn.linear_model._base.LinearRegression'>


ValueError: ignored

# MODEL

In [25]:
#### split data into training, validation, and testing
X_Drop = Fires_Drop[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_Drop = Fires_Drop['FIRE_SIZE']
X_Drop_train, X_Drop_test, y_Drop_train, y_Drop_test = train_test_split(X_Drop, y_Drop, test_size=0.2, random_state=0)
######################################################################################################################################################################
X_Freq = Fires_Freq[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_Freq = Fires_Freq['FIRE_SIZE']
X_Freq_train, X_Freq_test, y_Freq_train, y_Freq_test = train_test_split(X_Freq, y_Freq, test_size=0.2, random_state=0)
######################################################################################################################################################################
X_KNN = Fires_KNN[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_KNN = Fires_KNN['FIRE_SIZE']
X_KNN_train, X_KNN_test, y_KNN_train, y_KNN_test = train_test_split(X_KNN, y_KNN, test_size=0.2, random_state=0)
######################################################################################################################################################################
X_MICE = Fires_MICE[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_MICE = Fires_MICE['FIRE_SIZE']
X_MICE_train, X_MICE_test, y_MICE_train, y_MICE_test = train_test_split(X_MICE, y_MICE, test_size=0.2, random_state=0)

# Need to handle imbalanced data

# SMOTER will apply oversampling to minority class(large fire size)

In [28]:
pip install smogn

Collecting smogn
  Downloading smogn-0.1.2-py3-none-any.whl (30 kB)
Installing collected packages: smogn
Successfully installed smogn-0.1.2


In [29]:
import smogn
Smoter_KNN = smogn.smoter(

    data = Fires_KNN.reset_index(), 
    y = "FIRE_SIZE"
)

dist_matrix: 100%|##########| 4687/4687 [2:17:25<00:00,  1.76s/it]
synth_matrix: 100%|##########| 4687/4687 [00:08<00:00, 538.34it/s]
r_index: 100%|##########| 1459/1459 [00:02<00:00, 538.17it/s]


In [39]:
Smoter_KNN = Smoter_KNN.set_index('FOD_ID')
Smoter_KNN

Unnamed: 0_level_0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,FIPS_CODE,DISCOVERY_MONTH,NWCG_CAUSE_CLASSIFICATION_ORD,DAYS_TO_CONT,STATE_ORD,DISCOVERY_TIME
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
20025409,2011,4.505495,33.621096,93.369433,1000000,2,0,0.466887,32,1436.036471
400125055,2017,5.700372,32.711216,82.276847,13114,4,1,0.000000,9,1651.743707
400121320,2017,4.322261,32.668116,89.362293,22664,2,1,0.000000,17,1271.356338
473858,2006,5.705935,33.104536,84.606110,13263,2,0,0.881603,9,1409.678624
1461948,2010,5.573437,32.831534,83.127388,13016,3,1,0.000000,9,1452.303613
...,...,...,...,...,...,...,...,...,...,...
562231,2005,1.000000,35.327630,78.208040,37191,3,1,0.000000,26,1433.000000
300137686,2014,1.000000,34.514626,85.732378,1049,11,5,0.000000,1,1244.800000
19097081,1993,0.550000,34.549600,83.451700,13137,4,1,0.000000,9,1356.000000
400041769,2016,0.100000,41.545700,74.380300,36071,6,2,0.000000,33,1210.000000


In [40]:
X_KNN_SMOTER = Smoter_KNN[['FIRE_YEAR','LATITUDE','LONGITUDE','FIPS_CODE','DISCOVERY_MONTH','NWCG_CAUSE_CLASSIFICATION_ORD','DAYS_TO_CONT','STATE_ORD','DISCOVERY_TIME']]
y_KNN_SMOTER = Smoter_KNN['FIRE_SIZE']
X_KNN_SMOTER_train, X_KNN_SMOTER_test, y_KNN_SMOTER_train, y_KNN_SMOTER_test = train_test_split(X_KNN_SMOTER, y_KNN_SMOTER, test_size=0.2, random_state=0)

In [41]:
#### list of models to train
# xgboost, linear regression (with and without regularization), random forest, naive bayes, nearest neighbors
# k-means, svm
model_list = [LinearRegression, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor,
               KNeighborsRegressor, SVR] #GaussianNB, 
################################################################################################################################               
model_scores_Drop = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor_Drop = model().fit(X_Drop_train,y_Drop_train)
  y_pred_Drop = regressor_Drop.predict(X_Drop_test)
  model_Drop_mse = mean_squared_error(y_pred_Drop,y_Drop_test)
  model_Drop_mae = mean_absolute_error(y_pred_Drop,y_Drop_test)
  model_scores_Drop[model] = (regressor_Drop,y_pred_Drop,model_Drop_mse,model_Drop_mae)
  print("(Drop) MSE = ", model_Drop_mse, '\n', "MAE = ", model_Drop_mae)
################################################################################################################################               
model_scores_Freq = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor_Freq = model().fit(X_Freq_train,y_Freq_train)
  y_Freq_pred = regressor_Freq.predict(X_Freq_test)
  model_Freq_mse = mean_squared_error(y_Freq_pred,y_Freq_test)
  model_Freq_mae = mean_absolute_error(y_Freq_pred,y_Freq_test)
  model_scores_Freq[model] = (regressor_Freq,y_Freq_pred,model_Freq_mse,model_Freq_mae)
  print("(Freq) MSE = ", model_Freq_mse, '\n', "(Freq) MAE = ", model_Freq_mae)
################################################################################################################################
model_scores_KNN = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor_KNN = model().fit(X_KNN_train,y_KNN_train)
  y_KNN_pred = regressor_KNN.predict(X_KNN_test)
  model_KNN_mse = mean_squared_error(y_KNN_pred,y_KNN_test)
  model_KNN_mae = mean_absolute_error(y_KNN_pred,y_KNN_test)
  model_scores_KNN[model] = (regressor_KNN,y_KNN_pred,model_KNN_mse,model_KNN_mae)
  print("(KNN) MSE = ", model_KNN_mse, '\n', "(KNN) MAE = ", model_KNN_mae)
################################################################################################################################
model_scores_MICE = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor_MICE = model().fit(X_MICE_train,y_MICE_train)
  y_MICE_pred = regressor_MICE.predict(X_MICE_test)
  model_MICE_mse = mean_squared_error(y_MICE_pred,y_MICE_test)
  model_MICE_mae = mean_absolute_error(y_MICE_pred,y_MICE_test)
  model_scores_MICE[model] = (regressor_MICE,y_MICE_pred,model_MICE_mse,model_MICE_mae)
  print("(MICE) MSE = ", model_MICE_mse, '\n', "(MICE) MAE = ", model_MICE_mae)
  ##################################################################################################################################
model_scores_KNN_SMOTER = dict()
for model in model_list:
  # fit model to data
  print(model)
  regressor_KNN_SMOTER = model().fit(X_KNN_SMOTER_train,y_KNN_SMOTER_train)
  y_KNN_SMOTER_pred = regressor_KNN_SMOTER.predict(X_KNN_SMOTER_test)
  model_KNN_SMOTER_mse = mean_squared_error(y_KNN_SMOTER_pred,y_KNN_SMOTER_test)
  model_KNN_SMOTER_mae = mean_absolute_error(y_KNN_SMOTER_pred,y_KNN_SMOTER_test)
  model_scores_KNN_SMOTER[model] = (regressor_KNN_SMOTER,y_KNN_SMOTER_pred,model_KNN_SMOTER_mse,model_KNN_SMOTER_mae)
  print("(KNN_SMOTER) MSE = ", model_KNN_SMOTER_mse, '\n', "(KNN) MAE = ", model_KNN_SMOTER_mae)

<class 'sklearn.linear_model._base.LinearRegression'>
(Drop) MSE =  1739296.2370843058 
 MAE =  176.39380876878622
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
(Drop) MSE =  3657867.216830225 
 MAE =  182.3512577464659
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
(Drop) MSE =  5139248.145206348 
 MAE =  196.1233834080455
<class 'sklearn.ensemble._weight_boosting.AdaBoostRegressor'>
(Drop) MSE =  3074307.469563027 
 MAE =  551.8069218688919
<class 'sklearn.neighbors._regression.KNeighborsRegressor'>
(Drop) MSE =  4043470.7570600277 
 MAE =  133.86355411528868
<class 'sklearn.svm._classes.SVR'>
(Drop) MSE =  1776525.0412034583 
 MAE =  63.49687146476582
<class 'sklearn.linear_model._base.LinearRegression'>
(Freq) MSE =  1785266.8143636712 
 (Freq) MAE =  139.90293619479033
<class 'sklearn.ensemble._forest.RandomForestRegressor'>
(Freq) MSE =  1894735.0129911366 
 (Freq) MAE =  98.19852011859602
<class 'sklearn.ensemble._gb.GradientBoostingRegressor'>
(Freq) MSE = 

# Save Result for future work

In [42]:
# Make a dataframe with True values and Predicted values
result_Drop = pd.DataFrame({'True': y_Drop_test, 'Pred': y_pred_Drop})
result_Freq = pd.DataFrame({'True': y_Freq_test, 'Pred': y_Freq_pred})
result_KNN = pd.DataFrame({'True': y_KNN_test, 'Pred': y_KNN_pred})
result_MICE = pd.DataFrame({'True': y_MICE_test, 'Pred': y_MICE_pred})
result_KNN_SMOTER = pd.DataFrame({'True': y_KNN_SMOTER_test, 'Pred': y_KNN_SMOTER_pred})
# Add State, Lat, Long information by using merge() function
Fires_State = Fires[['LATITUDE','LONGITUDE','STATE_ORD']]
result_State_Drop = pd.merge(result_Drop, Fires_State, how='inner', on='FOD_ID')
result_State_Freq = pd.merge(result_Freq, Fires_State, how='inner', on='FOD_ID')
result_State_KNN = pd.merge(result_KNN, Fires_State, how='inner', on='FOD_ID')
result_State_MICE = pd.merge(result_MICE, Fires_State, how='inner', on='FOD_ID')
result_State_KNN_SMOTER = pd.merge(result_KNN_SMOTER, Fires_State, how='inner', on='FOD_ID')
# Add squared-error between True and Pred values
result_State_Drop['Error'] = (result_State_Drop['True'] - result_State_Drop['Pred'])**2
result_State_Freq['Error'] = (result_State_Freq['True'] - result_State_Freq['Pred'])**2
result_State_KNN['Error'] = (result_State_KNN['True'] - result_State_KNN['Pred'])**2
result_State_MICE['Error'] = (result_State_MICE['True'] - result_State_MICE['Pred'])**2
result_State_KNN_SMOTER['Error'] = (result_State_KNN_SMOTER['True'] - result_State_KNN_SMOTER['Pred'])**2
# To decode STATE_ORD to STATE
result_State_Drop['STATE'] = state_enc.inverse_transform(result_State_Drop['STATE_ORD'])
result_State_Freq['STATE'] = state_enc.inverse_transform(result_State_Freq['STATE_ORD'])
result_State_KNN['STATE'] = state_enc.inverse_transform(result_State_KNN['STATE_ORD'])
result_State_MICE['STATE'] = state_enc.inverse_transform(result_State_MICE['STATE_ORD'])
result_State_KNN_SMOTER['STATE'] = state_enc.inverse_transform(result_State_KNN_SMOTER['STATE_ORD'])
# To check code works properly
result_State_KNN_SMOTER

Unnamed: 0_level_0,True,Pred,LATITUDE,LONGITUDE,STATE_ORD,Error,STATE
FOD_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1475933,0.1,7.577367,37.157200,-112.066400,44,55.911011,UT
400021596,201.0,7.332002,47.408900,-107.828900,25,37507.293535,MT
19961887,7.0,7.551870,36.158333,-88.398333,42,0.304561,TN
1013781,2.0,7.600030,30.832200,-93.292800,17,31.360338,LA
201917708,0.1,7.241246,44.704889,-67.905611,20,50.997394,ME
...,...,...,...,...,...,...,...
1146937,1.0,7.599934,35.235000,-83.161700,26,43.559123,NC
990054,0.3,7.599954,34.481400,-84.530800,9,53.289333,GA
159920,0.1,7.600276,41.333889,-121.055833,4,56.254135,CA
448892,25.0,7.099688,27.752220,-82.282500,8,320.421186,FL


In [43]:
result_State_Drop.to_csv('Drop Result.csv', index = False)
result_State_Freq.to_csv('Freq Result.csv', index = False)
result_State_KNN.to_csv('KNN Result.csv', index = False)
result_State_MICE.to_csv('MICE Result.csv', index = False)
result_State_KNN_SMOTER.to_csv('KNN SMOTER Result.csv', index = False)

In [38]:
Smoter_KNN.to_csv('SMOTER_KNN.csv', index = False)