In [5]:
import json # will be needed for saving preprocessing details
import numpy as np # for data engineering
import pandas as pd # for data engineering
import joblib # saving algorithm and preprocessing serialized objects
from sklearn.model_selection import train_test_split # used for data splitting
from sklearn.preprocessing import LabelEncoder # used for preprocessing
from sklearn.ensemble import RandomForestClassifier # training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # training the algorithm

In [6]:
# load test dataset: 
df = pd.read_csv('/Users/micahkirkpatrick/Documents/decision_support_tool/model_data/base6.csv', skipinitialspace=False)
x = df.drop(['SerialNumber', 'HurricaneName', 'DeclarationDate', 'IncidentBeginDate', 'IncidentEndDate', 'DisasterCloseOutDate', 'Declared', 'GeoName', 'County', 'State'], axis = 1)
# set input matrix and target column:
y = df['Declared']
# show first rows of data:
x.head()

Unnamed: 0,FIPS,MaxPrecipitation,WindSwathRadii,CountyOverlap,VALUE_1,VALUE_2,VALUE_3,VALUE_4,VALUE_5,VALUE_7,...,Deciduous_Forest,Evergreen_Forest,Mixed_Forest,Shrub_Scrub,Herbaceuous,Hay_Pasture,Cultivated_Crops,Woody_Wetlands,Emergent_Herbaceuous_Wetlands,Population
0,22001,"4"" to 6""",58,100.0,24766790.0,14172920.0,1105560.0,0.0,0.0,0.0,...,3099600,29845800,31617900,2769300,2081700,114534900,1062193500,248421600,2347200,62590.0
1,22009,"2"" to 4""",39,100.0,,,,,,,...,47990700,52580700,37260000,12894300,12469500,254997900,830644200,758770200,16775100,40980.0
2,22011,"1"" to 2""",58,12.217249,,,,,,,...,8638200,1443164400,53488800,338255100,136060200,226836900,31453200,535906800,5139000,36928.0
3,22013,"1"" to 2""",39,100.0,,,,,,,...,109230300,1048653000,23819400,237616200,147206700,22209300,2137500,366525000,2635200,13638.0
4,22015,"0"" to 1""",39,43.920893,,,,,,,...,121778100,799141500,84913200,95726700,58199400,273266100,48853800,433539900,12926700,127634.0


In [7]:
# data split into train & test sets:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1234)

In [8]:
train_mode = dict(x_train.mode().iloc[0])
x_train = x_train.fillna(train_mode)
print(x_train)

       FIPS MaxPrecipitation  WindSwathRadii  CountyOverlap       VALUE_1  \
1047  13211         4" to 6"              39     100.000000  3.667232e+03   
299   51550         1" to 2"              39     100.000000  3.099532e+07   
1676  12049         4" to 6"               0       0.000000  3.667232e+03   
614   37091         4" to 6"              39      90.374872  1.171629e+07   
875   28013         0" to 1"               0       0.000000  3.667232e+03   
...     ...              ...             ...            ...           ...   
279   51165         2" to 4"               0       0.000000  3.667232e+03   
2041  12127        6" to 10"              74      46.300204  3.044275e+07   
664   45087         2" to 4"               0       0.000000  3.667232e+03   
1318  48075         2" to 4"               0       0.000000  3.667232e+03   
723   51169         2" to 4"               0       0.000000  3.667232e+03   

           VALUE_2       VALUE_3       VALUE_4       VALUE_5  VALUE_7  ... 

In [10]:
# convert categorical input feature elements:
x_encoders = {}

for column in ['FIPS',
              'MaxPrecipitation',
              'WindSwathRadii']:
    categorical_convert = LabelEncoder()
    x_train[column] = categorical_convert.fit_transform(x_train[column])
    x_encoders[column] = categorical_convert
    
# convert the categorical target elements:
categorical_convert = LabelEncoder()
y_train = categorical_convert.fit_transform(y_train)

In [13]:
y_train

array([1, 0, 0, ..., 0, 0, 0])

In [14]:
# train the random forest algorithm:
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(x_train, y_train)

In [15]:
rf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
# train the Extra Trees algorithm:
et = ExtraTreesClassifier(n_estimators = 100)
et = et.fit(x_train, y_train)

In [18]:
et

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [23]:
# save preprocessing objects, RF & ET algorithms:
joblib.dump(train_mode, "./train_mode.joblib", compress=True)
joblib.dump(x_encoders, "./encoders.joblib", compress=True)
joblib.dump(rf, "./random_forest.joblib", compress=True)
joblib.dump(et, "./extra_trees.joblib", compress=True)

['./extra_trees.joblib']