In [1]:
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit

import model_prep_and_evals as mpe

from joblib import dump

In [2]:
# ---------------------------------------------------------------
# Training-data switch
#   whole_set = True  -> the test set is appended to the train set and the
#                        model is fit on the combined samples
#   whole_set = False -> the model is fit on the train set only
# ---------------------------------------------------------------
whole_set = False

In [3]:
# Load the train and test samples and split each one into features (X) and labels (y).

def _load_features_and_labels(csv_name):
    """Read one sample CSV and return its ``(features, labels)`` pair.

    The file is looked up in the ``data_sampling_workflow`` folder under the
    current working directory and is parsed a single time; both selections are
    taken from the same DataFrame.

    Parameters
    ----------
    csv_name : str
        File name of the CSV inside ``data_sampling_workflow``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the label-based column slice from ``'r'``
        through ``'day_in_year'`` (``.loc`` slices include both endpoints) and
        ``y`` is the ``'iceplant'`` column.
    """
    samples = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', csv_name))
    # Features: every column from 'r' to 'day_in_year', in file order.
    # NOTE(review): this presumably leaves out x, y, pts_crs, aoi, naip_id,
    # polygon_id, iceplant and the lidar columns -- confirm against the CSV header.
    features = samples.loc[:, 'r':'day_in_year']
    labels = samples.loc[:, 'iceplant']
    return features, labels


X_train, y_train = _load_features_and_labels('train_set.csv')
X_test, y_test = _load_features_and_labels('test_set.csv')

# When training on the whole data set, append the test samples to the train samples.
if whole_set:
    X_train = pd.concat([X_train, X_test], axis=0)
    y_train = pd.concat([y_train, y_test], axis=0)

In [4]:
# Convert the training data to NumPy arrays, keeping the feature column order
# in `feature_names`.
# NOTE: X_train and y_train are rebound from pandas objects to arrays here, so
# this cell cannot be re-run on its own (an ndarray has no `.columns`);
# re-run the loading cell first.
y_train = y_train.to_numpy()
feature_names = X_train.columns.tolist()
X_train = X_train.to_numpy()

In [5]:
# Random forest with 100 trees and a fixed seed, wrapped in dask-ml's
# ParallelPostFit meta-estimator.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
rfc = ParallelPostFit(forest)
rfc.fit(X_train, y_train)

# Save the fitted wrapper; `dump` is left as the last expression so the list of
# written file names is shown as the cell output.
dump(rfc, 'spectral_rfc.joblib')

['spectral_rfc.joblib']

In [6]:
# TODO: write a text file with the model metadata, including the feature names in training order
# TODO: include the model's cross-validation results?

In [6]:
# Evaluate the model on the test set. This is skipped when whole_set is True,
# because the test samples were then merged into the training data and no
# longer give an independent evaluation.
if not whole_set:
    # X_test and y_test are still pandas objects at this point, so convert them
    # to arrays to match the array inputs the model was fit on.
    preds = rfc.predict(X_test.to_numpy())
    mpe.print_accuracy_info(y_test.to_numpy(), preds)

true negatives: 80844     false positives: 4986
false negatives: 4284     true positives: 72988

sensitivity (TP/P): 94.46 %
specificity (TN/N): 94.19 %
G-mean:  0.94

precision (TP/(TP+FP)): 93.61 %

MCC:  0.8860995697109832

F1-measure:  0.94029
F0.5-measure (min false positives):  0.93774
F2-measure (min false negatives)  :  0.94285

accuracy: 94.32 %
