In [1]:
import os
import time
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit  # wrapper so that model predicts using dask

from joblib import dump

import model_prep_and_evals as mpe # custom module

In [2]:
# Training-data switch: when True, the test set is appended to the training set
# before the model is fit; when False, only the training set is used.
whole_set = False

In [3]:
# ------------------------------
# Import train and test data

# Years dropped from both sets since these do not have LIDAR data of their own.
years_without_lidar = [2012, 2014]


def _load_features_and_labels(csv_name):
    """Read one sample CSV a single time and split it into features and labels.

    Parameters
    ----------
    csv_name : str
        File name inside the 'data_sampling_workflow' directory located in the
        current working directory (e.g. 'train_set.csv').

    Returns
    -------
    features : pandas.DataFrame
        Columns from 'r' (Red band) to 'avg_lidar'; this excludes geometry,
        aoi, naip_id, polygon_id and the iceplant column.
    labels : pandas.Series
        The 'iceplant' column, restricted to the same rows as `features`.

    Both outputs keep the row labels of the CSV, with the rows whose year is in
    `years_without_lidar` removed.
    """
    samples = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', csv_name))

    # A single boolean mask drives both selections, so features and labels
    # cannot drift out of alignment (no positional re-indexing by label).
    has_own_lidar = ~samples['year'].isin(years_without_lidar)

    features = samples.loc[has_own_lidar, 'r':'avg_lidar']
    labels = samples.loc[has_own_lidar, 'iceplant']
    return features, labels


X_train, y_train = _load_features_and_labels('train_set.csv')
X_test, y_test = _load_features_and_labels('test_set.csv')

In [4]:
# Optionally fold the test samples into the training data so that the model
# is fit on every labelled sample instead of the training split only.
if whole_set:
    X_train = pd.concat([X_train, X_test], axis=0)
    y_train = pd.concat([y_train, y_test], axis=0)

# Record the feature order now: the column labels are lost once the frame is
# converted to a NumPy array below.
feature_names = X_train.columns.to_list()
print(feature_names)

# NOTE: this cell rebinds X_train / y_train to NumPy arrays, so it is not safe
# to run twice in a row; re-run the data-loading cell before repeating it.
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()

['r', 'g', 'b', 'nir', 'ndvi', 'year', 'month', 'day_in_year', 'lidar', 'max_lidar', 'min_lidar', 'min_max_diff', 'avg_lidar']


In [5]:
# Build the random forest first, then wrap it in ParallelPostFit so that the
# fitted model predicts using dask.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rfc = ParallelPostFit(base_forest)

# Fit on the prepared training arrays and persist the wrapped model to disk.
rfc.fit(X_train, y_train)
dump(rfc, 'lidar_spectral_rfc.joblib')

['lidar_spectral_rfc.joblib']