In [1]:
import os
import time
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit  # wrapper so that model predicts using dask

from joblib import dump

import model_prep_and_evals as mpe # custom module

In [2]:
# Workflow switches read by the data-loading cells below.
#   split     : True  -> load the separate train_set.csv / test_set.csv files
#               False -> load the single samples_for_model.csv file
#   whole_set : only consulted when split is True; when True the test rows
#               are appended to the training rows
split = whole_set = False

In [3]:
# Load the pre-split train/test CSVs, drop the years without their own LIDAR
# data, optionally merge test into train, and convert the training data to
# numpy arrays. Only runs when `split` is set.
if split:
    # ------------------------------
    # Import train data

    # Read the CSV once; features and labels are sliced from the same frame so
    # they are guaranteed to share one index.
    train_df = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', 'train_set.csv'))

    # select features from r (Red band) to avg_lidar // excludes geometry, aoi, naip_id, polygon_id and iceplant features
    X_train = train_df.loc[:, 'r':'avg_lidar']

    # select iceplant feature column
    y_train = train_df.loc[:, 'iceplant']

    # remove 2012 and 2014 since these do not have LIDAR data of their own.
    # The same boolean mask filters features and labels, so rows stay aligned
    # by label rather than by position.
    train_keep = ~X_train['year'].isin([2012, 2014])
    X_train = X_train.loc[train_keep]
    y_train = y_train.loc[train_keep]

    # ------------------------------
    # Import test data

    test_df = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', 'test_set.csv'))

    # select features from r (Red band) to avg_lidar // excludes geometry, aoi, naip_id, polygon_id and iceplant features
    X_test = test_df.loc[:, 'r':'avg_lidar']

    # select iceplant feature column
    y_test = test_df.loc[:, 'iceplant']

    # remove 2012 and 2014 since these do not have LIDAR data of their own.
    test_keep = ~X_test['year'].isin([2012, 2014])
    X_test = X_test.loc[test_keep]
    y_test = y_test.loc[test_keep]

    # Optionally train on train + test together.
    if whole_set:
        X_train = pd.concat([X_train, X_test], axis=0)
        y_train = pd.concat([y_train, y_test], axis=0)

    feature_names = X_train.columns.to_list()
    print(feature_names)

    # From here on X_train / y_train are plain numpy arrays, not pandas objects.
    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()

In [3]:
# Load the single, un-split sample file and drop the years without their own
# LIDAR data. Only runs when `split` is not set.
if not split:
    #------------------------------
    # Import train data

    # Read the CSV once; features and labels are sliced from the same frame so
    # they are guaranteed to share one index.
    samples_df = pd.read_csv(os.path.join(os.getcwd(), 'data_sampling_workflow', 'samples_for_model.csv'))

    # select features from r to avg_lidar
    X_train = samples_df.loc[:, 'r':'avg_lidar']
    # select iceplant feature column
    y_train = samples_df.loc[:, 'iceplant']

    # remove 2012 and 2014 since these do not have LIDAR data of their own.
    # The same boolean mask filters features and labels, so rows stay aligned
    # by label rather than by position.
    samples_keep = ~X_train['year'].isin([2012, 2014])
    X_train = X_train.loc[samples_keep]
    y_train = y_train.loc[samples_keep]

In [8]:
# Sanity check: show every training row whose min_lidar value is negative.
X_train[X_train['min_lidar'] < 0]

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar


In [9]:
# Random forest with a fixed seed, wrapped in ParallelPostFit so that the
# fitted model predicts using dask.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
rfc = ParallelPostFit(base_forest)

# Train on the loaded samples, then persist the fitted wrapper to disk.
rfc.fit(X_train, y_train)
dump(rfc, 'lidar_spectral_rfc_with_negatives.joblib')

['lidar_spectral_rfc_with_negatives.joblib']

In [3]:
# Load train_set.csv from the current working directory, keeping every column.
old_train_path = os.path.join(os.getcwd(), 'train_set.csv')
old_train = pd.read_csv(old_train_path)

In [5]:
# Show every row of old_train whose min_lidar value is negative.
old_train[old_train['min_lidar'] < 0]

Unnamed: 0,r,g,b,nir,ndvi,year,month,day_in_year,lidar,max_lidar,min_lidar,min_max_diff,avg_lidar,geometry,aoi,naip_id,polygon_id,iceplant
91166,85,93,75,158,0.300412,2020,5,142,1,2,-15,17,0.0,POINT (236811.7432546652 3811265.620213558),campus_lagoon,ca_m_3411934_sw_11_060_20200521,11,1
91175,69,83,69,145,0.355140,2020,5,142,1,2,-15,17,0.0,POINT (236806.67447600968 3811261.2598478217),campus_lagoon,ca_m_3411934_sw_11_060_20200521,11,1
91180,111,111,93,159,0.177778,2020,5,142,1,2,-15,17,0.0,POINT (236824.47653539397 3811273.0784437936),campus_lagoon,ca_m_3411934_sw_11_060_20200521,11,1
91191,66,85,70,156,0.405405,2020,5,142,1,2,-15,17,0.0,POINT (236804.0331534393 3811260.775524029),campus_lagoon,ca_m_3411934_sw_11_060_20200521,11,1
91346,71,88,82,142,0.333333,2020,5,142,1,2,-15,17,0.0,POINT (236816.18901807428 3811262.5072553656),campus_lagoon,ca_m_3411934_sw_11_060_20200521,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394596,123,95,77,154,0.111913,2020,6,159,0,2,-15,17,0.0,POINT (733829.4174302175 3814253.439915656),point_conception,ca_m_3412037_nw_10_060_20200607,8,1
395281,102,85,69,151,0.193676,2020,6,159,1,4,-15,19,0.0,POINT (734000.1685230015 3814170.5711912974),point_conception,ca_m_3412037_nw_10_060_20200607,9,1
395421,100,91,71,151,0.203187,2020,6,159,3,3,-15,18,0.0,POINT (734001.9113857687 3814163.230243757),point_conception,ca_m_3412037_nw_10_060_20200607,9,1
395579,82,75,64,159,0.319502,2020,6,159,0,3,-15,18,0.0,POINT (734022.109994212 3814159.5868103337),point_conception,ca_m_3412037_nw_10_060_20200607,9,1
