In [183]:
from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression of a cell, not only the last one; several cells
# below list more than one bare expression and rely on this to show all of them.
InteractiveShell.ast_node_interactivity = 'all'

In [184]:
import pickle
from pprint import pprint as pp

import numpy as np
import pandas as pd

In [185]:
import matplotlib.pyplot as plt
import seaborn as sns

In [186]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [187]:
from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

In [188]:
from pycaret.classification import *

# CONTENTS
### preamble   
- logical to control dataset size for modeling
- pickle file name   
   
### read wrangled data from pickle
- define lists of column names  
   
### function to build feature columns list
### select columns
### option to use small sample size for testing
### column selection for fitting model
### `imblearn` preparation
### `pycaret` preparation and `setup`
### fit tuned decision tree model with all data
- investigate finalized model performance   
   
### save model

# preamble
### logical to control dataset size for modeling
##### `True` gives very small dataset for testing notebook
##### `False` uses entire dataset

In [189]:
#use_small_data_set = True
use_small_data_set = False

### pickle file name

In [190]:
# Where the notebook runs decides which pickle is read and where tuning results go.
running_on_local_machine = True
#running_on_local_machine = False

# (pickle path, tuning-results directory) for the selected environment
pickle_file_name, tuning_results_dir = (
    ('data/wrangled_data.pkl', 'data/')
    if running_on_local_machine
    else ('/storage/wrangled_data_update_soil.pkl', 'tuning_results/')
)

# read wrangled data from pickle

In [191]:
# Load the wrangled data set from the pickle chosen in the preamble.
# NOTE: unpickling can execute arbitrary code - only read pickle files from a trusted source.
df_data = pd.read_pickle(pickle_file_name)

In [192]:
df_data.shape
df_data.columns
df_data.head()

(581012, 71)

Index(['elevation', 'aspect', 'slope', 'HD_hydrology', 'VD_hydrology',
       'HD_roadways', 'hillshade_9am', 'hillshade_noon', 'hillshade_3pm',
       'HD_fire_points', 'wild_area_rawah', 'wild_area_neota',
       'wild_area_comanche_peak', 'wild_area_cache_la_poudre', 'soil_type_1',
       'soil_type_2', 'soil_type_3', 'soil_type_4', 'soil_type_5',
       'soil_type_6', 'soil_type_7', 'soil_type_8', 'soil_type_9',
       'soil_type_10', 'soil_type_11', 'soil_type_12', 'soil_type_13',
       'soil_type_14', 'soil_type_15', 'soil_type_16', 'soil_type_17',
       'soil_type_18', 'soil_type_19', 'soil_type_20', 'soil_type_21',
       'soil_type_22', 'soil_type_23', 'soil_type_24', 'soil_type_25',
       'soil_type_26', 'soil_type_27', 'soil_type_28', 'soil_type_29',
       'soil_type_30', 'soil_type_31', 'soil_type_32', 'soil_type_33',
       'soil_type_34', 'soil_type_35', 'soil_type_36', 'soil_type_37',
       'soil_type_38', 'soil_type_39', 'soil_type_40', 'cover_type',
       'wilder

Unnamed: 0,elevation,aspect,slope,HD_hydrology,VD_hydrology,HD_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,HD_fire_points,...,mms_elevation,mms_aspect,mms_slope,mms_HD_hydrology,mms_VD_hydrology,mms_HD_roadways,mms_hillshade_9am,mms_hillshade_noon,mms_hillshade_3pm,mms_HD_fire_points
0,2596,51,3,258,0,510,221,232,148,6279,...,0.368684,0.141667,0.045455,0.184681,0.223514,0.071659,0.870079,0.913386,0.582677,0.875366
1,2590,56,2,212,-6,390,220,235,151,6225,...,0.365683,0.155556,0.030303,0.151754,0.215762,0.054798,0.866142,0.925197,0.594488,0.867838
2,2804,139,9,268,65,3180,234,238,135,6121,...,0.472736,0.386111,0.136364,0.19184,0.307494,0.446817,0.92126,0.937008,0.531496,0.853339
3,2785,155,18,242,118,3090,238,238,122,6211,...,0.463232,0.430556,0.272727,0.173228,0.375969,0.434172,0.937008,0.937008,0.480315,0.865886
4,2595,45,2,153,-1,391,220,234,150,6172,...,0.368184,0.125,0.030303,0.10952,0.222222,0.054939,0.866142,0.92126,0.590551,0.860449


### define lists of column names

In [193]:
# Column-name groups of the wrangled data set. Names that share a prefix are
# generated from that prefix instead of being typed out.
areas = ['rawah', 'neota', 'comanche_peak', 'cache_la_poudre']
wild_area_cols = ['wild_area_' + area for area in areas]

# soil_type_1 ... soil_type_40
soil_type_cols = ['soil_type_' + str(number) for number in range(1, 41)]

numerical_cols = ['elevation', 'aspect', 'slope', 'HD_hydrology', 'VD_hydrology', 'HD_roadways',
                  'hillshade_9am', 'hillshade_noon', 'hillshade_3pm', 'HD_fire_points']

# Each numerical column has a counterpart carrying the 'mms_' prefix.
scaled_numerical_cols = ['mms_' + name for name in numerical_cols]

target_col = ['cover_type']

target_names = ['spruce_fir', 'lodgepole_pine', 'ponderosa_pine', 'cottonwood_willow', 'aspen', 'douglas_fir', 'krummholz']

reverse_one_hot_cols = ['wilderness_index', 'wilderness_area', 'soil_index']

soil_cluster_cols = ['climatic_zone', 'geologic_zone', 'both_zones']

# Echo every group, labelled with its variable name. `pp` is pprint.pprint,
# imported explicitly in the notebook's import cell so a fresh kernel can run this.
column_groups = {
    'numerical_cols': numerical_cols,
    'scaled_numerical_cols': scaled_numerical_cols,
    'wild_area_cols': wild_area_cols,
    'soil_type_cols': soil_type_cols,
    'soil_cluster_cols': soil_cluster_cols,
    'reverse_one_hot_cols': reverse_one_hot_cols,
    'target_col': target_col,
    'target_names': target_names,
}
for position, (group_name, names) in enumerate(column_groups.items()):
    # a blank line separates every group from the previous one
    print(('\n' if position else '') + group_name + ':')
    pp(names)

numerical_cols:
['elevation',
 'aspect',
 'slope',
 'HD_hydrology',
 'VD_hydrology',
 'HD_roadways',
 'hillshade_9am',
 'hillshade_noon',
 'hillshade_3pm',
 'HD_fire_points']

scaled_numerical_cols:
['mms_elevation',
 'mms_aspect',
 'mms_slope',
 'mms_HD_hydrology',
 'mms_VD_hydrology',
 'mms_HD_roadways',
 'mms_hillshade_9am',
 'mms_hillshade_noon',
 'mms_hillshade_3pm',
 'mms_HD_fire_points']

wild_area_cols:
['wild_area_rawah',
 'wild_area_neota',
 'wild_area_comanche_peak',
 'wild_area_cache_la_poudre']

soil_type_cols:
['soil_type_1',
 'soil_type_2',
 'soil_type_3',
 'soil_type_4',
 'soil_type_5',
 'soil_type_6',
 'soil_type_7',
 'soil_type_8',
 'soil_type_9',
 'soil_type_10',
 'soil_type_11',
 'soil_type_12',
 'soil_type_13',
 'soil_type_14',
 'soil_type_15',
 'soil_type_16',
 'soil_type_17',
 'soil_type_18',
 'soil_type_19',
 'soil_type_20',
 'soil_type_21',
 'soil_type_22',
 'soil_type_23',
 'soil_type_24',
 'soil_type_25',
 'soil_type_26',
 'soil_type_27',
 'soil_type_28',
 's

In [194]:
len(df_data.columns)

71

In [195]:
# The column-name lists together should account for every column of df_data.
# Fail loudly on a mismatch instead of relying on comparing two printed counts by eye.
all_listed_cols = (numerical_cols + scaled_numerical_cols + wild_area_cols
                   + soil_type_cols + reverse_one_hot_cols + target_col + soil_cluster_cols)
assert len(all_listed_cols) == len(df_data.columns), 'column-name lists do not match df_data.columns'
len(all_listed_cols)

71

# function to build feature columns list

In [196]:
def make_feature_columns_list(numerical_scaled, wild_area_one_hot, soil_type_one_hot, soil_type_cluster='none'):
    """Build the list of feature column names for one encoding choice.

    The result is a new list assembled from the notebook-level name lists
    (``numerical_cols``, ``scaled_numerical_cols``, ``wild_area_cols``,
    ``soil_type_cols``, ``reverse_one_hot_cols``, ``soil_cluster_cols``);
    none of those lists is modified.

    Parameters
    ----------
    numerical_scaled : bool
        True selects ``scaled_numerical_cols``, False selects ``numerical_cols``.
    wild_area_one_hot : bool
        True selects the one-hot ``wild_area_cols``; False selects the single
        column ``reverse_one_hot_cols[0]``.
    soil_type_one_hot : bool
        True selects the one-hot ``soil_type_cols``; ``soil_type_cluster`` is
        then ignored.
    soil_type_cluster : {'none', 'climatic', 'geologic', 'both'}
        Only used when ``soil_type_one_hot`` is False. 'none' selects
        ``reverse_one_hot_cols[2]``; the other values select the matching
        entry of ``soil_cluster_cols``.

    Returns
    -------
    list of str
        Numerical columns first, then the wilderness column(s), then the soil column(s).

    Raises
    ------
    ValueError
        If ``soil_type_one_hot`` is False and ``soil_type_cluster`` is not one
        of the accepted values. (Previously this only printed a message and
        returned a list that silently lacked any soil feature.)
    """
    # list(...) copies, so the module-level lists are never mutated
    feature_cols_list = list(scaled_numerical_cols if numerical_scaled else numerical_cols)

    if wild_area_one_hot:
        feature_cols_list.extend(wild_area_cols)
    else:
        feature_cols_list.append(reverse_one_hot_cols[0])

    if soil_type_one_hot:
        feature_cols_list.extend(soil_type_cols)
    else:
        soil_col_by_cluster = {
            'none': reverse_one_hot_cols[2],
            'climatic': soil_cluster_cols[0],
            'geologic': soil_cluster_cols[1],
            'both': soil_cluster_cols[2],
        }
        if soil_type_cluster not in soil_col_by_cluster:
            raise ValueError(
                f"soil_type_cluster must be one of {sorted(soil_col_by_cluster)}, "
                f"got {soil_type_cluster!r}"
            )
        feature_cols_list.append(soil_col_by_cluster[soil_type_cluster])

    return feature_cols_list

In [197]:
# Example call: scaled numerical columns, the single wilderness index column
# (not one-hot) and the combined soil-zone column.
temp_list = make_feature_columns_list(numerical_scaled=True, wild_area_one_hot=False,
                                      soil_type_one_hot=False, soil_type_cluster='both')
pp(temp_list)

['mms_elevation',
 'mms_aspect',
 'mms_slope',
 'mms_HD_hydrology',
 'mms_VD_hydrology',
 'mms_HD_roadways',
 'mms_hillshade_9am',
 'mms_hillshade_noon',
 'mms_hillshade_3pm',
 'mms_HD_fire_points',
 'wilderness_index',
 'both_zones']


# select columns

In [198]:
# Superset of candidate features: unscaled numerical columns, the wilderness and
# soil index columns, followed by all three soil-zone cluster columns.
all_feature_options = dict(numerical_scaled=False, wild_area_one_hot=False,
                           soil_type_one_hot=False, soil_type_cluster='none')
feature_cols_all = [*make_feature_columns_list(**all_feature_options), *soil_cluster_cols]
feature_cols_all

['elevation',
 'aspect',
 'slope',
 'HD_hydrology',
 'VD_hydrology',
 'HD_roadways',
 'hillshade_9am',
 'hillshade_noon',
 'hillshade_3pm',
 'HD_fire_points',
 'wilderness_index',
 'soil_index',
 'climatic_zone',
 'geologic_zone',
 'both_zones']

# option to use small sample size for testing

In [199]:
# Pick the rows used for modelling: either a small random sample (quick
# end-to-end test of the notebook) or the complete data set.
if use_small_data_set:
    train_sample_size = 10000
    test_sample_size  = 3000

    X_train, X_test, y_train, y_test = train_test_split(df_data[feature_cols_all], df_data[target_col[0]],
                                       train_size=train_sample_size, test_size=test_sample_size, random_state=59)
    # Re-attach the target to each part and stack the two parts row-wise.
    # pd.concat replaces DataFrame.append, which is deprecated and no longer exists in pandas 2.x.
    df_data_to_use = pd.concat([pd.concat([X_train, y_train], axis=1),
                                pd.concat([X_test, y_test], axis=1)])
else:
    df_data_to_use = df_data[feature_cols_all + target_col]

In [200]:
df_data_to_use.shape
df_data_to_use.head()

(581012, 16)

Unnamed: 0,elevation,aspect,slope,HD_hydrology,VD_hydrology,HD_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,HD_fire_points,wilderness_index,soil_index,climatic_zone,geologic_zone,both_zones,cover_type
0,2596,51,3,258,0,510,221,232,148,6279,0,29,7,7,77,5
1,2590,56,2,212,-6,390,220,235,151,6225,0,29,7,7,77,5
2,2804,139,9,268,65,3180,234,238,135,6121,0,12,4,7,47,2
3,2785,155,18,242,118,3090,238,238,122,6211,0,30,7,7,77,2
4,2595,45,2,153,-1,391,220,234,150,6172,0,29,7,7,77,5


# column selection for fitting model
### numeric; index wild; geologic soil

In [201]:
# Feature set for this model: unscaled numerical columns, the wilderness index
# column and the geologic soil-zone column.
features_to_use = make_feature_columns_list(numerical_scaled=False, wild_area_one_hot=False,
                                            soil_type_one_hot=False, soil_type_cluster='geologic')

# With both one-hot options off, make_feature_columns_list appends exactly one
# wilderness column and one soil column last, so the final two entries are the
# categorical features. This slice must change if a one-hot option is switched on.
features_cat = features_to_use[-2:]
cols_to_use = features_to_use + target_col
# Frame handed to pycaret's setup: selected features plus the target.
df_su = df_data_to_use[cols_to_use]

cols_to_use
features_cat

['elevation',
 'aspect',
 'slope',
 'HD_hydrology',
 'VD_hydrology',
 'HD_roadways',
 'hillshade_9am',
 'hillshade_noon',
 'hillshade_3pm',
 'HD_fire_points',
 'wilderness_index',
 'geologic_zone',
 'cover_type']

['wilderness_index', 'geologic_zone']

# `imblearn` preparation
### `pycaret` encodes target classes as `0 to 6` before using `imbalanced_learn`
### `pycaret` one-hot encodes `wilderness_index` before using `imbalanced_learn`   
### `pycaret` applies `imbalanced_learn` to training set (I believe)

In [202]:
# THIS CELL FOR GEOLOGIC ENCODING
# Categorical column names passed to pycaret's setup (same list object as features_cat, not a copy).
features_cat_pycaret_pre_process = features_cat
# Positional indices of the categorical columns, passed to SMOTENC as categorical_features.
# NOTE(review): presumably positions 0-9 are the ten numerical features and 10-17 are the
# one-hot columns pycaret derives from wilderness_index and geologic_zone - confirm against
# the transformed training matrix; these indices are hard-coded for this encoding only.
features_cat_imblearn = [10, 11, 12, 13, 14, 15, 16, 17]
features_cat_pycaret_pre_process
features_cat_imblearn

['wilderness_index', 'geologic_zone']

[10, 11, 12, 13, 14, 15, 16, 17]

In [203]:
def create_dict_target_count(orig_counts, dict_factors, train_fraction):
    """Translate per-class multipliers into per-class sample-count targets.

    For every ``label -> multiplier`` pair in ``dict_factors`` the original
    count is looked up as ``orig_counts.loc[label].values[0]``, multiplied by
    the multiplier and truncated to an int. Classes whose multiplier differs
    from 1 are additionally scaled by ``train_fraction`` (truncated again) and
    recorded in the list of classes to clean.

    Labels are shifted down by one in both return values (label 1 becomes key 0).

    NOTE(review): counts of classes with multiplier 1 are NOT scaled by
    ``train_fraction`` - confirm this is intended if the targets are applied
    to the training split only.

    Returns
    -------
    (dict, list)
        ``{label - 1: target count}`` and the ``label - 1`` values of all
        classes whose multiplier is not 1.
    """
    resampled_counts = {}
    classes_to_clean = []

    for label, multiplier in dict_factors.items():
        encoded_label = label - 1
        n_target = int(orig_counts.loc[label].values[0] * multiplier)

        if multiplier != 1:
            # boosted classes are expressed relative to the training fraction
            n_target = int(n_target * train_fraction)
            classes_to_clean.append(encoded_label)

        resampled_counts[encoded_label] = n_target

    return resampled_counts, classes_to_clean

In [204]:
# Number of rows per cover_type label, ordered by label. target_col is a list, so the
# selection is a one-column DataFrame; create_dict_target_count reads each
# count from this result with .loc[label].values[0].
target_counts_to_use = df_data_to_use[target_col].value_counts().sort_index()
target_counts_to_use

cover_type
1             211840
2             283301
3              35754
4               2747
5               9493
6              17367
7              20510
dtype: int64

In [205]:
#target_factors = [1, 1, 3.04, 3.14, 3.06, 3.05, 3.02]
# Multiplier per cover_type label (keys are the 1-based labels). Classes with factor 1 keep
# their full count from target_counts_to_use; the others are boosted and then scaled by train_fraction.
dict_factors = {1:1, 2:1, 3:3.04, 4:3.14, 5:3.06, 6:3.05, 7:3.02}
# Same value as the train_size given to pycaret's setup further down; keep the two in step.
train_fraction = 0.7
# dict_target_counts: zero-based label -> target sample count (used as SMOTENC sampling_strategy)
# list_classes_clean: zero-based labels with factor != 1 (used as EditedNearestNeighbours sampling_strategy)
dict_target_counts, list_classes_clean = create_dict_target_count(target_counts_to_use, dict_factors, train_fraction)
dict_target_counts
list_classes_clean

{0: 211840, 1: 283301, 2: 76084, 3: 6037, 4: 20333, 5: 37078, 6: 43358}

[2, 3, 4, 5, 6]

In [206]:
# Over-sampler: SMOTENC receives the categorical column positions and the
# per-class target counts computed above.
the_smote_nc = SMOTENC(categorical_features=features_cat_imblearn, random_state=59,
                       sampling_strategy=dict_target_counts)

# Cleaning step: EditedNearestNeighbours restricted to the classes in
# list_classes_clean, i.e. the classes whose factor is not 1.
the_enn = EditedNearestNeighbours(sampling_strategy=list_classes_clean,
                                  n_neighbors=3, kind_sel='mode')

# Combined resampler; this object is passed to pycaret's setup as fix_imbalance_method.
the_smote = SMOTEENN(random_state=59, smote=the_smote_nc, enn=the_enn)

# `pycaret` preparation and `setup`
### `silent = True` to suppress interactive confirmation of data types

In [207]:
# Switches forwarded to pycaret's setup() in the following cell. In each pair the
# commented-out line is the alternative value.

# -> setup(log_profile=...)
#su_log_profile = True
su_log_profile = False

# -> setup(silent=...)
su_silent = True
#su_silent = False

# -> setup(log_experiment=...)
#su_log_exp = True
su_log_exp = False

# -> setup(use_gpu=...); the alternative enables the GPU whenever not running locally
su_use_gpu = False
#su_use_gpu = not running_on_local_machine
print('su_use_gpu = ', su_use_gpu)

# -> setup(experiment_name=...)
su_name_exp = 'tune decision tree'

su_use_gpu =  False


In [208]:
# Configure the pycaret experiment on df_su: min-max normalisation, the custom
# SMOTENC + ENN resampler for class imbalance, 3 folds and seed 59.
su_grand     = setup(data = df_su, target = target_col[0],
                     categorical_features = features_cat_pycaret_pre_process,
                     normalize = True, normalize_method = 'minmax',
                     fix_imbalance = True, fix_imbalance_method = the_smote,
                     # train_fraction instead of a second 0.7 literal: the split size must equal
                     # the fraction that was used to scale the resampling target counts
                     train_size = train_fraction, fold = 3, session_id = 59, use_gpu = su_use_gpu,
                     log_experiment = su_log_exp, experiment_name = su_name_exp, log_profile = su_log_profile,
                     #profile = True
                     silent = su_silent
                     )

Unnamed: 0,Description,Value
0,session_id,59
1,Target,cover_type
2,Target Type,Multiclass
3,Label Encoded,"1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6"
4,Original Data,"(581012, 13)"
5,Missing Values,False
6,Numeric Features,10
7,Categorical Features,2
8,Ordinal Features,False
9,High Cardinality Features,False


# fit tuned decision tree model with all data
https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier

In [209]:
# Decision tree with fixed hyper-parameters (presumably the result of an earlier tuning run).
# random_state is pinned because scikit-learn permutes the features at every split even when
# all of them are considered, so an unseeded tree can differ between runs; 59 is the seed
# used by every other stochastic step in this notebook.
dt_with_tuned_parameters = DecisionTreeClassifier(max_depth=20, min_samples_leaf=4, min_samples_split=6,
                                                  criterion='entropy', max_features=1.0,
                                                  random_state=59)

In [None]:
# Fit the configured tree through pycaret with cross-validation switched off.
# NOTE(review): the section heading says "all data" and the variable is called finalized_dt,
# but no finalize_model call is visible; presumably create_model fits on the training split
# only - confirm whether finalize_model was intended before the model is saved.
finalized_dt = create_model(estimator=dt_with_tuned_parameters, cross_validation=False)

### investigate finalized model performance

In [None]:
plot_model(finalized_dt, plot='error')

In [None]:
plot_model(finalized_dt, plot='class_report')

# save model

In [None]:
save_model(finalized_dt, 'finalized_dt_model')

In [None]:
saved_finalized_model_dt = load_model('finalized_dt_model')

In [None]:
type(saved_finalized_model_dt)
saved_finalized_model_dt