## Fine-tune the System - Decision Tree Classifier

#### 0. Import modules and define parameters

In [10]:
import sys
sys.path.append("..")

from module.utils import general_utils
from module.utils import data_prepare_utils
import module.utils.bin_class_utils as bin_class_utils

from sklearn.pipeline import Pipeline

import numpy as np
import pandas as pd
import time

In [11]:
# --- Input data locations (relative to this notebook's directory) ---
TRAIN_FILE_PATH = "../data/train_df.csv"
VALIDATION_FILE_PATH = "../data/validation_df.csv"

# --- Target definition ---
TARGET_ATTR = "label"   # column split off from the features as the target
TARGET_TYPE = "binary"  # forwarded to build_preprocessing_pipeline

# --- Estimator ---
ESTIMATOR_NAME = "DecisionTreeClassifier"  # forwarded to get_default_model

# --- Sampling and random seeds ---
SAMPLE_SIZE = 0.05  # fraction of rows kept by sample_data
SAMPLE_RANDOM_STATE = 24
TARGET_ENCODING_RANDOM_STATE = 42
MODEL_RANDOM_STATE = 42
PERMUTATION_IMPORTANCE_RANDOM_STATE = 0

# Attributes handed to the preprocessing pipeline builder as columns to drop.
ATTRS_TO_DROP = [
    'task_id',
    'dev_id',
    'tags',
    'app_score',
    'uid',
    'up_life_duration',
    'inter_type_cd',
    'city',
    'consume_purchase',
    'app_first_class',
    'emui_dev',
    'gender',
    'city_rank',
    'spread_app_id',
    'his_on_shelf_time',
]

# Metrics passed to eval_class. Each scorer name appears exactly once:
# scikit-learn's multi-metric scoring requires unique names, and a repeated
# entry would otherwise only duplicate work.
CVS_SCORING_LIST = ['accuracy', 'precision', 'recall', 'f1']

# Metrics passed to check_out_permutation_importance.
PERMUTATION_SCORING_LIST = ['average_precision', 'roc_auc']

#### 1. Import data and identify attributes

In [12]:
# Load the full training set with the project's CSV helper; as the cell output
# shows, the helper also prints df.head() and df.shape.
train_df = general_utils.read_csv(TRAIN_FILE_PATH)


Read CSV file ../data/train_df.csv into DataFrame:
df.head(): 


Unnamed: 0,uid,task_id,adv_id,creat_type_cd,adv_prim_id,dev_id,inter_type_cd,slot_id,spread_app_id,tags,...,device_price,up_life_duration,up_membership_grade,membership_life_duration,consume_purchase,communication_onlinerate,communication_avgonline_30d,indu_name,pt_d,label
0,1920544,3854,3367,7,207,17,5,11,13,37,...,4,20,-1,-1,2,7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,10,17,6,0
1,1850503,3903,6370,7,173,52,5,12,69,11,...,2,-1,-1,-1,2,5^6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^...,12,50,7,0
2,2157496,1847,6428,6,178,17,5,18,70,39,...,7,20,1,-1,9,0^1^2^3^4^5^6^7^8^9^10^11^12^13^14^15^16^17^18...,13,36,7,0
3,2072043,4125,4468,7,168,37,5,12,44,40,...,2,20,-1,-1,2,6^7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,11,17,6,0
4,1084626,4811,3326,7,183,29,5,12,86,37,...,3,18,-1,-1,2,7^8^9^10^11^12^13^14^15^16^17^18^19^20^21^22^23,10,17,6,0


df.shape: (670513, 36)


In [13]:
# Keep only a SAMPLE_SIZE fraction of the training rows (seeded for repeatability).
train_df_sample = data_prepare_utils.sample_data(
    train_df,
    SAMPLE_SIZE,
    SAMPLE_RANDOM_STATE,
)

# The full frame is no longer referenced below; drop the name to release it.
del train_df


Sample 0.05 fraction from DataFrame:
sample_df.shape: (33525, 36)


In [14]:
# Features: every sampled column except the target.
train_cap_x_df = train_df_sample.drop(columns=TARGET_ATTR)

# Target: the double brackets keep it a one-column DataFrame rather than a Series.
train_y_df = train_df_sample[[TARGET_ATTR]]

In [15]:
# No attribute is declared numerical in this notebook.
numerical_attr_list = []

# Every feature column that is neither numerical nor the target is categorical.
categorical_attr_list = [
    column_name
    for column_name in train_cap_x_df.columns
    if not (column_name in numerical_attr_list or column_name == TARGET_ATTR)
]

# Combined attribute list: numerical attributes first, then categorical ones.
attr_list = [*numerical_attr_list, *categorical_attr_list]

#### 2. Build composite estimator

In [16]:
# Preprocessing stage, built by the project helper from the attribute lists,
# the columns to drop, the target type and the target-encoding seed.
preprocessor = bin_class_utils.build_preprocessing_pipeline(
    numerical_attr_list,
    categorical_attr_list,
    ATTRS_TO_DROP,
    TARGET_TYPE,
    TARGET_ENCODING_RANDOM_STATE,
)

# Untuned model looked up by name and seeded with MODEL_RANDOM_STATE.
estimator = bin_class_utils.get_default_model(
    ESTIMATOR_NAME,
    MODEL_RANDOM_STATE,
)

In [17]:
# Chain preprocessing and the model so both are fitted and applied as one estimator.
composite_estimator = Pipeline(
    steps=[("preprocessor", preprocessor), ("estimator", estimator)]
)

#### 3. Design grid search and tune hyperparameters of composite estimator

In [20]:
# Candidate class weights, derived from the training labels.
class_weight_options = bin_class_utils.balance_class_weight(
    train_y_df[TARGET_ATTR]
)

# Hyperparameter tuning of the composite estimator on the training sample.
# NOTE(review): the search itself lives in bin_class_utils.tune_hyperparameters_dt;
# the value it returns is used as the tuned model in all later sections.
best_model = bin_class_utils.tune_hyperparameters_dt(
    train_cap_x_df,
    train_y_df,
    composite_estimator,
    class_weight_options,
)

KeyboardInterrupt: 

#### 4. Evaluate hyperparameter-tuned estimator on train set

In [None]:
# Score the tuned model on the training sample with the CVS_SCORING_LIST metrics.
bin_class_utils.eval_class(
    train_cap_x_df,
    train_y_df,
    best_model,
    "train sample",
    CVS_SCORING_LIST,
)

#### 5. Evaluate hyperparameter-tuned estimator on validation set

In [None]:
# Load the full validation set with the same CSV helper used for the training data.
validation_df = general_utils.read_csv(VALIDATION_FILE_PATH)

In [None]:
# Sample the validation set with the same fraction and seed as the training set.
validation_df_sample = data_prepare_utils.sample_data(
    validation_df,
    SAMPLE_SIZE,
    SAMPLE_RANDOM_STATE,
)

# The full frame is no longer referenced below; drop the name to release it.
del validation_df

In [None]:
# Features: every sampled column except the target.
validation_cap_x_df = validation_df_sample.drop(columns=TARGET_ATTR)

# Target: the double brackets keep it a one-column DataFrame rather than a Series.
validation_y_df = validation_df_sample[[TARGET_ATTR]]

# Only the split frames are used from here on.
del validation_df_sample

In [None]:
# Score the tuned model on the validation sample with the same metrics as on train.
bin_class_utils.eval_class(
    validation_cap_x_df,
    validation_y_df,
    best_model,
    "validation sample",
    CVS_SCORING_LIST,
)

#### 6. Check out permutation feature importance

In [None]:
# Permutation importance of the tuned model on the training sample, computed by
# the project helper for each metric in PERMUTATION_SCORING_LIST.
perm_results_df = bin_class_utils.check_out_permutation_importance(
    best_model, train_cap_x_df, train_y_df,
    PERMUTATION_IMPORTANCE_RANDOM_STATE, PERMUTATION_SCORING_LIST,
)

# Display the result as the cell output.
perm_results_df

#### 7. Check for false discoveries

In [None]:
# False-discovery check over the train and validation samples.
# NOTE(review): the procedure and the meaning of num_samples are defined in
# bin_class_utils.avoiding_false_discoveries_class_helper; confirm there.
bin_class_utils.avoiding_false_discoveries_class_helper(
    best_model,
    train_cap_x_df,
    train_y_df,
    validation_cap_x_df,
    validation_y_df,
    num_samples=20,
)

#### 8. Select the best model

#### 9. Tune classification threshold - assess threshold list

In [None]:
# Candidate decision thresholds: start 0, step 0.01, stop 0.05 (stop is exclusive).
thresholds = np.arange(0, 0.05, 0.01)

# Report classification metrics for the tuned model at each candidate threshold,
# using the validation sample.
bin_class_utils.print_classification_metrics_at_thresholds(
    best_model,
    validation_cap_x_df,
    validation_y_df,
    thresholds,
)

#### 10. Get the best classification threshold

#### 11. Evaluate the best classification threshold on validation set

#### 12. Evaluate the best model and best threshold on the test set