## BINARY CLASSIFICATION NOTEBOOK

This is a notebook intended for a semi automated binary classification problem. User inputs are explicitly called out, primarily around specific null treatment, feature encoding, sampling technique, modelling evaluation function. All of them have defaults, so only the mandatory inputs are listed here

***

### Contents
1. Pre-processing
2. Feature Engineering
3. Feature Selection
4. Modelling
    - xgboost
    - lightgbm
    - randomforest
    - simple logit
    - SVC
    - 1class methods
    - h2o AUTOML framework
    
***
- *Author* : VARUN V
- *Language* : python
- *Date* : 17-08-2017
***

In [8]:
## importing the relevant packages:

# clear the workspace
%reset -f

# print list of files in directory
import os
print(os.listdir())

# print/display all plots inline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# the base packages
import collections # for the Counter function
import csv # for reading/writing csv files
import pandas as pd, numpy as np, time, gc, bisect

# the various packages/modules used across processing (sklearn), modelling (lightgbm) and bayesian optimization (hyperopt, bayes_opt)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing, decomposition
from sklearn.cross_validation import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.base import TransformerMixin
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
import category_encoders as ce

from bayes_opt import BayesianOptimization
from tqdm import tqdm
from hyperopt import hp, tpe, STATUS_OK, fmin, Trials
from hyperopt.fmin import fmin
from hyperopt.pyll.stochastic import sample

# modelling algorithms
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier

# Evaluation of the model
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from timeit import default_timer as timer

# Exporting packages for SHAP/LIME
import shap
import lime
import lime.lime_tabular

# missing value imputation
from fancyimpute import KNN, MICE #, NuclearNormMinimization

# define the global variables used later
MAX_EVALS = 2 # number of iterations/parameter sets created towards tuning (all hyper-opt frameworks)
N_FOLDS = 5 # number of cv folds
randomseed = 1 # the value for the random state used at various points in the pipeline

['.ipynb_checkpoints', 'FeatureEncoder.py', 'FEATURE_ENGINEERING.ipynb', 'MODEL_SELECTION_TUNING_TEST_2017Dec_model3.ipynb', 'OTHER_MODELS.ipynb', 'ULTIMATE.ipynb']


In [None]:
x=pd.read_csv('')

In [192]:
x1=pd.read_csv('train_final.csv', na_values=['No Data', ' ', 'UNKNOWN'])
x2=pd.read_csv('test_final.csv', na_values=['No Data', ' ', 'UNKNOWN'])
x3=pd.read_csv('valid_final.csv', na_values=['No Data', ' ', 'UNKNOWN'])

print(x1.shape)
print(x2.shape)
print(x3.shape)

chars_to_remove = [' ', '.', '(', ')', '__', '-']
for i in chars_to_remove:
    x1.columns = x1.columns.str.strip().str.lower().str.replace(i, '_')
    x2.columns = x2.columns.str.strip().str.lower().str.replace(i, '_')
    x3.columns = x3.columns.str.strip().str.lower().str.replace(i, '_')

(6102, 92)
(5897, 92)
(5734, 92)


In [221]:
def num_missing(self):
    return sum(self.isnull())
x11=x1.apply(num_missing, axis=0)
x21=x2.apply(num_missing, axis=0)
x31=x3.apply(num_missing, axis=0)
missing = pd.DataFrame({'dec16': x11, 'june17': x21, 'dec17': x31}) # display count of NAs per column
missing['dec16_perc'] = 100*(missing['dec16']/x1.shape[0])
missing['june17_perc'] = 100*(missing['june17']/x2.shape[0])
missing['dec17_perc'] = 100*(missing['dec17']/x3.shape[0])

missing.index.name = 'columns'
missing.reset_index(inplace=True)

missing.to_csv('NAZ_missing_distribution.csv', index=False)

In [7]:
#### MAIN CLASSES ####

## Two defined for now ##
# 1. DataFrame Imputer
#    - for imputing missing values
# 2. Prepare Data
#    - for sourcing, processing, and returning the train/test datasets

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.
        Columns of dtype object are imputed with the most frequent value 
        in column.
        Columns of other types are imputed with mean of column.
        """
        
    def fit(self, X, y=None):
        """
        specify columns to be grouped by before imputation. use only for specific cases when user knows a specific level
        at which a column can be grouped at for missing value treatment
        
        X.groupby(['...'])
        """
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X], 
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
    
    def num_missing(self):
        return sum(self.isnull())
    
    def imputer_method(self, column, method=['mean', 'median', 'most_frequent']):
        x = Imputer(missing_values = 'NaN', strategy = method, axis = 0)
        return x.fit_transform(self[[column]]).ravel()
    
    def fancy_impute(X, which_method):
        """ currently supported algorithms are KNN, NNM and MICE from the fancyimpute package
        which_method = ['KNN', 'NNM', 'MICE']
        """
        if which_method == 'NNM': X = NuclearNormMinimization().complete(X) # NNM method
        if which_method == 'KNN': X = KNN(k=7, verbose=False).complete(X) # KNN method
        if which_method == 'MICE':
            X_complete_df = X.copy()
            mice = MICE(verbose=False)
            X_complete = mice.complete(np.asarray(X.values, dtype=float))
            X_complete_df.loc[:, X.columns] = X_complete[:][:]
            X = X_complete_df
        return X

class prepare_data():
    
    def __init__(self):
        """ To prepare data,
                1. read in data
                2. pre-processing/cleaning
                3. creating helper objects for later steps
                4. processing for modelling
                5. function return objects are the train, valid, response, categ cols/indices, feature names
        """
    
    def labelEncoder(train_df, valid_df, cat_columns, categorical_names):
        for feature in tqdm(cat_columns):
            le = preprocessing.LabelEncoder()
            le.fit(train_df[feature].astype(str))
            train_df[feature] = le.transform(train_df[feature].astype(str))
            valid_df[feature] = valid_df[feature].map(lambda i: 'No Data' if i not in le.classes_ else i)
            le_classes = le.classes_.tolist()
            bisect.insort_left(le_classes, 'No Data')
            le.classes_ = le_classes
            valid_df[feature] = le.transform(valid_df[feature].astype(str))
            categorical_names[feature] = le.classes_
            return train_df, valid_df, categorical_names
        
    def ce_encodings(train_df, valid_df, y_train, y_valid, cat_columns, encoding):
        if encoding=='bne':
            enc1=ce.BaseNEncoder(cat_columns, base=3)
            enc1.fit(train_df)
            train_df=enc1.transform(train_df)
            valid_df=enc1.transform(valid_df)
        elif encoding=='be':
            enc2=ce.BinaryEncoder(cat_columns)
            enc2.fit(train_df)
            train_df=enc2.transform(train_df)
            valid_df=enc2.transform(valid_df)
        elif encoding=='he':
            enc3=ce.HashingEncoder(cat_columns)
            enc3.fit(train_df, y_train)
            train_df=enc3.transform(train_df)
            valid_df=enc3.transform(valid_df)
        elif encoding=='oe':
            enc4=ce.OrdinalEncoder(cat_columns)
            enc4.fit(train_df, y_train)
            train_df=enc4.transform(train_df)
            valid_df=enc4.transform(valid_df)
        return train_df, valid_df
    
    ## function to get frequency count of elements in a vector/list
    def freq_count(input_vector):
        return collections.Counter(input_vector)
    
    def categ_feats(train_df, valid_df, y_train, y_valid, encoding='le'):
        x = list(train_df.dtypes)
        x_1 = [1 if x == 'O' else 0 for x in x]
        categorical_idx = [i for i, x in enumerate(x_1) if x == 1]
        # Get feature names and their values for categorical data (needed for LIME)
        cat_columns = train_df.select_dtypes(include=['object']).columns.values
        categorical_names = {}

        if encoding=='le':
            train_df, valid_df, categorical_names = prepare_data.labelEncoder(train_df, valid_df, cat_columns, categorical_names)
        elif encoding in ['be', 'bne', 'he', 'oe']:
            train_df, valid_df = prepare_data.ce_encodings(train_df, valid_df, y_train, y_valid, cat_columns, encoding)
        else :
            print('Not supported. Use one of [be, bne, he, oe]')
        return train_df, valid_df, categorical_names, categorical_idx

    def create(input_file_path, input_file_path_2, response, cols_to_remove = ['id'], random_seed = 1,
                            encoding = 'le'):
        train = pd.read_csv(input_file_path, na_values=['No Data', ' ', 'UNKNOWN'])
        test = pd.read_csv(input_file_path_2, na_values=['No Data', ' ', 'UNKNOWN'])
        
        train.drop(cols_to_remove, axis = 1, inplace = True)
        test = pd.DataFrame(data = test[train.columns])
        
        chars_to_remove = [' ', '.', '(', ')', '__', '-']
        for i in chars_to_remove:
            train.columns = train.columns.str.strip().str.lower().str.replace(i, '_')
            test.columns = test.columns.str.strip().str.lower().str.replace(i, '_')
            
        print(train.shape, '\n')
        train.dropna(thresh=0.5*(train.shape[0]), axis=1, inplace = True)
        train.dropna(thresh=0.4*(train.shape[1]), axis=0, inplace = True)
        print(train.shape, '\n')
        test = test[train.columns]
        test.dropna(thresh=0.5*(test.shape[0]), axis=1, inplace = True)
        train = train[test.columns]
        
        print(prepare_data.freq_count(train[response]), '\n')

        # shuffle the dataframes so that the training is done in a random order.
        train = shuffle(train)
        test = shuffle(test)
        
        # creating the response vector
        y_train = train[response].values
        X_train = train.drop([response], axis = 1)
        y_valid = test[response].values
        X_valid = test.drop([response], axis = 1)
        
        X_train = pd.DataFrame(X_train)
        X_valid = pd.DataFrame(X_valid)
        
        X_train, X_valid, categ_names, categ_idx = prepare_data.categ_feats(X_train, X_valid, y_train, y_valid, encoding)
        
        feat_names = X_train.columns.values
        feat_names2 = X_valid.columns.values
        
        X_train = DataFrameImputer.fancy_impute(X_train, which_method='MICE')
        X_valid = DataFrameImputer.fancy_impute(X_valid, which_method='MICE')
        
        # returning as pandas dataframes to retain feature names for LIME and feature importance plots
        X_train = pd.DataFrame(data=X_train, columns=feat_names)
        X_valid = pd.DataFrame(data=X_valid, columns=feat_names2)
        
        return X_train, X_valid, y_train, y_valid, categ_names, categ_idx, feat_names

In [189]:
# create data function call
# CV approach
# train and valid features/response dataframes returned
# categorical column names/indices and all feature names also returned

X_train, X_valid, y_train, y_valid, categ_names, categ_idx, feat_names = prepare_data.create(input_file_path='../input/adult.data',
                                                                  input_file_path_2='../input/adult.test.csv', response = 'label',
                                cols_to_remove = [], encoding = 'bne')

(6102, 77) 

(6085, 75) 

Counter({0: 5796, 1: 289}) 

['pay_scale_group_0' 'pay_scale_group_1' 'pay_scale_group_2'
 'pay_scale_group_3' 'gender_0' 'gender_1' 'abinbev_entity2_0'
 'abinbev_entity2_1' 'abinbev_entity2_2' 'abinbev_entity2_3'
 'macro_entity4_0' 'macro_entity4_1' 'macro_entity4_2' 'macro_entity4_3'
 'macro_entity4_4' 'physical_work_location_code_0'
 'physical_work_location_code_1' 'physical_work_location_code_2'
 'physical_work_location_code_3' 'physical_work_location_code_4'
 'physical_work_location_code_5' 'global_job_0' 'global_job_1'
 'global_job_2' 'global_job_3' 'global_job_4' 'global_job_5'
 'global_job_6' 'job_family_0' 'job_family_1' 'job_family_2'
 'job_family_3' 'job_family_4' 'job_family_5' 'job_family_6'
 'functional_area_0' 'functional_area_1' 'functional_area_2'
 'functional_area_3' 'functional_area_4' 'functional_area_5'
 'cost_center_0' 'cost_center_1' 'cost_center_2' 'cost_center_3'
 'cost_center_4' 'cost_center_5' 'cost_center_6' 'macro_entity_type_0'
 '

ValueError: Input matrix is not missing any values

In [155]:
print(X_train.shape)
#print(X_test.shape)
print(X_valid.shape)

print(y_train.shape)
#print(y_test.shape)
print(y_valid.shape)

print(collections.Counter(y_train))
#print(collections.Counter(y_test))
print(collections.Counter(y_valid))

(6085, 74)
(5734, 74)
(6085,)
(5734,)
Counter({0: 5796, 1: 289})
Counter({0: 5453, 1: 281})


# FEATURE ENGINEERING MODULE

1. PCA
2. ICA
3. tSVD
4. GRP
5. SRP
6. Binning
7. Deviation Encoding features
8. Salary related features
9. ...

In [158]:
class feat_eng():
    
    def __init__():
        """ this module contains several functions for creating new features. find below a brief description of each """
    
    def scalers(train, valid, which_method):
        if which_method == 'ss':
            sc = StandardScaler()
            sc.fit(train)
            train = pd.DataFrame(sc.transform(train))
            valid = pd.DataFrame(sc.transform(valid))
            return train, valid # scale all variables to zero mean and unit variance, required for PCA and related
        if which_method == 'mm':
            mm = MinMaxScaler()
            mm.fit(train)
            train = pd.DataFrame(mm.transform(train))
            valid = pd.DataFrame(mm.transform(valid))
            return train, valid # use this method to iterate
        
    def pca_feats(train, valid, n = .95):
            train, valid = feat_eng.scalers(train, valid, which_method='ss')
            pca_fit = decomposition.PCA(n_components=n)
            pca_fit.fit(train)
            pca_train = pd.DataFrame(pca_fit.transform(train))
            pca_valid = pd.DataFrame(pca_fit.transform(valid))
            pca_cols = list(set(list(pca_train)))
            pca_cols = ['pca_' + str(s) for s in pca_cols]
            pca_train.columns = pca_cols
            pca_valid.columns = pca_cols
            return pca_train, pca_valid
        
    def ica_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='ss')
            ica_fit = decomposition.FastICA(n_components=n)
            ica_fit.fit(train)
            ica_train = pd.DataFrame(ica_fit.transform(train))
            ica_valid = pd.DataFrame(ica_fit.transform(valid))
            ica_cols = list(set(list(ica_train)))
            ica_cols = ['ica_' + str(s) for s in ica_cols]
            ica_train.columns = ica_cols
            ica_valid.columns = ica_cols
            return ica_train, ica_valid
        
    def tsvd_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='ss')
            tsvd_fit = decomposition.TruncatedSVD(n_components=n)
            tsvd_fit.fit(train)
            tsvd_train = pd.DataFrame(tsvd_fit.transform(train))
            tsvd_valid = pd.DataFrame(tsvd_fit.transform(valid))
            tsvd_cols = list(set(list(tsvd_train)))
            tsvd_cols = ['tsvd_' + str(s) for s in tsvd_cols]
            tsvd_train.columns = tsvd_cols
            tsvd_valid.columns = tsvd_cols
            return tsvd_train, tsvd_valid
        
    def grp_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='ss')
            grp_fit = GaussianRandomProjection(n_components=n, eps=0.1)
            grp_fit.fit(train)
            grp_train = pd.DataFrame(grp_fit.transform(train))
            grp_valid = pd.DataFrame(grp_fit.transform(valid))
            grp_cols = list(set(list(grp_train)))
            grp_cols = ['grp_' + str(s) for s in grp_cols]
            grp_train.columns = grp_cols
            grp_valid.columns = grp_cols
            return grp_train, grp_valid
    
    def srp_feats(train, valid, n = 5):
            train, valid = feat_eng.scalers(train, valid, which_method='ss')
            srp_fit = SparseRandomProjection(n_components=n, dense_output=True, eps=0.1)
            srp_fit.fit(train)
            srp_train = pd.DataFrame(srp_fit.transform(train))
            srp_valid = pd.DataFrame(srp_fit.transform(valid))
            srp_cols = list(set(list(srp_train)))
            srp_cols = ['srp_' + str(s) for s in srp_cols]
            srp_train.columns = srp_cols
            srp_valid.columns = srp_cols
            return srp_train, srp_valid
        
    def return_combined(train, valid, list_objects = ['pca', 'ica', 'tsvd', 'grp', 'srp']):
        if 'pca' in list_objects:
            train = pd.concat([train.reset_index(drop=True), pca_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), pca_valid], axis=1)
        if 'ica' in list_objects:
            train = pd.concat([train.reset_index(drop=True), ica_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), ica_valid], axis=1)
        if 'tsvd' in list_objects:
            train = pd.concat([train.reset_index(drop=True), tsvd_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), tsvd_valid], axis=1)
        if 'grp' in list_objects:
            train = pd.concat([train.reset_index(drop=True), grp_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), grp_valid], axis=1)
        if 'srp' in list_objects:
            train = pd.concat([train.reset_index(drop=True), srp_train], axis=1)
            valid = pd.concat([valid.reset_index(drop=True), srp_valid], axis=1)
        return train, valid

In [159]:
## calling the various feat engineering functions and adding those features
## pca, ica, tsvd, grp, srp
pca_train, pca_valid = feat_eng.pca_feats(train=X_train, valid=X_valid, n=.95)
ica_train, ica_valid = feat_eng.ica_feats(train=X_train, valid=X_valid, n=10)
tsvd_train, tsvd_valid = feat_eng.tsvd_feats(train=X_train, valid=X_valid, n=10)
grp_train, grp_valid = feat_eng.grp_feats(train=X_train, valid=X_valid, n=10)
srp_train, srp_valid = feat_eng.srp_feats(train=X_train, valid=X_valid, n=10)

## scale the data
X_train, X_valid = feat_eng.scalers(train=X_train, valid=X_valid, which_method='mm')

## return the final datasets with the added features
X_train, X_valid = feat_eng.return_combined(train = X_train, valid = X_valid)



In [160]:
X_train['response'] = y_train
X_valid['response'] = y_valid

X_train.to_csv('X_train.csv', index=False)
X_valid.to_csv('X_valid.csv', index=False)

In [168]:
## h2o AUTO_ML grid search framework
import h2o
from h2o.automl import H2OAutoML

class h2o_automl():
    
    def __init__():
        """ module to invoke h2o auto ml and tune on the sampling parameters. 
        will need to incorporate to super pipeline later """
        
    def score(params):        
        # function to be minimized (1 - ROCAUC)
        global ITERATION
        ITERATION += 1
        randomseed = 1
        
        #h2o_train = h2o.H2OFrame(params['h2o_train'])
        #h2o_valid = h2o.H2OFrame(params['h2o_valid'])
        
        aml = H2OAutoML(max_runtime_secs = params['time_to_run'], stopping_metric='mean_per_class_error', sort_metric='mean_per_class_error',
                        class_sampling_factors=[params['oversampling'], params['undersampling']],
                        balance_classes = params['balance_classes'])

        aml.train(y = 'response', training_frame = h2o_train)
        
        # Print Leaderboard (ranked by xval metrics)
        print(aml.leaderboard)
        # Evaluate performance on a test set
        perf = aml.leader.model_performance(h2o_valid)
        print(perf.auc())   
        
        return {'loss': (1 - perf.auc()), 'status': STATUS_OK, 'params': params, 'iteration': ITERATION, 'aml': aml}
    
    def auto_ml(train_path, valid_path, response = 'response', time_to_run = 30):
        # Keep track of evals
        global ITERATION
        ITERATION = 0
        global trials
        trials = Trials()
        global h2o_train, h2o_valid
        
        # initializing the h2o cluster
        h2o.init()
        
        # Import a sample binary outcome train/test set into H2O
        h2o_train = h2o.import_file(train_path, header=1)
        h2o_valid = h2o.import_file(valid_path, header=1)

        # Identify the response and set of predictors
        x = list(h2o_train.columns)  #if x is defined as all columns except the response, then x is not required
        x.remove(response)

        # For binary classification, response should be a factor
        h2o_train[response] = h2o_train[response].asfactor()
        h2o_valid[response] = h2o_valid[response].asfactor()

        # space to be traversed for the hyperopt function
        space = {
            'undersampling': hp.uniform('us', 0.01, 1),
            'oversampling': hp.uniform('os', 1, 10),
            'balance_classes': hp.choice('bc', ['True', 'False']),
            'time_to_run': 30, 'x': x, 'response': response}

        best = fmin(h2o_automl.score, space, algo=tpe.suggest, trials=trials, max_evals=MAX_EVALS,
                    rstate=np.random.RandomState(randomseed))
        best = trials.best_trial['result']['params']
        aml = trials.best_trial['result']['aml']
        
        return trials, best, aml # results of all the iterations
    
    def get_score(aml, h2o_valid, y_valid, threshold = 0.1):
        pred = aml.predict(h2o_valid)[:,2]
        pred = pred.as_data_frame().as_matrix()
        predict = np.where(pred > threshold, 1, 0)
        y_test=y_valid

        recall_score = sklearn.metrics.recall_score(y_pred=predict, y_true=y_test)
        precision_score = sklearn.metrics.precision_score(y_pred=predict, y_true=y_test)
        f1_score = sklearn.metrics.f1_score(y_pred=predict, y_true=y_test)
        auc_score = roc_auc_score(y_test, pred)
        tn, fp, fn, tp = sklearn.metrics.confusion_matrix(y_pred=predict, y_true=y_test).ravel()
        print(sklearn.metrics.confusion_matrix(y_pred=predict, y_true=y_test), '\n')
        print('recall score is: ', recall_score)
        print('precision score is: ', precision_score)
        print('f1_score is: ', f1_score)
        print('accuracy score: ', sklearn.metrics.accuracy_score(y_true=y_test, y_pred=predict))
        print('The final AUC after taking the best params and num_rounds when it stopped is {:.4f}.'.format(auc_score), '\n')

In [224]:
trials, best, aml = h2o_automl.auto_ml(train_path='X_train.csv', valid_path='X_valid.csv', response='response', time_to_run=60)

print(aml.leader.model_performance(h2o_valid))

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid_0_AutoML_20180814_180357_model_2


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.012345731874956202
RMSE: 0.11111134899260382
LogLoss: 0.05451098842472014
Mean Per-Class Error: 0.013020362902665505
AUC: 0.9970281519607433
Gini: 0.9940563039214867
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1987366596826228: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,4668.0,5.0,0.0011,(5.0/4673.0)
1,12.0,219.0,0.0519,(12.0/231.0)
Total,4680.0,224.0,0.0035,(17.0/4904.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1987367,0.9626374,172.0
max f2,0.1525126,0.9664948,182.0
max f0point5,0.2089268,0.9740840,170.0
max accuracy,0.2089268,0.9965334,170.0
max precision,0.9251269,1.0,0.0
max recall,0.0167882,1.0,343.0
max specificity,0.9251269,1.0,0.0
max absolute_mcc,0.1987367,0.9609453,172.0
max min_per_class_accuracy,0.0959240,0.9869463,207.0


Gains/Lift Table: Avg response rate:  4.71 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101958,0.7218179,21.2294372,21.2294372,1.0,1.0,0.2164502,0.2164502,2022.9437229,2022.9437229
,2,0.0201876,0.5969222,21.2294372,21.2294372,1.0,1.0,0.2121212,0.4285714,2022.9437229,2022.9437229
,3,0.0301794,0.4868321,21.2294372,21.2294372,1.0,1.0,0.2121212,0.6406926,2022.9437229,2022.9437229
,4,0.0401713,0.3360900,20.3629296,21.0139099,0.9591837,0.9898477,0.2034632,0.8441558,1936.2929587,2001.3909948
,5,0.0501631,0.1411981,12.9976146,19.4171682,0.6122449,0.9146341,0.1298701,0.9740260,1199.7614630,1841.7168198
,6,0.1001223,0.0549396,0.2599523,9.8580686,0.0122449,0.4643585,0.0129870,0.9870130,-74.0047707,885.8068612
,7,0.1500816,0.0406388,0.1733015,6.6341991,0.0081633,0.3125,0.0086580,0.9956710,-82.6698472,563.4199134
,8,0.2000408,0.0330241,0.0,4.9773400,0.0,0.2344546,0.0,0.9956710,-100.0,397.7340023
,9,0.2999592,0.0247311,0.0,3.3193546,0.0,0.1563562,0.0,0.9956710,-100.0,231.9354563




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.04511686381990303
RMSE: 0.2124073064183599
LogLoss: 0.18568613876747775
Mean Per-Class Error: 0.33176988976571375
AUC: 0.7111800288635736
Gini: 0.42236005772714713
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.08171475032008457: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,1067.0,56.0,0.0499,(56.0/1123.0)
1,39.0,19.0,0.6724,(39.0/58.0)
Total,1106.0,75.0,0.0804,(95.0/1181.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.0817148,0.2857143,72.0
max f2,0.0817148,0.3094463,72.0
max f0point5,0.1408014,0.3235294,27.0
max accuracy,0.3472012,0.9517358,6.0
max precision,0.3472012,0.5714286,6.0
max recall,0.0063866,1.0,388.0
max specificity,0.8370580,0.9991095,0.0
max absolute_mcc,0.1408014,0.2478839,27.0
max min_per_class_accuracy,0.0227196,0.6384684,251.0


Gains/Lift Table: Avg response rate:  4.91 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101609,0.2056922,8.4841954,8.4841954,0.4166667,0.4166667,0.0862069,0.0862069,748.4195402,748.4195402
,2,0.0203218,0.1436148,6.7873563,7.6357759,0.3333333,0.375,0.0689655,0.1551724,578.7356322,663.5775862
,3,0.0304826,0.1169444,3.3936782,6.2217433,0.1666667,0.3055556,0.0344828,0.1896552,239.3678161,522.1743295
,4,0.0406435,0.1027651,0.0,4.6663075,0.0,0.2291667,0.0,0.1896552,-100.0,366.6307471
,5,0.0508044,0.0930130,5.0905172,4.7511494,0.25,0.2333333,0.0517241,0.2413793,409.0517241,375.1149425
,6,0.1007621,0.0609932,2.0707189,3.4221965,0.1016949,0.1680672,0.1034483,0.3448276,107.0718878,242.2196465
,7,0.1507197,0.0475663,1.0353594,2.6310539,0.0508475,0.1292135,0.0517241,0.3965517,3.5359439,163.1053855
,8,0.2006774,0.0383641,0.6902396,2.1478976,0.0338983,0.1054852,0.0344828,0.4310345,-30.9760374,114.7897570
,9,0.3005927,0.0277649,1.3804793,1.8928120,0.0677966,0.0929577,0.1379310,0.5689655,38.0479252,89.2812045




ModelMetricsBinomial: gbm
** Reported on cross-validation data. **

MSE: 0.042897149903799614
RMSE: 0.20711627146074163
LogLoss: 0.1860541583463351
Mean Per-Class Error: 0.36826227485332985
AUC: 0.6680335500151464
Gini: 0.33606710003029283
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.06265996800929555: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,4269.0,404.0,0.0865,(404.0/4673.0)
1,162.0,69.0,0.7013,(162.0/231.0)
Total,4431.0,473.0,0.1154,(566.0/4904.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.0626600,0.1960227,194.0
max f2,0.0385108,0.2700445,244.0
max f0point5,0.3073742,0.2882206,39.0
max accuracy,0.4312340,0.9545269,22.0
max precision,0.6418320,0.75,3.0
max recall,0.0031637,1.0,399.0
max specificity,0.8182682,0.9997860,0.0
max absolute_mcc,0.3073742,0.2195738,39.0
max min_per_class_accuracy,0.0198797,0.6147186,309.0


Gains/Lift Table: Avg response rate:  4.71 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101958,0.2759044,9.7655411,9.7655411,0.46,0.46,0.0995671,0.0995671,876.5541126,876.5541126
,2,0.0201876,0.1768234,2.5995229,6.2187240,0.1224490,0.2929293,0.0259740,0.1255411,159.9522926,521.8724037
,3,0.0301794,0.1357597,1.7330153,4.7335907,0.0816327,0.2229730,0.0173160,0.1428571,73.3015284,373.3590734
,4,0.0401713,0.1159468,2.5995229,4.2027820,0.1224490,0.1979695,0.0259740,0.1688312,159.9522926,320.2781990
,5,0.0501631,0.0987736,1.2997615,3.6245381,0.0612245,0.1707317,0.0129870,0.1818182,29.9761463,262.4538064
,6,0.1001223,0.0604267,2.4262214,3.0266000,0.1142857,0.1425662,0.1212121,0.3030303,142.6221398,202.6600012
,7,0.1500816,0.0440954,1.2131107,2.4229249,0.0571429,0.1141304,0.0606061,0.3636364,21.3110699,142.2924901
,8,0.2000408,0.0351749,1.2997615,2.1424203,0.0612245,0.1009174,0.0649351,0.4285714,29.9761463,114.2420271
,9,0.2999592,0.0247010,1.1697853,1.8184290,0.0551020,0.0856560,0.1168831,0.5454545,16.9785317,81.8429022



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8966253,0.0547259,0.7502549,0.9592253,0.9408767,0.8858308,0.9469388
auc,0.6689358,0.0309648,0.6292334,0.7481481,0.6305302,0.6569443,0.6798231
err,0.1033747,0.0547259,0.2497452,0.0407747,0.0591233,0.1141692,0.0530612
err_count,101.4,53.690968,245.0,40.0,58.0,112.0,52.0
f0point5,0.2732962,0.0946900,0.0904523,0.4320988,0.2439024,0.1801802,0.4198473
f1,0.2246218,0.0398393,0.1281139,0.2592592,0.2162162,0.2222222,0.2972973
f2,0.2237705,0.0260584,0.2195122,0.1851852,0.1941748,0.2898551,0.2301255
lift_top_group,10.38274,2.9475746,4.562791,15.26,11.147727,6.6886363,14.254545
logloss,0.1860590,0.0099820,0.1886031,0.1659469,0.1826269,0.1832709,0.2098472


Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2018-08-14 18:04:55,12.264 sec,0.0,0.2118622,0.1898994,0.5,1.0,0.9528956,0.2161089,0.1959333,0.5,1.0,0.9508891
,2018-08-14 18:04:56,12.528 sec,5.0,0.1912917,0.1436173,0.9263643,18.2573160,0.0391517,0.2131652,0.1861076,0.6754076,5.0905172,0.0982218
,2018-08-14 18:04:56,12.836 sec,10.0,0.1692332,0.1106636,0.9760876,20.8048485,0.0203915,0.2114478,0.1813669,0.7087159,8.4841954,0.0685859
,2018-08-14 18:04:56,13.132 sec,15.0,0.1546803,0.0932565,0.9871292,21.2294372,0.0134584,0.2111540,0.1813863,0.6947524,8.4841954,0.1278577
,2018-08-14 18:04:56,13.416 sec,20.0,0.1424780,0.0806138,0.9919418,21.2294372,0.0103997,0.2118183,0.1826487,0.7049698,10.1810345,0.0499577
,2018-08-14 18:04:57,13.676 sec,25.0,0.1312188,0.0704007,0.9942231,21.2294372,0.0065253,0.2118712,0.1838461,0.6983450,6.7873563,0.0770533
,2018-08-14 18:04:57,13.936 sec,30.0,0.1201879,0.0611406,0.9963440,21.2294372,0.0048940,0.2121802,0.1844297,0.7119707,8.4841954,0.0736664
,2018-08-14 18:04:57,14.188 sec,34.0,0.1111113,0.0545110,0.9970282,21.2294372,0.0034666,0.2124073,0.1856861,0.7111800,8.4841954,0.0804403


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
2,15.0153389,1.0,0.0307435
71,12.2121620,0.8133124,0.0250040
pca_12,11.1265478,0.7410121,0.0227813
pca_0,10.4576960,0.6964675,0.0214118
3,10.4030600,0.6928288,0.0213000
---,---,---,---
59,0.0,0.0,0.0
60,0.0,0.0,0.0
62,0.0,0.0,0.0



See the whole table with table.as_data_frame()



In [None]:
h2o_automl.get_score(aml=aml, h2o_valid=h2o_valid, y_valid=y_valid, threshold=0.1)

In [228]:
h2o.cluster().shutdown()

H2O session _sid_bd02 closed.
