This project is an attempt to build easy interpetable model vs h2o autoML.
Data from Kaggle: Categorical Feature Encoding Challenge

In [105]:
#Load libaries
import numpy as np 
import pandas as pd 
from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger
from bayes_opt.event import Events
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import h2o
from h2o.automl import H2OAutoML
from sklearn.metrics import f1_score, auc

In [2]:
#Load samples and separate and target
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
train_trg = df_train[['target']]

In [151]:
#Learn something about dataset
def about_dataset(df):
    summary = pd.DataFrame(df.dtypes, columns=['Type'])
    summary = summary.reset_index()
    summary['Variable'] = summary['index']
    summary = summary[['Variable', 'Type']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['Some Row'] = df.loc[8].values
    return summary

describ = about_dataset(df_train)

FEATURE ENGINEERING

In [4]:
#function for target encoder is useful for variables with many unique categories
def my_TE_encoder(df, var_names, trg_name):
    from category_encoders import TargetEncoder
    TE_features = TargetEncoder(cols=var_names, drop_invariant=True, return_df=True, min_samples_leaf=2, smoothing=1.0)
    TE_features.fit(df[var_names], df[trg_name])
    return TE_features
TE_feature_names = list(describ['Variable'][(describ.Type =='object') & (describ.Uniques>10)])
TE_feature_modifier = my_TE_encoder(df_train, TE_feature_names,['target'])
TE_feature_train = TE_feature_modifier.transform(df_train[TE_feature_names])
TE_feature_test = TE_feature_modifier.transform(df_test[TE_feature_names])
TE_feature_train.columns = [str(col) + '_te' for col in TE_feature_train.columns]
TE_feature_test.columns = [str(col) + '_te' for col in TE_feature_test.columns]

In [5]:
#for variables with lower quantity of unique categories make dummy-transformation
OHE_feature_names = list(describ['Variable'][(describ.Uniques<=10)&(describ.Variable!='target')])
OHE_feature_train = pd.get_dummies(df_train[OHE_feature_names])
OHE_feature_test = pd.get_dummies(df_test[OHE_feature_names])
OHE_feature_train.columns = [str(col) + '_ohe' for col in OHE_feature_train.columns]
OHE_feature_test.columns = [str(col) + '_ohe' for col in OHE_feature_test.columns]

In [6]:
#just leave numeric variables
NUM_feature_names = list(describ['Variable'][(describ.Type =='int64') & (describ.Variable!='target')& (describ.Variable!='id')])
NUM_feature_train = df_train[NUM_feature_names]
NUM_feature_test = df_test[NUM_feature_names]

Make final samples


In [7]:
train_final = pd.concat([NUM_feature_train, TE_feature_train, OHE_feature_train], axis=1)
test_final = pd.concat([NUM_feature_test, TE_feature_test, OHE_feature_test], axis=1)
print('Memory usage of dataframe is {:.2f} MB'.format(train_final.memory_usage().sum() / 1024**2))

Memory usage of dataframe is 54.93 MB


AUTO MODEL FOR EVALUATING CAPACITY

In [None]:
h2o.init(verbose=False)
h2o.remove_all()

In [9]:
#make the appropriate format of data
#It's not recommended to use one-hot-encoding for h20 automl (lose some infornation), but I try...
data = h2o.H2OFrame(pd.concat([train_final, train_trg], axis=1))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# split for train and test
train_cols = [x for x in data.col_names if x not in ['target']]
target = 'target'
train, test = data.split_frame(ratios=[0.7])
train['target'] = train['target'].asfactor()

In [11]:
#make auto-ml
auto_ml = H2OAutoML(max_models=5, seed=1, max_runtime_secs=120)
auto_ml.train(x=train_cols, y=target, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [12]:
#Results
lb = auto_ml.leaderboard
lb.head(rows=lb.nrows) 

model_id,auc,logloss,mean_per_class_error,rmse,mse
GLM_grid_1_AutoML_20191011_154724_model_1,0.821897,0.610056,0.258484,0.458126,0.20988




In [13]:
perf = auto_ml.leader.model_performance(test)
print(perf)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.20965200556180424
RMSE: 0.4578777190056361
LogLoss: 0.6094354371354954
Null degrees of freedom: 89952
Residual degrees of freedom: 89894
Null deviance: 110876.88467836948
Residual deviance: 109641.09175329845
AIC: 109759.09175329845
AUC: 0.8259962814810081
pr_auc: 0.6866743668410089
Gini: 0.6519925629620162

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3071700169148137: 

Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,46951.0,15428.0,0.2473,(15428.0/62379.0)
1,1,7201.0,20373.0,0.2612,(7201.0/27574.0)
2,Total,54152.0,35801.0,0.2516,(22629.0/89953.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.30717,0.642935,176.0
1,max f2,0.303405,0.756061,235.0
2,max f0point5,0.310589,0.646017,126.0
3,max accuracy,0.310589,0.779785,126.0
4,max precision,0.33443,1.0,0.0
5,max recall,0.291925,1.0,369.0
6,max specificity,0.33443,1.0,0.0
7,max absolute_mcc,0.308371,0.468321,158.0
8,max min_per_class_accuracy,0.307032,0.744706,178.0
9,max mean_per_class_accuracy,0.306894,0.746044,180.0



Gains/Lift Table: Avg response rate: 30,65 %, avg score: 30,56 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010005,0.320283,3.135375,3.135375,0.961111,0.322368,0.961111,0.322368,0.03137,0.03137,213.537491,213.537491
1,,2,0.02001,0.318456,2.907018,3.021197,0.891111,0.319255,0.926111,0.320812,0.029085,0.060456,190.701812,202.119652
2,,3,0.030005,0.317396,2.834048,2.95886,0.868743,0.317924,0.907003,0.31985,0.028324,0.088779,183.404814,195.885995
3,,4,0.04001,0.316601,2.725783,2.900574,0.835556,0.316992,0.889136,0.319135,0.027272,0.116051,172.578258,190.057442
4,,5,0.050004,0.315961,2.648982,2.85029,0.812013,0.316266,0.873722,0.318562,0.026474,0.142526,164.898226,185.028955
5,,6,0.100008,0.31361,2.452119,2.651204,0.751667,0.314694,0.812695,0.316628,0.122616,0.265141,145.211933,165.120444
6,,7,0.150001,0.312056,2.150887,2.484457,0.659328,0.312797,0.76158,0.315351,0.107529,0.37267,115.088748,148.445684
7,,8,0.200004,0.310823,1.875534,2.332218,0.574922,0.311418,0.714913,0.314368,0.093784,0.466454,87.553404,133.221768
8,,9,0.300001,0.308819,1.534105,2.06619,0.470261,0.30978,0.633365,0.312839,0.153405,0.619859,53.410498,106.618997
9,,10,0.399998,0.307102,1.207337,1.851483,0.370094,0.307947,0.56755,0.311616,0.12073,0.740589,20.733699,85.148269






In [14]:
#Leader model
print(auto_ml.leader)
# save the model
model_path = h2o.save_model(model=auto_ml.leader, path = "./model_automl_bin_class", force=True)

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_grid_1_AutoML_20191011_154724_model_1


GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,Ridge ( lambda = 10.366 ),"nlambda = 30, lambda.max = 10.366, lambda.min = 10.366, lambda.1se...",59,58,2,automl_training_py_5_sid_ba16




ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.20931769397647873
RMSE: 0.45751250690716505
LogLoss: 0.6087369328361123
Null degrees of freedom: 210046
Residual degrees of freedom: 209988
Null deviance: 258581.86562823073
Residual deviance: 255726.73306285375
AIC: 255844.73306285375
AUC: 0.8225781746402816
pr_auc: 0.6789209333288229
Gini: 0.6451563492805632

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3070471155983036: 

Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,108527.0,37330.0,0.2559,(37330.0/145857.0)
1,1,16661.0,47529.0,0.2596,(16661.0/64190.0)
2,Total,125188.0,84859.0,0.257,(53991.0/210047.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.307047,0.637763,184.0
1,max f2,0.303424,0.752814,239.0
2,max f0point5,0.310555,0.638738,132.0
3,max accuracy,0.310248,0.777345,136.0
4,max precision,0.33462,1.0,0.0
5,max recall,0.288723,1.0,387.0
6,max specificity,0.33462,1.0,0.0
7,max absolute_mcc,0.307788,0.458615,172.0
8,max min_per_class_accuracy,0.306987,0.740506,185.0
9,max mean_per_class_accuracy,0.306919,0.742397,186.0



Gains/Lift Table: Avg response rate: 30,56 %, avg score: 30,56 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010003,0.320285,3.108734,3.108734,0.950024,0.322392,0.950024,0.322392,0.031095,0.031095,210.873421,210.873421
1,,2,0.02,0.318574,2.969974,3.039371,0.907619,0.319353,0.928826,0.320873,0.029693,0.060788,196.997442,203.937083
2,,3,0.030003,0.317481,2.797238,2.958647,0.854831,0.318011,0.904157,0.319919,0.027979,0.088768,179.723779,195.864701
3,,4,0.040001,0.316647,2.692611,2.892154,0.822857,0.317061,0.883837,0.319204,0.02692,0.115688,169.26106,189.215374
4,,5,0.050003,0.315991,2.611897,2.836092,0.798191,0.316299,0.866705,0.318623,0.026126,0.141813,161.189743,183.60918
5,,6,0.100001,0.313628,2.391106,2.61361,0.730718,0.314706,0.798715,0.316665,0.119551,0.261365,139.110633,161.360966
6,,7,0.15,0.31204,2.125947,2.451061,0.649686,0.312797,0.74904,0.315376,0.106294,0.367659,112.594715,145.106065
7,,8,0.200003,0.310795,1.875254,2.307102,0.573074,0.311397,0.705046,0.314381,0.093768,0.461427,87.525393,130.710212
8,,9,0.3,0.308804,1.540322,2.051517,0.47072,0.309753,0.62694,0.312838,0.154027,0.615454,54.03224,105.151699
9,,10,0.400001,0.307083,1.207647,1.840547,0.369055,0.307923,0.562468,0.311609,0.120766,0.736221,20.76475,84.054711




ModelMetricsBinomialGLM: glm
** Reported on cross-validation data. **

MSE: 0.2098795005856248
RMSE: 0.4581260749898709
LogLoss: 0.6100562057140109
Null degrees of freedom: 210046
Residual degrees of freedom: 209988
Null deviance: 258582.33883506636
Residual deviance: 256280.95168322173
AIC: 256398.95168322173
AUC: 0.8218967642210769
pr_auc: 0.6778423535879938
Gini: 0.6437935284421539

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.30685101792749725: 

Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,109089.0,36768.0,0.2521,(36768.0/145857.0)
1,1,17003.0,47187.0,0.2649,(17003.0/64190.0)
2,Total,126092.0,83955.0,0.256,(53771.0/210047.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.306851,0.637038,179.0
1,max f2,0.303865,0.752513,237.0
2,max f0point5,0.309645,0.638099,128.0
3,max accuracy,0.309645,0.776817,128.0
4,max precision,0.328611,1.0,0.0
5,max recall,0.292038,1.0,389.0
6,max specificity,0.328611,1.0,0.0
7,max absolute_mcc,0.307597,0.458121,165.0
8,max min_per_class_accuracy,0.30679,0.739959,180.0
9,max mean_per_class_accuracy,0.306679,0.741594,182.0



Gains/Lift Table: Avg response rate: 30,56 %, avg score: 30,56 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.010003,0.317441,3.111849,3.111849,0.950976,0.319111,0.950976,0.319111,0.031126,0.031126,211.184917,211.184917
1,,2,0.02,0.316049,2.968416,3.04015,0.907143,0.316679,0.929065,0.317895,0.029678,0.060804,196.84162,204.014976
2,,3,0.030003,0.315188,2.773876,2.951378,0.847692,0.315605,0.901936,0.317132,0.027746,0.08855,177.387556,195.137761
3,,4,0.040001,0.31452,2.703518,2.889427,0.82619,0.314847,0.883004,0.316561,0.027029,0.115579,170.351816,188.94275
4,,5,0.050003,0.313964,2.61034,2.833599,0.797715,0.314231,0.865943,0.316095,0.02611,0.141689,161.033995,183.359936
5,,6,0.100001,0.31208,2.386121,2.609871,0.729194,0.312948,0.797572,0.314521,0.119302,0.260991,138.612096,160.987081
6,,7,0.15,0.31081,2.125012,2.448256,0.6494,0.311416,0.748183,0.313486,0.106247,0.367238,112.50124,144.825647
7,,8,0.200003,0.30981,1.879616,2.30609,0.574407,0.310293,0.704737,0.312688,0.093987,0.461224,87.961571,130.608951
8,,9,0.3,0.308199,1.529573,2.047259,0.467435,0.30897,0.625639,0.311449,0.152952,0.614177,52.957271,104.725879
9,,10,0.400001,0.306818,1.210452,1.838055,0.369912,0.307498,0.561706,0.310461,0.121047,0.735224,21.045163,83.805451




Cross-Validation Metrics Summary: 

Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.74089134,0.0055663264,0.73508686,0.7407998,0.7401271,0.7384132,0.75002974
1,auc,0.82226974,0.0026204567,0.8211043,0.8215892,0.82064515,0.821091,0.82691914
2,err,0.25910863,0.0055663264,0.2649131,0.2592002,0.25987288,0.26158682,0.24997024
3,err_count,10885.0,233.90596,11129.0,10889.0,10917.0,10989.0,10501.0
4,f0point5,0.5866355,0.006333879,0.58154404,0.58703685,0.5850771,0.58226174,0.5972576
5,f1,0.6378729,0.0039016719,0.63627154,0.63798666,0.63674176,0.634005,0.6443594
6,f2,0.69895464,0.0023496244,0.70236963,0.698621,0.69841456,0.6958416,0.6995264
7,lift_top_group,3.1075325,0.027236322,3.1098583,3.0640204,3.122382,3.1365318,3.1048706
8,logloss,0.6100562,0.0008431292,0.61124563,0.61057,0.609641,0.60912687,0.6096975
9,max_per_class_error,0.26400104,0.0057942932,0.27357358,0.2614424,0.2627252,0.26413417,0.25812992



See the whole table with table.as_data_frame()

Scoring History: 

Unnamed: 0,Unnamed: 1,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se
0,,2019-10-11 15:47:45,0.000 sec,2,",1E2",59,1.217474,,0.0,0.0





In [70]:
#predictions of unknown test sample
#data_test = h2o.H2OFrame(test_final)
#len(test_final)<>len(data_test) -so strange...incorrect working function?
test_final.to_csv('test_sample', header=True)
data_test = h2o.import_file('test_sample')

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm prediction progress: |████████████████████████████████████████████████| 100%


In [125]:
len(test_final)==len(data_test)

True

In [123]:
saved_model = h2o.load_model(model_path)
preds = saved_model.predict(data_test)
preds_df = preds.as_data_frame(header=False)

glm prediction progress: |████████████████████████████████████████████████| 100%


In [133]:
#to csv
preds = pd.DataFrame(preds_df['predict'])
preds.columns = ['target']
preds['id']=df_test['id']
preds = preds.set_index('id')
preds.to_csv('prediction_0',index=True)

In [153]:
h2o.shutdown()

    >>> h2o.shutdown()
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.


<span style="color:blue">The result in the leaderboard = 0.709 </span>

<span style="color:blue">Let's see how we can manage with simple decision tree </span>

TUNING HYPERPARAMETERS with BAYESIAN OPT

In [147]:
#function to optimize
def evaluate_opt(print_time=False,n_folds=5,random_state=17,**params):
    params['max_features'] = int(params['max_features'])
    params['max_depth'] = int(params['max_depth'])
    params['min_samples_split'] =int(params['min_samples_split'])
    folds = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = random_state)
    estimator=DecisionTreeClassifier(**params,random_state=1)
    val_score=cross_val_score(estimator=estimator, X = train_final, y = train_trg,  cv=folds,scoring='f1_macro')  
    return np.mean(val_score)

In [136]:
#parameters to try
params_opt = {
          'max_features': (2, train_final.shape[1]-1),
          'max_depth': (2, 50),
          'min_samples_split':(2, 20), 
}

In [148]:
#Let's go to optimize and log results
optimizer = BayesianOptimization(evaluate_opt, params_opt)
logger = JSONLogger(path="./logs.json")
optimizer.subscribe(Events.OPTMIZATION_STEP, logger)
optimizer.maximize(init_points = 25, n_iter = 5)
best_params = optimizer.max['params']

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [149]:
# best choice
best_estimator=DecisionTreeClassifier(max_depth=int(best_params['max_depth']), 
                                      max_features=int(best_params['max_features']) ,
                                      min_samples_split=int(best_params['min_samples_split']), random_state=1)

In [150]:
#make predictions
best_estimator = best_estimator.fit(train_final, train_trg)
predictions = best_estimator.predict(test_final) 
predictions = pd.DataFrame(predictions)
predictions.columns = ['target']
predictions['id']=df_test['id']
predictions.set_index('id')
predictions.to_csv('prediction_1',index=False)

<span style="color:blue">I can't defeat ridge Log-Reg with Decision Tree. The best result in the leaderboard = 0.605. So in this case good-tuned Log-Reg is better than Decision Tree. </span>