In [1]:
import os
import h2o
import pandas as pd
import numpy as np
import datetime as dt
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators import H2OXGBoostEstimator

In [3]:
# Start a local H2O cluster, or attach to one that is already running on this host.
# nthreads=-1 asks H2O to use every available core.
# NOTE(review): max_mem_size=25 is presumably interpreted as gigabytes — confirm for the
# installed h2o version.
h2o.init(
    nthreads=-1,
    max_mem_size=25,
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 22 mins
H2O cluster version:,3.14.0.7
H2O cluster version age:,17 days
H2O cluster name:,H2O_from_python_laith_wzp11j
H2O cluster total nodes:,1
H2O cluster free memory:,17.03 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [6]:
# Sanity check: ask H2O whether XGBoost models can be built in this installation
# before any XGBoost estimator is configured.
H2OXGBoostEstimator.available()

True

In [7]:
def _split_feature_types(columns):
    """Partition predictor names into (binary, categorical, continuous) lists.

    The split is driven purely by the column naming convention:
    names containing the substring "bin" are binary, names containing "cat"
    (but not "avg") are categorical, everything else is continuous.
    """
    features_bin = [name for name in columns if "bin" in name]
    # Columns whose name contains "avg" are kept out of the categorical group
    # (presumably averaged encodings of categorical columns — TODO confirm).
    features_cat = [name for name in columns if "cat" in name and "avg" not in name]
    typed = set(features_bin) | set(features_cat)
    # A list comprehension (instead of set arithmetic) keeps the column order deterministic.
    features_con = [name for name in columns if name not in typed]
    return features_bin, features_cat, features_con


def _cast_feature_types(frame, features_bin, features_cat, features_con):
    """Cast binary/categorical columns of `frame` to factors and the rest to numeric, in place."""
    # Empty groups are skipped so a dataset lacking one kind of column
    # never indexes the frame with an empty list.
    if features_cat:
        frame[features_cat] = frame[features_cat].asfactor()
    if features_bin:
        frame[features_bin] = frame[features_bin].asfactor()
    if features_con:
        frame[features_con] = frame[features_con].asnumeric()


def prepare_data(with_validation=True, train_path="../data/train2.csv", test_path="../data/test2.csv"):
    """Load the train/test CSVs into H2O frames and set the column types for modelling.

    Parameters
    ----------
    with_validation : bool
        Currently unused; kept so existing callers keep working.
    train_path, test_path : str
        Locations of the training and test CSV files. The defaults are the
        paths this notebook has always used.

    Returns
    -------
    tuple
        ``(train, test, x, y, test_id)`` where ``train``/``test`` are the typed
        H2O frames, ``x`` is the list of predictor column names (the response
        and the ``id`` row identifier are excluded), ``y`` is the response
        column name and ``test_id`` holds the values of the test ``id`` column.

    Raises
    ------
    ValueError
        If the training file has no ``target`` column.
    """
    train = h2o.import_file(os.path.realpath(train_path))
    test = h2o.import_file(os.path.realpath(test_path))

    y = "target"
    id_col = "id"

    columns = train.columns
    if y not in columns:
        raise ValueError("training data has no '%s' column" % y)
    # The row identifier carries no signal and must not be used as a predictor.
    x = [name for name in columns if name not in (y, id_col)]

    # Keep the test ids aside; they are returned separately from the frames.
    test_id = test[id_col].as_data_frame(True)[id_col].values

    features_bin, features_cat, features_con = _split_feature_types(x)

    # Apply exactly the same typing to both frames so they stay consistent.
    for frame in (train, test):
        _cast_feature_types(frame, features_bin, features_cat, features_con)

    # A factor response makes H2O treat the task as classification.
    train[y] = train[y].asfactor()

    return train, test, x, y, test_id

In [8]:
# Load the data once and keep the frames plus the predictor/response names as
# notebook-level variables; the training cells below read x, y and train.
train, test, x, y, test_id = prepare_data()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [23]:
# Hyper-parameters of the baseline XGBoost model; unpacked into the estimator constructor.
param = dict(
    ntrees=100,
    max_depth=10,
    learn_rate=0.02,
    sample_rate=0.7,
    col_sample_rate_per_tree=0.9,
    min_rows=5,
    seed=4241,
    score_tree_interval=100,
)

In [24]:
# Instantiate the baseline XGBoost estimator from the hyper-parameter dict `param`.
model = H2OXGBoostEstimator(**param)

In [25]:
# Fit the baseline model; keyword arguments make the role of each input explicit.
model.train(
    x=x,
    y=y,
    training_frame=train,
)

xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [26]:
# Display the fitted model's summary (training metrics, confusion matrix, scoring history).
# NOTE(review): the saved output reports AUC 0.5 and logloss ~0.693 on the training data,
# unchanged between 0 and 100 trees — the model is not separating the classes at all.
# Investigate before relying on it.
model

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_model_python_1510054492941_640


ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.25
RMSE: 0.5
LogLoss: 0.6931471805576873
Mean Per-Class Error: 0.5
AUC: 0.5
Gini: 0.0
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,0.0,573518.0,1.0,(573518.0/573518.0)
1,0.0,21694.0,0.0,(0.0/21694.0)
Total,0.0,595212.0,0.9636,(573518.0/595212.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5,0.0703316,0.0
max f2,0.5,0.1590497,0.0
max f0point5,0.5,0.0451480,0.0
max accuracy,0.5,0.0364475,0.0
max precision,0.5,0.0364475,0.0
max recall,0.5,1.0,0.0
max specificity,0.5,0.0,0.0
max absolute_mcc,0.5,0.0,0.0
max min_per_class_accuracy,0.5,0.0,0.0


Gains/Lift Table: Avg response rate:  3.64 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,1.0,0.5,1.0,1.0,0.0364475,0.0364475,1.0,1.0,0.0,0.0



Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-11-07 14:10:20,0.620 sec,0.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:10:24,5.300 sec,100.0,0.5,0.6931472,0.5,1.0,0.9635525




In [35]:
# XGBoost estimator with 5-fold stratified cross-validation and AUC-based stopping settings.
myxgb = H2OXGBoostEstimator(
    # Objective, tree construction and backend
    distribution="bernoulli",
    tree_method="exact",
    backend="cpu",
    quiet_mode=False,
    # Ensemble size and tree shape
    ntrees=50,
    max_depth=8,
    min_rows=1,
    learn_rate=0.1,
    # Row and column subsampling
    sample_rate=0.7,
    col_sample_rate=0.9,
    # Cross-validation; fold predictions are kept on the cluster
    nfolds=5,
    fold_assignment="Stratified",
    keep_cross_validation_predictions=True,
    # Stopping criteria
    stopping_rounds=5,
    stopping_metric="auc",
    stopping_tolerance=1e-4,
    # Reproducibility
    seed=1,
)

In [36]:
# Fit the cross-validated estimator on the full training frame (nfolds is set on the estimator).
myxgb.train(x=x, y=y, training_frame=train)

xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [37]:
# Display training metrics, cross-validation metrics and the scoring history.
# NOTE(review): the saved output shows AUC 0.5 and logloss ~0.693 on the training data and
# on every one of the five CV folds — the model is no better than chance. Investigate.
myxgb

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_model_python_1510054492941_865


ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.25
RMSE: 0.5
LogLoss: 0.6931471805576873
Mean Per-Class Error: 0.5
AUC: 0.5
Gini: 0.0
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,0.0,573518.0,1.0,(573518.0/573518.0)
1,0.0,21694.0,0.0,(0.0/21694.0)
Total,0.0,595212.0,0.9636,(573518.0/595212.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5,0.0703316,0.0
max f2,0.5,0.1590497,0.0
max f0point5,0.5,0.0451480,0.0
max accuracy,0.5,0.0364475,0.0
max precision,0.5,0.0364475,0.0
max recall,0.5,1.0,0.0
max specificity,0.5,0.0,0.0
max absolute_mcc,0.5,0.0,0.0
max min_per_class_accuracy,0.5,0.0,0.0


Gains/Lift Table: Avg response rate:  3.64 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,1.0,0.5,1.0,1.0,0.0364475,0.0364475,1.0,1.0,0.0,0.0




ModelMetricsBinomial: xgboost
** Reported on cross-validation data. **

MSE: 0.25
RMSE: 0.5
LogLoss: 0.6931471805599071
Mean Per-Class Error: 0.5
AUC: 0.5
Gini: 0.0
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,0.0,573518.0,1.0,(573518.0/573518.0)
1,0.0,21694.0,0.0,(0.0/21694.0)
Total,0.0,595212.0,0.9636,(573518.0/595212.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5,0.0703316,0.0
max f2,0.5,0.1590497,0.0
max f0point5,0.5,0.0451480,0.0
max accuracy,0.5,0.0364475,0.0
max precision,0.5,0.0364475,0.0
max recall,0.5,1.0,0.0
max specificity,0.5,0.0,0.0
max absolute_mcc,0.5,0.0,0.0
max min_per_class_accuracy,0.5,0.0,0.0


Gains/Lift Table: Avg response rate:  3.64 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,1.0,0.5,1.0,1.0,0.0364475,0.0364475,1.0,1.0,0.0,0.0



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.0364487,0.0005329,0.0372700,0.0352393,0.0363691,0.0361406,0.0372245
auc,0.5,0.0,0.5,0.5,0.5,0.5,0.5
err,0.9635513,0.0005329,0.9627301,0.9647607,0.9636309,0.9638594,0.9627755
err_count,114703.6,382.424,114200.0,114684.0,114621.0,115720.0,114293.0
f0point5,0.0451493,0.0006543,0.0461574,0.0436644,0.0450518,0.0447712,0.0461017
f1,0.0703328,0.0009925,0.0718616,0.0680795,0.0701857,0.0697600,0.0717772
f2,0.1590467,0.0020317,0.1621731,0.1544286,0.1587511,0.1578794,0.1620011
lift_top_group,1.0,0.0,1.0,1.0,1.0,1.0,1.0
logloss,0.6931472,0.0000000,0.6931472,0.6931472,0.6931472,0.6931472,0.6931472


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-11-07 14:28:47,2 min 21.512 sec,0.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:49,2 min 23.641 sec,1.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:50,2 min 24.116 sec,2.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:50,2 min 24.691 sec,3.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:51,2 min 25.276 sec,4.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:52,2 min 25.811 sec,5.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:52,2 min 26.350 sec,6.0,0.5,0.6931472,0.5,1.0,0.9635525
,2017-11-07 14:28:54,2 min 28.406 sec,50.0,0.5,0.6931472,0.5,1.0,0.9635525


