In [1]:
# Attach to an H2O cluster (the log below shows it connecting to
# http://localhost:54321). nthreads=-1 and max_mem_size="8G" are passed
# through to h2o.init as the thread and memory settings.
import h2o

h2o.init(
    nthreads=-1,
    max_mem_size="8G",
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,20 secs
H2O cluster version:,3.10.4.1
H2O cluster version age:,15 days
H2O cluster name:,H2O_from_python_jurgentas_bzduds
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://localhost:54321


In [2]:
# This breast cancer database was obtained from the University of Wisconsin
# Hospitals, Madison, from Dr. William H. Wolberg.

import pandas as pd

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Column labels are supplied explicitly, so the first line of the file is
# read as a data row rather than as a header.
names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

df = pd.read_csv(url, names=names)

# Preview the first rows as a sanity check on the column mapping.
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Build the H2O frame from the pandas frame, reusing the pandas column labels.
df_hex = h2o.H2OFrame(
    df,
    column_names=list(df.columns),
)

# Convert the response column to a factor (categorical) so that it is
# modelled as a class label rather than as a number.
df_hex['Class'] = df_hex['Class'].asfactor()

# Per-column summary (types, ranges, missing counts) of the uploaded frame.
df_hex.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:699
Cols:11




Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
type,int,int,int,int,int,int,int,int,int,int,enum
mins,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
mean,1071704.09871,4.41773962804,3.13447782546,3.20743919886,2.80686695279,3.21602288984,3.54465592972,3.43776824034,2.8669527897,1.58941344778,
maxs,13454352.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,
sigma,617095.729819,2.81574065859,3.05145910995,2.97191276722,2.85537923922,2.21429988665,3.64385716049,2.43836425232,3.05363389361,1.71507794251,
zeros,0,0,0,0,0,0,0,0,0,0,
missing,0,0,0,0,0,0,16,0,0,0,0
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,2
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,2


In [4]:
# Name of the response column.
y = 'Class'

# Names of the predictor columns used for modelling.
x = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]

In [5]:
# Split the data into a training frame and a test frame, requesting a 60%
# share for training (the remainder becomes the test frame).
# A fixed seed makes the partition repeatable: without it every
# Restart & Run All produces a different train/test split and therefore
# different metrics in all the cells below.
train, test = df_hex.split_frame(ratios=[0.6], seed=42)

In [6]:
# Hyperparameter search (random forest).
# A Cartesian grid over ntrees x mtries (3 x 3 = 9 candidate models); each
# candidate is built with 5-fold cross-validation on the training frame.
from h2o.estimators import H2ORandomForestEstimator
import h2o.grid

grid_search = h2o.grid.H2OGridSearch(
    H2ORandomForestEstimator(
        nfolds=5,
        # Fixed seed so the randomised parts of model building are
        # repeatable between runs of the notebook.
        seed=42,
    ),
    hyper_params={
        "ntrees": [25, 50, 100],
        "mtries": [3, 4, 5],
    },
)

# Keyword arguments make explicit which value is the predictor list, the
# response column and the training frame.
grid_search.train(x=x, y=y, training_frame=train)
grid_search.summary()

drf Grid Build progress: |████████████████████████████████████████████████| 100%

Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_7,100.0,100.0,29021.0,6.0,10.0,7.33,11.0,23.0,17.38
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_0,25.0,25.0,7478.0,4.0,11.0,7.48,10.0,25.0,18.04
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_5,50.0,50.0,13967.0,5.0,10.0,6.82,11.0,26.0,16.56
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_6,100.0,100.0,31595.0,5.0,11.0,7.7,13.0,25.0,19.4
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_1,25.0,25.0,7249.0,6.0,11.0,7.28,11.0,23.0,17.4
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_3,50.0,50.0,15160.0,5.0,10.0,7.34,13.0,27.0,18.44
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_4,50.0,50.0,14495.0,5.0,9.0,7.1,13.0,24.0,17.4
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_8,100.0,100.0,27695.0,5.0,10.0,6.87,10.0,22.0,16.34
Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_2,25.0,25.0,7174.0,5.0,9.0,7.08,11.0,22.0,17.2


In [7]:
# Re-order the grid's models by area under the ROC curve (AUC);
# decreasing=True puts the highest-AUC model first.
grid_sorted = grid_search.get_grid(
    sort_by='auc',
    decreasing=True,
)

In [8]:
# Get the best performing model: the grid was sorted by AUC in decreasing
# order, so the first entry is the top-ranked one.
best_model = grid_sorted.models[0]

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare `print best_model` statement is a SyntaxError on Python 3.
print(best_model)

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  Grid_DRF_py_4_sid_8840_model_python_1489910935007_1_model_7


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.0305174261869
RMSE: 0.174692375869
LogLoss: 0.0996434315183
Mean Per-Class Error: 0.0337659919674
AUC: 0.99227331968
Gini: 0.98454663936
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.333333333333: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,274.0,13.0,0.0453,(13.0/287.0)
4,3.0,128.0,0.0229,(3.0/131.0)
Total,277.0,141.0,0.0383,(16.0/418.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3333333,0.9411765,68.0
max f2,0.1666667,0.9703704,78.0
max f0point5,0.6764706,0.9389671,54.0
max accuracy,0.4117647,0.9617225,64.0
max precision,1.0,1.0,0.0
max recall,0.1666667,1.0,78.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.3333333,0.9142212,68.0
max min_per_class_accuracy,0.4117647,0.9616725,64.0


Gains/Lift Table: Avg response rate: 31.34 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.1220096,1.0,3.1908397,3.1908397,1.0,1.0,0.3893130,0.3893130,219.0839695,219.0839695
,2,0.1698565,0.99,3.1908397,3.1908397,1.0,1.0,0.1526718,0.5419847,219.0839695,219.0839695
,3,0.2105263,0.98,3.1908397,3.1908397,1.0,1.0,0.1297710,0.6717557,219.0839695,219.0839695
,4,0.3014354,0.757,3.1908397,3.1908397,1.0,1.0,0.2900763,0.9618321,219.0839695,219.0839695
,5,0.4019139,0.02,0.3798619,2.4880952,0.1190476,0.7797619,0.0381679,1.0,-62.0138132,148.8095238
,6,1.0,0.0,0.0,1.0,0.0,0.3133971,0.0,1.0,-100.0,0.0




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.0292942419685
RMSE: 0.17115560747
LogLoss: 0.0957910207191
Mean Per-Class Error: 0.0331010452962
AUC: 0.993363832221
Gini: 0.986727664441
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.39: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,276.0,11.0,0.0383,(11.0/287.0)
4,4.0,127.0,0.0305,(4.0/131.0)
Total,280.0,138.0,0.0359,(15.0/418.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.39,0.9442379,37.0
max f2,0.19,0.9718101,46.0
max f0point5,0.84,0.9481216,14.0
max accuracy,0.39,0.9641148,37.0
max precision,1.0,1.0,0.0
max recall,0.19,1.0,46.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.39,0.9184852,37.0
max min_per_class_accuracy,0.43,0.9616725,36.0


Gains/Lift Table: Avg response rate: 31.34 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0909091,1.0,3.1908397,3.1908397,1.0,1.0,0.2900763,0.2900763,219.0839695,219.0839695
,2,0.1100478,0.99,3.1908397,3.1908397,1.0,1.0,0.0610687,0.3511450,219.0839695,219.0839695
,3,0.1578947,0.97,3.1908397,3.1908397,1.0,1.0,0.1526718,0.5038168,219.0839695,219.0839695
,4,0.2009569,0.926,3.0135708,3.1528535,0.9444444,0.9880952,0.1297710,0.6335878,201.3570823,215.2853508
,5,0.3014354,0.5920000,2.6590331,2.9882467,0.8333333,0.9365079,0.2671756,0.9007634,165.9033079,198.8246698
,6,0.4043062,0.05,0.9646725,2.4733728,0.3023256,0.7751479,0.0992366,1.0,-3.5327534,147.3372781
,7,0.5406699,0.0001869,0.0,1.8495575,0.0,0.5796460,0.0,1.0,-100.0,84.9557522
,8,1.0,0.0,0.0,1.0,0.0,0.3133971,0.0,1.0,-100.0,0.0



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9712623,0.0040467,0.9746835,0.9666666,0.9746835,0.9777778,0.9625
auc,0.9945725,0.0016775,0.9972089,0.9906727,0.9963716,0.9954454,0.9931641
err,0.0287377,0.0040467,0.0253165,0.0333333,0.0253165,0.0222222,0.0375
err_count,2.4,0.3464102,2.0,3.0,2.0,2.0,3.0
f0point5,0.9394541,0.0257182,0.9545454,0.9574468,0.942029,0.9736842,0.8695652
f1,0.9505693,0.0142494,0.9545454,0.9473684,0.962963,0.9736842,0.9142857
f2,0.9628867,0.0114626,0.9545454,0.9375,0.9848485,0.9736842,0.9638554
lift_top_group,3.420248,0.622672,3.590909,3.1034484,3.0384614,2.368421,5.0
logloss,0.0951827,0.0083406,0.0919304,0.1142949,0.0780895,0.0994639,0.0921349


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-03-19 09:09:34,16.347 sec,0.0,,,,,
,2017-03-19 09:09:34,16.349 sec,1.0,0.2401922,1.9926217,0.9268908,3.1160544,0.0576923
,2017-03-19 09:09:34,16.352 sec,2.0,0.2312865,1.8129191,0.9278132,3.1908397,0.0524017
,2017-03-19 09:09:34,16.354 sec,3.0,0.2362888,1.7377125,0.9396596,3.1908397,0.0598007
,2017-03-19 09:09:34,16.357 sec,4.0,0.2278074,1.4762785,0.9480775,3.1908397,0.0538922
---,---,---,---,---,---,---,---,---
,2017-03-19 09:09:35,16.862 sec,96.0,0.1749484,0.1010828,0.9921004,3.1908397,0.0382775
,2017-03-19 09:09:35,16.871 sec,97.0,0.1751054,0.1012793,0.9921270,3.1908397,0.0382775
,2017-03-19 09:09:35,16.880 sec,98.0,0.1751858,0.1001145,0.9923797,3.1908397,0.0358852



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Uniformity of Cell Shape,3481.7131348,1.0,0.4420369
Uniformity of Cell Size,1408.1503906,0.4044418,0.1787782
Bare Nuclei,1386.9001465,0.3983384,0.1760803
Bland Chromatin,638.2587891,0.1833175,0.0810331
Clump Thickness,317.6760254,0.0912413,0.0403320
Single Epithelial Cell Size,284.1733093,0.0816188,0.0360785
Normal Nucleoli,225.9375916,0.0648926,0.0286849
Marginal Adhesion,102.5696335,0.0294595,0.0130222
Mitoses,31.1426640,0.0089446,0.0039539





In [9]:
# Out-of-sample evaluation: score the selected model on the held-out test
# frame, which was not passed to grid_search.train().
performance = best_model.model_performance(test_data=test)

# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare `print performance` statement is a SyntaxError on Python 3.
print(performance)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.0301401566832
RMSE: 0.173609206793
LogLoss: 0.112516749709
Mean Per-Class Error: 0.0253322700691
AUC: 0.992105263158
Gini: 0.984210526316
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.46: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,167.0,4.0,0.0234,(4.0/171.0)
4,3.0,107.0,0.0273,(3.0/110.0)
Total,170.0,111.0,0.0249,(7.0/281.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.46,0.9683258,30.0
max f2,0.26,0.9784560,35.0
max f0point5,0.48,0.9686347,28.0
max accuracy,0.46,0.9750890,30.0
max precision,1.0,1.0,0.0
max recall,0.01,1.0,48.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.46,0.9478250,30.0
max min_per_class_accuracy,0.46,0.9727273,30.0


Gains/Lift Table: Avg response rate: 39.15 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0889680,1.0,2.5545455,2.5545455,1.0,1.0,0.2272727,0.2272727,155.4545455,155.4545455
,2,0.1316726,0.99,2.5545455,2.5545455,1.0,1.0,0.1090909,0.3363636,155.4545455,155.4545455
,3,0.1814947,0.97,2.5545455,2.5545455,1.0,1.0,0.1272727,0.4636364,155.4545455,155.4545455
,4,0.2064057,0.96,2.5545455,2.5545455,1.0,1.0,0.0636364,0.5272727,155.4545455,155.4545455
,5,0.3024911,0.84,2.3653199,2.4944385,0.9259259,0.9764706,0.2272727,0.7545455,136.5319865,149.4438503
,6,0.4056940,0.37,2.1141066,2.3976874,0.8275862,0.9385965,0.2181818,0.9727273,111.4106583,139.7687400
,7,0.5088968,0.02,0.1761755,1.9471710,0.0689655,0.7622378,0.0181818,0.9909091,-82.3824451,94.7171011
,8,1.0,0.0,0.0185112,1.0,0.0072464,0.3914591,0.0090909,1.0,-98.1488801,0.0






In [10]:
# Shut the H2O cluster down now that the analysis is finished. Any cell
# that uses H2O after this point needs h2o.init() to be run again first.
cluster = h2o.cluster()
cluster.shutdown()

H2O session _sid_8840 closed.
