In [1]:
# Initialise the H2O client and attach it to the local cluster.
# NOTE(review): per the H2O docs, nthreads=-1 requests all available cores and
# max_mem_size bounds the JVM heap of a freshly launched cluster -- confirm
# against the installed H2O version.
import h2o

h2o.init(
    nthreads=-1,
    max_mem_size="8G",
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,12 secs
H2O cluster version:,3.10.4.1
H2O cluster version age:,16 days
H2O cluster name:,H2O_from_python_jurgentas_0j1lo3
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://localhost:54321


In [2]:
# Wisconsin breast cancer database, obtained from the University of Wisconsin
# Hospitals, Madison (Dr. William H. Wolberg).
# Column labels are passed in explicitly via `names`, so the first line of the
# file is read as data rather than as a header.

import pandas as pd

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# One label per column, in file order: sample id, nine cytology features, label.
names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

df = pd.read_csv(url, names=names)
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# Upload the pandas frame to the H2O cluster, reusing the pandas column labels.
column_labels = df.columns.tolist()
df_hex = h2o.H2OFrame(df, column_names=column_labels)

# Convert the response to a categorical column (reported as `enum` in the
# summary below) so the model is fitted as a classifier.
df_hex['Class'] = df_hex['Class'].asfactor()

# Per-column summary of the uploaded frame (types, ranges, missing counts).
df_hex.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:699
Cols:11




Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
type,int,int,int,int,int,int,int,int,int,int,enum
mins,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
mean,1071704.09871,4.41773962804,3.13447782546,3.20743919886,2.80686695279,3.21602288984,3.54465592972,3.43776824034,2.8669527897,1.58941344778,
maxs,13454352.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,
sigma,617095.729819,2.81574065859,3.05145910995,2.97191276722,2.85537923922,2.21429988665,3.64385716049,2.43836425232,3.05363389361,1.71507794251,
zeros,0,0,0,0,0,0,0,0,0,0,
missing,0,0,0,0,0,0,16,0,0,0,0
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,2
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,2


In [4]:
# Feature columns used as model inputs: every column of the data set except
# the sample identifier and the class label.
x = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]

# Column the model is trained to predict.
y = 'Class'

In [5]:
# Partition the H2O frame into a training part (ratio 0.6) and a held-out test
# part, using a fixed seed. The split is not exact: the outputs further down
# report 415 training rows and 284 test rows out of 699.
split_ratios = [0.6]
train, test = df_hex.split_frame(ratios=split_ratios, seed=123)

In [6]:
# Random-forest hyperparameter search: every combination of ntrees x mtries
# (3 x 3 = 9 candidate models), each built with 5-fold cross-validation.
from h2o.estimators import H2ORandomForestEstimator 
import h2o.grid

# Settings shared by every candidate model in the grid.
rf_template = H2ORandomForestEstimator(nfolds=5, balance_classes=True, seed=123)

# Candidate values for each tuned hyperparameter.
rf_search_space = {
    "ntrees": [25, 50, 100],
    "mtries": [3, 4, 5],
}

grid_search = h2o.grid.H2OGridSearch(rf_template, hyper_params=rf_search_space)

# Fit all candidates on the training split, then list the resulting models.
grid_search.train(x=x, y=y, training_frame=train)
grid_search.summary()

drf Grid Build progress: |████████████████████████████████████████████████| 100%

Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_6,100.0,100.0,36536.0,6.0,14.0,8.79,15.0,32.0,23.31
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_3,50.0,50.0,18217.0,6.0,11.0,8.7,15.0,31.0,23.24
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_7,100.0,100.0,34438.0,6.0,13.0,8.65,15.0,30.0,21.71
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_8,100.0,100.0,33136.0,6.0,11.0,8.42,14.0,28.0,20.69
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_5,50.0,50.0,16451.0,6.0,11.0,8.38,14.0,26.0,20.52
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_4,50.0,50.0,17293.0,7.0,13.0,8.74,16.0,30.0,21.84
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_2,25.0,25.0,8386.0,6.0,11.0,8.36,16.0,26.0,21.08
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_0,25.0,25.0,9324.0,6.0,11.0,8.64,17.0,30.0,23.96
Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_1,25.0,25.0,8826.0,7.0,13.0,8.84,16.0,30.0,22.48


In [7]:
# Rank the grid's models by area under the ROC curve (AUC), highest first.
grid_sorted = grid_search.get_grid(
    sort_by='auc',
    decreasing=True,
)

In [8]:
# Take the top-ranked model from the AUC-sorted grid and print its full report
# (training metrics, cross-validation metrics, variable importances).
best_model = grid_sorted.models[0]
# A single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare `print best_model` statement is a SyntaxError on Python 3.
print(best_model)

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  Grid_DRF_py_4_sid_b1f5_model_python_1490003178820_1_model_6


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.0220863900178
RMSE: 0.1486149051
LogLoss: 0.0831881436702
Mean Per-Class Error: 0.0171683162569
AUC: 0.992794531681
Gini: 0.985589063362
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.367788461538: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,254.0,8.0,0.0305,(8.0/262.0)
4,1.0,262.0,0.0038,(1.0/263.0)
Total,255.0,270.0,0.0171,(9.0/525.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3677885,0.9831144,74.0
max f2,0.3245869,0.9924528,77.0
max f0point5,0.3677885,0.9754281,74.0
max accuracy,0.3677885,0.9828571,74.0
max precision,1.0,1.0,0.0
max recall,0.3245869,1.0,77.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.3677885,0.9660560,74.0
max min_per_class_accuracy,0.6357341,0.9694656,67.0


Gains/Lift Table: Avg response rate: 36.87 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.2072289,1.0,2.7124183,2.7124183,1.0,1.0,0.5620915,0.5620915,171.2418301,171.2418301
,2,0.3060241,0.9495202,2.7124183,2.7124183,1.0,1.0,0.2679739,0.8300654,171.2418301,171.2418301
,3,0.4,0.0840015,1.8082789,2.5,0.6666667,0.9216867,0.1699346,1.0,80.8278867,150.0
,4,1.0,0.0,0.0,1.0,0.0,0.3686747,0.0,1.0,-100.0,0.0




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.0395158212784
RMSE: 0.198785867904
LogLoss: 0.210126533113
Mean Per-Class Error: 0.0294367110712
AUC: 0.985855410867
Gini: 0.971710821733
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.282080924855: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,250.0,12.0,0.0458,(12.0/262.0)
4,2.0,151.0,0.0131,(2.0/153.0)
Total,252.0,163.0,0.0337,(14.0/415.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2820809,0.9556962,82.0
max f2,0.2292431,0.9743590,87.0
max f0point5,0.3802016,0.9459459,76.0
max accuracy,0.2880813,0.9662651,81.0
max precision,0.9807996,0.9855072,5.0
max recall,0.0114479,1.0,117.0
max specificity,1.0,0.9961832,0.0
max absolute_mcc,0.2820809,0.9297160,82.0
max min_per_class_accuracy,0.3802016,0.9607843,76.0


Gains/Lift Table: Avg response rate: 36.87 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.1036145,1.0,2.6493388,2.6493388,0.9767442,0.9767442,0.2745098,0.2745098,164.9338805,164.9338805
,2,0.1662651,0.9807996,2.7124183,2.6731079,1.0,0.9855072,0.1699346,0.4444444,171.2418301,167.3107891
,3,0.2,0.9370529,2.3249300,2.6143791,0.8571429,0.9638554,0.0784314,0.5228758,132.4929972,161.4379085
,4,0.3012048,0.7682147,2.5186741,2.5822222,0.9285714,0.952,0.2549020,0.7777778,151.8674136,158.2222222
,5,0.4,0.2425468,2.1170094,2.4673203,0.7804878,0.9096386,0.2091503,0.9869281,111.7009405,146.7320261
,6,0.5060241,0.0065554,0.1232917,1.9761905,0.0454545,0.7285714,0.0130719,1.0,-87.6708259,97.6190476
,7,1.0,0.0,0.0,1.0,0.0,0.3686747,0.0,1.0,-100.0,0.0



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9712092,0.0101445,0.9512195,0.9647059,0.9647059,0.9871795,0.9882353
auc,0.9893585,0.0031815,0.9861111,0.9842216,0.9886500,0.9972546,0.9905556
err,0.0287908,0.0101445,0.0487805,0.0352941,0.0352941,0.0128205,0.0117647
err_count,2.4,0.8485282,4.0,3.0,3.0,1.0,1.0
f0point5,0.9469680,0.0249276,0.9121622,0.9055118,0.9433962,0.9933775,0.9803922
f1,0.9586904,0.0163125,0.9310345,0.9387755,0.9523810,0.9836066,0.9876543
f2,0.9711739,0.0104901,0.9507042,0.9745763,0.9615384,0.9740260,0.9950249
lift_top_group,2.7275445,0.2840121,2.9285715,3.326087,2.7419355,2.516129,2.125
logloss,0.2085467,0.0965355,0.1497515,0.4813721,0.1327389,0.1400809,0.1387898


Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-03-20 10:46:48,14.816 sec,0.0,,,,,
,2017-03-20 10:46:48,14.819 sec,1.0,0.2258770,1.7621825,0.9476429,2.5767974,0.0510204
,2017-03-20 10:46:48,14.821 sec,2.0,0.1972408,1.1861200,0.9641712,2.6767286,0.0432099
,2017-03-20 10:46:48,14.823 sec,3.0,0.1898743,1.0250059,0.9696102,2.7124183,0.0388350
,2017-03-20 10:46:48,14.826 sec,4.0,0.1988998,1.0751365,0.9676475,2.7124183,0.0455531
---,---,---,---,---,---,---,---,---
,2017-03-20 10:46:48,15.370 sec,96.0,0.1483403,0.0825799,0.9930267,2.7124183,0.0171429
,2017-03-20 10:46:48,15.379 sec,97.0,0.1485541,0.0828902,0.9929759,2.7124183,0.0171429
,2017-03-20 10:46:48,15.390 sec,98.0,0.1482011,0.0827093,0.9929251,2.7124183,0.0171429



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Uniformity of Cell Size,4009.1406250,1.0,0.3462615
Uniformity of Cell Shape,2538.0102539,0.6330559,0.2192029
Bare Nuclei,2305.6059570,0.5750873,0.1991306
Bland Chromatin,807.4774170,0.2014091,0.0697402
Single Epithelial Cell Size,763.2233887,0.1903708,0.0659181
Normal Nucleoli,528.6521606,0.1318617,0.0456586
Clump Thickness,390.0749817,0.0972964,0.0336900
Marginal Adhesion,179.8247528,0.0448537,0.0155311
Mitoses,56.3510361,0.0140556,0.0048669





In [9]:
# Out-of-sample evaluation: score the selected model on the held-out test
# split and print the resulting metrics report.
performance = best_model.model_performance(test)
# A single-argument print(...) behaves identically on Python 2 and Python 3,
# whereas the bare `print performance` statement is a SyntaxError on Python 3.
print(performance)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.0211483859586
RMSE: 0.145424846428
LogLoss: 0.0787919742038
Mean Per-Class Error: 0.0178571428571
AUC: 0.994897959184
Gini: 0.989795918367
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.564243719936: 


0,1,2,3,4
,2.0,4.0,Error,Rate
2,192.0,4.0,0.0204,(4.0/196.0)
4,2.0,86.0,0.0227,(2.0/88.0)
Total,194.0,90.0,0.0211,(6.0/284.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5642437,0.9662921,23.0
max f2,0.4009552,0.9843400,27.0
max f0point5,0.6357341,0.9606481,20.0
max accuracy,0.5642437,0.9788732,23.0
max precision,1.0,1.0,0.0
max recall,0.4009552,1.0,27.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.5642437,0.9510377,23.0
max min_per_class_accuracy,0.5642437,0.9772727,23.0


Gains/Lift Table: Avg response rate: 30.99 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.1021127,1.0,3.2272727,3.2272727,1.0,1.0,0.3295455,0.3295455,222.7272727,222.7272727
,2,0.1021127,0.9948799,0.0,3.2272727,0.0,1.0,0.0,0.3295455,-100.0,222.7272727
,3,0.1549296,0.9829332,3.0121212,3.1539256,0.9333333,0.9772727,0.1590909,0.4886364,201.2121212,215.3925620
,4,0.2007042,0.9234864,3.2272727,3.1706539,1.0,0.9824561,0.1477273,0.6363636,222.7272727,217.0653908
,5,0.2992958,0.6369767,2.9967532,3.1133690,0.9285714,0.9647059,0.2954545,0.9318182,199.6753247,211.3368984
,6,0.4049296,0.0236659,0.6454545,2.4695652,0.2,0.7652174,0.0681818,1.0,-35.4545455,146.9565217
,7,1.0,0.0,0.0,1.0,0.0,0.3098592,0.0,1.0,-100.0,0.0






In [10]:
# Shut down the H2O cluster this notebook was connected to; the output below
# confirms the session is closed.
cluster = h2o.cluster()
cluster.shutdown()

H2O session _sid_b1f5 closed.
