In [1]:
# connect to h2o cluster:
import h2o
h2o.init(nthreads=-1, max_mem_size="8G")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,18 days 17 hours 45 mins
H2O cluster version:,3.10.4.8
H2O cluster version age:,5 months and 14 days !!!
H2O cluster name:,H2O_from_python_jurgentas_t81piy
H2O cluster total nodes:,1
H2O cluster free memory:,6.920 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [2]:
# This breast cancer databases was obtained from the University of Wisconsin
# Hospitals, Madison from Dr. William H. Wolberg. 

import pandas as pd

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
names = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape',
         'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
         'Mitoses', 'Class']
df = pd.read_csv(url, names = names)
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# construct h2o dataframe:
df_hex = h2o.H2OFrame(df, column_names = df.columns.tolist())
df_hex['Class'] = df_hex['Class'].asfactor() # encode response variable
df_hex.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:699
Cols:11




Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
type,int,int,int,int,int,int,int,int,int,int,enum
mins,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
mean,1071704.0987124462,4.417739628040058,3.1344778254649475,3.207439198855506,2.806866952789701,3.2160228898426304,3.5446559297218094,3.4377682403433494,2.8669527896995723,1.5894134477825477,
maxs,13454352.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,
sigma,617095.7298192448,2.815740658594933,3.0514591099542008,2.9719127672157133,2.855379239217023,2.214299886649047,3.643857160492912,2.4383642523242512,3.0536338936127745,1.715077942506795,
zeros,0,0,0,0,0,0,0,0,0,0,
missing,0,0,0,0,0,0,16,0,0,0,0
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,2
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,2


In [4]:
# predictor variables:
x = ['Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion', 
     'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli','Mitoses']
# response:
y = 'Class'

In [5]:
# split data into train and testing:
train, test = df_hex.split_frame(ratios=[0.6], seed = 123)

In [6]:
# hyperparameter search (random forest)
from h2o.estimators import H2ORandomForestEstimator 
import h2o.grid

grid_search = h2o.grid.H2OGridSearch(
    H2ORandomForestEstimator(
        nfolds = 5,
        balance_classes = True,
        seed = 123
    ),
    hyper_params ={
        "ntrees": [25, 50, 100],
        "mtries": [3, 4, 5],
    }   
)

grid_search.train(x, y, train)
grid_search.summary()

drf Grid Build progress: |████████████████████████████████████████████████| 100%

Grid Summary:



0,1,2,3,4,5,6,7,8,9
Model Id,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_6,100.0,100.0,36539.0,6.0,14.0,8.79,15.0,32.0,23.31
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_3,50.0,50.0,18220.0,6.0,11.0,8.7,15.0,31.0,23.24
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_7,100.0,100.0,34439.0,6.0,13.0,8.65,15.0,30.0,21.71
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_8,100.0,100.0,33136.0,6.0,11.0,8.42,14.0,28.0,20.69
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_5,50.0,50.0,16451.0,6.0,11.0,8.38,14.0,26.0,20.52
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_4,50.0,50.0,17295.0,7.0,13.0,8.74,16.0,30.0,21.84
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_2,25.0,25.0,8386.0,6.0,11.0,8.36,16.0,26.0,21.08
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_0,25.0,25.0,9325.0,6.0,11.0,8.64,17.0,30.0,23.96
Grid_DRF_py_4_sid_8306_model_python_1508176779571_11750_model_1,25.0,25.0,8825.0,7.0,13.0,8.84,16.0,30.0,22.48


In [7]:
# sorted on the specified area under the ROC curve
grid_sorted = grid_search.get_grid(sort_by='auc', decreasing=True)

In [8]:
# get the best performing model:
best_model = grid_sorted.models[0]
print best_model 

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-8-e80177e34a75>, line 3)

In [None]:
# out-of-sample:
performance = best_model.model_performance(test)
print performance

In [None]:
h2o.cluster().shutdown()