In [2]:
# Start (or attach to) the H2O cluster used by every later cell.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'imp'". The stdlib `imp` module was
# removed in Python 3.12, so presumably the installed h2o release predates
# that interpreter -- confirm and upgrade h2o before re-running.
import h2o

h2o.init(
    nthreads=-1,
    max_mem_size="8G",
)

ModuleNotFoundError: No module named 'imp'

In [None]:
# This breast cancer database was obtained from the University of Wisconsin
# Hospitals, Madison, from Dr. William H. Wolberg.

import pandas as pd

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
# Column labels are passed explicitly via `names=` below.
names = ['Sample code number','Clump Thickness','Uniformity of Cell Size','Uniformity of Cell Shape',
         'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
         'Mitoses', 'Class']
# Missing values in this dataset are written as '?' (see the dataset's
# documentation). Declaring that marker as NA keeps 'Bare Nuclei' numeric
# instead of letting the stray '?' strings turn the column into object dtype.
df = pd.read_csv(url, names = names, na_values = '?')
df.head()

In [None]:
# Build the H2O frame from the pandas frame, reusing the pandas column labels.
df_hex = h2o.H2OFrame(
    df,
    column_names=list(df.columns),
)
# Convert the response column to a factor before modelling.
df_hex['Class'] = df_hex['Class'].asfactor()
df_hex.describe()

In [None]:
# Feature columns passed to the model as predictors. 'Sample code number'
# and the response column are not listed here.
x = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]
# Response column to predict.
y = 'Class'

In [None]:
# Partition the frame into a training part (ratio 0.6) and a test part (the
# remainder); the fixed seed is passed so the split can be repeated.
train, test = df_hex.split_frame(
    ratios=[0.6],
    seed=123,
)

In [None]:
# Random forest hyperparameter search: one model per (ntrees, mtries) pair.
from h2o.estimators import H2ORandomForestEstimator
import h2o.grid

grid_search = h2o.grid.H2OGridSearch(
    # Settings shared by every forest in the grid.
    H2ORandomForestEstimator(nfolds=5, balance_classes=True, seed=123),
    # Values to try for each tuned hyperparameter.
    hyper_params={
        "ntrees": [25, 50, 100],
        "mtries": [3, 4, 5],
    },
)

# Fit every grid member on the training frame, then show the grid summary.
grid_search.train(x=x, y=y, training_frame=train)
grid_search.summary()

In [None]:
# Order the grid's models by AUC, highest first.
grid_sorted = grid_search.get_grid(
    sort_by='auc',
    decreasing=True,
)

In [None]:
# Take the top-ranked model: grid_sorted is ordered by descending AUC, so
# index 0 is the best performer.
best_model = grid_sorted.models[0]
# print() as a function call: the Python 2 statement form `print best_model`
# is a SyntaxError on Python 3.
print(best_model)

In [None]:
# Out-of-sample evaluation: score the selected model on the held-out test frame.
performance = best_model.model_performance(test)
# print() as a function call: the Python 2 statement form `print performance`
# is a SyntaxError on Python 3.
print(performance)

In [None]:
# Shut down the H2O cluster now that the analysis is finished; cells that use
# h2o must not be re-run after this without calling h2o.init() again.
h2o.cluster().shutdown()