In [6]:
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Input locations, relative to the notebook's working directory.
# NOTE(review): despite the `_DIR` suffix both constants are file paths, not
# directories; they are passed as `path=` to `h2o.import_file` below.
CLASS_DIR = 'Datasets/leukemia-class.txt'   # class labels
DATA_DIR = 'Datasets/leukemia-data.csv'     # measurement matrix

In [7]:
# Initialise H2O cluster
# Connect to the H2O instance and clear objects left over from earlier runs so
# that re-executing the notebook starts from an empty cluster.
h2o.init()
h2o.remove_all()

# Both files are comma-separated; header=-1 tells the H2O parser to treat the
# first line as data rather than as column names.
leukemia_class = h2o.import_file(path=CLASS_DIR, header=-1, sep=',')
leukemia_data = h2o.import_file(path=DATA_DIR, header=-1, sep=',')

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 mins 26 secs
H2O cluster timezone:,Australia/Sydney
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.6
H2O cluster version age:,1 month and 20 days
H2O cluster name:,H2O_from_python_andyg_84jkj8
H2O cluster total nodes:,1
H2O cluster free memory:,2.964 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [8]:
# Data preparation and manipulation
def transpose_frame(h2o_df, column_name):
    """Transpose an H2OFrame by round-tripping it through pandas.

    Args:
        h2o_df: Frame to transpose; it must provide
            ``as_data_frame(use_pandas=True)``.
        column_name: Name given to the single column of the transposed
            frame. When falsy (e.g. ``''``), the columns are instead named
            ``C0``, ``C1``, ... in positional order.

    Returns:
        A new ``h2o.H2OFrame`` built from the transposed pandas DataFrame.

    Raises:
        ValueError: Propagated from pandas when ``column_name`` is given but
            the transposed frame does not have exactly one column.
    """
    transposed = h2o_df.as_data_frame(use_pandas=True).transpose()
    if not column_name:
        # No explicit name supplied: generate positional names C0..C(n-1).
        transposed.columns = ['C' + str(idx) for idx in range(len(transposed.columns))]
    else:
        transposed.columns = [column_name]
    return h2o.H2OFrame(transposed)

def randomize_frame(h2o_df, seed=None):
    """Return a new H2OFrame holding the rows of ``h2o_df`` in random order.

    The frame is pulled into pandas, its rows are permuted, the index is
    reset to 0..n-1, and the result is uploaded as a fresh H2OFrame.

    Args:
        h2o_df: Frame to shuffle; it must provide
            ``as_data_frame(use_pandas=True)``.
        seed: Optional value forwarded to pandas as ``random_state``. Pass a
            fixed integer for a repeatable permutation; ``None`` (the
            default) gives a different order on each call.

    Returns:
        A new ``h2o.H2OFrame`` with the same rows as ``h2o_df``, shuffled.
    """
    pd_df = h2o_df.as_data_frame(use_pandas=True)
    # DataFrame.sample returns a NEW frame; the result must be kept, otherwise
    # the rows are handed back in their original order.
    shuffled = pd_df.sample(frac=1, random_state=seed).reset_index(drop=True)
    return h2o.H2OFrame(shuffled)

# Transpose both inputs and bind the class column onto the data columns.
# NOTE(review): presumably the raw files store one sample per column, so
# after transposing each row is one sample -- confirm against the data files.
leukemia_class_t = transpose_frame(leukemia_class, 'class')
leukemia_data_t = transpose_frame(leukemia_data, '')
leukemia = randomize_frame(leukemia_data_t.cbind(leukemia_class_t))

label = 'class'
# Every column except the label is a predictor. Build a fresh list instead of
# calling .remove() on the list returned by `leukemia.names`.
features = [name for name in leukemia.names if name != label]

# split_frame assigns rows to the splits at random, so the 0.75 ratio is only
# approximate. A fixed seed makes the assignment repeatable for a given frame.
SPLIT_SEED = 42
train, valid = leukemia.split_frame(ratios=[0.75], seed=SPLIT_SEED)

# Convert the label to a categorical (factor) column in both splits.
train[label] = train[label].asfactor()
valid[label] = valid[label].asfactor()

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
# Model properties and training
# Hyper-parameters are collected in one place so they are easy to read and tune.
dl_params = {
    "activation": "rectifier_with_dropout",
    "hidden": [190, 63, 21, 7],      # four hidden layers of decreasing width
    "epochs": 50,
    "input_dropout_ratio": 0.1,
}
model = H2ODeepLearningEstimator(**dl_params)

# Fit on the training split; the validation split is supplied so validation
# metrics are available on the trained model.
model.train(x=features, y=label, training_frame=train, validation_frame=valid)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [10]:
# Model predictions and validation
# Score the held-out validation frame; the recorded output shows one row per
# validation sample with the predicted class and a probability per class.
predictions = model.predict(valid)
print(predictions)
# `valid=True` is a keyword flag selecting the validation metrics; it is not
# the `valid` frame defined earlier.
print(model.confusion_matrix(valid=True))

deeplearning prediction progress: |███████████████████████████████████████| 100%


predict,ALL,AML
ALL,0.969184,0.0308155
ALL,0.990335,0.00966494
ALL,0.989068,0.0109324
ALL,0.989385,0.0106145
AML,0.317399,0.682601
AML,0.48659,0.51341
AML,0.302804,0.697196
AML,0.887542,0.112458
AML,0.677986,0.322014



Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.1124582322590477: 


0,1,2,3,4
,ALL,AML,Error,Rate
ALL,4.0,0.0,0.0,(0.0/4.0)
AML,0.0,5.0,0.0,(0.0/5.0)
Total,4.0,5.0,0.0,(0.0/9.0)



