In [4]:
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

CLASS_DIR = 'Datasets/lymphoma-class.txt'
DATA_DIR = 'Datasets/lymphoma-data.txt'

In [22]:
# Initialise H2O cluster
h2o.init()
h2o.remove_all()

lymphoma_class = h2o.import_file(path=CLASS_DIR, header=-1, sep='\t')
lymphoma_data = h2o.import_file(path=DATA_DIR, header=1, sep='\t')

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,18 mins 05 secs
H2O cluster timezone:,Australia/Sydney
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.6
H2O cluster version age:,1 month and 20 days
H2O cluster name:,H2O_from_python_andyg_omrkwi
H2O cluster total nodes:,1
H2O cluster free memory:,3.226 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [27]:
# Data preparation and manipulation
def transpose_frame(h2o_df, column_name):
    pd_df = h2o_df.as_data_frame(use_pandas=True)
    pd_df = pd_df.transpose()
    if column_name == 'class':
        pd_df.columns = [column_name]
    elif column_name == 'categories':
        pd_df.columns = pd_df.iloc[0]
        pd_df = pd_df.drop(labels=column_name, axis=0)
    else:
        pd_df.columns = ['C' + str(col) for col in range(len(pd_df.columns))]
    return h2o.H2OFrame(pd_df)

def randomize_frame(h2o_df):
    pd_df = h2o_df.as_data_frame(use_pandas=True)
    pd_df.sample(frac=1).reset_index(drop=True)
    return h2o.H2OFrame(pd_df)

lymphoma_class_t = transpose_frame(lymphoma_class, 'class')
lymphoma_data_t = transpose_frame(lymphoma_data, 'categories')
lymphoma = randomize_frame(lymphoma_data_t.cbind(lymphoma_class_t))

label = 'class'
features = lymphoma.names
features.remove(label)

train, valid = lymphoma.split_frame([0.75])
train[label] = train[label].asfactor()
valid[label] = valid[label].asfactor()

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [28]:
# Model properties and training
model = H2ODeepLearningEstimator(activation = "rectifier_with_dropout", hidden = [190,63,21,7], epochs = 50, input_dropout_ratio = 0.1)
model.train(x = features, y = label, training_frame = train, validation_frame = valid)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [34]:
# Model predictions and validation
predictions = model.predict(valid)
print(predictions)
print(model.confusion_matrix(valid))

deeplearning prediction progress: |███████████████████████████████████████| 100%


predict,p0,p1,p2
0,0.98733,5.08461e-06,0.0126648
0,0.962262,0.000281133,0.0374572
0,0.998716,1.90317e-08,0.00128442
0,0.992982,7.45278e-07,0.00701714
0,0.990252,1.17013e-06,0.00974686
0,0.995208,5.22845e-08,0.00479206
0,0.99044,8.14475e-08,0.00955979
0,0.989158,1.05307e-06,0.0108405
0,0.99722,2.72455e-08,0.00277995
0,0.995443,6.58942e-08,0.00455719



Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4
0.0,1.0,2.0,Error,Rate
13.0,0.0,0.0,0.0,0 / 13
0.0,0.0,3.0,1.0,3 / 3
0.0,0.0,3.0,0.0,0 / 3
13.0,0.0,6.0,0.1578947,3 / 19



