In [1]:
# Attach to a local H2O cluster (one is started if none is running).
# nthreads=-1 is H2O's convention for "use every available core";
# the JVM heap is capped at 8 GB.
import h2o
h2o.init(max_mem_size="8G", nthreads=-1)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,12 secs
H2O cluster version:,3.10.4.1
H2O cluster version age:,15 days
H2O cluster name:,H2O_from_python_jurgentas_tsiqri
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://localhost:54321


In [2]:
# Title: Boston Housing Data

#CRIM     per capita crime rate by town
#ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
#INDUS    proportion of non-retail business acres per town
#CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
#NOX      nitric oxides concentration (parts per 10 million)
#RM       average number of rooms per dwelling
#AGE      proportion of owner-occupied units built prior to 1940
#DIS      weighted distances to five Boston employment centres
#RAD      index of accessibility to radial highways
#TAX      full-value property-tax rate per $10,000
#PTRATIO  pupil-teacher ratio by town
#B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
#LSTAT    % lower status of the population
#MEDV     Median value of owner-occupied homes in $1000's

import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
# Column labels are passed explicitly through names=.
# NOTE: 'PRATIO' differs from the documented attribute name PTRATIO; the
# spelling is kept because the predictor list in a later cell uses it too.
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' splits fields on runs of whitespace. It is the equivalent of
# delim_whitespace=True, which is deprecated in recent pandas releases.
df = pd.read_csv(url, sep=r'\s+', names=names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
# Upload the pandas frame to the H2O cluster, passing the column labels
# along explicitly, then show H2O's per-column summary.
df_hex = h2o.H2OFrame(
    df,
    column_names=list(df.columns),
)
df_hex.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:506
Cols:14




Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PRATIO,B,LSTAT,MEDV
type,real,real,real,int,real,real,real,real,int,int,real,real,real,real
mins,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
mean,3.61352355731,11.3636363636,11.1367786561,0.0691699604743,0.554695059289,6.28463438735,68.5749011858,3.79504268775,9.54940711462,408.23715415,18.4555335968,356.674031621,12.6530632411,22.5328063241
maxs,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0
sigma,8.60154510533,23.3224529945,6.8603529409,0.25399404134,0.115877675668,0.702617143415,28.1488614069,2.10571012663,8.70725938424,168.537116055,2.16494552371,91.2948643842,7.14106151135,9.19710408738
zeros,0,372,0,471,0,0,0,0,0,0,0,0,0,0
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [4]:
# Response column to be modelled.
y = 'MEDV'

# Predictor columns: every remaining attribute of the data set.
x = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PRATIO', 'B', 'LSTAT',
]

In [5]:
# split data into train and testing (roughly 70% / 30%):
# split_frame assigns rows at random, so a fixed seed is supplied to make the
# partition, and every metric computed from it, repeatable across runs.
train, test = df_hex.split_frame(ratios=[0.7], seed=1234)

In [6]:
# hyperparameter search (glm):

from h2o.estimators.glm import H2OGeneralizedLinearEstimator
import h2o.grid

# Gaussian GLM on standardized predictors, scored with 5-fold cross-validation.
# The seed fixes the random fold assignment so that the cross-validated
# comparison of the grid's models is repeatable across runs.
grid_search = h2o.grid.H2OGridSearch(
    H2OGeneralizedLinearEstimator(
        family="gaussian",
        nfolds=5,
        standardize=True,
        seed=1234
    ),
    hyper_params={
        # regularization strengths to compare
        "lambda": [0.0, 0.25, 0.5, 0.75, 1.0]
    }
)

# Keyword arguments make the role of each argument explicit.
grid_search.train(x=x, y=y, training_frame=train)
grid_search.summary()

glm Grid Build progress: |████████████████████████████████████████████████| 100%

Grid Summary:



0,1,2,3,4,5,6,7
Model Id,family,link,regularization,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_0,gaussian,identity,,13,13,0,py_3_sid_a204
Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_1,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.25 )",13,13,0,py_3_sid_a204
Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_3,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.75 )",13,11,0,py_3_sid_a204
Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_2,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 0.5 )",13,11,0,py_3_sid_a204
Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_4,gaussian,identity,"Elastic Net (alpha = 0.5, lambda = 1.0 )",13,10,0,py_3_sid_a204


In [7]:
# Rank the grid's models by r-squared, highest first.
grid_sorted = grid_search.get_grid(
    decreasing=True,
    sort_by='r2',
)

In [8]:
# get the best performing model (first entry of the r2-sorted grid):
best_model = grid_sorted.models[0]
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(best_model)

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  Grid_GLM_py_3_sid_a204_model_python_1489935727643_1_model_0


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 22.5736756111
RMSE: 4.75117623448
MAE: 3.35040497981
RMSLE: NaN
R^2: 0.743131740156
Mean Residual Deviance: 22.5736756111
Null degrees of freedom: 343
Residual degrees of freedom: 330
Null deviance: 30230.8444606
Residual deviance: 7765.34441021
AIC: 2078.40355525

ModelMetricsRegressionGLM: glm
** Reported on cross-validation data. **

MSE: 25.3521236433
RMSE: 5.03508923886
MAE: 3.52817709514
RMSLE: NaN
R^2: 0.711515483974
Mean Residual Deviance: 25.3521236433
Null degrees of freedom: 343
Residual degrees of freedom: 330
Null deviance: 30398.3943199
Residual deviance: 8721.13053329
AIC: 2118.33441065
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,3.5218015,0.1003883,3.7000654,3.4026937,3.3146899,3.6178334,3.5737255
mse,25.343704,3.4707189,32.832966,24.410088,19.315586,28.755594,21.40429
null_deviance,6079.6787,785.0452,7060.1787,6383.9165,4619.7544,7384.2964,4950.248
r2,0.7074678,0.0263880,0.6865011,0.7587768,0.7311631,0.7110230,0.6498750
residual_deviance,1744.2261,253.66455,2199.8088,1537.8356,1236.1975,2099.1584,1648.1302
rmse,5.010903,0.3424605,5.7300057,4.9406567,4.39495,5.3624244,4.6264772
rmsle,0.2709905,0.0540204,0.2224367,0.3106168,0.1931693,,0.3577393


Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iteration,negative_log_likelihood,objective
,2017-03-19 16:02:22,0.000 sec,0,30230.8443895,87.8803616





In [9]:
# out-of-sample: score the selected model on the held-out test frame.
performance = best_model.model_performance(test)
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(performance)


ModelMetricsRegressionGLM: glm
** Reported on test data. **

MSE: 21.1484707222
RMSE: 4.59874664688
MAE: 3.19898665343
RMSLE: 0.195933761084
R^2: 0.724580644743
Mean Residual Deviance: 21.1484707222
Null degrees of freedom: 161
Residual degrees of freedom: 148
Null deviance: 12507.1367651
Residual deviance: 3426.052257
AIC: 984.090035408



In [10]:
# Analysis finished: shut the H2O cluster down.
h2o_cluster = h2o.cluster()
h2o_cluster.shutdown()

H2O session _sid_a204 closed.
