In [1]:
# Attach this Python session to an H2O cluster on the local machine.
# NOTE(review): the captured output shows init() connecting to an instance that
# was already running, so nthreads / max_mem_size presumably only matter when a
# fresh instance has to be launched -- confirm against the H2O docs.
import h2o

h2o.init(
    nthreads=-1,        # -1 asks for all cores (output reports 8 of 8 allowed)
    max_mem_size="8G",  # requested memory ceiling for the cluster
)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,2 hours 0 mins
H2O cluster version:,3.10.4.1
H2O cluster version age:,16 days
H2O cluster name:,H2O_from_python_jurgentas_v21f3g
H2O cluster total nodes:,1
H2O cluster free memory:,6.478 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [2]:
# Lending Club "Bad Loans" dataset (a cleaned-up version).
# Goal: predict whether a loan turns out bad, i.e. is not repaid to the lender.
url = "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"

# Parse the remote CSV into an H2OFrame, then show per-column summary statistics.
df_hex = h2o.import_file(path=url)
df_hex.describe()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,int,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,0.0,0.0,
mean,13074.1691415,,13.7159040656,5.684352933,,71915.6705197,,,15.8815301213,0.227357006063,54.0791728024,24.5797338343,0.183038899425,14.8542736554,
maxs,35000.0,,26.06,10.0,,7141778.0,,,39.99,29.0,150.7,118.0,1.0,65.0,
sigma,7993.55618873,,4.39193987055,3.6106637311,,59070.9156549,,,7.58766822419,0.694167922928,25.2853667668,11.6851903659,0.386699589608,6.94773292255,
zeros,0,,0,14248,,0,,,270,139459,1562,0,133971,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.65,0.0,83.7,9.0,0.0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1.0,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0.0,10.0,not verified


In [3]:
# Turn the response column into a factor (categorical) column:
df_hex['bad_loan'] = df_hex['bad_loan'].asfactor()

# Split into three frames: roughly 70% training, 15% validation and the
# remaining ~15% test. The fixed seed keeps the split repeatable.
splits = df_hex.split_frame(ratios=[0.7, 0.15], seed=123)
train, valid, test = splits

In [4]:
# Response column, plus the columns that must not be used as predictors:
y = 'bad_loan'
excluded_cols = [
    y,           # the response itself
    'int_rate',  # the interest rate is correlated with the outcome
]

# Predictors = every column of the frame except the excluded ones.
# list.remove() raises ValueError if an excluded column is unexpectedly absent.
x = list(df_hex.columns)
for excluded_col in excluded_cols:
    x.remove(excluded_col)

In [5]:
# Hyperparameter search over the L1 / L2 penalties of a deep learning model.

from h2o.estimators import H2ODeepLearningEstimator 
import h2o.grid

# Settings shared by every model in the grid; only l1 and l2 vary.
base_model = H2ODeepLearningEstimator(
    training_frame=train,
    validation_frame=valid,
    hidden=[10, 10],     # hidden layer sizes
    standardize=True,
    seed=123,
)

# Candidate penalty strengths, tried for both l1 and l2
# (4 x 4 = 16 combinations if the full grid is built).
penalties = [1e-7, 1e-5, 1e-3, 1e-1]
search_space = {
    'l1': list(penalties),
    'l2': list(penalties),
}

grid_search = h2o.grid.H2OGridSearch(base_model, hyper_params=search_space)

# Fit one model per hyperparameter combination, then print the grid summary.
grid_search.train(x=x, y=y, training_frame=train)
grid_search.summary()

deeplearning Grid Build progress: |███████████████████████████████████████| 100%

Grid Summary:



0,1,2,3,4,5,6,7,8,9,10,11,12,13
Model Id,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_8,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_9,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_0,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_4,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_1,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_2,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_6,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_10,1,87,Input,0.0,,,,,,,,,
Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_5,1,87,Input,0.0,,,,,,,,,


In [6]:
# Rank the grid's models by AUC, highest first:
grid_sorted = grid_search.get_grid(
    sort_by='auc',
    decreasing=True,
)

In [7]:
# Take the top-ranked model; grid_sorted is ordered by descending AUC,
# so index 0 is the best performer.
best_model = grid_sorted.models[0]

# print(...) with a single argument gives the same output on Python 2 and 3;
# the former `print best_model` statement is a SyntaxError on Python 3.
print(best_model)

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  Grid_DeepLearning_py_4_sid_9a15_model_python_1490003335373_527_model_0


ModelMetricsBinomial: deeplearning
** Reported on train data. **

MSE: 0.135737319748
RMSE: 0.368425460233
LogLoss: 0.433069764018
Mean Per-Class Error: 0.348722176508
AUC: 0.699461902937
Gini: 0.398923805875
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.193269021917: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,5537.0,2581.0,0.3179,(2581.0/8118.0)
1,678.0,1087.0,0.3841,(678.0/1765.0)
Total,6215.0,3668.0,0.3298,(3259.0/9883.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1932690,0.4001472,234.0
max f2,0.1173614,0.5578963,302.0
max f0point5,0.3027237,0.3667263,154.0
max accuracy,0.5882401,0.8225235,23.0
max precision,0.7841934,1.0,0.0
max recall,0.0226188,1.0,395.0
max specificity,0.7841934,1.0,0.0
max absolute_mcc,0.1815536,0.2362417,244.0
max min_per_class_accuracy,0.1815536,0.6498584,244.0


Gains/Lift Table: Avg response rate: 17.86 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100172,0.5682158,2.8279967,2.8279967,0.5050505,0.5050505,0.0283286,0.0283286,182.7996681,182.7996681
,2,0.0200344,0.5251605,2.8279967,2.8279967,0.5050505,0.5050505,0.0283286,0.0566572,182.7996681,182.7996681
,3,0.0300516,0.4977280,2.0927175,2.5829036,0.3737374,0.4612795,0.0209632,0.0776204,109.2717544,158.2903635
,4,0.0400688,0.4752862,2.6583169,2.6017569,0.4747475,0.4646465,0.0266289,0.1042493,165.8316880,160.1756946
,5,0.0500860,0.4517119,2.2058374,2.5225730,0.3939394,0.4505051,0.0220963,0.1263456,120.5837411,152.2573039
,6,0.1000708,0.3700222,2.0629492,2.2929935,0.3684211,0.4095046,0.1031161,0.2294618,106.2949158,129.2993466
,7,0.1500557,0.3168556,1.7342375,2.1068671,0.3097166,0.3762643,0.0866856,0.3161473,73.4237479,110.6867062
,8,0.2000405,0.2751630,1.5528793,1.9684402,0.2773279,0.3515427,0.0776204,0.3937677,55.2879311,96.8440178
,9,0.3000101,0.2214321,1.3431839,1.7600917,0.2398785,0.3143339,0.1342776,0.5280453,34.3183930,76.0091722




ModelMetricsBinomial: deeplearning
** Reported on validation data. **

MSE: 0.141022349388
RMSE: 0.375529425462
LogLoss: 0.449204366578
Mean Per-Class Error: 0.373659639526
AUC: 0.672044107314
Gini: 0.344088214627
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.197781362301: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,13607.0,6432.0,0.321,(6432.0/20039.0)
1,1907.0,2556.0,0.4273,(1907.0/4463.0)
Total,15514.0,8988.0,0.3403,(8339.0/24502.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1977814,0.3800461,233.0
max f2,0.0928016,0.5459572,328.0
max f0point5,0.3078505,0.3475830,154.0
max accuracy,0.6166843,0.8183822,23.0
max precision,0.8149441,1.0,0.0
max recall,0.0095775,1.0,397.0
max specificity,0.8149441,1.0,0.0
max absolute_mcc,0.2589413,0.2040572,186.0
max min_per_class_accuracy,0.1787242,0.6249160,248.0


Gains/Lift Table: Avg response rate: 18.21 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100400,0.5723105,2.7226974,2.7226974,0.4959350,0.4959350,0.0273359,0.0273359,172.2697373,172.2697373
,2,0.0200392,0.5275882,2.3080531,2.5157975,0.4204082,0.4582485,0.0230786,0.0504145,130.8053062,151.5797462
,3,0.0300384,0.4979634,2.2856448,2.4391841,0.4163265,0.4442935,0.0228546,0.0732691,128.5644780,143.9184137
,4,0.0400375,0.4754619,2.5097276,2.4568020,0.4571429,0.4475025,0.0250952,0.0983643,150.9727602,145.6802026
,5,0.0500367,0.4525748,2.1287868,2.3912525,0.3877551,0.4355628,0.0212861,0.1196505,112.8786805,139.1252491
,6,0.1000327,0.3701851,1.9405572,2.1659968,0.3534694,0.3945328,0.0970199,0.2166704,94.0557235,116.5996804
,7,0.1500286,0.3160905,1.6268413,1.9863272,0.2963265,0.3618063,0.0813354,0.2980058,62.6841285,98.6327187
,8,0.2000245,0.2783984,1.4072401,1.8415850,0.2563265,0.3354417,0.0703563,0.3683621,40.7240119,84.1584960
,9,0.3000163,0.2240631,1.2190105,1.6340884,0.2220408,0.2976466,0.1218911,0.4902532,21.9010549,63.4088387



Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2017-03-20 12:49:55,0.000 sec,,0.0,0,0.0,,,,,,,,,,
,2017-03-20 12:49:56,10.057 sec,327320 obs/sec,0.8716690,1,100160.0,0.3894436,0.5059008,0.6804462,2.8279967,0.2965699,0.3941321,0.5210808,0.6584505,2.5218426,0.3900906
,2017-03-20 12:50:01,22.941 sec,298240 obs/sec,10.4469566,12,1200418.0,0.3684255,0.4330698,0.6994619,2.8279967,0.3297582,0.3755294,0.4492044,0.6720441,2.7226974,0.3403396





In [8]:
# Out-of-sample check: score the selected model on the held-out test frame,
# which was used neither for training nor for ranking the grid.
performance = best_model.model_performance(test)

# print(...) with a single argument gives the same output on Python 2 and 3;
# the former `print performance` statement is a SyntaxError on Python 3.
print(performance)


ModelMetricsBinomial: deeplearning
** Reported on test data. **

MSE: 0.142056315052
RMSE: 0.376903588537
LogLoss: 0.450833988283
Mean Per-Class Error: 0.367452866903
AUC: 0.678024744738
Gini: 0.356049489477
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.190358430689: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,13286.0,6754.0,0.337,(6754.0/20040.0)
1,1824.0,2715.0,0.4019,(1824.0/4539.0)
Total,15110.0,9469.0,0.349,(8578.0/24579.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1903584,0.3876356,240.0
max f2,0.0942917,0.5543445,328.0
max f0point5,0.3163128,0.3474685,150.0
max accuracy,0.6793372,0.8157370,10.0
max precision,0.8158268,1.0,0.0
max recall,0.0138815,1.0,397.0
max specificity,0.8158268,1.0,0.0
max absolute_mcc,0.2055815,0.2087703,227.0
max min_per_class_accuracy,0.1791025,0.6295409,250.0


Gains/Lift Table: Avg response rate: 18.47 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100085,0.5696327,2.5314349,2.5314349,0.4674797,0.4674797,0.0253360,0.0253360,153.1434881,153.1434881
,2,0.0200171,0.5275767,2.3993600,2.4653974,0.4430894,0.4552846,0.0240141,0.0493501,139.9360018,146.5397450
,3,0.0300256,0.4959442,2.2672852,2.3993600,0.4186992,0.4430894,0.0226922,0.0720423,126.7285155,139.9360018
,4,0.0400342,0.4736945,2.2232602,2.3553351,0.4105691,0.4349593,0.0222516,0.0942939,122.3260200,135.5335064
,5,0.0500020,0.4526214,1.9450045,2.2735361,0.3591837,0.4198535,0.0193875,0.1136814,94.5004519,127.3536053
,6,0.1000041,0.3704414,2.0047653,2.1391507,0.3702197,0.3950366,0.1002423,0.2139238,100.4765318,113.9150686
,7,0.1500061,0.3173759,1.6170305,1.9651106,0.2986168,0.3628967,0.0808548,0.2947786,61.7030488,96.5110620
,8,0.2000081,0.2777447,1.4407874,1.8340298,0.2660700,0.3386900,0.0720423,0.3668209,44.0787383,83.4029811
,9,0.3000122,0.2241326,1.2997929,1.6559508,0.2400325,0.3058042,0.1299846,0.4968055,29.9792899,65.5950840






In [9]:
# Shut the H2O cluster down now that the analysis is finished.
# NOTE(review): the first cell attached to an instance that had already been up
# for two hours, so this presumably stops it for any other client as well --
# confirm that is intended.
h2o_cluster = h2o.cluster()
h2o_cluster.shutdown()

H2O session _sid_9a15 closed.
