In [1]:
import h2o
from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator as pca

### h2o INIT

In [2]:
h2o.init(nthreads = -1, max_mem_size = 8)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.4" 2019-07-16; OpenJDK Runtime Environment (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3); OpenJDK 64-Bit Server VM (build 11.0.4+11-post-Ubuntu-1ubuntu218.04.3, mixed mode, sharing)
  Starting server from /home/felipe/miniconda3/envs/lab/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpsgzxetuy
  JVM stdout: /tmp/tmpsgzxetuy/h2o_felipe_started_from_python.out
  JVM stderr: /tmp/tmpsgzxetuy/h2o_felipe_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.11
H2O cluster version age:,3 days
H2O cluster name:,H2O_from_python_felipe_mh48qe
H2O cluster total nodes:,1
H2O cluster free memory:,8 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


### LOAD DATA

In [3]:
titanic= h2o.import_file('titanic.csv')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
titanic.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803.0,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450.0,8.05,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877.0,8.4583,,Q
7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463.0,51.8625,E46,S
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909.0,21.075,,S
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742.0,11.1333,,S
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736.0,30.0708,,C




In [5]:
y_columns = "Survived"
x_columns = ["Pclass","Sex","Fare", "Embarked"]

In [6]:
titanic[x_columns]

Pclass,Sex,Fare,Embarked
3,male,7.25,S
1,female,71.2833,C
3,female,7.925,S
1,female,53.1,S
3,male,8.05,S
3,male,8.4583,Q
1,male,51.8625,S
3,male,21.075,S
3,female,11.1333,S
2,female,30.0708,C




### Categorical to numeric

In [7]:
titanic[y_columns] = titanic[y_columns].asfactor()
titanic[y_columns].levels()

[['0', '1']]

### Train and test Split

In [8]:
train, test=titanic.split_frame(ratios = [.8])
X_train=train[x_columns]
y_train=train[y_columns]
X_test=test[x_columns]
y_test=test[y_columns]

### PCA
#### As its own object

In [9]:
pca_model = pca(k = 3, transform = "STANDARDIZE", pca_method="Power",
                   use_all_factor_levels=True, impute_missing=True)
pca_model.train(training_frame=X_train)

pca Model Build progress: |███████████████████████████████████████████████| 100%


#### As attribute of the data 

In [10]:
X_train.pca=pca(k = 3, transform = "STANDARDIZE", pca_method="Power",
                   use_all_factor_levels=True, impute_missing=True)
X_train.pca.train(training_frame=X_train)

pca Model Build progress: |███████████████████████████████████████████████| 100%


In [11]:
X_train.pca.varimp(use_pandas=True)

Unnamed: 0,Unnamed: 1,pc1,pc2,pc3
0,Standard deviation,1.272174,1.094934,0.67613
1,Proportion of Variance,0.404322,0.29951,0.114208
2,Cumulative Proportion,0.404322,0.703832,0.818039


In [12]:
X_train.pca.rotation()


Rotation: 


Unnamed: 0,Unnamed: 1,pc1,pc2,pc3
0,Embarked.C,-0.089517,0.123905,0.052748
1,Embarked.Q,0.043921,0.017507,0.071316
2,Embarked.S,0.192444,0.702656,0.053075
3,Sex.female,-0.05889,0.233115,0.29747
4,Sex.male,0.205737,0.610953,-0.12033
5,Pclass,0.667733,-0.198685,0.695667
6,Fare,-0.679235,0.153405,0.634346




### Use Predict to apply to new data

In [13]:
pca_model.predict(X_test)

pca prediction progress: |████████████████████████████████████████████████| 100%


PC1,PC2,PC3
1.20859,1.31323,0.444239
-0.898956,0.723421,-0.72371
-1.02537,0.69487,-0.60565
-0.837496,0.456213,-0.489982
-0.422782,1.32548,0.570352
-0.880378,0.727617,-0.74106
-0.874348,0.0102864,-0.626226
0.692443,1.63426,1.09694
3.94363,2.36854,-1.93939
-1.2894,1.07284,-0.188405




In [14]:
X_train.pca.predict(X_test)

pca prediction progress: |████████████████████████████████████████████████| 100%


PC1,PC2,PC3
-1.20859,1.31323,-0.444239
0.898957,0.723421,0.72371
1.02537,0.69487,0.605649
0.837496,0.456213,0.489982
0.422782,1.32548,-0.570352
0.880378,0.727617,0.74106
0.874348,0.0102864,0.626226
-0.692443,1.63426,-1.09694
-3.94363,2.36854,1.93939
1.2894,1.07284,0.188405




### Random Forest Estimator

In [15]:
from h2o.estimators.random_forest import H2ORandomForestEstimator as rf

In [16]:
rf_fit1 = rf(model_id='rf_fit1', seed=1)
rf_fit1.train(x=x_columns, y=y_columns, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [17]:
rf_fit2 = rf(model_id='rf_fit2', ntrees=500, seed=1)
rf_fit2.train(x=x_columns, y=y_columns, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
rf_perf1 = rf_fit1.model_performance(test)
rf_perf2 = rf_fit2.model_performance(test)

In [19]:
print(rf_perf1.auc())
print(rf_perf2.auc())

0.8690939771547248
0.8727933541017653


In [20]:
rf_fit1.model_performance

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_fit1


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,68629.0,12.0,20.0,15.74,65.0,122.0,104.56




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.14875914993519257
RMSE: 0.3856930773752526
LogLoss: 0.6696648778300591
Mean Per-Class Error: 0.20687112451818335
AUC: 0.8287330316742082
pr_auc: 0.7019398718787383
Gini: 0.6574660633484164

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5164389038085937: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,390.0,52.0,0.1176,(52.0/442.0)
1,1,80.0,190.0,0.2963,(80.0/270.0)
2,Total,470.0,242.0,0.1854,(132.0/712.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.516439,0.742188,147.0
1,max f2,0.060907,0.766401,341.0
2,max f0point5,0.772882,0.794243,83.0
3,max accuracy,0.558824,0.817416,136.0
4,max precision,0.968994,0.968421,29.0
5,max recall,0.0,1.0,399.0
6,max specificity,1.0,0.995475,0.0
7,max absolute_mcc,0.541975,0.605564,140.0
8,max min_per_class_accuracy,0.308197,0.780543,197.0
9,max mean_per_class_accuracy,0.541975,0.793129,140.0



Gains/Lift Table: Avg response rate: 37.92 %, avg score: 38.05 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.044944,1.0,2.472222,2.472222,0.9375,1.0,0.9375,1.0,0.111111,0.111111,147.222222,147.222222
1,,2,0.050562,0.996064,2.637037,2.490535,1.0,0.996856,0.944444,0.999651,0.014815,0.125926,163.703704,149.053498
2,,3,0.101124,0.985876,2.563786,2.52716,0.972222,0.991728,0.958333,0.995689,0.12963,0.255556,156.378601,152.716049
3,,4,0.150281,0.950003,2.486349,2.513811,0.942857,0.971989,0.953271,0.987937,0.122222,0.377778,148.634921,151.381101
4,,5,0.200843,0.839276,2.12428,2.415747,0.805556,0.896721,0.916084,0.964973,0.107407,0.485185,112.427984,141.574722
5,,6,0.300562,0.609469,1.597079,2.144133,0.605634,0.733285,0.813084,0.888105,0.159259,0.644444,59.707877,114.413292
6,,7,0.400281,0.362945,0.965676,1.850552,0.366197,0.483617,0.701754,0.787338,0.096296,0.740741,-3.432447,85.055231
7,,8,0.5,0.189762,0.705686,1.622222,0.267606,0.272426,0.615169,0.684644,0.07037,0.811111,-29.431403,62.222222
8,,9,0.599719,0.134711,0.222848,1.389539,0.084507,0.153675,0.526932,0.596357,0.022222,0.833333,-77.71518,38.953942
9,,10,0.699438,0.102495,0.111424,1.207318,0.042254,0.119088,0.457831,0.528312,0.011111,0.844444,-88.85759,20.731816




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2019-12-09 10:50:21,0.020 sec,0.0,,,,,,
1,,2019-12-09 10:50:21,0.230 sec,1.0,0.438454,5.106405,0.767948,0.288372,2.126643,0.204918
2,,2019-12-09 10:50:21,0.325 sec,2.0,0.425639,4.164358,0.787466,0.407391,2.142593,0.212411
3,,2019-12-09 10:50:21,0.371 sec,3.0,0.426485,3.847543,0.791867,0.407914,2.098866,0.22824
4,,2019-12-09 10:50:21,0.410 sec,4.0,0.402409,3.218888,0.817833,0.428276,2.17737,0.204738
5,,2019-12-09 10:50:21,0.440 sec,5.0,0.406285,3.276443,0.814424,0.41545,2.10963,0.206897
6,,2019-12-09 10:50:21,0.479 sec,6.0,0.402841,2.968158,0.81734,0.431553,2.205109,0.209337
7,,2019-12-09 10:50:21,0.508 sec,7.0,0.399438,2.743483,0.821588,0.433517,2.279473,0.205539
8,,2019-12-09 10:50:21,0.539 sec,8.0,0.398831,2.592611,0.820573,0.453709,2.322528,0.207246
9,,2019-12-09 10:50:21,0.561 sec,9.0,0.390597,2.278357,0.827472,0.466643,2.408832,0.204871



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Sex,1966.18396,1.0,0.432198
1,Fare,1826.943604,0.929182,0.401591
2,Pclass,579.268188,0.294615,0.127332
3,Embarked,176.869492,0.089956,0.038879


<bound method ModelBase.model_performance of >

In [21]:
rf_fit2.model_performance

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_fit2


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,500.0,500.0,668044.0,10.0,20.0,15.492,40.0,131.0,101.58




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.14402656811864584
RMSE: 0.3795083241756969
LogLoss: 0.4823035871696077
Mean Per-Class Error: 0.20193564605329306
AUC: 0.8350259762024468
pr_auc: 0.800699593853175
Gini: 0.6700519524048936

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5245452530082615: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,396.0,46.0,0.1041,(46.0/442.0)
1,1,81.0,189.0,0.3,(81.0/270.0)
2,Total,477.0,235.0,0.1784,(127.0/712.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.524545,0.748515,150.0
1,max f2,0.055998,0.777845,347.0
2,max f0point5,0.753061,0.805955,98.0
3,max accuracy,0.582118,0.824438,141.0
4,max precision,0.990232,0.98,13.0
5,max recall,0.006697,1.0,398.0
6,max specificity,0.999383,0.997738,0.0
7,max absolute_mcc,0.573501,0.620654,143.0
8,max min_per_class_accuracy,0.317804,0.776018,210.0
9,max mean_per_class_accuracy,0.573501,0.798064,143.0



Gains/Lift Table: Avg response rate: 37.92 %, avg score: 37.87 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011236,0.998759,2.307407,2.307407,0.875,0.999148,0.875,0.999148,0.025926,0.025926,130.740741,130.740741
1,,2,0.021067,0.998017,2.637037,2.461235,1.0,0.998365,0.933333,0.998783,0.025926,0.051852,163.703704,146.123457
2,,3,0.030899,0.996109,2.637037,2.517172,1.0,0.997282,0.954545,0.998305,0.025926,0.077778,163.703704,151.717172
3,,4,0.04073,0.994669,2.637037,2.546105,1.0,0.995337,0.965517,0.997589,0.025926,0.103704,163.703704,154.610473
4,,5,0.050562,0.993277,2.637037,2.563786,1.0,0.993857,0.972222,0.996863,0.025926,0.12963,163.703704,156.378601
5,,6,0.101124,0.975201,2.417284,2.490535,0.916667,0.987356,0.944444,0.99211,0.122222,0.251852,141.728395,149.053498
6,,7,0.150281,0.934771,2.637037,2.538456,1.0,0.957085,0.962617,0.980653,0.12963,0.381481,163.703704,153.845621
7,,8,0.200843,0.8236,2.051029,2.415747,0.777778,0.891495,0.916084,0.958208,0.103704,0.485185,105.102881,141.574722
8,,9,0.300562,0.614832,1.671362,2.168778,0.633803,0.71939,0.82243,0.878974,0.166667,0.651852,67.13615,116.877812
9,,10,0.400281,0.350928,0.779969,1.822794,0.295775,0.469508,0.691228,0.776967,0.077778,0.72963,-22.00313,82.279402




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2019-12-09 10:50:23,0.002 sec,0.0,,,,,,
1,,2019-12-09 10:50:23,0.022 sec,1.0,0.438454,5.106405,0.767948,0.288372,2.126643,0.204918
2,,2019-12-09 10:50:23,0.037 sec,2.0,0.425639,4.164358,0.787466,0.407391,2.142593,0.212411
3,,2019-12-09 10:50:23,0.055 sec,3.0,0.426485,3.847543,0.791867,0.407914,2.098866,0.22824
4,,2019-12-09 10:50:23,0.074 sec,4.0,0.402409,3.218888,0.817833,0.428276,2.17737,0.204738
5,,2019-12-09 10:50:23,0.099 sec,5.0,0.406285,3.276443,0.814424,0.41545,2.10963,0.206897
6,,2019-12-09 10:50:23,0.120 sec,6.0,0.402841,2.968158,0.81734,0.431553,2.205109,0.209337
7,,2019-12-09 10:50:23,0.141 sec,7.0,0.399438,2.743483,0.821588,0.433517,2.279473,0.205539
8,,2019-12-09 10:50:23,0.159 sec,8.0,0.398831,2.592611,0.820573,0.453709,2.322528,0.207246
9,,2019-12-09 10:50:23,0.176 sec,9.0,0.390597,2.278357,0.827472,0.466643,2.408832,0.204871



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Fare,18537.431641,1.0,0.417782
1,Sex,18253.474609,0.984682,0.411382
2,Pclass,5790.680664,0.312378,0.130506
3,Embarked,1789.524048,0.096536,0.040331


<bound method ModelBase.model_performance of >

In [22]:
rf_fit3 = rf(model_id='rf_fit3', seed=1, nfolds=5, ntrees=2000)
rf_fit3.train(x=x_columns, y=y_columns, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [23]:
rf_perf3 = rf_fit3.model_performance(test)
print(rf_perf3.auc())

0.8733125649013499


In [24]:
rf_fit3.model_performance

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf_fit3


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,2000.0,2000.0,2636581.0,10.0,20.0,15.48,36.0,131.0,100.1205




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.14269002192077124
RMSE: 0.3777433280956412
LogLoss: 0.47186758080864716
Mean Per-Class Error: 0.19967320261437904
AUC: 0.8384657281716105
pr_auc: 0.8104900487569271
Gini: 0.6769314563432209

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5844795274626371: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,403.0,39.0,0.0882,(39.0/442.0)
1,1,84.0,186.0,0.3111,(84.0/270.0)
2,Total,487.0,225.0,0.1728,(123.0/712.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.58448,0.751515,137.0
1,max f2,0.066445,0.782875,346.0
2,max f0point5,0.751791,0.807128,94.0
3,max accuracy,0.588046,0.827247,136.0
4,max precision,0.998296,1.0,0.0
5,max recall,0.01008,1.0,399.0
6,max specificity,0.998296,1.0,0.0
7,max absolute_mcc,0.588046,0.626855,136.0
8,max min_per_class_accuracy,0.30729,0.780543,212.0
9,max mean_per_class_accuracy,0.58448,0.800327,137.0



Gains/Lift Table: Avg response rate: 37.92 %, avg score: 37.83 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011236,0.997391,2.637037,2.637037,1.0,0.997997,1.0,0.997997,0.02963,0.02963,163.703704,163.703704
1,,2,0.021067,0.99665,2.637037,2.637037,1.0,0.996976,1.0,0.99752,0.025926,0.055556,163.703704,163.703704
2,,3,0.030899,0.99592,2.260317,2.517172,0.857143,0.996313,0.954545,0.997136,0.022222,0.077778,126.031746,151.717172
3,,4,0.04073,0.993989,2.637037,2.546105,1.0,0.99533,0.965517,0.9967,0.025926,0.103704,163.703704,154.610473
4,,5,0.050562,0.992561,2.637037,2.563786,1.0,0.993129,0.972222,0.996006,0.025926,0.12963,163.703704,156.378601
5,,6,0.101124,0.977038,2.490535,2.52716,0.944444,0.987048,0.958333,0.991527,0.125926,0.255556,149.053498,152.716049
6,,7,0.150281,0.928535,2.561693,2.538456,0.971429,0.953351,0.962617,0.97904,0.125926,0.381481,156.169312,153.845621
7,,8,0.200843,0.821771,2.051029,2.415747,0.777778,0.889828,0.916084,0.956581,0.103704,0.485185,105.102881,141.574722
8,,9,0.300562,0.612078,1.708503,2.181101,0.647887,0.714541,0.827103,0.876278,0.17037,0.655556,70.850287,118.110073
9,,10,0.400281,0.348192,0.854251,1.850552,0.323944,0.471202,0.701754,0.775364,0.085185,0.740741,-14.574857,85.055231




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.1489366616231624
RMSE: 0.3859231291632602
LogLoss: 0.49586048148461986
Mean Per-Class Error: 0.2075917546505781
AUC: 0.8280961957432545
pr_auc: 0.8084614347986927
Gini: 0.6561923914865091

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5647590416651219: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,396.0,46.0,0.1041,(46.0/442.0)
1,1,84.0,186.0,0.3111,(84.0/270.0)
2,Total,480.0,232.0,0.1826,(130.0/712.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.564759,0.741036,158.0
1,max f2,0.060535,0.7737,335.0
2,max f0point5,0.717141,0.782178,123.0
3,max accuracy,0.587729,0.817416,155.0
4,max precision,0.998126,1.0,0.0
5,max recall,0.009051,1.0,399.0
6,max specificity,0.998126,1.0,0.0
7,max absolute_mcc,0.564759,0.605408,158.0
8,max min_per_class_accuracy,0.306606,0.77037,211.0
9,max mean_per_class_accuracy,0.564759,0.792408,158.0



Gains/Lift Table: Avg response rate: 37.92 %, avg score: 38.03 %


Unnamed: 0,Unnamed: 1,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
0,,1,0.011236,0.997366,2.637037,2.637037,1.0,0.997749,1.0,0.997749,0.02963,0.02963,163.703704,163.703704
1,,2,0.021067,0.996331,2.637037,2.637037,1.0,0.996753,1.0,0.997284,0.025926,0.055556,163.703704,163.703704
2,,3,0.030899,0.99485,2.637037,2.637037,1.0,0.995488,1.0,0.996713,0.025926,0.081481,163.703704,163.703704
3,,4,0.04073,0.993592,2.637037,2.637037,1.0,0.994554,1.0,0.996192,0.025926,0.107407,163.703704,163.703704
4,,5,0.050562,0.992369,2.637037,2.637037,1.0,0.992747,1.0,0.995522,0.025926,0.133333,163.703704,163.703704
5,,6,0.102528,0.977668,2.494494,2.564789,0.945946,0.986001,0.972603,0.990696,0.12963,0.262963,149.449449,156.478945
6,,7,0.150281,0.919203,2.481917,2.538456,0.941176,0.950053,0.962617,0.977781,0.118519,0.381481,148.191721,153.845621
7,,8,0.200843,0.8321,1.904527,2.378866,0.722222,0.881929,0.902098,0.953651,0.096296,0.477778,90.452675,137.886558
8,,9,0.300562,0.631663,1.671362,2.144133,0.633803,0.725776,0.813084,0.878048,0.166667,0.644444,67.13615,114.413292
9,,10,0.400281,0.363247,1.039958,1.869058,0.394366,0.481818,0.708772,0.779338,0.103704,0.748148,3.995827,86.905783




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.8179962,0.019854797,0.78431374,0.82758623,0.8333333,0.82857144,0.8161765
1,auc,0.82689124,0.008010215,0.8338475,0.83285105,0.8146591,0.8231666,0.829932
2,err,0.18200377,0.019854797,0.21568628,0.1724138,0.16666667,0.17142858,0.18382353
3,err_count,26.0,4.0,33.0,25.0,23.0,24.0,25.0
4,f0point5,0.7663875,0.048627805,0.70552146,0.81666666,0.81578946,0.7509881,0.7429719
5,f1,0.7539269,0.026620116,0.736,0.796748,0.7294118,0.76,0.74747473
6,f2,0.7455693,0.048974484,0.7692308,0.7777778,0.65957445,0.7692308,0.7520325
7,lift_top_group,2.659242,0.23357211,2.637931,2.265625,2.76,2.857143,2.7755103
8,logloss,0.4960345,0.014544271,0.48951805,0.50307304,0.50121325,0.47432846,0.5120397
9,max_per_class_error,0.26096308,0.06718993,0.22105263,0.234375,0.38,0.2244898,0.24489796



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2019-12-09 10:50:59,29.325 sec,0.0,,,,,,
1,,2019-12-09 10:50:59,29.332 sec,1.0,0.438454,5.106405,0.767948,0.288372,2.126643,0.204918
2,,2019-12-09 10:50:59,29.337 sec,2.0,0.425639,4.164358,0.787466,0.407391,2.142593,0.212411
3,,2019-12-09 10:50:59,29.341 sec,3.0,0.426485,3.847543,0.791867,0.407914,2.098866,0.22824
4,,2019-12-09 10:50:59,29.345 sec,4.0,0.402409,3.218888,0.817833,0.428276,2.17737,0.204738
5,,2019-12-09 10:50:59,29.349 sec,5.0,0.406285,3.276443,0.814424,0.41545,2.10963,0.206897
6,,2019-12-09 10:50:59,29.356 sec,6.0,0.402841,2.968158,0.81734,0.431553,2.205109,0.209337
7,,2019-12-09 10:50:59,29.360 sec,7.0,0.399438,2.743483,0.821588,0.433517,2.279473,0.205539
8,,2019-12-09 10:50:59,29.363 sec,8.0,0.398831,2.592611,0.820573,0.453709,2.322528,0.207246
9,,2019-12-09 10:50:59,29.370 sec,9.0,0.390597,2.278357,0.827472,0.466643,2.408832,0.204871



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Sex,74590.585938,1.0,0.42075
1,Fare,73081.054688,0.979762,0.412235
2,Pclass,22443.039062,0.300883,0.126596
3,Embarked,7165.459473,0.096064,0.040419


<bound method ModelBase.model_performance of >

### Predict

In [25]:
y_pred1 = rf_fit1.predict(X_test)

drf prediction progress: |████████████████████████████████████████████████| 100%


In [26]:
y_pred1

predict,p0,p1
1,0.0089375,0.991062
1,0.0334055,0.966595
1,0.458584,0.541416
0,0.976032,0.0239683
0,0.861735,0.138265
0,0.873405,0.126595
0,0.621027,0.378973
0,0.551189,0.448811
0,0.821189,0.178811
0,0.991834,0.00816603




# autoML

In [34]:
from h2o.automl import H2OAutoML
aml_ti = H2OAutoML(max_runtime_secs= 180,max_models= 15, seed= 1, nfolds=0)
aml_ti.train(x = x_columns, y = y_columns, training_frame = train, validation_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [36]:
lb_ti = aml_ti.leaderboard
lb_ti

model_id,auc,logloss,mean_per_class_error,rmse,mse
XGBoost_grid_1_AutoML_20191209_105108_model_2,0.980392,0.250417,0.0735294,0.272504,0.0742583
XRT_1_AutoML_20191209_105108,0.966503,0.285959,0.0735294,0.298599,0.0891614
DRF_1_AutoML_20191209_105108,0.961601,0.280782,0.0874183,0.294499,0.0867298
StackedEnsemble_AllModels_AutoML_20191209_105108,0.937092,0.386763,0.131536,0.344611,0.118757
StackedEnsemble_BestOfFamily_AutoML_20191209_105108,0.934641,0.397556,0.131536,0.350178,0.122624
XGBoost_grid_1_AutoML_20191209_105108_model_1,0.919935,0.40139,0.158497,0.358677,0.12865
GBM_1_AutoML_20191209_105108,0.911356,0.383874,0.131536,0.343529,0.118012
GBM_4_AutoML_20191209_105108,0.895425,0.412947,0.146242,0.361616,0.130766
GBM_3_AutoML_20191209_105108,0.89134,0.420524,0.160131,0.364992,0.13322
GBM_2_AutoML_20191209_105108,0.888072,0.427274,0.172386,0.368701,0.13594




In [37]:
pred_automl = aml_ti.leader.predict(test)

xgboost prediction progress: |████████████████████████████████████████████| 100%


In [47]:
pred_automl

predict,p0,p1
1,0.0349731,0.965027
1,0.0612363,0.938764
1,0.508873,0.491127
0,0.960478,0.0395219
0,0.856251,0.143749
0,0.927165,0.0728346
1,0.0669857,0.933014
1,0.513078,0.486922
0,0.903001,0.0969988
0,0.963528,0.0364717




Links:


https://medium.com/analytics-vidhya/gentle-introduction-to-automl-from-h2o-ai-a42b393b4ba2


https://www.kaggle.com/cooleel/a-quick-try-of-h2o-automl-on-titanic-dataset