### Classification on the Penguin Dataset

#### Libraries

In [7]:
import pandas as pd 

import h2o

#### Read data

In [8]:
# Load the raw penguin records from a comma-separated text file (path is
# relative to the notebook's working directory) into a pandas DataFrame.
penguins = pd.read_csv("data/penguins.txt")

In [9]:
# Preview the first three rows to check that the file was parsed as expected.
penguins.head(3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,


#### Preprocess data

We will only use these four columns for the analysis:
- Culmen Length (mm)
- Culmen Depth (mm)
- Flipper Length (mm)
- Body Mass (g)

In [10]:
# Numeric measurements used as predictors.
attributes = ["Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)", "Body Mass (g)"]
# Response column (kept as a list so it can be concatenated with `attributes`).
target = ["Species"]
# Keep only the modelling columns and drop rows with any missing value.
# Chaining `.dropna()` returns a new frame; calling `dropna(inplace=True)` on a
# column-subset of another DataFrame can raise SettingWithCopyWarning.
penguins = penguins[attributes + target].dropna()

#### Initialize H2O Cluster

In [11]:
# Connect to a local H2O instance if one is already running; otherwise h2o
# starts a new local cluster.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,39 mins 20 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.4
H2O_cluster_version_age:,5 days
H2O_cluster_name:,H2O_from_python_massonga_h1rso7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.423 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


#### Train / Validation / Test Split

In [12]:
# Upload the cleaned pandas DataFrame to the H2O cluster as an H2OFrame.
penguins = h2o.H2OFrame(penguins)
# Classification requires a categorical (enum) response. Convert it explicitly
# instead of relying on the parser's type inference for the string column.
penguins["Species"] = penguins["Species"].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [13]:
# Random three-way split: roughly 70% / 15% / remainder. split_frame returns
# the pieces in that order, so `test` is the second piece and `valid` is the
# remainder. The fixed seed keeps the split repeatable.
train, test, valid = penguins.split_frame(seed=1234, ratios=[0.7, 0.15])
# Predictor columns and response column handed to AutoML.
x, y = attributes, "Species"

#### Classification with AutoML

In [14]:
# Configure the AutoML run: build at most 10 models, capped at 180 seconds of
# total runtime.
# NOTE(review): the H2O docs state that a time-based limit can make runs
# non-reproducible even with a fixed seed — confirm whether max_runtime_secs
# is still wanted alongside max_models.
aml = h2o.automl.H2OAutoML(
    max_models=10,
    max_runtime_secs=180,
    seed=1,
)

In [15]:
# Launch the AutoML search on the training split.
# Cross-validation stays enabled, so (as the run log reports) models are
# validated by cross-validation and the validation frame only provides
# informative metrics.
aml.train(
    x=x,
    y=y,
    training_frame=train,
    validation_frame=valid,
)

AutoML progress: |█
14:32:14.365: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
14:32:14.378: AutoML: XGBoost is not available; skipping it.
14:32:17.41: GBM_1_AutoML_2_20230111_143214 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_1_AutoML_2_20230111_143214.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 199.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 199.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 199.0.
ERRR

Unnamed: 0,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
,multinomial,multinomial,Ridge ( lambda = 1.102E-4 ),"nlambda = 30, lambda.max = 42.501, lambda.min = 1.102E-4, lambda.1se = 0.001193",15,12,96,AutoML_2_20230111_143214_training_py_2_sid_af0b

Adelie Penguin (Pygoscelis adeliae),Chinstrap penguin (Pygoscelis antarctica),Gentoo penguin (Pygoscelis papua),Error,Rate
110.0,0.0,0.0,0.0,0 / 110
0.0,48.0,0.0,0.0,0 / 48
0.0,0.0,91.0,0.0,0 / 91
110.0,48.0,91.0,0.0,0 / 249

k,hit_ratio
1,1.0
2,1.0
3,1.0

Adelie Penguin (Pygoscelis adeliae),Chinstrap penguin (Pygoscelis antarctica),Gentoo penguin (Pygoscelis papua),Error,Rate
25.0,0.0,0.0,0.0,0 / 25
1.0,10.0,0.0,0.0909091,1 / 11
0.0,0.0,18.0,0.0,0 / 18
26.0,10.0,18.0,0.0185185,1 / 54

k,hit_ratio
1,0.9814815
2,1.0
3,1.0

Adelie Penguin (Pygoscelis adeliae),Chinstrap penguin (Pygoscelis antarctica),Gentoo penguin (Pygoscelis papua),Error,Rate
109.0,1.0,0.0,0.0090909,1 / 110
2.0,46.0,0.0,0.0416667,2 / 48
0.0,0.0,91.0,0.0,0 / 91
111.0,47.0,91.0,0.0120482,3 / 249

k,hit_ratio
1,0.9879518
2,1.0
3,1.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.9878367,0.0182088,1.0,0.98,1.0,1.0,0.9591837
auc,,0.0,,,,,
err,0.0121633,0.0182088,0.0,0.02,0.0,0.0,0.0408163
err_count,0.6,0.8944272,0.0,1.0,0.0,0.0,2.0
logloss,0.0276818,0.0265488,0.0010661,0.0576689,0.0089112,0.0161349,0.0546276
max_per_class_error,0.0535354,0.096331,0.0,0.0454545,0.0,0.0,0.2222222
mean_per_class_accuracy,0.9821549,0.0321103,1.0,0.9848485,1.0,1.0,0.9259259
mean_per_class_error,0.0178451,0.0321103,0.0,0.0151515,0.0,0.0,0.0740741
mse,0.0094484,0.01018,1.41e-05,0.0196853,0.0014901,0.0049086,0.021144
null_deviance,104.21201,1.4364079,105.1194,105.1194,105.1194,103.87024,101.831635

Unnamed: 0,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test,deviance_xval,deviance_se,alpha,iterations,training_rmse,training_logloss,training_r2,training_classification_error,training_auc,training_pr_auc,validation_rmse,validation_logloss,validation_r2,validation_classification_error,validation_auc,validation_pr_auc
,2023-01-11 14:32:16,0.000 sec,2,",43E2",15,2.0436574,2.0488079,2.0534236,0.0059331,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.018 sec,4,",26E2",15,2.0156276,2.0203101,2.0306273,0.0058921,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.034 sec,6,",16E2",15,1.9728936,1.9768419,1.9955038,0.0058550,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.045 sec,8,",1E2",15,1.9096608,1.9124910,1.9427160,0.0058534,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.056 sec,10,",63E1",15,1.8200940,1.8212575,1.8662293,0.0059639,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.069 sec,12,",39E1",15,1.7006423,1.6994235,1.7611523,0.0063110,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.088 sec,15,",24E1",15,1.5511154,1.5466338,1.6253804,0.0070382,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.108 sec,18,",15E1",15,1.3793847,1.3709053,1.4635208,0.0082434,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.152 sec,21,",94E0",15,1.1940973,1.1812929,1.2836530,0.0099593,0.0,,,,,,,,,,,,,
,2023-01-11 14:32:16,0.175 sec,24,",58E0",15,1.0070798,0.9908453,1.0967570,0.0122516,0.0,,,,,,,,,,,,,

variable,relative_importance,scaled_importance,percentage
Culmen Length (mm),13.0746212,1.0,0.4003172
Body Mass (g),8.3268557,0.6368717,0.2549507
Culmen Depth (mm),7.6950436,0.5885481,0.2356059
Flipper Length (mm),3.5641348,0.2725995,0.1091263


#### Compare all the models

In [16]:
# Pull the AutoML leaderboard into pandas and rank the models by RMSE
# (ascending, so the lowest-error model comes first).
leaderboard_df = aml.leaderboard.as_data_frame()
leaderboard_df.sort_values(by='rmse')

Unnamed: 0,model_id,mean_per_class_error,logloss,rmse,mse
0,GLM_1_AutoML_2_20230111_143214,0.016919,0.028084,0.096912,0.009392
1,StackedEnsemble_BestOfFamily_1_AutoML_2_202301...,0.016919,0.046786,0.106619,0.011368
6,DeepLearning_1_AutoML_2_20230111_143214,0.026894,0.052384,0.111788,0.012497
2,StackedEnsemble_AllModels_1_AutoML_2_20230111_...,0.023864,0.054036,0.11532,0.013299
10,DeepLearning_grid_1_AutoML_2_20230111_143214_m...,0.034722,0.070598,0.130618,0.017061
4,GBM_2_AutoML_2_20230111_143214,0.026894,0.060541,0.131091,0.017185
3,XRT_1_AutoML_2_20230111_143214,0.026894,0.072608,0.132453,0.017544
5,DRF_1_AutoML_2_20230111_143214,0.026894,0.076056,0.136338,0.018588
7,GBM_5_AutoML_2_20230111_143214,0.029924,0.066401,0.137028,0.018777
9,GBM_3_AutoML_2_20230111_143214,0.029924,0.065016,0.137063,0.018786


#### Evaluate the best model on the test set

In [19]:
# Take the best model by RMSE directly from the AutoML run. A hard-coded model
# id embeds the run number and timestamp, so it changes on every execution and
# breaks a fresh "Restart & Run All".
model = aml.get_best_model(criterion="rmse")
# Score the selected model on the test split, which was not passed to
# aml.train().
model.model_performance(test)

Adelie Penguin (Pygoscelis adeliae),Chinstrap penguin (Pygoscelis antarctica),Gentoo penguin (Pygoscelis papua),Error,Rate
16.0,0.0,0.0,0.0,0 / 16
0.0,9.0,0.0,0.0,0 / 9
0.0,0.0,14.0,0.0,0 / 14
16.0,9.0,14.0,0.0,0 / 39

k,hit_ratio
1,1.0
2,1.0
3,1.0
