### Classification on the Penguin Dataset

#### Libraries

In [2]:
import pandas as pd 

import h2o

#### Read data

In [3]:
# Load the raw penguin table from disk using read_csv's default settings.
penguins_path = "data/penguins.txt"
penguins = pd.read_csv(penguins_path)

In [4]:
# Peek at the first three rows to sanity-check the columns that were loaded.
penguins.head(n=3)

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,


#### Preprocess data

We will only use these four columns for the analysis:
- Culmen Length (mm)
- Culmen Depth (mm)
- Flipper Length (mm)
- Body Mass (g)

In [6]:
# Predictor columns used for modelling.
attributes = [
    "Culmen Length (mm)",
    "Culmen Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)",
]
# Label column, kept as a one-element list so it can be concatenated with
# `attributes` when selecting columns.
target = ["Species"]

# Keep only the modelling columns and drop rows with any missing value.
# A single chained expression (instead of `dropna(inplace=True)` on the
# freshly selected subset) avoids pandas' SettingWithCopyWarning and never
# mutates a frame in place.
penguins = penguins[attributes + target].dropna()

#### Initialize H2O Cluster

In [None]:
# Initialize H2O with its default settings; the cells below that build an
# H2OFrame and run AutoML presumably rely on this having been executed first.
h2o.init()

#### Train-Test Split

In [None]:
# Convert the cleaned pandas DataFrame into an H2OFrame so it can be used by
# H2O. The name `penguins` is rebound because the following cells use it.
penguins = h2o.H2OFrame(penguins)

# Make the label column explicitly categorical. Without this, whether AutoML
# treats the task as classification depends on the type H2O's parser happens
# to infer for the column.
penguins[target[0]] = penguins[target[0]].asfactor()

In [6]:
# Two ratios are supplied and three frames are unpacked, so the last frame
# receives whatever rows remain after the 70% / 15% portions.
# The seed keeps the split repeatable between runs.
splits = penguins.split_frame(ratios=[0.7, 0.15], seed=1234)
train, test, valid = splits

# Predictor column names and the response column name passed to AutoML.
x, y = attributes, "Species"

#### Classification with AutoML

In [None]:
# Configure the AutoML run: at most 10 models, a 180-second time budget,
# and a fixed seed.
aml = h2o.automl.H2OAutoML(
    max_models=10,
    max_runtime_secs=180,
    seed=1,
)

In [None]:
# Fit the AutoML candidates on the training split.
# NOTE(review): the H2O AutoML docs state that `validation_frame` is only
# used when cross-validation is disabled (nfolds == 0) — confirm whether
# `valid` actually influences this run with the default settings.
aml.train(
    x=x,
    y=y,
    training_frame=train,
    validation_frame=valid,
)

#### Compare all the models

In [None]:
# Pull the AutoML leaderboard into pandas and show it ordered by RMSE
# (ascending, so the lowest-error model is listed first).
leaderboard_df = aml.leaderboard.as_data_frame()
leaderboard_df.sort_values(by='rmse')

#### Evaluate the best model on the test set

In [None]:
# Take the leader model from the AutoML run and report its performance on
# the `test` split.
best_model = aml.leader
best_model.model_performance(test)