In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install h2o

In [None]:
# import packages
import os
import numpy as np
import pandas as pd

# H2O
import h2o
from h2o.automl import H2OAutoML

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder
from joblib import load, dump

In [None]:
# Mount Google Drive (Colab) and make the project folder the working directory,
# so the relative file names used by later cells resolve inside it.
from google.colab import drive

# Local alternative:
# path = r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021'
path = "/content/drive/My Drive/colab/TPS Jun"

drive.mount('/content/drive')
os.chdir(path)

In [None]:
# Read the train and test CSV files from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Initialize and connect to H2O
h2o.init()

# Convert the pandas frames to H2OFrames; copies are passed so the conversion
# never touches df_train / df_test themselves.
hof_train = h2o.H2OFrame(df_train.copy())
hof_test = h2o.H2OFrame(df_test.copy())

# Every column except the row id and the label is used as a feature.
feature_col = [i for i in df_train.columns if i not in ['id', 'target']]
target_col = 'target'

# For binary or multiclass problems: cast the target to a factor (categorical)
# column. Uses `target_col` -- the previous `target_column` name was undefined.
hof_train[target_col] = hof_train[target_col].asfactor()

# Warning: XGBoost is not supported on Windows; this reports whether it is available.
h2o.estimators.xgboost.H2OXGBoostEstimator.available()

In [None]:
# Configure the AutoML run.
PROJECT_NAME = 'h2o-20210611'
SEED = 20210611

# Alternative to the exclusion list below:
# include_algos=['DRF', 'GLM', 'XGBoost', 'GBM', "DeepLearning", 'StackedEnsemble']
automl = H2OAutoML(
    project_name=PROJECT_NAME,
    seed=SEED,
    max_runtime_secs=6 * 60 * 60,  # 6 hours
    nfolds=5,
    exclude_algos=["DeepLearning"],
)

In [None]:
# Run AutoML on the training frame using the selected feature and target columns.
automl.train(x=feature_col, y=target_col, training_frame=hof_train)

In [None]:
# Show the AutoML leaderboard as a pandas DataFrame.
lb = automl.leaderboard
leaderboard_df = lb.as_data_frame()
print(leaderboard_df)

In [None]:
# Persist the leaderboard's top model under the PROJECT_NAME directory,
# overwriting an existing file of the same name (force=True).
h2o.save_model(automl.leader, path=PROJECT_NAME, force=True)

In [None]:
# load model
# h2o.load_model needs the path of the saved model file, not its parent
# directory. h2o.save_model (called without `filename`) names the file after
# the model id, so rebuild that path from the leader's model_id.
best_model = h2o.load_model(os.path.join(PROJECT_NAME, automl.leader.model_id))

In [None]:
# Score the test frame with the loaded model and pull the predictions back into pandas.
h2o_predictions = best_model.predict(hof_test)
result = h2o_predictions.as_data_frame()

In [None]:
# submission
sub = pd.read_csv('sample_submission.csv')

# For classification, H2O's prediction frame holds the predicted label in
# 'predict' followed by one probability column per class; keep only the
# probabilities.
class_probs = result.drop(columns=['predict'])

# Every column after the first in the sample submission receives one probability column.
# NOTE(review): assumes H2O's class columns come in the same order as the
# submission's class columns -- confirm before submitting.
prob_cols = sub.columns[1:]
assert class_probs.shape == (len(sub), len(prob_cols)), \
    "prediction shape does not match the submission template"

sub[prob_cols] = class_probs.to_numpy()
sub.to_csv('h2o0522.csv', index=False)