In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install mljar-supervised

In [None]:
# import packages
import os
import numpy as np
import pandas as pd

# mljar
from supervised.automl import AutoML

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder
from joblib import load, dump

In [None]:
# Working directory for the notebook: the CSV files read below and the
# outputs written later are all addressed with relative names.
# Set the TPS_DATA_DIR environment variable to point somewhere else without
# editing this cell; the original local folder remains the default.
#
# On Google Colab, mount Drive first and set `path` to the Drive folder:
#   from google.colab import drive
#   drive.mount('/content/drive')
#   path = "/content/drive/My Drive/colab/TPS Jun"
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021',
)
# Switch into that folder so the relative file names resolve against it.
os.chdir(path)

In [None]:
# Read the training and test tables from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Encode the `target` column as integer class codes; `f` keeps the fitted
# encoder so the codes can be mapped back to the original labels.
f = LabelEncoder()
y = pd.DataFrame(f.fit_transform(df_train['target'])).astype('int64')

# Every column other than the row id and the label is used as a feature.
col = [name for name in df_train.columns if name not in ('id', 'target')]
x = df_train[col].astype('int64')

In [None]:
# construct model
# AutoML : mljar
RESULTS_PATH = 'mljar-20210608'   # passed to AutoML as results_path; also reused for the submission file name
SEED = 20210608                   # shared by the CV split and the AutoML random_state

# Validation scheme: 5-fold, shuffled, stratified k-fold with a fixed seed.
cv = {"validation_type": "kfold",
      "k_folds": 5,
      "shuffle": True,
      "stratify": True,
      "random_seed": SEED}

automl = AutoML(results_path=RESULTS_PATH,
                mode="Optuna",                          # or 'Explain', 'Perform', 'Compete'
                ml_task='multiclass_classification',   # or 'auto', 'binary_classification', 'regression'
                algorithms=['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Nearest Neighbors'],   # 'Neural Network'
                train_ensemble=True,
                stack_models=True,
                eval_metric='logloss',
                validation_strategy=cv,
                golden_features=True,
                boost_on_errors=True,
                # Time limits are given in seconds and must be integers:
                # 1.5*60*60 evaluates to the float 5400.0, so convert it explicitly.
                optuna_time_budget=int(1.5*60*60),
                total_time_limit=12*60*60,
                optuna_verbose=False,
                n_jobs=-1,
                random_state=SEED)

In [None]:
# training model
# Fit the configured AutoML object on the feature frame `x` and the encoded
# target `y`.
# NOTE(review): the configuration sets total_time_limit=12*60*60, so this cell
# may run for many hours — confirm before re-running.
automl.fit(x, y)

In [None]:
# load well-trained model
# Rebinds `automl` to a new AutoML object that specifies only the results
# directory used during training.
# NOTE(review): presumably mljar reloads the saved models from RESULTS_PATH
# when predicting, so prediction can run without re-training — confirm.
automl = AutoML(results_path=RESULTS_PATH)

In [None]:
# Build the test feature matrix: drop the row id and cast to int64, the same
# dtype used for the training features.
x_test = df_test.drop(['id'], axis=1).astype('int64')

# Per-class probabilities for every test row.
result = automl.predict_proba(x_test)

In [None]:
# Fill every column after the first in the sample submission with the
# predicted probabilities, then write it to '<RESULTS_PATH>.csv' in the
# working directory.
sub = pd.read_csv('sample_submission.csv')
proba_columns = sub.columns[1:]
sub[proba_columns] = result
sub.to_csv(f'{RESULTS_PATH}.csv', index=False)