In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install mljar-supervised

In [1]:
# import packages
import os
import numpy as np
import pandas as pd

# mljar
from supervised.automl import AutoML

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from joblib import load, dump

C:\Users\Chen\anaconda3\envs\ML\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\Chen\anaconda3\envs\ML\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
  stacklevel=1)


In [2]:
# Working directory holding train.csv, test.csv and sample_submission.csv.
# It can be overridden without editing the notebook by setting the
# TPS_MAY_DATA_DIR environment variable -- e.g. on Google Colab, mount Drive
# first (google.colab.drive.mount('/content/drive')) and point the variable at
# "/content/drive/My Drive/colab/TPS May". The original local folder stays the
# default, so existing behaviour is unchanged when the variable is not set.
path = os.environ.get(
    'TPS_MAY_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - May 2021',
)
# Every later cell reads and writes files relative to this directory.
os.chdir(path)

In [3]:
# Read the training and test tables from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Encode the class labels in `target` as integer codes and keep them in a
# single-column int64 DataFrame.
f = LabelEncoder().fit(df_train['target'])
y = pd.DataFrame(f.transform(df_train['target'])).astype('int64')

# The features are every column except the row id and the target, as int64.
col = [name for name in df_train.columns if name not in ('id', 'target')]
x = df_train[col].astype('int64')

In [5]:
# Resample the training data with a seeded RandomUnderSampler so the run is
# repeatable (NearMiss() was the alternative sampler tried here), then show
# how many rows each class ends up with.
un_samp = RandomUnderSampler(random_state=20210524)
x_new, y_new = un_samp.fit_resample(X=x, y=y)
y_new.value_counts()

0    8490
1    8490
2    8490
3    8490
dtype: int64

In [10]:
# Preview the first rows of the resampled features, then of the labels.
print(x_new.head(), y_new.head(), sep='\n')

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0          0          0          0          0          2          1   
1          0          0          0          0          0          0   
2          0          0          0          1          0          0   
3          0          0          0          0          0          1   
4          0          0          0          0          0          1   

   feature_6  feature_7  feature_8  feature_9  ...  feature_40  feature_41  \
0          0          0          0          0  ...           0           0   
1          0          0          0          2  ...           0           0   
2          0          0          0          2  ...           0           0   
3          0          0          0          0  ...           0           0   
4          0          0          0          0  ...           0           0   

   feature_42  feature_43  feature_44  feature_45  feature_46  feature_47  \
0           0           0  

In [11]:
# Configure the mljar AutoML search.
RESULTS_PATH = 'mljar-20210524'  # passed to AutoML as its results_path

# Cross-validation settings handed to AutoML as validation_strategy:
# shuffled, stratified 5-fold with a fixed seed.
cv = dict(
    validation_type="kfold",
    k_folds=5,
    shuffle=True,
    stratify=True,
    random_seed=42,
)

# All search settings collected in one place, then unpacked into AutoML.
automl_params = dict(
    results_path=RESULTS_PATH,
    mode="Optuna",                         # or 'Explain', 'Perform', 'Compete'
    ml_task='multiclass_classification',   # or 'auto', 'binary_classification', 'regression'
    algorithms=[
        'Baseline',
        'Linear',
        'Decision Tree',
        'Random Forest',
        'Extra Trees',
        'LightGBM',
        'Xgboost',
        'CatBoost',
        'Neural Network',
        'Nearest Neighbors',
    ],
    train_ensemble=True,
    stack_models=True,
    eval_metric='logloss',
    validation_strategy=cv,
    golden_features=True,
    boost_on_errors=True,
    optuna_time_budget=10 * 60,        # 600
    total_time_limit=1 * 60 * 60,      # 3600
    optuna_verbose=False,
    n_jobs=-1,
    random_state=42,
)
automl = AutoML(**automl_params)

In [12]:
# Run the AutoML search on the undersampled features and labels.
automl.fit(X=x_new, y=y_new)

Linear algorithm was disabled.
AutoML directory: mljar-20210524
Expected computing time:
Total training time: Optuna + ML training = 9000 seconds
Total Optuna time: len(algorithms) * optuna_time_budget = 5400 seconds
Total ML model training time: 3600 seconds
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'golden_features', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 1.386294 trained in 6.86 seconds
2_DecisionTree logloss 1.388248 trained in 6.1 seconds
* Step default_algorithms will try to check up to 7 models
3_Optuna_LightGBM logloss 1.361028 trained in 152.22 seconds
4_Optuna_Xgboost loglos

An input array is constant; the correlation coefficent is not defined.


AutoML fit time: 2487.04 seconds
AutoML best model: Ensemble_Stacked


AutoML(algorithms=['Baseline', 'Linear', 'Decision Tree', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost',
                   'Neural Network', 'Nearest Neighbors'],
       boost_on_errors=True, eval_metric='logloss', golden_features=True,
       ml_task='multiclass_classification', mode='Optuna',
       optuna_time_budget=600, optuna_verbose=False, random_state=42,
       results_path='mljar-20210524', stack_models=True,
       validation_strategy={'k_folds': 5, 'random_seed': 42, 'shuffle': True,
                            'stratify': True, 'validation_type': 'kfold'})

In [13]:
# load well-trained model
# Rebinds `automl` to a fresh AutoML object that points at the same
# RESULTS_PATH used for training, with every other setting left at its default.
# NOTE(review): presumably mljar restores the saved models from that directory
# when predicting -- confirm against the mljar documentation.
automl = AutoML(results_path=RESULTS_PATH)

In [14]:
# Score the test set: drop the row id, cast the remaining columns to int64
# (the same dtype the training features were cast to) and collect the
# predicted class probabilities.
x_test = df_test.drop(columns=['id']).astype('int64')
result = automl.predict_proba(x_test)

In [15]:
# Build the submission file: start from the sample submission, overwrite every
# column after the first with the predicted probabilities, and write the
# result to a CSV named after RESULTS_PATH (without the index).
sub = pd.read_csv('sample_submission.csv')
proba_cols = sub.columns[1:]
sub[proba_cols] = result
sub.to_csv(f'{RESULTS_PATH}.csv', index=False)