In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install mljar-supervised

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# mljar
from supervised.automl import AutoML

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from joblib import load, dump

In [3]:
# # Optional: mount Google Drive when running on Colab
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"

# Working directory: later cells read train.csv, test.csv and
# sample_submission.csv relative to it and write the results there.
# Set the TPS_MAY_DATA_DIR environment variable to run on another machine;
# when it is not set, the original local path is used unchanged.
path = os.environ.get(
    'TPS_MAY_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - May 2021',
)
os.chdir(path)

In [4]:
# Read the train and test tables from the current working directory.
df_train, df_test = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))

In [5]:
# Remove the 'id' column from both splits (in place).
for frame in (df_train, df_test):
    frame.drop(columns='id', inplace=True)

# Stack train on top of test with a fresh 0..N-1 index, so the first
# len(df_train) rows are the training rows.
all_data = pd.concat([df_train, df_test], ignore_index=True)

# Number of distinct values per column, with and without the test rows.
nunique_all = all_data.nunique()
nunique_train = df_train.nunique()

# Columns where the counts differ: the test data contains categories that
# never occur in the training data.
print(all_data.columns[nunique_all != nunique_train])

Index(['feature_1', 'feature_3', 'feature_4', 'feature_8', 'feature_14',
       'feature_19', 'feature_21', 'feature_24', 'feature_25', 'feature_30',
       'feature_31', 'feature_34', 'feature_37', 'feature_40', 'feature_41',
       'feature_45', 'feature_47', 'feature_48', 'feature_49'],
      dtype='object')


In [6]:
# Columns whose number of distinct values changes once the test rows are
# included, i.e. the test split holds values never seen in training.
differ = all_data.columns[all_data.nunique() != df_train.nunique()]

j = len(df_train)   # index label of the first test row in all_data
J = len(all_data)   # total number of rows (train + test)

# Treat categories that only appear in the test rows as missing values and
# fill them with the column mode (computed over train + test combined).
for col in differ:
    mo = all_data[col].mode().values[0]
    list_k = df_train[col].unique()

    # Vectorised replacement: keep test values that were seen in training,
    # substitute the mode everywhere else. This avoids one scalar .loc lookup
    # per test row in a Python loop.
    # NOTE(review): isin() treats NaN as matching NaN, so a NaN test value is
    # kept if training also contains NaN — confirm the columns have no NaN.
    test_values = all_data.loc[j:, col]
    all_data.loc[j:, col] = test_values.where(test_values.isin(list_k), mo)


In [7]:
# Encoding and transform
# Split the columns by cardinality: at most 15 distinct values -> one-hot
# group (excluding the target), more than 15 -> scaled group.
target = 'target'
n_levels = all_data.nunique()
is_low_cardinality = n_levels <= 15
is_not_target = all_data.columns != 'target'
onehot_col = all_data.columns[is_low_cardinality & is_not_target]
label_col = all_data.columns[n_levels > 15]

def label_encoder(c):
    """Return `c` transformed by a freshly fitted LabelEncoder."""
    return LabelEncoder().fit_transform(c)

# High-cardinality columns: standardise with StandardScaler.
# (alternative kept from the original: all_data[label_col].apply(label_encoder))
scaler = StandardScaler()
scaled_values = scaler.fit_transform(all_data[label_col])
df_label = pd.DataFrame(scaled_values, columns=label_col)

# Low-cardinality columns: pass through pd.get_dummies.
# NOTE(review): get_dummies only expands object/category columns by default;
# if these columns are numeric they are returned unchanged. Passing
# columns=onehot_col would force the encoding — confirm the intent.
df_onehot = pd.get_dummies(all_data[onehot_col])

# Rebuild all_data from the two transformed groups, side by side.
all_data = pd.concat([df_onehot, df_label], axis=1)

# Create a new feature from KMeans cluster membership (4 clusters).
# `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
# removed in 1.0, where it raises a TypeError.
# `n_init=10` pins the value that used to be the default, so newer
# scikit-learn releases (whose default is 'auto') produce the same clustering.
clus = KMeans(n_clusters=4, random_state=42, max_iter=1000, n_init=10)
clus.fit(all_data)
all_data['kmeans'] = clus.labels_

'n_jobs' was deprecated in version 0.23 and will be removed in 1.0 (renaming of 0.25).


In [8]:
# target labelling: encode the target classes as integer codes
f = LabelEncoder()
f.fit(df_train['target'])
y = f.transform(df_train['target'])
y = pd.DataFrame(y).astype('int64')

# all_data holds the training rows first, followed by the test rows, so the
# two splits are recovered by position.
n_train = len(df_train)
x = all_data.iloc[:n_train]
x = x.astype('float64')

# Use .iloc for the test rows: `all_data[n_train:, :]` is not valid
# DataFrame [] indexing and raises instead of slicing.
df_test = all_data.iloc[n_train:]

In [9]:
# construct model
# AutoML : mljar
RESULTS_PATH = 'mljar-20210602'
SEED = 20210602

# Cross-validation settings: 5 shuffled, stratified folds with a fixed seed.
cv = dict(
    validation_type="kfold",
    k_folds=5,
    shuffle=True,
    stratify=True,
    random_seed=SEED,
)

# Model families requested from mljar. The recorded run reports that
# 'Linear' was disabled.
algorithms = [
    'Baseline',
    'Linear',
    'Decision Tree',
    'Random Forest',
    'Extra Trees',
    'LightGBM',
    'Xgboost',
    'CatBoost',
    'Neural Network',
    'Nearest Neighbors',
]

automl_settings = dict(
    results_path=RESULTS_PATH,
    mode="Optuna",                        # or 'Explain', 'Perform', 'Compete'
    ml_task='multiclass_classification',  # or 'auto', 'binary_classification', 'regression'
    algorithms=algorithms,
    train_ensemble=True,
    stack_models=True,
    eval_metric='logloss',
    validation_strategy=cv,
    golden_features=True,
    boost_on_errors=True,
    optuna_time_budget=2*60*60,           # seconds (7200)
    total_time_limit=8*60*60,             # seconds (28800)
    optuna_verbose=False,
    n_jobs=-1,
    random_state=SEED,
)

automl = AutoML(**automl_settings)

In [10]:
# training model
# Fit AutoML on the training features `x` and the integer-encoded targets `y`.
# The recorded run of this cell reports an AutoML fit time of 15412.22 seconds.
automl.fit(x, y)

Linear algorithm was disabled.
AutoML directory: mljar-20210602
Expected computing time:
Total training time: Optuna + ML training = 93600 seconds
Total Optuna time: len(algorithms) * optuna_time_budget = 64800 seconds
Total ML model training time: 28800 seconds
The task is multiclass_classification with evaluation metric logloss
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'golden_features', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 2 models
1_Baseline logloss 1.118577 trained in 6.58 seconds
2_DecisionTree logloss 1.114949 trained in 8.16 seconds
* Step default_algorithms will try to check up to 7 models
3_Optuna_LightGBM logloss 1.090909 trained in 105.84 seconds
4_Optuna_Xgboost lo

An input array is constant; the correlation coefficent is not defined.


AutoML fit time: 15412.22 seconds
AutoML best model: Ensemble_Stacked


AutoML(algorithms=['Baseline', 'Linear', 'Decision Tree', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost',
                   'Neural Network', 'Nearest Neighbors'],
       boost_on_errors=True, eval_metric='logloss', golden_features=True,
       ml_task='multiclass_classification', mode='Optuna',
       optuna_time_budget=7200, optuna_verbose=False, random_state=20210602,
       results_path='mljar-20210602', stack_models=True, total_time_limit=28800,
       validation_strategy={'k_folds': 5, 'random_seed': 20210602,
                            'shuffle': True, 'stratify': True,
                            'validation_type': 'kfold'})

In [11]:
# load well-trained model
# Re-bind `automl` to an AutoML object that points at RESULTS_PATH, the
# directory used for the training run.
# NOTE(review): presumably mljar reloads the saved models from that directory
# on first use, so prediction can run without refitting — confirm.
automl = AutoML(results_path=RESULTS_PATH)

In [15]:
# Predict class probabilities for the test rows, cast to float64 first.
x_test = df_test.astype('float64')
result = automl.predict_proba(x_test)

In [16]:
# Build the submission: start from the sample file, overwrite every column
# after the first with the predicted probabilities, and save it under a name
# derived from RESULTS_PATH.
sub = pd.read_csv('sample_submission.csv')
class_cols = sub.columns[1:]
sub[class_cols] = result
sub.to_csv(f'{RESULTS_PATH}.csv', index=False)