In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install evalml

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# evalml
import evalml
from evalml import AutoMLSearch
from evalml.automl import make_data_splitter

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder
from joblib import load, dump

In [3]:
# Working directory: the folder that holds train.csv, test.csv and
# sample_submission.csv, which the following cells read by relative name.
#
# Set the TPS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local folder is used, so existing
# behaviour is unchanged.
#
# # Colab alternative: mount Google Drive and point `path` at the project folder
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021',
)
os.chdir(path)

In [4]:
# Read the training and test tables from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Encode the class labels in `target` as integer codes; the fitted encoder `f`
# is kept so the codes can be mapped back to the original labels.
f = LabelEncoder()
y = pd.DataFrame(f.fit_transform(df_train['target'])).astype('int64')

# Feature matrix: every column except the row id and the target, cast to int64.
col = [name for name in df_train.columns if name not in ('id', 'target')]
x = df_train[col].astype('int64')

In [6]:
# # Problem types supported by evalml
# evalml.problem_types.ProblemTypes.all_problem_types

# # Model families available for a multiclass problem
# evalml.pipelines.components.utils.allowed_model_families("multiclass")

# # Objective functions available
# evalml.objectives.get_all_objective_names()

In [13]:
# Configure the evalml AutoML search (nothing is trained in this cell).
PROJECT_NAME = 'evalml-20210611'   # prefix of every file this notebook writes
# NOTE(review): the seed looks like it may have been meant as 20210611 (to match
# PROJECT_NAME); it is left untouched because changing it changes the results.
SEED = 202106011

# Optional hold-out split, currently disabled:
# X_train, X_test, y_train, y_test = evalml.preprocessing.split_data(x, y.squeeze(), problem_type='multiclass')

# Shuffled 5-fold splitter shared by every candidate pipeline.
# (Per the original note: StratifiedKFold is the default for binary and
# multiclass problems.)
cv = make_data_splitter(
    x,
    y,
    problem_type='multiclass',
    n_splits=5,
    shuffle=True,
    random_seed=SEED,
)

# Model families the search is allowed to draw pipelines from.
model_families = [
    'DECISION_TREE',
    'CATBOOST',
    'RANDOM_FOREST',
    'XGBOOST',
    'LINEAR_MODEL',
    'LIGHTGBM',
    'EXTRA_TREES',
]

automl = AutoMLSearch(
    X_train=x,
    y_train=y.squeeze(),               # 1-column DataFrame -> Series
    # other problem types: 'binary', 'regression', 'time series regression',
    # 'time series binary', 'time series multiclass'
    problem_type="multiclass",
    objective="log loss multiclass",
    allowed_model_families=model_families,
    data_splitter=cv,
    random_seed=SEED,
    n_jobs=-1,
    # Search budget: batch / iteration caps plus a wall-clock limit.
    max_batches=300,
    max_iterations=300,
    patience=None,
    max_time=8*60*60,                  # 8 hours, in seconds
    ensembling=True,
)

Generating pipelines to search over...
8 pipelines ready for search.
Ensembling will run at the 50 iteration and every 40 iterations after that.


In [14]:
# Run the AutoML search configured in the previous cell.
# This is the long-running step of the notebook: it keeps evaluating candidate
# pipelines until one of the limits passed to AutoMLSearch (max_time,
# max_iterations, max_batches) is reached.
automl.search()


*****************************
* Beginning pipeline search *
*****************************

Optimizing for Log Loss Multiclass. 
Lower score is better.

Using SequentialEngine to train and score pipelines.
Searching up to 300 batches for a total of 300 pipelines. 
Will stop searching for new pipelines after 28800 seconds.

Allowed model families: linear_model, random_forest, decision_tree, lightgbm, xgboost, extra_trees, catboost



FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Evaluating Baseline Pipeline: Mode Baseline Multiclass Classification Pipeline
Mode Baseline Multiclass Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 25.584

*****************************
* Evaluating Batch Number 1 *
*****************************

Elastic Net Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.094
Decision Tree Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.075
Random Forest Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.936
LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.939
Logistic Regression Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross 


XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.928
XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.948
XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 3.184
XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.754
XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.246

******************************
* Evaluating Batch Number 12 *
******************************

Random Forest Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.928
Random Forest Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross

	Finished cross validation - mean Log Loss Multiclass: 1.931

******************************
* Evaluating Batch Number 22 *
******************************

LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.937
LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.937
LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.937
LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.937
LightGBM Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.937

******************************
* Evaluating Batch Number 23 *
******************************

Logistic Regression Classifier w/ Imputer + Undersampler + Standard Scaler

	Finished cross validation - mean Log Loss Multiclass: 1.999
Logistic Regression Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.999

******************************
* Evaluating Batch Number 33 *
******************************

Extra Trees Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.987
Extra Trees Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.987
Extra Trees Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.987
Extra Trees Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.987
Extra Trees Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multicla

Decision Tree Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.129
Decision Tree Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 2.129

******************************
* Evaluating Batch Number 44 *
******************************

Elastic Net Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.997
Elastic Net Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.997
Elastic Net Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.997
Elastic Net Classifier w/ Imputer + Undersampler + Standard Scaler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.997
E

CatBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.939
CatBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.939
CatBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.939

******************************
* Evaluating Batch Number 55 *
******************************

Stacked Ensemble Classification Pipeline:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.755

******************************
* Evaluating Batch Number 56 *
******************************

XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.913
XGBoost Classifier w/ Imputer + Undersampler:
	Starting cross validation
	Finished cross validation - mean Log Loss Multiclass: 1.913
XGBoost

In [15]:
# Persist the finished search so it can be reloaded without repeating it.
search_file = f'{PROJECT_NAME}.pkl'
automl.save(search_file)

In [16]:
# Restore the saved search from disk; this replaces the in-memory `automl`.
# NOTE(review): the .pkl file is presumably a pickle - only load files you
# created yourself.
search_file = f'{PROJECT_NAME}.pkl'
automl = AutoMLSearch.load(search_file)

In [19]:
# Inspect the search results and pick the winning pipeline.
# The commented-out lines are optional inspection helpers.

# # Ranking of all evaluated pipelines by performance
# automl.full_rankings

# # Ranking of the top nine pipelines by performance
# # (NOTE(review): original note; confirm what `rankings` actually lists)
# automl.rankings

# # Inspect the pipeline with id n
# automl.get_pipeline(0)

# # Detailed results of the search
# automl.results

# Best-performing pipeline found by the search
best_pipeline = automl.best_pipeline

# # Render the pipeline with graphviz
# best_pipeline.graph()

# # Score on a held-out testing set (requires the disabled split_data cell)
# scores = best_pipeline.score(X_test, y_test, 
#                              objectives=evalml.objectives.get_core_objectives('multiclass'))

# scores['Log Loss Multiclass']

In [None]:
# Train the selected pipeline on the complete training set.
y_target = y.squeeze()   # 1-column DataFrame -> Series
best_pipeline.fit(x, y_target)

In [20]:
# Write the fitted best pipeline to its own file, separate from the search.
best_pipeline_file = f'{PROJECT_NAME}_best.pkl'
best_pipeline.save(best_pipeline_file)

In [21]:
# load best pipeline
# The file was written by `best_pipeline.save(...)`, so restore it with the
# pipeline class's own loader. The previous `automl.load(...)` went through the
# AutoMLSearch loader and required the `automl` object from an earlier cell just
# to read a pipeline file.
# NOTE: loading unpickles the file - only load files you created yourself.
best_pipeline = evalml.pipelines.PipelineBase.load(f'{PROJECT_NAME}_best.pkl')

In [24]:
# Class-probability predictions for the test set.
# The test features get the same treatment as the training features:
# the `id` column is dropped and everything is cast to int64.
x_test = df_test.drop(columns=['id']).astype('int64')
result = best_pipeline.predict_proba(x_test)

In [27]:
# Build the submission file from the sample template.
sub = pd.read_csv('sample_submission.csv')

# Every column after the first one receives the predicted probabilities.
# NOTE(review): this assumes the columns of `result` are in the same order as
# the template's class columns, and that its rows follow test.csv - confirm.
proba_columns = sub.columns[1:]
sub[proba_columns] = result

submission_file = f'{PROJECT_NAME}.csv'
sub.to_csv(submission_file, index=False)