# 1. Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA

from pyod.models.iforest import IForest
from pyod.models.pca import PCA

from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, precision_score, recall_score

import tensorflow as tf

import seaborn as sns
sns.set_style('white')

import warnings
warnings.filterwarnings(action='ignore')

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.config.list_physical_devices('GPU'))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2424446911713524714
xla_global_id: -1
]
[]


In [3]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = '0' # 여러개 사용시 '0,1,2' 식으로 하나의 문자열에 입력
gpus = tf.config.experimental.list_physical_devices('GPU') # 호스트 러나임에 표시되는 GPU 장치 목록 반환

if gpus: # 반환된 GPU 장치 목록이 있다면
    try: # 해당 장치에 대한 메모리 증가 활성화 여부 설정
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e: # try문 실패시에 에러문구 출력
        print(e)

In [4]:
path = "./Dataset/"

dataT = pd.read_csv(path+"train_T.csv")  
dataO = pd.read_csv(path+"train_O.csv")  

# original data -> data
data = pd.concat([dataT, dataO]).reset_index(drop=True)
data

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,T100304,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,0.000008,0.000003,0.191408,0.000008,0.001210,0.000021,0.000003,0.000002,0.189,0.000006
1,1,0.531105,T100306,T_31,2.0,96.0,0.0,45.0,10.0,0.0,...,0.000008,0.000003,0.188993,0.000032,0.000644,0.000041,0.000002,0.000003,0.185,0.000029
2,1,0.532292,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
3,1,0.528141,T100306,T_31,2.0,87.0,0.0,45.0,10.0,0.0,...,0.000007,0.000003,0.189424,0.000034,0.000678,0.000043,0.000004,0.000003,0.188,0.000031
4,1,0.532289,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,1,0.529740,T100304,O_31,154.0,97.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
834,1,0.532343,T100306,O_31,146.0,94.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,
835,1,0.530533,T100304,O_31,4.0,98.0,0.0,45.0,11.0,0.0,...,,,,,,,,,,
836,2,0.535205,T100306,O_31,6.0,89.0,0.0,45.0,10.0,0.0,...,,,,,,,,,,


# 2. Preprocessing

1. 같은 값으로만 채워진 column 삭제 (nunique(dropna=False) <= 1) 
2. 모든 행이 같은 값을 가지는 두 개 이상의 중복된 column 삭제 
3. 결측치 처리 (KNN Imputetor, fillna(0))
4. 정규화 (Robust, MinMax, Standard)
5. 카테고리 변수 수치화 (LINE, PRODUCT_CODE) LabelEncoder

In [5]:
# 1. Handling missing values  
# Preprocessed data -> processed_data
processed_data = data.copy()
processed_data = processed_data.fillna(0)
processed_data

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,T100304,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,0.000008,0.000003,0.191408,0.000008,0.001210,0.000021,0.000003,0.000002,0.189,0.000006
1,1,0.531105,T100306,T_31,2.0,96.0,0.0,45.0,10.0,0.0,...,0.000008,0.000003,0.188993,0.000032,0.000644,0.000041,0.000002,0.000003,0.185,0.000029
2,1,0.532292,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
3,1,0.528141,T100306,T_31,2.0,87.0,0.0,45.0,10.0,0.0,...,0.000007,0.000003,0.189424,0.000034,0.000678,0.000043,0.000004,0.000003,0.188,0.000031
4,1,0.532289,T100306,T_31,2.0,95.0,0.0,45.0,10.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,1,0.529740,T100304,O_31,154.0,97.0,0.0,45.0,11.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
834,1,0.532343,T100306,O_31,146.0,94.0,0.0,45.0,10.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
835,1,0.530533,T100304,O_31,4.0,98.0,0.0,45.0,11.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
836,2,0.535205,T100306,O_31,6.0,89.0,0.0,45.0,10.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000


In [6]:
# 2. 같은 값으로만 채워진 column 삭제 (nunique(dropna=False) <= 1)
# Get the counts of unique values per column
value_counts = processed_data.iloc[:,4:].apply(lambda x: x.nunique(dropna=False))

# Filter columns with only one unique value
unique_columns = value_counts[value_counts >= 2].index
duplicated_columns = value_counts[value_counts <= 1].index

processed_data = processed_data.drop(duplicated_columns, axis=1)
processed_data

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_4,X_5,X_7,X_8,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,T100304,T_31,2.0,95.0,45.0,10.0,45.0,10.0,...,0.000008,0.000003,0.191408,0.000008,0.001210,0.000021,0.000003,0.000002,0.189,0.000006
1,1,0.531105,T100306,T_31,2.0,96.0,45.0,10.0,53.0,10.0,...,0.000008,0.000003,0.188993,0.000032,0.000644,0.000041,0.000002,0.000003,0.185,0.000029
2,1,0.532292,T100306,T_31,2.0,95.0,45.0,10.0,60.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
3,1,0.528141,T100306,T_31,2.0,87.0,45.0,10.0,53.0,10.0,...,0.000007,0.000003,0.189424,0.000034,0.000678,0.000043,0.000004,0.000003,0.188,0.000031
4,1,0.532289,T100306,T_31,2.0,95.0,45.0,10.0,51.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,1,0.529740,T100304,O_31,154.0,97.0,45.0,11.0,45.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
834,1,0.532343,T100306,O_31,146.0,94.0,45.0,10.0,67.0,11.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
835,1,0.530533,T100304,O_31,4.0,98.0,45.0,11.0,45.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
836,2,0.535205,T100306,O_31,6.0,89.0,45.0,10.0,51.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000


In [7]:
# 3. 모든 행이 같은 값을 가지는 두 개 이상의 중복된 column 삭제
df = processed_data.iloc[:,4:].T.drop_duplicates(keep='first').T
processed_data = pd.concat([processed_data.iloc[:,:4], df], axis=1)
processed_data

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_4,X_5,X_7,X_8,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,T100304,T_31,2.0,95.0,45.0,10.0,45.0,10.0,...,0.000008,0.000003,0.191408,0.000008,0.001210,0.000021,0.000003,0.000002,0.189,0.000006
1,1,0.531105,T100306,T_31,2.0,96.0,45.0,10.0,53.0,10.0,...,0.000008,0.000003,0.188993,0.000032,0.000644,0.000041,0.000002,0.000003,0.185,0.000029
2,1,0.532292,T100306,T_31,2.0,95.0,45.0,10.0,60.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
3,1,0.528141,T100306,T_31,2.0,87.0,45.0,10.0,53.0,10.0,...,0.000007,0.000003,0.189424,0.000034,0.000678,0.000043,0.000004,0.000003,0.188,0.000031
4,1,0.532289,T100306,T_31,2.0,95.0,45.0,10.0,51.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,1,0.529740,T100304,O_31,154.0,97.0,45.0,11.0,45.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
834,1,0.532343,T100306,O_31,146.0,94.0,45.0,10.0,67.0,11.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
835,1,0.530533,T100304,O_31,4.0,98.0,45.0,11.0,45.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
836,2,0.535205,T100306,O_31,6.0,89.0,45.0,10.0,51.0,10.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000


In [8]:
# 4. Regularization 
rs = RobustScaler()
ss = StandardScaler()

processed_data.iloc[:, 4:] = rs.fit_transform(processed_data.iloc[:, 4:])
processed_data

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_4,X_5,X_7,X_8,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,T100304,T_31,0.0,0.0,0.0,0.0,-0.833333,0.0,...,0.000008,0.000003,0.191408,0.000008,0.001210,0.000021,0.000003,0.000002,0.189,0.000006
1,1,0.531105,T100306,T_31,0.0,0.2,0.0,0.0,0.500000,0.0,...,0.000008,0.000003,0.188993,0.000032,0.000644,0.000041,0.000002,0.000003,0.185,0.000029
2,1,0.532292,T100306,T_31,0.0,0.0,0.0,0.0,1.666667,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
3,1,0.528141,T100306,T_31,0.0,-1.6,0.0,0.0,0.500000,0.0,...,0.000007,0.000003,0.189424,0.000034,0.000678,0.000043,0.000004,0.000003,0.188,0.000031
4,1,0.532289,T100306,T_31,0.0,0.0,0.0,0.0,0.166667,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,1,0.529740,T100304,O_31,152.0,0.4,0.0,1.0,-0.833333,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
834,1,0.532343,T100306,O_31,144.0,-0.2,0.0,0.0,2.833333,1.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
835,1,0.530533,T100304,O_31,2.0,0.6,0.0,1.0,-0.833333,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000
836,2,0.535205,T100306,O_31,4.0,-1.2,0.0,0.0,0.166667,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000


In [9]:
# 5. Label Encoding 
cat_cols = ['PRODUCT_CODE', 'LINE']

for col in cat_cols:
    le = LabelEncoder()
    processed_data[col] = le.fit_transform(processed_data[col])
    print(le.classes_)

processed_data.head()

['O_31' 'T_31']
['T100304' 'T100306']


Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_4,X_5,X_7,X_8,...,X_3317,X_3318,X_3319,X_3320,X_3321,X_3322,X_3323,X_3324,X_3325,X_3326
0,1,0.531957,0,1,0.0,0.0,0.0,0.0,-0.833333,0.0,...,8e-06,3e-06,0.191408,8e-06,0.00121,2.1e-05,3e-06,2e-06,0.189,6e-06
1,1,0.531105,1,1,0.0,0.2,0.0,0.0,0.5,0.0,...,8e-06,3e-06,0.188993,3.2e-05,0.000644,4.1e-05,2e-06,3e-06,0.185,2.9e-05
2,1,0.532292,1,1,0.0,0.0,0.0,0.0,1.666667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.528141,1,1,0.0,-1.6,0.0,0.0,0.5,0.0,...,7e-06,3e-06,0.189424,3.4e-05,0.000678,4.3e-05,4e-06,3e-06,0.188,3.1e-05
4,1,0.532289,1,1,0.0,0.0,0.0,0.0,0.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# # 5. 차원 축소
# from sklearn.decomposition import PCA 

# nonX_data = processed_data.iloc[:, :4]
# X_data = processed_data.iloc[:, 4:]

# pca = PCA(n_components=3)
# pca_data = pca.fit_transform(X_data) 
# pca_df = pd.DataFrame(pca_data, columns=['X1', 'X2', 'X3'])

# processed_data = pd.concat([nonX_data, pca_df], axis=1)
# processed_data

# 3. Data Split

In [11]:
# Divide processed_data to input_data_X and input_data_y 
input_data = processed_data.copy()
input_data_y = processed_data['Y_Quality']
input_data_X = processed_data.drop(['Y_Quality'], axis=1)

# train/ test data 로 split 
tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    input_data_X, 
    input_data_y, 
    test_size = 0.2, 
    random_state = SEED,
    shuffle=True
    #stratify = input_data_y # importnat
)

# valid/train 로 split
train_X, valid_X, train_y, valid_y = train_test_split(
    tr_val_X, 
    tr_val_y, 
    test_size = 0.2, 
    random_state = SEED,
    shuffle=True
    #stratify = tr_val_y
)

print('Input Data X:', input_data_X.shape, 'Input Data y:', input_data_y.shape)
print('X_train:', train_X.shape, 'y_train:', train_y.shape)
print('X_valid:', valid_X.shape, 'y_train:', valid_y.shape)
print('X_test:', test_X.shape, 'y_train:', test_y.shape)

Input Data X: (838, 681) Input Data y: (838,)
X_train: (536, 681) y_train: (536,)
X_valid: (134, 681) y_train: (134,)
X_test: (168, 681) y_train: (168,)


# 4. Modeling 

1. Isolation Forest 
2. LOF (Local Outlier Factor)
3. PCA 
4. AutoEncoder 

In [12]:
from pycaret.regression import *
from time import time

In [13]:
# Setup Environment  
cat_cols = ['LINE', 'PRODUCT_CODE']

reg = setup(data = input_data, 
            train_size = 0.8,
            #test_data = test,
            target = 'Y_Quality',
            categorical_features = cat_cols,
            #numeric_features = num_cols,
            #ordinal_features = ord_dict,
            #high_cardinality_features = ['religion'],
            #high_cardinality_method = 'frequency',
            #handle_unknown_categorical = True,
            #normalize = True,
            #normalize_method = 'zscore', 
            #feature_selection = True, 
            #imputation_type='iterative',
            fold=5, # defualt 10 
            fold_shuffle=True,
            use_gpu = True,
            session_id = SEED)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] U

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Y_Quality
2,Target type,Regression
3,Original data shape,"(838, 682)"
4,Transformed data shape,"(838, 682)"
5,Transformed train set shape,"(670, 682)"
6,Transformed test set shape,"(168, 682)"
7,Ordinal features,2
8,Numeric features,679
9,Categorical features,2


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000


In [14]:
# lasso, catboost, xgb, BayesianRidge, LassoLars
top5 = compare_models(sort='rmse', n_select=5, fold=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0022,0.0,0.0028,0.7508,0.0018,0.0042,16.814
et,Extra Trees Regressor,0.0022,0.0,0.0029,0.7392,0.0019,0.0042,1.054
lightgbm,Light Gradient Boosting Machine,0.0023,0.0,0.003,0.7133,0.002,0.0043,1.15
br,Bayesian Ridge,0.0024,0.0,0.003,0.7085,0.002,0.0045,0.508
ada,AdaBoost Regressor,0.0023,0.0,0.003,0.725,0.0019,0.0044,1.24
gbr,Gradient Boosting Regressor,0.0023,0.0,0.003,0.724,0.0019,0.0042,3.014
ridge,Ridge Regression,0.0024,0.0,0.0031,0.7032,0.002,0.0046,0.356
omp,Orthogonal Matching Pursuit,0.0024,0.0,0.0031,0.6991,0.002,0.0046,0.37
rf,Random Forest Regressor,0.0023,0.0,0.0031,0.7024,0.002,0.0044,1.98
xgboost,Extreme Gradient Boosting,0.0023,0.0,0.0031,0.6967,0.002,0.0044,0.922


In [15]:
total_models = []
tuned_models = []

for model in top5:
    cell_start_time = time() 
    model = create_model(model, fold=5)
    tuned_model = tune_model(model, fold=5, optimize='rmse', choose_better=True)
    total_models.append(model)
    tuned_models.append(tuned_model)
    cell_end_time = time()
    print("CELL RUN TIME: ", cell_end_time - cell_start_time)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0024,0.0,0.0032,0.735,0.0021,0.0045
1,0.002,0.0,0.0024,0.7762,0.0016,0.0038
2,0.0022,0.0,0.0029,0.7578,0.0019,0.0043
3,0.0023,0.0,0.0028,0.7852,0.0018,0.0042
4,0.0022,0.0,0.0028,0.6996,0.0018,0.0041
Mean,0.0022,0.0,0.0028,0.7508,0.0018,0.0042
Std,0.0001,0.0,0.0003,0.0308,0.0002,0.0002


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0025,0.0,0.0034,0.7106,0.0022,0.0046
1,0.002,0.0,0.0024,0.7753,0.0016,0.0038
2,0.0023,0.0,0.003,0.7383,0.002,0.0044
3,0.0021,0.0,0.0027,0.8065,0.0017,0.004
4,0.0021,0.0,0.0028,0.6862,0.0018,0.004
Mean,0.0022,0.0,0.0029,0.7434,0.0019,0.0042
Std,0.0002,0.0,0.0003,0.0433,0.0002,0.0003


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
CELL RUN TIME:  296.07768750190735


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0024,0.0,0.0033,0.7279,0.0022,0.0045
1,0.002,0.0,0.0025,0.7602,0.0016,0.0039
2,0.0022,0.0,0.0028,0.7699,0.0018,0.0041
3,0.0023,0.0,0.0029,0.7763,0.0019,0.0044
4,0.0023,0.0,0.0029,0.6618,0.0019,0.0042
Mean,0.0022,0.0,0.0029,0.7392,0.0019,0.0042
Std,0.0001,0.0,0.0002,0.0421,0.0002,0.0002


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0038,0.0,0.0057,0.1766,0.0037,0.0071
1,0.003,0.0,0.0045,0.2246,0.003,0.0057
2,0.0035,0.0,0.0052,0.2041,0.0034,0.0066
3,0.0035,0.0,0.0052,0.2662,0.0034,0.0066
4,0.0033,0.0,0.0048,0.0877,0.0031,0.0062
Mean,0.0034,0.0,0.0051,0.1918,0.0033,0.0064
Std,0.0002,0.0,0.0004,0.0597,0.0003,0.0005


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
CELL RUN TIME:  26.26038360595703


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0025,0.0,0.0035,0.6912,0.0023,0.0046
1,0.0021,0.0,0.0026,0.7536,0.0017,0.0039
2,0.0025,0.0,0.0033,0.6868,0.0021,0.0047
3,0.0023,0.0,0.0029,0.7642,0.0019,0.0043
4,0.0022,0.0,0.0029,0.6712,0.0019,0.0042
Mean,0.0023,0.0,0.003,0.7134,0.002,0.0043
Std,0.0002,0.0,0.0003,0.0379,0.0002,0.0003


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0041,0.0,0.0063,-0.002,0.0041,0.0077
1,0.0034,0.0,0.0052,-0.0016,0.0034,0.0064
2,0.004,0.0,0.0059,-0.0083,0.0038,0.0076
3,0.0042,0.0,0.0062,-0.0337,0.004,0.0078
4,0.0035,0.0,0.005,-0.0004,0.0033,0.0066
Mean,0.0038,0.0,0.0057,-0.0092,0.0037,0.0072
Std,0.0003,0.0,0.0005,0.0126,0.0003,0.0006


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19261
[LightGBM] [Info] Number of data points in the train set: 536, number of used features: 651
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 563 dense feature groups (0.29 MB) transferred to GPU in 0.007420 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 0.530623
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 19247
[LightGBM] [Info] Number of data points in the train set: 536, number of used features: 651
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0023,0.0,0.0031,0.7601,0.002,0.0044
1,0.0025,0.0,0.0032,0.6239,0.0021,0.0047
2,0.0026,0.0,0.0032,0.6943,0.0021,0.0049
3,0.0022,0.0,0.0029,0.7707,0.0019,0.0041
4,0.0023,0.0,0.0028,0.6934,0.0018,0.0043
Mean,0.0024,0.0,0.003,0.7085,0.002,0.0045
Std,0.0001,0.0,0.0002,0.0532,0.0001,0.0003


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0024,0.0,0.0031,0.7518,0.002,0.0046
1,0.0025,0.0,0.0032,0.6188,0.0021,0.0047
2,0.0027,0.0,0.0033,0.6874,0.0021,0.0051
3,0.0021,0.0,0.0027,0.7986,0.0018,0.004
4,0.0024,0.0,0.0029,0.6745,0.0019,0.0044
Mean,0.0024,0.0,0.003,0.7062,0.002,0.0046
Std,0.0002,0.0,0.0002,0.0626,0.0001,0.0004


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
CELL RUN TIME:  16.560221672058105


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0025,0.0,0.0033,0.7169,0.0022,0.0048
1,0.0022,0.0,0.0026,0.7356,0.0017,0.0041
2,0.0024,0.0,0.0031,0.7285,0.002,0.0045
3,0.0025,0.0,0.0031,0.7344,0.002,0.0047
4,0.0022,0.0,0.0027,0.7099,0.0018,0.0041
Mean,0.0023,0.0,0.003,0.725,0.0019,0.0044
Std,0.0002,0.0,0.0003,0.0101,0.0002,0.0003


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0025,0.0,0.0033,0.7327,0.0021,0.0047
1,0.0022,0.0,0.0027,0.7327,0.0017,0.0041
2,0.0022,0.0,0.0029,0.7521,0.0019,0.0042
3,0.0024,0.0,0.0029,0.7661,0.0019,0.0045
4,0.0021,0.0,0.0027,0.7195,0.0017,0.0039
Mean,0.0023,0.0,0.0029,0.7406,0.0019,0.0043
Std,0.0002,0.0,0.0002,0.0165,0.0001,0.0003


Fitting 5 folds for each of 10 candidates, totalling 50 fits
CELL RUN TIME:  80.69625854492188


In [16]:
blender_top5 = blend_models(estimator_list=total_models)
blender_tuned5 = blend_models(estimator_list=tuned_models)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0023,0.0,0.0031,0.7569,0.002,0.0043
1,0.002,0.0,0.0024,0.7748,0.0016,0.0037
2,0.0023,0.0,0.0029,0.7614,0.0019,0.0043
3,0.0022,0.0,0.0027,0.7989,0.0018,0.0041
4,0.0021,0.0,0.0026,0.7308,0.0017,0.0039
Mean,0.0022,0.0,0.0027,0.7646,0.0018,0.0041
Std,0.0001,0.0,0.0002,0.0223,0.0001,0.0002


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0023,0.0,0.0031,0.7591,0.002,0.0043
1,0.002,0.0,0.0024,0.7747,0.0016,0.0037
2,0.0022,0.0,0.0028,0.7647,0.0019,0.0042
3,0.0022,0.0,0.0027,0.8035,0.0018,0.0041
4,0.0021,0.0,0.0026,0.7311,0.0017,0.0039
Mean,0.0021,0.0,0.0027,0.7666,0.0018,0.004
Std,0.0001,0.0,0.0002,0.0234,0.0001,0.0002


In [17]:
# for naive models 
for model in total_models: 
    prediction = predict_model(model)

# for tuned models 
for model in tuned_models: 
    prediction = predict_model(model)

# for blended model 
prediction = predict_model(blender_top5)

# for blended model (tuned)
prediction = predict_model(blender_tuned5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.0024,0.0,0.0028,0.7044,0.0018,0.0045


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0023,0.0,0.0028,0.7119,0.0018,0.0043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.0024,0.0,0.0029,0.6774,0.0019,0.0046


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,0.0024,0.0,0.0033,0.5872,0.0022,0.0044


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,AdaBoost Regressor,0.0024,0.0,0.0028,0.7054,0.0018,0.0046


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.0024,0.0,0.0028,0.7044,0.0018,0.0045


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0023,0.0,0.0028,0.7119,0.0018,0.0043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.0024,0.0,0.0029,0.6776,0.0019,0.0046


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Bayesian Ridge,0.0024,0.0,0.0033,0.5872,0.0022,0.0044


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,AdaBoost Regressor,0.0023,0.0,0.0026,0.7406,0.0017,0.0043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.0023,0.0,0.0027,0.7355,0.0017,0.0043


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,0.0022,0.0,0.0026,0.7411,0.0017,0.0042


In [21]:
evaluate_model(total_models[1])

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [19]:
total_models[2]

[<catboost.core.CatBoostRegressor at 0x19adf90a500>,
 ExtraTreesRegressor(n_jobs=-1, random_state=42),
 LGBMRegressor(device='gpu', n_jobs=-1, random_state=42),
 BayesianRidge(),
 AdaBoostRegressor(random_state=42)]

In [18]:
interpret_model(total_models, plot='summary')

TypeError: This function only supports tree based models for binary classification: catboost, dt, rf, lightgbm, et, xgboost.