## Intro

### 배경

    - 데이터 타입에 상관없이 특정 데이터 입력 시 자동으로 모델을 선정하고 성능을 평가하여 결과를 제시해 주는 autoML 역량 내재화를 목적으로 합니다.

### 사용 가능 데이터

    - 우선, table data에 대한 분석만 가능하도록 만들었으며, 추후 다양한 데이터에 대해 적용이 가능하도록 업데이트할 예정입니다.
    
    - Pycaret, autokeras, Deep Learning 모델을 활용하며, 해당 모델을 구성하였습니다.
    

### 모델 사용 환경 및 세팅

    DataFrame Input

    target 변수는 마지막 column에 배치

    scikit-learn version 0.23.2

    tensorflow 2.5

    python 3.8 사용

    pycaret, autokeras

In [60]:
# import
from tensorflow import keras
import pycaret
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import tensorflow as tf
import autokeras as ak
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import scikitplot as skplt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import cohen_kappa_score, confusion_matrix
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder

In [2]:
from pycaret.classification import *

In [56]:
class classification_model():
    
    
    def __init__(self, df):
        self.df = df
        
        
    def preprocess(self):
        target_name = self.df.columns[-1]
        
        
        # 학습에 사용될 dataset에 대한 transforamtion 진행 / 이 중 20% 를 test set 으로 사용하며 이는 학습에 사용되지 않는
        # holdout set에 해당
        self.exp_clf = setup(data = self.df, target = target_name, transformation = True, normalize = True,
                       session_id=123, log_experiment=True, experiment_name='classification', fold_shuffle=True,
                       imputation_type='iterative', train_size = 0.8)
        
        # train / test size 를 출력
        self.X_train = self.exp_clf[5][1][1]
        self.y_train = self.exp_clf[5][2][1]
        self.X_test = self.exp_clf[5][3][1]
        self.y_test = self.exp_clf[5][4][1]
        print('X_train_shape : ', self.X_train.shape)
        print('y_train_shape : ', self.y_train.shape)
        print('X_test_shape : ', self.X_test.shape)
        print('y_test_shape : ', self.y_test.shape)
        
        
    def pycaret_compare_models(self):
        '''
        pycaret 오픈 소스를 통한 모델 생성 및 비교
        
        accuracy 기준으로 10개 fold를 이용해 cross validation 성능이 가장 좋은 top 3 모델을 선정
        
        총 5개의 후보에 대해 비교 진행
        1. 가장 성능이 좋은 것으로 나타난 머신러닝 모델
        2. top 3 모델을 blending한 모델 (hard)
        3. top 3 모델을 blending한 모델 (hard X)
        4. top 3 모델을 stacking한 모델
        5. top 3 모델을 xgboost를 이용해 stacking한 모델
        
        총 5가지 모델 중 accuracy가 가장 좋은 모델을 선정 및 결과 도출
        '''
        exp_clf = self.exp_clf
        top3 = compare_models(n_select=3)
        
        # 가장 성능이 좋은 모델 tuning
        m1 = create_model(top3[0])
        tuned_model1 = tune_model(m1)
        
        m2 = create_model(top3[1])
        m3 = create_model(top3[2])
        
        # 가장 성능이 좋은 3개의 모델 blend
        blend_hard = blend_models(estimator_list = [m1, m2, m3], method='hard')
        blender_top3 = blend_models(top3)
        
        # 가장 성능이 좋은 3개의 모델 stack
        stack_soft = stack_models(top3)
        
        xgboost = create_model('xgboost')
        stack_soft2 = stack_models(top3, meta_model=xgboost)
        
        # 모든 모델 중 하나의 모델 선택
        best_model = automl(use_holdout=True)
        pred_holdout = predict_model(best_model)
        
        return pred_holdout
    
    
    def autokeras_model(self):
        self.y_train = self.y_train.astype('str')
        clf = ak.StructuredDataClassifier(overwrite=True, max_trials=5)
        clf.fit(self.X_train, self.y_train, epochs=50)
        predicted_y = clf.predict(self.X_test)
        
        return predicted_y
    
    def forward(self):
        self.preprocess()
        print("Model Training Start!")
        '''
        각 모델들을 학습시키고 결과 비교 시작
        
        '''
        pycaret_ = self.pycaret_compare_models()
        autokeras_ = self.autokeras_model()
        
        pycaret_acc = accuracy_score(pycaret_['Label'], self.y_test)
        y_test = self.y_test.astype('str')
        autokeras_acc = accuracy_score(autokeras_, y_test)
        
        print("Accuracy for each autoML models result")
        print("Pycaret: ", pycaret_acc)
        print("Autokeras: ", autokeras_acc)
            

In [57]:
iris = load_iris()
iris_data = np.concatenate([iris.data, iris.target.reshape(-1,1)], 1)
df_iris = pd.DataFrame(data=iris_data, columns = iris.feature_names + ['target'])

In [58]:
clm = classification_model(df_iris)
clm.forward()

Trial 5 Complete [00h 00m 05s]
val_accuracy: 0.9166666865348816

Best val_accuracy So Far: 0.9166666865348816
Total elapsed time: 00h 00m 25s
INFO:tensorflow:Oracle triggered exit
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets
Accuracy for each autoML models result
Pycaret:  1.0
Autokeras:  1.0


In [61]:
from pycaret.regression import *

In [70]:
class Regression_model():
    
    def __init__(self, df):
        self.df = df
        
        
    def preprocess(self):
        target_name = self.df.columns[-1]
        self.exp_reg102 = setup(data = self.df, target = target_name, session_id=123,
                  normalize = True, transformation = True, transform_target = True, 
                  combine_rare_levels = True, rare_level_threshold = 0.05,
                  remove_multicollinearity = True, multicollinearity_threshold = 0.95,
                  log_experiment = True, experiment_name = 'regression')
        
        # train / test size 를 출력
        self.X_train = self.exp_reg102[5][1][1]
        self.y_train = self.exp_reg102[5][2][1]
        self.X_test = self.exp_reg102[5][3][1]
        self.y_test = self.exp_reg102[5][4][1]
        print('X_train_shape : ', self.X_train.shape)
        print('y_train_shape : ', self.y_train.shape)
        print('X_test_shape : ', self.X_test.shape)
        print('y_test_shape : ', self.y_test.shape)
    
    
    def pycaret_compare_models(self):
        '''
        pycaret 오픈 소스를 통한 모델 생성 및 비교
        
        accuracy 기준으로 10개 fold를 이용해 cross validation 성능이 가장 좋은 top 3 모델을 선정
        
        총 5개의 후보에 대해 비교 진행
        1. 가장 성능이 좋은 것으로 나타난 머신러닝 모델
        2. top 3 모델을 blending한 모델 (hard)
        3. top 3 모델을 blending한 모델 (hard X)
        4. top 3 모델을 stacking한 모델
        5. top 3 모델을 xgboost를 이용해 stacking한 모델
        
        총 5가지 모델 중 accuracy가 가장 좋은 모델을 선정 및 결과 도출
        '''
        exp_reg102 = self.exp_reg102
        top3 = compare_models(n_select=3)
        
        # 가장 성능이 좋은 모델 tuning
        m1 = create_model(top3[0])
        tuned_m1 = tune_model(m1, n_iter=50)
        
        m2 = create_model(top3[1])
        m3 = create_model(top3[2])
        
        # 가장 성능이 좋은 3개의 모델 blend
        blend_hard = blend_models(estimator_list = [m1, m2, m3])
        blender_top3 = blend_models(top3)
        
        # 가장 성능이 좋은 3개의 모델 stack
        stack_soft = stack_models(top3)
        
        xgboost = create_model('xgboost')
        stack_soft2 = stack_models(top3, meta_model=xgboost)
        
        # 모든 모델 중 하나의 모델 선택
        best_model = automl(use_holdout=True)
        pred_holdout = predict_model(best_model)
        
        return pred_holdout

        
    def autokeras_model(self):
        y_train = np.array(self.y_train)
        clf = ak.StructuredDataRegressor(overwrite=True, max_trials=5)
        clf.fit(self.X_train, self.y_train, epochs=50)
        predicted_y = clf.predict(self.X_test)
        
        return predicted_y
    
    
    def forward(self):
        print("Preprocessing Start")
        self.preprocess()
        print("Preprocessing Done!")
        print("Model Training Start!")
        '''
        각 모델들을 학습시키고 결과 비교 시작
        '''
        pycaret_ = self.pycaret_compare_models()
        autokeras_ = self.autokeras_model()
        
        pycaret_acc = mean_absolute_error(pycaret_['Label'], self.y_test)
        y_test = self.y_test
        autokeras_acc = mean_absolute_error(autokeras_, y_test)
        
        print("Accuracy for each autoML models result")
        print("Pycaret: ", pycaret_acc)
        print("Autokeras: ", autokeras_acc)
            
    

In [71]:
boston = load_boston()
boston_data = np.concatenate([boston.data, boston.target.reshape(-1,1)], 1)

In [72]:
df_boston = pd.DataFrame(boston_data, columns = list(boston.feature_names) + ['target'])

In [73]:
rg = Regression_model(df_boston)
rg.forward()

Trial 5 Complete [00h 00m 04s]
val_loss: 15.143587112426758

Best val_loss So Far: 13.073578834533691
Total elapsed time: 00h 00m 22s
INFO:tensorflow:Oracle triggered exit
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


NameError: name 'mean_absolute_error' is not defined

In [40]:
rg.compare_models()

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.8407,13.5289,3.6782,0.8486,0.1392,0.1071
1,1.6294,4.7152,2.1714,0.8982,0.102,0.0853
2,1.9007,5.929,2.435,0.9408,0.131,0.1064
3,2.5649,17.3543,4.1659,0.8289,0.1695,0.1287
4,2.1327,7.1562,2.6751,0.9393,0.1191,0.0996
5,1.8943,7.2033,2.6839,0.866,0.1308,0.0973
6,1.4782,3.6572,1.9124,0.9458,0.0828,0.0667
7,2.7817,28.4382,5.3327,0.739,0.1929,0.1191
8,1.6387,4.0674,2.0168,0.9273,0.1113,0.0917
9,2.026,8.8364,2.9726,0.8978,0.1286,0.088


TypeError: blend_models() got an unexpected keyword argument 'method'

## Autokeras

- input type 에 유연한 편. ndarray, tf.data, pd.Series, pd.DataFrame 등 많은 input 지원 -> pycaret 과는 대비되는 장점

In [9]:
class autokeras_classif():
    
    def __init__(self, df):
        self.df = df
        
        
    def preprocess(self):
        train, test = train_test_split(self.df, random_state= 123)
        
        self.y_train = train[train.columns[-1]]
        self.X_train = train.drop(columns = train.columns[-1])
        self.y_test = test[test.columns[-1]]
        self.X_test = test.drop(columns = test.columns[-1])
        self.y_train = self.y_train.astype('str')
        self.y_test  = self.y_test.astype('str')
        
    def model_learn(self):
        clf = ak.StructuredDataClassifier(overwrite=True, max_trials=3)
        clf.fit(self.X_train, self.y_train, epochs=10)
        predicted_y = clf.predict(self.X_test)
        acc                 = accuracy_score(self.y_test, predicted_y)
        classReport         = classification_report(self.y_test, predicted_y)
        confMatrix          = confusion_matrix(self.y_test, predicted_y) 

        print(); print('Testing Results of the trained model: ')
        print(); print('Accuracy : ', acc)
        print(); print('Confusion Matrix :\n', confMatrix)
        print(); print('Classification Report :\n',classReport)


In [16]:
akc = autokeras_classif(df_iris)
akc.preprocess()

In [17]:
akc.model_learn()

Trial 3 Complete [00h 00m 01s]
val_accuracy: 0.875

Best val_accuracy So Far: 0.875
Total elapsed time: 00h 00m 04s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing Results of the trained model: 

Accuracy :  0.868421052631579

Confusion Matrix :
 [[16  0  0]
 [ 3  3  2]
 [ 0  0 14]]

Classification Report :
               precision    recall  f1-score   support

         0.0       0.84      1.00      0.91        16
         1.0       1.00      0.38      0.55         8
         2.0       0.88      1.00      0.93        14

    accuracy                           0.87        38
   macro avg       0.91      0.79      0.80        38
weighted avg       0.89      0.87      0.84        38



In [12]:
import warnings
warnings.filterwarnings("ignore")
import autokeras as ak
print(ak.__version__)

import logging
tf.get_logger().setLevel(logging.ERROR)

1.0.16.post1


In [18]:
df_boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [22]:
class autokeras_ref():
    
    def __init__(self, df):
        self.df = df

        
    def preprocess(self):
        train, test = train_test_split(self.df, random_state= 123)
        
        self.y_train = np.array(train[train.columns[-1]])
        self.X_train = train.drop(columns = train.columns[-1])
        self.y_test = np.array(test[test.columns[-1]])
        self.X_test = test.drop(columns = test.columns[-1])
        
        
    def model_learn(self):
        clf = ak.StructuredDataRegressor(overwrite=True, max_trials=3)
        clf.fit(self.X_train, self.y_train, epochs=10)
        predicted_y = clf.predict(self.X_test)
        print(np.mean((self.y_test-predicted_y)**2))


In [23]:
krg = autokeras_ref(df_boston)
krg.preprocess()

In [24]:
krg.model_learn()

Trial 3 Complete [00h 00m 01s]
val_loss: 67.78148651123047

Best val_loss So Far: 33.229000091552734
Total elapsed time: 00h 00m 04s
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
135.32502765670853


In [211]:
df_boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [212]:
data = df_boston.values

train, test = train_test_split(df_boston, random_state= 123)
        
data = data.astype('float32')
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)
# separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(506, 13) (506,)
(339, 13) (167, 13) (339,) (167,)


In [213]:
clff = ak.StructuredDataRegressor(overwrite=True, max_trials=3)
clff.fit(X_train,y_train, epochs=10, batch_size = 32)
predicted_y = clff.predict(X_test)


Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
structured_data...|True              |?                 
structured_data...|2                 |?                 
structured_data...|False             |?                 
structured_data...|0                 |?                 
structured_data...|32                |?                 
structured_data...|32                |?                 
regression_head...|0                 |?                 
optimizer         |adam              |?                 
learning_rate     |0.001             |?                 



ValueError: in user code:

    File "/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/keras/engine/base_preprocessing_layer.py", line 118, in adapt_step  *
        self.update_state(data)
    File "/opt/anaconda3/envs/pycaret/lib/python3.8/site-packages/keras/layers/preprocessing/index_lookup.py", line 541, in update_state  **
        raise ValueError(

    ValueError: Cannot adapt StringLookup layer after setting a static vocabulary via init argument or `set_vocabulary`.


In [201]:
clff

<autokeras.tasks.structured_data.StructuredDataRegressor at 0x7fc6bd4bb8e0>

In [207]:
# load the sonar dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
dataframe = read_csv(url, header=None)
print(dataframe.shape)
# split into input and output elements
data = dataframe.values
data = data.astype('float32')
X, y = data[:, :-1], data[:, -1]
print(X.shape, y.shape)
# separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(63, 2)
(63, 1) (63,)
(42, 1) (21, 1) (42,) (21,)


In [215]:
dataframe

Unnamed: 0,0,1
0,108,392.5
1,19,46.2
2,13,15.7
3,124,422.2
4,40,119.4
...,...,...
58,9,87.4
59,31,209.8
60,14,95.5
61,53,244.6


In [208]:
search = ak.StructuredDataRegressor(max_trials=15, loss='mean_absolute_error')
# perform the search

INFO:tensorflow:Reloading Oracle from existing project ./structured_data_regressor/oracle.json
INFO:tensorflow:Reloading Tuner from ./structured_data_regressor/tuner0.json


In [209]:
search.fit(x=X_train, y=y_train, verbose=0, epochs=10)

INFO:tensorflow:Oracle triggered exit
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
INFO:tensorflow:Assets written to: ./structured_data_regressor/best_model/assets


<keras.callbacks.History at 0x7fc6a0f348b0>

In [210]:
mae, _ = search.evaluate(X_test, y_test, verbose=0)
print('MAE: %.3f' % mae)

MAE: 117.589


In [1]:
import torch

ModuleNotFoundError: No module named 'torch'

In [26]:
import torch

print(torch.__version__)

1.10.1


In [27]:
X_train.shape

NameError: name 'X_train' is not defined