### 1. 전처리, 피처공학

In [1]:
# # Data Preprocessing
import pandas as pd
import numpy as np

data = pd.read_csv('../DATA/raw_2023051820231018_경대기업맞춤형.csv')
print('기본 데이터 shape :', data.shape)

# 2) scale_pv < 5, E_scr_pv == 8, k_rpm_pv > 50
data = data[data['scale_pv'] < 5]  # 약 1800개 제거
print('scale_pv < 5 shape :', data.shape)
data = data[data['E_scr_pv'] == 8]  # 약 3800개 제거
print('E_scr_pv == 8 shape :', data.shape)
data = data[data['k_rpm_pv'] > 50] # 약 170개 제거
print('k_rpm_pv > 100 shape :', data.shape)

# 3) E_scr_sv, c_temp_sv, n_temp_sv, s_temp_sv, k_rpm_sv, n_temp_sv 제거
data.drop(['E_scr_sv', 'E_scr_pv', 'c_temp_sv', 's_temp_sv', 'k_rpm_sv', 'n_temp_sv', 'Unnamed: 12'], axis=1, inplace=True)

data['time'] = pd.to_datetime(data['time'])
oct_data = data[data['time'].dt.month == 10]
oct_data = oct_data.drop('time', axis=1)
print('oct_data shape :', oct_data.shape)

train_data = data[data['time'].dt.month != 10]
train_data = train_data.drop('time', axis=1)
print('train_data shape :', train_data.shape)

기본 데이터 shape : (235413, 13)
scale_pv < 5 shape : (233676, 13)
E_scr_pv == 8 shape : (229983, 13)
k_rpm_pv > 100 shape : (229810, 13)
oct_data shape : (29651, 5)
train_data shape : (200159, 5)


### 스케일링 X. 증강 X

In [2]:
# 2 < scale_pv < 4
train_data_2_4 = train_data[(train_data['scale_pv'] > 2) & (train_data['scale_pv'] < 4)]
oct_data_2_4 = oct_data[(oct_data['scale_pv'] > 2) & (oct_data['scale_pv'] < 4)]

X_train = train_data_2_4.drop('scale_pv', axis=1)
y_train = train_data_2_4['scale_pv']
X_test = oct_data_2_4.drop('scale_pv', axis=1)
y_test = oct_data_2_4['scale_pv']


In [3]:
# 스케일링 없이 데이터 학습 및 분석
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score


# 모델 리스트
models = [
    ('LinearRegression', LinearRegression()),
    ('Lasso', Lasso()),
    ('Ridge', Ridge()),
    ('BayesianRidge', BayesianRidge()),
    ('ElasticNet', ElasticNet()),
    # ('SGDRegressor', SGDRegressor()),
    ('RandomForestRegressor', RandomForestRegressor()),
    ('GradientBoostingRegressor', GradientBoostingRegressor()),
    ('AdaBoostRegressor', AdaBoostRegressor()),
    ('LGBMRegressor', LGBMRegressor())
]

# [ cross_val_score 사용은 보류]
# # 모델별 성능 확인
# for name, model in models:
#     pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', model)
#     ])
#     scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')
#     print(f'{name} : {scores.mean()}')
    
# 모델별 성능 확인
score_df = pd.DataFrame(columns=['scaler', 'model', 'MAE', 'MAPE', 'R2'])

for name, model in models:
    # StandardScaler 미사용
    pipeline = Pipeline([
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    diff = y_test - y_pred
    
    score_df = pd.concat(
        [score_df, 
         pd.DataFrame(
            [['None',
              name, 
              mean_absolute_error(y_test, y_pred), 
              mean_absolute_percentage_error(y_test, y_pred)*100, 
              r2_score(y_test, y_pred),
              diff.mean(), diff.std(), diff.max(), diff.min(), diff.median()
            ]],
            columns=['scaler', 'model', 'MAE', 'MAPE', 'R2', 'diff_mean', 'diff_std', 'diff_max', 'diff_min', 'diff_median'])])
     

for name, model in models:
    # StandardScaler 사용
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # print(f'[ {name} ]')
    # print('MAE :', mean_absolute_error(y_test, y_pred))
    # print('MAPE :', mean_absolute_percentage_error(y_test, y_pred)*100)
    # print('R2 :', r2_score(y_test, y_pred))
    
    diff = y_test - y_pred
    
    # score_df에 저장 : concat
    score_df = pd.concat(
        [score_df, 
         pd.DataFrame(
            [[str(pipeline.named_steps['scaler']),
              name, 
              mean_absolute_error(y_test, y_pred), 
              mean_absolute_percentage_error(y_test, y_pred)*100, 
              r2_score(y_test, y_pred),
              diff.mean(), diff.std(), diff.max(), diff.min(), diff.median()
            ]],
            columns=['scaler', 'model', 'MAE', 'MAPE', 'R2', 'diff_mean', 'diff_std', 'diff_max', 'diff_min', 'diff_median'])])
    
score_df

  score_df = pd.concat(


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000252 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 215
[LightGBM] [Info] Number of data points in the train set: 36720, number of used features: 4
[LightGBM] [Info] Start training from score 3.041614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 215
[LightGBM] [Info] Number of data points in the train set: 36720, number of used features: 4
[LightGBM] [Info] Start training from score 3.041614


Unnamed: 0,scaler,model,MAE,MAPE,R2,diff_mean,diff_std,diff_max,diff_min,diff_median
0,,LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348
0,,Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386
0,,Ridge,0.023703,0.774475,0.013734,0.009338,0.030945,0.218106,-0.182137,0.009348
0,,BayesianRidge,0.02371,0.774689,0.013247,0.009363,0.030945,0.218172,-0.182093,0.009399
0,,ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386
0,,RandomForestRegressor,0.028054,0.918468,-0.326716,0.005242,0.037121,0.2439,-0.207033,0.005375
0,,GradientBoostingRegressor,0.023749,0.776502,-0.001255,0.007719,0.03164,0.217014,-0.21031,0.007059
0,,AdaBoostRegressor,0.025824,0.843499,-0.173625,0.012019,0.033148,0.236022,-0.191613,0.010687
0,,LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949
0,StandardScaler(),LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348


In [None]:
# %pip install bayesian-optimization

In [9]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC

from bayes_opt import BayesianOptimization

def svc_cv(C, gamma, data, targets):
    """SVC cross validation.

    This function will instantiate a SVC classifier with parameters C and
    gamma. Combined with data and targets this will in turn be used to perform
    cross validation. The result of cross validation is returned.

    Our goal is to find combinations of C and gamma that maximizes the roc_auc
    metric.
    """
    estimator = SVC(C=C, gamma=gamma, random_state=2)
    cval = cross_val_score(estimator, data, targets, scoring='roc_auc', cv=4)
    return cval.mean()


def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Random Forest cross validation.

    This function will instantiate a random forest classifier with parameters
    n_estimators, min_samples_split, and max_features. Combined with data and
    targets this will in turn be used to perform cross validation. The result
    of cross validation is returned.

    Our goal is to find combinations of n_estimators, min_samples_split, and
    max_features that minimizes the log loss.
    """
    estimator = RFC(
        n_estimators=n_estimators,
        min_samples_split=min_samples_split,
        max_features=max_features,
        random_state=2
    )
    cval = cross_val_score(estimator, data, targets,
                           scoring='neg_log_loss', cv=4)
    return cval.mean()
def optimize_svc(data, targets):
    """Apply Bayesian Optimization to SVC parameters."""
    def svc_crossval(expC, expGamma):
        """Wrapper of SVC cross validation.

        Notice how we transform between regular and log scale. While this
        is not technically necessary, it greatly improves the performance
        of the optimizer.
        """
        C = 10 ** expC
        gamma = 10 ** expGamma
        return svc_cv(C=C, gamma=gamma, data=data, targets=targets)

    optimizer = BayesianOptimization(
        f=svc_crossval,
        pbounds={"expC": (-3, 2), "expGamma": (-4, -1)},
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)


def optimize_rfc(data, targets):
    """Apply Bayesian Optimization to Random Forest parameters."""
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        """Wrapper of RandomForest cross validation.

        Notice how we ensure n_estimators and min_samples_split are casted
        to integer before we pass them along. Moreover, to avoid max_features
        taking values outside the (0, 1) range, we also ensure it is capped
        accordingly.
        """
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (10, 250),
            "min_samples_split": (2, 25),
            "max_features": (0.1, 0.999),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)
    
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.svm import SVR

from bayes_opt import BayesianOptimization

def svr_cv(C, gamma, data, targets):
    estimator = SVR(C=C, gamma=gamma)
    cval = cross_val_score(estimator, data, targets, scoring='neg_mean_squared_error', cv=4)
    return cval.mean()

def rfr_cv(n_estimators, min_samples_split, max_features, data, targets):
    estimator = RFR(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=max(min(max_features, 0.999), 1e-3),
        random_state=2
    )
    cval = cross_val_score(estimator, data, targets, scoring='neg_mean_squared_error', cv=4)
    return cval.mean()

def optimize_svr(data, targets):
    def svr_crossval(expC, expGamma):
        C = 10 ** expC
        gamma = 10 ** expGamma
        return svr_cv(C=C, gamma=gamma, data=data, targets=targets)

    optimizer = BayesianOptimization(
        f=svr_crossval,
        pbounds={"expC": (-3, 2), "expGamma": (-4, -1)},
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)

def optimize_rfr(data, targets):
    def rfr_crossval(n_estimators, min_samples_split, max_features):
        return rfr_cv(
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfr_crossval,
        pbounds={
            "n_estimators": (10, 250),
            "min_samples_split": (2, 25),
            "max_features": (0.1, 0.999),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=10)

    print("Final result:", optimizer.max)

In [11]:
# bayesian optimization for regression
optimize_svr(X_train, y_train)
optimize_rfr(X_train, y_train)

|   iter    |  target   |   expC    | expGamma  |
-------------------------------------------------
| [0m1        [0m | [0m-0.001675[0m | [0m-2.042   [0m | [0m-2.134   [0m |
| [0m2        [0m | [0m-0.001899[0m | [0m-0.8114  [0m | [0m-1.644   [0m |
| [0m3        [0m | [0m-0.001757[0m | [0m0.8999   [0m | [0m-3.182   [0m |
| [0m4        [0m | [0m-0.001818[0m | [0m-1.618   [0m | [0m-1.594   [0m |
| [0m5        [0m | [0m-0.00269 [0m | [0m1.791    [0m | [0m-1.372   [0m |
| [95m6        [0m | [95m-0.001613[0m | [95m-1.011   [0m | [95m-4.0     [0m |
| [95m7        [0m | [95m-0.001564[0m | [95m-3.0     [0m | [95m-4.0     [0m |
| [0m8        [0m | [0m-0.001668[0m | [0m2.0      [0m | [0m-4.0     [0m |
| [0m9        [0m | [0m-0.001587[0m | [0m-3.0     [0m | [0m-1.0     [0m |
| [95m10       [0m | [95m-0.00155 [0m | [95m-3.0     [0m | [95m-2.692   [0m |
| [0m11       [0m | [0m-0.001633[0m | [0m0.4357   [0m | [0m-4

In [13]:
# 각 SVR, RFR 모델별 최적화된 파라미터로 학습 및 예측
# SVR : {'target': -0.0015364217791858223, 'params': {'expC': -3.0, 'expGamma': -1.8371336702048258}}
# RFR : {'target': -0.0015870208389014108, 'params': {'max_features': 0.34854136537364394, 'min_samples_split': 20.443060083305443, 'n_estimators': 239.95344488408924}}
svr = SVR(C=10**-3, gamma=10**-1.8371336702048258)
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
print('SVR MAE :', mean_absolute_error(y_test, y_pred))
print('SVR MAPE :', mean_absolute_percentage_error(y_test, y_pred)*100)
print('SVR R2 :', r2_score(y_test, y_pred))
diff = y_test - y_pred
score_df = pd.concat(
    [score_df, 
     pd.DataFrame(
        [['None',
          'SVR', 
          mean_absolute_error(y_test, y_pred), 
          mean_absolute_percentage_error(y_test, y_pred)*100, 
          r2_score(y_test, y_pred),
          diff.mean(), diff.std(), diff.max(), diff.min(), diff.median()
        ]],
        columns=['scaler', 'model', 'MAE', 'MAPE', 'R2', 'diff_mean', 'diff_std', 'diff_max', 'diff_min', 'diff_median'])])

rfr = RFR(n_estimators=240, min_samples_split=20, max_features=0.349)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
print('RFR MAE :', mean_absolute_error(y_test, y_pred))
print('RFR MAPE :', mean_absolute_percentage_error(y_test, y_pred)*100)
print('RFR R2 :', r2_score(y_test, y_pred))
diff = y_test - y_pred
score_df = pd.concat(
    [score_df, 
     pd.DataFrame(
        [['None',
          'RFR', 
          mean_absolute_error(y_test, y_pred), 
          mean_absolute_percentage_error(y_test, y_pred)*100, 
          r2_score(y_test, y_pred),
          diff.mean(), diff.std(), diff.max(), diff.min(), diff.median()
        ]],
        columns=['scaler', 'model', 'MAE', 'MAPE', 'R2', 'diff_mean', 'diff_std', 'diff_max', 'diff_min', 'diff_median'])])



SVR MAE : 0.026444442434937328
SVR MAPE : 0.863107501273374
SVR R2 : -0.17035263635446385
RFR MAE : 0.02448590793199232
RFR MAPE : 0.8011213892547586
RFR R2 : -0.038916358955644936


In [14]:
score_df

Unnamed: 0,scaler,model,MAE,MAPE,R2,diff_mean,diff_std,diff_max,diff_min,diff_median
0,,LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348
0,,Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386
0,,Ridge,0.023703,0.774475,0.013734,0.009338,0.030945,0.218106,-0.182137,0.009348
0,,BayesianRidge,0.02371,0.774689,0.013247,0.009363,0.030945,0.218172,-0.182093,0.009399
0,,ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386
0,,RandomForestRegressor,0.028054,0.918468,-0.326716,0.005242,0.037121,0.2439,-0.207033,0.005375
0,,GradientBoostingRegressor,0.023749,0.776502,-0.001255,0.007719,0.03164,0.217014,-0.21031,0.007059
0,,AdaBoostRegressor,0.025824,0.843499,-0.173625,0.012019,0.033148,0.236022,-0.191613,0.010687
0,,LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949
0,StandardScaler(),LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348


In [15]:
# 평가 : MAE, MAPE, diff_mean, diff_std, diff_max와 diff_min의 절대값 합이 작을수록 좋은 모델
# 절대값 합
score_df['abs_sum'] = score_df['diff_max'].abs() + score_df['diff_min'].abs()
score_df.sort_values('abs_sum', ascending=True)

Unnamed: 0,scaler,model,MAE,MAPE,R2,diff_mean,diff_std,diff_max,diff_min,diff_median,abs_sum
0,StandardScaler(),LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348,0.400243
0,,LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348,0.400243
0,,Ridge,0.023703,0.774475,0.013734,0.009338,0.030945,0.218106,-0.182137,0.009348,0.400243
0,StandardScaler(),Ridge,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182138,0.009348,0.400244
0,,BayesianRidge,0.02371,0.774689,0.013247,0.009363,0.030945,0.218172,-0.182093,0.009399,0.400264
0,StandardScaler(),BayesianRidge,0.023703,0.774444,0.013678,0.009334,0.030947,0.218152,-0.182175,0.009356,0.400327
0,StandardScaler(),LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949,0.417462
0,,LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949,0.417462
0,StandardScaler(),AdaBoostRegressor,0.027677,0.903282,-0.301232,0.017378,0.032807,0.231384,-0.190195,0.014156,0.421579
0,,SVR,0.026444,0.863108,-0.170353,0.015997,0.031365,0.217505,-0.20694,0.015072,0.424445


절대값 합 정렬 시 StandardScaler(), LinearRegression()가 제일 낮다.

In [16]:
# diff_mean 정렬
score_df.sort_values('diff_mean', ascending=True)

Unnamed: 0,scaler,model,MAE,MAPE,R2,diff_mean,diff_std,diff_max,diff_min,diff_median,abs_sum
0,StandardScaler(),RandomForestRegressor,0.028001,0.916759,-0.322519,0.004968,0.037099,0.2414,-0.19692,0.00466,0.43832
0,,RandomForestRegressor,0.028054,0.918468,-0.326716,0.005242,0.037121,0.2439,-0.207033,0.005375,0.450933
0,StandardScaler(),LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949,0.417462
0,,LGBMRegressor,0.024335,0.796186,-0.031141,0.005449,0.032598,0.214798,-0.202663,0.004949,0.417462
0,,Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,,ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,StandardScaler(),ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,StandardScaler(),Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,,RFR,0.024486,0.801121,-0.038916,0.006199,0.032591,0.224731,-0.202895,0.006426,0.427626
0,StandardScaler(),GradientBoostingRegressor,0.023801,0.778226,-0.004808,0.007717,0.0317,0.217014,-0.213104,0.007059,0.430118


In [17]:
# MAPE 정렬
score_df.sort_values('MAPE', ascending=True)

Unnamed: 0,scaler,model,MAE,MAPE,R2,diff_mean,diff_std,diff_max,diff_min,diff_median,abs_sum
0,,Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,StandardScaler(),ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,,ElasticNet,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,StandardScaler(),Lasso,0.023442,0.765985,-0.033625,0.005966,0.032548,0.238386,-0.191614,0.008386,0.43
0,StandardScaler(),BayesianRidge,0.023703,0.774444,0.013678,0.009334,0.030947,0.218152,-0.182175,0.009356,0.400327
0,StandardScaler(),Ridge,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182138,0.009348,0.400244
0,StandardScaler(),LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348,0.400243
0,,LinearRegression,0.023703,0.774473,0.013737,0.009337,0.030945,0.218106,-0.182137,0.009348,0.400243
0,,Ridge,0.023703,0.774475,0.013734,0.009338,0.030945,0.218106,-0.182137,0.009348,0.400243
0,,BayesianRidge,0.02371,0.774689,0.013247,0.009363,0.030945,0.218172,-0.182093,0.009399,0.400264


In [29]:
# 각 순위 출력 : 이름 = scaler + model, 비교 대상은 각 MAE, MAPE, diff_mean, abs_sum, 10위까지
score_df['name'] = score_df['scaler'] + ' ' + score_df['model']

score_board = pd.DataFrame()
for col in ['MAE', 'MAPE', 'diff_mean', 'abs_sum']:
    print(f'[{col}]')
    for idx, name in enumerate(score_df.sort_values(col)['name'].values):
        print(f'{idx+1}위 : {name}')
        score_board.loc[idx, col] = name
        if idx == 4:
            break
    print()

score_board

[MAE]
1위 : None Lasso
2위 : StandardScaler() ElasticNet
3위 : None ElasticNet
4위 : StandardScaler() Lasso
5위 : StandardScaler() BayesianRidge

[MAPE]
1위 : None Lasso
2위 : StandardScaler() ElasticNet
3위 : None ElasticNet
4위 : StandardScaler() Lasso
5위 : StandardScaler() BayesianRidge

[diff_mean]
1위 : StandardScaler() RandomForestRegressor
2위 : None RandomForestRegressor
3위 : StandardScaler() LGBMRegressor
4위 : None LGBMRegressor
5위 : None Lasso

[abs_sum]
1위 : StandardScaler() LinearRegression
2위 : None LinearRegression
3위 : None Ridge
4위 : StandardScaler() Ridge
5위 : None BayesianRidge



Unnamed: 0,MAE,MAPE,diff_mean,abs_sum
0,None Lasso,None Lasso,StandardScaler() RandomForestRegressor,StandardScaler() LinearRegression
1,StandardScaler() ElasticNet,StandardScaler() ElasticNet,None RandomForestRegressor,None LinearRegression
2,None ElasticNet,None ElasticNet,StandardScaler() LGBMRegressor,None Ridge
3,StandardScaler() Lasso,StandardScaler() Lasso,None LGBMRegressor,StandardScaler() Ridge
4,StandardScaler() BayesianRidge,StandardScaler() BayesianRidge,None Lasso,None BayesianRidge


In [31]:
# 전체 컬럼에서 가장 많이 나온 모델
score_board.stack().value_counts().head(5)

None Lasso                        3
StandardScaler() ElasticNet       2
None ElasticNet                   2
StandardScaler() Lasso            2
StandardScaler() BayesianRidge    2
Name: count, dtype: int64

None Lasso가 가장 많이 언급