In [1]:
import numpy as np
import pandas as pd

# 소수를 e로 표현하지 않도록 하기
pd.options.display.float_format = "{:.5f}".format

In [2]:
lib_2007 = pd.read_csv("../final_data/lib_2007.csv", index_col=0)
lib_2008 = pd.read_csv("../final_data/lib_2008.csv", index_col=0)
lib_2009 = pd.read_csv("../final_data/lib_2009.csv", index_col=0)
lib_2010 = pd.read_csv("../final_data/lib_2010.csv", index_col=0)
lib_2011 = pd.read_csv("../final_data/lib_2011.csv", index_col=0)
lib_2012 = pd.read_csv("../final_data/lib_2012.csv", index_col=0)
lib_2013 = pd.read_csv("../final_data/lib_2013.csv", index_col=0)
lib_2014 = pd.read_csv("../final_data/lib_2014.csv", index_col=0)
lib_2015 = pd.read_csv("../final_data/lib_2015.csv", index_col=0)
lib_2016 = pd.read_csv("../final_data/lib_2016.csv", index_col=0)
lib_2017 = pd.read_csv("../final_data/lib_2017.csv", index_col=0)
lib_2018 = pd.read_csv("../final_data/lib_2018.csv", index_col=0)
lib_2019 = pd.read_csv("../final_data/lib_2019.csv", index_col=0)
lib_2020 = pd.read_csv("../final_data/lib_2020.csv", index_col=0)
lib_2021 = pd.read_csv("../final_data/lib_2021.csv", index_col=0)

In [3]:
train_lib = pd.concat([lib_2007,lib_2008,lib_2009,lib_2010,lib_2011,lib_2012,lib_2013,lib_2014,lib_2015, lib_2016, lib_2017])
valid_lib = pd.concat([lib_2018, lib_2019])
test_lib = pd.concat([lib_2020,lib_2021])

In [4]:
# 사분위수를 활용하여 비율 충족도의 이상치 행 제거하기
def del_outlier(df, col):
    q1 = df[col].quantile(0.10)
    q3 = df[col].quantile(0.90)
    iqr = q3 - q1 
    boundary = 1.5 * iqr 

    upper_index = df[df[col] > q3 + boundary].index
    lower_index = df[df[col] < q1 - boundary].index 

    df.drop(upper_index, inplace = True)
    df.drop(lower_index, inplace = True)

    return df

del_outlier(train_lib, "future_acq_budget_settlement")
del_outlier(valid_lib, "future_acq_budget_settlement")
del_outlier(test_lib, "future_acq_budget_settlement") 

# 이상치 제거 후 train, test set의 데이터 수 구하기
print(f"이상치 제거 후 train set의 데이터 수 : {len(train_lib)}")
print(f"이상치 제거 후 valid set의 데이터 수 : {len(valid_lib)}")
print(f"이상치 제거 후 test set의 데이터 수 : {len(test_lib)}")

이상치 제거 후 train set의 데이터 수 : 8184
이상치 제거 후 valid set의 데이터 수 : 2178
이상치 제거 후 test set의 데이터 수 : 2310


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Min-Max Scaler 생성
scaler = MinMaxScaler()

# 모든 변수에 대해 Min-Max Scaling 적용
# 스케일러를 학습 데이터에 대해 fit
scaler.fit(train_lib.drop(columns=["future_acq_budget_settlement"]))

# 학습 데이터에 대해 변환
train_lib_scaled = pd.DataFrame(scaler.transform(train_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])

# 타겟 변수에 대해 스케일링 적용
scaler_y = MinMaxScaler()
train_y_scaled = scaler_y.fit_transform(train_lib[['future_acq_budget_settlement']])

# 학습 데이터에 스케일된 타겟 변수 추가
train_lib_scaled['future_acq_budget_settlement_scaled'] = train_y_scaled

# 검증 데이터에 대해 변환
valid_lib_scaled = pd.DataFrame(scaler.transform(valid_lib.drop(columns=["future_acq_budget_settlement"])), 
                                columns=train_lib.columns[:-1])
valid_y_scaled = scaler_y.transform(valid_lib[['future_acq_budget_settlement']])
valid_lib_scaled['future_acq_budget_settlement_scaled'] = valid_y_scaled

# 테스트 데이터에 대해 변환
test_lib_scaled = pd.DataFrame(scaler.transform(test_lib.drop(columns=["future_acq_budget_settlement"])), 
                               columns=train_lib.columns[:-1])
test_y_scaled = scaler_y.transform(test_lib[['future_acq_budget_settlement']])
test_lib_scaled['future_acq_budget_settlement_scaled'] = test_y_scaled

# X_train, Y_train 구성하기 (스케일된 타겟 사용)
X_train = train_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_train = train_lib_scaled["future_acq_budget_settlement_scaled"]

# X_valid, Y_valid 구성하기 (스케일된 타겟 사용)
X_valid = valid_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_valid = valid_lib_scaled["future_acq_budget_settlement_scaled"]

# X_test, Y_test 구성하기 (스케일된 타겟 사용)
X_test = test_lib_scaled.drop(columns=["future_acq_budget_settlement_scaled"])
Y_test = test_lib_scaled["future_acq_budget_settlement_scaled"]


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from scipy.stats import randint, uniform

# Define parameter grids for each model

# Random Forest
rf_param_grid = {
    'n_estimators': randint(50, 500),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# Extra Trees
et_param_grid = {
    'n_estimators': randint(50, 500),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': randint(10, 100),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10)
}

# CatBoost
cat_param_grid = {
    'iterations': randint(100, 1000),
    'depth': randint(4, 10),
    'learning_rate': uniform(0.01, 0.3),
    'l2_leaf_reg': randint(1, 10),
    'border_count': randint(1, 255)
}

# LightGBM
lgb_param_grid = {
    'n_estimators': randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'num_leaves': randint(20, 100),
    'max_depth': randint(10, 100),
    'min_child_samples': randint(1, 20),
    'subsample': uniform(0.6, 0.4)
}

# XGBoost
xgb_param_grid = {
    'n_estimators': randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4)
}

# RandomizedSearchCV setup
def perform_randomized_search(model, param_grid, X_train, y_train):
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=100,
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=5
    )
    random_search.fit(X_train, y_train)
    return random_search

# RandomForestRegressor
rf_model = RandomForestRegressor()
rf_search = perform_randomized_search(rf_model, rf_param_grid, X_train, Y_train)


# ExtraTreesRegressor
et_model = ExtraTreesRegressor()
et_search = perform_randomized_search(et_model, et_param_grid, X_train, Y_train)


# CatBoostRegressor
cat_model = CatBoostRegressor(silent=True) 
cat_search = perform_randomized_search(cat_model, cat_param_grid, X_train, Y_train)


# LGBMRegressor
lgb_model = LGBMRegressor()
lgb_search = perform_randomized_search(lgb_model, lgb_param_grid, X_train, Y_train)


# XGBRegressor
xgb_model = XGBRegressor(verbosity=0)  # verbosity=0 to suppress verbose output
xgb_search = perform_randomized_search(xgb_model, xgb_param_grid, X_train, Y_train)




print("Best parameters for RandomForestRegressor:", rf_search.best_params_)
print("Best parameters for ExtraTreesRegressor:", et_search.best_params_)
print("Best parameters for CatBoostRegressor:", cat_search.best_params_)
print("Best parameters for LGBMRegressor:", lgb_search.best_params_)
print("Best parameters for XGBRegressor:", xgb_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
92 fits failed with the following error:
Traceback (most recent call last):
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/utils/_param

Fitting 3 folds for each of 100 candidates, totalling 300 fits


120 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
37 fits failed with the following error:
Traceback (most recent call last):
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/root/anaconda3/envs/jw_tensorflow/lib/python3.11/site-packages/sklearn/utils/_param

Fitting 3 folds for each of 100 candidates, totalling 300 fits




Fitting 3 folds for each of 100 candidates, totalling 300 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002915 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8107
[LightGBM] [Info] Number of data points in the train set: 5456, number of used features: 36
[LightGBM] [Info] Start training from score 0.196868
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.083130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8144
[LightGBM] [Info] Number of data points in the train set: 5456, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8151
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.103648 seconds.
You can 



Best parameters for RandomForestRegressor: {'max_depth': 36, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 322}
Best parameters for ExtraTreesRegressor: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 416}
Best parameters for CatBoostRegressor: {'border_count': 124, 'depth': 4, 'iterations': 136, 'l2_leaf_reg': 8, 'learning_rate': 0.1092250914011541}
Best parameters for LGBMRegressor: {'learning_rate': 0.034478254120072105, 'max_depth': 16, 'min_child_samples': 4, 'n_estimators': 222, 'num_leaves': 39, 'subsample': 0.8599855723111061}
Best parameters for XGBRegressor: {'colsample_bytree': 0.6464290562027665, 'learning_rate': 0.023800792606525824, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 253, 'subsample': 0.6068644407327001}


In [7]:
print("Best parameters for RandomForestRegressor:", rf_search.best_params_)
print("Best parameters for ExtraTreesRegressor:", et_search.best_params_)
print("Best parameters for CatBoostRegressor:", cat_search.best_params_)
print("Best parameters for LGBMRegressor:", lgb_search.best_params_)
print("Best parameters for XGBRegressor:", xgb_search.best_params_)

Best parameters for RandomForestRegressor: {'max_depth': 36, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 322}
Best parameters for ExtraTreesRegressor: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 416}
Best parameters for CatBoostRegressor: {'border_count': 124, 'depth': 4, 'iterations': 136, 'l2_leaf_reg': 8, 'learning_rate': 0.1092250914011541}
Best parameters for LGBMRegressor: {'learning_rate': 0.034478254120072105, 'max_depth': 16, 'min_child_samples': 4, 'n_estimators': 222, 'num_leaves': 39, 'subsample': 0.8599855723111061}
Best parameters for XGBRegressor: {'colsample_bytree': 0.6464290562027665, 'learning_rate': 0.023800792606525824, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 253, 'subsample': 0.6068644407327001}
