## Baseline V1 - 2024.01.18

### 파이프라인 정의

In [1]:
from sklearn.ensemble import VotingClassifier

from src.models.classification.models import Model
from src.base import BasePiepline
from src.utils.data_preprocessing import DataPreprocessing
from src.utils.manage_pkl_files import get_best_params
from src.utils.set_seed import seed_everything


# class AdvancePreprocessing(DataPreprocessing):
#     def __init__(self, df):
#         super().__init__(df)
    
#     def set_up(self):
#         print(f"{type(self).__name__}: set_up method 실행")
#         super().set_up()  # self.df = self.df.drop(columns=['ID', '근로기간'])
    
#     def __call__(self):
#         print("AdvancePreprocessing __call__ method is called.")
#         return super().__call__()

#     def step_four_add_another_function(self):
#         print("전처리 함수 추가함")


class ModelProcess(BasePiepline):
    """Pipeline variant that uses the project's ``DataPreprocessing`` step.

    NOTE(review): ``preprocessing`` is presumably a hook that ``BasePiepline``
    invokes from its own methods (e.g. ``run``) — confirm against ``src.base``.
    """

    def preprocessing(self, df):
        """Preprocess ``df`` with ``DataPreprocessing`` and return the result.

        To add extra preprocessing steps, subclass ``DataPreprocessing``,
        implement the steps there, and instantiate that subclass here instead
        (e.g. ``AdvancePreprocessing(df)``, as in the commented-out example).

        Args:
            df: Input data handed to the ``DataPreprocessing`` constructor.

        Returns:
            Whatever calling the ``DataPreprocessing`` instance returns.
        """
        return DataPreprocessing(df)()

### 분류 모델 클래스 및 파이프라인 선언

In [2]:
# `model` exposes the estimators used by later cells (model.xgboost, model.lightgbm);
# `model_process` is the pipeline object whose run()/train_score() are called below.
model = Model()
model_process = ModelProcess()

### 파라미터 최적화 데이터 불러오기

In [3]:
# Load the hyperparameters that a later cell passes to set_params() on the estimators.
# NOTE(review): presumably read from a pickle saved by a tuning run (the helper
# lives in manage_pkl_files) — confirm which model these were tuned for.
params = get_best_params()

### 앙상블 모델 생성

In [4]:
# Run the pipeline on the XGBoost estimator exposed by `model`.
_model = model.xgboost
model_process.run(_model)

DataPreprocessing __call__ method is called.
{'model_name': 'XGBClassifier', 'model_instance': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)}
DataPreprocessing __call__ method is called.


In [3]:
# Report training-set metrics for the XGBoost estimator
# (the recorded output lists f1, precision and recall).
_model = model.xgboost
model_process.train_score(_model)

----[Train Score]-----
f1 score : 0.7936
precision score : 0.7695
recall score : 0.8249


In [8]:
# Build a soft-voting ensemble from the XGBoost and LightGBM estimators.
# `VotingClassifier` is imported with the other imports at the top of the notebook.
# set_params() configures each estimator in place and returns that same estimator.
# NOTE(review): the same `params` dict is applied to both estimators — confirm
# that get_best_params() only returns keys that both XGBoost and LightGBM accept.
xgboost_best_params_model = model.xgboost.set_params(**params)
lightgbm_model = model.lightgbm.set_params(**params)

ensemble = VotingClassifier(
    estimators=[
        ("xgb", xgboost_best_params_model),
        ("lgbm", lightgbm_model),
    ],
    voting="soft",  # combine predicted class probabilities rather than hard votes
)

### 파이프라인 실행

In [9]:
# Run the same pipeline with the voting ensemble in place of a single estimator.
model_process.run(ensemble)

DataPreprocessing __call__ method is called.
DataPreprocessing set_up method is called.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1511
[LightGBM] [Info] Number of data points in the train set: 96294, number of used features: 14
[LightGBM] [Info] Start training from score -1.747695
[LightGBM] [Info] Start training from score -1.206441
[LightGBM] [Info] Start training from score -1.248757
[LightGBM] [Info] Start training from score -1.975590
[LightGBM] [Info] Start training from score -2.572162
[LightGBM] [Info] Start training from score -3.897527
[LightGBM] [Info] Start training from score -5.434907
[Train Score]: 1.00 {'model_name': 'VotingClassifier', 'model_instance': VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=None, booster=No