In [None]:
# Mount Google Drive into the Colab runtime so the data files under
# /content/drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Directory on the mounted Google Drive that holds every input/output CSV.
DATA_PATH = "/content/drive/MyDrive/the_datas/data/"

SEED = 42  # seed value reused by the models below

# Load the raw files (pandas/numpy are already imported at the top of this cell).
train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv")  # training purchase-record data
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv")  # training answer (label) data
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv")  # test purchase-record data
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv")  # submission template

# Load the shared, pre-built feature files.
train_ft = pd.read_csv(f"{DATA_PATH}train_ft_군집분석_34.csv")  # training features
test_ft = pd.read_csv(f"{DATA_PATH}test_ft_군집분석_34.csv")  # test features

# Last expression of the cell: shown as its output so the shapes can be checked.
train_tr.shape, train_target.shape, test_tr.shape, submit.shape, train_ft.shape, test_ft.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2), (14940, 535), (12225, 535))

In [None]:
# Take the label column out of the answer table; the bare name on the last
# line displays it as the cell output.
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# 중요도(importance)가 0인 피처 제거

In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Fit a quick XGBoost model on an 80% split of the training features; only its
# feature importances are used, the held-out 20% is not evaluated here.
X_train, X_test, y_train, y_test = train_test_split(
    train_ft, target, test_size=0.2, random_state=SEED
)

model = XGBClassifier(n_estimators=100, random_state=SEED)
model.fit(X_train, y_train)

feature_importances = model.feature_importances_

# One row per feature, most important first.
feature_importance_df = pd.DataFrame({
    'Feature': train_ft.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Keep only the features whose importance is non-zero (a boolean mask instead
# of an in-place drop, so the statement can be read on its own).
feature_importance_df = feature_importance_df[feature_importance_df["Importance"] != 0]
cols = feature_importance_df['Feature'].tolist()

# NOTE: train_ft / test_ft are overwritten with the reduced column set (ordered
# by importance); re-running this cell prunes the already-reduced frames again.
train_ft = train_ft[cols]
test_ft = test_ft[cols]
train_ft.shape, test_ft.shape

((14940, 365), (12225, 365))

# AutoML(Automated machine learning)
- 시간 소모적이고 반복적인 기계 학습 모델 개발 작업을 자동화하는 프로세스
- 데이터 과학자, 분석가 및 개발자는 모델 품질을 유지하면서 확장성, 효율성 및 생산성이 높은 ML 모델을 빌드할 수 있다.

## FLAML(A Fast Library for Automated Machine Learning & Tuning)
- 머신러닝 학습 및 하이퍼파라미터 튜닝을 자동화해 주는 라이브러리
- https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML
- flaml 설치하기
    ```bash
    pip install flaml
    ```

In [None]:
pip install flaml

Collecting flaml
  Downloading FLAML-2.3.2-py3-none-any.whl.metadata (16 kB)
Downloading FLAML-2.3.2-py3-none-any.whl (313 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m184.3/313.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.9/313.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.3.2


- `AutoML` 클래스의 `fit` 메서드 주요 파라미터
    - metric
        - 평가지표
        - ex) 'roc_auc'
    - task
        - 작업 유형
        - ex) 'classification'
    - estimator_list
        - FLAML에서 제공해주는 모델들의 별칭을 리스트에 넣어주면 된다.
        - 생략시 자동으로 모델들이 선택된다.
        - ex) ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
    - time_budget
        - 수행 시간
            - 초단위
    - ensemble
        - 튜닝 후 스태킹 앙상블 여부
        - False(기본값) : 최상의 모델을 선택해서 학습
        - True: 스태킹하여 앙상블
        - `dict` 예시
            - {'final_estimator' : 사이킷런 모델 객체 }
    - n_splits
        - 교차검증 폴드 수(기본 5)
    - seed
        - 시드값
    - early_stop
        - True or False(기본값)
        - 튜닝이 수렴할 경우 조기 중지여부

In [None]:
# Create the FLAML AutoML object used for the first search below.
from flaml import AutoML
automl = AutoML()

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Settings for the first FLAML search; every entry is forwarded to AutoML.fit.
# "ensemble" is intentionally not set here, so the single best model is kept.
params = dict(
    metric='macro_f1',            # score being optimised
    task='classification',
    # candidate learners FLAML may tune
    estimator_list=['histgb', 'catboost', 'lgbm', 'rf', 'xgboost', 'xgb_limitdepth', 'lrl1'],
    time_budget=60 * 60,          # seconds, i.e. one hour
    seed=SEED,
    early_stop=True,              # stop early once the tuning has converged
)

automl.fit(train_ft, target, **params)


[flaml.automl.logger: 11-14 04:01:17] {1728} INFO - task = classification
[flaml.automl.logger: 11-14 04:01:17] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 11-14 04:01:18] {1838} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl.logger: 11-14 04:01:18] {1955} INFO - List of ML learners in AutoML Run: ['histgb', 'catboost', 'lgbm', 'rf', 'xgboost', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 11-14 04:01:18] {2258} INFO - iteration 0, current learner histgb
[flaml.automl.logger: 11-14 04:01:28] {2393} INFO - Estimated sufficient time budget=99544s. Estimated necessary time budget=2435s.
[flaml.automl.logger: 11-14 04:01:28] {2442} INFO -  at 17.2s,	estimator histgb's best error=0.6223,	best estimator histgb's best error=0.6223
[flaml.automl.logger: 11-14 04:01:28] {2258} INFO - iteration 1, current learner histgb
[flaml.automl.logger: 11-14 04:01:39] {2442} INFO -  at 28.9s,	estimator histgb's best error=0.6223,	best estimator histgb's best error=0.6223
[flaml.au

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 11-14 05:02:16] {2442} INFO -  at 3665.2s,	estimator lrl1's best error=0.3040,	best estimator xgboost's best error=0.2797
[flaml.automl.logger: 11-14 05:05:49] {2685} INFO - retrain xgboost for 213.1s
[flaml.automl.logger: 11-14 05:05:49] {2688} INFO - retrained model: XGBClassifier(base_score=None, booster=None, callbacks=[],
              colsample_bylevel=0.8734795776739355, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy='lossguide', importance_type=None,
              interaction_constraints=None, learning_rate=0.01581486020173773,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=0, max_leaves=27,
              min_child_weight=1.6569101873119776, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_

- 선택된 모델 객체 확인

In [None]:
# Show the underlying estimator object of the model FLAML selected.
automl.model.estimator

- 튜닝된 하이퍼파라미터

In [None]:
# Show the tuned hyperparameters of the selected model.
automl.best_config

{'n_estimators': 2303,
 'max_leaves': 27,
 'min_child_weight': 1.6569101873119776,
 'learning_rate': 0.01581486020173773,
 'subsample': 0.9299273637168837,
 'colsample_bylevel': 0.8734795776739355,
 'colsample_bytree': 1.0,
 'reg_alpha': 0.0069961689373672555,
 'reg_lambda': 2.794303559944623}

- 선택된 모델의 cv 점수 확인하기

In [None]:
# FLAML minimises the error "1 - macro_f1" (see the fit log), so subtracting
# the best loss from 1 gives back the cross-validated macro-F1 score.
1 - automl.best_loss

0.7203230900884477

- 두 번째 AutoML 실행 (`automl2`)

In [None]:
from flaml import AutoML

# Second AutoML run on a narrower model list.
#
# The previous version built `sklearn.pipeline.Pipeline(task=..., score_metric=...)`,
# which raises TypeError because scikit-learn's Pipeline takes none of those
# keyword arguments, and then fitted `automl` instead of `automl2`.  The cells
# that follow read `automl2.model.estimator`, `automl2.best_config` and
# `automl2.best_loss`, i.e. the FLAML API, so the run is expressed with FLAML:
#   score_metric='f1_macro'   -> metric='macro_f1'
#   model_list                -> estimator_list (FLAML aliases)
#   max_tuning_trials=100     -> max_iter=100
# NOTE(review): n_algos_tuned, min_features, preprocessing, optimization,
# adaptive_sampling and threshold_tuning are not carried over; confirm whether
# an equivalent is needed.
automl2 = AutoML()
params2 = {
    "metric": 'macro_f1',
    "task": 'classification',
    "estimator_list": ['lrl1', 'rf', 'xgboost', 'catboost'],
    "time_budget": 60*60,  # seconds; upper bound in addition to max_iter
    "max_iter": 100,
    "seed": SEED,
    "early_stop": True,
}

automl2.fit(train_ft, target, **params2)

TypeError: Pipeline.__init__() got an unexpected keyword argument 'task'

In [None]:
# Show the underlying estimator object selected by the second run.
automl2.model.estimator

In [None]:
# Show the tuned hyperparameters found by the second run.
automl2.best_config

In [None]:
# Convert the second run's best loss back into a score (1 - loss).
1 - automl2.best_loss

- 예측 가능

In [None]:
# Predicted probabilities for the test features; [:, 1] keeps the second
# column of predict_proba (the 1.0 class, given the 0.0/1.0 labels in target).
pred = automl.predict_proba(test_ft)[:,1]
pred

array([0.03589807, 0.3939578 , 0.3975633 , ..., 0.0210551 , 0.13384268,
       0.00664407], dtype=float32)

In [None]:
# Hard class predictions for the test features.
# Note: this rebinds `pred`, which held probabilities in the previous cell.
pred = automl.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Probabilities written into the submission below: second predict_proba column only.
# NOTE(review): the saved output under this cell is a two-column array, so it
# appears to come from an earlier version of this line; re-run to refresh it.
pred_proba = automl.predict_proba(test_ft)[:,1]
pred_proba

array([[0.9641019 , 0.03589807],
       [0.6060422 , 0.3939578 ],
       [0.60243666, 0.3975633 ],
       ...,
       [0.9789449 , 0.0210551 ],
       [0.8661573 , 0.13384268],
       [0.99335593, 0.00664407]], dtype=float32)

# 정답 데이터

In [None]:
# Re-select the label column from the answer table and display it.
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [None]:
# Display the submission template before it is filled in.
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [None]:
# Overwrite the template's target column with the predicted probabilities
# (modifies `submit` in place), then display the result.
submit["target"] = pred_proba
submit

Unnamed: 0,ID,target
0,test_0,0.964102
1,test_1,0.606042
2,test_2,0.602437
3,test_3,0.211876
4,test_4,0.348256
...,...,...
12220,test_12220,0.282909
12221,test_12221,0.789004
12222,test_12222,0.978945
12223,test_12223,0.866157


# 업로드

In [None]:
# Write the filled-in submission to the data directory, without the index column.
submit.to_csv(f"{DATA_PATH}submit_새출발_automl_proba_군집분석.csv",index=False)

In [None]:
# Second-column probabilities from the second AutoML run; rebinds `pred`.
pred = automl2.predict_proba(test_ft)[:,1]
pred

- 앙상블 해보기

In [None]:
# Stacking-ensemble run: same search as before but with "ensemble" switched on.
# NOTE(review): the saved log under this cell reports "1-roc_auc", so it appears
# to come from an earlier run with a different metric than the one set here.
auto_ml_ens = AutoML()
params = dict(
    metric='macro_f1',
    task='classification',
    time_budget=60 * 60,  # seconds
    seed=SEED,
    early_stop=True,
    # Original author's note: the meta-model is logistic regression, and the
    # stack is built from the learners in the run's "List of ML learners".
    ensemble=True,
)

auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 03:34:22] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 03:34:22] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 03:34:22] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 03:34:22] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 03:34:22] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 03:34:22] {2393} INFO - Estimated sufficient time budget=4626s. Estimated necessary time budget=107s.
[flaml.automl.logger: 10-30 03:34:22] {2442} INFO -  at 0.5s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 03:34:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 03:34:22] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 03:34:23] {2442} INFO -  at 1.5s,	estimator sgd's best error=0.1152,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 03:34:23] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 10-30 03:34:23] {2442} INFO -  at 1.7s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:23] {2258} INFO - iteration 5, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.0s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:24] {2258} INFO - iteration 6, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.1s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:24] {2258} INFO - iteration 7, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.3s,	estimator lgbm's best error=0.0959,	best es

In [None]:
# Author's note: because this is an ensemble, the CV score cannot be checked - stacking ensemble only.

In [None]:
# Second-column probabilities from the stacking ensemble; the cell shows only the shape.
# NOTE(review): the saved output (393,) does not match test_ft's 12225 rows,
# so it appears to be left over from a run on different data.
pred_ens = auto_ml_ens.predict_proba(test_ft)[:,1]
pred_ens.shape

(393,)

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Stacking-ensemble run with a custom meta-model: passing a dict with
# 'final_estimator' replaces the default final estimator with LightGBM.
auto_ml_ens = AutoML()
params = dict(
    metric='roc_auc',
    task='classification',
    time_budget=60 * 3,  # seconds
    seed=SEED,
    early_stop=True,
    ensemble={'final_estimator': LGBMClassifier(random_state=SEED)},
)

auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 03:41:41] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 03:41:41] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 03:41:41] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 03:41:41] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 03:41:41] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 03:41:42] {2393} INFO - Estimated sufficient time budget=7302s. Estimated necessary time budget=169s.
[flaml.automl.logger: 10-30 03:41:42] {2442} INFO -  at 0.8s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 03:41:42] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 03:41:42] {2442} INFO -  at 0.9s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 03:41:43] {2442} INFO -  at 2.4s,	estimator sgd's best error=0.1151,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:43] {2258} INFO - iteration 9, current learner sgd
[flaml.automl.logger: 10-30 03:41:43] {2442} INFO -  at 2.7s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:43] {2258} INFO - iteration 10, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 2.9s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:44] {2258} INFO - iteration 11, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 3.1s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:44] {2258} INFO - iteration 12, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 3.5s,	estimator sgd's best error=0.1136,	best estimat

In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Stacking-ensemble run with an explicit learner list that includes CatBoost.
# Requires SEED, train_ft and target from the earlier cells to be defined in
# the running kernel.
auto_ml_ens = AutoML()
params = dict(
    metric='roc_auc',
    task='classification',
    time_budget=60 * 3,  # seconds
    seed=SEED,
    early_stop=True,
    ensemble=True,
    estimator_list=['catboost', 'lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1'],
)

auto_ml_ens.fit(train_ft, target, **params)

NameError: name 'SEED' is not defined