In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

DATA_PATH = "/content/drive/MyDrive/the_datas/data/"

SEED = 42 # 시드값

# 데이터 블러오기
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

#공통 피처 파일 불러오기
train_ft = pd.read_csv(f"{DATA_PATH}train_common_4.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_4.csv") # 테스트 데이터(피처)

train_tr.shape , train_target.shape , test_tr.shape , submit.shape, train_ft.shape , test_ft.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2), (14940, 450), (12225, 450))

In [4]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# 결측치 처리

In [5]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0
3회이상count,4214
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265


In [6]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0
3회이상count,3284
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954


In [7]:
train_ft["3회이상count"] = train_ft["3회이상count"].fillna(0)
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
train_ft["구매금액왜도"] = train_ft["구매금액왜도"].fillna(0)
train_ft["구매금액첨도"] = train_ft["구매금액첨도"].fillna(0)

test_ft["3회이상count"] = test_ft["3회이상count"].fillna(0)
test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액왜도"] = test_ft["구매금액왜도"].fillna(0)
test_ft["구매금액첨도"] = test_ft["구매금액첨도"].fillna(0)

In [8]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [9]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 449), (12225, 449))

- 추가 피처 만들어 보기

In [10]:
cols = [ col for col in train_ft.columns if col.startswith("중_pivot_횟수비율_") ]

In [11]:
# train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
# train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
# train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

# test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
# test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
# test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

# train_ft.shape, test_ft.shape

In [12]:
# 최소 구매금액범주화
# 중분류 개수

## Feature Encoding

In [13]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
최대구매액_대분류,28
최소구매액_대분류,28


In [14]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [15]:
import category_encoders as ce

In [16]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 453), (12225, 453))

In [17]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["최소구매액_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["최소구매액_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 481), (12225, 481))

In [18]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["최대구매액_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["최대구매액_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 509), (12225, 509))

In [19]:
# enc = ce.count.CountEncoder()
# train_ft["최소구매액_대분류_cnt"] = enc.fit_transform(train_ft[["최소구매액_대분류"]])
# test_ft["최소구매액_대분류_cnt"] = enc.transform(test_ft[["최소구매액_대분류"]])

# train_ft.shape, test_ft.shape

In [20]:
# enc = ce.count.CountEncoder()
# train_ft["최대구매액_대분류_cnt"] = enc.fit_transform(train_ft[["최대구매액_대분류"]])
# test_ft["최대구매액_대분류_cnt"] = enc.transform(test_ft[["최대구매액_대분류"]])

# train_ft.shape, test_ft.shape

- 문자열 피처 삭제

In [21]:
cols

['주구매지점', '최대구매액_대분류', '최소구매액_대분류']

In [22]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 506), (12225, 506))

In [23]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [25]:
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,구매횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,12시이전구매비율,12시이후_18시이전구매비율,...,최대구매액_대분류_19,최대구매액_대분류_20,최대구매액_대분류_21,최대구매액_대분류_22,최대구매액_대분류_23,최대구매액_대분류_24,최대구매액_대분류_25,최대구매액_대분류_26,최대구매액_대분류_27,최대구매액_대분류_28
0,-0.437126,-0.369867,0.002987,-1.029777,0.001191,0.838272,0.338186,0.109631,0.718557,-0.65415,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
1,0.239394,0.14411,-0.356452,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.269465,-0.542415,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
2,1.890101,1.943028,-0.869935,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,-0.20983,0.532172,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
3,4.29851,3.793345,-1.02398,0.420933,-0.327474,0.008592,-0.135636,0.109631,-0.009001,0.054536,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497
4,0.618244,0.452496,-0.613193,-0.752532,1.70741,-0.130285,-0.821561,0.693965,-0.346226,0.591326,...,-0.193649,-0.131248,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497


# 정답 데이터

In [26]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# importance 0 제거

In [27]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_ft, target, test_size=0.2, random_state=42)

model = LGBMClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

feature_importances = model.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': train_ft.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

feature_importance_df.head(50)
feature_importance_df[feature_importance_df["Importance"]==0]
rows_to_drop = feature_importance_df[feature_importance_df["Importance"]==0].index
feature_importance_df.drop(rows_to_drop, inplace=True)
cols = feature_importance_df['Feature'].tolist()
train_ft = train_ft[cols]
test_ft = test_ft[cols]
train_ft.shape, test_ft.shape

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Number of positive: 4687, number of negative: 7265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.184302 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53879
[LightGBM] [Info] Number of data points in the train set: 11952, number of used features: 481
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.392152 -> initscore=-0.438276
[LightGBM] [Info] Start training from score -0.438276


((14940, 337), (12225, 337))

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [30]:
params = {'n_estimators': 761,
 'max_leaves': 4,
 'min_child_weight': 0.31836025007133445,
 'learning_rate': 0.10446225029384025,
 'subsample': 0.68620375768124,
 'colsample_bylevel': 0.8637839877103641,
 'colsample_bytree': 0.8431416115296962,
 'reg_alpha': 0.0009765625,
 'reg_lambda': 41.013742162540886}
model = XGBClassifier(**params)

scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.7194480633235241

In [31]:
model.fit(train_ft,target)

In [32]:
pred = model.predict_proba(test_ft)[:,1]
pred

array([0.04231999, 0.3825432 , 0.2719834 , ..., 0.02809458, 0.08336712,
       0.02385197], dtype=float32)

In [33]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [34]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.042320
1,test_1,0.382543
2,test_2,0.271983
3,test_3,0.876889
4,test_4,0.545982
...,...,...
12220,test_12220,0.761663
12221,test_12221,0.292413
12222,test_12222,0.028095
12223,test_12223,0.083367


#업로드

In [35]:
submit

Unnamed: 0,ID,target
0,test_0,0.042320
1,test_1,0.382543
2,test_2,0.271983
3,test_3,0.876889
4,test_4,0.545982
...,...,...
12220,test_12220,0.761663
12221,test_12221,0.292413
12222,test_12222,0.028095
12223,test_12223,0.083367


In [36]:
submit.to_csv(f"{DATA_PATH}submit_새출발_automl_proba_5.csv",index=False)

# AutoML(Automated machine learning)
- 시간 소모적이고 반복적인 기계 학습 모델 개발 작업을 자동화하는 프로세스
- 데이터 과학자, 분석가 및 개발자는 모델 품질을 유지하면서 확장성, 효율성 및 생산성이 높은 ML 모델을 빌드할 수 있다.

## FLAML(A Fast Library for Automated Machine Learning & Tuning)
- 머신러닝 학습 및 하이퍼파라미터 튜닝을 자동화해 주는 라이브러리
- https://microsoft.github.io/FLAML/docs/Use-Cases/Task-Oriented-AutoML
- flaml 설치하기
    ```bash
    pip install flaml
    ```

In [35]:
pip install flaml

Collecting flaml
  Downloading FLAML-2.3.2-py3-none-any.whl.metadata (16 kB)
Downloading FLAML-2.3.2-py3-none-any.whl (313 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/313.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.9/313.9 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.3.2


- `AutoML` 클래스의 `fit` 메서드 주요 파라미터
    - metric
        - 평가지표
        - ex) 'roc_auc'
    - task
        - 작업 유형
        - ex) 'classification'
    - estimator_list
        - FLAML에서 제공해주는 모델들의 별칭을 리스트에 넣어주면 된다.
        - 생략시 자동으로 모델들이 선택된다.
        - ex) ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
    - time_budget
        - 수행 시간
            - 초단위
    - ensemble
        - 튜닝 후 스태킹 앙상블 여부
        - False(기본값) : 최상의 모델을 선택해서 학습
        - True: 스태킹하여 앙상블
        - `dict` 예시
            - {'final_estimator' : 사이킷런 모델 객체 }
    - n_splits
        - 교차검증 폴드 수(기본 5)
    - seed
        - 시드값
    - early_stop
        - True or False(기본값)
        - 튜닝이 수렴할 경우 조기 중지여부

In [36]:
from flaml import AutoML
automl = AutoML()

In [37]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [38]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [39]:
params = {
    "metric": 'macro_f1',
    "task": 'classification',
    #"estimator_list" : ['xgboost', 'xgb_limitdepth', 'rf', 'lgbm', 'lgbm_spark', 'rf_spark', 'lrl1', 'lrl2', 'catboost', 'extra_tree', 'kneighbor', 'transformer', 'transformer_ms', 'histgb', 'svc', 'sgd', 'nb_spark', 'enet', 'lassolars', 'glr_spark', 'lr_spark', 'svc_spark', 'gbt_spark', 'aft_spark'],
    "estimator_list" : ['histgb','catboost', 'lgbm', 'rf', 'xgboost'],
    "time_budget": 60*60,
    "seed" : SEED,
    "early_stop": True,
    # "ensemble" : True
}
automl.fit(train_ft, target, **params)


[flaml.automl.logger: 11-14 06:21:19] {1728} INFO - task = classification
[flaml.automl.logger: 11-14 06:21:19] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 11-14 06:21:19] {1838} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl.logger: 11-14 06:21:19] {1955} INFO - List of ML learners in AutoML Run: ['histgb', 'catboost', 'lgbm', 'rf', 'xgboost']
[flaml.automl.logger: 11-14 06:21:19] {2258} INFO - iteration 0, current learner histgb
[flaml.automl.logger: 11-14 06:21:29] {2393} INFO - Estimated sufficient time budget=99270s. Estimated necessary time budget=204s.
[flaml.automl.logger: 11-14 06:21:29] {2442} INFO -  at 11.7s,	estimator histgb's best error=0.6223,	best estimator histgb's best error=0.6223
[flaml.automl.logger: 11-14 06:21:29] {2258} INFO - iteration 1, current learner histgb
[flaml.automl.logger: 11-14 06:21:37] {2442} INFO -  at 20.1s,	estimator histgb's best error=0.6223,	best estimator histgb's best error=0.6223
[flaml.automl.logger: 11-14 06:21:37

- 선택된 모델 객체 확인

In [40]:
automl.model.estimator

- 튜닝된 하이퍼파라미터

In [41]:
automl.best_config

{'n_estimators': 761,
 'max_leaves': 4,
 'min_child_weight': 0.31836025007133445,
 'learning_rate': 0.10446225029384025,
 'subsample': 0.68620375768124,
 'colsample_bylevel': 0.8637839877103641,
 'colsample_bytree': 0.8431416115296962,
 'reg_alpha': 0.0009765625,
 'reg_lambda': 41.013742162540886}

- 선택된 모델의 cv 점수 확인하기

In [42]:
1 - automl.best_loss

0.719420540779006

- 두번째

In [None]:
from sklearn.pipeline import Pipeline

automl2 = Pipeline(
    task='classification',
    score_metric='f1_macro',
    n_algos_tuned=3,
    model_list=['LogisticRegression', 'RandomForestClassifier', 'XGBClassifier', 'catboost'],
    min_features=0.5,
    preprocessing=True,
    optimization=3,
    max_tuning_trials=100,
    adaptive_sampling=True,
    threshold_tuning=True
)

automl.fit(train_ft, target)

TypeError: Pipeline.__init__() got an unexpected keyword argument 'task'

In [None]:
automl2.model.estimator

In [None]:
automl2.best_config

In [None]:
1 - automl2.best_loss

- 예측 가능

In [43]:
pred = automl.predict_proba(test_ft)[:,1]
pred

array([0.04131197, 0.40860572, 0.31985277, ..., 0.0285932 , 0.13512567,
       0.03194593], dtype=float32)

In [44]:
pred = automl.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [1]:
pred_proba = automl.predict_proba(test_ft)[:,1]
pred_proba

NameError: name 'automl' is not defined

# 정답 데이터

In [46]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [47]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [50]:
submit["target"] = pred_proba
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


#업로드

In [54]:
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


In [55]:
submit.to_csv(f"{DATA_PATH}submit_새출발_automl_proba_5.csv",index=False)

In [None]:
pred_proba = automl.predict_proba(test_ft)[:,1]
pred_proba

NameError: name 'automl' is not defined

# 정답 데이터

In [None]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


In [None]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [None]:
submit["target"] = pred_proba
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


#업로드

In [None]:
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


In [None]:
submit.to_csv(f"{DATA_PATH}submit_새출발_automl_proba_5.csv",index=False)

In [None]:
pred = automl2.predict_proba(test_ft)[:,1]
pred

- 앙상블 해보기

In [None]:
auto_ml_ens = AutoML()
params = {
    "metric": 'macro_f1',
    "task": 'classification',
    "time_budget": 60*60,
    "seed" : SEED,
    "early_stop": True,
    "ensemble" : True# 메타모델이 로지스틱회귀! list of ml learners에 있는 애들 중에 진행해서 ㄱㄱ
}

auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 03:34:22] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 03:34:22] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 03:34:22] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 03:34:22] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 03:34:22] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 03:34:22] {2393} INFO - Estimated sufficient time budget=4626s. Estimated necessary time budget=107s.
[flaml.automl.logger: 10-30 03:34:22] {2442} INFO -  at 0.5s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 03:34:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 03:34:22] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 03:34:23] {2442} INFO -  at 1.5s,	estimator sgd's best error=0.1152,	best estimator lgbm's best error=0.0964
[flaml.automl.logger: 10-30 03:34:23] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 10-30 03:34:23] {2442} INFO -  at 1.7s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:23] {2258} INFO - iteration 5, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.0s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:24] {2258} INFO - iteration 6, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.1s,	estimator lgbm's best error=0.0959,	best estimator lgbm's best error=0.0959
[flaml.automl.logger: 10-30 03:34:24] {2258} INFO - iteration 7, current learner lgbm
[flaml.automl.logger: 10-30 03:34:24] {2442} INFO -  at 2.3s,	estimator lgbm's best error=0.0959,	best es

In [None]:
#앙상블이어서 cv 점수 확인 불가 - stacking 앙상블만

In [None]:
pred_ens = auto_ml_ens.predict_proba(test_ft)[:,1]
pred_ens.shape

(393,)

In [None]:
from lightgbm import LGBMClassifier

In [None]:
auto_ml_ens=  AutoML()
params = {
    "metric": 'roc_auc',
    "task": 'classification',
    "time_budget": 60*3,
    "seed" : SEED,
    "early_stop": True,
    "ensemble" : {'final_estimator' : LGBMClassifier(random_state=SEED)}, #메타모델 변경시
}

auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 10-30 03:41:41] {1728} INFO - task = classification
[flaml.automl.logger: 10-30 03:41:41] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 10-30 03:41:41] {1838} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl.logger: 10-30 03:41:41] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 10-30 03:41:41] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-30 03:41:42] {2393} INFO - Estimated sufficient time budget=7302s. Estimated necessary time budget=169s.
[flaml.automl.logger: 10-30 03:41:42] {2442} INFO -  at 0.8s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30 03:41:42] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-30 03:41:42] {2442} INFO -  at 0.9s,	estimator lgbm's best error=0.1071,	best estimator lgbm's best error=0.1071
[flaml.automl.logger: 10-30

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 10-30 03:41:43] {2442} INFO -  at 2.4s,	estimator sgd's best error=0.1151,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:43] {2258} INFO - iteration 9, current learner sgd
[flaml.automl.logger: 10-30 03:41:43] {2442} INFO -  at 2.7s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:43] {2258} INFO - iteration 10, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 2.9s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:44] {2258} INFO - iteration 11, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 3.1s,	estimator sgd's best error=0.1136,	best estimator lgbm's best error=0.0943
[flaml.automl.logger: 10-30 03:41:44] {2258} INFO - iteration 12, current learner sgd
[flaml.automl.logger: 10-30 03:41:44] {2442} INFO -  at 3.5s,	estimator sgd's best error=0.1136,	best estimat

In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
auto_ml_ens=  AutoML()
params = {
    "metric": 'roc_auc',
    "task": 'classification',
    "time_budget": 60*3,
    "seed" : SEED,
    "early_stop": True, #메타모델 변경시
    "ensemble" : True,
    "estimator_list" : ['catboost', 'lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
}

auto_ml_ens.fit(train_ft, target, **params)

NameError: name 'SEED' is not defined