In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 데이터 경로 변수

In [7]:
DATA_PATH = "/content/drive/MyDrive/the_datas/data/"
DATA_PATH

'/content/drive/MyDrive/the_datas/data/'

- 시드값

In [8]:

SEED = 42

- 데이터 불러오기

In [9]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [10]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_1.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_1.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 1142), (12225, 1142))

# 결측치 처리

In [11]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0
일별구매횟수표준편차,664
3회이상count,4214
총_생식품_구매가격,9711
평균_생식품_구매가격,1574
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265
2004-05월_count_diff,14940
월별_구매총액_변화_비율2_1,4251
월별_구매총액_변화_비율3_2,4209


In [12]:

mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0
일별구매횟수표준편차,443
3회이상count,3284
총_생식품_구매가격,7820
평균_생식품_구매가격,1257
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954
2004-05월_count_diff,12225
월별_구매총액_변화_비율2_1,3335
월별_구매총액_변화_비율3_2,3269


In [13]:
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
train_ft["구매금액왜도"] = train_ft["구매금액왜도"].fillna(0)
train_ft["구매금액첨도"] = train_ft["구매금액첨도"].fillna(0)

test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액왜도"] = test_ft["구매금액왜도"].fillna(0)
test_ft["구매금액첨도"] = test_ft["구매금액첨도"].fillna(0)

In [14]:
train_ft["총_생식품_구매가격"] = train_ft["총_생식품_구매가격"].fillna(0)
train_ft["평균_생식품_구매가격"] = train_ft["평균_생식품_구매가격"].fillna(0)
train_ft["3회이상count"] = train_ft["3회이상count"].fillna(0)


test_ft["총_생식품_구매가격"] = test_ft["총_생식품_구매가격"].fillna(0)
test_ft["평균_생식품_구매가격"] = test_ft["평균_생식품_구매가격"].fillna(0)
test_ft["3회이상count"] = test_ft["3회이상count"].fillna(0)


In [15]:
train_ft["일별구매횟수표준편차"] = train_ft["일별구매횟수표준편차"].fillna(0)
test_ft["일별구매횟수표준편차"] = test_ft["일별구매횟수표준편차"].fillna(0)

In [16]:
train_ft["2004-05월_count_diff"] = train_ft["2004-05월_count_diff"].fillna(0)
test_ft["2004-05월_count_diff"] = test_ft["2004-05월_count_diff"].fillna(0)

In [17]:
train_ft["월별_구매총액_변화비율_1_12"] = train_ft["월별_구매총액_변화비율_1_12"].fillna(0)
test_ft["월별_구매총액_변화비율_1_12"] = test_ft["월별_구매총액_변화비율_1_12"].fillna(0)
for i in range(1,12):
    train_ft[f"월별_구매총액_변화_비율{i+1}_{i}"] = train_ft[f"월별_구매총액_변화_비율{i+1}_{i}"].fillna(0)
    test_ft[f"월별_구매총액_변화_비율{i+1}_{i}"] = test_ft[f"월별_구매총액_변화_비율{i+1}_{i}"].fillna(0)

In [18]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [19]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 1141), (12225, 1141))

- 추가 피처 만들어 보기

In [20]:
cols = [ col for col in train_ft.columns if col.startswith("pivot_cnt_") ]

In [21]:
train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1144), (12225, 1144))

In [22]:
cols = [ col for col in train_ft.columns if col.startswith("수정_중_pivot_cnt_") ]

In [23]:
train_ft["수정_중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["수정_중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["수정_중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["수정_중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["수정_중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["수정_중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1147), (12225, 1147))

In [24]:
cols_대 = [ col for col in train_ft.columns if col.startswith("대_pivot_cnt") ]

In [25]:
train_ft["대분류별_구매횟수_std"] = train_ft[cols_대].std(axis=1)
train_ft["대분류별_구매횟수_skew"] = train_ft[cols_대].skew(axis=1)
train_ft["대분류별_구매횟수_kurt"] = train_ft[cols_대].kurt(axis=1)

test_ft["대분류별_구매횟수_std"] = test_ft[cols_대].std(axis=1)
test_ft["대분류별_구매횟수_skew"] = test_ft[cols_대].skew(axis=1)
test_ft["대분류별_구매횟수_kurt"] = test_ft[cols_대].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1150), (12225, 1150))

## Feature Encoding

In [26]:
train_ft = train_ft.drop(columns = ["하루 구매 시간 간격"])
test_ft = test_ft.drop(columns = ["하루 구매 시간 간격"])
train_ft.shape, test_ft.shape

((14940, 1149), (12225, 1149))

In [27]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류,28
주구매_수정_중분류,211
최대구매액_대분류,28
최소구매액_대분류,28


In [28]:
%pip install category_encoders



In [29]:
import category_encoders as ce

In [30]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","주구매_대분류", "최소구매액_대분류", "최대구매액_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","주구매_대분류", "최소구매액_대분류", "최대구매액_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 1237), (12225, 1237))

In [31]:
enc = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])

train_ft.shape, test_ft.shape

((14940, 1238), (12225, 1238))

In [32]:
enc = ce.count.CountEncoder()
train_ft["주구매_수정_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_수정_중분류"]])
test_ft["주구매_수정_중분류_cnt"] = enc.transform(test_ft[["주구매_수정_중분류"]])

train_ft.shape, test_ft.shape

((14940, 1239), (12225, 1239))

- 문자열 피처 삭제

In [33]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류', '주구매_수정_중분류', '최대구매액_대분류', '최소구매액_대분류']

In [34]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 1233), (12225, 1233))

In [35]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Inf 값 처리

In [36]:
mask = np.isinf(train_ft).sum() > 0
np.isinf(train_ft).sum()[mask]

Unnamed: 0,0
월별_구매총액_변화_비율2_1,2114
월별_구매총액_변화_비율3_2,2753
월별_구매총액_변화_비율4_3,2737
월별_구매총액_변화_비율5_4,2650
월별_구매총액_변화_비율6_5,2141
월별_구매총액_변화_비율7_6,2392
월별_구매총액_변화_비율8_7,1803
월별_구매총액_변화_비율9_8,2760
월별_구매총액_변화_비율10_9,2548
월별_구매총액_변화_비율11_10,2015


In [37]:
mask = np.isinf(test_ft).sum() > 0
np.isinf(test_ft).sum()[mask]

Unnamed: 0,0
월별_구매총액_변화_비율2_1,1811
월별_구매총액_변화_비율3_2,2226
월별_구매총액_변화_비율4_3,2230
월별_구매총액_변화_비율5_4,2116
월별_구매총액_변화_비율6_5,1805
월별_구매총액_변화_비율7_6,1903
월별_구매총액_변화_비율8_7,1388
월별_구매총액_변화_비율9_8,2200
월별_구매총액_변화_비율10_9,2125
월별_구매총액_변화_비율11_10,1677


In [38]:
cols = np.isinf(train_ft).sum()[mask].index
cols

Index(['월별_구매총액_변화_비율2_1', '월별_구매총액_변화_비율3_2', '월별_구매총액_변화_비율4_3',
       '월별_구매총액_변화_비율5_4', '월별_구매총액_변화_비율6_5', '월별_구매총액_변화_비율7_6',
       '월별_구매총액_변화_비율8_7', '월별_구매총액_변화_비율9_8', '월별_구매총액_변화_비율10_9',
       '월별_구매총액_변화_비율11_10', '월별_구매총액_변화_비율12_11', '월별_구매총액_변화비율_1_12'],
      dtype='object')

In [39]:
train_inf = train_ft[cols].copy()
train_inf[np.isinf(train_inf)] = np.nan  # 무한대를 NaN으로 대체
train_inf = np.nan_to_num(train_inf, nan=np.nanmax(train_inf[~np.isinf(train_inf)]))
np.isinf(train_inf).sum().sum()

0

In [40]:
test_inf = test_ft[cols].copy()
test_inf[np.isinf(test_inf)] = np.nan  # 무한대를 NaN으로 대체
test_inf = np.nan_to_num(test_inf, nan=np.nanmax(test_inf[~np.isinf(test_inf)]))
np.isinf(test_inf).sum().sum()

0

In [41]:
train_ft[cols] = train_inf
test_ft[cols] = test_inf


In [42]:
np.isinf(train_ft).sum().sum(), np.isinf(test_ft).sum().sum()

(0, 0)

## nan 값 처리

In [43]:
mask = np.isnan(train_ft).sum() > 0
np.isnan(train_ft).sum()[mask]

Unnamed: 0,0


In [44]:
mask = np.isnan(test_ft).sum() > 0
np.isnan(test_ft).sum()[mask]

Unnamed: 0,0


In [45]:
np.isnan(train_ft).sum().sum(), np.isnan(test_ft).sum().sum()

(0, 0)

## Feature Scaling

In [46]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

In [47]:
# train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
# test_ft[test_ft.columns] = scaler.transform(test_ft)
# train_ft.head()

# 정답 데이터

In [48]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


## 피처셀렉션

### SequentialFeatureSelector

In [77]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
sfs = SequentialFeatureSelector(model, n_features_to_select=5, direction='forward')
sfs.fit(train_ft, target)

selected_features = train_ft.columns[sfs.get_support()]

KeyboardInterrupt: 

In [None]:
selected_features

### boruta

In [78]:
pip install boruta

Collecting boruta
  Downloading Boruta-0.4.3-py3-none-any.whl.metadata (8.8 kB)
Downloading Boruta-0.4.3-py3-none-any.whl (57 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: boruta
Successfully installed boruta-0.4.3


In [80]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
boruta_selector = BorutaPy(model, n_estimators='auto', random_state=SEED)
boruta_selector.fit(train_ft.values, target.values)

selected_features = train_ft.columns[boruta_selector.support_]


KeyboardInterrupt: 

# AutoML

In [49]:
pip install flaml



In [64]:
from flaml import AutoML
automl = AutoML()

In [65]:
params = {
    "metric": 'macro_f1',
    "task": 'classification',
    "time_budget": 60*5,
    "seed" : SEED,
    "early_stop": True
}
automl.fit(train_ft, target, **params)

[flaml.automl.logger: 11-06 07:29:09] {1728} INFO - task = classification
[flaml.automl.logger: 11-06 07:29:09] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 11-06 07:29:10] {1838} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl.logger: 11-06 07:29:10] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 11-06 07:29:10] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 11-06 07:29:14] {2393} INFO - Estimated sufficient time budget=35088s. Estimated necessary time budget=812s.
[flaml.automl.logger: 11-06 07:29:14] {2442} INFO -  at 9.7s,	estimator lgbm's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:29:14] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 11-06 07:29:21] {2442} INFO -  at 17.2s,	estimator lgbm's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logge

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 11-06 07:29:23] {2442} INFO -  at 19.6s,	estimator sgd's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:29:23] {2258} INFO - iteration 3, current learner sgd
[flaml.automl.logger: 11-06 07:29:26] {2442} INFO -  at 22.4s,	estimator sgd's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:29:26] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 11-06 07:29:31] {2442} INFO -  at 26.8s,	estimator lgbm's best error=0.3851,	best estimator lgbm's best error=0.3851
[flaml.automl.logger: 11-06 07:29:31] {2258} INFO - iteration 5, current learner sgd
[flaml.automl.logger: 11-06 07:29:45] {2442} INFO -  at 41.6s,	estimator sgd's best error=0.4455,	best estimator lgbm's best error=0.3851
[flaml.automl.logger: 11-06 07:29:45] {2258} INFO - iteration 6, current learner xgboost
[flaml.automl.logger: 11-06 07:29:50] {2442} INFO -  at 46.0s,	estimator xgboost's best error=0.6223,	

In [66]:
automl.model.estimator

In [67]:
automl.best_config

{'n_estimators': 962,
 'num_leaves': 4,
 'min_child_samples': 25,
 'learning_rate': 0.06389203377503197,
 'log_max_bin': 6,
 'colsample_bytree': 0.934927351329294,
 'reg_alpha': 0.04677254162372406,
 'reg_lambda': 0.6051168062227635}

In [71]:
1 - automl.best_loss

0.7096901260504201

In [72]:
pred = automl.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [73]:
pred.sum()

3358.0

In [75]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


In [74]:
pred_proba = automl.predict_proba(test_ft)[:,1]
pred_proba

array([0.06328928, 0.30753968, 0.35871403, ..., 0.04259616, 0.12149773,
       0.02169182])

In [76]:
submit.to_csv(f"{DATA_PATH}submit_automl_1.csv",index=False)

앙상블 해보기

In [59]:
auto_ml_ens = AutoML()
params = {
    "metric": 'macro_f1',
    "task": 'classification',
    "time_budget": 60*3,
    "seed" : SEED,
    "early_stop": True,
    "ensemble" : True# 메타모델이 로지스틱회귀! list of ml learners에 있는 애들 중에 진행해서 ㄱㄱ
}

auto_ml_ens.fit(train_ft, target, **params)

[flaml.automl.logger: 11-06 07:16:00] {1728} INFO - task = classification
[flaml.automl.logger: 11-06 07:16:00] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 11-06 07:16:00] {1838} INFO - Minimizing error metric: 1-macro_f1
[flaml.automl.logger: 11-06 07:16:00] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 11-06 07:16:00] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 11-06 07:16:02] {2393} INFO - Estimated sufficient time budget=21001s. Estimated necessary time budget=486s.
[flaml.automl.logger: 11-06 07:16:02] {2442} INFO -  at 9.0s,	estimator lgbm's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:16:02] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 11-06 07:16:05] {2442} INFO -  at 11.3s,	estimator lgbm's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logge

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 11-06 07:16:06] {2442} INFO -  at 12.9s,	estimator sgd's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:16:06] {2258} INFO - iteration 3, current learner sgd
[flaml.automl.logger: 11-06 07:16:09] {2442} INFO -  at 15.2s,	estimator sgd's best error=0.6223,	best estimator lgbm's best error=0.6223
[flaml.automl.logger: 11-06 07:16:09] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 11-06 07:16:13] {2442} INFO -  at 19.7s,	estimator lgbm's best error=0.3851,	best estimator lgbm's best error=0.3851
[flaml.automl.logger: 11-06 07:16:13] {2258} INFO - iteration 5, current learner sgd
[flaml.automl.logger: 11-06 07:16:23] {2442} INFO -  at 29.5s,	estimator sgd's best error=0.4457,	best estimator lgbm's best error=0.3851
[flaml.automl.logger: 11-06 07:16:23] {2258} INFO - iteration 6, current learner xgboost
[flaml.automl.logger: 11-06 07:16:27] {2442} INFO -  at 34.0s,	estimator xgboost's best error=0.6223,	

In [63]:
pred_ens = auto_ml_ens.predict(test_ft)
pred_ens.shape

(12225,)

In [60]:
pred_ens = auto_ml_ens.predict_proba(test_ft)[:,1]
pred_ens

array([2.90983893e-01, 3.73895517e-01, 2.71329513e-01, ...,
       1.66294920e-10, 6.48640180e-12, 1.44788827e-08])

# cv 점수 확인해보기

In [97]:
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold


cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [98]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

model = LinearRegression()
scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1, error_score='raise')
np.mean(scores)

ValueError: Classification metrics can't handle a mix of binary and continuous targets

# 모델 학습

In [None]:
model = LGBMClassifier(random_state=SEED)
model.fit(train_ft,target)

# 테스트 데이터 예측

In [None]:
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# pred = model.predict_proba(test_ft)[:,1]
# pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [None]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [None]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
submit.to_csv(f"{DATA_PATH}submit_5.csv",index=False)