In [1308]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 데이터 경로 변수

In [1309]:
DATA_PATH = "/content/drive/MyDrive/data/"
DATA_PATH

'/content/drive/MyDrive/data/'

- 시드값

In [1310]:
SEED = 42

- 데이터 불러오기

In [1311]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [1312]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common_1_완료.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common_1_완료.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 1142), (12225, 1142))

In [1313]:
train_ft.columns

Index(['ID', '구매횟수', '내점일수', '구매주기', '봄_구매비율', '여름_구매비율', '가을_구매비율', '겨울_구매비율',
       '주구매요일', '일별평균구매건수',
       ...
       '최소구매액_중분류수_3_3', '최소구매액_중분류수_3_4', '최소구매액_중분류수_4_1', '최소구매액_중분류수_4_2',
       '최소구매액_중분류수_4_3', '최소구매액_중분류수_4_4', '가족단위_구매비율', '최대구매액_대분류',
       '최소구매액_대분류', '월별_구매총액_변화비율_1_12'],
      dtype='object', length=1142)

# 결측치 처리

In [1314]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0
일별구매횟수표준편차,664
3회이상count,4214
총_생식품_구매가격,9711
평균_생식품_구매가격,1574
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265
2004-05월_count_diff,14940
월별_구매총액_변화_비율2_1,4251
월별_구매총액_변화_비율3_2,4209


In [1315]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0
일별구매횟수표준편차,443
3회이상count,3284
총_생식품_구매가격,7820
평균_생식품_구매가격,1257
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954
2004-05월_count_diff,12225
월별_구매총액_변화_비율2_1,3335
월별_구매총액_변화_비율3_2,3269


## Pandas로 결측치 처리

In [1316]:
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)

In [1317]:
train_ft.shape, test_ft.shape

((14940, 1142), (12225, 1142))

In [1318]:
train_ft['일별구매횟수표준편차'] = train_ft['일별구매횟수표준편차'].fillna(train_ft['일별구매횟수표준편차'].median())
test_ft['일별구매횟수표준편차'] = test_ft['일별구매횟수표준편차'].fillna(train_ft['일별구매횟수표준편차'].median())

In [1319]:
train_ft["3회이상count"] = train_ft["3회이상count"].fillna(0)
test_ft["3회이상count"] = test_ft["3회이상count"].fillna(0)

In [1320]:
train_ft["총_생식품_구매가격"] = train_ft["총_생식품_구매가격"].fillna(0)
test_ft["총_생식품_구매가격"] = test_ft["총_생식품_구매가격"].fillna(0)

In [1321]:
train_ft["평균_생식품_구매가격"] = train_ft["평균_생식품_구매가격"].fillna(0)
test_ft["평균_생식품_구매가격"] = test_ft["평균_생식품_구매가격"].fillna(0)

In [1322]:
train_ft['구매금액왜도'] = train_ft['구매금액왜도'].fillna(train_ft['구매금액왜도'].median())
test_ft['구매금액왜도'] = test_ft['구매금액왜도'].fillna(train_ft['구매금액왜도'].median())

In [1323]:
train_ft['구매금액첨도'] = train_ft['구매금액첨도'].fillna(train_ft['구매금액첨도'].median())
test_ft['구매금액첨도'] = test_ft['구매금액첨도'].fillna(train_ft['구매금액첨도'].median())

In [1324]:
train_ft = train_ft.drop('2004-05월_count_diff', axis=1)

In [1325]:
test_ft = test_ft.drop('2004-05월_count_diff', axis=1)

In [1326]:
cols = ['월별_구매총액_변화_비율2_1', '월별_구매총액_변화_비율3_2', '월별_구매총액_변화_비율4_3', '월별_구매총액_변화_비율5_4',
        '월별_구매총액_변화_비율6_5', '월별_구매총액_변화_비율7_6', '월별_구매총액_변화_비율8_7', '월별_구매총액_변화_비율9_8', '월별_구매총액_변화_비율10_9',
        '월별_구매총액_변화_비율11_10', '월별_구매총액_변화_비율12_11', '월별_구매총액_변화비율_1_12']

train_ft = train_ft.drop(cols, axis=1)


In [1327]:
test_ft = test_ft.drop(cols, axis=1)

In [1328]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

## Imute로 결측치 처리

In [1329]:
# from sklearn.impute import KNNImputer

In [1330]:
# knn = KNNImputer(n_neighbors=10)

In [1331]:
# train_ft[cols] = train_ft[cols].replace([np.inf, -np.inf], np.nan)
# cols = train_ft.select_dtypes(include=['float64', 'int64']).columns

# knn_data = knn.fit_transform(train_ft[cols])
# train_tmp = pd.DataFrame(knn_data, columns=train_ft.columns)
# train_tmp.isnull().sum()

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [1332]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 1128), (12225, 1128))

- 추가 피처 만들어 보기

In [1333]:
cols = [ col for col in train_ft.columns if col.startswith("pivot_cnt_") ]

In [1334]:
train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1131), (12225, 1131))

In [1335]:
cols = [ col for col in train_ft.columns if col.startswith("수정_중_pivot_cnt_") ]

In [1336]:
train_ft["수정_중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["수정_중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["수정_중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["수정_중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["수정_중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["수정_중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1134), (12225, 1134))

In [1337]:
cols_대 = [ col for col in train_ft.columns if col.startswith("대_pivot_cnt") ]

In [1338]:
train_ft["대분류별_구매횟수_std"] = train_ft[cols_대].std(axis=1)
train_ft["대분류별_구매횟수_skew"] = train_ft[cols_대].skew(axis=1)
train_ft["대분류별_구매횟수_kurt"] = train_ft[cols_대].kurt(axis=1)

test_ft["대분류별_구매횟수_std"] = test_ft[cols_대].std(axis=1)
test_ft["대분류별_구매횟수_skew"] = test_ft[cols_대].skew(axis=1)
test_ft["대분류별_구매횟수_kurt"] = test_ft[cols_대].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 1137), (12225, 1137))

## Feature Encoding

In [1339]:
train_ft = train_ft.drop(columns = ["하루 구매 시간 간격"])
test_ft = test_ft.drop(columns = ["하루 구매 시간 간격"])
train_ft.shape, test_ft.shape

((14940, 1136), (12225, 1136))

In [1340]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류,28
주구매_수정_중분류,211
최대구매액_대분류,28
최소구매액_대분류,28


In [1341]:
%pip install category_encoders



In [1342]:
import category_encoders as ce

In [1343]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","주구매_대분류", "최소구매액_대분류", "최대구매액_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","주구매_대분류", "최소구매액_대분류", "최대구매액_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 1224), (12225, 1224))

In [1344]:
enc = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])

train_ft.shape, test_ft.shape

((14940, 1225), (12225, 1225))

In [1345]:
enc = ce.count.CountEncoder()
train_ft["주구매_수정_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_수정_중분류"]])
test_ft["주구매_수정_중분류_cnt"] = enc.transform(test_ft[["주구매_수정_중분류"]])

train_ft.shape, test_ft.shape

((14940, 1226), (12225, 1226))

- 문자열 피처 삭제

In [1346]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류', '주구매_수정_중분류', '최대구매액_대분류', '최소구매액_대분류']

In [1347]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 1220), (12225, 1220))

In [1348]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

### StandardScaler

In [1349]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [1350]:
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,구매횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,최대구매액_대분류_21,최대구매액_대분류_22,최대구매액_대분류_23,최대구매액_대분류_24,최대구매액_대분류_25,최대구매액_대분류_26,최대구매액_대분류_27,최대구매액_대분류_28,주구매_중분류_cnt,주구매_수정_중분류_cnt
0,-0.437126,-0.369867,0.002987,-1.029777,0.001191,0.838272,0.338186,0.109631,-0.057297,0.01762,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.72697,-0.763996
1,0.239394,0.14411,-0.356452,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.222706,1.17601,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.510078,-0.467002
2,1.890101,1.943028,-0.869935,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,0.277707,1.465608,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.848408,-0.888034
3,4.29851,3.793345,-1.02398,0.420933,-0.327474,0.008592,-0.135636,0.109631,0.674668,1.465608,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,1.523851,1.535081
4,0.618244,0.452496,-0.613193,-0.752532,1.70741,-0.130285,-0.821561,0.693965,0.679008,0.886413,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.833157,-0.872893


### MinMaxScaler

In [1351]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

In [1352]:
# train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
# test_ft[test_ft.columns] = scaler.transform(test_ft)
# train_ft.head()

### PowerTransform

In [1353]:
# from sklearn.preprocessing import PowerTransformer
# scaler = PowerTransformer()

In [1354]:
# train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
# test_ft[test_ft.columns] = scaler.transform(test_ft)
# train_ft.head()

### Robust

In [1355]:
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()

In [1356]:
# train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
# test_ft[test_ft.columns] = scaler.transform(test_ft)
# train_ft.head()

# 정답 데이터

In [1357]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# cv 점수 확인해보기

In [1358]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [1359]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.7153748437971978

# 피처 셀렉션

In [1291]:
%pip install lightgbm



## RandomForestClassifier with LogisticRegression

In [1292]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# rf = RandomForestClassifier(random_state=42) # 특성 선택을 하기 위한 모델
# model = LogisticRegression(random_state=42) # 학습용 모델

In [1293]:
# from sklearn.feature_selection import SelectFromModel
# fs = SelectFromModel(rf) # 특성 선택에 사용하기 위한 모델 객체 전달해 줘야 함
# x_train = fs.fit_transform(train_ft, target) # 특성 선택이 완료된 입력 데이터가 ndarray로 반환

# scores = cross_val_score(model, x_train, target, cv=cv, scoring='f1_macro', n_jobs=-1)
# scores.mean()

In [1294]:
# best_cols = fs.get_feature_names_out()
# best_cols

In [1295]:
# train_tmp = train_ft[best_cols]

In [1296]:
# model = LGBMClassifier(random_state=SEED)
# scores = cross_val_score(model,train_tmp,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
# np.mean(scores)

## LogisticRegression with RandomForestClassifier

In [1297]:
# lr = LogisticRegression(random_state=42) # 특성 선택을 하기 위한 모델
# model = RandomForestClassifier(random_state=42) # 학습용 모델

In [1298]:
from sklearn.feature_selection import SelectFromModel
# fs = SelectFromModel(lr) # 특성 선택에 사용하기 위한 모델 객체 전달해 줘야 함
# x_train = fs.fit_transform(train_ft, target) # 특성 선택이 완료된 입력 데이터가 ndarray로 반환

# scores = cross_val_score(model, x_train, target, cv=cv, scoring='f1_macro', n_jobs=-1)
# scores.mean()

In [1299]:
# best_cols = fs.get_feature_names_out()
# best_cols

In [1300]:
# train_tmp = train_ft[best_cols]
# train_tmp.head()

In [1301]:
# model = LGBMClassifier(random_state=SEED)
# scores = cross_val_score(model,train_tmp,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
# np.mean(scores)

## SelectKBest

In [1302]:
from sklearn.feature_selection import SelectKBest, SelectPercentile

In [1303]:
sk = SelectKBest(k=500)
x = sk.fit_transform(train_ft, target)
scores = cross_val_score(model, x, target, cv=cv, scoring="f1_macro", n_jobs=-1)
scores.mean()

0.7088537922931876

In [1304]:
best_cols = sk.get_feature_names_out()
best_cols

array(['구매횟수', '내점일수', '구매주기', '여름_구매비율', '겨울_구매비율', '일별평균구매건수', '거래개월수',
       '12시이전구매비율', '12시이후_18시이전구매비율', '18시이후구매비율', '평일_18시_구매비율',
       '주중방문비율', '주말방문비율', '12시이전구매횟수', '12시이후_18시이전구매횟수', '방문평균거래평균횟수',
       '일별최대구매횟수', '일별평균구매횟수', '일별중앙값구매횟수', '일별구매횟수표준편차', '월요일방문비율',
       '화요일방문비율', '수요일방문비율', '목요일방문비율', '금요일방문비율', '일요일방문비율', '거래주수',
       '거래기간일수', '구매월요일cnt', '구매화요일cnt', '구매수요일cnt', '구매목요일cnt',
       '구매금요일cnt', '구매토요일cnt', '구매일요일cnt', '주구매시간', '어린이날전후구매',
       '크리스마스전후구매', '설날전후구매', '추석전후구매', '9시_구매비율', '10시_구매비율', '11시_구매비율',
       '12시_구매비율', '13시_구매비율', '14시_구매비율', '15시_구매비율', '16시_구매비율',
       '17시_구매비율', '18시_구매비율', '19시_구매비율', '20시_구매비율', '짧은_구매주기',
       '긴_구매주기', '많은_거래개월수', '적은_거래개월수', '구매주기짧고_거래개월많음', '구매주기짧고_거래개월적음',
       '구매주기긴_거래개월적음', '많은_내점일수', '적은_내점일수', '구매주기짧고_내점일수많음',
       '구매주기짧고_내점일수적음', '구매주기긴_내점일수적음', '브랜드코드_평균구매액', '3회이상count',
       '중분류_nunique', '중분류_아동_cnt', '중_주구매비율', '행사키워드_구매횟수', '골프유무',
       '골프count', '유아관련유무', '유아관련cou

In [1305]:
train_tmp = train_ft[best_cols]
train_tmp.head()

Unnamed: 0,구매횟수,내점일수,구매주기,여름_구매비율,겨울_구매비율,일별평균구매건수,거래개월수,12시이전구매비율,12시이후_18시이전구매비율,18시이후구매비율,...,최소구매액_대분류_15,최대구매액_대분류_2,최대구매액_대분류_6,최대구매액_대분류_7,최대구매액_대분류_8,최대구매액_대분류_9,최대구매액_대분류_15,최대구매액_대분류_18,주구매_중분류_cnt,주구매_수정_중분류_cnt
0,-0.437126,-0.369867,0.002987,0.001191,0.338186,-0.057297,0.01762,0.718557,-0.65415,0.071084,...,-0.304802,-0.290815,-0.288099,-0.168611,-0.292437,-0.187233,-0.301907,-0.17963,-0.72697,-0.763996
1,0.239394,0.14411,-0.356452,-0.390607,-0.552996,0.222706,1.17601,0.269465,-0.542415,0.332492,...,-0.304802,3.438609,-0.288099,-0.168611,-0.292437,-0.187233,-0.301907,-0.17963,-0.510078,-0.467002
2,1.890101,1.943028,-0.869935,-0.514333,-0.059266,0.277707,1.465608,-0.20983,0.532172,-0.371936,...,-0.304802,-0.290815,-0.288099,-0.168611,-0.292437,-0.187233,-0.301907,-0.17963,-0.848408,-0.888034
3,4.29851,3.793345,-1.02398,-0.327474,-0.135636,0.674668,1.465608,-0.009001,0.054536,-0.048595,...,-0.304802,-0.290815,-0.288099,-0.168611,-0.292437,-0.187233,-0.301907,-0.17963,1.523851,1.535081
4,0.618244,0.452496,-0.613193,1.70741,-0.821561,0.679008,0.886413,-0.346226,0.591326,-0.3185,...,-0.304802,-0.290815,-0.288099,-0.168611,-0.292437,-0.187233,-0.301907,-0.17963,-0.833157,-0.872893


In [1306]:
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model,train_tmp,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.7088537922931876

## SelectPercentile

In [1361]:
sp = SelectPercentile(percentile=10) # 상위 10% 피처 선택
x = sp.fit_transform(train_ft, target)
scores = cross_val_score(model, x, target, cv=cv, scoring="f1_macro", n_jobs=-1)
scores.mean()

0.7008831973461165

In [1362]:
best_cols = sp.get_feature_names_out()
best_cols

array(['구매횟수', '내점일수', '일별평균구매건수', '거래개월수', '12시이전구매비율',
       '12시이후_18시이전구매비율', '18시이후구매비율', '평일_18시_구매비율', '12시이전구매횟수',
       '12시이후_18시이전구매횟수', '방문평균거래평균횟수', '일별최대구매횟수', '일별평균구매횟수',
       '일별구매횟수표준편차', '목요일방문비율', '거래주수', '거래기간일수', '구매월요일cnt', '구매화요일cnt',
       '구매수요일cnt', '구매금요일cnt', '구매토요일cnt', '구매일요일cnt', '주구매시간', '추석전후구매',
       '많은_거래개월수', '적은_거래개월수', '구매주기짧고_거래개월많음', '많은_내점일수', '적은_내점일수',
       '구매주기짧고_내점일수많음', '3회이상count', '중분류_nunique', '중분류_아동_cnt', '골프유무',
       '골프count', '유아관련유무', '유아관련count', '대분류_nunique', '대분류_아동_cnt',
       '대_주구매비율', '대분류_여성_cnt', '대분류_남성_cnt', '대분류_생식품_cnt', '남_여모두구매',
       '총_생식품_구매가격', '총구매액', '구매건수', '최대구매액', '환불건수', '구매금액왜도', '구매금액첨도',
       '최대구매액 - 최소구매액', '같은가격재구매횟수', '같은중분류재구매횟수', '반복중분류비율', '실제구매횟수',
       '거래개월대비구매비용', '거래개월대비구매횟수', '2004-05월_count', '2004-06월_count',
       '2004-07월_count', '2004-08월_count', '2004-09월_count',
       '2004-10월_count', '2004-11월_count', '2004-12월_count',
       '2005-01월_count', '2005-04월_coun

In [1363]:
train_tmp = train_ft[best_cols]
train_tmp.head()

Unnamed: 0,구매횟수,내점일수,일별평균구매건수,거래개월수,12시이전구매비율,12시이후_18시이전구매비율,18시이후구매비율,평일_18시_구매비율,12시이전구매횟수,12시이후_18시이전구매횟수,...,수정_중_pivot_cnt_청과,중분류_nunique_bin,가족단위_구매비율,중분류별_구매횟수_std,수정_중분류별_구매횟수_std,대분류별_구매횟수_std,대분류별_구매횟수_skew,대분류별_구매횟수_kurt,주구매_대분류_2,주구매_대분류_8
0,-0.437126,-0.369867,-0.057297,0.01762,0.718557,-0.65415,0.071084,0.07674,-0.020842,-0.460006,...,-0.249109,-0.420306,-0.328996,-0.115097,-0.121569,-0.377,0.567283,0.663751,-0.235019,-0.319269
1,0.239394,0.14411,0.222706,1.17601,0.269465,-0.542415,0.332492,0.513533,0.231831,-0.046467,...,0.283322,1.360064,-0.437509,-0.056149,-0.058116,-0.027378,-0.792691,-0.751426,4.254981,-0.319269
2,1.890101,1.943028,0.277707,1.465608,-0.20983,0.532172,-0.371936,-0.271929,0.652953,2.321984,...,-0.249109,1.360064,1.319372,1.463979,1.435398,1.667174,-0.40099,-0.51401,-0.235019,-0.319269
3,4.29851,3.793345,0.674668,1.465608,-0.009001,0.054536,-0.048595,-0.024696,2.337441,4.088923,...,0.283322,1.360064,5.349865,3.48353,3.524919,3.726162,-0.393535,-0.505595,-0.235019,-0.319269
4,0.618244,0.452496,0.679008,0.886413,-0.346226,0.591326,-0.3185,-0.531483,-0.020842,0.930989,...,0.283322,1.360064,0.290913,0.547783,0.531968,0.400696,-1.35731,-1.152972,-0.235019,-0.319269


In [1364]:
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model,train_tmp,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

0.7008831973461165

# 모델 학습

In [1307]:
model = LGBMClassifier(random_state=SEED)
model.fit(train_tmp,target)

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.303461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 108469
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 1188
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


# 테스트 데이터 예측

In [None]:
pred = model.predict(test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
pred = model.predict_proba(test_ft)[:,1]
pred

array([0.08222887, 0.34809674, 0.33352402, ..., 0.08107   , 0.09808246,
       0.07521644])

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [None]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [None]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
submit.to_csv(f"{DATA_PATH}submit.csv",index=False)

In [None]:
submit.to_csv(f"{DATA_PATH}submit_proba.csv",index=False)