In [149]:
# Mount Google Drive inside the Colab runtime so the CSV files under
# /content/drive can be read and written by the cells below.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


- 데이터 경로 변수

In [150]:
# Directory (with trailing slash) holding every input CSV and the output
# submission file; all read/write paths in this notebook are built from it.
DATA_PATH = (
    "/content/drive/MyDrive/the_datas/data/"
)
DATA_PATH

'/content/drive/MyDrive/the_datas/data/'

- 시드값

In [151]:

# Single random seed shared by the CV splitter and the LightGBM models below.
SEED = int(42)

- 데이터 불러오기

In [152]:
import pandas as pd
import numpy as np

# Raw competition files, all read from DATA_PATH.
train_tr = pd.read_csv(DATA_PATH + "store_train_transactions.csv")  # training purchase records
train_target = pd.read_csv(DATA_PATH + "store_train.csv")  # training labels
test_tr = pd.read_csv(DATA_PATH + "store_test_transactions.csv")  # test purchase records
submit = pd.read_csv(DATA_PATH + "store_submission.csv")  # submission template

# Show the four shapes as a quick sanity check.
tuple(frame.shape for frame in (train_tr, train_target, test_tr, submit))

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [153]:
# Pre-computed common feature tables (one file for train, one for test).
train_ft = pd.read_csv(DATA_PATH + "train_common_1.csv")  # training features
test_ft = pd.read_csv(DATA_PATH + "test_common_1.csv")  # test features

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1142), (12225, 1142))

# 결측치 처리

In [154]:
# Per-column null counts for the training features; display only the columns
# that actually contain missing values.
train_null_counts = train_ft.isnull().sum()
mask = train_null_counts > 0
train_null_counts[mask]

Unnamed: 0,0
일별구매횟수표준편차,664
3회이상count,4214
총_생식품_구매가격,9711
평균_생식품_구매가격,1574
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265
2004-05월_count_diff,14940
월별_구매총액_변화_비율2_1,4251
월별_구매총액_변화_비율3_2,4209


In [155]:

# Same null-count report as the previous cell, for the test features.
test_null_counts = test_ft.isnull().sum()
mask = test_null_counts > 0
test_null_counts[mask]

Unnamed: 0,0
일별구매횟수표준편차,443
3회이상count,3284
총_생식품_구매가격,7820
평균_생식품_구매가격,1257
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954
2004-05월_count_diff,12225
월별_구매총액_변화_비율2_1,3335
월별_구매총액_변화_비율3_2,3269


In [156]:
# Purchase-amount dispersion statistics (std / skew / kurtosis): missing values
# are replaced with 0 in both train and test.
for frame in (train_ft, test_ft):
    for column in ("구매금액표준편차", "구매금액왜도", "구매금액첨도"):
        frame[column] = frame[column].fillna(0)

In [157]:
# Fresh-food price features and the "3회이상count" feature: fill missing values
# with 0 in both train and test.
zero_fill_columns = ["총_생식품_구매가격", "평균_생식품_구매가격", "3회이상count"]

for column in zero_fill_columns:
    train_ft[column] = train_ft[column].fillna(0)

for column in zero_fill_columns:
    test_ft[column] = test_ft[column].fillna(0)


In [158]:
# Standard deviation of daily purchase counts: fill missing values with 0.
daily_std_col = "일별구매횟수표준편차"
for frame in (train_ft, test_ft):
    frame[daily_std_col] = frame[daily_std_col].fillna(0)

In [159]:
# NOTE(review): the null-count outputs shown earlier in this notebook report this
# column as null for every row (14940 train / 12225 test), so after this fill it
# is a constant 0 column.
count_diff_col = "2004-05월_count_diff"
for frame in (train_ft, test_ft):
    frame[count_diff_col] = frame[count_diff_col].fillna(0)

In [160]:
# Month-over-month purchase-total ratio columns: the wrap-around column
# (…_1_12) plus the eleven consecutive-month columns (2_1 … 12_11).
# Missing values in all of them are replaced with 0.
ratio_columns = ["월별_구매총액_변화비율_1_12"] + [
    f"월별_구매총액_변화_비율{month + 1}_{month}" for month in range(1, 12)
]

for frame in (train_ft, test_ft):
    for column in ratio_columns:
        frame[column] = frame[column].fillna(0)

In [161]:
# Total number of remaining nulls in (train, test).
tuple(frame.isnull().sum().sum() for frame in (train_ft, test_ft))

(0, 0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [162]:
# Drop the first column of each feature table by position.
# NOTE(review): this is positional and not idempotent; re-running the cell
# removes one more column each time.
train_ft, test_ft = (frame.iloc[:, 1:] for frame in (train_ft, test_ft))
tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1141), (12225, 1141))

- 추가 피처 만들어 보기

In [163]:
# Names of the columns whose name begins with the "중_pivot_cnt_" prefix.
cols = list(filter(lambda name: name.startswith("중_pivot_cnt_"), train_ft.columns))

In [164]:
# Row-wise spread statistics (std, skew, kurtosis) over the columns in `cols`,
# added as three new features to both train and test.
for frame in (train_ft, test_ft):
    counts = frame[cols]
    for stat in ("std", "skew", "kurt"):
        frame[f"중분류별_구매횟수_{stat}"] = getattr(counts, stat)(axis=1)

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1144), (12225, 1144))

In [165]:
# Names of the columns whose name begins with the "수정_중_pivot_cnt_" prefix.
cols = list(filter(lambda name: name.startswith("수정_중_pivot_cnt_"), train_ft.columns))

In [166]:
# Row-wise spread statistics (std, skew, kurtosis) over the columns in `cols`,
# added as three new "수정_" features to both train and test.
for frame in (train_ft, test_ft):
    counts = frame[cols]
    for stat in ("std", "skew", "kurt"):
        frame[f"수정_중분류별_구매횟수_{stat}"] = getattr(counts, stat)(axis=1)

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1147), (12225, 1147))

In [167]:
# Names of the columns whose name begins with the "대_pivot_cnt" prefix.
cols_대 = list(filter(lambda name: name.startswith("대_pivot_cnt"), train_ft.columns))

In [168]:
# Row-wise spread statistics (std, skew, kurtosis) over the columns in `cols_대`,
# added as three new features to both train and test.
for frame in (train_ft, test_ft):
    counts = frame[cols_대]
    for stat in ("std", "skew", "kurt"):
        frame[f"대분류별_구매횟수_{stat}"] = getattr(counts, stat)(axis=1)

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1150), (12225, 1150))

## Feature Encoding

In [169]:
# Remove the "하루 구매 시간 간격" column from both feature tables.
dropped_columns = ["하루 구매 시간 간격"]
train_ft, test_ft = (
    train_ft.drop(dropped_columns, axis=1),
    test_ft.drop(dropped_columns, axis=1),
)
tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1149), (12225, 1149))

In [170]:
# Collect the object-dtype (string) columns and show how many distinct values
# each has in the training data. `cols` is reused later to drop these columns.
object_frame = train_ft.select_dtypes("object")
cols = list(object_frame.columns)
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류,28
주구매_수정_중분류,211
최대구매액_대분류,28
최소구매액_대분류,28


In [171]:
# Install the category_encoders package into the running kernel's environment.
# NOTE(review): the version is not pinned, so results may vary between runs.
%pip install category_encoders



In [172]:
import category_encoders as ce

In [173]:
# One-hot encode the low-cardinality categorical columns. The encoder is fitted
# on train only and then applied to test, and the resulting indicator columns
# are appended to each feature table.
onehot_columns = ["주구매지점","주구매_대분류", "최소구매액_대분류", "최대구매액_대분류"]

enc = ce.one_hot.OneHotEncoder()
train_onehot = enc.fit_transform(train_ft[onehot_columns])
test_onehot = enc.transform(test_ft[onehot_columns])

train_ft = pd.concat([train_ft, train_onehot], axis=1)
test_ft = pd.concat([test_ft, test_onehot], axis=1)

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1237), (12225, 1237))

In [174]:
# Count-encode "주구매_중분류": the encoder is fitted on train and applied to
# test, and the result is stored in a new "_cnt" column.
source_column, encoded_column = "주구매_중분류", "주구매_중분류_cnt"

enc = ce.count.CountEncoder()
train_ft[encoded_column] = enc.fit_transform(train_ft[[source_column]])
test_ft[encoded_column] = enc.transform(test_ft[[source_column]])

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1238), (12225, 1238))

In [175]:
# Count-encode "주구매_수정_중분류": the encoder is fitted on train and applied
# to test, and the result is stored in a new "_cnt" column.
source_column, encoded_column = "주구매_수정_중분류", "주구매_수정_중분류_cnt"

enc = ce.count.CountEncoder()
train_ft[encoded_column] = enc.fit_transform(train_ft[[source_column]])
test_ft[encoded_column] = enc.transform(test_ft[[source_column]])

tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1239), (12225, 1239))

- 문자열 피처 삭제

In [176]:
# Display the list of object-dtype column names that the next cell drops.
cols

['주구매지점', '주구매_중분류', '주구매_대분류', '주구매_수정_중분류', '최대구매액_대분류', '최소구매액_대분류']

In [177]:
# The string columns have been encoded above; remove the originals from both
# feature tables.
train_ft, test_ft = (frame.drop(columns=cols) for frame in (train_ft, test_ft))
tuple(frame.shape for frame in (train_ft, test_ft))

((14940, 1233), (12225, 1233))

In [178]:
# Confirm that no object-dtype columns remain in (train, test).
tuple(frame.select_dtypes("object").columns for frame in (train_ft, test_ft))

(Index([], dtype='object'), Index([], dtype='object'))

## Inf 값 처리

In [179]:
# Count infinite values per column in the training features and display only
# the columns that contain at least one.
train_inf_counts = np.isinf(train_ft).sum()
mask = train_inf_counts > 0
train_inf_counts[mask]

Unnamed: 0,0
월별_구매총액_변화_비율2_1,2114
월별_구매총액_변화_비율3_2,2753
월별_구매총액_변화_비율4_3,2737
월별_구매총액_변화_비율5_4,2650
월별_구매총액_변화_비율6_5,2141
월별_구매총액_변화_비율7_6,2392
월별_구매총액_변화_비율8_7,1803
월별_구매총액_변화_비율9_8,2760
월별_구매총액_변화_비율10_9,2548
월별_구매총액_변화_비율11_10,2015


In [180]:
# Count infinite values per column in the test features and display only the
# columns that contain at least one.
test_inf_counts = np.isinf(test_ft).sum()
mask = test_inf_counts > 0
test_inf_counts[mask]

Unnamed: 0,0
월별_구매총액_변화_비율2_1,1811
월별_구매총액_변화_비율3_2,2226
월별_구매총액_변화_비율4_3,2230
월별_구매총액_변화_비율5_4,2116
월별_구매총액_변화_비율6_5,1805
월별_구매총액_변화_비율7_6,1903
월별_구매총액_변화_비율8_7,1388
월별_구매총액_변화_비율9_8,2200
월별_구매총액_변화_비율10_9,2125
월별_구매총액_변화_비율11_10,1677


In [181]:
# Columns that contain +/-inf in EITHER feature table.
#
# The previous version indexed the train inf-counts with whatever `mask` was left
# over from the preceding cell (computed from test_ft). That silently depended on
# cell execution order and would miss columns that have inf only in train.
# Here the selection is computed explicitly from both frames, so the cell no
# longer relies on a leftover variable.
has_inf = np.isinf(train_ft).any() | np.isinf(test_ft).any()
cols = has_inf[has_inf].index
cols

Index(['월별_구매총액_변화_비율2_1', '월별_구매총액_변화_비율3_2', '월별_구매총액_변화_비율4_3',
       '월별_구매총액_변화_비율5_4', '월별_구매총액_변화_비율6_5', '월별_구매총액_변화_비율7_6',
       '월별_구매총액_변화_비율8_7', '월별_구매총액_변화_비율9_8', '월별_구매총액_변화_비율10_9',
       '월별_구매총액_변화_비율11_10', '월별_구매총액_변화_비율12_11', '월별_구매총액_변화비율_1_12'],
      dtype='object')

In [182]:
# Replace +/-inf in the selected training columns with the largest finite value
# found across all of those columns (a single global value, not per column).
train_inf = train_ft[cols].replace([np.inf, -np.inf], np.nan)  # inf -> NaN
train_finite_max = np.nanmax(train_inf.to_numpy())
train_inf = np.nan_to_num(train_inf, nan=train_finite_max)  # NaN -> finite max (ndarray)
np.isinf(train_inf).sum().sum()

0

In [183]:
# Replace +/-inf in the selected test columns.
#
# The fill value is the largest finite value of the same columns in the TRAINING
# data, not the test data. The previous version used the test set's own maximum,
# so an "infinite ratio" was encoded with a different number in train and test;
# this now follows the same fit-on-train / apply-to-test convention as the
# encoders and the scaler in this notebook.
# Computing the value from train_ft directly (with inf masked out) gives the same
# result whether or not the train replacement has already been written back.
train_finite = train_ft[cols].replace([np.inf, -np.inf], np.nan)
inf_fill_value = np.nanmax(train_finite.to_numpy())

test_inf = test_ft[cols].replace([np.inf, -np.inf], np.nan)  # inf -> NaN
test_inf = np.nan_to_num(test_inf, nan=inf_fill_value)  # NaN -> train finite max (ndarray)
np.isinf(test_inf).sum().sum()

0

In [184]:
# Write the inf-free arrays back into the corresponding feature-table columns.
train_ft[cols], test_ft[cols] = train_inf, test_inf


In [185]:
# Total number of remaining infinite values in (train, test).
tuple(np.isinf(frame).sum().sum() for frame in (train_ft, test_ft))

(0, 0)

## Feature Scaling

In [187]:
from sklearn.preprocessing import StandardScaler

# Scaler instance; it is fitted on the training features in the next cell.
scaler = StandardScaler()

In [188]:
# Fit the scaler on the training features, apply the same transform to the test
# features, and overwrite both tables with the scaled values.
# NOTE(review): because the tables are overwritten, re-running this cell refits
# the scaler on already-scaled data.
scaled_train = scaler.fit_transform(train_ft)
scaled_test = scaler.transform(test_ft)

train_ft[train_ft.columns] = scaled_train
test_ft[test_ft.columns] = scaled_test
train_ft.head()

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,구매횟수,내점일수,구매주기,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,거래개월수,...,최대구매액_대분류_21,최대구매액_대분류_22,최대구매액_대분류_23,최대구매액_대분류_24,최대구매액_대분류_25,최대구매액_대분류_26,최대구매액_대분류_27,최대구매액_대분류_28,주구매_중분류_cnt,주구매_수정_중분류_cnt
0,-0.437126,-0.369867,0.002987,-1.029777,0.001191,0.838272,0.338186,0.109631,-0.057297,0.01762,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.72697,-0.763996
1,0.239394,0.14411,-0.356452,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.222706,1.17601,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.510078,-0.467002
2,1.890101,1.943028,-0.869935,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,0.277707,1.465608,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.848408,-0.888034
3,4.29851,3.793345,-1.02398,0.420933,-0.327474,0.008592,-0.135636,0.109631,0.674668,1.465608,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,1.523851,1.535081
4,0.618244,0.452496,-0.613193,-0.752532,1.70741,-0.130285,-0.821561,0.693965,0.679008,0.886413,...,-0.07785,-0.1402,-0.200973,-0.090737,-0.071977,-0.146044,-0.067118,-0.050497,-0.833157,-0.872893


# 정답 데이터

In [189]:
# Label column used as y for cross-validation and training.
# NOTE(review): rows are assumed to be in the same order as train_ft; confirm.
target = train_target.loc[:, "target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# cv 점수 확인해보기

In [190]:
from sklearn.model_selection import KFold, cross_val_score

# 5-fold splitter with shuffling; seeded so the folds are reproducible.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)

In [191]:
from lightgbm import LGBMClassifier

# Cross-validated macro-F1 of a default LightGBM classifier on the training
# features; the cell displays the mean over the folds.
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(
    model,
    train_ft,
    target,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
)
scores.mean()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



0.7151941228633614

# 모델 학습

In [192]:
# Train the final model on the full training set (fit returns the estimator,
# which is displayed as the cell output).
model = LGBMClassifier(random_state=SEED).fit(train_ft, target)
model

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.479805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110280
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 1197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


# 테스트 데이터 예측

In [193]:
# Predicted class labels for the test features.
pred = model.predict(X=test_ft)
pred

array([0., 0., 0., ..., 0., 0., 0.])

In [194]:
# pred = model.predict_proba(test_ft)[:,1]
# pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장한 후 제출한다.

In [195]:
# Display the submission template before it is filled with predictions.
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [196]:
# Put the predictions into the template's existing "target" column.
submit = submit.assign(target=pred)
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,0.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


- 예측 결과를 csv 파일로 저장하여 제출

In [197]:
# Write the submission file next to the input data, without the index column.
submission_path = DATA_PATH + "submit_5.csv"
submit.to_csv(submission_path, index=False)