In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


- 데이터 경로 변수

In [2]:
DATA_PATH = "/content/drive/MyDrive/the_datas/data/"
DATA_PATH

'/content/drive/MyDrive/the_datas/data/'

- 시드값

In [3]:
SEED = 42

- 데이터 불러오기

In [4]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

- 공통 피처 파일 불러오기

In [5]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 356), (12225, 356))

# 결측치 처리

In [6]:
mask = train_ft.isnull().sum() > 0
train_ft.isnull().sum()[mask]

Unnamed: 0,0
구매금액표준편차,388
구매금액왜도,840
구매금액첨도,1265


In [7]:
mask = test_ft.isnull().sum() > 0
test_ft.isnull().sum()[mask]

Unnamed: 0,0
구매금액표준편차,242
구매금액왜도,599
구매금액첨도,954


In [8]:
train_ft["구매금액표준편차"] = train_ft["구매금액표준편차"].fillna(0)
train_ft["구매금액왜도"] = train_ft["구매금액왜도"].fillna(0)
train_ft["구매금액첨도"] = train_ft["구매금액첨도"].fillna(0)

test_ft["구매금액표준편차"] = test_ft["구매금액표준편차"].fillna(0)
test_ft["구매금액왜도"] = test_ft["구매금액왜도"].fillna(0)
test_ft["구매금액첨도"] = test_ft["구매금액첨도"].fillna(0)

In [9]:
train_ft.isnull().sum().sum(), test_ft.isnull().sum().sum()

(0, 0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [10]:
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

((14940, 355), (12225, 355))

- 추가 피처 만들어 보기

In [11]:
cols = [ col for col in train_ft.columns if col.startswith("pivot_cnt_") ]

In [12]:
train_ft["중분류별_구매횟수_std"] = train_ft[cols].std(axis=1)
train_ft["중분류별_구매횟수_skew"] = train_ft[cols].skew(axis=1)
train_ft["중분류별_구매횟수_kurt"] = train_ft[cols].kurt(axis=1)

test_ft["중분류별_구매횟수_std"] = test_ft[cols].std(axis=1)
test_ft["중분류별_구매횟수_skew"] = test_ft[cols].skew(axis=1)
test_ft["중분류별_구매횟수_kurt"] = test_ft[cols].kurt(axis=1)

train_ft.shape, test_ft.shape

((14940, 358), (12225, 358))

## Feature Encoding

In [13]:
cols = train_ft.select_dtypes("object").columns.tolist()
train_ft[cols].nunique()

Unnamed: 0,0
주구매지점,4
주구매_중분류,246
주구매_대분류,28


In [14]:
%pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [15]:
import category_encoders as ce

In [16]:
enc = ce.one_hot.OneHotEncoder()
tmp = enc.fit_transform(train_ft[["주구매지점","주구매_대분류"]])
train_ft = pd.concat([train_ft,tmp],axis =1)

tmp = enc.transform(test_ft[["주구매지점","주구매_대분류"]])
test_ft = pd.concat([test_ft,tmp],axis =1)

train_ft.shape, test_ft.shape

((14940, 390), (12225, 390))

In [17]:
enc = ce.count.CountEncoder()
train_ft["주구매_중분류_cnt"] = enc.fit_transform(train_ft[["주구매_중분류"]])
test_ft["주구매_중분류_cnt"] = enc.transform(test_ft[["주구매_중분류"]])

train_ft.shape, test_ft.shape

((14940, 391), (12225, 391))

- 문자열 피처 삭제

In [18]:
cols

['주구매지점', '주구매_중분류', '주구매_대분류']

In [19]:
train_ft = train_ft.drop(columns=cols)
test_ft = test_ft.drop(columns=cols)
train_ft.shape, test_ft.shape

((14940, 388), (12225, 388))

In [20]:
train_ft.select_dtypes("object").columns , test_ft.select_dtypes("object").columns

(Index([], dtype='object'), Index([], dtype='object'))

## Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [22]:
train_ft[train_ft.columns] = scaler.fit_transform(train_ft)
test_ft[test_ft.columns] = scaler.transform(test_ft)
train_ft.head()

Unnamed: 0,구매횟수,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,일별평균구매건수,...,주구매_대분류_20,주구매_대분류_21,주구매_대분류_22,주구매_대분류_23,주구매_대분류_24,주구매_대분류_25,주구매_대분류_26,주구매_대분류_27,주구매_대분류_28,주구매_중분류_cnt
0,-0.437126,-0.369867,0.002987,0.257728,-1.029777,0.001191,0.838272,0.338186,0.109631,-0.057297,...,-0.081258,-0.18062,-0.171941,-0.124766,-0.074744,-0.08084,-0.089606,-0.101046,-0.048458,-0.72697
1,0.239394,0.14411,-0.356452,-1.008554,0.323951,-0.390607,0.620171,-0.552996,0.109631,0.222706,...,-0.081258,-0.18062,-0.171941,-0.124766,-0.074744,-0.08084,-0.089606,-0.101046,-0.048458,-0.510078
2,1.890101,1.943028,-0.869935,0.036742,0.798943,-0.514333,-0.304527,-0.059266,-1.64337,0.277707,...,-0.081258,-0.18062,-0.171941,-0.124766,-0.074744,-0.08084,-0.089606,-0.101046,-0.048458,-0.848408
3,4.29851,3.793345,-1.02398,-0.080558,0.420933,-0.327474,0.008592,-0.135636,0.109631,0.674668,...,-0.081258,-0.18062,-0.171941,-0.124766,-0.074744,-0.08084,-0.089606,-0.101046,-0.048458,1.523851
4,0.618244,0.452496,-0.613193,0.302875,-0.752532,1.70741,-0.130285,-0.821561,0.693965,0.679008,...,-0.081258,-0.18062,-0.171941,-0.124766,-0.074744,-0.08084,-0.089606,-0.101046,-0.048458,-0.833157


# 정답 데이터

In [23]:
target = train_target["target"]
target

Unnamed: 0,target
0,1.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
14935,0.0
14936,0.0
14937,0.0
14938,1.0


# cv 점수 확인해보기

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
cv = KFold(n_splits=5,shuffle=True, random_state=SEED)

In [25]:
from lightgbm import LGBMClassifier

model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(model,train_ft,target,cv = cv ,scoring='f1_macro',n_jobs = -1)
np.mean(scores)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



0.7154512315623972

# 모델 학습

In [26]:
model = LGBMClassifier(random_state=SEED)
model.fit(train_ft,target)

[LightGBM] [Info] Number of positive: 5874, number of negative: 9066
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12769
[LightGBM] [Info] Number of data points in the train set: 14940, number of used features: 372
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.393173 -> initscore=-0.433995
[LightGBM] [Info] Start training from score -0.433995


# 테스트 데이터 예측

In [27]:
pred = model.predict(test_ft)
pred

array([0., 1., 0., ..., 0., 0., 0.])

In [28]:
# pred = model.predict_proba(test_ft)[:,1]
# pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장후에 제출한다.

In [29]:
submit

Unnamed: 0,ID,target
0,test_0,0.5
1,test_1,0.5
2,test_2,0.5
3,test_3,0.5
4,test_4,0.5
...,...,...
12220,test_12220,0.5
12221,test_12221,0.5
12222,test_12222,0.5
12223,test_12223,0.5


In [30]:
submit["target"] = pred
submit

Unnamed: 0,ID,target
0,test_0,0.0
1,test_1,1.0
2,test_2,0.0
3,test_3,1.0
4,test_4,1.0
...,...,...
12220,test_12220,1.0
12221,test_12221,0.0
12222,test_12222,0.0
12223,test_12223,0.0


- 예측 결과를 csv 파일로 저장하여 제출

In [31]:
submit.to_csv(f"{DATA_PATH}submit.csv",index=False)