In [None]:
# Mount Google Drive into the Colab filesystem; the data directory used by the
# rest of the notebook lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

- 데이터 경로 변수

In [None]:
# Directory prefix for every CSV read or written below. The trailing slash is
# required because file names are appended to it directly inside f-strings.
DATA_PATH = "/content/drive/MyDrive/data/"
DATA_PATH

- 시드값

In [None]:
# Random seed passed to the CV splitter and the LightGBM models for reproducibility.
SEED = 42

- 데이터 불러오기

In [None]:
import pandas as pd
import numpy as np

# Read the four raw CSV files from DATA_PATH in a single pass.
train_tr, train_target, test_tr, submit = (
    pd.read_csv(f"{DATA_PATH}{file_name}")
    for file_name in (
        "store_train_transactions.csv",  # training purchase-record data
        "store_train.csv",               # training label data
        "store_test_transactions.csv",   # test purchase-record data
        "store_submission.csv",          # submission template
    )
)

# Show the (rows, columns) shape of each frame as a quick sanity check.
tuple(frame.shape for frame in (train_tr, train_target, test_tr, submit))

- 공통 피처 파일 불러오기

In [None]:
# Load the shared ("common") feature tables that were prepared beforehand.
train_ft = pd.read_csv(DATA_PATH + "train_common.csv")  # training features
test_ft = pd.read_csv(DATA_PATH + "test_common.csv")    # test features

# Shapes of both tables, for a quick look at row and column counts.
(train_ft.shape, test_ft.shape)

# 결측치 처리

In [None]:
# Count missing values per column once, then show only the columns that have any.
train_null_counts = train_ft.isna().sum()
mask = train_null_counts.gt(0)
train_null_counts.loc[mask]

In [None]:
# Same missing-value summary for the test feature table.
test_null_counts = test_ft.isna().sum()
mask = test_null_counts.gt(0)
test_null_counts.loc[mask]

In [None]:
# Replace missing values in the purchase-amount standard-deviation column with 0
# in both feature tables.
for feature_frame in (train_ft, test_ft):
    feature_frame["구매금액표준편차"] = feature_frame["구매금액표준편차"].fillna(0)

# 특성 공학(Feature Engineering)

- ID 변수 제외

In [None]:
# Drop the first column of each feature table by position (the ID column, per
# the heading of this section).
# NOTE(review): positional slicing is not idempotent -- every re-run of this
# cell removes one more column, so execute it exactly once after loading.
train_ft = train_ft.iloc[:,1:]
test_ft = test_ft.iloc[:,1:]
train_ft.shape, test_ft.shape

- 추가 피처 만들어 보기

In [None]:
# Names of all columns whose label starts with the "pivot_cnt_" prefix.
cols = train_ft.columns[train_ft.columns.str.startswith("pivot_cnt_")].tolist()

In [None]:
# Add row-wise standard deviation, skewness and kurtosis of the columns listed
# in `cols` to both feature tables (same three features, same insertion order).
for feature_frame in (train_ft, test_ft):
    pivot_counts = feature_frame[cols]
    feature_frame["중분류별_구매횟수_std"] = pivot_counts.std(axis=1)
    feature_frame["중분류별_구매횟수_skew"] = pivot_counts.skew(axis=1)
    feature_frame["중분류별_구매횟수_kurt"] = pivot_counts.kurt(axis=1)

train_ft.shape, test_ft.shape

## Feature Encoding

In [None]:
# Collect the object-dtype (string) columns and show how many distinct values
# each one holds. Note that this rebinds `cols`.
cols = list(train_ft.select_dtypes(include="object").columns)
train_ft.loc[:, cols].nunique()

In [None]:
# Install category_encoders into the running kernel's environment.
# TODO(review): no version is pinned here; consider pinning one for reproducibility.
%pip install category_encoders

In [None]:
import category_encoders as ce

In [None]:
# One-hot encode two categorical columns. The encoder is fitted on the training
# table only and then reused for the test table.
onehot_source_cols = ["주구매지점", "주구매_대분류"]
enc = ce.one_hot.OneHotEncoder()

train_onehot = enc.fit_transform(train_ft[onehot_source_cols])
test_onehot = enc.transform(test_ft[onehot_source_cols])

# Append the encoded columns to the right of the existing features.
train_ft = pd.concat([train_ft, train_onehot], axis=1)
test_ft = pd.concat([test_ft, test_onehot], axis=1)

train_ft.shape, test_ft.shape

In [None]:
# Count-encode one categorical column into a new "_cnt" feature. The encoder is
# fitted on the training table and applied unchanged to the test table.
enc = ce.count.CountEncoder()

train_counts = enc.fit_transform(train_ft[["주구매_중분류"]])
test_counts = enc.transform(test_ft[["주구매_중분류"]])

train_ft["주구매_중분류_cnt"] = train_counts
test_ft["주구매_중분류_cnt"] = test_counts

train_ft.shape, test_ft.shape

- 문자열 피처 삭제

In [None]:
# Display the object-dtype column names collected earlier; these are the
# columns removed by the following drop.
cols

In [None]:
# Remove every column listed in `cols` from both feature tables.
train_ft, test_ft = (
    feature_frame.drop(labels=cols, axis="columns")
    for feature_frame in (train_ft, test_ft)
)
train_ft.shape, test_ft.shape

In [None]:
# List any object-dtype columns still present in either table.
tuple(
    feature_frame.select_dtypes(include="object").columns
    for feature_frame in (train_ft, test_ft)
)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training features in the following cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then apply the same fitted
# transform to the test features.
train_scaled = scaler.fit_transform(train_ft)
test_scaled = scaler.transform(test_ft)

# Write the scaled arrays back so both objects stay DataFrames with their labels.
train_ft[train_ft.columns] = train_scaled
test_ft[test_ft.columns] = test_scaled

train_ft.head()

# 정답 데이터

In [None]:
# Label column used for training.
# NOTE(review): assumes train_target rows are in the same order as train_ft rows -- confirm.
target = train_target["target"]
target

# cv 점수 확인해보기

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Cross-validation splitter for the classifier evaluation below.
# Stratified folds keep the class proportions of the target roughly equal in
# every fold. Plain KFold can produce folds with skewed class ratios, which
# makes a macro-averaged F1 score noisy. Fold count, shuffling and seed are
# unchanged from the previous KFold configuration.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

In [None]:
from lightgbm import LGBMClassifier

# Estimate generalisation performance with macro-averaged F1 over the folds
# produced by `cv`, running the folds in parallel on all available cores.
model = LGBMClassifier(random_state=SEED)
scores = cross_val_score(
    estimator=model,
    X=train_ft,
    y=target,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
)

# Mean score across the folds.
scores.mean()

# 모델 학습

In [None]:
# Train a fresh classifier on the full training set, with the same seed as the
# model used in cross-validation.
model = LGBMClassifier(random_state=SEED)
model.fit(train_ft,target)

# 테스트 데이터 예측

In [None]:
# Predict class labels for the test features with the fitted model.
pred = model.predict(test_ft)
pred

# 평가를 위한 제출 파일 생성
- 예측 결과를 target 컬럼에 넣어 csv 파일로 저장한 후 제출한다.

In [None]:
# Inspect the submission template before filling in predictions.
submit

In [None]:
# Write the predictions into the template's target column.
# NOTE(review): assumes test_ft rows are in the same order as submit rows -- confirm.
submit["target"] = pred
submit

- 예측 결과를 csv 파일로 저장하여 제출

In [None]:
# Save the filled-in submission under DATA_PATH, omitting the DataFrame index.
submit.to_csv(f"{DATA_PATH}submit.csv",index=False)