In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
DATA_PATH = "/content/drive/MyDrive/the_datas/data/"
DATA_PATH

'/content/drive/MyDrive/the_datas/data/'

In [9]:
SEED = 42

In [10]:
import pandas as pd
import numpy as np

train_tr = pd.read_csv(f"{DATA_PATH}store_train_transactions.csv") # 학습용 구매기록 데이터
train_target = pd.read_csv(f"{DATA_PATH}store_train.csv") # 학습용 정답 데이터
test_tr = pd.read_csv(f"{DATA_PATH}store_test_transactions.csv") # 테스트용 구매기록 데이터
submit = pd.read_csv(f"{DATA_PATH}store_submission.csv") # 제출 양식 데이터

train_tr.shape , train_target.shape , test_tr.shape , submit.shape

((523105, 7), (14940, 2), (441196, 7), (12225, 2))

In [109]:
train_ft = pd.read_csv(f"{DATA_PATH}train_common.csv") # 학습 데이터(피처)
test_ft = pd.read_csv(f"{DATA_PATH}test_common.csv") # 테스트 데이터(피처)

train_ft.shape , test_ft.shape

((14940, 356), (12225, 356))

# 특성 생성 실험

0.7130706978777662

In [110]:
train_tr["구매일시"] = pd.to_datetime(train_tr["구매일시"])
test_tr["구매일시"] = pd.to_datetime(test_tr["구매일시"])

In [139]:
train_tr["구매일시"].min(), train_tr["구매일시"].max()

(Timestamp('2004-05-01 09:40:00'), Timestamp('2005-04-29 20:03:00'))

In [111]:
# 1월부터 12월까지의 구매 비율 추가
agg_list = []
agg_list += [(f'{n}월_구매비율', lambda x: np.mean(x.dt.month == n)) for n in range(1, 13)]

tmp = train_tr.groupby('ID')["구매일시"].agg(agg_list).reset_index()
train_ft = train_ft.merge(tmp, how='left',on="ID")
train_ft.head()

tmp = test_tr.groupby('ID')["구매일시"].agg(agg_list).reset_index()
test_ft = test_ft.merge(tmp, how='left',on="ID")
test_ft.head()

Unnamed: 0,ID,구매횟수,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,...,3월_구매비율,4월_구매비율,5월_구매비율,6월_구매비율,7월_구매비율,8월_구매비율,9월_구매비율,10월_구매비율,11월_구매비율,12월_구매비율
0,test_0,7,5,32,0.571429,0.285714,0.285714,0.428571,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,test_1,4,3,54,0.25,0.0,0.0,0.75,0.25,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,test_2,4,2,8,0.0,0.0,0.6,0.4,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,test_3,28,16,16,0.03125,0.0625,0.46875,0.34375,0.125,3,...,0.09375,0.09375,0.09375,0.09375,0.09375,0.09375,0.09375,0.09375,0.09375,0.09375
4,test_4,19,9,29,0.47619,0.380952,0.285714,0.238095,0.095238,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:

#대분류, 구매가격 pivot 추가

train_tmp = pd.pivot_table(train_tr,index="ID",columns="대분류",values="구매가격",aggfunc="count",fill_value=0).add_prefix("대_pivot_cnt").reset_index()
train_ft = train_ft.merge(train_tmp,how="left",on="ID")
train_ft.head()
test_tmp = pd.pivot_table(test_tr,index="ID",columns="대분류",values="구매가격",aggfunc="count",fill_value=0).add_prefix("대_pivot_cnt").reset_index()
test_tmp.head()

for col in train_tmp.columns:
    if col not in test_tmp.columns:
        test_tmp[col] = 0

test_tmp = test_tmp[train_tmp.columns]

test_ft = test_ft.merge(test_tmp,how="left",on="ID")
test_ft.head()

Unnamed: 0,ID,구매횟수,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,...,대_pivot_cnt여성캐주얼,대_pivot_cnt여성캐쥬얼,대_pivot_cnt영라이브,대_pivot_cnt영어덜트캐쥬얼,대_pivot_cnt영캐릭터,대_pivot_cnt영플라자,대_pivot_cnt잡화,대_pivot_cnt잡화파트,대_pivot_cnt케주얼_구두_아동,대_pivot_cnt패션잡화
0,test_0,7,5,32,0.571429,0.285714,0.285714,0.428571,0.0,1,...,0,1,0,0,0,0,1,0,0,0
1,test_1,4,3,54,0.25,0.0,0.0,0.75,0.25,0,...,1,0,0,0,0,0,0,0,0,0
2,test_2,4,2,8,0.0,0.0,0.6,0.4,0.0,0,...,0,0,0,0,0,0,0,2,2,0
3,test_3,28,16,16,0.03125,0.0625,0.46875,0.34375,0.125,3,...,0,0,0,0,2,15,0,0,0,13
4,test_4,19,9,29,0.47619,0.380952,0.285714,0.238095,0.095238,6,...,9,0,0,0,0,1,0,0,5,1


0.7107565592454816

In [113]:
q25, q50, q75 = train_ft['최소구매액'].quantile([0.25, 0.5, 0.75])

train_ft['최소구매액_bin'] = pd.cut(train_ft['최소구매액'], bins=[0, q25, q50, q75, np.inf], labels=[1, 2, 3, 4])
test_ft['최소구매액_bin'] = pd.cut(test_ft['최소구매액'], bins=[0, q25, q50, q75, np.inf], labels=[1, 2, 3, 4])

train_ft.shape, test_ft.shape

((14940, 397), (12225, 397))

In [114]:
q25, q50, q75 = train_ft['중분류_nunique'].quantile([0.25, 0.5, 0.75])
train_ft['중분류_nunique_bin'] = pd.cut(train_ft['중분류_nunique'], bins=[0, q25, q50, q75, np.inf], labels=[1, 2, 3, 4])
test_ft['중분류_nunique_bin'] = pd.cut(test_ft['중분류_nunique'], bins=[0, q25, q50, q75, np.inf], labels=[1, 2, 3, 4])

In [115]:
for i in range(1,5):
    for j in range(1,5):
        train_ft[f'최소구매액_중분류수_{i}_{j}'] = ((train_ft['최소구매액_bin'] == i) & (train_ft['중분류_nunique_bin'] == j)).astype(int)
        test_ft[f'최소구매액_중분류수_{i}_{j}'] = ((test_ft['최소구매액_bin'] == i) & (test_ft['중분류_nunique_bin'] == j)).astype(int)

0.7154512315623972

In [116]:
Q1 = train_tr['구매가격'].quantile(0.1)
Q3 = train_tr['구매가격'].quantile(0.75)
IQR = Q3 - Q1

In [117]:
Q3, Q1

(119700.0, 8100.0)

In [118]:
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [119]:
lower_bound, upper_bound

(-159300.0, 287100.0)

In [120]:
train_ft['구매가격_특이값'] = ((train_ft['총구매액'] > upper_bound)).astype(int)
test_ft['구매가격_특이값'] = ((test_ft['총구매액'] > upper_bound)).astype(int) #(test_ft['총구매액'] < lower_bound) |

0.714270454857448

In [121]:
purchase_counts = train_tr.groupby('ID')['구매가격'].count().reset_index()
purchase_counts.columns = ['ID', '구매횟수']

In [122]:
같은가격재구매 = train_tr[train_tr.duplicated(subset=['ID', '구매가격'], keep=False)]
같은가격재구매횟수 = 같은가격재구매.groupby('ID')['구매가격'].count().reset_index()
같은가격재구매횟수.columns = ['ID', '같은가격재구매횟수']

In [123]:
purchase_counts = purchase_counts.merge(같은가격재구매횟수, on='ID', how='left').fillna(0)
purchase_counts['반복구매비율'] = purchase_counts['같은가격재구매횟수'] / purchase_counts['구매횟수']


In [124]:
train_ft = train_ft.merge(purchase_counts[['ID', '반복구매비율']], on='ID', how='left')

In [125]:
train_ft.head()

Unnamed: 0,ID,구매횟수,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,...,최소구매액_중분류수_3_1,최소구매액_중분류수_3_2,최소구매액_중분류수_3_3,최소구매액_중분류수_3_4,최소구매액_중분류수_4_1,최소구매액_중분류수_4_2,최소구매액_중분류수_4_3,최소구매액_중분류수_4_4,구매가격_특이값,반복구매비율
0,train_0,16,11,23,0.25,0.05,0.25,0.4,0.3,3,...,0,1,0,0,0,0,0,0,1,0.1
1,train_1,41,21,16,0.02381,0.357143,0.166667,0.357143,0.119048,3,...,0,0,0,0,0,0,0,0,1,0.119048
2,train_2,102,56,6,0.210526,0.464912,0.140351,0.175439,0.219298,0,...,0,0,0,0,0,0,0,0,1,0.210526
3,train_3,191,92,3,0.189573,0.379147,0.180095,0.236967,0.203791,3,...,0,0,0,0,0,0,0,0,1,0.473934
4,train_4,55,27,11,0.258065,0.112903,0.612903,0.209677,0.064516,4,...,0,0,0,0,0,0,0,0,1,0.064516


In [126]:
purchase_counts = test_tr.groupby('ID')['구매가격'].count().reset_index()
purchase_counts.columns = ['ID', '구매횟수']
같은가격재구매 = test_tr[test_tr.duplicated(subset=['ID', '구매가격'], keep=False)]
같은가격재구매횟수 = 같은가격재구매.groupby('ID')['구매가격'].count().reset_index()
같은가격재구매횟수.columns = ['ID', '같은가격재구매횟수']
purchase_counts = purchase_counts.merge(같은가격재구매횟수, on='ID', how='left').fillna(0)
purchase_counts['반복구매비율'] = purchase_counts['같은가격재구매횟수'] / purchase_counts['구매횟수']
test_ft = test_ft.merge(purchase_counts[['ID', '반복구매비율']], on='ID', how='left')

In [127]:
test_ft.head()

Unnamed: 0,ID,구매횟수,내점일수,구매주기,주말방문비율,봄_구매비율,여름_구매비율,가을_구매비율,겨울_구매비율,주구매요일,...,최소구매액_중분류수_3_1,최소구매액_중분류수_3_2,최소구매액_중분류수_3_3,최소구매액_중분류수_3_4,최소구매액_중분류수_4_1,최소구매액_중분류수_4_2,최소구매액_중분류수_4_3,최소구매액_중분류수_4_4,구매가격_특이값,반복구매비율
0,test_0,7,5,32,0.571429,0.285714,0.285714,0.428571,0.0,1,...,0,0,0,0,1,0,0,0,1,0.0
1,test_1,4,3,54,0.25,0.0,0.0,0.75,0.25,0,...,0,0,0,0,1,0,0,0,1,0.0
2,test_2,4,2,8,0.0,0.0,0.6,0.4,0.0,0,...,0,0,0,0,1,0,0,0,1,0.0
3,test_3,28,16,16,0.03125,0.0625,0.46875,0.34375,0.125,3,...,0,0,0,0,0,0,0,0,1,0.125
4,test_4,19,9,29,0.47619,0.380952,0.285714,0.238095,0.095238,6,...,0,1,0,0,0,0,0,0,1,0.0


0.7143624217714903

In [128]:
purchase_counts = train_tr.groupby('ID')['중분류'].count().reset_index()
purchase_counts.columns = ['ID', '중분류수']
같은중분류재구매 = train_tr[train_tr.duplicated(subset=['ID', '중분류'], keep=False)]
같은중분류재구매횟수 = 같은중분류재구매.groupby('ID')['중분류'].count().reset_index()
같은중분류재구매횟수.columns = ['ID', '같은중분류재구매횟수']
purchase_counts = purchase_counts.merge(같은중분류재구매횟수, on='ID', how='left').fillna(0)
purchase_counts['반복중분류비율'] = purchase_counts['같은중분류재구매횟수'] / purchase_counts['중분류수']
train_ft = train_ft.merge(purchase_counts[['ID', '반복중분류비율']], on='ID', how='left')

In [129]:
purchase_counts = test_tr.groupby('ID')['중분류'].count().reset_index()
purchase_counts.columns = ['ID', '중분류수']
같은중분류재구매 = test_tr[test_tr.duplicated(subset=['ID', '중분류'], keep=False)]
같은중분류재구매횟수 = 같은중분류재구매.groupby('ID')['중분류'].count().reset_index()
같은중분류재구매횟수.columns = ['ID', '같은중분류재구매횟수']
purchase_counts = purchase_counts.merge(같은중분류재구매횟수, on='ID', how='left').fillna(0)
purchase_counts['반복중분류비율'] = purchase_counts['같은중분류재구매횟수'] / purchase_counts['중분류수']
test_ft = test_ft.merge(purchase_counts[['ID', '반복중분류비율']], on='ID', how='left')

In [130]:
# agg_list = [
#              ('주구매_브랜드코드', lambda x:x.mode[0]),
#              ]

# tmp = train_tr.groupby('ID')["브랜드코드"].agg(agg_list).reset_index()
# train_ft = train_ft.merge(tmp, how='left',on="ID")
# train_ft.head()

# 마무리

In [131]:
train_ft.shape, test_ft.shape

((14940, 417), (12225, 417))

In [132]:
train_ft.to_csv(f"{DATA_PATH}train_common_실험.csv",index=False)
test_ft.to_csv(f"{DATA_PATH}test_common_실험.csv",index=False)