### Imports

In [1]:
# for read data
import os
import pickle

# for feature generation
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings("ignore")

# for confirm features
import matplotlib.pyplot as plt
import seaborn as sns
# Render all plot text with the Malgun Gothic font family
# (presumably so the Korean feature names used below display correctly).
plt.rc('font', family='Malgun Gothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rc('axes', unicode_minus=False)
import warnings; warnings.filterwarnings("ignore")

### Read Data

In [2]:
# Absolute path of the directory that holds every input file read below.
path = os.path.abspath("./input")

# Both transaction tables are read with the same options: the two date
# columns are parsed to datetimes and the files are decoded as cp949.
csv_options = {'parse_dates': ['SALES_DATE', 'SALES_DAY'], 'encoding': 'cp949'}
train = pd.read_csv(path + '/X_train_DC.csv', **csv_options)
test = pd.read_csv(path + '/X_test_DC.csv', **csv_options)
y_train = pd.read_csv(path + '/y_train.csv')

# The pickle holds six collections, unpacked in the order they were stored.
# NOTE(review): unpickling can execute arbitrary code -- only load FE_group.pkl
# when it was produced by a trusted notebook.
fe_groups = pd.read_pickle(path + '/FE_group.pkl')
goodcdset, brd_nmset, corner_nmset, pc_nmset, part_nmset, buyer_nmset = fe_groups

### Concat Data
train, test data로 한번에 Feature Generation하고 추후 Feature Transformation, Feature Selection을 위해 분리한다.

In [3]:
# Stack train and test into a single frame so every feature below is
# generated once for both splits.
data = pd.concat([train, test]).reset_index(drop=True)

# Customer ids of each split; kept so the generated features can be separated
# back into train and test by custid later on.
train_ID = train.custid.unique()
test_ID = test.custid.unique()

# Base feature table: one row per customer id, train ids followed by test ids.
# The id arrays computed above are reused instead of calling unique() again.
feature = pd.DataFrame({'custid': train_ID.tolist() + test_ID.tolist()})

- **[실제구매액, 구매건수, 평균구매액, 최대구매액]** (최소구매액은 아래 코드에서 생성하지 않는다)

In [4]:
# Per-customer purchase summary on net_amt: total, number of rows,
# rounded mean and maximum.
amount_spec = [
    ('실제구매액', 'sum'),
    ('구매건수', 'size'),
    ('평균구매액', lambda x: np.round(np.mean(x))),
    ('최대구매액', 'max'),
]
amount = data.groupby('custid')['net_amt'].agg(amount_spec).reset_index()

feature = feature.merge(amount, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액
0,0,1567800,11,142527.0,409500
1,2,3495760,11,317796.0,1416000
2,3,2208840,30,73628.0,589950
3,4,1023200,4,255800.0,560000
4,5,4692499,32,146641.0,613800
...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850
35963,49990,202350,1,202350.0,202350
35964,49992,209950,2,104975.0,159600
35965,49993,139529,4,34882.0,66500


- **[월별평균구매액, 월별최대구매액, 월별최소구매액]**

In [5]:
# Mean / max / min purchase amount per customer and calendar month.
# Customer-month combinations without any purchase are filled with 0.
month = pd.pivot_table(data, index='custid', columns='sales_month', values='net_amt',
                       aggfunc=['mean', 'max', 'min'], fill_value=0)

# Label every column from its own (statistic, month) pair instead of assuming
# that exactly 12 months are present in a fixed positional order; with fewer or
# more months the old positional naming would mislabel columns or fail.
stat_label = {'mean': '월평균구매액', 'max': '월최대구매액', 'min': '월최소구매액'}
column_name = list(stat_label.values())
month.columns = [f'{sales_month}{stat_label[stat]}' for stat, sales_month in month.columns]

feature = pd.merge(feature, month, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,3월최소구매액,4월최소구매액,5월최소구매액,6월최소구매액,7월최소구매액,8월최소구매액,9월최소구매액,10월최소구매액,11월최소구매액,12월최소구매액
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,264600,0,35100,0,157500,90000,0,0,86400
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,-391050,400500,131400,208800,1416000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,128250,-589950,77000,4200,6800,-70870,14800,0,0,64600
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,49000,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,124200,0,94500,66600,31500,21600,20335,44000,38700,58083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,49400,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,159600,0,0,50350,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


- **[환불건수, 환불비율, 환불금액]**

In [6]:
# Refund features per customer: sum and mean of the REFUND column and the
# summed REFUND_AMT.  Output name -> (source column, aggregation).
refund_spec = {
    '환불건수': ('REFUND', 'sum'),
    '환불비율': ('REFUND', 'mean'),
    '환불금액': ('REFUND_AMT', 'sum'),
}
refund = data.groupby('custid').agg(**refund_spec).reset_index()

feature = feature.merge(refund, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,6월최소구매액,7월최소구매액,8월최소구매액,9월최소구매액,10월최소구매액,11월최소구매액,12월최소구매액,환불건수,환불비율,환불금액
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,35100,0,157500,90000,0,0,86400,0,0.000000,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,-391050,400500,131400,208800,1416000,2,0.181818,1807050
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,4200,6800,-70870,14800,0,0,64600,2,0.066667,660820
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,49000,0,0,0,0,0,0,0.000000,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,66600,31500,21600,20335,44000,38700,58083,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,49400,0,0,0.000000,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0.000000,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,50350,0,0,0,0,0,0,0.000000,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0,0,0,0.000000,0


- **[할인횟수, 할인비율, 평균할인율]**

In [7]:
def _mean_discount_rate(rates):
    """Mean of the strictly positive entries; NaN when there are none."""
    return np.mean([rate for rate in rates if rate > 0])


# Discount features per customer: sum and mean of DISCOUNT, plus the mean of
# the positive DISCOUNT_PER values.
discount = data.groupby('custid').agg({
    'DISCOUNT': [('할인횟수', 'sum'), ('할인비율', 'mean')],
    'DISCOUNT_PER': [('평균할인율', _mean_discount_rate)],
})
discount.columns = discount.columns.droplevel(0)
# Customers without any positive discount rate get an average rate of 0.
discount = discount.fillna(0).reset_index()

feature = feature.merge(discount, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,9월최소구매액,10월최소구매액,11월최소구매액,12월최소구매액,환불건수,환불비율,환불금액,할인횟수,할인비율,평균할인율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,90000,0,0,86400,0,0.000000,0,11,1.000000,0.100000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,400500,131400,208800,1416000,2,0.181818,1807050,7,0.636364,0.100003
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,14800,0,0,64600,2,0.066667,660820,18,0.600000,0.051071
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0,2,0.500000,0.050000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,20335,44000,38700,58083,0,0.000000,0,22,0.687500,0.097727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,49400,0,0,0.000000,0,4,1.000000,0.050000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0,1,1.000000,0.050000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0,0,0,0.000000,0,2,1.000000,0.050000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0,1,0.250000,0.050000


- **[평균방문월, 평균구매월]**<br>
  방문월들의 평균, 구매액이 높았던 5개 데이터의 월평균을 feature로 생성한다.

In [8]:
# Mean visit month: average sales_month over all of a customer's rows.
month = data.groupby('custid')['sales_month'].mean().rename('평균방문월').reset_index()

# Mean purchase month: average sales_month over each customer's (up to) five
# rows with the largest positive net_amt.
positive_sales = data.query('net_amt > 0')
top_rows = positive_sales.groupby('custid')['net_amt'].agg(
    [('상위구매액', lambda x: x.sort_values().iloc[-5:].index.tolist())]
)
# Flatten the per-customer label lists into one list of row labels of `data`.
max_amt = [row_label for labels in top_rows['상위구매액'] for row_label in labels]
max_month = (
    data.loc[max_amt]
    .groupby('custid')['sales_month']
    .mean()
    .rename('평균구매월')
    .reset_index()
)
month = month.merge(max_month, on='custid', how='outer')

feature = feature.merge(month, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,11월최소구매액,12월최소구매액,환불건수,환불비율,환불금액,할인횟수,할인비율,평균할인율,평균방문월,평균구매월
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,86400,0,0.000000,0,11,1.000000,0.100000,8.636364,8.2
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,208800,1416000,2,0.181818,1807050,7,0.636364,0.100003,7.909091,7.8
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,64600,2,0.066667,660820,18,0.600000,0.051071,6.633333,5.6
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0,0.000000,0,2,0.500000,0.050000,7.000000,7.0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,38700,58083,0,0.000000,0,22,0.687500,0.097727,7.593750,6.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,49400,0,0,0.000000,0,4,1.000000,0.050000,8.500000,8.5
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0.000000,0,1,1.000000,0.050000,2.000000,2.0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0,0.000000,0,2,1.000000,0.050000,5.500000,5.5
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0.000000,0,1,0.250000,0.050000,1.000000,1.0


- **[내점일수, 최근구매일, 구매주기]**

In [9]:
# Latest sales day in the whole data set.  It is the same for every customer,
# so it is computed once here instead of once per group inside the lambda.
latest_sales_day = data.SALES_DAY.max()

# Per-customer visit features:
#   내점일수   - number of distinct sales days
#   최근구매일 - days between the customer's last sales day and the latest one overall
#   구매주기   - days between first and last sales day divided by the number of
#               distinct sales days, truncated to an integer
day = data.groupby('custid')['SALES_DAY'].agg([
    ('내점일수', 'nunique'),
    ('최근구매일', lambda x: (latest_sales_day - x.max()).days),
    ('구매주기', lambda x: int((x.max() - x.min()).days / x.nunique())),
]).reset_index()  # custid as a regular column, like the other feature tables

feature = pd.merge(feature, day, on='custid', how='outer')
feature
Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,환불비율,환불금액,할인횟수,할인비율,평균할인율,평균방문월,평균구매월,내점일수,최근구매일,구매주기
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.000000,0,11,1.000000,0.100000,8.636364,8.2,7,20,41
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.181818,1807050,7,0.636364,0.100003,7.909091,7.8,7,116,18
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.066667,660820,18,0.600000,0.051071,6.633333,5.6,13,20,25
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.000000,0,2,0.500000,0.050000,7.000000,7.0,2,273,1
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.000000,0,22,0.687500,0.097727,7.593750,6.6,21,38,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.000000,0,4,1.000000,0.050000,8.500000,8.5,2,117,28
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.000000,0,1,1.000000,0.050000,2.000000,2.0,1,67,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.000000,0,2,1.000000,0.050000,5.500000,5.5,2,17,142
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.000000,0,1,0.250000,0.050000,1.000000,1.0,1,88,0


- **[1일평균구매액, 1일평균구매건수, 1일1개]**<br>
  내점일 당 평균구매액, 평균 구매건수, 1개만 사간 날의 개수를 구한다.

In [10]:
# One row per (customer, sales day): summed net_amt and number of rows that day.
daily_groups = data.groupby(['custid', 'SALES_DAY'])
perday_data = daily_groups.agg({
    'net_amt': [('1일구매액', 'sum')],
    'str_nm': [('1일구매건수', 'size')],
})
# Keep only the feature names (second column level) and flatten the index.
perday_data.columns = perday_data.columns.droplevel(0)
perday_data = perday_data.reset_index()

In [11]:
# Collapse the per-day table to one row per customer: mean daily amount,
# mean daily number of rows, and the number of days with exactly one row.
perday = perday_data.groupby('custid').agg({
    '1일구매액': [('1일평균구매액', 'mean')],
    '1일구매건수': [
        ('1일평균구매건수', 'mean'),
        ('1일1개', lambda x: int((x == 1).sum())),
    ],
})
perday.columns = perday.columns.droplevel(0)
perday = perday.reset_index()

feature = feature.merge(perday, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,할인비율,평균할인율,평균방문월,평균구매월,내점일수,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,1.000000,0.100000,8.636364,8.2,7,20,41,223971.428571,1.571429,4
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.636364,0.100003,7.909091,7.8,7,116,18,499394.285714,1.571429,4
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.600000,0.051071,6.633333,5.6,13,20,25,169910.769231,2.307692,5
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.500000,0.050000,7.000000,7.0,2,273,1,511600.000000,2.000000,1
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.687500,0.097727,7.593750,6.6,21,38,15,223452.333333,1.523810,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,1.000000,0.050000,8.500000,8.5,2,117,28,300200.000000,2.000000,1
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,1.000000,0.050000,2.000000,2.0,1,67,0,202350.000000,1.000000,1
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,1.000000,0.050000,5.500000,5.5,2,17,142,104975.000000,1.000000,2
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.250000,0.050000,1.000000,1.0,1,88,0,139529.000000,4.000000,0


- **[복수방문비율]**

In [12]:
# Per-customer mean of the DUPLICATE_VISIT column.
duplicate = (
    data.groupby('custid')['DUPLICATE_VISIT']
    .mean()
    .rename('복수방문비율')
    .reset_index()
)

feature = feature.merge(duplicate, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,평균할인율,평균방문월,평균구매월,내점일수,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.100000,8.636364,8.2,7,20,41,223971.428571,1.571429,4,0.000000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.100003,7.909091,7.8,7,116,18,499394.285714,1.571429,4,0.000000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.051071,6.633333,5.6,13,20,25,169910.769231,2.307692,5,0.000000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.050000,7.000000,7.0,2,273,1,511600.000000,2.000000,1,0.000000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.097727,7.593750,6.6,21,38,15,223452.333333,1.523810,12,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.050000,8.500000,8.5,2,117,28,300200.000000,2.000000,1,0.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.050000,2.000000,2.0,1,67,0,202350.000000,1.000000,1,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.050000,5.500000,5.5,2,17,142,104975.000000,1.000000,2,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.050000,1.000000,1.0,1,88,0,139529.000000,4.000000,0,0.000000


- **[주말방문비율]**

In [13]:
# Per-customer mean of the SALES_WEEKEND column.
weekend = (
    data.groupby('custid')['SALES_WEEKEND']
    .mean()
    .rename('주말방문비율')
    .reset_index()
)

feature = feature.merge(weekend, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,평균방문월,평균구매월,내점일수,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,8.636364,8.2,7,20,41,223971.428571,1.571429,4,0.000000,0.636364
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,7.909091,7.8,7,116,18,499394.285714,1.571429,4,0.000000,0.363636
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,6.633333,5.6,13,20,25,169910.769231,2.307692,5,0.000000,0.366667
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,7.000000,7.0,2,273,1,511600.000000,2.000000,1,0.000000,0.250000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,7.593750,6.6,21,38,15,223452.333333,1.523810,12,0.000000,0.156250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,8.500000,8.5,2,117,28,300200.000000,2.000000,1,0.000000,0.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,2.000000,2.0,1,67,0,202350.000000,1.000000,1,0.000000,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,5.500000,5.5,2,17,142,104975.000000,1.000000,2,0.000000,0.500000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,1.000000,1.0,1,88,0,139529.000000,4.000000,0,0.000000,0.000000


- **[일상쇼핑비율]**

In [14]:
# Per-customer mean of the DAILY column.
daily = (
    data.groupby('custid')['DAILY']
    .mean()
    .rename('일상쇼핑비율')
    .reset_index()
)

feature = feature.merge(daily, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,평균구매월,내점일수,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,8.2,7,20,41,223971.428571,1.571429,4,0.000000,0.636364,0.000000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,7.8,7,116,18,499394.285714,1.571429,4,0.000000,0.363636,0.545455
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,5.6,13,20,25,169910.769231,2.307692,5,0.000000,0.366667,0.600000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,7.0,2,273,1,511600.000000,2.000000,1,0.000000,0.250000,0.750000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,6.6,21,38,15,223452.333333,1.523810,12,0.000000,0.156250,0.687500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,8.5,2,117,28,300200.000000,2.000000,1,0.000000,0.000000,1.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,2.0,1,67,0,202350.000000,1.000000,1,0.000000,0.000000,1.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,5.5,2,17,142,104975.000000,1.000000,2,0.000000,0.500000,0.500000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,1.0,1,88,0,139529.000000,4.000000,0,0.000000,0.000000,1.000000


- **[퇴근쇼핑비율]**

In [15]:
# Per-customer mean of the LEAVE_OFFICE column.
leave_office = (
    data.groupby('custid')['LEAVE_OFFICE']
    .mean()
    .rename('퇴근쇼핑비율')
    .reset_index()
)

feature = feature.merge(leave_office, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,내점일수,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율,퇴근쇼핑비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,7,20,41,223971.428571,1.571429,4,0.000000,0.636364,0.000000,0.000000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,7,116,18,499394.285714,1.571429,4,0.000000,0.363636,0.545455,0.545455
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,13,20,25,169910.769231,2.307692,5,0.000000,0.366667,0.600000,0.366667
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,2,273,1,511600.000000,2.000000,1,0.000000,0.250000,0.750000,0.750000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,21,38,15,223452.333333,1.523810,12,0.000000,0.156250,0.687500,0.562500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,2,117,28,300200.000000,2.000000,1,0.000000,0.000000,1.000000,0.750000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,1,67,0,202350.000000,1.000000,1,0.000000,0.000000,1.000000,1.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,2,17,142,104975.000000,1.000000,2,0.000000,0.500000,0.500000,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,1,88,0,139529.000000,4.000000,0,0.000000,0.000000,1.000000,0.750000


- **[구매결정시간]**<br>
  다음 상품을 구매하는 데 걸린 시간들의 평균을 구매를 결정하는 데 걸리는 시간으로 해석한다.

In [16]:
def _mean_positive_gap(gaps):
    """Mean of the strictly positive entries; NaN when there are none."""
    return np.mean([gap for gap in gaps if gap > 0])


# Per-customer mean of the positive NEXT_PURCHASE values.  Customers without
# any positive value keep NaN (no fill is applied here).
choose = (
    data.groupby('custid')['NEXT_PURCHASE']
    .agg([('구매결정시간', _mean_positive_gap)])
    .reset_index()
)

feature = feature.merge(choose, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,최근구매일,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율,퇴근쇼핑비율,구매결정시간
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,20,41,223971.428571,1.571429,4,0.000000,0.636364,0.000000,0.000000,34.500000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,116,18,499394.285714,1.571429,4,0.000000,0.363636,0.545455,0.545455,25.000000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,20,25,169910.769231,2.307692,5,0.000000,0.366667,0.600000,0.366667,21.352941
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,273,1,511600.000000,2.000000,1,0.000000,0.250000,0.750000,0.750000,23.500000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,38,15,223452.333333,1.523810,12,0.000000,0.156250,0.687500,0.562500,16.181818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,117,28,300200.000000,2.000000,1,0.000000,0.000000,1.000000,0.750000,10.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,67,0,202350.000000,1.000000,1,0.000000,0.000000,1.000000,1.000000,
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,17,142,104975.000000,1.000000,2,0.000000,0.500000,0.500000,0.000000,
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,88,0,139529.000000,4.000000,0,0.000000,0.000000,1.000000,0.750000,41.000000


- **[평균쇼핑시간]**<br>
  평균 매장 체류시간을 의미한다.

In [17]:
def _sum_positive_gap(gaps):
    """Sum of the strictly positive entries (0 when there are none)."""
    return np.sum([gap for gap in gaps if gap > 0])


# Step 1: per (customer, sales day), add up the positive NEXT_PURCHASE values.
daily_shopping = (
    data.groupby(['custid', 'SALES_DAY'])['NEXT_PURCHASE']
    .agg([('일별쇼핑시간', _sum_positive_gap)])
    .reset_index()
)
# Step 2: average those daily totals per customer.
shopping = (
    daily_shopping.groupby('custid')['일별쇼핑시간']
    .mean()
    .rename('평균쇼핑시간')
    .reset_index()
)

feature = feature.merge(shopping, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,구매주기,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율,퇴근쇼핑비율,구매결정시간,평균쇼핑시간
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,41,223971.428571,1.571429,4,0.000000,0.636364,0.000000,0.000000,34.500000,19.714286
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,18,499394.285714,1.571429,4,0.000000,0.363636,0.545455,0.545455,25.000000,7.142857
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,25,169910.769231,2.307692,5,0.000000,0.366667,0.600000,0.366667,21.352941,27.923077
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,1,511600.000000,2.000000,1,0.000000,0.250000,0.750000,0.750000,23.500000,23.500000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,15,223452.333333,1.523810,12,0.000000,0.156250,0.687500,0.562500,16.181818,8.476190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,28,300200.000000,2.000000,1,0.000000,0.000000,1.000000,0.750000,10.000000,10.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,202350.000000,1.000000,1,0.000000,0.000000,1.000000,1.000000,,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,142,104975.000000,1.000000,2,0.000000,0.500000,0.500000,0.000000,,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,139529.000000,4.000000,0,0.000000,0.000000,1.000000,0.750000,41.000000,123.000000


- **[평균방문분]**<br>
  방문시간을 분으로 환산해 표현한 데이터를 평균한 값을 의미한다.

In [18]:
# Per-customer mean of the SALES_MIN column.
mean_min = (
    data.groupby('custid')['SALES_MIN']
    .mean()
    .rename('평균방문분')
    .reset_index()
)

feature = feature.merge(mean_min, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,1일평균구매액,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율,퇴근쇼핑비율,구매결정시간,평균쇼핑시간,평균방문분
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,223971.428571,1.571429,4,0.000000,0.636364,0.000000,0.000000,34.500000,19.714286,1062.545455
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,499394.285714,1.571429,4,0.000000,0.363636,0.545455,0.545455,25.000000,7.142857,933.454545
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,169910.769231,2.307692,5,0.000000,0.366667,0.600000,0.366667,21.352941,27.923077,972.066667
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,511600.000000,2.000000,1,0.000000,0.250000,0.750000,0.750000,23.500000,23.500000,874.000000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,223452.333333,1.523810,12,0.000000,0.156250,0.687500,0.562500,16.181818,8.476190,949.437500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,300200.000000,2.000000,1,0.000000,0.000000,1.000000,0.750000,10.000000,10.000000,832.500000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,202350.000000,1.000000,1,0.000000,0.000000,1.000000,1.000000,,0.000000,903.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,104975.000000,1.000000,2,0.000000,0.500000,0.500000,0.000000,,0.000000,1096.500000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,139529.000000,4.000000,0,0.000000,0.000000,1.000000,0.750000,41.000000,123.000000,1050.750000


- **[주구매지점]**

In [19]:
def _most_common_store(stores):
    """First label of value_counts(), i.e. the most frequent value."""
    return stores.value_counts().index[0]


# Most frequent str_nm value per customer.
major_str = (
    data.groupby('custid')['str_nm']
    .agg([('주구매지점', _most_common_store)])
    .reset_index()
)

feature = feature.merge(major_str, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,1일평균구매건수,1일1개,복수방문비율,주말방문비율,일상쇼핑비율,퇴근쇼핑비율,구매결정시간,평균쇼핑시간,평균방문분,주구매지점
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,1.571429,4,0.000000,0.636364,0.000000,0.000000,34.500000,19.714286,1062.545455,본점
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,1.571429,4,0.000000,0.363636,0.545455,0.545455,25.000000,7.142857,933.454545,무역점
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,2.307692,5,0.000000,0.366667,0.600000,0.366667,21.352941,27.923077,972.066667,천호점
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,2.000000,1,0.000000,0.250000,0.750000,0.750000,23.500000,23.500000,874.000000,무역점
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,1.523810,12,0.000000,0.156250,0.687500,0.562500,16.181818,8.476190,949.437500,본점
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,2.000000,1,0.000000,0.000000,1.000000,0.750000,10.000000,10.000000,832.500000,본점
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,1.000000,1,0.000000,0.000000,1.000000,1.000000,,0.000000,903.000000,무역점
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,1.000000,2,0.000000,0.500000,0.500000,0.000000,,0.000000,1096.500000,본점
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,4.000000,0,0.000000,0.000000,1.000000,0.750000,41.000000,123.000000,1050.750000,신촌점


- **[구매종류수]**<br>
  고객별 구매한 'goodcd','brd_nm','corner_nm','pc_nm','part_nm','team_nm','buyer_nm'의 종류수를 의미한다.

In [20]:
# Number of distinct values per customer for each categorical column.
# The built-in groupby reducer replaces a Python-level apply() that called
# DataFrame.nunique() once per customer group.
category_columns = ['goodcd', 'brd_nm', 'corner_nm', 'pc_nm', 'part_nm', 'team_nm', 'buyer_nm']
nunique = data.groupby('custid')[category_columns].nunique().reset_index()
# Output names follow the order of category_columns.
nunique.columns = ['custid'] + [f'{i}종류수' for i in ['상품', '브랜드', '코너', '상품군', '파트', '팀', '바이어']]

feature = pd.merge(feature, nunique, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,평균쇼핑시간,평균방문분,주구매지점,상품종류수,브랜드종류수,코너종류수,상품군종류수,파트종류수,팀종류수,바이어종류수
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,19.714286,1062.545455,본점,7,7,4,4,4,2,4
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,7.142857,933.454545,무역점,7,7,7,7,6,2,7
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,27.923077,972.066667,천호점,22,21,18,14,8,3,14
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,23.500000,874.000000,무역점,4,4,4,3,3,2,3
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,8.476190,949.437500,본점,22,21,21,16,9,3,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,10.000000,832.500000,본점,4,4,4,4,2,2,4
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.000000,903.000000,무역점,1,1,1,1,1,1,1
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.000000,1096.500000,본점,2,2,2,2,2,2,2
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,123.000000,1050.750000,신촌점,3,3,4,4,4,2,3


- **[주구매상품, 주구매상품수]**

In [21]:
vc_column = ['goodcd', 'brd_nm', 'corner_nm', 'pc_nm', 'part_nm', 'team_nm', 'buyer_nm']
format_column = ['상품', '브랜드', '코너', '상품군', '파트', '팀', '바이어']

# For every categorical column: the customer's most frequent value and how
# often it occurs.  Each result stays indexed by custid and the table is
# assembled by index, so a value can never be attached to the wrong customer.
# (Previously the sorted groupby output was pasted positionally onto
# data.custid.unique(), which is only correct when `data` is sorted by custid.)
by_customer = data.groupby('custid')
major_columns = {}
for source_col, label in zip(vc_column, format_column):
    major_columns[f'주{label}'] = by_customer[source_col].agg(lambda x: x.value_counts().index[0])
    major_columns[f'주{label}수'] = by_customer[source_col].agg(lambda x: x.value_counts().iloc[0])

major = pd.DataFrame(major_columns).rename_axis('custid').reset_index()

feature = pd.merge(feature, major, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,주코너,주코너수,주상품군,주상품군수,주파트,주파트수,주팀,주팀수,주바이어,주바이어수
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,수입종합화장품,8,화장품,8,잡화,6,잡화가용팀,9,화장품,8
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,트래디셔널,3,트레디셔널,3,가정용품,4,잡화가용팀,6,트래디셔널캐주얼,3
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,골프(LC),4,영트랜디,5,스포츠캐주얼,9,잡화가용팀,14,스포츠,8
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,수입종합화장품,1,캐릭터캐주얼,2,여성캐주얼,2,의류패션팀,3,캐릭터캐주얼,2
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,트래디셔널,4,트레디셔널,4,남성정장스포츠,8,잡화가용팀,17,스포츠,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,아동,1,아동복,1,"케주얼,구두,아동",2,의류패션팀,2,유아동복,1
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,수입종합화장품,1,화장품,1,명품잡화,1,잡화가용팀,1,화장품,1
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,수입종합화장품,1,화장품,1,잡화,1,잡화가용팀,1,화장품,1
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,차류,1,일반식품,1,공산품,1,식품팀,2,일반식품,2


- **[평균구매비율]**<br>
  고객별 구매한 'goodcd','brd_nm','corner_nm','pc_nm','part_nm','team_nm','buyer_nm'의 비율들의 평균을 의미한다.

In [22]:
# Source columns and the labels used in the generated feature names
# (defined once, outside the loop).
column = ['goodcd', 'brd_nm', 'corner_nm', 'pc_nm', 'part_nm', 'team_nm', 'buyer_nm']
format_column = ['상품', '브랜드', '코너', '상품군', '파트', '팀', '바이어']

# For every categorical column: the mean, over the categories a customer bought,
# of that category's share of the customer's rows.
# NOTE(review): when no key is missing this looks equal to 1 / (number of
# distinct categories); confirm before replacing the wide pivot with a cheaper
# computation.
mean_per_columns = {}
for source_col, label in zip(column, format_column):
    ease = pd.pivot_table(data, index='custid', columns=source_col, values='str_nm',
                          aggfunc=np.size, margins=True)
    # Row-wise shares; the trailing 'All' margin row and column are dropped.
    share = ease.divide(ease['All'], axis=0).iloc[:-1, :-1]
    mean_share = share.mean(axis=1)
    # The 'All' margin label turns the pivot index into object dtype; restore
    # the custid dtype so the merge key keeps its original type.
    mean_share.index = mean_share.index.astype(data['custid'].dtype)
    # Stay indexed by custid so values are aligned by customer, not by position
    # (positional assignment onto data.custid.unique() is only correct when
    # `data` is sorted by custid and no customer is missing from the pivot).
    mean_per_columns[f'평균{label}비율'] = mean_share

mean_per = pd.DataFrame(mean_per_columns).rename_axis('custid').reset_index()

feature = pd.merge(feature, mean_per, on='custid', how='outer')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,주팀수,주바이어,주바이어수,평균상품비율,평균브랜드비율,평균코너비율,평균상품군비율,평균파트비율,평균팀비율,평균바이어비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,9,화장품,8,0.142857,0.142857,0.250000,0.250000,0.250000,0.500000,0.250000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,6,트래디셔널캐주얼,3,0.142857,0.142857,0.142857,0.142857,0.166667,0.500000,0.142857
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,14,스포츠,8,0.045455,0.047619,0.055556,0.071429,0.125000,0.333333,0.071429
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,3,캐릭터캐주얼,2,0.250000,0.250000,0.250000,0.333333,0.333333,0.500000,0.333333
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,17,스포츠,6,0.045455,0.047619,0.047619,0.062500,0.111111,0.333333,0.090909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,2,유아동복,1,0.250000,0.250000,0.250000,0.250000,0.500000,0.500000,0.250000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,1,화장품,1,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,1,화장품,1,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,2,일반식품,2,0.333333,0.333333,0.250000,0.250000,0.250000,0.500000,0.333333


- **[goodcd구매정보]**<br>
  EDA에서 확인한 group간 차이있는 상품 정보를 만든다.

In [23]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-product features are merged.
goodcd = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [24]:
# For each product code in goodcdset, summarise every customer's purchases of that product:
# the mean net amount plus the most frequent store / brand / corner / product group / part / buyer.
goodcd_modes = [('str_nm', '주구매매장'), ('brd_nm', '주브랜드'), ('corner_nm', '주코너'),
                ('pc_nm', '주상품군'), ('part_nm', '주파트'), ('buyer_nm', '주바이어')]

for code in goodcdset:
    spec = {'net_amt': [(f'{code}평균액', np.mean)]}
    for source_col, suffix in goodcd_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{code}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['goodcd'] == code].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    goodcd = goodcd.merge(per_customer.reset_index(), on='custid', how='outer')

In [25]:
# Customers who never bought a given product get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
goodcd = goodcd.fillna(value=0)

# Attach the product features to the running feature table.
feature = feature.merge(goodcd, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,4405580026000주상품군,4405580026000주파트,4405580026000주바이어,4135140006900평균액,4135140006900주구매매장,4135140006900주브랜드,4135140006900주코너,4135140006900주상품군,4135140006900주파트,4135140006900주바이어
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0,0.0,0,0,0,0,0,0
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0,0.0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0.0,0,0,0,0,0,0


- **[{group}가중치비율]**<br>
  고객의 brand 가중치 합 중 각 group별 합이 차지하는 비율을 의미한다.

In [26]:
# Per-customer share of each WEIGHT column in the customer's WEIGHT_SUM.
weight_cols = [name for name in data.columns if 'WEIGHT' in name]
weight = data[['custid', *weight_cols]].groupby('custid').sum()

# Divide every column by WEIGHT_SUM and drop the last column (presumably WEIGHT_SUM itself — verify column order).
weight = weight.divide(weight['WEIGHT_SUM'], axis=0).iloc[:, :-1]
# NaN appears for customers whose weights are all 0; treat their shares as 0.
weight = weight.fillna(0).reset_index()
# Name each share after the last three characters of its source column.
weight.columns = ['custid', *(f'{name[-3:]}가중치비율' for name in weight.columns[1:])]

feature = feature.merge(weight, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,4135140006900주파트,4135140006900주바이어,F20가중치비율,F30가중치비율,F40가중치비율,F50가중치비율,M20가중치비율,M30가중치비율,M40가중치비율,M50가중치비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0.143210,0.138272,0.120988,0.103704,0.133333,0.116049,0.108642,0.135802
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0.210526,0.157895,0.105263,0.052632,0.157895,0.131579,0.052632,0.131579
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0.125000,0.097222,0.138889,0.111111,0.138889,0.111111,0.138889,0.138889
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0.000000,0.142857,0.095238,0.285714,0.000000,0.047619,0.238095,0.190476
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0.104348,0.089130,0.150000,0.093478,0.158696,0.113043,0.143478,0.147826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.159091,0.181818,0.113636,0.068182,0.159091,0.159091,0.090909,0.068182
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0.210526,0.157895,0.105263,0.052632,0.157895,0.131579,0.052632,0.131579
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.086207,0.155172,0.137931,0.172414,0.068966,0.103448,0.137931,0.137931
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


- **[{group}상품비율]**<br>
  group별 선호 brand상품의 각 구매비율을 구한다.

In [27]:
# Per-customer mean of every column whose name contains 'FAVOR'.
favor_cols = [name for name in data.columns if 'FAVOR' in name]
favor = data[['custid', *favor_cols]].groupby('custid').mean()
favor = favor.reset_index()
# Name each feature after the last three characters of its source column.
favor.columns = ['custid'] + [name[-3:] + '상품비율' for name in favor.columns[1:]]

feature = feature.merge(favor, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,M40가중치비율,M50가중치비율,F20상품비율,F30상품비율,F40상품비율,F50상품비율,M20상품비율,M30상품비율,M40상품비율,M50상품비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.108642,0.135802,0.727273,0.727273,0.818182,0.818182,0.727273,0.818182,0.818182,0.818182
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.052632,0.131579,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.138889,0.138889,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.238095,0.190476,0.000000,0.250000,0.250000,0.250000,0.000000,0.250000,0.250000,0.250000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.143478,0.147826,0.281250,0.156250,0.281250,0.281250,0.281250,0.343750,0.281250,0.281250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.090909,0.068182,0.250000,0.250000,0.250000,0.250000,0.250000,0.250000,0.250000,0.250000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.052632,0.131579,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.137931,0.137931,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


- **[브랜드정보]**<br>
  EDA에서 확인한 group별 상이한 brand 정보를 생성한다.

In [28]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-brand features are merged.
brd_nm = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [29]:
# For each brand in brd_nmset, summarise every customer's purchases of that brand:
# the mean net amount plus the most frequent store / product / corner / product group / part / team / buyer.
brand_modes = [('str_nm', '주구매매장'), ('goodcd', '주상품'), ('corner_nm', '주코너'), ('pc_nm', '주상품군'),
               ('part_nm', '주파트'), ('team_nm', '주팀'), ('buyer_nm', '주바이어')]

for brand in brd_nmset:
    spec = {'net_amt': [(f'{brand}평균액', np.mean)]}
    for source_col, suffix in brand_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{brand}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['brd_nm'] == brand].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    brd_nm = brd_nm.merge(per_customer.reset_index(), on='custid', how='outer')

In [30]:
# Customers who never bought a given brand get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
brd_nm = brd_nm.fillna(value=0)

# Attach the brand features to the running feature table.
feature = feature.merge(brd_nm, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,리바이스주팀,리바이스주바이어,비오뗌평균액,비오뗌주구매매장,비오뗌주상품,비오뗌주코너,비오뗌주상품군,비오뗌주파트,비오뗌주팀,비오뗌주바이어
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0.0,0,0.0,0,0,0,0,0
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0.0,0,0.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.0,0,0,0,0,0


- **[코너구매정보]**<br>
  EDA에서 group간 차이가 큰 corner별로 고객의 평균구매액과 주구매매장·주상품·주브랜드 등 구매정보를 생성한다.

In [31]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-corner features are merged.
corner_nm = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [32]:
# For each corner in corner_nmset, summarise every customer's purchases in that corner:
# the mean net amount plus the most frequent store / product / brand / product group / part / team / buyer.
corner_modes = [('str_nm', '주구매매장'), ('goodcd', '주상품'), ('brd_nm', '주브랜드'), ('pc_nm', '주상품군'),
                ('part_nm', '주파트'), ('team_nm', '주팀'), ('buyer_nm', '주바이어')]

for corner in corner_nmset:
    spec = {'net_amt': [(f'{corner}평균액', np.mean)]}
    for source_col, suffix in corner_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{corner}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['corner_nm'] == corner].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    corner_nm = corner_nm.merge(per_customer.reset_index(), on='custid', how='outer')

In [33]:
# Customers with no purchase in a given corner get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
corner_nm = corner_nm.fillna(value=0)

# Attach the corner features to the running feature table.
feature = feature.merge(corner_nm, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,수입의류주팀,수입의류주바이어,유아복평균액,유아복주구매매장,유아복주상품,유아복주브랜드,유아복주상품군,유아복주파트,유아복주팀,유아복주바이어
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,잡화가용팀,수입명품,0.0,0,0.000000e+00,0,0,0,0,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,190210.0,천호점,4.400213e+12,압소바,유아/완구,아동,잡화가용팀,유아동복
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,21600.0,본점,4.400202e+12,쇼콜라,아동복,"케주얼,구두,아동",의류패션팀,유아동복
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0


- **[상품군구매정보]**<br>
  EDA에서 group간 차이가 큰 pc별로 고객의 평균구매액과 주구매매장·주상품·주브랜드 등 구매정보를 생성한다.

In [34]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-product-group features are merged.
pc_nm = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [35]:
# For each product group in pc_nmset, summarise every customer's purchases in that group:
# the mean net amount plus the most frequent store / product / brand / corner / part / team / buyer.
pc_modes = [('str_nm', '주구매매장'), ('goodcd', '주상품'), ('brd_nm', '주브랜드'), ('corner_nm', '주코너'),
            ('part_nm', '주파트'), ('team_nm', '주팀'), ('buyer_nm', '주바이어')]

for pc in pc_nmset:
    spec = {'net_amt': [(f'{pc}평균액', np.mean)]}
    for source_col, suffix in pc_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{pc}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['pc_nm'] == pc].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    pc_nm = pc_nm.merge(per_customer.reset_index(), on='custid', how='outer')

In [36]:
# Customers with no purchase in a given product group get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
pc_nm = pc_nm.fillna(value=0)

# Attach the product-group features to the running feature table.
feature = feature.merge(pc_nm, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,수산물주팀,수산물주바이어,정장평균액,정장주구매매장,정장주상품,정장주브랜드,정장주코너,정장주파트,정장주팀,정장주바이어
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,418875.0,본점,6.300010e+12,마에스트로,내셔널,남성정장스포츠,잡화가용팀,정장셔츠
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0


- **[파트구매정보]**<br>
  EDA에서 group간 차이가 큰 part별로 고객의 평균구매액과 주구매매장·주상품·주브랜드 등 구매정보를 생성한다.

In [37]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-part features are merged.
part_nm = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [38]:
# For each part in part_nmset, summarise every customer's purchases in that part:
# the mean net amount plus the most frequent store / product / brand / corner / product group / team / buyer.
part_modes = [('str_nm', '주구매매장'), ('goodcd', '주상품'), ('brd_nm', '주브랜드'), ('corner_nm', '주코너'),
              ('pc_nm', '주상품군'), ('team_nm', '주팀'), ('buyer_nm', '주바이어')]

for part in part_nmset:
    spec = {'net_amt': [(f'{part}평균액', np.mean)]}
    for source_col, suffix in part_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{part}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['part_nm'] == part].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    part_nm = part_nm.merge(per_customer.reset_index(), on='custid', how='outer')

In [39]:
# Customers with no purchase in a given part get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
part_nm = part_nm.fillna(value=0)

# Attach the part features to the running feature table.
feature = feature.merge(part_nm, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,영캐릭터주팀,영캐릭터주바이어,명품잡화평균액,명품잡화주구매매장,명품잡화주상품,명품잡화주브랜드,명품잡화주코너,명품잡화주상품군,명품잡화주팀,명품잡화주바이어
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,68700.0,무역점,2.116050e+12,에스티로더,수입종합화장품,화장품,잡화가용팀,화장품
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,121125.0,무역점,4.234960e+12,휠라인티모,란제리,란제리,잡화가용팀,니트단품
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,57000.0,무역점,4.125440e+12,시슬리,수입종합화장품,화장품,잡화가용팀,화장품
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,202350.0,무역점,2.116052e+12,크리니크,수입종합화장품,화장품,잡화가용팀,화장품
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0


- **[바이어구매정보]**<br>
  EDA에서 group간 차이가 큰 buyer별로 고객의 평균구매액과 주구매매장·주상품·주브랜드 등 구매정보를 생성한다.

In [40]:
# Customer-id skeleton (train ids followed by test ids) onto which the per-buyer features are merged.
buyer_nm = pd.DataFrame({'custid': [*train['custid'].unique(), *test['custid'].unique()]})

In [41]:
# For each buyer in buyer_nmset, summarise every customer's purchases under that buyer:
# the mean net amount plus the most frequent store / product / brand / corner / product group / team / part.
buyer_modes = [('str_nm', '주구매매장'), ('goodcd', '주상품'), ('brd_nm', '주브랜드'), ('corner_nm', '주코너'),
               ('pc_nm', '주상품군'), ('team_nm', '주팀'), ('part_nm', '주파트')]

for buyer in buyer_nmset:
    spec = {'net_amt': [(f'{buyer}평균액', np.mean)]}
    for source_col, suffix in buyer_modes:
        # value_counts() sorts by frequency, so its first index entry is the most frequent value.
        spec[source_col] = [(f'{buyer}{suffix}', lambda values: values.value_counts().index[0])]

    per_customer = data[data['buyer_nm'] == buyer].groupby('custid').agg(spec)
    # agg() yields (source column, feature name) column pairs; keep only the feature names.
    per_customer.columns = per_customer.columns.get_level_values(1)
    buyer_nm = buyer_nm.merge(per_customer.reset_index(), on='custid', how='outer')

In [42]:
# Customers with no purchase under a given buyer get 0 in all of its columns
# (this also puts the integer 0 into the text-valued columns).
buyer_nm = buyer_nm.fillna(value=0)

# Attach the buyer features to the running feature table.
# NOTE(review): the recorded output contains columns such as 가전주팀_y, so at least one buyer
# name apparently repeats a name already used by an earlier group; pandas then appends
# _x/_y to the clashing column names. Confirm this is intended before relying on them.
feature = feature.merge(buyer_nm, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,가전주팀_y,가전주파트_y,침구수예평균액,침구수예주구매매장,침구수예주상품,침구수예주브랜드,침구수예주코너,침구수예주상품군,침구수예주팀,침구수예주파트
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,잡화가용팀,가정용품,1000800.0,무역점,4.522862e+12,레노마,브랜드침구,수예,잡화가용팀,가정용품
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,잡화가용팀,가정용품,0.0,0,0.000000e+00,0,0,0,0,0
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0.0,0,0.000000e+00,0,0,0,0,0


- **[남성상품구매비율, 여성상품구매비율, 유아용품구매비율, 아동용품구매비율]**

In [43]:
# Per-customer mean of the MAN / WOMAN / BABY / CHILD columns
# (assumes these are 0/1 indicators, so the mean is a purchase share — TODO confirm).
ratio_names = {'MAN': '남성상품구매비율', 'WOMAN': '여성상품구매비율',
               'BABY': '유아용품구매비율', 'CHILD': '아동용품구매비율'}
ratio_spec = {source_col: [(label, np.mean)] for source_col, label in ratio_names.items()}

product_prob = data.groupby('custid').agg(ratio_spec)
# Drop the source-column level so only the feature names remain as column labels.
product_prob.columns = product_prob.columns.droplevel(0)
product_prob = product_prob.reset_index()

feature = feature.merge(product_prob, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,침구수예주상품,침구수예주브랜드,침구수예주코너,침구수예주상품군,침구수예주팀,침구수예주파트,남성상품구매비율,여성상품구매비율,유아용품구매비율,아동용품구매비율
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.000000e+00,0,0,0,0,0,0.090909,0.000000,0.000000,0.000000
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,4.522862e+12,레노마,브랜드침구,수예,잡화가용팀,가정용품,0.090909,0.000000,0.090909,0.000000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.000000e+00,0,0,0,0,0,0.100000,0.066667,0.000000,0.033333
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.000000e+00,0,0,0,0,0,0.000000,0.750000,0.000000,0.000000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.000000e+00,0,0,0,0,0,0.281250,0.093750,0.062500,0.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.000000e+00,0,0,0,0,0,0.000000,0.000000,0.000000,0.250000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.000000e+00,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.000000e+00,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.000000e+00,0,0,0,0,0,0.000000,0.000000,0.000000,0.250000


- **[화장품구매주기]**<br>
  『고아라.(2020).성인여성의 연령대에 따른 차이연령, 소비가치, 화장행동과 화장품 구매 행동 비교 연구, 한국디자인문화학회지,26(1),15-39.』를 근거로<br>
   화장품 구매주기를 Feature로 생성한다.

In [44]:
def _cosmetic_cycle(days):
    """Days between the first and last purchase day, divided by the number of distinct days and by 30."""
    # NOTE(review): the divisor is the number of distinct purchase days, not the number of gaps between them.
    return (days.max() - days.min()).days / days.nunique() / 30


# Cosmetic purchase cycle per customer, computed over rows flagged COSMETIC == 1.
cosmetic_days = data[data['COSMETIC'] == 1].groupby('custid')['SALES_DAY']
cosmetic = cosmetic_days.agg([('화장품구매주기', _cosmetic_cycle)]).reset_index()

# Customers without cosmetic purchases get 0; note that fillna(0) is applied to the whole feature table.
feature = feature.merge(cosmetic, how='outer', on='custid').fillna(0)
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,침구수예주브랜드,침구수예주코너,침구수예주상품군,침구수예주팀,침구수예주파트,남성상품구매비율,여성상품구매비율,유아용품구매비율,아동용품구매비율,화장품구매주기
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0,0,0,0,0,0.090909,0.000000,0.000000,0.000000,1.371429
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,레노마,브랜드침구,수예,잡화가용팀,가정용품,0.090909,0.000000,0.090909,0.000000,0.000000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0,0,0,0,0,0.100000,0.066667,0.000000,0.033333,0.000000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0.750000,0.000000,0.000000,0.000000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0,0,0,0,0,0.281250,0.093750,0.062500,0.062500,1.041667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0.000000,0.000000,0.250000,0.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0,0,0,0,0,0.000000,0.000000,0.000000,0.250000,0.000000


- **[수입상품비율, 국내상품액, 수입상품액, 국내상품평균액, 수입상품평균액]**<br>

In [45]:
# Share of imported items among each customer's purchases (mean of import_flg).
Import = data.groupby('custid')['import_flg'].agg([('수입상품비율', 'mean')]).reset_index()

# Net amount per customer split by import_flg (column 0 -> domestic, column 1 -> imported).
# Each pivot is built once and reused for both columns; the string aliases 'sum' / 'mean'
# replace the builtin/NumPy callables, which recent pandas versions deprecate as aggfuncs.
amount_sum = pd.pivot_table(data, index='custid', columns='import_flg', values='net_amt', aggfunc='sum', fill_value=0)
amount_mean = pd.pivot_table(data, index='custid', columns='import_flg', values='net_amt', aggfunc='mean', fill_value=0)

# Look values up by custid rather than by row position, so a customer missing from a
# pivot (e.g. only NaN values) cannot shift the remaining rows onto the wrong customers.
# Total amount
Import['국내상품액'] = Import['custid'].map(amount_sum[0])
Import['수입상품액'] = Import['custid'].map(amount_sum[1])

# Mean amount
Import['국내상품평균액'] = Import['custid'].map(amount_mean[0])
Import['수입상품평균액'] = Import['custid'].map(amount_mean[1])

# fillna(0) is applied to the whole feature table after the merge.
feature = pd.merge(feature, Import, on='custid', how='outer').fillna(0)
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,남성상품구매비율,여성상품구매비율,유아용품구매비율,아동용품구매비율,화장품구매주기,수입상품비율,국내상품액,수입상품액,국내상품평균액,수입상품평균액
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.090909,0.000000,0.000000,0.000000,1.371429,0.636364,562500,1005300,140625.000000,143614.285714
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.090909,0.000000,0.090909,0.000000,0.000000,0.090909,3364360,131400,336436.000000,131400.000000
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.100000,0.066667,0.000000,0.033333,0.000000,0.000000,2208840,0,73628.000000,0.000000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.000000,0.750000,0.000000,0.000000,0.000000,0.250000,966200,57000,322066.666667,57000.000000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.281250,0.093750,0.062500,0.062500,1.041667,0.187500,3330969,1361530,128114.192308,226921.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.250000,0.000000,0.250000,507300,93100,169100.000000,93100.000000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0,202350,0.000000,202350.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.500000,159600,50350,159600.000000,50350.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,139529,0,34882.250000,0.000000


- **[무이자카드이용률]**

In [46]:
# Per-customer mean of WITHOUT_INST (interest-free card usage rate).
zero_instcard = (data.groupby('custid')['WITHOUT_INST']
                     .mean()
                     .to_frame('무이자카드이용률')
                     .reset_index())

feature = feature.merge(zero_instcard, how='outer', on='custid')
feature

Unnamed: 0,custid,실제구매액,구매건수,평균구매액,최대구매액,1월평균구매액,2월평균구매액,3월평균구매액,4월평균구매액,5월평균구매액,...,여성상품구매비율,유아용품구매비율,아동용품구매비율,화장품구매주기,수입상품비율,국내상품액,수입상품액,국내상품평균액,수입상품평균액,무이자카드이용률
0,0,1567800,11,142527.0,409500,0.00,0.0,0.000000,264600.0,0.0,...,0.000000,0.000000,0.000000,1.371429,0.636364,562500,1005300,140625.000000,143614.285714,0.909091
1,2,3495760,11,317796.0,1416000,-108000.00,0.0,0.000000,0.0,0.0,...,0.000000,0.090909,0.000000,0.000000,0.090909,3364360,131400,336436.000000,131400.000000,0.272727
2,3,2208840,30,73628.0,589950,69275.00,0.0,410083.333333,-93575.0,85500.0,...,0.066667,0.000000,0.033333,0.000000,0.000000,2208840,0,73628.000000,0.000000,0.600000
3,4,1023200,4,255800.0,560000,0.00,0.0,0.000000,0.0,0.0,...,0.750000,0.000000,0.000000,0.000000,0.250000,966200,57000,322066.666667,57000.000000,0.250000
4,5,4692499,32,146641.0,613800,0.00,0.0,185050.000000,0.0,121050.0,...,0.093750,0.062500,0.062500,1.041667,0.187500,3330969,1361530,128114.192308,226921.666667,0.437500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,600400,4,150100.0,401850,56050.00,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.250000,0.000000,0.250000,507300,93100,169100.000000,93100.000000,0.750000
35963,49990,202350,1,202350.0,202350,0.00,202350.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,1.000000,0,202350,0.000000,202350.000000,0.000000
35964,49992,209950,2,104975.0,159600,0.00,0.0,0.000000,159600.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.500000,159600,50350,159600.000000,50350.000000,0.000000
35965,49993,139529,4,34882.0,66500,34882.25,0.0,0.000000,0.0,0.0,...,0.000000,0.000000,0.250000,0.000000,0.000000,139529,0,34882.250000,0.000000,0.000000


### $~~~$ Confirm data

In [47]:
# No missing values remain: count NaNs per column, then tabulate how often each count occurs.
nan_per_column = feature.isna().sum().reset_index()
nan_per_column.iloc[:, -1].value_counts()

0    986
Name: 0, dtype: int64

In [48]:
# The object-dtype columns shown below still need category encoding.
feature.select_dtypes(include=['object'])

Unnamed: 0,주구매지점,주브랜드,주코너,주상품군,주파트,주팀,주바이어,2700000000000주구매매장,2700000000000주브랜드,2700000000000주코너,...,가전주코너_y,가전주상품군,가전주팀_y,가전주파트_y,침구수예주구매매장,침구수예주브랜드,침구수예주코너,침구수예주상품군,침구수예주팀,침구수예주파트
0,본점,크리니크,수입종합화장품,화장품,잡화,잡화가용팀,화장품,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,무역점,노티카,트래디셔널,트레디셔널,가정용품,잡화가용팀,트래디셔널캐주얼,0,0,0,...,가전특정,가전,잡화가용팀,가정용품,무역점,레노마,브랜드침구,수예,잡화가용팀,가정용품
2,천호점,라코스테,골프(LC),영트랜디,스포츠캐주얼,잡화가용팀,스포츠,천호점,식품,즉석조리,...,취사소형,가전/문화,잡화가용팀,가정용품,0,0,0,0,0,0
3,무역점,시슬리,수입종합화장품,캐릭터캐주얼,여성캐주얼,의류패션팀,캐릭터캐주얼,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,본점,식품,트래디셔널,트레디셔널,남성정장스포츠,잡화가용팀,스포츠,본점,식품,야채,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,본점,미키클럽,아동,아동복,"케주얼,구두,아동",의류패션팀,유아동복,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35963,무역점,크리니크,수입종합화장품,화장품,명품잡화,잡화가용팀,화장품,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35964,본점,에스티로더,수입종합화장품,화장품,잡화,잡화가용팀,화장품,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35965,신촌점,식품,차류,일반식품,공산품,식품팀,일반식품,신촌점,식품,차류,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Special characters in the column names have to be handled: strip every comma.
feature.columns = [''.join(str(name).split(',')) for name in feature.columns]

### $~~~~$ Feature Correlation

In [50]:
# There are too many features to inspect the correlations easily, so the heatmap below is left disabled.
# plt.figure(figsize=(30,30))
# sns.heatmap(feature.iloc[:,1:].corr(), cmap="YlGnBu", vmin=-1, vmax=1)
# plt.show()

### $~~~$ Split train, test data

In [51]:
# Split the combined feature table back into train and test rows by customer id.
in_train = feature['custid'].isin(train_ID)
in_test = feature['custid'].isin(test_ID)

feature_train = feature[in_train].reset_index(drop=True)
feature_test = feature[in_test].reset_index(drop=True)

### $~~~$ Save train, test data

In [52]:
# Write both splits as cp949-encoded CSV files (without the index) under `path`.
for split_frame, file_name in ((feature_train, '/feature_train.csv'), (feature_test, '/feature_test.csv')):
    split_frame.to_csv(path + file_name, encoding='cp949', index=False)