## Import

In [1]:
# for read data
import os

# for feature generation
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings("ignore")

# for feature decompose
from sklearn.decomposition import PCA 

### Read Data

In [2]:
# Absolute path of the directory that holds the input CSV files.
path = os.path.abspath("./input")

# Transaction tables for train and test; both are read as cp949 text with
# the two sales-date columns parsed as datetimes.
train = pd.read_csv(
    path + '/X_train_DC.csv',
    encoding='cp949',
    parse_dates=['SALES_DATE', 'SALES_DAY'],
)
test = pd.read_csv(
    path + '/X_test_DC.csv',
    encoding='cp949',
    parse_dates=['SALES_DATE', 'SALES_DAY'],
)

# Only the `group` column of the label file is kept.
y_train = pd.read_csv(path + '/y_train.csv')['group']

In [3]:
# Stack train and test into a single table with a fresh 0..n-1 index.
data = pd.concat([train, test], ignore_index=True)

# Unique customer IDs of each split, in order of first appearance. They are
# used later to separate the features built from train rows from those
# built from test rows.
train_ID = train.custid.unique()
test_ID = test.custid.unique()

# Skeleton feature table: one custid column, train IDs followed by test IDs.
feature = pd.DataFrame({'custid': train_ID.tolist() + test_ID.tolist()})
feature

Unnamed: 0,custid
0,0
1,2
2,3
3,4
4,5
...,...
35962,49988
35963,49990
35964,49992
35965,49993


## Bag of Words
Category feature를 자체적으로 처리하는 Catboost에만 적용할 feature를 생성한다.

- **[goodcd]**<br>
  고객이 구매한 적이 있는 상품이면 1, 구매한 적이 없는 상품이면 0으로 나타낸 데이터이다.

In [4]:
# Customer x product-code indicator table. Duplicated (custid, goodcd) pairs
# are dropped first, so each pivot cell counts a single row (1) when the
# customer has that product and is filled with 0 otherwise.
OH_good = (
    data.drop_duplicates(['custid', 'goodcd'])
    .pivot_table(index='custid', columns='goodcd', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every product column so it stays distinguishable after merging.
OH_good.columns = ['custid'] + [f'상품_{code}' for code in OH_good.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_good, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,상품_6513511030771,상품_6519460031000,상품_6528820030700,상품_6528820030770,상품_6528820030771,상품_6554790031000,상품_6732970091600,상품_6732980091600,상품_6736630075300,상품_8801192410767
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- **[brd_nm]**<br>
  고객이 구매한 적이 있는 브랜드이면 1, 구매한 적이 없는 브랜드이면 0으로 나타낸 데이터이다.

In [5]:
# Customer x brand indicator table. Duplicated (custid, brd_nm) pairs are
# dropped first, so each pivot cell counts a single row (1) when the
# customer has that brand and is filled with 0 otherwise.
OH_brd = (
    data.drop_duplicates(['custid', 'brd_nm'])
    .pivot_table(index='custid', columns='brd_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every brand column so it stays distinguishable after merging.
OH_brd.columns = ['custid'] + [f'브랜드_{brand}' for brand in OH_brd.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_brd, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,브랜드_휘나래,브랜드_휠라슈즈,브랜드_휠라의류,브랜드_휠라인티모,브랜드_휠라키즈,브랜드_휠라행사,브랜드_휴고보스,브랜드_휴먼앤휴먼,브랜드_흙침대,브랜드_희원상사
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


- **[corner_nm]**<br>
  고객이 구매한 적이 있는 코너이면 1, 구매한 적이 없는 코너이면 0으로 나타낸 데이터이다.

In [6]:
# Customer x corner indicator table. Duplicated (custid, corner_nm) pairs
# are dropped first, so each pivot cell counts a single row (1) when the
# customer has that corner and is filled with 0 otherwise.
OH_cor = (
    data.drop_duplicates(['custid', 'corner_nm'])
    .pivot_table(index='custid', columns='corner_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every corner column so it stays distinguishable after merging.
OH_cor.columns = ['custid'] + [f'코너_{corner}' for corner in OH_cor.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_cor, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,코너_행사소품,코너_행사슈즈,코너_행사핸드백,코너_향수,코너_헤어악세사리,코너_홈데코,코너_홈쇼핑,코너_화장잡화,코너_화장품,코너_훼미닌부틱
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


- **[pc_nm]**<br>
  고객이 구매한 적이 있는 상품군이면 1, 구매한 적이 없는 상품군이면 0으로 나타낸 데이터이다.

In [7]:
# Customer x product-group indicator table. Duplicated (custid, pc_nm)
# pairs are dropped first, so each pivot cell counts a single row (1) when
# the customer has that product group and is filled with 0 otherwise.
OH_pc = (
    data.drop_duplicates(['custid', 'pc_nm'])
    .pivot_table(index='custid', columns='pc_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every product-group column so it stays distinguishable after merging.
OH_pc.columns = ['custid'] + [f'상품군_{group}' for group in OH_pc.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_pc, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,상품군_타운란제리,상품군_트래디셔널캐쥬얼,상품군_트랜디캐쥬얼,상품군_트레디셔널,상품군_패션슈즈,상품군_패션시즌,상품군_피혁A,상품군_피혁B,상품군_핸드백,상품군_화장품
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,0,0,0,1
2,3,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


- **[part_nm]**<br>
  고객이 구매한 적이 있는 파트이면 1, 구매한 적이 없는 파트이면 0으로 나타낸 데이터이다.

In [8]:
# Customer x part indicator table. Duplicated (custid, part_nm) pairs are
# dropped first, so each pivot cell counts a single row (1) when the
# customer has that part and is filled with 0 otherwise.
OH_part = (
    data.drop_duplicates(['custid', 'part_nm'])
    .pivot_table(index='custid', columns='part_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every part column so it stays distinguishable after merging.
OH_part.columns = ['custid'] + [f'파트_{part}' for part in OH_part.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_part, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,파트_여성정장,파트_여성캐주얼,파트_영라이브,파트_영어덜트캐쥬얼,파트_영캐릭터,파트_영플라자,파트_인터넷백화점,파트_잡화,"파트_케주얼,구두,아동",파트_패션잡화
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


- **[team_nm]**<br>
  고객의 구매를 담당한 팀이면 1, 담당하지 않은 팀이면 0으로 나타낸 데이터이다.

In [9]:
# Customer x team indicator table. Duplicated (custid, team_nm) pairs are
# dropped first, so each pivot cell counts a single row (1) when the
# customer has that team and is filled with 0 otherwise.
OH_team = (
    data.drop_duplicates(['custid', 'team_nm'])
    .pivot_table(index='custid', columns='team_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every team column so it stays distinguishable after merging.
OH_team.columns = ['custid'] + [f'팀_{team}' for team in OH_team.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_team, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,파트_영캐릭터,파트_영플라자,파트_인터넷백화점,파트_잡화,"파트_케주얼,구두,아동",파트_패션잡화,팀_식품팀,팀_의류패션팀,팀_인터넷백화점,팀_잡화가용팀
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
1,2,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,1,1,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,1
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,0,1,1,0,0,1,0,1
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1


- **[buyer_nm]**<br>
  고객이 구매한 상품의 바이어이면 1, 구매한 상품의 바이어가 아니면 0으로 나타낸 데이터이다.

In [10]:
# Customer x buyer indicator table. Duplicated (custid, buyer_nm) pairs are
# dropped first, so each pivot cell counts a single row (1) when the
# customer has that buyer and is filled with 0 otherwise.
OH_buyer = (
    data.drop_duplicates(['custid', 'buyer_nm'])
    .pivot_table(index='custid', columns='buyer_nm', values='tot_amt',
                 aggfunc=np.size, fill_value=0)
    .reset_index()
)
# Prefix every buyer column so it stays distinguishable after merging.
OH_buyer.columns = ['custid'] + [f'바이어_{buyer}' for buyer in OH_buyer.columns[1:]]

# Attach the indicator columns to the running feature table by customer.
feature = feature.merge(OH_buyer, on='custid', how='outer')
feature

Unnamed: 0,custid,상품_2101013000150,상품_2101013200100,상품_2101013200150,상품_2101053008000,상품_2116050008000,상품_2116052008000,상품_2116052008100,상품_2116053008000,상품_2116054008000,...,바이어_타운모피,바이어_트래디셔널캐주얼,바이어_피혁,바이어_행사장(남성),바이어_행사장(아동스포츠),바이어_행사장(여성정장),바이어_행사장(여성캐주얼),바이어_행사장(여성캐쥬),바이어_행사장(잡화),바이어_화장품
0,0,0,0,0,0,1,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
1,2,0,0,0,0,0,1,0,0,0,...,0,1,1,0,0,0,0,0,0,1
2,3,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35962,49988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35963,49990,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35964,49992,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
35965,49993,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## PCA
Bag of Words feature는 sparse하므로, PCA를 통해 분산의 99%를 설명하는 주성분만 남겨 저장한다.

In [6]:
# Split the combined feature table back into its train and test parts.
#
# `feature` was assembled with how='outer' merges, and pandas documents that
# an outer merge sorts the join keys. Its row order is therefore not
# guaranteed to follow train_ID / test_ID (order of first appearance in the
# raw tables). Those ID arrays are re-attached to the rows by position
# further down, so select the rows by ID explicitly: row i of feature_train
# then belongs to train_ID[i], and likewise for test.
feature_by_id = feature.set_index('custid')
feature_train = feature_by_id.loc[train_ID].reset_index(drop=True)
feature_test = feature_by_id.loc[test_ID].reset_index(drop=True)

# custid was moved into the index above and discarded by reset_index, so
# both frames now contain feature columns only. Drop the temporary view.
del feature_by_id

In [7]:
# Number of feature columns; serves as the upper bound (and the initial
# value) for the number of PCA components.
num_d = feature_train.shape[1]
max_d = num_d

In [8]:
# Pass 1: fit a PCA with every component kept, only to read off how much
# variance each component explains.
pca = PCA(n_components=max_d, random_state=0)
pca.fit(feature_train)

# Running total of the explained-variance ratio.
cumsum = pca.explained_variance_ratio_.cumsum()

# Smallest number of components whose cumulative ratio reaches 99%.
# np.argmax returns the position of the first True value.
num_d = np.argmax(cumsum >= 0.99) + 1

# argmax is also 0 when no entry reaches the threshold, so a result of 1 is
# treated as "no usable cut-off" and every dimension is kept.
if num_d == 1:
    num_d = max_d

# Pass 2: refit with the chosen number of components on the train features
# and apply the same projection to the test features.
pca = PCA(n_components=num_d, random_state=0)
feature_train = pca.fit_transform(feature_train)
feature_test = pca.transform(feature_test)
print(feature_train.shape)

(21587, 4937)


### Make train, test data

In [None]:
# Put the customer IDs back as the first column of each reduced matrix.
# The join is positional (column-wise concat on the default integer index),
# so it assumes row i of each matrix belongs to train_ID[i] / test_ID[i].
# NOTE(review): confirm the row order really matches the ID arrays.
feature_train = pd.concat(
    objs=[pd.Series(data=train_ID, name='custid'), pd.DataFrame(data=feature_train)],
    axis='columns',
)
feature_test = pd.concat(
    objs=[pd.Series(data=test_ID, name='custid'), pd.DataFrame(data=feature_test)],
    axis='columns',
)

### Save train, test data

In [20]:
# Write both feature tables to the input directory as cp949-encoded CSV
# files, without the DataFrame index column.
feature_train.to_csv(
    path_or_buf=path + '/feature_train_BOW.csv',
    encoding='cp949',
    index=False,
)
feature_test.to_csv(
    path_or_buf=path + '/feature_test_BOW.csv',
    encoding='cp949',
    index=False,
)