## Import

In [1]:
import pickle
import numpy as np
import pandas as pd
import warnings ; warnings.filterwarnings(action='ignore')

# Feature Transformation
from sklearn.preprocessing import StandardScaler

## Read Data

In [2]:
data_path = "../data/"

# Load DC_LPOINT_LPAY.csv; the `date` and `de_dt` columns are parsed as datetimes on read.
lpay = pd.read_csv(f"{data_path}DC_LPOINT_LPAY.csv", parse_dates=["date", "de_dt"])

## Feature Generation

In [3]:
def _pivot_by(frame, columns, values, aggfunc):
    """Return a customer x `columns` pivot of `values`; absent combinations are filled with 0."""
    return pd.pivot_table(frame, index='cust', columns=columns, values=values,
                          aggfunc=aggfunc, fill_value=0)


# One row per distinct purchase timestamp, holding the customers who purchased at that time.
trade_list = lpay.drop_duplicates(['cust','cop_c','date'])[['cust', 'date']]\
             .groupby('date')['cust'].unique().reset_index()

# Per-timestamp snapshots are collected in a list and concatenated once after the loop.
# Calling pd.concat on a growing DataFrame inside the loop re-copies every earlier row on
# each iteration, which is quadratic in the number of timestamps.
snapshots = []
for _, DATE, CUSTS in trade_list.itertuples():
    # Purchase history, up to and including DATE, of the customers who bought at DATE.
    ease = lpay.query('cust in @CUSTS & date <= @DATE')

    # Actual purchase amount, purchase count, mean purchase amount, max purchase amount
    amount = ease.groupby('cust')['buy_am'].agg([('real_pay', np.sum),('consum_count', np.size),
                                                 ('mean_pay', lambda x : np.round(np.mean(x))),('max_pay', np.max)])

    # Number of visit days, days since most recent purchase, purchase cycle
    # The snapshot-wide latest de_dt is the same for every customer, so compute it once
    # instead of once per group inside the lambda.
    latest_de_dt = ease.de_dt.max()
    day = ease.groupby('cust')['de_dt'].agg([('visit_day_count',lambda x: x.nunique()),
                                             ('recent_consum_day', lambda x: (latest_de_dt - x.max()).days),
                                             ('consum_cycle', lambda x: int((x.max() - x.min()).days / x.nunique()))])

    # Mean purchase amount per day, mean purchase count per day, number of days with exactly one purchase
    daily = ease.groupby(['cust', 'de_dt'])['buy_am'].agg([('day1_pay', np.sum),('day1_pay_count', np.size)]).reset_index()\
            .groupby('cust').agg({'day1_pay':[('day1_mean_pay', np.mean)],
                                  'day1_pay_count':[('day1_mean_pay_count', np.mean),
                                                    ('day1_1consum', lambda x: x.tolist().count(1))]})
    # Keep only the feature names (second level) of the two-level column index.
    daily.columns = daily.columns.get_level_values(1)

    # Per-cop purchase ratio, total amount, mean amount, purchase cycle, number of distinct cops, standard deviation
    cop = pd.concat([_pivot_by(ease, 'cop_c', 'buy_am', np.size)\
                     .divide(ease.groupby('cust')['cop_c'].size(), axis=0),
                     _pivot_by(ease, 'cop_c', 'buy_am', np.sum),
                     _pivot_by(ease, 'cop_c', 'buy_am', np.mean),
                     _pivot_by(ease, 'cop_c', 'de_dt', lambda x: (x.max()-x.min()).days//x.nunique()),
                     ease.groupby("cust")["cop_c"].nunique(),
                     _pivot_by(ease, 'cop_c', 'buy_am', np.std)], axis=1)
    # The first four pivots contribute one column per cop code, followed by the distinct-cop
    # count. Whatever columns remain come from the std pivot and are suffixed from their own
    # labels, so the naming still lines up if that pivot has fewer columns than the others.
    cop.columns = [f'{j}_{i}' for i in ['ratio','total_amount','mean_amount','consum_cycle'] for j in sorted(ease.cop_c.unique())]\
                  +["cop_c_unique"]\
                  +[f'{i}_sd' for i in cop.columns[ease.cop_c.nunique()*4+1:]]

    # Offline/online purchase counts, online ratio
    # NOTE(review): the rename below implies chnl_dv is coded 1=offline / 2=online, in which
    # case `online_ratio` (the mean of that code) lies in [1, 2] rather than [0, 1]. It is an
    # affine shift of the true ratio and is left unchanged to preserve feature values - confirm.
    onoff = pd.concat([_pivot_by(ease, 'chnl_dv', 'buy_am', np.size)\
                       .rename(columns={1:'off_count', 2:'on_count'}),
                       ease.groupby("cust")["chnl_dv"].agg([("online_ratio", np.mean)])], axis=1)

    # Monthly mean / max / min purchase amount, mean visit month, mean purchase month
    # `high` collects the row labels of each customer's five largest purchases (top purchase amounts).
    high = ease.groupby('cust')['buy_am'].agg([('high_rank_pay', lambda x: x.sort_values()[-5:].index.tolist())])
    high = [j for i in high.high_rank_pay for j in i]
    month = pd.concat([_pivot_by(ease, 'de_month', 'buy_am', [np.mean, max, min]),
                       ease.groupby('cust')['de_month'].mean(),
                       ease.loc[high].groupby('cust')['de_month'].mean()], axis=1)
    month.columns = [f'{str(j)}_{i}' for i in ['month_mean_pay', 'month_max_pay', 'month_min_pay'] for j in sorted(ease['de_month'].unique())]\
                    +['mean_visit_month', 'mean_consum_month']

    # Number of distinct weekdays, holiday ratio, weekend ratio
    day_info = pd.concat([ease.groupby("cust")["consum_day"].agg([("consum_day_unique", pd.Series.nunique)]),
                          ease.groupby("cust")["holiday"].agg([("holiday_ratio", np.mean)]),
                          ease.groupby("cust")["weekend"].agg([("weekend_ratio", np.mean)])], axis=1)

    # Name the index explicitly so reset_index() always yields a `cust` column, rather than
    # relying on it producing a column called `index`.
    make_feature = pd.concat([pd.DataFrame({'date':[DATE]*len(CUSTS)}, index=CUSTS),
                              amount, day, daily, cop, onoff, month, day_info],axis=1)\
                   .rename_axis('cust').reset_index()
    snapshots.append(make_feature)

# Single concatenation of all snapshots; columns absent from a snapshot become NaN.
feature = pd.concat(snapshots).sort_values(['cust','date']).reset_index(drop=True)
feature

Unnamed: 0,cust,date,real_pay,consum_count,mean_pay,max_pay,visit_day_count,recent_consum_day,consum_cycle,day1_mean_pay,...,9_month_min_pay,10_month_mean_pay,10_month_max_pay,10_month_min_pay,11_month_mean_pay,11_month_max_pay,11_month_min_pay,12_month_mean_pay,12_month_max_pay,12_month_min_pay
0,M000136117,2021-01-04 11:00:00,42600,1,42600.0,42600,1,0,0,42600.000000,...,,,,,,,,,,
1,M000136117,2021-01-09 14:00:00,945100,2,472550.0,902500,2,0,2,472550.000000,...,,,,,,,,,,
2,M000136117,2021-09-26 22:00:00,960200,3,320067.0,902500,3,0,88,320066.666667,...,15100.0,,,,,,,,,
3,M000136117,2021-11-20 00:00:00,1137501,4,284375.0,902500,4,0,80,284375.250000,...,15100.0,0.0,0.0,0.0,177301.0,177301.0,177301.0,,,
4,M000419293,2021-01-07 23:00:00,55880,1,55880.0,55880,1,0,0,55880.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
314669,M999962961,2021-06-18 18:00:00,3234750,12,269562.0,2600000,11,0,14,294068.181818,...,,,,,,,,,,
314670,M999962961,2021-07-24 20:00:00,3243906,13,249531.0,2600000,12,0,16,270325.500000,...,,,,,,,,,,
314671,M999962961,2021-08-19 11:00:00,3266036,14,233288.0,2600000,13,0,16,251233.538462,...,,,,,,,,,,
314672,M999962961,2021-12-16 17:00:00,3425636,15,228376.0,2600000,14,0,24,244688.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,159600.0,159600.0,159600.0


## Generate Target

In [4]:
# To build the target we need the time until each purchase's *next* purchase, which cannot
# be computed for a customer's last purchase. An arbitrary sentinel purchase is therefore
# appended per customer, dated 2040-08-01, outside the analysis period.
sentinel_rows = pd.DataFrame({'cust': feature.cust.unique()})
sentinel_rows['date'] = '2040-08-01 00:00:00'

target = (
    pd.concat([feature[['cust','date']], sentinel_rows])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['cust','date'])
    .reset_index(drop=True)
)

In [5]:
# Hours from each purchase to the same customer's next row: diff(-1) is (current - next),
# hence the sign flip at the end.
seconds_to_next = target.groupby('cust')['date'].diff(-1).dt.total_seconds()
target['target'] = seconds_to_next/60/60 * -1

# The last row of each customer has no following row, so its gap is NaN; drop those rows.
target.dropna(inplace=True)

In [6]:
# A gap larger than 24*364 hours (the maximum possible gap inside the one-year period) can
# only come from the inserted sentinel date, i.e. the row is the customer's last purchase.
# Its real target is the next-purchase time we want to predict, so it goes to the test set.

# Keep the cell re-runnable: merging onto a frame that already carries `target` would
# produce `target_x`/`target_y` and make the queries below fail.
if 'target' in feature.columns:
    feature = feature.drop(columns='target')
feature = feature.merge(target, on=['cust','date'])

train = feature.query('target <= 24*364')
# drop() returns a new frame, so the test set is built without deleting a column from a
# filtered result in place.
test = feature.query('target > 24*364').drop(columns='target')
print('학습데이터 크기:', train.shape)
print('평가데이터 크기:', test.shape)

학습데이터 크기: (305768, 128)
평가데이터 크기: (8906, 127)


## Feature Transformation

In [7]:
# Set the customer id and timestamp columns aside and remove them from the feature frames.
# DataFrame.pop returns the column and deletes it from the frame in one step.
trainID = train.pop('cust')
trainDate = train.pop('date')
testID = test.pop('cust')
testDate = test.pop('date')

In [8]:
# Define data: split the label off the training frame; what remains are the model inputs.
y_train = train.pop('target')
X_train = train
X_test = test

In [9]:
# Imputation: replace every missing feature value with 0, modifying both frames in place.
for frame in (X_train, X_test):
    frame.fillna(0, inplace=True)

In [10]:
# Scaling: fit the scaler on the training features only, then apply that same fitted
# transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train[X_train.columns] = scaler.transform(X_train)
X_test[X_test.columns] = scaler.transform(X_test)

## Save Data

In [11]:
# Persist the id/date columns and the model matrices as two separate pickle files.
# The context managers ensure each file is flushed and closed even if pickling raises;
# the bare open(...) calls previously left the handles for the garbage collector.
with open(data_path+'TS_feature_info.pkl', 'wb') as info_file:
    pickle.dump((trainID, trainDate, testID, testDate), info_file)

with open(data_path+'TS_features.pkl', 'wb') as features_file:
    pickle.dump((X_train, y_train, X_test), features_file)