# Import Library

In [1]:
# 기본 패키지
import pandas as pd
import numpy as np
import os

# 시각화 패키지
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='NanumBarunGothic') 
plt.style.use(['ggplot'])

# 모델링 패키지
import sklearn
import matplotlib
# import pycaret
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.metrics import mean_squared_log_error

# 경고 메시지 무시
import warnings
warnings.filterwarnings(action='ignore') 

# 사용자 운영체제 확인
import platform
platform.system()

# 운영체제별 한글 폰트 설정
if platform.system() == 'Darwin': # Mac 환경 폰트 설정
    plt.rc('font', family='AppleGothic')
elif platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


# 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

# 버전 확인
print('Pandas : %s'%(pd.__version__))
print('Numpy : %s'%(np.__version__))
print('Scikit-Learn : %s'%(sklearn.__version__))
print('seaborn : %s'%(sns.__version__))
print('matplotlib : %s'%(matplotlib.__version__))
# print('pycaret : %s'%(pycaret.__version__))
!python --version

Pandas : 1.4.2
Numpy : 1.19.5
Scikit-Learn : 0.23.2
seaborn : 0.11.2
matplotlib : 3.5.2
Python 3.9.12


# Load Data

In [2]:
# 고객정보 데이터 불러오기 및 칼럼명 변경
demo = pd.read_csv('../data/LPOINT_BIG_COMP_01_DEMO.csv', low_memory=False)
demo.columns = ['고객번호', '성별', '연령대', '거주지대분류코드']

In [3]:
# 02_Data_Clustering에서 라벨링했던 데이터 불러오기
df_train = pd.read_csv("../data/02_Data_Clustering_train.csv")
df_test = pd.read_csv("../data/02_Data_Clustering_test.csv")

In [4]:
rfm_df_train = pd.DataFrame(df_train.groupby('고객번호', as_index = False)['영수증번호'].count()['고객번호'])
rfm_df_train = pd.merge(rfm_df_train, demo, how = 'left', on = '고객번호')
rfm_df_train = pd.merge(rfm_df_train, df_train[['고객번호', 'labels']].drop_duplicates(['고객번호']), how = 'left', on = '고객번호')

In [5]:
rfm_df_test = pd.DataFrame(df_test.groupby('고객번호', as_index = False)['영수증번호'].count()['고객번호'])
rfm_df_test = pd.merge(rfm_df_test, demo, how = 'left', on = '고객번호')

In [6]:
X_train = rfm_df_train.iloc[:,:-1]
y_train = rfm_df_train['labels']
X_test = rfm_df_test
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")

X_train shape: (26212, 4)
X_test shape: (3662, 4)
y_train shape: (26212,)


# 파생변수 생성

### 영수증번호  
- 길이, 타입

In [7]:
# 영수증 번호의 길이가 12, 15, 18 3가지로 나누어지는데 이는 각 다른 점포에서 결제한 것을 의미할 것이라고 판단하여 새로운 파생변수로 생성
df_train['영수증번호길이'] = df_train['영수증번호'].apply(lambda x: len(str(x)))
df_test['영수증번호길이'] = df_test['영수증번호'].apply(lambda x: len(str(x)))
df_train['영수증번호길이'].value_counts()

12    4134942
15     353002
18     245778
Name: 영수증번호길이, dtype: int64

In [8]:
X_train['영수증번호길이평균'] = df_train.groupby('고객번호')[['영수증번호길이']].mean()['영수증번호길이'].tolist()
X_train['영수증번호길이최대'] = df_train.groupby('고객번호')['영수증번호길이'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['영수증번호길이평균'] = df_test.groupby('고객번호')[['영수증번호길이']].mean()['영수증번호길이'].tolist()
X_test['영수증번호길이최대'] = df_test.groupby('고객번호')['영수증번호길이'].agg(lambda x:x.value_counts().index[0]).tolist()

In [9]:
# 영수증 번호의 타입을 생성하는 함수
def receipt_type(x):
    if len(str(x)) == 18:
        return str(x)[8]
    elif len(str(x)) == 15:
        return 'number'
    elif len(str(x)) == 12:
        return str(x)[0]

In [10]:
# 영수증마다 하나의 영어를 포함하거나, 모두 숫자로 이루어져있기 때문에 이러한 타입을 나타내는 파생변수를 생성
df_train['영수증타입'] = df_train['영수증번호'].apply(receipt_type)
df_test['영수증타입'] = df_test['영수증번호'].apply(receipt_type)
df_train['영수증타입'].value_counts()

A         3760036
E          381115
number     353002
D          154419
C           77108
B            8042
Name: 영수증타입, dtype: int64

In [11]:
X_train['영수증타입최대값'] = df_train.groupby('고객번호')['영수증타입'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['영수증타입최대값'] = df_test.groupby('고객번호')['영수증타입'].agg(lambda x:x.value_counts().index[0]).tolist()

### 구매경로  
- 최다 구매 경로

In [12]:
df_train['채널구분'].value_counts()

1    4122105
2     611617
Name: 채널구분, dtype: int64

In [13]:
def buy_root(x):
    if x == 1:
        return '오프라인'
    elif x == 2:
        return '온라인'

In [14]:
X_train['최다구매경로'] = df_train.groupby('고객번호')['채널구분'].agg(lambda x:x.value_counts().index[0]).tolist()
X_train['최다구매경로'] = X_train['최다구매경로'].apply(buy_root)
X_test['최다구매경로'] = df_test.groupby('고객번호')['채널구분'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최다구매경로'] = X_test['최다구매경로'].apply(buy_root)

### 제휴사  
- 제휴사의 타입

In [15]:
# 제휴사의 제일 앞에 알파벳의 경우에는 제휴사의 타입을 나타낼 수 있기 때문에 파생변수로 생성
df_train['제휴사타입'] = df_train['제휴사'].apply(lambda x: str(x)[0])
df_test['제휴사타입'] = df_test['제휴사'].apply(lambda x: str(x)[0])
df_train['제휴사타입'].value_counts()

A    4362804
D     171769
L      93429
C      90236
B       8811
E       6673
Name: 제휴사타입, dtype: int64

In [16]:
X_train['최다제휴사타입'] = df_train.groupby('고객번호')['제휴사타입'].agg(lambda x:x.value_counts().index[0]).tolist()
X_train['최다제휴사'] = df_train.groupby('고객번호')['제휴사'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최다제휴사타입'] = df_test.groupby('고객번호')['제휴사타입'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최다제휴사'] = df_test.groupby('고객번호')['제휴사'].agg(lambda x:x.value_counts().index[0]).tolist()

### 구매일자  
- 구매연도, 월, 일, 요일, 시간 등

In [17]:
df_train['구매일자'] = pd.to_datetime(df_train['구매일자'].astype(str))
df_train['구매년도'] = df_train['구매일자'].dt.year
df_train['구매월'] = df_train['구매일자'].dt.month
df_train['구매일'] = df_train['구매일자'].dt.day
df_train['구매요일'] = df_train['구매일자'].dt.weekday
df_test['구매일자'] = pd.to_datetime(df_test['구매일자'].astype(str))
df_test['구매년도'] = df_test['구매일자'].dt.year
df_test['구매월'] = df_test['구매일자'].dt.month
df_test['구매일'] = df_test['구매일자'].dt.day
df_test['구매요일'] = df_test['구매일자'].dt.weekday

In [18]:
# # 대부분이 2021년도이기 때문에 제거
# X_train['최다구매년도'] = df_train.groupby('고객번호')['구매년도'].agg(lambda x:x.value_counts().index[0]).tolist()
# X_test['최다구매년도'] = df_test.groupby('고객번호')['구매년도'].agg(lambda x:x.value_counts().index[0]).tolist()

In [19]:
X_train['구매월평균'] = df_train.groupby('고객번호')[['구매월']].mean()['구매월'].tolist()
X_train['최다구매월'] = df_train.groupby('고객번호')['구매월'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['구매월평균'] = df_test.groupby('고객번호')[['구매월']].mean()['구매월'].tolist()
X_test['최다구매월'] = df_test.groupby('고객번호')['구매월'].agg(lambda x:x.value_counts().index[0]).tolist()

In [20]:
# 최다 구매 계절의 파생변수를 만들기 위해 계절을 나누는 파생변수 생성
def season(x):
    if x >= 3 and x <= 5:
        return '봄'
    elif x >= 6 and x <= 7:
        return '여름'
    elif x >= 8 and x <= 11:
        return '가을'
    else:
        return '겨울'

In [21]:
df_train['구매계절'] = df_train['구매월'].apply(season)
df_test['구매계절'] = df_test['구매월'].apply(season)
X_train['최다구매계절'] = df_train.groupby('고객번호')['구매계절'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최다구매계절'] = df_test.groupby('고객번호')['구매계절'].agg(lambda x:x.value_counts().index[0]).tolist()

In [22]:
X_train['평균구매일'] = df_train.groupby('고객번호')[['구매일']].mean()['구매일'].tolist()
X_test['평균구매일'] = df_test.groupby('고객번호')[['구매일']].mean()['구매일'].tolist()

In [23]:
# 요일을 만드는 파생변수 생성
def dayofweek(x):
    if x == 0:
        return '월요일'
    elif x == 1:
        return '화요일'
    elif x == 2:
        return '수요일'
    elif x == 3:
        return '목요일'
    elif x == 4:
        return '금요일'
    elif x == 5:
        return '토요일'
    elif x == 6:
        return '일요일'

In [24]:
df_train['구매요일'] = df_train['구매요일'].apply(dayofweek)
df_test['구매요일'] = df_test['구매요일'].apply(dayofweek)
X_train['구매요일최대값'] = df_train.groupby('고객번호')[['구매요일']].max()['구매요일'].tolist()
X_train['최다구매요일'] = df_train.groupby('고객번호')['구매요일'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['구매요일최대값'] = df_test.groupby('고객번호')[['구매요일']].max()['구매요일'].tolist()
X_test['최다구매요일'] = df_test.groupby('고객번호')['구매요일'].agg(lambda x:x.value_counts().index[0]).tolist()

In [25]:
def weekday(x):
    if x == '토요일' or x == '일요일':
        return '주말'
    else:
        return '주중'

In [26]:
df_train['주중주말여부'] = df_train['구매요일'].apply(weekday)
df_test['주중주말여부'] = df_test['구매요일'].apply(weekday)
X_train['주중주말선호도'] = df_train.groupby('고객번호')['주중주말여부'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['주중주말선호도'] = df_test.groupby('고객번호')['주중주말여부'].agg(lambda x:x.value_counts().index[0]).tolist()

In [27]:
X_train['평균구매시간'] = df_train.groupby('고객번호')[['구매시간']].mean()['구매시간'].tolist()
X_train['최다구매시간'] = df_train.groupby('고객번호')['구매시간'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['평균구매시간'] = df_test.groupby('고객번호')[['구매시간']].mean()['구매시간'].tolist()
X_test['최다구매시간'] = df_test.groupby('고객번호')['구매시간'].agg(lambda x:x.value_counts().index[0]).tolist()

In [28]:
def hour_type1(x):
    if x < 6:
        return '새벽'
    elif x >= 6 and x < 12:
        return '오전'
    elif x < 18 and x >= 12:
        return '오후'
    else:
        return '저녁'

In [29]:
df_train['구매시간분류1'] = df_train['구매시간'].apply(hour_type1)
df_test['구매시간분류1'] = df_test['구매시간'].apply(hour_type1)
X_train['주구매시간분류1'] = df_train.groupby('고객번호')['구매시간분류1'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['주구매시간분류1'] = df_test.groupby('고객번호')['구매시간분류1'].agg(lambda x:x.value_counts().index[0]).tolist()

In [30]:
# 평균 직장인들의 춡퇴근시간에 맞춰서 시간을 분류
def hour_type2(x):
    if x >= 9 and x <= 18:
        return '출근'
    else:
        return '퇴근'

In [31]:
df_train['구매시간분류2'] = df_train['구매시간'].apply(hour_type2)
df_test['구매시간분류2'] = df_test['구매시간'].apply(hour_type2)
X_train['주구매시간분류2'] = df_train.groupby('고객번호')['구매시간분류2'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['주구매시간분류2'] = df_test.groupby('고객번호')['구매시간분류2'].agg(lambda x:x.value_counts().index[0]).tolist()

### 구매금액

In [32]:
X_train['평균구매금액'] = df_train.groupby('고객번호')[['구매금액']].mean()['구매금액'].tolist()
X_train['최대구매금액'] = df_train.groupby('고객번호')['구매금액'].agg(lambda x:x.value_counts().index[0]).tolist()
X_train['최소구매금액'] = df_train.groupby('고객번호')[['구매금액']].min()['구매금액'].tolist()
X_test['평균구매금액'] = df_test.groupby('고객번호')[['구매금액']].mean()['구매금액'].tolist()
X_test['최대구매금액'] = df_test.groupby('고객번호')['구매금액'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최소구매금액'] = df_test.groupby('고객번호')[['구매금액']].min()['구매금액'].tolist()

### 구매수량

In [33]:
X_train['평균구매수량'] = df_train.groupby('고객번호')[['구매수량']].mean()['구매수량'].tolist()
X_train['최대구매수량'] = df_train.groupby('고객번호')['구매수량'].agg(lambda x:x.value_counts().index[0]).tolist()
X_train['최소구매수량'] = df_train.groupby('고객번호')[['구매수량']].min()['구매수량'].tolist()
X_train['평균낱개구매금액'] = [(x/y) if y != 0 else 0 for x,y in zip(X_train['평균구매금액'], X_train['평균구매수량'])]
X_test['평균구매수량'] = df_test.groupby('고객번호')[['구매수량']].mean()['구매수량'].tolist()
X_test['최대구매수량'] = df_test.groupby('고객번호')['구매수량'].agg(lambda x:x.value_counts().index[0]).tolist()
X_test['최소구매수량'] = df_test.groupby('고객번호')[['구매수량']].min()['구매수량'].tolist()
X_test['평균낱개구매금액'] = [(x/y) if y != 0 else 0 for x,y in zip(X_test['평균구매금액'], X_test['평균구매수량'])]

In [34]:
# # 구매 품목 수의 경우에는 신규고객의 경우 모두 구매품목수가 적을 것으로 예상되기 때문에 제거
# X_train['구매품목수'] = df_train.groupby('고객번호')[['영수증번호']].count()['영수증번호'].tolist()
# X_test['구매품목수'] = df_test.groupby('고객번호')[['영수증번호']].count()['영수증번호'].tolist()

## EDA에 따른 파생변수 추가

### 온라인이용, 오프라인이용, 둘다 이용구분

In [35]:
# 온라인, 오프라인, 둘다
def div_ch(x):
    if set(x) == {1}: # 오프라인만 이용하는 경우
        return '오프라인'
    elif set(x) == {2}: # 온라인만 이용
        return '온라인'
    else: # 둘다이용
        return '둘다이용'

# 이용채널과 이용 횟수를 구해줌
X_train['이용채널'] = df_train.groupby('고객번호')['채널구분'].apply(list).reset_index()['채널구분'].apply(div_ch)
X_test['이용채널'] = df_test.groupby('고객번호')['채널구분'].apply(list).reset_index()['채널구분'].apply(div_ch)

### 엘페이 이용여부

In [36]:
# 엘페이 이용 여부
lpay_user_train = df_train[df_train['타입'] == 'Lpay']['고객번호'].unique()
lpay_user_test = df_test[df_test['타입'] == 'Lpay']['고객번호'].unique()
X_train['Lpay이용여부'] = X_train['고객번호'].apply(lambda x:'Lpay사용' if x in lpay_user_train else 'Lpay미사용')
X_test['Lpay이용여부'] = X_test['고객번호'].apply(lambda x:'Lpay사용' if x in lpay_user_test else 'Lpay미사용')

### 최다구매 중분류명, 대분류명

In [37]:
# rfm_df['최다구매중분류명'] = df_old.groupby('고객번호')['중분류명'].apply(list).apply(lambda x:max(x, key = x.count)).reset_index()['중분류명']
# rfm_df['최다구매대분류명'] = df_old.groupby('고객번호')['대분류명'].apply(list).apply(lambda x:max(x, key = x.count)).reset_index()['대분류명']

In [38]:
X_train

Unnamed: 0,고객번호,성별,연령대,거주지대분류코드,영수증번호길이평균,영수증번호길이최대,영수증타입최대값,최다구매경로,최다제휴사타입,최다제휴사,...,주구매시간분류2,평균구매금액,최대구매금액,최소구매금액,평균구매수량,최대구매수량,최소구매수량,평균낱개구매금액,이용채널,Lpay이용여부
0,M000034966,여성,40대,Z07,12.122449,12,A,오프라인,A,A02,...,출근,12576.326531,1980,100,1.224490,1,0,10270.666667,오프라인,Lpay미사용
1,M000136117,여성,30대,Z11,12.842975,12,A,오프라인,A,A01,...,출근,237165.545455,14300,100,0.884298,1,0,268196.551402,둘다이용,Lpay사용
2,M000201112,여성,50대,Z17,12.000000,12,A,오프라인,A,A04,...,출근,2571.000000,1700,20,1.450000,1,1,1773.103448,오프라인,Lpay미사용
3,M000225114,여성,40대,Z17,12.368098,12,A,오프라인,A,A01,...,출근,12786.932515,3000,300,1.073620,1,0,11910.114286,오프라인,Lpay미사용
4,M000261625,여성,40대,Z17,12.089552,12,A,오프라인,A,A01,...,출근,90627.611940,45000,2600,1.044776,1,0,86743.571429,오프라인,Lpay미사용
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26207,M999673157,남성,30대,Z10,12.000000,12,A,오프라인,A,A01,...,출근,118807.083333,4500,50,1.166667,1,1,101834.642857,오프라인,Lpay미사용
26208,M999770689,여성,30대,Z16,12.813433,12,A,오프라인,A,A04,...,퇴근,5171.069652,1000,50,0.940299,1,0,5499.391534,둘다이용,Lpay사용
26209,M999849895,여성,20대,Z04,12.085714,12,A,오프라인,A,A02,...,출근,15647.314286,9900,840,1.657143,1,0,9442.344828,오프라인,Lpay미사용
26210,M999926092,남성,30대,Z08,18.000000,18,D,오프라인,D,D02,...,출근,18500.000000,29100,10000,0.000000,0,0,0.000000,둘다이용,Lpay미사용


In [39]:
X_test

Unnamed: 0,고객번호,성별,연령대,거주지대분류코드,영수증번호길이평균,영수증번호길이최대,영수증타입최대값,최다구매경로,최다제휴사타입,최다제휴사,...,주구매시간분류2,평균구매금액,최대구매금액,최소구매금액,평균구매수량,최대구매수량,최소구매수량,평균낱개구매금액,이용채널,Lpay이용여부
0,M000059535,여성,30대,Z12,18.0,18,C,온라인,C,C01,...,출근,46000.000000,46000,46000,0.000000,0,0,0.000000,온라인,Lpay미사용
1,M000658311,여성,20대,Z16,12.0,12,A,오프라인,A,A01,...,출근,14450.000000,7900,100,1.000000,1,1,14450.000000,오프라인,Lpay미사용
2,M000713279,여성,50대,Z12,12.0,12,A,오프라인,A,A05,...,출근,39000.000000,39000,39000,1.000000,1,1,39000.000000,오프라인,Lpay미사용
3,M000859319,남성,70대,Z17,12.0,12,A,오프라인,A,A05,...,출근,11500.000000,11500,11500,1.000000,1,1,11500.000000,오프라인,Lpay미사용
4,M001080017,여성,30대,Z17,16.0,18,C,오프라인,C,C01,...,출근,176560.000000,16200,16200,0.333333,0,0,529680.000000,둘다이용,Lpay미사용
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3657,M998572907,남성,50대,Z03,18.0,18,D,오프라인,D,D01,...,출근,7600.000000,7600,7600,0.000000,0,0,0.000000,오프라인,Lpay미사용
3658,M999340261,남성,50대,Z11,12.0,12,A,오프라인,A,A05,...,출근,495428.571429,49500,49500,1.000000,1,1,495428.571429,오프라인,Lpay미사용
3659,M999492154,여성,20대,Z10,13.2,12,A,오프라인,A,A01,...,출근,17020.000000,10300,100,0.800000,1,0,21275.000000,오프라인,Lpay미사용
3660,M999493501,남성,50대,Z03,18.0,18,D,오프라인,D,D02,...,출근,12446.666667,15840,9100,0.000000,0,0,0.000000,둘다이용,Lpay미사용


## 외부데이터 코로나 데이터 추가  

### 고객이 상품을 구매한 날짜의 일별, 코로나 확진자 및 사망자 파생변수

In [40]:
X_train['코로나신규확진자수평균'] = df_train.groupby('고객번호', as_index = False)['신규확진자'].mean()['신규확진자']
X_train['코로나신규사망자수평균'] = df_train.groupby('고객번호', as_index = False)['신규사망자'].mean()['신규사망자']
X_test['코로나신규확진자수평균'] = df_test.groupby('고객번호', as_index = False)['신규확진자'].mean()['신규확진자']
X_test['코로나신규사망자수평균'] = df_test.groupby('고객번호', as_index = False)['신규사망자'].mean()['신규사망자']

In [41]:
# 확진자의 최대 최소 차이 값을 구하면 코로나에 따른 쇼핑유무를 알 수 있을 것이라고 판단
X_train['코로나확진자의최대최소차이'] = df_train.groupby('고객번호', as_index = False)['신규확진자'].apply(lambda x: x.max()-x.min())['신규확진자']
X_train['코로나사망자의최대최소차이'] = df_train.groupby('고객번호', as_index = False)['신규사망자'].apply(lambda x: x.max()-x.min())['신규사망자']
X_test['코로나확진자의최대최소차이'] = df_test.groupby('고객번호', as_index = False)['신규확진자'].apply(lambda x: x.max()-x.min())['신규확진자']
X_test['코로나사망자의최대최소차이'] = df_test.groupby('고객번호', as_index = False)['신규사망자'].apply(lambda x: x.max()-x.min())['신규사망자']

In [42]:
train = X_train
train['labels'] = y_train
test = X_test

In [43]:
# 데이터 저장
train.to_csv("../data/03_Derived_Variable_train.csv", index = False)
test.to_csv("../data/03_Derived_Variable_test.csv", index = False)

In [44]:
# 데이터가 제대로 저장되었는지 확인
pd.read_csv("../data/03_Derived_Variable_train.csv")

Unnamed: 0,고객번호,성별,연령대,거주지대분류코드,영수증번호길이평균,영수증번호길이최대,영수증타입최대값,최다구매경로,최다제휴사타입,최다제휴사,...,최대구매수량,최소구매수량,평균낱개구매금액,이용채널,Lpay이용여부,코로나신규확진자수평균,코로나신규사망자수평균,코로나확진자의최대최소차이,코로나사망자의최대최소차이,labels
0,M000034966,여성,40대,Z07,12.122449,12,A,오프라인,A,A02,...,1,0,10270.666667,오프라인,Lpay미사용,2052.224490,14.612245,5847.0,55.0,1
1,M000136117,여성,30대,Z11,12.842975,12,A,오프라인,A,A01,...,1,0,268196.551402,둘다이용,Lpay사용,1571.099174,15.553719,7499.0,107.0,2
2,M000201112,여성,50대,Z17,12.000000,12,A,오프라인,A,A04,...,1,1,1773.103448,오프라인,Lpay미사용,1775.500000,6.650000,1986.0,29.0,1
3,M000225114,여성,40대,Z17,12.368098,12,A,오프라인,A,A01,...,1,0,11910.114286,오프라인,Lpay미사용,1655.846626,12.300613,6754.0,68.0,1
4,M000261625,여성,40대,Z17,12.089552,12,A,오프라인,A,A01,...,1,0,86743.571429,오프라인,Lpay미사용,1344.074627,7.761194,4801.0,42.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26207,M999673157,남성,30대,Z10,12.000000,12,A,오프라인,A,A01,...,1,1,101834.642857,오프라인,Lpay미사용,3532.541667,37.166667,7108.0,92.0,1
26208,M999770689,여성,30대,Z16,12.813433,12,A,오프라인,A,A04,...,1,0,5499.391534,둘다이용,Lpay사용,1410.000000,8.870647,7320.0,77.0,2
26209,M999849895,여성,20대,Z04,12.085714,12,A,오프라인,A,A02,...,1,0,9442.344828,오프라인,Lpay미사용,1644.114286,6.300000,4893.0,45.0,1
26210,M999926092,남성,30대,Z08,18.000000,18,D,오프라인,D,D02,...,0,0,0.000000,둘다이용,Lpay미사용,1646.285714,11.285714,4348.0,33.0,1


In [45]:
# 데이터가 제대로 저장되었는지 확인
pd.read_csv("../data/03_Derived_Variable_test.csv")

Unnamed: 0,고객번호,성별,연령대,거주지대분류코드,영수증번호길이평균,영수증번호길이최대,영수증타입최대값,최다구매경로,최다제휴사타입,최다제휴사,...,평균구매수량,최대구매수량,최소구매수량,평균낱개구매금액,이용채널,Lpay이용여부,코로나신규확진자수평균,코로나신규사망자수평균,코로나확진자의최대최소차이,코로나사망자의최대최소차이
0,M000059535,여성,30대,Z12,18.0,18,C,온라인,C,C01,...,0.000000,0,0,0.000000,온라인,Lpay미사용,424.000000,7.000000,0.0,0.0
1,M000658311,여성,20대,Z16,12.0,12,A,오프라인,A,A01,...,1.000000,1,1,14450.000000,오프라인,Lpay미사용,1324.000000,5.000000,0.0,0.0
2,M000713279,여성,50대,Z12,12.0,12,A,오프라인,A,A05,...,1.000000,1,1,39000.000000,오프라인,Lpay미사용,357.000000,2.000000,0.0,0.0
3,M000859319,남성,70대,Z17,12.0,12,A,오프라인,A,A05,...,1.000000,1,1,11500.000000,오프라인,Lpay미사용,1838.000000,3.000000,0.0,0.0
4,M001080017,여성,30대,Z17,16.0,18,C,오프라인,C,C01,...,0.333333,0,0,529680.000000,둘다이용,Lpay미사용,1109.000000,6.000000,1334.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3657,M998572907,남성,50대,Z03,18.0,18,D,오프라인,D,D01,...,0.000000,0,0,0.000000,오프라인,Lpay미사용,610.000000,1.000000,0.0,0.0
3658,M999340261,남성,50대,Z11,12.0,12,A,오프라인,A,A05,...,1.000000,1,1,495428.571429,오프라인,Lpay미사용,546.000000,4.857143,35.0,6.0
3659,M999492154,여성,20대,Z10,13.2,12,A,오프라인,A,A01,...,0.800000,1,0,21275.000000,오프라인,Lpay미사용,1505.800000,11.400000,1051.0,17.0
3660,M999493501,남성,50대,Z03,18.0,18,D,오프라인,D,D02,...,0.000000,0,0,0.000000,둘다이용,Lpay미사용,1352.666667,4.000000,1118.0,5.0
