## Import library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)


# 배경색 흰색으로 맞춰줌
import matplotlib as mpl
mpl.rc('figure', facecolor = 'w', edgecolor ='w')

In [2]:
# Data Load
demo = pd.read_csv('data/LPOINT_BIG_COMP_01_DEMO.csv', low_memory=False)
pdde = pd.read_csv('data/LPOINT_BIG_COMP_02_PDDE.csv', low_memory=False)
cop = pd.read_csv('data/LPOINT_BIG_COMP_03_COP_U.csv', low_memory=False)
clac = pd.read_csv('data/LPOINT_BIG_COMP_04_PD_CLAC.csv', low_memory=False)
br = pd.read_csv('data/LPOINT_BIG_COMP_05_BR.csv', low_memory=False)
lpay = pd.read_csv('data/LPOINT_BIG_COMP_06_LPAY.csv', low_memory=False)

# Column명 변경
demo.columns = ['고객번호', '성별', '연령대', '거주지대분류코드']
pdde.columns = ['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '구매시간', '구매금액', '구매수량']
cop.columns = ['고객번호', '영수증번호', '제휴사', '점포코드', '채널구분', '이용일자', '방문일자', '이용시간', '이용금액']
clac.columns = ['상품코드', '소분류명', '대분류명', '중분류명']
br.columns = ['점포코드', '제휴사', '점포대분류코드', '점포중분류코드']
lpay.columns = ['고객번호', '영수증번호', '제휴사', '채널구분', '이용일자', '이용시간', '이용금액']

## Preprocessing

In [3]:
# 중복되는 행 제거 -> 데이터 추출시 고객, 상품 각각 추출하여 중복이 발생한 것으로 판단함
pdde.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=False)

In [4]:
# 결측치 치환 -> na값을 온라인으로 채워줌
pdde['점포코드'] = pdde['점포코드'].fillna('온라인')

In [5]:
# 결측치 치환
cop['점포코드'] = cop['점포코드'].fillna('온라인')

## Data Merge

In [6]:
# 데이터 크기 확인
print('pdde :',pdde.shape)
print('copu :',cop.shape)
print('lpay :',lpay.shape)

pdde : (4144389, 10)
copu : (248304, 9)
lpay : (353184, 7)


In [7]:
# # 칼럼별 상관관계 확인
# # 제휴사 테이블 중에서 방문일자와 이용일자는 상관관계가 상당히 높게 나타남을 확인
# # 제휴사 테이블 내 방문일자 제거
# plt.figure(figsize=(15,15))
# sns.heatmap(data = cop.corr(), annot=True,fmt = '.2f', linewidths=.5, cmap='Blues')

In [8]:
# 데이터를 통합하기 위하여 칼럼 통합
pdde['타입'] = 'Product'
cop['타입'] = 'Affiliate'
lpay['타입'] = 'Lpay'
pdde['방문일자'] = 0
lpay['방문일자'] = 0
cop['구매수량'] = 0
cop['상품코드'] = np.nan
lpay['점포코드'] = np.nan
lpay['상품코드'] = np.nan
lpay['구매수량'] = 0

In [9]:
# 칼럼명을 새롭게 동일하게 통일
cop.columns = ['고객번호', '영수증번호', '제휴사', '점포코드', '채널구분', '구매일자', '방문일자', '구매시간', '구매금액', '타입', '구매수량', '상품코드']
lpay.columns = ['고객번호', '영수증번호', '제휴사', '채널구분', '구매일자', '구매시간', '구매금액', '타입', '방문일자', '점포코드', '상품코드', '구매수량']

# 상품구매데이터를 기준으로 하여 열을 통일
cop = cop[['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '방문일자', '구매시간', '구매금액', '구매수량', '타입']]
lpay = lpay[['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '방문일자', '구매시간', '구매금액', '구매수량', '타입']]

# 데이터 타입 변경
lpay['영수증번호'] = lpay['영수증번호'].astype('object')
lpay['점포코드'] = lpay['점포코드'].astype('object')
cop['상품코드'] = cop['상품코드'].astype('object')
lpay['상품코드'] = lpay['상품코드'].astype('object')
pdde['구매금액'] = pdde['구매금액'].astype('int')
pdde['방문일자'] = pdde['방문일자'].astype('int64')
lpay['방문일자'] = lpay['방문일자'].astype('int64')

In [10]:
# 데이터 concat
df = pd.concat([pdde,cop,lpay])
# 고객 정보 테이븛 통합
df = pd.merge(df, demo, on = ['고객번호'], how = 'left')
# 상품 분류 테이블 통합
df = pd.merge(df, br, on = ['점포코드','제휴사'], how = 'left')
# 점포 정보 테이블 통합
df = pd.merge(df, clac, on = ['상품코드'], how = 'left')

In [11]:
# 결측치 확인
df.isnull().sum()

고객번호             0
영수증번호            0
채널구분             0
제휴사              0
점포코드        353184
상품코드        601488
구매일자             0
구매시간             0
구매금액             0
구매수량             0
타입               0
방문일자             0
성별               0
연령대              0
거주지대분류코드         0
점포대분류코드     729023
점포중분류코드     729023
소분류명        601488
대분류명        601488
중분류명        601488
dtype: int64

In [12]:
print('<엘페이>')
print(lpay.isnull().sum())
print('<제휴사>')
print(cop.isnull().sum())
print('<상품구매>')
print(pdde.isnull().sum())

<엘페이>
고객번호          0
영수증번호         0
채널구분          0
제휴사           0
점포코드     353184
상품코드     353184
구매일자          0
방문일자          0
구매시간          0
구매금액          0
구매수량          0
타입            0
dtype: int64
<제휴사>
고객번호          0
영수증번호         0
채널구분          0
제휴사           0
점포코드          0
상품코드     248304
구매일자          0
방문일자          0
구매시간          0
구매금액          0
구매수량          0
타입            0
dtype: int64
<상품구매>
고객번호     0
영수증번호    0
채널구분     0
제휴사      0
점포코드     0
상품코드     0
구매일자     0
구매시간     0
구매금액     0
구매수량     0
타입       0
방문일자     0
dtype: int64


In [13]:
data = df.copy()

In [14]:
# 데이터 분리
df1 = data[data['타입']== 'Product']
df2 = data[data['타입']== 'Affiliate']
df3 = data[data['타입']== 'Lpay']

### 구매빈도
- 전 구매 후 얼마 뒤 구매가 이루어졌는지

In [15]:
df['날짜'] = df['구매일자'].astype(str)
df['날짜'] = pd.to_datetime(df['날짜'])
# df['날짜'] = df['날짜'].dt.tz_localize('UTC')

In [16]:
df = df.sort_values(by=['날짜'], ascending=[True])

In [17]:
df['diff'] = df.groupby('고객번호')['날짜'].diff().fillna('0').astype(str)

In [18]:
df['diff'] = df['diff'].apply(lambda x: int(x.split()[0]))

In [19]:
df['diff'].nunique()

321

In [20]:
df

Unnamed: 0,고객번호,영수증번호,채널구분,제휴사,점포코드,상품코드,구매일자,구매시간,구매금액,구매수량,...,성별,연령대,거주지대분류코드,점포대분류코드,점포중분류코드,소분류명,대분류명,중분류명,날짜,diff
4241800,M495972460,20122713C012328716,2,C01,C010004,,20201227,13,26000,0,...,남성,40대,Z10,Z17,Z17011,,,,2020-12-27,0
4340152,M039320884,20122911C011862577,2,C01,C010117,,20201229,11,6500,0,...,여성,40대,Z06,Z06,Z06013,,,,2020-12-29,0
4168101,M192531170,20122900C013317930,2,C01,C010026,,20201229,0,10000,0,...,여성,40대,Z10,Z10,Z10011,,,,2020-12-29,0
4269174,M864132607,20123021C013512700,2,C01,C010086,,20201230,21,15000,0,...,여성,70대,Z14,Z14,Z14004,,,,2020-12-30,0
4322220,M577916024,20123012C012610437,2,C01,C010102,,20201230,12,16000,0,...,여성,30대,Z16,Z16,Z16015,,,,2020-12-30,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769902,M593010408,A03159704778,1,A03,A030337,PD0232,20211231,15,3990,1,...,남성,30대,Z10,Z17,Z17018,일반스낵,과자,스낵류,2021-12-31,0
3769903,M593010408,A03159704778,1,A03,A030337,PD1333,20211231,15,2550,2,...,남성,30대,Z10,Z17,Z17018,일반소주,주류,소주,2021-12-31,0
3769904,M593010408,A03159704778,1,A03,A030337,PD1343,20211231,15,1390,2,...,남성,30대,Z10,Z17,Z17018,막걸리,주류,전통주,2021-12-31,0
3769891,M714979893,A03158123233,1,A03,A030331,PD1249,20211231,12,3600,1,...,여성,50대,Z05,Z17,Z17018,기타냉장조리,조리식품,냉장조리,2021-12-31,0


### 구매횟수 확인

In [21]:
# 최고 많은 구매회수
df.groupby('고객번호')[['영수증번호']].count().max()

영수증번호    13120
dtype: int64

In [22]:
imp = df.groupby('고객번호')[['영수증번호']].count()

# 구매를 가장 많이한 고객의 고객번호
imp[imp['영수증번호']==13120]

Unnamed: 0_level_0,영수증번호
고객번호,Unnamed: 1_level_1
M057015266,13120


In [23]:
imp.sort_values(by=['영수증번호'], ascending=[False])

Unnamed: 0_level_0,영수증번호
고객번호,Unnamed: 1_level_1
M057015266,13120
M919374790,9442
M287960590,2491
M124357021,2397
M763326819,2286
...,...
M736399256,1
M473392527,1
M652693880,1
M627610179,1


In [24]:
imp.describe()

Unnamed: 0,영수증번호
count,29874.0
mean,158.863125
std,226.480154
min,1.0
25%,20.0
50%,83.0
75%,211.0
max,13120.0


In [25]:
df[df['고객번호'] == 'M057015266']['diff'].value_counts()

0    12821
1      258
3       24
2       17
Name: diff, dtype: int64

### 마지막 날 데이터('2021-12-31')부터 가장 마지막 구매 기록

In [26]:
df_last = df.groupby('고객번호').last()
df_last

Unnamed: 0_level_0,영수증번호,채널구분,제휴사,점포코드,상품코드,구매일자,구매시간,구매금액,구매수량,타입,...,성별,연령대,거주지대분류코드,점포대분류코드,점포중분류코드,소분류명,대분류명,중분류명,날짜,diff
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M000034966,A02359647751,1,A02,A020116,PD1025,20211223,19,39900,1,Product,...,여성,40대,Z07,Z07,Z07002,롤플레잉완구,완구,남아완구,2021-12-23,0
M000059535,21030317C012142421,2,C01,C010085,,20210303,17,46000,0,Affiliate,...,여성,30대,Z12,Z12,Z12019,,,,2021-03-03,0
M000136117,E06052098124,2,A06,온라인,PD0967,20211230,20,125300,1,Product,...,여성,30대,Z11,Z11,Z11013,여성바지,여성의류,여성의류하의,2021-12-30,14
M000201112,A04117603249,1,A04,A042949,PD1161,20211128,16,1800,2,Product,...,여성,50대,Z17,Z10,Z10006,과일음료,음료,과채음료,2021-11-28,0
M000225114,A01502713681,1,A01,A010011,PD0232,20211231,19,10000,1,Product,...,여성,40대,Z17,Z17,Z17001,일반스낵,과자,스낵류,2021-12-31,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M999708287,21090215C014224753,2,C01,C010047,,20210902,15,26000,0,Affiliate,...,여성,40대,Z10,Z10,Z10038,,,,2021-09-02,58
M999770689,A04129008648,1,A04,A044519,PD0223,20211231,22,1200,1,Product,...,여성,30대,Z16,Z16,Z16011,젤리,과자,사탕/캔디,2021-12-31,0
M999849895,A02134902896,1,A02,A020033,PD1303,20211201,15,8480,1,Product,...,여성,20대,Z04,Z04,Z04008,고추장,조미료,장류,2021-12-01,0
M999926092,21120222C012914863,2,C01,C010076,,20211202,22,10000,0,Affiliate,...,남성,30대,Z08,Z15,Z15003,,,,2021-12-02,76


In [27]:
df_last['last_day_diff'] = pd.to_datetime('2021-12-31') - df_last['날짜']

In [28]:
df_last['last_day_diff'].describe()

count                         29874
mean     40 days 08:33:15.581442056
std      68 days 09:15:51.135140269
min                 0 days 00:00:00
25%                 3 days 00:00:00
50%                10 days 00:00:00
75%                42 days 00:00:00
max               364 days 00:00:00
Name: last_day_diff, dtype: object

In [29]:
df_last['last_day_diff'].sort_values(ascending=False)

고객번호
M583636385   364 days
M286726304   364 days
M311201219   364 days
M601849823   364 days
M437975074   364 days
               ...   
M618889124     0 days
M878364274     0 days
M259007083     0 days
M258814726     0 days
M999962961     0 days
Name: last_day_diff, Length: 29874, dtype: timedelta64[ns]

In [30]:
df[df['고객번호'] == 'M286726304']

Unnamed: 0,고객번호,영수증번호,채널구분,제휴사,점포코드,상품코드,구매일자,구매시간,구매금액,구매수량,...,성별,연령대,거주지대분류코드,점포대분류코드,점포중분류코드,소분류명,대분류명,중분류명,날짜,diff
3770531,M286726304,A04000024318,1,A04,A045147,PD0381,20210101,15,4100,1,...,남성,30대,Z07,Z07,Z07002,국산담배,담배,일반담배,2021-01-01,0
3770532,M286726304,A04000024318,1,A04,A045147,PD0380,20210101,15,4500,1,...,남성,30대,Z07,Z07,Z07002,수입담배,담배,일반담배,2021-01-01,0


## CLVM: 고객 생애 가치 모델

In [79]:
df.head()

Unnamed: 0,고객번호,영수증번호,채널구분,제휴사,점포코드,상품코드,구매일자,구매시간,구매금액,구매수량,...,성별,연령대,거주지대분류코드,점포대분류코드,점포중분류코드,소분류명,대분류명,중분류명,날짜,diff
4241800,M495972460,20122713C012328716,2,C01,C010004,,20201227,13,26000,0,...,남성,40대,Z10,Z17,Z17011,,,,2020-12-27,0
4340152,M039320884,20122911C011862577,2,C01,C010117,,20201229,11,6500,0,...,여성,40대,Z06,Z06,Z06013,,,,2020-12-29,0
4168101,M192531170,20122900C013317930,2,C01,C010026,,20201229,0,10000,0,...,여성,40대,Z10,Z10,Z10011,,,,2020-12-29,0
4269174,M864132607,20123021C013512700,2,C01,C010086,,20201230,21,15000,0,...,여성,70대,Z14,Z14,Z14004,,,,2020-12-30,0
4322220,M577916024,20123012C012610437,2,C01,C010102,,20201230,12,16000,0,...,여성,30대,Z16,Z16,Z16015,,,,2020-12-30,0


In [80]:
clvm_features = []

In [81]:
# 고객별 총 구매횟수
f = df.groupby(['고객번호', '영수증번호'])['채널구분'].agg([('cnt', 'count')]).groupby('고객번호')['cnt'].agg([('구매횟수', 'sum')]).reset_index()
clvm_features.append(f); f

Unnamed: 0,고객번호,구매횟수
0,M000034966,49
1,M000059535,1
2,M000136117,121
3,M000201112,20
4,M000225114,163
...,...,...
29869,M999708287,2
29870,M999770689,402
29871,M999849895,70
29872,M999926092,7


In [82]:
# 고객별 총 구매액
f = df.groupby('고객번호')['구매금액'].agg([('구매액', 'sum')]).reset_index()
clvm_features.append(f); f

Unnamed: 0,고객번호,구매액
0,M000034966,616240
1,M000059535,46000
2,M000136117,28697031
3,M000201112,51420
4,M000225114,2084270
...,...,...
29869,M999708287,60000
29870,M999770689,2078770
29871,M999849895,1095312
29872,M999926092,129500


In [83]:
data = pd.DataFrame({'고객번호': df.고객번호.unique()})

for f in clvm_features:
        data = pd.merge(data, f, how='left', on='고객번호')
        
data = data.fillna(0)
data

Unnamed: 0,고객번호,구매횟수,구매액
0,M495972460,111,7341460
1,M039320884,391,4329720
2,M192531170,339,2962360
3,M864132607,330,5454280
4,M577916024,188,1571641
...,...,...,...
29869,M667199832,1,38500
29870,M612859933,1,3000
29871,M948334702,1,9200
29872,M514600362,1,41500


### 평균 구매 금액(Average purchase value)
- 고객의 거래당 평균 구매 금액

In [84]:
data['평균구매금액'] = data['구매액']/data['구매횟수']
data

Unnamed: 0,고객번호,구매횟수,구매액,평균구매금액
0,M495972460,111,7341460,66139.279279
1,M039320884,391,4329720,11073.452685
2,M192531170,339,2962360,8738.525074
3,M864132607,330,5454280,16528.121212
4,M577916024,188,1571641,8359.792553
...,...,...,...,...
29869,M667199832,1,38500,38500.000000
29870,M612859933,1,3000,3000.000000
29871,M948334702,1,9200,9200.000000
29872,M514600362,1,41500,41500.000000


### 평균 구매 빈도율(Average purchase frequency rate)
- 위의 구매 횟수를 해당 기간 동안 구매한 고객 수로 나누어 계산

In [74]:
data['평균구매빈도율'] = data['구매횟수']/365
data

Unnamed: 0,고객번호,구매횟수,구매액,평균구매가치,평균구매빈도율
0,M495972460,111,7341460,66139.279279,0.304110
1,M039320884,391,4329720,11073.452685,1.071233
2,M192531170,339,2962360,8738.525074,0.928767
3,M864132607,330,5454280,16528.121212,0.904110
4,M577916024,188,1571641,8359.792553,0.515068
...,...,...,...,...,...
29869,M667199832,1,38500,38500.000000,0.002740
29870,M612859933,1,3000,3000.000000,0.002740
29871,M948334702,1,9200,9200.000000,0.002740
29872,M514600362,1,41500,41500.000000,0.002740


### 고객 가치(Customer value)
- 1번에서 계산한 평균 구매 가치와 2번에서 계산한 평균 구매 빈도율을 곱하여 계산

In [78]:
data['고객가치'] = data['평균구매가치']*data['평균구매빈도율']
data

Unnamed: 0,고객번호,구매횟수,구매액,평균구매가치,평균구매빈도율,고객가치
0,M495972460,111,7341460,66139.279279,0.304110,20113.589041
1,M039320884,391,4329720,11073.452685,1.071233,11862.246575
2,M192531170,339,2962360,8738.525074,0.928767,8116.054795
3,M864132607,330,5454280,16528.121212,0.904110,14943.232877
4,M577916024,188,1571641,8359.792553,0.515068,4305.865753
...,...,...,...,...,...,...
29869,M667199832,1,38500,38500.000000,0.002740,105.479452
29870,M612859933,1,3000,3000.000000,0.002740,8.219178
29871,M948334702,1,9200,9200.000000,0.002740,25.205479
29872,M514600362,1,41500,41500.000000,0.002740,113.698630


### 평균 고객 수명(Average customer lifespan)
- 고객들이 계속 구매하는 년 수의 평균값을 계산

### 고객 생애 가치(CLTV)
- 4번에서 계산한 고객 가치에 5번의 평균 고객 수명을 곱하여 계산
- 이렇게 하면 평균적으로 각 개별 고객들이 비즈니스에 안겨다 줄 수 있는 기대 수익을 계산할 수 있음