## Import library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from matplotlib import font_manager, rc
%matplotlib inline

import platform
your_os = platform.system()
if your_os == 'Linux':
    rc('font', family='NanumGothic')
elif your_os == 'Windows':
    ttf = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=ttf).get_name()
    rc('font', family=font_name)
elif your_os == 'Darwin':
    rc('font', family='AppleGothic')
rc('axes', unicode_minus=False)


# 배경색 흰색으로 맞춰줌
import matplotlib as mpl
mpl.rc('figure', facecolor = 'w', edgecolor ='w')

In [2]:
# Data Load
demo = pd.read_csv('data/LPOINT_BIG_COMP_01_DEMO.csv', low_memory=False)
pdde = pd.read_csv('data/LPOINT_BIG_COMP_02_PDDE.csv', low_memory=False)
cop = pd.read_csv('data/LPOINT_BIG_COMP_03_COP_U.csv', low_memory=False)
clac = pd.read_csv('data/LPOINT_BIG_COMP_04_PD_CLAC.csv', low_memory=False)
br = pd.read_csv('data/LPOINT_BIG_COMP_05_BR.csv', low_memory=False)
lpay = pd.read_csv('data/LPOINT_BIG_COMP_06_LPAY.csv', low_memory=False)

# Column명 변경
demo.columns = ['고객번호', '성별', '연령대', '거주지대분류코드']
pdde.columns = ['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '구매시간', '구매금액', '구매수량']
cop.columns = ['고객번호', '영수증번호', '제휴사', '점포코드', '채널구분', '이용일자', '방문일자', '이용시간', '이용금액']
clac.columns = ['상품코드', '소분류명', '대분류명', '중분류명']
br.columns = ['점포코드', '제휴사', '점포대분류코드', '점포중분류코드']
lpay.columns = ['고객번호', '영수증번호', '제휴사', '채널구분', '이용일자', '이용시간', '이용금액']

## Preprocessing

In [3]:
# 중복되는 행 제거 -> 데이터 추출시 고객, 상품 각각 추출하여 중복이 발생한 것으로 판단함
pdde.drop_duplicates(subset=None, keep='first', inplace=True, ignore_index=False)

In [4]:
# 결측치 치환 -> na값을 온라인으로 채워줌
pdde['점포코드'] = pdde['점포코드'].fillna('온라인')

In [5]:
# 결측치 치환
cop['점포코드'] = cop['점포코드'].fillna('온라인')

## Data Merge

In [6]:
# 데이터 크기 확인
print('pdde :',pdde.shape)
print('copu :',cop.shape)
print('lpay :',lpay.shape)

pdde : (4144389, 10)
copu : (248304, 9)
lpay : (353184, 7)


In [7]:
# # 칼럼별 상관관계 확인
# # 제휴사 테이블 중에서 방문일자와 이용일자는 상관관계가 상당히 높게 나타남을 확인
# # 제휴사 테이블 내 방문일자 제거
# plt.figure(figsize=(15,15))
# sns.heatmap(data = cop.corr(), annot=True,fmt = '.2f', linewidths=.5, cmap='Blues')

In [8]:
# 데이터를 통합하기 위하여 칼럼 통합
pdde['타입'] = 'Product'
cop['타입'] = 'Affiliate'
lpay['타입'] = 'Lpay'
pdde['방문일자'] = 0
lpay['방문일자'] = 0
cop['구매수량'] = 0
cop['상품코드'] = np.nan
lpay['점포코드'] = np.nan
lpay['상품코드'] = np.nan
lpay['구매수량'] = 0

In [9]:
# 칼럼명을 새롭게 동일하게 통일
cop.columns = ['고객번호', '영수증번호', '제휴사', '점포코드', '채널구분', '구매일자', '방문일자', '구매시간', '구매금액', '타입', '구매수량', '상품코드']
lpay.columns = ['고객번호', '영수증번호', '제휴사', '채널구분', '구매일자', '구매시간', '구매금액', '타입', '방문일자', '점포코드', '상품코드', '구매수량']

# 상품구매데이터를 기준으로 하여 열을 통일
cop = cop[['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '방문일자', '구매시간', '구매금액', '구매수량', '타입']]
lpay = lpay[['고객번호', '영수증번호', '채널구분', '제휴사', '점포코드', '상품코드', '구매일자', '방문일자', '구매시간', '구매금액', '구매수량', '타입']]

# 데이터 타입 변경
lpay['영수증번호'] = lpay['영수증번호'].astype('object')
lpay['점포코드'] = lpay['점포코드'].astype('object')
cop['상품코드'] = cop['상품코드'].astype('object')
lpay['상품코드'] = lpay['상품코드'].astype('object')
pdde['구매금액'] = pdde['구매금액'].astype('int')
pdde['방문일자'] = pdde['방문일자'].astype('int64')
lpay['방문일자'] = lpay['방문일자'].astype('int64')

In [10]:
# 데이터 concat
df = pd.concat([pdde,cop,lpay])
# 고객 정보 테이븛 통합
df = pd.merge(df, demo, on = ['고객번호'], how = 'left')
# 상품 분류 테이블 통합
df = pd.merge(df, br, on = ['점포코드','제휴사'], how = 'left')
# 점포 정보 테이블 통합
df = pd.merge(df, clac, on = ['상품코드'], how = 'left')

In [11]:
# 결측치 확인
df.isnull().sum()

고객번호             0
영수증번호            0
채널구분             0
제휴사              0
점포코드        353184
상품코드        601488
구매일자             0
구매시간             0
구매금액             0
구매수량             0
타입               0
방문일자             0
성별               0
연령대              0
거주지대분류코드         0
점포대분류코드     729023
점포중분류코드     729023
소분류명        601488
대분류명        601488
중분류명        601488
dtype: int64

In [12]:
print('<엘페이>')
print(lpay.isnull().sum())
print('<제휴사>')
print(cop.isnull().sum())
print('<상품구매>')
print(pdde.isnull().sum())

<엘페이>
고객번호          0
영수증번호         0
채널구분          0
제휴사           0
점포코드     353184
상품코드     353184
구매일자          0
방문일자          0
구매시간          0
구매금액          0
구매수량          0
타입            0
dtype: int64
<제휴사>
고객번호          0
영수증번호         0
채널구분          0
제휴사           0
점포코드          0
상품코드     248304
구매일자          0
방문일자          0
구매시간          0
구매금액          0
구매수량          0
타입            0
dtype: int64
<상품구매>
고객번호     0
영수증번호    0
채널구분     0
제휴사      0
점포코드     0
상품코드     0
구매일자     0
구매시간     0
구매금액     0
구매수량     0
타입       0
방문일자     0
dtype: int64


In [13]:
data = df.copy()

In [14]:
# 데이터 분리
df1 = data[data['타입']== 'Product']
df2 = data[data['타입']== 'Affiliate']
df3 = data[data['타입']== 'Lpay']

In [15]:
df

Unnamed: 0,고객번호,영수증번호,채널구분,제휴사,점포코드,상품코드,구매일자,구매시간,구매금액,구매수량,타입,방문일자,성별,연령대,거주지대분류코드,점포대분류코드,점포중분류코드,소분류명,대분류명,중분류명
0,M430112881,A01000001113,1,A01,A010039,PD0290,20210101,10,15000,1,Product,0,여성,50대,Z17,Z10,Z10042,남성티셔츠,남성의류,남성의류상의
1,M646853852,A01000002265,1,A01,A010025,PD1369,20210101,10,79700,1,Product,0,여성,40대,Z13,Z13,Z13001,기타주방일회용품,주방잡화,주방일회용품
2,M430112881,A01000003148,1,A01,A010039,PD0290,20210101,10,19000,1,Product,0,여성,50대,Z17,Z10,Z10042,남성티셔츠,남성의류,남성의류상의
3,M430112881,A01000004946,1,A01,A010039,PD0290,20210101,10,19000,1,Product,0,여성,50대,Z17,Z10,Z10042,남성티셔츠,남성의류,남성의류상의
4,M430112881,A01000005297,1,A01,A010039,PD1692,20210101,10,9900,1,Product,0,여성,50대,Z17,Z10,Z10042,커피/음료,테넌트/음식점,식당
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4745872,M470112503,210125110644545,2,A06,,,20210125,11,51800,0,Lpay,0,여성,50대,Z16,,,,,
4745873,M470112503,210104210652271,2,A06,,,20210104,21,6440,0,Lpay,0,여성,50대,Z16,,,,,
4745874,M748878049,210119200278240,1,A02,,,20210119,20,17600,0,Lpay,0,여성,30대,Z10,,,,,
4745875,M748878049,210119180267206,1,A02,,,20210119,18,12400,0,Lpay,0,여성,30대,Z10,,,,,


### 최종 접속 경과일 (최종 구매 경과일)

In [16]:
df['날짜'] = df['구매일자'].astype(str)
df['날짜'] = pd.to_datetime(df['날짜'])
# df['날짜'] = df['날짜'].dt.tz_localize('UTC')

In [17]:
df = df.sort_values(by=['날짜'], ascending=[True])

In [21]:
imp = df.groupby(['고객번호', '날짜'])[['채널구분']].count().reset_index().groupby('고객번호')['날짜'].agg([('구매일수', 'count')]).reset_index()
imp

Unnamed: 0,고객번호,구매일수
0,M000034966,8
1,M000059535,1
2,M000136117,45
3,M000201112,4
4,M000225114,43
...,...,...
29869,M999708287,2
29870,M999770689,101
29871,M999849895,20
29872,M999926092,7


In [85]:
# 딱 하루만 구매한 고객들
oneday_buyer = list(imp[imp['구매일수']==1]['고객번호'])

In [56]:
# 고객별 구매 날짜
imp2 = df.groupby(['고객번호', '날짜'])[['채널구분']].count().drop('채널구분', axis=1).reset_index()
imp2

Unnamed: 0,고객번호,날짜
0,M000034966,2021-01-16
1,M000034966,2021-03-25
2,M000034966,2021-03-28
3,M000034966,2021-06-29
4,M000034966,2021-08-29
...,...,...
1129845,M999962961,2021-12-19
1129846,M999962961,2021-12-20
1129847,M999962961,2021-12-23
1129848,M999962961,2021-12-30


In [57]:
# 고객별 최근 구매일자 상위 2일 
f = imp2.groupby('고객번호').apply(lambda x: x.nlargest(2, '날짜'))
f

Unnamed: 0_level_0,Unnamed: 1_level_0,고객번호,날짜
고객번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M000034966,7,M000034966,2021-12-23
M000034966,6,M000034966,2021-10-30
M000059535,8,M000059535,2021-03-03
M000136117,53,M000136117,2021-12-30
M000136117,52,M000136117,2021-12-16
...,...,...,...
M999849895,1129747,M999849895,2021-10-15
M999926092,1129755,M999926092,2021-12-02
M999926092,1129754,M999926092,2021-09-17
M999962961,1129849,M999962961,2021-12-31


In [86]:
imp3 = f[['날짜']].reset_index().drop('level_1', axis=1)
imp3

Unnamed: 0,고객번호,날짜
0,M000034966,2021-12-23
1,M000034966,2021-10-30
2,M000059535,2021-03-03
3,M000136117,2021-12-30
4,M000136117,2021-12-16
...,...,...
57578,M999849895,2021-10-15
57579,M999926092,2021-12-02
57580,M999926092,2021-09-17
57581,M999962961,2021-12-31


In [87]:
# 최종구매경과일을 구했으나, 딱 하루만 구매한 고객들은 2021.12.31에서 빼서 구해줘야함

diff_final_buy = imp3.groupby('고객번호')['날짜'].diff().fillna('0').astype(str)
diff_final_buy = diff_final_buy.apply(lambda x: abs(int(x.split()[0])))
imp3['최종구매경과일'] = diff_final_buy
imp3

Unnamed: 0,고객번호,날짜,최종구매경과일
0,M000034966,2021-12-23,0
1,M000034966,2021-10-30,54
2,M000059535,2021-03-03,0
3,M000136117,2021-12-30,0
4,M000136117,2021-12-16,14
...,...,...,...
57578,M999849895,2021-10-15,47
57579,M999926092,2021-12-02,0
57580,M999926092,2021-09-17,76
57581,M999962961,2021-12-31,0


In [138]:
# 고객별 최종 구매경과일 데이터프레임 구성
result = imp3.groupby('고객번호')[['최종구매경과일']].sum().reset_index()
result

Unnamed: 0,고객번호,최종구매경과일
0,M000034966,54
1,M000059535,0
2,M000136117,14
3,M000201112,107
4,M000225114,6
...,...,...
29869,M999708287,58
29870,M999770689,13
29871,M999849895,47
29872,M999926092,76


In [139]:
result[result['최종구매경과일']==0]

Unnamed: 0,고객번호,최종구매경과일
1,M000059535,0
15,M000658311,0
17,M000713279,0
19,M000859319,0
24,M001086020,0
...,...,...
29766,M996060733,0
29770,M996092967,0
29808,M997346183,0
29818,M997821980,0


In [140]:
# 딱 하루만 구매한 고객들
oneday_customer = imp3[imp3['고객번호'].isin(oneday_buyer)]
oneday_customer

Unnamed: 0,고객번호,날짜,최종구매경과일
2,M000059535,2021-03-03,0
29,M000658311,2021-07-10,0
32,M000713279,2021-06-20,0
35,M000859319,2021-07-21,0
44,M001086020,2021-02-20,0
...,...,...,...
57372,M996060733,2021-06-14,0
57379,M996092967,2021-12-01,0
57454,M997346183,2021-10-31,0
57473,M997821980,2021-12-25,0


In [141]:
oneday_customer['최종구매경과일'] = oneday_customer['날짜'].apply(lambda x: pd.to_datetime('2021-12-31')-x).dt.days
oneday_customer.drop('날짜', axis=1, inplace=True)
oneday_customer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oneday_customer['최종구매경과일'] = oneday_customer['날짜'].apply(lambda x: pd.to_datetime('2021-12-31')-x).dt.days
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,고객번호,최종구매경과일
2,M000059535,303
29,M000658311,174
32,M000713279,194
35,M000859319,163
44,M001086020,314
...,...,...
57372,M996060733,200
57379,M996092967,30
57454,M997346183,61
57473,M997821980,6


In [142]:
result = pd.merge(result, oneday_customer, how='left', on='고객번호')
result

Unnamed: 0,고객번호,최종구매경과일_x,최종구매경과일_y
0,M000034966,54,
1,M000059535,0,303.0
2,M000136117,14,
3,M000201112,107,
4,M000225114,6,
...,...,...,...
29869,M999708287,58,
29870,M999770689,13,
29871,M999849895,47,
29872,M999926092,76,


In [144]:
result.shape[0]

29874

In [145]:
for i in range(result.shape[0]):
    if result['최종구매경과일_x'][i] == 0:
        result.loc[i,'최종구매경과일_x'] = result.loc[i,'최종구매경과일_y']

In [146]:
result.drop('최종구매경과일_y', axis=1, inplace=True)
result.rename(columns={'최종구매경과일_x': '최종구매경과일'}, inplace=True)
result

Unnamed: 0,고객번호,최종구매경과일
0,M000034966,54.0
1,M000059535,303.0
2,M000136117,14.0
3,M000201112,107.0
4,M000225114,6.0
...,...,...
29869,M999708287,58.0
29870,M999770689,13.0
29871,M999849895,47.0
29872,M999926092,76.0


### 평균 구매일 주기

In [159]:
diff_final_buy = df.groupby('고객번호')['날짜'].diff().fillna('0').astype(str)
diff_final_buy = diff_final_buy.apply(lambda x: abs(int(x.split()[0])))
df['diff'] = diff_final_buy

In [163]:
f = df.groupby(['고객번호'])[['diff']].sum().reset_index()
result['구매간격합'] = f['diff']
result['구매일수'] = imp['구매일수']

In [167]:
result['구매접속간격'] = result['구매일수']-1
result

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격
0,M000034966,54.0,341,8,7
1,M000059535,303.0,0,1,0
2,M000136117,14.0,360,45,44
3,M000201112,107.0,126,4,3
4,M000225114,6.0,304,43,42
...,...,...,...,...,...
29869,M999708287,58.0,58,2,1
29870,M999770689,13.0,349,101,100
29871,M999849895,47.0,300,20,19
29872,M999926092,76.0,327,7,6


In [174]:
# 평균구매주기
mean_buy_cycle = [] 

for i in range(result.shape[0]):
    try:
        mean_buy_cycle.append(result['구매간격합'][i]/result['구매접속간격'][i])

    except ZeroDivisionError:
        mean_buy_cycle.append(1)
        continue

len(mean_buy_cycle)

  mean_buy_cycle.append(result['구매간격합'][i]/result['구매접속간격'][i])


29874

In [178]:
result['평균구매주기'] = mean_buy_cycle
result

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기
0,M000034966,54.0,341,8,7,48.714286
1,M000059535,303.0,0,1,0,
2,M000136117,14.0,360,45,44,8.181818
3,M000201112,107.0,126,4,3,42.000000
4,M000225114,6.0,304,43,42,7.238095
...,...,...,...,...,...,...
29869,M999708287,58.0,58,2,1,58.000000
29870,M999770689,13.0,349,101,100,3.490000
29871,M999849895,47.0,300,20,19,15.789474
29872,M999926092,76.0,327,7,6,54.500000


In [179]:
# 딱 하루 구매한 사람들은 평균구매주기를 1로 대체
result['평균구매주기'].fillna(1, inplace=True)
result

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기
0,M000034966,54.0,341,8,7,48.714286
1,M000059535,303.0,0,1,0,1.000000
2,M000136117,14.0,360,45,44,8.181818
3,M000201112,107.0,126,4,3,42.000000
4,M000225114,6.0,304,43,42,7.238095
...,...,...,...,...,...,...
29869,M999708287,58.0,58,2,1,58.000000
29870,M999770689,13.0,349,101,100,3.490000
29871,M999849895,47.0,300,20,19,15.789474
29872,M999926092,76.0,327,7,6,54.500000


## 이탈 위험 비율
- [최종구매경과일] / [평균구매주기]

In [182]:
result['이탈위험비율'] = result['최종구매경과일']/result['평균구매주기']

## K-Means Clustering

In [183]:
data = result[['이탈위험비율']]
data

Unnamed: 0,이탈위험비율
0,1.108504
1,303.000000
2,1.711111
3,2.547619
4,0.828947
...,...
29869,1.000000
29870,3.724928
29871,2.976667
29872,1.394495


In [184]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_scale = scaler.fit_transform(data)

In [201]:
from sklearn.cluster import KMeans

# 그룹 수, random_state 설정
model = KMeans(n_clusters = 2, random_state = 42)

# 정규화된 데이터에 학습
model.fit(data_scale)

# 클러스터링 결과 각 데이터가 몇 번째 그룹에 속하는지 저장
result['cluster'] = model.fit_predict(data_scale)

In [202]:
# kmean_transformed = model.fit_transform(data_scale)
# result['pca_x'] = kmean_transformed[:,0]
# result['pca_y'] = kmean_transformed[:,1]

In [203]:
result['cluster'].value_counts()

0    28498
1     1376
Name: cluster, dtype: int64

In [204]:
result[result['cluster'] == 0]

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
0,M000034966,54.0,341,8,7,48.714286,1.108504,0,0.004479,0.650917
2,M000136117,14.0,360,45,44,8.181818,1.711111,0,0.002823,0.649261
3,M000201112,107.0,126,4,3,42.000000,2.547619,0,0.000525,0.646963
4,M000225114,6.0,304,43,42,7.238095,0.828947,0,0.005247,0.651685
5,M000261625,29.0,336,22,21,16.000000,1.812500,0,0.002545,0.648983
...,...,...,...,...,...,...,...,...,...,...
29869,M999708287,58.0,58,2,1,58.000000,1.000000,0,0.004777,0.651215
29870,M999770689,13.0,349,101,100,3.490000,3.724928,0,0.002709,0.643729
29871,M999849895,47.0,300,20,19,15.789474,2.976667,0,0.000654,0.645784
29872,M999926092,76.0,327,7,6,54.500000,1.394495,0,0.003693,0.650131


In [246]:
result[result['cluster'] == 0]['이탈위험비율'].max()

120.0

In [208]:
result[(result['cluster'] == 0) & (result['구매일수']==1)]

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
45,M001719436,68.0,0,1,0,1.0,68.0,0,0.179289,0.467149
52,M001964634,71.0,0,1,0,1.0,71.0,0,0.187531,0.458907
74,M003213142,9.0,0,1,0,1.0,9.0,0,0.017201,0.629237
187,M006712731,120.0,0,1,0,1.0,120.0,0,0.322146,0.324292
294,M010224238,61.0,0,1,0,1.0,61.0,0,0.160058,0.486380
...,...,...,...,...,...,...,...,...,...,...
29647,M991781797,68.0,0,1,0,1.0,68.0,0,0.179289,0.467149
29690,M993430415,64.0,0,1,0,1.0,64.0,0,0.168300,0.478138
29770,M996092967,30.0,0,1,0,1.0,30.0,0,0.074894,0.571544
29808,M997346183,61.0,0,1,0,1.0,61.0,0,0.160058,0.486380


In [209]:
result[(result['cluster'] == 0) & (result['구매일수']==1)]['최종구매경과일'].max()

120.0

In [205]:
result[result['cluster'] == 1]

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
1,M000059535,303.0,0,1,0,1.0,303.0,1,0.824894,0.178456
15,M000658311,174.0,0,1,0,1.0,174.0,1,0.470498,0.175940
17,M000713279,194.0,0,1,0,1.0,194.0,1,0.525443,0.120995
19,M000859319,163.0,0,1,0,1.0,163.0,1,0.440278,0.206160
24,M001086020,314.0,0,1,0,1.0,314.0,1,0.855113,0.208675
...,...,...,...,...,...,...,...,...,...,...
29721,M994285756,214.0,0,1,0,1.0,214.0,1,0.580388,0.066050
29727,M994550219,361.0,0,1,0,1.0,361.0,1,0.984234,0.337796
29752,M995528586,330.0,0,1,0,1.0,330.0,1,0.899069,0.252631
29766,M996060733,200.0,0,1,0,1.0,200.0,1,0.541927,0.104511


In [206]:
result[result['cluster'] == 1]['구매일수'].value_counts()

1    1376
Name: 구매일수, dtype: int64

In [245]:
result[result['cluster'] == 1]['이탈위험비율'].min()

121.0

In [210]:
result[(result['cluster'] == 1) & (result['구매일수']==1)]['최종구매경과일'].min()

121.0

- 클러스터링 결과, 구매일수가 딱 하루인 고객들의 대부분이 이탈고객으로 분류되었음
- 최종구매경과일이 120일인 고객까지는 비이탈고객으로 분류됨 -> 121일인 고객부터 이탈고객판정
- 구매일수가 딱 하루인 고객들을 제외하고 계산할 필요성이 보임

## 구매일수가 딱 하루인 고객들을 제외한 K-means clustering

In [211]:
no_oncebuyer = result[result['구매일수'] != 1]
no_oncebuyer

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
0,M000034966,54.0,341,8,7,48.714286,1.108504,0,0.004479,0.650917
2,M000136117,14.0,360,45,44,8.181818,1.711111,0,0.002823,0.649261
3,M000201112,107.0,126,4,3,42.000000,2.547619,0,0.000525,0.646963
4,M000225114,6.0,304,43,42,7.238095,0.828947,0,0.005247,0.651685
5,M000261625,29.0,336,22,21,16.000000,1.812500,0,0.002545,0.648983
...,...,...,...,...,...,...,...,...,...,...
29869,M999708287,58.0,58,2,1,58.000000,1.000000,0,0.004777,0.651215
29870,M999770689,13.0,349,101,100,3.490000,3.724928,0,0.002709,0.643729
29871,M999849895,47.0,300,20,19,15.789474,2.976667,0,0.000654,0.645784
29872,M999926092,76.0,327,7,6,54.500000,1.394495,0,0.003693,0.650131


In [224]:
data = no_oncebuyer[['이탈위험비율']].reset_index(drop=True)
data

Unnamed: 0,이탈위험비율
0,1.108504
1,1.711111
2,2.547619
3,0.828947
4,1.812500
...,...
27704,1.000000
27705,3.724928
27706,2.976667
27707,1.394495


In [225]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_scale = scaler.fit_transform(data)

In [226]:
from sklearn.cluster import KMeans

# 그룹 수, random_state 설정
model = KMeans(n_clusters = 2, random_state = 42)

# 정규화된 데이터에 학습
model.fit(data_scale)

# 클러스터링 결과 각 데이터가 몇 번째 그룹에 속하는지 저장
no_oncebuyer['cluster'] = model.fit_predict(data_scale)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_oncebuyer['cluster'] = model.fit_predict(data_scale)


In [227]:
no_oncebuyer['cluster'].value_counts()

0    25078
1     2631
Name: cluster, dtype: int64

In [228]:
no_oncebuyer[no_oncebuyer['cluster'] == 0]

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
0,M000034966,54.0,341,8,7,48.714286,1.108504,0,0.004479,0.650917
2,M000136117,14.0,360,45,44,8.181818,1.711111,0,0.002823,0.649261
3,M000201112,107.0,126,4,3,42.000000,2.547619,0,0.000525,0.646963
4,M000225114,6.0,304,43,42,7.238095,0.828947,0,0.005247,0.651685
5,M000261625,29.0,336,22,21,16.000000,1.812500,0,0.002545,0.648983
...,...,...,...,...,...,...,...,...,...,...
29866,M999515910,20.0,353,23,22,16.045455,1.246459,0,0.004100,0.650538
29868,M999673157,2.0,312,13,12,26.000000,0.076923,0,0.007313,0.653751
29869,M999708287,58.0,58,2,1,58.000000,1.000000,0,0.004777,0.651215
29872,M999926092,76.0,327,7,6,54.500000,1.394495,0,0.003693,0.650131


In [241]:
no_oncebuyer[(no_oncebuyer['cluster'] == 0)]['이탈위험비율'].max()

2.689855072463768

In [233]:
no_oncebuyer[no_oncebuyer['cluster'] == 1]

Unnamed: 0,고객번호,최종구매경과일,구매간격합,구매일수,구매접속간격,평균구매주기,이탈위험비율,cluster,pca_x,pca_y
16,M000705571,31.0,338,55,54,6.259259,4.952663,1,0.006082,0.640356
18,M000803099,246.0,260,4,3,86.666667,2.838462,1,0.000274,0.646164
22,M000986912,27.0,350,42,41,8.536585,3.162857,1,0.001165,0.645273
43,M001698984,39.0,136,14,13,10.461538,3.727941,1,0.002718,0.643720
44,M001714196,18.0,352,86,85,4.141176,4.346591,1,0.004417,0.642021
...,...,...,...,...,...,...,...,...,...,...
29850,M999101603,108.0,346,11,10,34.600000,3.121387,1,0.001051,0.645387
29852,M999105944,61.0,250,20,19,13.157895,4.636000,1,0.005212,0.641226
29867,M999599111,17.0,316,73,72,4.388889,3.873418,1,0.003117,0.643321
29870,M999770689,13.0,349,101,100,3.490000,3.724928,1,0.002709,0.643729


In [240]:
no_oncebuyer[(no_oncebuyer['cluster'] == 1)]['이탈위험비율'].min()

2.691131498470948