

# 基于 RFM 的 P2P 用户聚类模型



### 22.3.1 获取数据

In [9]:
# 导入需要的库，并读取 csv 文件。

import pandas as pd
data = pd.read_csv('data.csv', encoding = 'utf-8') 

data.head(5)

Unnamed: 0,ID,RECENT_DATE,TRANS_NUM,AVG_TRANS_AMT,TRANS_NUM_BEFORE
0,4356689447,20180125,,0.0,0.0
1,5985727751,20180125,0.0,0.0,
2,8221439519,20171211,0.0,0.0,0.0
3,6255360379,20171226,0.0,0.0,0.0
4,5985726631,20171203,0.0,0.0,0.0


### 22.3.2 数据预处理

In [10]:
# 过滤掉用户 ID 为空的用户投资记录。

data = data[data['ID'].notnull()]

# 投资金额或投资次数为空值，统一修正为 0。

data = data.fillna(0)

# 保存清洗后的数据。

data.to_csv('clean_data.csv') 

data.head(5)

Unnamed: 0,ID,RECENT_DATE,TRANS_NUM,AVG_TRANS_AMT,TRANS_NUM_BEFORE
0,4356689447,20180125,0.0,0.0,0.0
1,5985727751,20180125,0.0,0.0,0.0
2,8221439519,20171211,0.0,0.0,0.0
3,6255360379,20171226,0.0,0.0,0.0
4,5985726631,20171203,0.0,0.0,0.0


### 22.3.3.	计算 RFM 指标

In [37]:
# 读入清洗后的用户投资数据。

data = pd.read_csv('clean_data.csv', encoding = 'utf-8')

# 标记流失用户和沉默用户。

data['RFM'] = ''   
data['USR_TYPE'] = ''

# 流失用户 ( L ) 即为观测时间前曾有投资交易记录，但观测时间段内无投资记录的用户。

data.ix[(data.TRANS_NUM_BEFORE > 0) & (data.TRANS_NUM == 0),['USR_TYPE']] = 'L'    
data.ix[(data.TRANS_NUM_BEFORE > 0) & (data.TRANS_NUM == 0),['RFM']] = '000'

# 沉默用户 ( S ) 即为注册开户至今，从未有过任何一笔投资交易记录的用户。

data.ix[(data.TRANS_NUM_BEFORE == 0) & (data.TRANS_NUM == 0),['USR_TYPE']] = 'S'
data.ix[(data. TRANS_NUM_BEFORE == 0) & (data.TRANS_NUM == 0),['RFM']] = '000'

# 计算最近交易时间距离 2018 年 2 月 14 日的天数。

data['DAYS'] =  [(pd.to_datetime('02/14/2018')- pd.to_datetime(str(t))).days 
                   for t in data['RECENT_DATE']] 


# 计算观测窗内有投资交易记录的用户的 R,F,M 指标。

data['R'] = data[data.TRANS_NUM > 0]['DAYS']
data['F'] = data[data.TRANS_NUM > 0]['TRANS_NUM']
data['M'] = data[data.TRANS_NUM > 0]['AVG_TRANS_AMT']

# 筛选有效 R,F,M 指标存入单独 dataframe，用于后续建立模型。

data_RFM = data[['R','F','M']].dropna()

data


Unnamed: 0.1,Unnamed: 0,ID,RECENT_DATE,TRANS_NUM,AVG_TRANS_AMT,TRANS_NUM_BEFORE,RFM,USR_TYPE,DAYS,R,F,M
0,0,4356689447,20180125,0.0,0.000000e+00,0.0,000,S,20,,,
1,1,5985727751,20180125,0.0,0.000000e+00,0.0,000,S,20,,,
2,2,8221439519,20171211,0.0,0.000000e+00,0.0,000,S,65,,,
3,3,6255360379,20171226,0.0,0.000000e+00,0.0,000,S,50,,,
4,4,5985726631,20171203,0.0,0.000000e+00,0.0,000,S,73,,,
5,5,6255360403,20180109,0.0,0.000000e+00,0.0,000,S,36,,,
6,6,8221440591,20171208,0.0,0.000000e+00,0.0,000,S,68,,,
7,7,8221439459,20171208,0.0,0.000000e+00,1.0,000,L,68,,,
8,8,4373540587,20171204,0.0,0.000000e+00,0.0,000,S,72,,,
9,9,8221438327,20171122,0.0,0.000000e+00,0.0,000,S,84,,,


## 22.3.4.	数据标准化

In [38]:
from sklearn import preprocessing
from pandas import DataFrame

min_max_scaler = preprocessing.MinMaxScaler()
data_minmax = pd.DataFrame(min_max_scaler.fit_transform(data_RFM),
                           columns = ['R','F','M'])

# 零均值标准化。
data_scale = pd.DataFrame(preprocessing.scale(data_RFM),
                          columns = ['R','F','M']) 

data_scale

Unnamed: 0,R,F,M
0,-0.481776,-0.117034,-0.189368
1,-0.579722,-0.193889,-0.156358
2,2.554565,-0.203548,-0.186256
3,-0.628696,-0.195569,-0.186276
4,-0.530749,-0.183809,-0.186380
5,0.203849,-0.192629,-0.186007
6,-0.481776,-0.022540,-0.189368
7,-1.069455,-0.093095,-0.159314
8,0.693582,-0.202708,-0.188024
9,1.183314,0.123611,-0.185119


##  22.3.5.	K-means 聚类最优 K 值选择

In [40]:
import numpy as np
from sklearn.cluster import KMeans

# 设置模型参数，其中 n_cluster 为类别的个数，并开始训练。

model = KMeans(n_clusters = 8)
model.fit(data_scale)

# 对每一类用户分别算出其 R、F、M 指标的相应平均值，高于平均值则赋值为 1，低于平均值则赋值为 0,R 指标则相反。

avg_RFM = [np.mean(data_scale.R),np.mean(data_scale.F), np.mean(data_scale.M)] 
RFM = pd.DataFrame()
RFM['R_type'] = ['1' if value[0] <= avg_RFM[0] else '0' 
                    for value in model.cluster_centers_]
RFM['F_type'] = ['1' if value[1] >= avg_RFM[0] else '0' 
                    for value in model.cluster_centers_]
RFM['M_type'] = ['1' if value[2] >= avg_RFM[0] else '0' 
                    for value in model.cluster_centers_]

# 将 RFM 三个指标连接起来。

RFM['RFM'] = RFM.R_type + RFM.F_type + RFM.M_type

# 最后，得到每个用户通过聚类模型预测的类型结果。

data.ix[data.TRANS_NUM > 0,['RFM']] = [RFM.ix[i,'RFM']  for i in model.labels_]


In [42]:
RFM

Unnamed: 0,R_type,F_type,M_type,RFM
0,0,0,0,0
1,1,0,0,100
2,1,1,0,110
3,0,1,1,11
4,0,0,0,0
5,1,1,1,111
6,1,1,1,111
7,1,1,1,111


In [44]:
data.RFM

0      000
1      000
2      000
3      000
4      000
5      000
6      000
7      000
8      000
9      000
10     000
11     000
12     000
13     000
14     000
15     100
16     000
17     100
18     000
19     000
20     000
21     100
22     000
23     100
24     000
25     000
26     000
27     000
28     000
29     000
      ... 
970    000
971    000
972    000
973    000
974    000
975    000
976    000
977    000
978    000
979    000
980    000
981    000
982    000
983    000
984    100
985    000
986    100
987    000
988    000
989    000
990    000
991    000
992    000
993    000
994    100
995    000
996    000
997    000
998    100
999    100
Name: RFM, dtype: object