# 脱敏数据

- 目的  
  跟原数据不一样  
- 方法
  1. 枚举类型，重新生成枚举数值
  2. 连续数字，则需要不同的函数变换  
    2.1 时间类型，固定加减一些数  
    2.2 金额类型，做线性变换最为稳妥  
    等等

## 读入数据


In [1]:
import pandas as pd
import numpy as np
from collections import Counter

df_info = pd.read_csv('../data/new_data/user_detail.csv')
df_bank = pd.read_csv('../data/new_data/bank_detail.csv')
df_bill = pd.read_csv('../data/new_data/bill_detail.csv')
df_browse = pd.read_csv('../data/new_data/browse_detail.csv')
df_overdue = pd.read_csv('../data/new_data/overdue.csv')

In [2]:
df_info.head()

Unnamed: 0,用户标识,用户性别,用户职业,用户教育程度,用户婚姻状态,用户户口类型
0,1,1,2,3,1,3
1,2,1,2,3,2,1
2,3,1,4,4,1,4
3,4,1,4,4,3,2
4,5,1,2,2,3,1


In [3]:
df_bank.head()

Unnamed: 0,用户标识,流水时间,交易类型,交易金额,工资收入标记
0,6965,5894316387,0,13.756664,0
1,6965,5894321388,1,13.756664,0
2,6965,5897553564,0,14.44981,0
3,6965,5897563463,1,10.527763,0
4,6965,5897564598,1,13.651303,0


In [4]:
df_bill.head()

Unnamed: 0,银行标识,用户标识,账单时间戳,上期账单金额,上期还款金额,信用卡额度,本期账单余额,还款状态
0,16.0,2,5908251000.0,21.580308,21.600903,0.0,21.578907,0.0
1,16.0,2,5910843000.0,21.578907,21.579187,0.0,21.552968,0.0
2,16.0,2,5913539000.0,21.552968,21.350226,0.0,0.0,0.0
3,16.0,2,5916178000.0,0.0,0.0,21.580708,21.548334,0.0
4,16.0,2,5918829000.0,21.548334,21.714991,21.580708,0.0,0.0


In [5]:
df_browse.head()

Unnamed: 0,day,month,week,浏览子行为编号,浏览行为数据,用户标识
0,15,10,5,1,173,34801
1,15,10,5,4,164,34801
2,15,10,5,7,38,34801
3,15,10,5,1,45,34801
4,15,10,5,7,110,34801


In [6]:
df_overdue.head()

Unnamed: 0,用户标识,样本标签
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0


## 处理浏览记录
### 合并month-day

In [7]:
# 合并month和day为日期
df_browse['日期'] = df_browse['month'].apply(str) + '-' + df_browse['day'].apply(str)
df_browse.drop(columns=['month','day'],inplace=True)

# 修改week为星期几
df_browse.rename(columns={'week':'星期几'}, inplace = True)

### 合并浏览记录的类型
- 通用方法
  1. 抽取所有小于一定阈值的（如比例占1%以下），放入lst1；其余的放入lst2
  2. 每次从lst1中抽取一个，然后  
    2.1 以0.7的概率从lst1中抽取另一个，进行合并；  
    2.2 以0.3的概率从lst2中抽取一个，同时再以0.5的概率从lst1中抽取一个
- 处理细节  
  1. 浏览子行为记录，合并即可  
  2. 浏览数据记录，按不同阈值合并（具体实施中，是合并一次作为一个记录，再合并1次作为第二个记录），分别作为两个字段

In [8]:
# 合并记录
def dimReduce(lst, pMerge):
    temp = len(lst) * pMerge
    dct = dict(Counter(lst))
    lst1 = []
    lst2 = []
    for k,v in dct.items():
        if v < temp:
            lst1.append(k)
        else:
            lst2.append(k)
    np.random.shuffle(lst1)
    np.random.shuffle(lst2)
            
    newdict = {}
    i,j = 0, 0
    tempMerge = []
    newKey = 0
    while True:
        tempMerge = []
        if i < len(lst1):
            tempMerge.append(lst1[i])
            i += 1
            if np.random.random() < 0.7:
                if i < len(lst1):
                    tempMerge.append(lst1[i])
                    i += 1
            else:
                if j < len(lst2):
                    tempMerge.append(lst2[j])
                    j += 1
                if np.random.random() < 0.5:
                    if i < len(lst1):
                        tempMerge.append(lst1[i])
                        i += 1
        elif j < len(lst2):
            tempMerge.append(lst2[j])
            j += 1
        else:
            break
        # 将tempMerge中的所有key，合并为一个key
        for kk in tempMerge:
            newdict[kk] = newKey
        newKey += 1
    return newdict


In [9]:
# 合并：浏览子行为编号
idmap = dimReduce(df_browse['浏览子行为编号'].tolist(), 0.08)
df_browse['浏览子行为编号'] = df_browse['浏览子行为编号'].map(idmap)

# 然后是：浏览行为数据
# 第一次合并，重点合并小的
aa = 1/len(set(df_browse['浏览行为数据'].tolist()))
idmap = dimReduce(df_browse['浏览行为数据'].tolist(), aa)
df_browse['浏览行为数据'] = df_browse['浏览行为数据'].map(idmap)
df_browse.rename(columns={'浏览行为数据':'子类型1'}, inplace = True)
# 第二次合并，阈值取得较大，等于说完全是随机合并
df_browse['子类型2'] = df_browse['子类型1']
idmap = dimReduce(df_browse['子类型2'].tolist(), 0.5)
df_browse['子类型2'] = df_browse['子类型2'].map(idmap)



## 脱敏id
- 注意
  1. 所有表格都需要统一更新

In [10]:
'''
  脱敏id类型数据
'''
def desID(df, feat, IDmap):
    df[feat] = df[feat].map(IDmap)
    return df

def genMap(df, feat):
    oldList = list(set(df[feat].tolist()))
    newList = list(range(len(oldList)))
    np.random.shuffle(newList)
    return dict(zip(oldList,newList))
    

In [11]:
idname = '用户标识'
# 生成idmap
idmap = genMap(df_info, idname)
df_idmap = pd.DataFrame(idmap,index=[0]).T
df_idmap.columns = ['newid']
df_idmap['oldid'] = df_idmap.index
df_idmap.to_csv('idmap.csv',index=None)

# 更新id
df_info = desID(df_info, idname, idmap)
df_bank = desID(df_bank, idname, idmap)
df_bill = desID(df_bill, idname, idmap)
df_browse = desID(df_browse, idname, idmap)
df_overdue = desID(df_overdue, idname, idmap)


## 脱敏普通类型字段
与脱敏id类似，只是不需要那么严格。

In [12]:
# 定义函数，可一次处理一个df中的所有类型字段
def desEnumer(df, featList):
    for feat in featList:
        # 先生成类型的map
        thisMap = genMap(df,feat)
        # 然后再做替换
        df[feat] = df[feat].map(thisMap)
    return df    


In [14]:
# info
featList = ['用户性别','用户职业','用户教育程度','用户婚姻状态','用户户口类型']
df_info = desEnumer(df_info, featList)

# bill
featList = ['银行标识','还款状态']
df_bill = desEnumer(df_bill, featList)


## 脱敏时间字段
1. 三分之一记录减去x (x取值2106359207)
2. 三分之一记录减去y (y取值2106359203)
3. 三分之一记录减去z (z取值2106359201)
3. 所有小于0的，全部归0

In [15]:
def desTime(df,feat):
    x = 2106359207
    y = 2106359203
    z = 2106359201
    x1 = int(df.shape[0]/3)
    y1 = x1
    z1 = df.shape[0]-x1-y1
    timeChange = [x] * x1 + [y] * y1 + [z] * z1
    np.random.shuffle(timeChange)
    df[feat] = df[feat] - timeChange
    # 还需要把小于0的，全部都归0
    df[feat][df[feat]<0] = 0
    return df

In [16]:
# bank
feat = '流水时间'
df_bank = desTime(df_bank,feat)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [17]:
df_bank.head()

Unnamed: 0,用户标识,流水时间,交易类型,交易金额,工资收入标记
0,7870,3787957180,0,13.756664,0
1,7870,3787962187,1,13.756664,0
2,7870,3791194357,0,14.44981,0
3,7870,3791204256,1,10.527763,0
4,7870,3791205391,1,13.651303,0


In [18]:
# bill
feat = '账单时间戳'
df_bill = desTime(df_bill,feat)
df_bill[feat] = df_bill[feat].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [19]:
df_bill.head()

Unnamed: 0,银行标识,用户标识,账单时间戳,上期账单金额,上期还款金额,信用卡额度,本期账单余额,还款状态
0,11,48586,3801891346,21.580308,21.600903,0.0,21.578907,0
1,11,48586,3804483930,21.578907,21.579187,0.0,21.552968,0
2,11,48586,3807179413,21.552968,21.350226,0.0,0.0,0
3,11,48586,3809819139,0.0,0.0,21.580708,21.548334,0
4,11,48586,3812469893,21.548334,21.714991,21.580708,0.0,0


## 脱敏金额数据
- 分析  
  当前金额的范围都在15~22之间，需要扩大至百的规模。
  需要考虑很多金额是负数
- 方法
  x = x ** 3 / 10

In [20]:
def desMoney(df,featList):
    for feat in featList:
        df[feat] = df[feat]**3 / 10
    return df    

In [21]:
# bank
featList =['交易金额']
df_bank = desMoney(df_bank,featList)

# bill
featList = ['上期账单金额','上期还款金额','信用卡额度','本期账单余额']
df_bill = desMoney(df_bill,featList)

## 查看一下结果

In [22]:
df_info.head()

Unnamed: 0,用户标识,用户性别,用户职业,用户教育程度,用户婚姻状态,用户户口类型
0,53931,2,0,4,2,1
1,48586,2,0,4,1,2
2,41595,2,2,3,2,3
3,34254,2,2,3,5,4
4,88346,2,0,2,5,2


In [23]:
df_bank.head()

Unnamed: 0,用户标识,流水时间,交易类型,交易金额,工资收入标记
0,7870,3787957180,0,260.339094,0
1,7870,3787962187,1,260.339094,0
2,7870,3791194357,0,301.707711,0
3,7870,3791204256,1,116.683191,0
4,7870,3791205391,1,254.403053,0


In [24]:
df_bill.head()

Unnamed: 0,银行标识,用户标识,账单时间戳,上期账单金额,上期还款金额,信用卡额度,本期账单余额,还款状态
0,11,48586,3801891346,1005.015862,1007.895996,0.0,1004.820137,0
1,11,48586,3804483930,1004.820137,1004.859252,0.0,1001.200948,0
2,11,48586,3807179413,1001.200948,973.212001,0.0,0.0,0
3,11,48586,3809819139,0.0,0.0,1005.071748,1000.555297,0
4,11,48586,3812469893,1000.555297,1023.950546,1005.071748,0.0,0


In [25]:
df_browse.head()

Unnamed: 0,星期几,浏览子行为编号,子类型1,用户标识,日期,子类型2
0,5,6,82,458,10-15,25
1,5,1,30,458,10-15,33
2,5,4,97,458,10-15,26
3,5,6,24,458,10-15,14
4,5,4,50,458,10-15,37


In [26]:
df_overdue.head()

Unnamed: 0,用户标识,样本标签
0,53931,0
1,48586,0
2,41595,0
3,34254,1
4,88346,0


## 按用户标识排序

In [27]:
df_info.sort_values("用户标识",inplace=True)
df_bank.sort_values("用户标识",inplace=True)
df_bill.sort_values("用户标识",inplace=True)
df_browse.sort_values("用户标识",inplace=True)
df_overdue.sort_values("用户标识",inplace=True)

## 输出数据

In [28]:
df_info.to_csv("../data/dataV1/基本信息.csv",index=None)
df_bank.to_csv("../data/dataV1/银行流水.csv",index=None)
df_bill.to_csv("../data/dataV1/信用卡账单.csv",index=None)
df_browse.to_csv("../data/dataV1/浏览记录.csv",index=None)
df_overdue.to_csv("../data/dataV1/label.csv",index=None)
