# 读入函数

In [None]:
import os
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.externals import joblib

import warnings
warnings.filterwarnings('ignore')

# K-means模型

In [None]:
def kmeans_train(data,to_delete=None):
    '''
    Train a 2-cluster K-means model and return one anomaly score per row.

    The retained features are min-max scaled before clustering. The anomaly
    score of a row is its distance (in the scaled feature space) to the centre
    of the cluster it was assigned to, so rows far from their own cluster
    centre receive a high score.

    input:
    data: dataframe, raw feature data; its index is User_Id and its columns are feature names
    to_delete: list, features to leave out when training; nothing is removed by default
    return:
    cluster: the fitted KMeans model object
    score: series named 'score', indexed like data, holding the anomaly scores
    '''
    # Drop the excluded features
    if to_delete is None:
        to_delete = []
    feature_remain = [col for col in data.columns.tolist() if col not in to_delete]
    # Min-max normalisation; the result is wrapped back into a DataFrame so the
    # original index and column labels are kept
    data_select = data[feature_remain]
    select_index = data_select.index
    select_column = data_select.columns
    scaler = MinMaxScaler()
    data_std = scaler.fit_transform(data_select)
    data_std = pd.DataFrame(data_std,index=select_index,columns=select_column)
    # Model training (the KMeans keyword is n_clusters)
    cluster = KMeans(n_clusters=2,random_state=0)
    cluster.fit(data_std)
    label = cluster.labels_
    print('模型训练完成')
    # Anomaly score: transform() yields one distance column per cluster centre;
    # for every row keep the distance to the centre of its own cluster.
    # NOTE(review): the previous code kept only the rows labelled 1 and then tried
    # to re-attach the full index, which cannot work; "distance to the assigned
    # centre" is assumed to be the intended score - confirm with the model owner.
    distance = cluster.transform(data_std)
    distance = pd.DataFrame(distance,index=select_index,columns=['Distance0','Distance1'])
    score = distance['Distance0'].where(label == 0,distance['Distance1'])
    score.name = 'score'
    print('异常得分计算完成')
    return cluster,score

# iForest模型

In [None]:
def iforest_train(data,params,to_delete=None,feature_weights=None):
    '''
    Train an Isolation Forest model and return one anomaly score per row.

    input:
    data: dataframe, raw feature data; its index is User_Id and its columns are feature names
    params: dict, keyword arguments passed unchanged to IsolationForest
    to_delete: list, features to leave out when training; nothing is removed by default
    feature_weights: dict, key is a feature name and value is its weight (expected to be a
        positive integer: the number of times that feature's column is repeated in the
        training matrix); features not listed get weight 1. The dict is only read,
        never modified.
    return:
    clf: the fitted IsolationForest model object
    columns_use: list, the features actually fed to the model; names may repeat
        according to the weights
    score: series named 'score', indexed like data, holding the anomaly scores
    '''
    # Drop the excluded features
    if to_delete is None:
        to_delete = []
    feature_remain = [col for col in data.columns.tolist() if col not in to_delete]
    # Apply the feature weights by repeating column names. Missing weights are
    # looked up with a default of 1 instead of being written into the caller's dict.
    if feature_weights is None:
        feature_weights = {}
    columns_use = []
    for col in feature_remain:
        columns_use.extend([col]*feature_weights.get(col,1))
    # Model training
    train_data = data[columns_use]
    clf = IsolationForest(**params)
    clf.fit(train_data)
    print('模型训练完成')
    # Anomaly score: decision_function is lower for more abnormal rows, so
    # subtracting it from 0.5 makes a larger score mean "more abnormal".
    # NOTE(review): the value range of decision_function depends on the
    # scikit-learn version and the contamination setting - confirm that the
    # 0.5 offset is still appropriate for the version in use.
    score = pd.Series(0.5-clf.decision_function(train_data),index=data.index,name='score')
    print('异常得分计算完成')
    return clf,columns_use,score

# 模型训练

In [None]:
##################################main##################################
# NOTE(review): columns_cluster and columns_forest are not defined in the code
# visible here; presumably they are lists of column names that include
# 'User_Id' - confirm they are defined before this cell runs.
path = '/usr/local/workspace/'
os.chdir(path)
# Both score files are written below 'result/' (relative to the working
# directory set above); create it up front so to_csv cannot fail on a missing folder.
os.makedirs('result',exist_ok=True)

# Load the wide table; User_Id values are passed through str
data = pd.read_csv('/data1/sample_bigtable/bigtable_final.csv',converters={'User_Id':str})

# K-means model training
data_cluster = data[columns_cluster]
data_cluster = data_cluster.set_index('User_Id')
cluster,score_cluster = kmeans_train(data=data_cluster)
score_cluster.to_csv('result/score_cluster.csv',index=True,header=True)
print('K-means模型训练完成')

# iForest model training
data_forest = data[columns_forest]
data_forest = data_forest.set_index('User_Id')
params={'n_estimators':100,'max_samples':0.1,'bootstrap':False,'n_jobs':1,'random_state':0}
clf,columns_use,score_forest = iforest_train(data=data_forest,params=params)
score_forest.to_csv('result/score_forest.csv',index=True,header=True)
print('iForest模型训练完成')