In [1]:
# 0，task1 and task2:
import pandas as pd
import numpy as np
import seaborn
from sklearn.preprocessing import Imputer
from sklearn.model_selection  import train_test_split


data = pd.read_csv('../DataSets/001data.csv', encoding='gbk')

columns_dropna = ['bank_card_no', 'reg_preference_for_trad', 'id_name','first_transaction_time','trade_no',
                 'latest_query_time','loans_latest_time']  # 对于这些列实行删除NA样本的操作
temp_columns = list(data.columns.values)
columns_imputer = []                                       # 对于剩余的列实行填充0值的操作
for i in range(len(temp_columns)):
    if temp_columns[i] not in columns_dropna:
        columns_imputer.append(temp_columns[i])
print(len(columns_imputer), len(columns_dropna))

print(data.shape)
data.dropna(axis=0, how='any', subset=columns_dropna, inplace=True)         # 删除缺省值样本
data = data.replace(np.NaN, 0)                                              # 缺省值填充0

column_headers = list(data.columns.values)
print(column_headers.index('status'), len(column_headers))


# 删掉id数据：
columns_drop = ['bank_card_no','source','id_name','custid','student_feature','trade_no']
data.drop(columns_drop, axis=1, inplace=True)
# 日期数据格式化：
data['latest_query_time'] = pd.to_datetime(data['latest_query_time'])
data['loans_latest_time'] = pd.to_datetime(data['loans_latest_time'])
# 将城市特征数值化：
map_dic = {'一线城市':'1','二线城市':'2','三线城市':'3','其他城市':'4','境外':'0'}
data['reg_preference_for_trad'] = data['reg_preference_for_trad'].map(map_dic)
# 将时间戳数据列丢掉：
data.drop(['latest_query_time','loans_latest_time'], axis=1, inplace=True)
# # 丢掉方差为零的列：
# data.drop(data.columns[data.std()==0], axis=1, inplace=True)
data.drop(['Unnamed: 0'], axis=1, inplace=True)
#print(data.columns.values)
print(set(data['reg_preference_for_trad']))

# 统计缺失值占比：
data_missing = (data.isnull().sum()/len(data))*100    # np.isnan(data):可将他替换成 data.isnull()
data_missing = data_missing.drop(data_missing[data_missing==0].index).sort_values(ascending=False)
miss_data = pd.DataFrame({'缺失百分比':data_missing})



from sklearn.ensemble import RandomForestClassifier   # 使用随机会森林进行特征选择
x_columns = []  # X的列属性(x变量)
for i in range(len(column_headers)):
    if column_headers[i] not in ['Unnamed: 0','reg_preference_for_trad','latest_query_time','loans_latest_time','status','bank_card_no','source','id_name','unnameid','custid','student_feature','trade_no']:
        x_columns.append(column_headers[i])  
        
X = data[x_columns]  # 获取x变量
Y = data['status']   # 获取y标签(label)

clf = RandomForestClassifier()
clf.fit(X, Y)

importance = clf.feature_importances_
indices = np.argsort(importance)[::-1]
features = X.columns
for f in range(X.shape[1]):
    print(("%2d) %-*s %f" % (f + 1, 30, features[f], importance[indices[f]])))

83 7
(4754, 90)
44 90
{'2', '4', '3', '0', '1'}
 1) low_volume_percent             0.050628
 2) middle_volume_percent          0.049992
 3) take_amount_in_later_12_month_highest 0.033458
 4) trans_amount_increase_rate_lately 0.031082
 5) trans_activity_month           0.029079
 6) trans_activity_day             0.024676
 7) transd_mcc                     0.021055
 8) trans_days_interval_filter     0.018402
 9) trans_days_interval            0.018395
10) regional_mobility              0.017821
11) repayment_capability           0.017615
12) is_high_user                   0.016818
13) number_of_trans_from_2011      0.016479
14) first_transaction_time         0.016430
15) historical_trans_amount        0.016297
16) historical_trans_day           0.015933
17) rank_trad_1_month              0.015679
18) trans_amount_3_month           0.014631
19) avg_consume_less_12_valid_month 0.014053
20) abs                            0.013458
21) top_trans_count_last_1_month   0.012876
22) avg_price_las



In [2]:
# 特征工程：
from sklearn.feature_selection import VarianceThreshold
X_fsvar = VarianceThreshold().fit_transform(X) # 过滤掉方差为零的列，并实例化，不填写参数默认方差为0
nan_rate = pd.DataFrame((data.shape[0]-data.count())/data.shape[0])  #查看缺失值比例
nan_rate

# 特征工程：互信息法,它返回每个特征与目标之间的互信息量的估计，这个估计在[0,1]之间取值，0表示独立，1表示两个变量完全相关
from sklearn.feature_selection import mutual_info_classif as MIC
result = MIC(X,Y)
print((result == 0).sum())
delete = []
for i in range(79):
    if result[i] == 0:
        delete.append(i)
X_ = X.drop(X.iloc[:,delete],axis=1)
X_.shape

32


(4429, 47)

In [3]:
# SMOTE:样本不均衡
print(Y[Y==1].count()/Y[Y==0].count())

import imblearn
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=420)
X,Y = sm.fit_sample(X_,Y)

n_sample_ = X.shape[0]

pd.Series(Y).value_counts()

n_1_sample = pd.Series(Y).value_counts()[1]
n_0_sample = pd.Series(Y).value_counts()[0]
print('样本个数：{}; 1占{:.2%}; 0占{:.2%}'.format(n_sample_,n_1_sample/n_sample_,n_0_sample/n_sample_))

from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

0.3360482654600302
样本个数：6630; 1占50.00%; 0占50.00%


In [4]:
# 逻辑回归模型：
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, random_state=2018)

from sklearn.linear_model import LogisticRegression as LR
lr = LR().fit(x_train, y_train)
#lr.score(Xtest,Ytest)
from sklearn import metrics
y_test_pred = lr.predict(x_test)
acc = metrics.accuracy_score(y_test,y_test_pred)
precision = metrics.precision_score(y_test,y_test_pred)
recall = metrics.recall_score(y_test,y_test_pred)
f1 = metrics.f1_score(y_test,y_test_pred)
auc= metrics.roc_auc_score(y_test,y_test_pred)
print('准确率:{:.4f},精确率:{:.4f},召回率:{:.4f},f1-score:{:.4f},auc:{:.4f}'.format(acc,precision,recall,f1,auc))

准确率:0.7677,精确率:0.7753,召回率:0.7550,f1-score:0.7650,auc:0.7677




In [5]:
# SVM:
from sklearn.svm import SVC
clf = SVC(kernel='rbf',gamma='auto',cache_size=5000).fit(x_train, y_train)
clf.score(x_test, y_test)

from sklearn import metrics
y_test_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test,y_test_pred)
precision = metrics.precision_score(y_test,y_test_pred)
recall = metrics.recall_score(y_test,y_test_pred)
f1 = metrics.f1_score(y_test,y_test_pred)
auc= metrics.roc_auc_score(y_test,y_test_pred)
print('准确率:{:.4f},精确率:{:.4f},召回率:{:.4f},f1-score:{:.4f},auc:{:.4f}'.format(acc,precision,recall,f1,auc))




准确率:0.8074,精确率:0.8032,召回率:0.8153,f1-score:0.8092,auc:0.8074


In [6]:
# 随机森林：
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10)
rfc.fit(x_train,y_train)
rfc.score(x_test,y_test)

from sklearn import metrics
y_test_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test,y_test_pred)
precision = metrics.precision_score(y_test,y_test_pred)
recall = metrics.recall_score(y_test,y_test_pred)
f1 = metrics.f1_score(y_test,y_test_pred)
auc= metrics.roc_auc_score(y_test,y_test_pred)
print('准确率:{:.4f},精确率:{:.4f},召回率:{:.4f},f1-score:{:.4f},auc:{:.4f}'.format(acc,precision,recall,f1,auc))

准确率:0.8074,精确率:0.8032,召回率:0.8153,f1-score:0.8092,auc:0.8074


In [7]:
# XGBoost:
from xgboost.sklearn import XGBClassifier as XGBC
clf = XGBC().fit(x_train,y_train)
clf.score(x_test,y_test)

from sklearn import metrics
y_test_pred = clf.predict(x_test)
acc = metrics.accuracy_score(y_test,y_test_pred)
precision = metrics.precision_score(y_test,y_test_pred)
recall = metrics.recall_score(y_test,y_test_pred)
f1 = metrics.f1_score(y_test,y_test_pred)
auc= metrics.roc_auc_score(y_test,y_test_pred)
print('准确率:{:.4f},精确率:{:.4f},召回率:{:.4f},f1-score:{:.4f},auc:{:.4f}'.format(acc,precision,recall,f1,auc))

准确率:0.8532,精确率:0.8777,召回率:0.8213,f1-score:0.8485,auc:0.8532
