In [1]:
# 导入相关库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# 查看object类型数据元素种类
def uniqueValueOfObject(data):
    dic = {}
    objects = data.columns[data.dtypes=='object']
    max_len = 0
    for obj in objects:
        dic[obj] = pd.Series(data[obj].unique())
        max_len = max(max_len, len(dic[obj]))
    
    return pd.DataFrame(dic).fillna('-')

In [3]:
def drop_features(data, del_features):
    for feature in del_features:
        data.drop(feature, inplace=True)

In [4]:
data = pd.read_csv('./data.csv', encoding='gbk',infer_datetime_format=True)
data.head()

Unnamed: 0.1,Unnamed: 0,custid,trade_no,bank_card_no,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,5,2791858,20180507115231274000000023057383,卡号1,0.01,0.99,0,0.9,0.55,0.313,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,10,534047,20180507121002192000000023073000,卡号1,0.02,0.94,2000,1.28,1.0,0.458,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,12,2849787,20180507125159718000000023114911,卡号1,0.04,0.96,0,1.0,1.0,0.114,...,1600.0,1250.0,4200.0,87.0,1.0,1.0,4200.0,4200.0,2.0,6.0
3,13,1809708,20180507121358683000000388283484,卡号1,0.0,0.96,2000,0.13,0.57,0.777,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
4,14,2499829,20180507115448545000000388205844,卡号1,0.01,0.99,0,0.46,1.0,0.175,...,2300.0,1630.0,8300.0,79.0,2.0,2.0,8400.0,8250.0,22.0,120.0


In [5]:
del_features = ['Unnamed: 0','custid','trade_no','id_name']
data.drop(columns = del_features, inplace=True)
data.head()

Unnamed: 0,bank_card_no,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,卡号1,0.01,0.99,0,0.9,0.55,0.313,17.0,27.0,26.0,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,卡号1,0.02,0.94,2000,1.28,1.0,0.458,19.0,30.0,14.0,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,卡号1,0.04,0.96,0,1.0,1.0,0.114,13.0,68.0,22.0,...,1600.0,1250.0,4200.0,87.0,1.0,1.0,4200.0,4200.0,2.0,6.0
3,卡号1,0.0,0.96,2000,0.13,0.57,0.777,22.0,14.0,6.0,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
4,卡号1,0.01,0.99,0,0.46,1.0,0.175,13.0,66.0,42.0,...,2300.0,1630.0,8300.0,79.0,2.0,2.0,8400.0,8250.0,22.0,120.0


In [6]:
# 将日期数据转换为datetime 类型
data['latest_query_time'] = pd.to_datetime(data['latest_query_time'])
data['loans_latest_time'] = pd.to_datetime(data['loans_latest_time'])
# 时间过两天在处理
del_features = ['latest_query_time', 'loans_latest_time']
data.drop(columns=del_features, inplace=True)

In [7]:
uniqueValueOfObject(data)

Unnamed: 0,bank_card_no,reg_preference_for_trad,source
0,卡号1,一线城市,xs
1,-,三线城市,-
2,-,境外,-
3,-,二线城市,-
4,-,其他城市,-
5,-,-,-


In [8]:
# 从以上结果可以看出, bank_card_no 都是卡号1， source都是 xs， 可以去除掉
del_features = ['bank_card_no', 'source']
data.drop(columns=del_features, inplace=True)
data.head()

Unnamed: 0,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,regional_mobility,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
0,0.01,0.99,0,0.9,0.55,0.313,17.0,27.0,26.0,3.0,...,2900.0,1688.0,1200.0,75.0,1.0,2.0,1200.0,1200.0,12.0,18.0
1,0.02,0.94,2000,1.28,1.0,0.458,19.0,30.0,14.0,4.0,...,3500.0,1758.0,15100.0,80.0,5.0,6.0,22800.0,9360.0,4.0,2.0
2,0.04,0.96,0,1.0,1.0,0.114,13.0,68.0,22.0,1.0,...,1600.0,1250.0,4200.0,87.0,1.0,1.0,4200.0,4200.0,2.0,6.0
3,0.0,0.96,2000,0.13,0.57,0.777,22.0,14.0,6.0,3.0,...,3200.0,1541.0,16300.0,80.0,5.0,5.0,30000.0,12180.0,2.0,4.0
4,0.01,0.99,0,0.46,1.0,0.175,13.0,66.0,42.0,1.0,...,2300.0,1630.0,8300.0,79.0,2.0,2.0,8400.0,8250.0,22.0,120.0


In [9]:
# 检查每一列缺失数据的数量
na_numbers_of_columns = data.isnull().sum(axis=0).sort_values(ascending=False)
na_numbers_of_columns[na_numbers_of_columns>0]

student_feature                            2998
cross_consume_count_last_1_month            426
latest_one_month_apply                      304
query_finance_count                         304
latest_six_month_apply                      304
latest_three_month_apply                    304
query_cash_count                            304
query_sum_count                             304
query_org_count                             304
apply_credibility                           304
apply_score                                 304
latest_query_day                            304
latest_one_month_loan                       297
loans_score                                 297
loans_credibility_behavior                  297
loans_count                                 297
loans_settle_count                          297
loans_overdue_count                         297
loans_org_count_behavior                    297
consfin_org_count_behavior                  297
loans_cash_count                        

In [10]:
# 查看student features 数据, 只包含两种类型 NA 和 1, 推断NA为非学生标志
data['student_feature'] = data['student_feature'].fillna(0)

In [11]:
na_numbers_of_index = data.isnull().sum(axis=1)
na_numbers_of_index.value_counts().sort_values(ascending =False)

0     3983
1      385
39     249
2       37
40      27
10      22
30      12
11       8
3        7
4        7
29       6
31       3
5        3
41       1
42       1
14       1
6        1
12       1
dtype: int64

In [12]:
# 删除包含na数据的行
data.dropna(how='any', inplace=True, axis='index')

In [13]:
data.isnull().any().sum().sum()

0

In [14]:
data.describe()

Unnamed: 0,low_volume_percent,middle_volume_percent,take_amount_in_later_12_month_highest,trans_amount_increase_rate_lately,trans_activity_month,trans_activity_day,transd_mcc,trans_days_interval_filter,trans_days_interval,regional_mobility,...,loans_max_limit,loans_avg_limit,consfin_credit_limit,consfin_credibility,consfin_org_count_current,consfin_product_count,consfin_max_limit,consfin_avg_limit,latest_query_day,loans_latest_day
count,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,...,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0,3983.0
mean,0.019982,0.904783,2019.093648,4.378511,0.806465,0.388897,18.207381,26.118755,19.93171,2.71052,...,3481.621893,1842.186292,9325.05649,76.685413,4.95305,5.473763,16614.838062,8082.725082,23.431835,52.370073
std,0.026783,0.132394,4047.801495,78.10894,0.194997,0.163907,4.11752,17.958326,14.065691,0.879938,...,1419.273829,537.812515,7282.72002,12.806453,2.963372,3.401296,14480.490287,5601.708872,37.416001,51.011847
min,0.0,0.05,0.0,0.0,0.23,0.051,5.0,5.0,4.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,-2.0
25%,0.01,0.88,0.0,0.61,0.67,0.263,16.0,15.0,11.0,2.0,...,2500.0,1566.0,5000.0,77.0,3.0,3.0,7800.0,4800.0,5.0,9.0
50%,0.01,0.96,600.0,0.96,0.86,0.375,18.0,22.0,16.0,3.0,...,3300.0,1820.0,7800.0,79.0,5.0,5.0,14400.0,7100.0,14.0,33.0
75%,0.02,0.99,2000.0,1.58,1.0,0.497,20.0,31.0,24.0,3.0,...,4400.0,2100.0,11800.0,80.0,7.0,8.0,20400.0,9937.5,23.0,87.0
max,0.34,1.0,68000.0,4677.98,1.0,0.941,42.0,229.0,151.0,5.0,...,10000.0,6900.0,87100.0,87.0,18.0,20.0,266400.0,82800.0,360.0,323.0


In [15]:
# 是否进行编码
encodingFlag = True
if encodingFlag:
    for col in data.columns[data.dtypes == 'object']:
        le = LabelEncoder()
        le.fit(data[col])
        data[col] = le.transform(data[col])
    data.head()

In [16]:
columns = list(set(data.columns)-set(['status']))
y = data['status']
X = data[columns]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

## 特征选择

In [17]:
# 利用随机森林选择特征
from sklearn.ensemble import RandomForestClassifier
feat_lables = X.columns
forest = RandomForestClassifier(n_estimators=10000, random_state=0,n_jobs=1)
forest.fit(X, y)
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]

for i in range(X.shape[1]):
    print("%2d. %-*s %f"%(i+1, 30, feat_lables[i], importance[imp_result[i]]))

 1. loans_max_limit                0.051434
 2. loans_credibility_behavior     0.050106
 3. trans_days_interval            0.037573
 4. middle_volume_percent          0.031237
 5. consume_mini_time_last_1_month 0.026536
 6. query_sum_count                0.024709
 7. latest_three_month_loan        0.018023
 8. latest_six_month_loan          0.016932
 9. transd_mcc                     0.016663
10. avg_price_last_12_month        0.016304
11. trans_days_interval_filter     0.016059
12. rank_trad_1_month              0.015602
13. take_amount_in_later_12_month_highest 0.015145
14. latest_query_day               0.014920
15. avg_consume_less_12_valid_month 0.014848
16. cross_consume_count_last_1_month 0.014662
17. loans_score                    0.014615
18. loans_product_count            0.014491
19. consume_top_time_last_6_month  0.014348
20. latest_six_month_apply         0.014322
21. railway_consume_count_last_12_month 0.014279
22. latest_one_month_fail          0.014187
23. regional_mobi

In [18]:
# IV值计算
def CalcIV(Xvar,Yvar):
    N_0=np.sum(Yvar==0)
    N_1=np.sum(Yvar==1)
    N_0_group=np.zeros(np.unique(Xvar).shape)
    
    N_1_group=np.zeros(np.unique(Xvar).shape)
    for i in range(len(np.unique(Xvar))):
        N_0_group[i] = Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==0)].count()
        N_1_group[i] = Yvar[(Xvar==np.unique(Xvar)[i])&(Yvar==1)].count()
    iv = np.sum((N_0_group/N_0-N_1_group/N_1)*np.log((N_0_group/N_0)/(N_1_group/N_1)))
    if iv>=1.0:## 处理极端值
        iv=1
    return iv

def caliv_batch(df,Yvar):
    ivlist=[]
    for col in df.columns:
        iv=CalcIV(df[col],Yvar)
        ivlist.append(iv)
    names=list(df.columns)
    iv_df=pd.DataFrame({'Var':names,'Iv':ivlist},columns=['Var','Iv'])

    return iv_df,ivlist
im_iv, ivl = caliv_batch(X,y)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [19]:
print(im_iv)

                                           Var        Iv
0                              loans_max_limit  1.000000
1                   loans_credibility_behavior  1.000000
2                          trans_days_interval  1.000000
3                        middle_volume_percent  1.000000
4               consume_mini_time_last_1_month  1.000000
5                              query_sum_count  1.000000
6                      latest_three_month_loan  1.000000
7                        latest_six_month_loan  1.000000
8                                   transd_mcc  1.000000
9                      avg_price_last_12_month  1.000000
10                  trans_days_interval_filter  1.000000
11                           rank_trad_1_month  0.142581
12       take_amount_in_later_12_month_highest  1.000000
13                            latest_query_day  1.000000
14             avg_consume_less_12_valid_month  1.000000
15            cross_consume_count_last_1_month  1.000000
16                             

In [20]:
print(ivl)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.1425813087542006, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.013511596645623724, 1, 0.0007522228798560241, 1, 1, 1, 1, 0.0866649912415553, 1, 0.03316241433020335, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.00042338549749841646, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0.004408041787028861, 1, 1, 1]


## 模型构建

In [21]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train, y_train)
result = lr.predict(X_test)
lr.score(X_test, y_test)



0.7690376569037657

In [22]:
from sklearn import svm
 
clf = svm.SVC()  # class 
clf.fit(X_train, y_train)  # training the svc model
result = clf.predict(X_test) # predict the target of testing samples 
clf.score(X_test, y_test)



0.7698744769874477

In [23]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=4)
#拟合模型
clf.fit(X_train, y_train)
result = clf.predict(X_test) # predict the target of testing samples 
clf.score(X_test, y_test)

0.7364016736401674

In [24]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(oob_score=True, random_state=10)
rfc.fit(X_train,y_train)
rfc.score(X_test, y_test)

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


0.7748953974895397

In [25]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

params = {
    'task':'train',
    'boosting_type':'gbdt',
    'objective':'binary',
    'metric':{'auc'},
    'verbose':0,
    'learning_rate':0.01,
    'is_unbalance':True,
    'num_leaves' : 30,
    'reg_alpha' : 0,
    'reg_lambda' : 0,
    'max_depth' : -1,
    'n_estimators' : 1500,
    'colsample_bytree':0.7,
    'subsample':0.95,
    'subsample_freq':1,
    'random_state':201907
}
train_data = lgb.Dataset(X_train, label = y_train)
val_data = lgb.Dataset(X_test, label = y_test)
    
model = lgb.train(params,
              train_data,
              num_boost_round=1000,
              valid_sets=[train_data,val_data],
              early_stopping_rounds=50,
              verbose_eval = 100
             )
pred_y_test = model.predict(X_test,num_iteration=model.best_iteration)
print("roc_auc_score:  ", roc_auc_score(y_test,pred_y_test))



Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.943455	valid_1's auc: 0.762423
[200]	training's auc: 0.965251	valid_1's auc: 0.77034
[300]	training's auc: 0.981241	valid_1's auc: 0.772723
[400]	training's auc: 0.99107	valid_1's auc: 0.773458
Early stopping, best iteration is:
[356]	training's auc: 0.987302	valid_1's auc: 0.774178
roc_auc_score:   0.7741778656126482
