|Field|Description
|-|-
|id|为贷款清单分配的唯一信用证标识
|loanAmnt|贷款金额
|term|贷款期限（年）
|interestRate|贷款利率
|installment|分期付款金额
|grade|贷款等级
|subGrade|贷款等级之子级
|employmentTitle|就业职称
|employmentLength|就业年限（年）
|homeOwnership|借款人在登记时提供的房屋所有权状况
|annualIncome|年收入
|verificationStatus|验证状态
|issueDate|贷款发放的月份
|purpose|借款人在贷款申请时的贷款用途类别
|postCode|借款人在贷款申请中提供的邮政编码的前3位数字
|regionCode|地区编码
|dti|债务收入比
|delinquency_2years|借款人过去2年信用档案中逾期30天以上的违约事件数
|ficoRangeLow|借款人在贷款发放时的fico所属的下限范围
|ficoRangeHigh|借款人在贷款发放时的fico所属的上限范围
|openAcc|借款人信用档案中未结信用额度的数量
|pubRec|贬损公共记录的数量
|pubRecBankruptcies|公开记录中的破产数量
|revolBal|信贷周转余额合计
|revolUtil|循环额度利用率，或借款人使用的相对于所有可用循环信贷的信贷金额
|totalAcc|借款人信用档案中当前的信用额度总数
|initialListStatus|贷款的初始列表状态
|applicationType|表明贷款是个人申请还是与两个共同借款人的联合申请
|earliesCreditLine|借款人最早报告的信用额度开立的月份
|title|借款人提供的贷款名称
|policyCode|策略代码：公开可用的产品 policy_code=1；不公开可用的新产品 policy_code=2
|n系列匿名特征|匿名特征n0-n14，为一些贷款人行为计数特征的处理
|isDefault|是否违约（目标变量 Y：1 表示违约，0 表示未违约）

In [1]:
import sqlite3
import pandas as pd

In [2]:
%time data = pd.read_csv("LoanRisk.csv").sample(10000)
print(data.shape)
%time X = data.drop(["id", "isDefault"], axis=1)
%time y = data["isDefault"]

badrate = pd.value_counts(y, normalize=True)[1]
print(pd.value_counts(y), badrate)

CPU times: user 9.32 s, sys: 1.09 s, total: 10.4 s
Wall time: 11.5 s
(10000, 47)
CPU times: user 9.65 ms, sys: 0 ns, total: 9.65 ms
Wall time: 24.3 ms
CPU times: user 442 µs, sys: 0 ns, total: 442 µs
Wall time: 477 µs
0    7985
1    2015
Name: isDefault, dtype: int64 0.2015


In [None]:
%time col = X.describe().columns
col

CPU times: user 267 ms, sys: 1.04 ms, total: 268 ms
Wall time: 296 ms


Index(['loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle',
       'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7',
       'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. Only the columns listed in `col` are
# used, and missing values are replaced with -1 before splitting.
X_trai, X_test, y_trai, y_test = train_test_split(
    X[col].fillna(-1),
    y,
    test_size=0.3,
)
print(X_trai.shape, X_test.shape)

(7000, 40) (3000, 40)


In [None]:

def result(_p, _y):
    """Print, for each distinct predicted label, its size, share and bad rate.

    Args:
        _p: Predicted labels or cluster ids, one per sample.
        _y: True targets aligned with ``_p``; a value of 1 counts as bad.

    Prints a table with one row per distinct value of ``_p``, ordered by
    that value, with the columns:
        p    -- the label value
        cnt  -- number of samples carrying that label
        pnt  -- ``cnt`` divided by the total number of samples
        rate -- fraction of the group's samples whose ``_y`` equals 1

    Returns:
        None. The table is only printed.
    """
    frame = pd.DataFrame({"p": _p, "y": _y})
    query = """
    SELECT
        p,
        COUNT(*) AS cnt,
        COUNT(*) * 1.0 / (SELECT COUNT(*) FROM _r) AS pnt,
        SUM(CASE WHEN y = 1 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) AS rate
    FROM _r
    GROUP BY p
    ORDER BY p
    """
    conn = sqlite3.connect(":memory:")
    try:
        frame.to_sql("_r", conn, index=False, if_exists="replace")
        print(pd.read_sql(query, conn))
    finally:
        # Release the in-memory database even if the load or query fails;
        # otherwise every call leaves an open connection behind.
        conn.close()


In [None]:
from sklearn import svm
model = svm.OneClassSVM(
    nu=0.1, 
    kernel="rbf", 
    gamma=0.1
)
%time model.fit(X_trai)

%time result(_p=model.predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.predict(X_test), _y=[_ for _ in y_test])


CPU times: user 7.16 s, sys: 1.68 ms, total: 7.16 s
Wall time: 7.39 s
   p   cnt  pnt      rate
0 -1  4200  0.6  0.197619
1  1  2800  0.4  0.207857
CPU times: user 5.45 s, sys: 1.5 ms, total: 5.45 s
Wall time: 5.8 s
   p   cnt  pnt   rate
0 -1  3000  1.0  0.201
CPU times: user 2.33 s, sys: 0 ns, total: 2.33 s
Wall time: 2.41 s


In [None]:
from sklearn.ensemble import IsolationForest
model = IsolationForest(
    n_estimators=400,
    contamination=badrate,
    bootstrap=True,
    n_jobs=-1
)
%time model.fit(X_trai)

%time result(_p=model.predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.predict(X_test), _y=[_ for _ in y_test])


  from numpy.core.umath_tests import inner1d
  pickler.file_handle.write(chunk.tostring('C'))


   p   cnt       pnt      rate
0 -1  1411  0.201571  0.226790
1  1  5589  0.798429  0.195384
CPU times: user 2.64 s, sys: 6.87 ms, total: 2.65 s
Wall time: 2.67 s
   p   cnt       pnt      rate
0 -1   643  0.214333  0.216174
1  1  2357  0.785667  0.196860
CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.27 s


In [None]:
from sklearn.covariance import EllipticEnvelope
model = EllipticEnvelope(
    contamination=badrate
)
%time model.fit(X_trai)

%time result(_p=model.predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.predict(X_test), _y=[_ for _ in y_test])


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


CPU times: user 4min 7s, sys: 6min 11s, total: 10min 18s
Wall time: 5min 18s
   p   cnt       pnt      rate
0 -1  1411  0.201571  0.195606
1  1  5589  0.798429  0.203256
CPU times: user 280 ms, sys: 446 ms, total: 725 ms
Wall time: 367 ms
   p   cnt       pnt      rate
0 -1   566  0.188667  0.155477
1  1  2434  0.811333  0.211586
CPU times: user 271 ms, sys: 447 ms, total: 717 ms
Wall time: 341 ms


In [None]:
from sklearn.cluster import AgglomerativeClustering
model = AgglomerativeClustering(linkage=['ward', 'average', 'complete'][0], n_clusters=5)
%time model.fit(X_trai)

%time result(_p=model.fit_predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.fit_predict(X_test), _y=[_ for _ in y_test])


CPU times: user 6.56 s, sys: 430 ms, total: 6.99 s
Wall time: 7.45 s
   p   cnt       pnt      rate
0  0  4718  0.674000  0.208139
1  1  1142  0.163143  0.167250
2  2   419  0.059857  0.171838
3  3   702  0.100286  0.235043
4  4    19  0.002714  0.105263
CPU times: user 6.56 s, sys: 417 ms, total: 6.98 s
Wall time: 7.26 s
   p   cnt       pnt      rate
0  0   408  0.136000  0.166667
1  1   484  0.161333  0.194215
2  2   547  0.182333  0.170018
3  3     2  0.000667  0.000000
4  4  1559  0.519667  0.223220
CPU times: user 951 ms, sys: 0 ns, total: 951 ms
Wall time: 989 ms


In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth
bandwidth = estimate_bandwidth(X_trai, quantile=0.2, n_samples=1000)
model = MeanShift(bandwidth=bandwidth, bin_seeding=True)

%time model.fit(X_trai)

%time result(_p=model.fit_predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.fit_predict(X_test), _y=[_ for _ in y_test])


CPU times: user 5.14 s, sys: 0 ns, total: 5.14 s
Wall time: 5.36 s
     p   cnt       pnt      rate
0    0  5135  0.733571  0.207595
1    1  1654  0.236286  0.188029
2    2   103  0.014714  0.165049
3    3    12  0.001714  0.083333
4    4    11  0.001571  0.090909
5    5    32  0.004571  0.250000
6    6     4  0.000571  0.000000
7    7     4  0.000571  0.000000
8    8     1  0.000143  0.000000
9    9     1  0.000143  0.000000
10  10     1  0.000143  0.000000
11  11     2  0.000286  0.000000
12  12     1  0.000143  0.000000
13  13     1  0.000143  0.000000
14  14     1  0.000143  0.000000
15  15     1  0.000143  0.000000
16  16     1  0.000143  0.000000
17  17     1  0.000143  1.000000
18  18     1  0.000143  0.000000
19  19     1  0.000143  1.000000
20  20     1  0.000143  0.000000
21  21    20  0.002857  0.200000
22  22     5  0.000714  0.200000
23  23     5  0.000714  0.200000
24  24     1  0.000143  0.000000
CPU times: user 5.15 s, sys: 0 ns, total: 5.15 s
Wall time: 5.41 s
     p  

In [None]:
from sklearn.cluster import DBSCAN
model = DBSCAN(eps=0.1, min_samples=100)

%time model.fit(X_trai)

%time result(_p=model.fit_predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.fit_predict(X_test), _y=[_ for _ in y_test])


CPU times: user 432 ms, sys: 0 ns, total: 432 ms
Wall time: 461 ms
   p   cnt  pnt      rate
0 -1  7000  1.0  0.201714
CPU times: user 432 ms, sys: 17.1 ms, total: 449 ms
Wall time: 464 ms
   p   cnt  pnt   rate
0 -1  3000  1.0  0.201
CPU times: user 156 ms, sys: 0 ns, total: 156 ms
Wall time: 159 ms


In [None]:
from sklearn.cluster import Birch
model = Birch(n_clusters=5)

%time model.fit(X_trai)

%time result(_p=model.fit_predict(X_trai), _y=[_ for _ in y_trai])
%time result(_p=model.fit_predict(X_test), _y=[_ for _ in y_test])


  node1_dist, node2_dist = dist[[farthest_idx]]


CPU times: user 7.37 s, sys: 1.45 s, total: 8.82 s
Wall time: 8.56 s
   p   cnt       pnt      rate
0  0  4718  0.674000  0.208139
1  1  1142  0.163143  0.167250
2  2   419  0.059857  0.171838
3  3   702  0.100286  0.235043
4  4    19  0.002714  0.105263
CPU times: user 7.77 s, sys: 1.73 s, total: 9.5 s
Wall time: 8.91 s


  node1_dist, node2_dist = dist[[farthest_idx]]


   p   cnt       pnt      rate
0  0   408  0.136000  0.166667
1  1   484  0.161333  0.194215
2  2   547  0.182333  0.170018
3  3     2  0.000667  0.000000
4  4  1559  0.519667  0.223220
CPU times: user 1.77 s, sys: 762 ms, total: 2.53 s
Wall time: 1.94 s


In [None]:
# Disabled experiment: 5-component Gaussian mixture with full covariance,
# reported through result() like the other models. Uncomment to run.
# from sklearn.mixture import GaussianMixture
# model = GaussianMixture(n_components=5, covariance_type='full')

# %time model.fit(X_trai)

# %time result(_p=model.fit_predict(X_trai), _y=[_ for _ in y_trai])
# %time result(_p=model.fit_predict(X_test), _y=[_ for _ in y_test])
