In [8]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skcredit.feature_discretization import SplitCat
from skcredit.feature_discretization import SplitNum
from skcredit.feature_discretization import DiscreteAuto
from skcredit.feature_discretization import DiscreteCust
from skcredit.feature_selection import SelectBins
from skcredit.feature_selection import SelectCIFE
from skcredit.linear_model import LMClassifier
from skcredit.linear_model import LMCreditcard
# Fix NumPy's global seed so any NumPy-driven randomness in later cells is repeatable.
np.random.seed(7)
# Lift pandas' display truncation so frames are shown in full.
# The fully qualified "display.*" keys are required: pandas resolves option
# names by regex, and since pandas 1.4 the short forms "max_rows" /
# "max_columns" also match "styler.render.max_rows" / "styler.render.max_columns",
# which makes set_option raise OptionError (pattern matched multiple keys).
pd.set_option("display.max_rows",    None)
pd.set_option("display.max_columns", None)

## 读入数据

[数据介绍](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients)

In [9]:
# Load the UCI credit-card CSV that sits one directory above the notebook.
dataset = pd.read_csv(filepath_or_buffer="../UCI_Credit_Card.csv")

In [10]:
# Sanity check: (rows, columns) of the loaded frame.
dataset.shape

(30000, 25)

In [11]:
# Preview the first rows to see the feature columns and the target column.
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


## 分类单特征分箱

### 默认参数

In [12]:
# Bin the categorical feature EDUCATION against the default flag using the
# splitter's default settings. Defaults, as listed by the notebook author:
#   min_bin_cnt_negative=75
#   min_bin_cnt_positive=75
#   min_information_value_split_gain=0.015
sc = SplitCat()
# Kept as the last expression so the fitted estimator's repr is displayed.
sc.fit(dataset["EDUCATION"], dataset["default.payment.next.month"])

SplitCat()

In [13]:
# Print the fitted bin table (bucket, positive/negative counts, WoE, IvS) as Markdown.
print(sc.table.to_markdown())

|    | Column    | Bucket          |   CntPositive |   CntNegative |        WoE |        IvS |
|---:|:----------|:----------------|--------------:|--------------:|-----------:|-----------:|
|  0 | EDUCATION | {0, 1, 4, 5, 6} |          2069 |          8984 | -0.209693  | 0.0152528  |
|  1 | EDUCATION | {2}             |          3330 |         10700 |  0.0914156 | 0.00400755 |
|  2 | EDUCATION | {3}             |          1237 |          3680 |  0.168463  | 0.00486862 |
|  3 | EDUCATION | {nan}           |             0 |             0 |  0         | 0          |


### 调整参数

In [14]:
# Refit the EDUCATION splitter with a much smaller minimum IV split gain
# (0.0001; the author notes the default as 0.015).
# NOTE(review): the table printed for this fit is identical to the
# default-parameter table, so the lower threshold changes nothing for this
# feature — confirm that is the intended demonstration.
sc = SplitCat(min_information_value_split_gain=0.0001)
sc.fit(dataset["EDUCATION"], dataset["default.payment.next.month"])

SplitCat()

In [15]:
# Print the bin table of the re-fitted EDUCATION splitter as Markdown.
print(sc.table.to_markdown())

|    | Column    | Bucket          |   CntPositive |   CntNegative |        WoE |        IvS |
|---:|:----------|:----------------|--------------:|--------------:|-----------:|-----------:|
|  0 | EDUCATION | {0, 1, 4, 5, 6} |          2069 |          8984 | -0.209693  | 0.0152528  |
|  1 | EDUCATION | {2}             |          3330 |         10700 |  0.0914156 | 0.00400755 |
|  2 | EDUCATION | {3}             |          1237 |          3680 |  0.168463  | 0.00486862 |
|  3 | EDUCATION | {nan}           |             0 |             0 |  0         | 0          |


## 连续单特征分箱

### 默认参数

In [16]:
# Bin the numeric feature LIMIT_BAL against the default flag using the
# splitter's default settings.
sn = SplitNum()
# Kept as the last expression so the fitted estimator's repr is displayed.
sn.fit(dataset["LIMIT_BAL"], dataset["default.payment.next.month"])

SplitNum()

In [17]:
# Print the fitted LIMIT_BAL bin table (interval buckets, counts, WoE, IvS) as Markdown.
print(sn.table.to_markdown())

|    | Column    | Bucket              |   CntPositive |   CntNegative |          WoE |         IvS |
|---:|:----------|:--------------------|--------------:|--------------:|-------------:|------------:|
|  0 | LIMIT_BAL | (-inf,10000.0]      |           197 |           296 |  0.851531    | 0.0144909   |
|  1 | LIMIT_BAL | (10000.0,40000.0]   |          1358 |          2460 |  0.664539    | 0.0660227   |
|  2 | LIMIT_BAL | (40000.0,70000.0]   |          1328 |          3593 |  0.263374    | 0.0122039   |
|  3 | LIMIT_BAL | (70000.0,120000.0]  |          1112 |          3468 |  0.121269    | 0.00232077  |
|  4 | LIMIT_BAL | (120000.0,140000.0] |           327 |          1151 |  0.000260766 | 3.35032e-09 |
|  5 | LIMIT_BAL | (140000.0,220000.0] |          1103 |          5184 | -0.288856    | 0.0160792   |
|  6 | LIMIT_BAL | (220000.0,240000.0] |           223 |          1133 | -0.366765    | 0.00546071  |
|  7 | LIMIT_BAL | (240000.0,260000.0] |           138 |           733 | -0.411205

### 调整参数

In [18]:
# Default parameters (as listed by the notebook author):
# min_bin_cnt_negative=75
# min_bin_cnt_positive=75
# min_information_value_split_gain=0.015

# Refit the LIMIT_BAL splitter with a much smaller minimum IV split gain than
# the default listed above.
# NOTE(review): the visible rows of the table printed for this fit match the
# default-parameter run — verify the lower threshold has any effect here.
sn = SplitNum(min_information_value_split_gain=0.0001)
sn.fit(dataset["LIMIT_BAL"], dataset["default.payment.next.month"])

SplitNum()

In [19]:
# Print the bin table of the re-fitted LIMIT_BAL splitter as Markdown.
print(sn.table.to_markdown())

|    | Column    | Bucket              |   CntPositive |   CntNegative |          WoE |         IvS |
|---:|:----------|:--------------------|--------------:|--------------:|-------------:|------------:|
|  0 | LIMIT_BAL | (-inf,10000.0]      |           197 |           296 |  0.851531    | 0.0144909   |
|  1 | LIMIT_BAL | (10000.0,40000.0]   |          1358 |          2460 |  0.664539    | 0.0660227   |
|  2 | LIMIT_BAL | (40000.0,70000.0]   |          1328 |          3593 |  0.263374    | 0.0122039   |
|  3 | LIMIT_BAL | (70000.0,120000.0]  |          1112 |          3468 |  0.121269    | 0.00232077  |
|  4 | LIMIT_BAL | (120000.0,140000.0] |           327 |          1151 |  0.000260766 | 3.35032e-09 |
|  5 | LIMIT_BAL | (140000.0,220000.0] |          1103 |          5184 | -0.288856    | 0.0160792   |
|  6 | LIMIT_BAL | (220000.0,240000.0] |           223 |          1133 | -0.366765    | 0.00546071  |
|  7 | LIMIT_BAL | (240000.0,260000.0] |           138 |           733 | -0.411205

## 数据整理

In [20]:
# Features handled by the categorical splitter.
cat_columns = [
    "SEX",
    "EDUCATION",
    "MARRIAGE",
]

# Features handled by the numeric splitter, grouped by column family.
# Order is preserved: LIMIT_BAL/AGE, PAY_*, PAY_AMT*, BILL_AMT*.
num_columns = (
    ["LIMIT_BAL", "AGE"]
    + ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
    + ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
    + ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]
)

# Name of the binary label column.
target = "default.payment.next.month"

In [21]:
# Hold out 25% of the rows for testing; the split is shuffled with a fixed
# seed so it is reproducible. Features are every column except the target.
train_x, test_x, train_y, test_y = train_test_split(
    dataset.drop(columns=[target]),
    dataset[target],
    train_size=0.75,
    shuffle=True,
    random_state=7,
)

## 手动多特征分箱

对每个特征手动调整参数后进行分箱，以分类特征 EDUCATION 和连续特征 LIMIT_BAL 为例。

In [22]:
# Fit the hand-tuned categorical splitter for EDUCATION on the training split only.
sc = SplitCat(min_information_value_split_gain=0.0001)
sc.fit(train_x["EDUCATION"], train_y)

SplitCat()

In [23]:
# Fit the hand-tuned numeric splitter for LIMIT_BAL on the training split only.
sn = SplitNum(min_information_value_split_gain=0.0001)
sn.fit(train_x["LIMIT_BAL"], train_y)

SplitNum()

In [24]:
# Combine the hand-tuned splitters into one discretizer: each feature name is
# mapped to the splitter instance configured for it, and ID is passed as a
# keep column.
cust = DiscreteCust(
    keep_columns=["ID"],
    date_columns=[],
    cat_spliter={"EDUCATION": sc},
    num_spliter={"LIMIT_BAL": sn},
)
# Kept as the last expression so the fitted estimator's repr is displayed.
cust.fit(train_x, train_y)

DiscreteCust(cat_spliter={'EDUCATION': SplitCat()}, date_columns=[],
             keep_columns=['ID'], num_spliter={'LIMIT_BAL': SplitNum()})

In [25]:
# Preview the training rows after applying the fitted custom discretizer
# (the output keeps ID and shows encoded LIMIT_BAL / EDUCATION values).
cust.transform(train_x).head()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.2s finished


Unnamed: 0,ID,LIMIT_BAL,EDUCATION
17845,17846,0.047881,0.107146
1698,1699,0.172807,-0.229193
9132,9133,-0.478657,0.107146
27932,27933,0.047881,-0.229193
23497,23498,-0.759759,-0.229193


## 自动多特征分箱

In [26]:
# Automatic binning of every listed categorical and numeric feature; fitted on
# the training split only and then applied to both splits.
auto = DiscreteAuto(
    keep_columns=["ID"],
    date_columns=[],
    cat_columns=cat_columns,
    num_columns=num_columns,
)
auto.fit(train_x, train_y)
# NOTE(review): train_x / test_x are overwritten with their transformed
# versions, so re-running this cell on its own would fit on already
# transformed data — re-run from the train/test split cell instead.
train_x = auto.transform(train_x)
test_x = auto.transform(test_x)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   3 out of  20 | elapsed:    0.5s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done   5 out of  20 | elapsed:    0.7s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   7 out of  20 | elapsed:    1.3s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:    1.3s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    2.1s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  13 out of  20 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  15 out of  20 | elapsed:    2.2s remaining:    0.7s
[Pa

In [27]:
# Print the first rows of the per-feature information value (IvS) scores as Markdown.
print(auto.information_value_score.head().to_markdown())

|       |      IvS |
|:------|---------:|
| PAY_0 | 0.876508 |
| PAY_2 | 0.561801 |
| PAY_3 | 0.423221 |
| PAY_4 | 0.372056 |
| PAY_5 | 0.338337 |


In [28]:
# Print the first rows of the per-bucket table (counts, WoE, IvS) as Markdown.
print(auto.information_value_table.head().to_markdown())

|    | Column   | Bucket   |   CntPositive |   CntNegative |       WoE |       IvS |
|---:|:---------|:---------|--------------:|--------------:|----------:|----------:|
|  0 | PAY_0    | (-inf,0] |          2391 |         14987 | -0.578847 | 0.217663  |
|  1 | PAY_0    | (0,1]    |           959 |          1821 |  0.615374 | 0.0544047 |
|  2 | PAY_0    | (1,2]    |          1367 |           616 |  2.05375  | 0.490955  |
|  3 | PAY_0    | (2,+inf) |           268 |            91 |  2.33675  | 0.113486  |
|  4 | PAY_0    | [nan]    |             0 |             0 |  0        | 0         |


## 特征选择

In [29]:
# First feature-selection pass (SelectBins): fitted on the training split,
# then applied to both splits.
select = SelectBins(keep_columns=["ID"], date_columns=[])
select.fit(train_x, train_y)
# NOTE(review): train_x / test_x are overwritten in place of the previous
# stage's output, so this cell is not safe to re-run in isolation.
train_x = select.transform(train_x)
test_x  = select.transform(test_x )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0690s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of  23 | elapsed:    0.0s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done   4 out of  23 | elapsed:    0.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done   6 out of  23 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of  23 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  16 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  23 | elap

In [30]:
# Second feature-selection pass (SelectCIFE) with nums_feature=10; the name
# `select` is reused, replacing the SelectBins instance from the previous cell.
select = SelectCIFE(keep_columns=["ID"], date_columns=[], nums_feature=10)
select.fit(train_x, train_y)
# NOTE(review): train_x / test_x are overwritten again, so this cell is not
# safe to re-run in isolation.
train_x = select.transform(train_x)
test_x  = select.transform(test_x )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0150s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  14 ta

[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  78 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 100 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1900s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 104 tasks      | elap

## 模型训练

In [31]:
# Train the linear model on the selected training features, then report the
# score on both splits (labelled "ks" by the author).
lmclassifier = LMClassifier(keep_columns=["ID"], date_columns=[])
lmclassifier.fit(train_x, train_y)
print(f"train ks {lmclassifier.score(train_x, train_y)}")
print(f"test  ks {lmclassifier.score(test_x, test_y)}")

train ks 0.41052
test  ks 0.39004


In [32]:
# Display the fitted model's summary (coefficients, standard errors, p-values).
lmclassifier.model.summary()

0,1,2,3
Dep. Variable:,default.payment.next.month,No. Observations:,22500.0
Model:,GLM,Df Residuals:,22495.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-10015.0
Date:,"Thu, 17 Mar 2022",Deviance:,20031.0
Time:,10:22:18,Pearson chi2:,22300.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2445,0.018,-69.346,0.000,-1.280,-1.209
PAY_0,0.7890,0.023,34.098,0.000,0.744,0.834
PAY_2,0.1410,0.033,4.216,0.000,0.075,0.207
PAY_4,0.2978,0.038,7.756,0.000,0.223,0.373
PAY_3,0.2305,0.040,5.752,0.000,0.152,0.309


## 评分卡生成

In [33]:
# Build the scorecard from the fitted discretizer and classifier.
# NOTE(review): BASE / PDO / ODDS are presumably the base score,
# points-to-double-odds and reference odds — confirm against skcredit.
lmcreditcard = LMCreditcard(
    keep_columns=["ID"],
    date_columns=[],
    discrete=auto,
    lmclassifier=lmclassifier,
    BASE=500,
    PDO=20,
    ODDS=1,
)
# Print the per-bucket scorecard as Markdown.
print(lmcreditcard.show_scorecard().to_markdown())

|    | Column   | Bucket    |       WoE |   Coefficients |   PartialScore |   OffsetScores |
|---:|:---------|:----------|----------:|---------------:|---------------:|---------------:|
|  0 | PAY_0    | (-inf,0]  | -0.578847 |       0.788993 |       13.1778  |        535.907 |
|  1 | PAY_0    | (0,1]     |  0.615374 |       0.788993 |      -14.0093  |        535.907 |
|  2 | PAY_0    | (1,2]     |  2.05375  |       0.788993 |      -46.7547  |        535.907 |
|  3 | PAY_0    | (2,+inf)  |  2.33675  |       0.788993 |      -53.1974  |        535.907 |
|  4 | PAY_0    | [nan]     |  0        |       0.788993 |       -0       |        535.907 |
|  0 | PAY_2    | (-inf,0]  | -0.387067 |       0.140988 |        1.57461 |        535.907 |
|  1 | PAY_2    | (0,2]     |  1.49205  |       0.140988 |       -6.06972 |        535.907 |
|  2 | PAY_2    | (2,+inf)  |  1.72753  |       0.140988 |       -7.02768 |        535.907 |
|  3 | PAY_2    | [nan]     |  0        |       0.140988 |       -0   