In [1]:
import numpy  as np
import pandas as pd
from skcredit.feature_discrete import SplitMixND, C1Discrete, CXDiscrete
from skcredit.feature_selector import SelectBins, SelectCIFE
from skcredit.linear_model     import LMClassifier, LMCreditcard
from sklearn.model_selection   import train_test_split
# Seed NumPy's legacy global RNG once, up front, so NumPy-driven randomness is repeatable.
np.random.seed(seed=7)

## 读入数据

[数据介绍](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients)

In [2]:
# Load the UCI credit-card default data; the path is relative to the notebook's working directory.
dataset = pd.read_csv("../UCI_Credit_Card.csv")

In [3]:
# Sanity-check the (rows, columns) of the loaded frame.
dataset.shape

(30000, 25)

In [4]:
# Preview the first rows to see the raw column names and value formats.
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [5]:
# Columns treated as categorical features.
cat_columns = ["SEX", "EDUCATION", "MARRIAGE"]

# Columns treated as numeric features: credit limit and age, then the monthly
# repayment-status, payment-amount and bill-amount series (order matters downstream).
num_columns = (
    ["LIMIT_BAL", "AGE"]
    + [f"PAY_{month}" for month in (0, 2, 3, 4, 5, 6)]
    + [f"PAY_AMT{month}" for month in range(1, 7)]
    + [f"BILL_AMT{month}" for month in range(1, 7)]
)

# Name of the binary label column.
target = "default.payment.next.month"

In [6]:
# Categorical variables must be cast to the pandas `category` dtype.
dataset = dataset.astype({column: "category" for column in cat_columns})

## 分类特征

### 单特征分箱

In [7]:
# Fit a single-feature splitter on the categorical EDUCATION column.
# The values below look like the splitter's tuning knobs — presumably its
# defaults; TODO confirm against SplitMixND's signature:
# min_bin_cnt_negative=75
# min_bin_cnt_positive=75
# min_information_value_split_gain=0.005

smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["EDUCATION"]], dataset[target])

In [8]:
# Show the bucket table (counts, WoE, IvS per bucket) for the splitter fitted in the previous cell.
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,EDUCATION,"[0, 1, 4, 5, 6]",2069,8984,0.311784,0.384523,-0.209693,0.015253
1,EDUCATION,"[2, 3]",4567,14380,0.688216,0.615477,0.111705,0.008125
2,EDUCATION,[MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,6636,23364,1.0,1.0,-,0.023378


### 多特征分箱

In [9]:
# Fit one splitter jointly on two categorical columns (EDUCATION and SEX).
smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["EDUCATION", "SEX"]], dataset[target])

In [10]:
# Show the joint bucket table; buckets are written as "<EDUCATION set> @ <SEX set>".
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,EDUCATION @ SEX,"[0, 1, 4, 5, 6] @ [1, 2]",2069,8984,0.311784,0.384523,-0.209693,0.015253
1,EDUCATION @ SEX,"[2, 3] @ [2]",2614,8969,0.393912,0.383881,0.025794,0.000259
2,EDUCATION @ SEX,"[2, 3] @ [1]",1953,5411,0.294304,0.231596,0.23962,0.015026
3,EDUCATION @ SEX,"[0, 1, 2, 3, 4, 5, 6] @ [MISSING]",0,0,0.0,0.0,0.0,0.0
4,EDUCATION @ SEX,"[MISSING] @ [1, 2]",0,0,0.0,0.0,0.0,0.0
5,EDUCATION @ SEX,[MISSING] @ [MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,6636,23364,1.0,1.0,-,0.030538


In [11]:
# Fit one splitter jointly on three categorical columns.
smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["EDUCATION", "SEX", "MARRIAGE"]], dataset[target])

In [12]:
# Show the three-way bucket table for the splitter fitted in the previous cell.
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,EDUCATION @ SEX @ MARRIAGE,"[0, 1, 4, 5, 6] @ [1, 2] @ [0, 1, 2, 3]",2069,8984,0.311784,0.384523,-0.209693,0.015253
1,EDUCATION @ SEX @ MARRIAGE,"[2, 3] @ [2] @ [0, 1, 2, 3]",2614,8969,0.393912,0.383881,0.025794,0.000259
2,EDUCATION @ SEX @ MARRIAGE,"[2, 3] @ [1] @ [0, 1, 2, 3]",1953,5411,0.294304,0.231596,0.23962,0.015026
3,EDUCATION @ SEX @ MARRIAGE,"[0, 1, 2, 3, 4, 5, 6] @ [1, 2] @ [MISSING]",0,0,0.0,0.0,0.0,0.0
4,EDUCATION @ SEX @ MARRIAGE,"[0, 1, 2, 3, 4, 5, 6] @ [MISSING] @ [0, 1, 2, 3]",0,0,0.0,0.0,0.0,0.0
5,EDUCATION @ SEX @ MARRIAGE,"[0, 1, 2, 3, 4, 5, 6] @ [MISSING] @ [MISSING]",0,0,0.0,0.0,0.0,0.0
6,EDUCATION @ SEX @ MARRIAGE,"[MISSING] @ [1, 2] @ [0, 1, 2, 3]",0,0,0.0,0.0,0.0,0.0
7,EDUCATION @ SEX @ MARRIAGE,"[MISSING] @ [1, 2] @ [MISSING]",0,0,0.0,0.0,0.0,0.0
8,EDUCATION @ SEX @ MARRIAGE,"[MISSING] @ [MISSING] @ [0, 1, 2, 3]",0,0,0.0,0.0,0.0,0.0
9,EDUCATION @ SEX @ MARRIAGE,[MISSING] @ [MISSING] @ [MISSING],0,0,0.0,0.0,0.0,0.0


## 连续特征

### 单特征分箱

In [13]:
# Fit a single-feature splitter on the numeric PAY_AMT1 column.
# The values below look like the splitter's tuning knobs — presumably its
# defaults; TODO confirm against SplitMixND's signature:
# min_bin_cnt_negative=75
# min_bin_cnt_positive=75
# min_information_value_split_gain=0.005

smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["PAY_AMT1"]], dataset[target])

In [14]:
# Show the interval buckets found for PAY_AMT1 together with their WoE / IvS.
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,PAY_AMT1,"(-inf, 13.000000]",1930,3453,0.290838,0.147791,0.676964,0.096837
1,PAY_AMT1,"(13.000000, 4501.000000]",3487,12426,0.525467,0.531844,-0.012062,7.7e-05
2,PAY_AMT1,"(4501.000000, 25794.000000]",1136,6585,0.171187,0.281844,-0.498594,0.055173
3,PAY_AMT1,"(25794.000000, +inf)",83,900,0.012508,0.038521,-1.124867,0.029261
4,PAY_AMT1,[MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,6636,23364,1.0,1.0,-,0.181348


### 多特征分箱

In [15]:
# Fit one splitter jointly on two numeric columns (PAY_AMT1 and PAY_AMT2).
smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["PAY_AMT1", "PAY_AMT2"]], dataset[target])

In [16]:
# Show the two-dimensional interval buckets for the splitter fitted in the previous cell.
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,PAY_AMT1 @ PAY_AMT2,"(-inf, 13.000000] @ (-inf, +inf)",1930,3453,0.290838,0.147791,0.676964,0.096837
1,PAY_AMT1 @ PAY_AMT2,"(13.000000, +inf) @ (-inf, 33.000000]",972,1945,0.146474,0.083248,0.565026,0.035724
2,PAY_AMT1 @ PAY_AMT2,"(13.000000, 4739.000000] @ (33.000000, 2000.00...",1606,5580,0.242013,0.238829,0.013245,4.2e-05
3,PAY_AMT1 @ PAY_AMT2,"(4739.000000, +inf) @ (33.000000, 2000.000000]",110,733,0.016576,0.031373,-0.637978,0.00944
4,PAY_AMT1 @ PAY_AMT2,"(13.000000, 3000.000000] @ (2000.000000, 15043...",790,3270,0.119048,0.139959,-0.161825,0.003384
5,PAY_AMT1 @ PAY_AMT2,"(3000.000000, +inf) @ (2000.000000, 15043.000000]",1103,6769,0.166215,0.289719,-0.555633,0.068623
6,PAY_AMT1 @ PAY_AMT2,"(13.000000, +inf) @ (15043.000000, +inf)",125,1614,0.018837,0.069081,-1.29947,0.065291
7,PAY_AMT1 @ PAY_AMT2,"(-inf, +inf) @ [MISSING]",0,0,0.0,0.0,0.0,0.0
8,PAY_AMT1 @ PAY_AMT2,"[MISSING] @ (-inf, +inf)",0,0,0.0,0.0,0.0,0.0
9,PAY_AMT1 @ PAY_AMT2,[MISSING] @ [MISSING],0,0,0.0,0.0,0.0,0.0


In [17]:
# Fit one splitter jointly on three numeric columns.
smnd = SplitMixND()
# Index the label through the shared `target` name rather than repeating the literal.
smnd.fit(dataset[["PAY_AMT1", "PAY_AMT2", "PAY_AMT3"]], dataset[target])

In [18]:
# Show the three-dimensional interval buckets for the splitter fitted in the previous cell.
smnd.build_table()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(-inf, 13.000000] @ (-inf, +inf) @ (-inf, +inf)",1930,3453,0.290838,0.147791,0.676964,0.096837
1,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, +inf) @ (-inf, 33.000000] @ (-inf,...",972,1945,0.146474,0.083248,0.565026,0.035724
2,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, +inf) @ (33.000000, 2000.000000] @...",287,612,0.043249,0.026194,0.501437,0.008552
3,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, 4739.000000] @ (33.000000, 2000.00...",1343,5060,0.202381,0.216573,-0.067774,0.000962
4,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(4739.000000, +inf) @ (33.000000, 2000.000000]...",86,641,0.01296,0.027435,-0.749995,0.010857
5,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, +inf) @ (2000.000000, +inf) @ (-in...",369,871,0.055606,0.03728,0.399842,0.007328
6,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, 3803.000000] @ (2000.000000, 15043...",758,3792,0.114225,0.162301,-0.351278,0.016888
7,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(3803.000000, +inf) @ (2000.000000, 15043.0000...",789,5484,0.118897,0.23472,-0.680137,0.078776
8,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(13.000000, +inf) @ (15043.000000, +inf) @ (8....",102,1506,0.015371,0.064458,-1.433553,0.070369
9,PAY_AMT1 @ PAY_AMT2 @ PAY_AMT3,"(-inf, +inf) @ (-inf, +inf) @ [MISSING]",0,0,0.0,0.0,0.0,0.0


## 并行单特征分箱

In [7]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the shuffle repeatable.
trn_x, tst_x, trn_y, tst_y = train_test_split(
    dataset[cat_columns + num_columns],
    dataset[target],
    train_size=0.75,
    shuffle=True,
    random_state=7,
)

In [8]:
# Drop the shuffled original row labels so every split is indexed 0..n-1.
trn_x, tst_x, trn_y, tst_y = (
    part.reset_index(drop=True) for part in (trn_x, tst_x, trn_y, tst_y)
)

In [9]:
# Fit the single-feature discretizer on the training split only (no columns kept aside, no date columns).
c1 = C1Discrete(keep_columns=[], date_columns=[])
c1.fit(trn_x, trn_y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done   2 out of  23 | elapsed:    3.0s remaining:   32.1s
[Parallel(n_jobs=-1)]: Done   4 out of  23 | elapsed:    3.1s remaining:   15.0s
[Parallel(n_jobs=-1)]: Done   6 out of  23 | elapsed:    3.1s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done   8 out of  23 | elapsed:    3.2s remaining:    6.1s
[Parallel(n_jobs=-1)]: Done  10 out of  23 | elapsed:    3.3s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  12 out of  23 | elapsed:    3.4s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  14 out of  23 | elapsed:    3.5s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done  16 out of  23 | elapsed:    3.5s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  18 out of  23 | elapsed:    3.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  20 out of  23 | elapsed:    3.7s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  23 out of  23 | elapse

In [10]:
# Per-column information value (IvS) from the fitted single-feature discretizer.
c1.information_value_score

Unnamed: 0,IvS
SEX,0.008907
EDUCATION,0.027667
MARRIAGE,0.0
LIMIT_BAL,0.179315
AGE,0.01411
PAY_0,0.874626
PAY_2,0.563306
PAY_3,0.421203
PAY_4,0.371034
PAY_5,0.336415


In [11]:
# Per-bucket detail (counts, WoE, IvS) behind the scores above, one section per column.
c1.information_value_table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,SEX,[2],2824,10735,0.566499,0.612903,-0.078731,0.003653
1,SEX,[1],2161,6780,0.433501,0.387097,0.113218,0.005254
2,SEX,[MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,4985,17515,1.0,1.0,-,0.008907
0,EDUCATION,"[0, 1, 4, 5]",1529,6756,0.30672,0.385727,-0.229193,0.018108
1,EDUCATION,"[2, 3, 6]",3456,10759,0.69328,0.614273,0.120993,0.009559
2,EDUCATION,[MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,4985,17515,1.0,1.0,-,0.027667
0,MARRIAGE,"[0, 1, 2, 3]",4985,17515,1.0,1.0,0.0,0.0
1,MARRIAGE,[MISSING],0,0,0.0,0.0,0.0,0.0


In [12]:
# Apply the fitted single-feature discretizer to the train split, then the test split.
trn_x_c1, tst_x_c1 = (c1.transform(part) for part in (trn_x, tst_x))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0333s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of  23 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   4 out of  23 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   6 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  16 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  23 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  23 | elap

## 并行多特征分箱

In [19]:
# Fit a two-feature splitter on (PAY_0, PAY_2) using the training split only.
smnd1 = SplitMixND()
smnd1.fit(trn_x[["PAY_0", "PAY_2"]], trn_y)

In [20]:
# Fit a second two-feature splitter, this time on (PAY_0, PAY_3), again on the training split.
smnd2 = SplitMixND()
smnd2.fit(trn_x[["PAY_0", "PAY_3"]], trn_y)

In [22]:
# Register the two pre-fitted pair splitters under "<col> @ <col>" keys.
cx = CXDiscrete(
    keep_columns=[],
    date_columns=[],
    feature_spliter={
        "PAY_0 @ PAY_2": smnd1,
        "PAY_0 @ PAY_3": smnd2,
    },
)

In [24]:
# Fit the cross-feature discretizer on just the three columns its splitters reference.
cx.fit(trn_x[["PAY_0", "PAY_2", "PAY_3"]], trn_y)

In [25]:
# Information value (IvS) of each crossed feature pair.
cx.information_value_score

Unnamed: 0,IvS
PAY_0 @ PAY_2,0.874626
PAY_0 @ PAY_3,0.8802


In [26]:
# Per-bucket detail (counts, WoE, IvS) for each crossed feature pair.
cx.information_value_table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,PctPositive,PctNegative,WoE,IvS
0,PAY_0 @ PAY_2,"(-inf, 0.000000] @ (-inf, +inf)",2391,14987,0.479639,0.855667,-0.578847,0.217663
1,PAY_0 @ PAY_2,"(0.000000, 1.000000] @ (-inf, +inf)",959,1821,0.192377,0.103968,0.615374,0.054405
2,PAY_0 @ PAY_2,"(1.000000, +inf) @ (-inf, +inf)",1635,707,0.327984,0.040365,2.094992,0.602558
3,PAY_0 @ PAY_2,"(-inf, +inf) @ [MISSING]",0,0,0.0,0.0,0.0,0.0
4,PAY_0 @ PAY_2,"[MISSING] @ (-inf, +inf)",0,0,0.0,0.0,0.0,0.0
5,PAY_0 @ PAY_2,[MISSING] @ [MISSING],0,0,0.0,0.0,0.0,0.0
0,ALL,ALL,4985,17515,1.0,1.0,-,0.874626
0,PAY_0 @ PAY_3,"(-inf, 0.000000] @ (-inf, +inf)",2391,14987,0.479639,0.855667,-0.578847,0.217663
1,PAY_0 @ PAY_3,"(0.000000, 1.000000] @ (-inf, +inf)",959,1821,0.192377,0.103968,0.615374,0.054405
2,PAY_0 @ PAY_3,"(1.000000, +inf) @ (-inf, 0.000000]",670,351,0.134403,0.02004,1.903116,0.217646


In [27]:
# Apply the fitted cross-feature discretizer to the train split, then the test split.
trn_x_cx, tst_x_cx = (cx.transform(part) for part in (trn_x, tst_x))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0653s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0169s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    0.0s finished


In [28]:
# Place the single-feature and cross-feature transformed columns side by side.
# NOTE(review): this rebinds trn_x / tst_x, so re-running the c1 / cx transform
# cells after this one would feed already-transformed frames back into them.
trn_x = pd.concat([trn_x_c1, trn_x_cx], axis=1)
tst_x = pd.concat([tst_x_c1, tst_x_cx], axis=1)

## 特征选择

In [29]:
# First selection pass: fit on the training split only, then filter both splits
# with the same fitted selector.
select = SelectBins(keep_columns=[], date_columns=[])
select.fit(trn_x, trn_y)
trn_x, tst_x = (select.transform(part) for part in (trn_x, tst_x))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0517s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 out of  25 | elapsed:    0.0s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done   4 out of  25 | elapsed:    0.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done   6 out of  25 | elapsed:    0.0s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   8 out of  25 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  14 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  18 out of  25 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  25 | elap

In [30]:
# Second selection pass, capped via nums_columns=10; fit on the training split
# only, then filter both splits with the same fitted selector.
select = SelectCIFE(keep_columns=[], date_columns=[], nums_columns=10)
select.fit(trn_x, trn_y)
trn_x, tst_x = (select.transform(part) for part in (trn_x, tst_x))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0181s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  14 ta

In [31]:
# Inspect the surviving columns; values are the WoE-encoded features after both selection passes.
trn_x.head()

Unnamed: 0,SEX,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_6,PAY_AMT4,PAY_0 @ PAY_2,PAY_0 @ PAY_3
0,0.113218,0.178152,-0.578847,-0.386973,-0.318642,-0.265879,-0.211054,-0.134603,-0.578847,-0.578847
1,0.113218,0.178152,-0.578847,-0.386973,-0.318642,-0.265879,1.345301,-0.134603,-0.578847,-0.578847
2,0.113218,-0.360787,-0.578847,-0.386973,-0.318642,-0.265879,-0.211054,-0.51447,-0.578847,-0.578847
3,-0.078731,0.178152,2.094992,-0.386973,-0.318642,-0.265879,-0.211054,-0.51447,2.094992,1.903116
4,-0.078731,-0.759759,-0.578847,-0.386973,-0.318642,-0.265879,-0.211054,-0.51447,-0.578847,-0.578847


## 模型训练

In [33]:
# Fit the linear model on the selected features and report its score on both splits.
# The "ks" label suggests the score is a KS statistic — TODO confirm in LMClassifier.score.
lm = LMClassifier(keep_columns=[], date_columns=[])
lm.fit(trn_x, trn_y)
print(f"trn ks {lm.score(trn_x, trn_y)}")
print(f"tst ks {lm.score(tst_x, tst_y)}")

trn ks 0.4152
tst ks 0.39046


In [34]:
# Show the fitted model's regression summary (coefficients, standard errors, p-values).
lm.model.summary()

0,1,2,3
Dep. Variable:,default.payment.next.month,No. Observations:,22500.0
Model:,GLM,Df Residuals:,22493.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-9874.0
Date:,"Sun, 05 Jun 2022",Deviance:,19748.0
Time:,21:03:54,Pearson chi2:,22200.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.1648
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2467,0.018,-68.770,0.000,-1.282,-1.211
PAY_3,0.2203,0.039,5.609,0.000,0.143,0.297
PAY_4,0.2407,0.038,6.339,0.000,0.166,0.315
PAY_0,0.7607,0.023,32.980,0.000,0.716,0.806
LIMIT_BAL,0.4292,0.044,9.664,0.000,0.342,0.516
PAY_AMT4,0.5901,0.055,10.805,0.000,0.483,0.697
PAY_2,0.1308,0.033,3.945,0.000,0.066,0.196


## 评分卡生成

In [35]:
# Assemble the scorecard from the fitted discretizers (c1, cx) and the linear model (lm),
# then display it.
# NOTE(review): the two leading [] arguments are positional — presumably keep_columns and
# date_columns as in the other constructors; confirm against LMCreditcard's signature.
# BASE / PDO / ODDS look like the usual scorecard scaling parameters — TODO confirm.
lmcreditcard = LMCreditcard(
        [], [], c1, cx, lm, BASE=500,  PDO=20,  ODDS=1)
lmcreditcard.show_scorecard()

Unnamed: 0,Column,Bucket,WoE,Coefficients,PartialScore,OffsetScores
0,PAY_3,"(-inf, 1.000000]",-0.318642,0.220332,2.025742,535.971735
1,PAY_3,"(1.000000, +inf)",1.368731,0.220332,-8.701611,535.971735
2,PAY_3,[MISSING],0.0,0.220332,-0.0,535.971735
0,PAY_4,"(-inf, 1.000000]",-0.265879,0.240652,1.846194,535.971735
1,PAY_4,"(1.000000, 2.000000]",1.358094,0.240652,-9.430247,535.971735
2,PAY_4,"(2.000000, +inf)",1.9612,0.240652,-13.618059,535.971735
3,PAY_4,[MISSING],0.0,0.240652,-0.0,535.971735
0,PAY_0,"(-inf, 0.000000]",-0.578847,0.760709,12.705364,535.971735
1,PAY_0,"(0.000000, 1.000000]",0.615374,0.760709,-13.507112,535.971735
2,PAY_0,"(1.000000, +inf)",2.094992,0.760709,-45.983865,535.971735
