In [1]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skcredit.feature_discretization import SplitCat
from skcredit.feature_discretization import SplitNum
from skcredit.feature_discretization import DiscreteAuto
from skcredit.feature_discretization import DiscreteCust
from skcredit.feature_selection import SelectBins
from skcredit.feature_selection import SelectCIFE
from skcredit.linear_model import LMClassifier
from skcredit.linear_model import LMCreditcard
# Seed NumPy's legacy global RNG so any code drawing from it is repeatable.
np.random.seed(7)
# Show every row/column when a frame is displayed (None = no limit).
# Use the fully-qualified option names: pandas resolves option keys by regex,
# so the bare "max_rows"/"max_columns" patterns also match
# "styler.render.max_rows"/"styler.render.max_columns" in pandas versions that
# define them, and set_option then raises OptionError (pattern matched
# multiple keys).
pd.set_option("display.max_rows",    None)
pd.set_option("display.max_columns", None)

[数据介绍]()

## 读入数据

In [2]:
# Load the UCI credit-card CSV; the path is relative to the working directory
# the notebook is run from (one level above it).
dataset = pd.read_csv("../UCI_Credit_Card.csv")

In [3]:
# (rows, columns) of the loaded frame.
dataset.shape

(30000, 25)

In [4]:
# Preview the first rows to check the column names and raw values.
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


## 分类单特征分箱

In [5]:
# Bin a single categorical column (EDUCATION) against the default-payment
# target using SplitCat's default settings. This demo fits on the full dataset.
sc = SplitCat()
sc.fit(dataset["EDUCATION"], dataset["default.payment.next.month"])

SplitCat()

In [6]:
# Per-bucket summary of the fitted splitter: bucket members, positive/negative
# counts, WoE and IvS (see the column headers in the output).
sc.table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,WoE,IvS
0,EDUCATION,"{0, 1, 4, 5, 6}",2069,8984,-0.209693,0.015253
1,EDUCATION,"{2, 3}",4567,14380,0.111705,0.008125
2,EDUCATION,{nan},0,0,0.0,0.0


自定义参数

In [7]:
# Default parameters:
# min_bin_cnt_negative=75
# min_bin_cnt_positive=75
# min_information_value_split_gain=0.015
#
# Here the split-gain threshold is lowered from the default to 0.001.
# NOTE(review): for EDUCATION the recorded table that follows is identical to
# the default run, so this example does not visibly demonstrate the parameter.

sc = SplitCat(min_information_value_split_gain=0.001)
sc.fit(dataset["EDUCATION"], dataset["default.payment.next.month"])

SplitCat(min_information_value_split_gain=0.001)

In [8]:
# Bucket table for the splitter fitted with the lowered split-gain threshold.
sc.table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,WoE,IvS
0,EDUCATION,"{0, 1, 4, 5, 6}",2069,8984,-0.209693,0.015253
1,EDUCATION,"{2, 3}",4567,14380,0.111705,0.008125
2,EDUCATION,{nan},0,0,0.0,0.0


## 连续单特征分箱

In [9]:
# Bin a single numeric column (LIMIT_BAL) against the default-payment target
# using SplitNum's default settings. This demo fits on the full dataset.
sn = SplitNum()
sn.fit(dataset["LIMIT_BAL"], dataset["default.payment.next.month"])

SplitNum()

In [10]:
# Per-bucket summary of the fitted splitter: interval bounds, positive/negative
# counts, WoE and IvS (see the column headers in the output).
sn.table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,WoE,IvS
0,LIMIT_BAL,"(-inf,40000.0]",1555,2756,0.686382,0.079873
1,LIMIT_BAL,"(40000.0,140000.0]",2767,8212,0.170854,0.011189
2,LIMIT_BAL,"(140000.0,+inf)",2314,12396,-0.419709,0.076327
3,LIMIT_BAL,"[nan,nan]",0,0,0.0,0.0


自定义参数

In [11]:
# Default parameters:
# min_bin_cnt_negative=75
# min_bin_cnt_positive=75
# min_information_value_split_gain=0.015
#
# Here the split-gain threshold is lowered from the default to 0.001; in the
# recorded output this yields six value intervals for LIMIT_BAL instead of the
# three produced by the default run.

sn = SplitNum(min_information_value_split_gain=0.001)
sn.fit(dataset["LIMIT_BAL"], dataset["default.payment.next.month"])

SplitNum(min_information_value_split_gain=0.001)

In [12]:
# Bucket table for the splitter fitted with the lowered split-gain threshold.
sn.table

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,WoE,IvS
0,LIMIT_BAL,"(-inf,40000.0]",1555,2756,0.686382,0.079873
1,LIMIT_BAL,"(40000.0,70000.0]",1328,3593,0.263374,0.012204
2,LIMIT_BAL,"(70000.0,140000.0]",1439,4619,0.092457,0.001771
3,LIMIT_BAL,"(140000.0,240000.0]",1326,6317,-0.302391,0.021335
4,LIMIT_BAL,"(240000.0,360000.0]",694,3897,-0.466803,0.029042
5,LIMIT_BAL,"(360000.0,+inf)",294,2182,-0.74573,0.036606
6,LIMIT_BAL,"[nan,nan]",0,0,0.0,0.0


## 数据整理

In [13]:
# Columns handed to the discretizers as categorical features.
cat_columns = ["SEX", "EDUCATION", "MARRIAGE"]

# Columns handed to the discretizers as numeric features, in this order:
# LIMIT_BAL, AGE, the PAY_* series, then PAY_AMT1..6 and BILL_AMT1..6.
# The PAY_* series is PAY_0 followed by PAY_2..PAY_6 (the dataset header has
# no PAY_1 column), so PAY_0 is listed explicitly.
num_columns = (
    ["LIMIT_BAL", "AGE"]
    + ["PAY_0"]
    + [f"PAY_{idx}" for idx in range(2, 7)]
    + [f"PAY_AMT{idx}" for idx in range(1, 7)]
    + [f"BILL_AMT{idx}" for idx in range(1, 7)]
)

# Name of the binary label column.
target = "default.payment.next.month"

In [14]:
# Shuffled 75% / 25% train/test split with a fixed seed; the label column is
# removed from the feature frame and passed separately as the target.
train_x, test_x, train_y, test_y = train_test_split(
    dataset.drop(columns=[target]),
    dataset[target],
    train_size=0.75,
    shuffle=True,
    random_state=7,
)

## 手动多特征分箱

对每个特征手动调整参数后进行分箱，以分类特征 EDUCATION 和连续特征 LIMIT_BAL 为例

In [15]:
# Hand-tuned categorical splitter for EDUCATION, now fitted on the training
# split only (the earlier demos used the full dataset).
sc = SplitCat(min_information_value_split_gain=0.001)
sc.fit(train_x["EDUCATION"], train_y)

SplitCat(min_information_value_split_gain=0.001)

In [16]:
# Hand-tuned numeric splitter for LIMIT_BAL, fitted on the training split only.
sn = SplitNum(min_information_value_split_gain=0.001)
sn.fit(train_x["LIMIT_BAL"], train_y)

SplitNum(min_information_value_split_gain=0.001)

In [17]:
# Bundle the hand-tuned splitters into one discretizer: each dict maps a column
# name to the splitter configured for it, and "ID" is passed as a keep column.
# NOTE(review): sc and sn were already fitted in the cells above; whether
# cust.fit refits them or reuses their state is not visible here — confirm.
cust = DiscreteCust(keep_columns=["ID"], date_columns=[], cat_spliter={"EDUCATION": sc}, num_spliter={"LIMIT_BAL":sn})
cust.fit(train_x, train_y)

DiscreteCust(cat_spliter={'EDUCATION': SplitCat(min_information_value_split_gain=0.001)},
             date_columns=[], keep_columns=['ID'],
             num_spliter={'LIMIT_BAL': SplitNum(min_information_value_split_gain=0.001)})

In [18]:
# Preview the transformed training features. Per the recorded output, ID is
# carried through and LIMIT_BAL / EDUCATION become numeric encodings
# (presumably the per-bucket WoE — confirm).
cust.transform(train_x).head()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    5.9s finished


Unnamed: 0,ID,LIMIT_BAL,EDUCATION
17845,17846,0.047881,0.120993
1698,1699,0.2211,-0.229193
9132,9133,-0.478657,0.120993
27932,27933,0.047881,-0.229193
23497,23498,-0.759759,-0.229193


In [19]:
# NOTE(review): development leftover. This is a mid-notebook import followed by
# IPython's `??` source-introspection syntax, which is not valid plain Python
# and produces no recorded output here. Consider removing the cell (or moving
# the import to the top import cell) before sharing the notebook.
from skcredit.feature_discretization import Discrete
??Discrete

## 自动多特征分箱

In [20]:
# Automatic binning for all listed categorical and numeric columns, fitted on
# the training split only; "ID" is passed as a keep column.
discrete = DiscreteAuto(
    keep_columns=["ID"],
    date_columns=[],
    cat_columns=cat_columns,
    num_columns=num_columns,
)
discrete.fit(train_x, train_y)
# Rebind both splits to their encoded versions (train first, then test).
# Re-running this cell without re-running the split cell would feed the
# already-encoded frames back into the discretizer.
train_x, test_x = [discrete.transform(frame) for frame in (train_x, test_x)]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0770s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    8.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    8.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1770s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 ta

In [21]:
# Per-feature information value; the first rows in the recorded output appear
# sorted from highest to lowest IvS.
discrete.information_value_score.head()

Unnamed: 0,IvS
PAY_0,0.874626
PAY_2,0.560568
PAY_3,0.420644
PAY_4,0.364907
PAY_5,0.330149


In [22]:
# First rows of the per-bucket table (same columns as a single splitter's
# table), here covering several features in one frame.
discrete.information_value_table.head()

Unnamed: 0,Column,Bucket,CntPositive,CntNegative,WoE,IvS
0,PAY_0,"(-inf,0]",2391,14987,-0.578847,0.217663
1,PAY_0,"(0,1]",959,1821,0.615374,0.054405
2,PAY_0,"(1,+inf)",1635,707,2.094992,0.602558
3,PAY_0,"[nan,nan]",0,0,0.0,0.0
0,PAY_2,"(-inf,0]",3105,16066,-0.387067,0.113953


## 特征选择

In [23]:
# First feature-selection stage (SelectBins; its selection criteria live in
# skcredit and are not shown here). Fitted on the training split only.
select = SelectBins(
    keep_columns=["ID"],
    date_columns=[],
)
select.fit(train_x, train_y)
# Rebind both splits to the reduced frames (train first, then test).
train_x, test_x = [select.transform(frame) for frame in (train_x, test_x)]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1170s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  18 out of  23 | elapsed:    0.6s remaining:    0.1s
[Parallel(n_jobs

In [24]:
# Second feature-selection stage (SelectCIFE). The name `select` is rebound, so
# the SelectBins instance from the previous stage is no longer reachable.
# nums_feature=10 is presumably the number of features to retain — confirm
# against the skcredit documentation.
select = SelectCIFE(
    keep_columns=["ID"],
    date_columns=[],
    nums_feature=10,
)
select.fit(train_x, train_y)
# Rebind both splits to the reduced frames (train first, then test).
train_x, test_x = [select.transform(frame) for frame in (train_x, test_x)]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0330s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0830s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jo

## 模型训练

In [25]:
# Fit the classifier on the selected training features, then report its score
# on each split (the printed label calls it "ks").
lmclassifier = LMClassifier(
    keep_columns=["ID"],
    date_columns=[],
)
lmclassifier.fit(train_x, train_y)

ks_train = lmclassifier.score(train_x, train_y)
print("train ks {}".format(ks_train))

ks_test = lmclassifier.score(test_x, test_y)
print("test  ks {}".format(ks_test))

train ks 0.41385
test  ks 0.39301


In [26]:
# Display the fitted model summary; the recorded output is a binomial GLM
# (logit link) results table with one coefficient per retained feature.
lmclassifier.model()

0,1,2,3
Dep. Variable:,default.payment.next.month,No. Observations:,22500.0
Model:,GLM,Df Residuals:,22492.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-9848.8
Date:,"Wed, 29 Sep 2021",Deviance:,19698.0
Time:,16:12:41,Pearson chi2:,22200.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2446,0.018,-68.556,0.000,-1.280,-1.209
PAY_2,0.1078,0.033,3.226,0.001,0.042,0.173
LIMIT_BAL,0.3955,0.045,8.716,0.000,0.307,0.484
PAY_6,0.2970,0.039,7.525,0.000,0.220,0.374
PAY_0,0.7521,0.023,32.458,0.000,0.707,0.797
PAY_4,0.1618,0.040,4.034,0.000,0.083,0.240
PAY_AMT4,0.6404,0.055,11.705,0.000,0.533,0.748
PAY_3,0.1943,0.040,4.911,0.000,0.117,0.272


## 评分卡生成

In [27]:
# Build a scorecard from the fitted discretizer and classifier using
# BASE=500, PDO=20, ODDS=1, then display it.
# NOTE(review): the recorded output of this cell is
#   AttributeError: 'DiscreteAuto' object has no attribute 'cat_spliter_'
# so the scorecard is never shown and the notebook does not run to completion.
# Presumably LMCreditcard reads an attribute that this DiscreteAuto version
# does not set — confirm against the installed skcredit version and fix before
# sharing.
lmcreditcard = LMCreditcard(
        keep_columns=["ID"], date_columns=[], discrete=discrete, lmclassifier=lmclassifier, BASE=500,  PDO=20,  ODDS=1)
lmcreditcard.show_scorecard()

AttributeError: 'DiscreteAuto' object has no attribute 'cat_spliter_'