In [22]:
import numpy  as np
import pandas as pd
from sklearn.model_selection import train_test_split
from skcredit.feature_preprocessings import FormatTabular
from skcredit.feature_discretization import Discrete
from skcredit.feature_selection import SelectBins
from skcredit.feature_selection import SelectCIFE
from skcredit.linear_model import LMClassifier
from skcredit.linear_model import LMCreditcard
# Seed NumPy's legacy global RNG so any code drawing from it is repeatable across runs.
np.random.seed(7)
# Use the fully-qualified option names. The short forms "max_rows" / "max_columns"
# depend on pandas' partial-name matching, which raises OptionError as soon as more
# than one registered option matches (e.g. "styler.render.max_rows" in newer pandas).
# None removes the display truncation limit for rows / columns.
pd.set_option("display.max_rows",    None)
pd.set_option("display.max_columns", None)

[数据介绍](https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients)

## 读入数据

In [2]:
# Load the raw credit-card data into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
dataset = pd.read_csv("../UCI_Credit_Card.csv")

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
dataset.shape

(30000, 25)

In [4]:
# Preview the first rows to eyeball column names and value encodings.
dataset.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


## 分类变量分箱

In [5]:
from skcredit.feature_discretization.SplitCat import binning_cat

# Single-column demo on a categorical feature, run on the full (unsplit) dataset.
# Missing values are replaced by the sentinel string "missing" and every code is
# cast to str before it is handed to binning_cat.
education_codes = dataset["EDUCATION"].fillna("missing").astype(str)
default_labels  = dataset["default.payment.next.month"]

sc = binning_cat(
    x=education_codes,
    y=default_labels,
    column="EDUCATION",
    target="default.payment.next.month",
)

In [6]:
# Show the bucket table produced for EDUCATION (bucket, class counts, WoE, IVS).
sc.table

Unnamed: 0,Column,Bucket,CntNegative,Cntpositive,WoE,IVS
0,EDUCATION,"{6, 1, 0, 5, 4}",8984,2069,-0.209693,0.015253
1,EDUCATION,"{2, 3}",14380,4567,0.111705,0.008125
2,EDUCATION,{missing},0,0,0.0,0.0


## 连续变量分箱

In [7]:
from skcredit.feature_discretization.SplitNum import binning_num

# Single-column demo on a numeric feature, run on the full (unsplit) dataset.
# Missing values are replaced by the numeric sentinel -999999.0 before binning.
credit_limit   = dataset["LIMIT_BAL"].fillna(-999999.0)
default_labels = dataset["default.payment.next.month"]

sn = binning_num(
    x=credit_limit,
    y=default_labels,
    column="LIMIT_BAL",
    target="default.payment.next.month",
)

In [8]:
# Show the bucket table produced for LIMIT_BAL (interval, class counts, WoE, IVS).
sn.table

Unnamed: 0,Column,Bucket,CntNegative,Cntpositive,WoE,IVS
0,LIMIT_BAL,"(-inf,40000.0]",2756,1555,0.686382,0.079873
1,LIMIT_BAL,"(40000.0,140000.0]",8212,2767,0.170854,0.011189
2,LIMIT_BAL,"(140000.0,+inf)",12396,2314,-0.419709,0.076327
3,LIMIT_BAL,[-999999],0,0,0.0,0.0


## 自动分箱

In [9]:
# Name of the label column to be predicted.
target = "default.payment.next.month"

# Columns handled as categorical features.
cat_columns = ["SEX", "EDUCATION", "MARRIAGE"]

# Columns handled as numeric features. The monthly families are generated from
# their naming pattern; note that the PAY_* family has no PAY_1 column.
pay_status_columns  = ["PAY_{}".format(month)      for month in (0, 2, 3, 4, 5, 6)]
pay_amount_columns  = ["PAY_AMT{}".format(month)   for month in range(1, 7)]
bill_amount_columns = ["BILL_AMT{}".format(month)  for month in range(1, 7)]

num_columns = (
    ["LIMIT_BAL", "AGE"]
    + pay_status_columns
    + pay_amount_columns
    + bill_amount_columns
)

In [10]:
# Separate the predictors from the label, then hold out 25% of the rows for testing.
# The split is shuffled with a fixed random_state so it is repeatable.
all_x = dataset.drop(columns=[target])
all_y = dataset[target]

train_x, test_x, train_y, test_y = train_test_split(
    all_x,
    all_y,
    train_size=0.75,
    shuffle=True,
    random_state=7,
)

In [11]:
# Fit the tabular formatter on the training split only, then apply it to both splits.
# NOTE: train_x / test_x are rebound here, so re-running this cell without first
# re-running the split would feed already-transformed frames back in.
ft = FormatTabular(
    keep_columns=["ID"],
    date_columns=[],
    cat_columns=cat_columns,
    num_columns=num_columns,
)
ft.fit(train_x, train_y)
train_x, test_x = ft.transform(train_x), ft.transform(test_x)

In [12]:
# Fit the discretizer on the training split only, then apply it to both splits.
# NOTE: train_x / test_x are rebound, so this cell is not safe to re-run on its own.
discrete = Discrete(
    keep_columns=["ID"],
    date_columns=[],
)
discrete.fit(train_x, train_y)
train_x, test_x = discrete.transform(train_x), discrete.transform(test_x)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    9.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1880s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  

In [24]:
# Preview the per-feature information value (IVS) summary held by the fitted discretizer.
discrete.information_value_score.head()

Unnamed: 0,IVS
PAY_0,0.874626
PAY_2,0.563306
PAY_3,0.421203
PAY_4,0.365194
PAY_5,0.330149


In [25]:
# Preview the per-bucket detail (bucket, class counts, WoE, IVS) held by the fitted discretizer.
discrete.information_value_table.head()

Unnamed: 0,Column,Bucket,CntNegative,Cntpositive,WoE,IVS
0,PAY_0,"(-inf,0]",14987,2391,-0.578847,0.217663
1,PAY_0,"(0,1]",1821,959,0.615374,0.054405
2,PAY_0,"(1,+inf)",707,1635,2.094992,0.602558
3,PAY_0,[-999999],0,0,0.0,0.0
0,PAY_2,"(-inf,1]",16080,3108,-0.386973,0.114002


## 特征选择

In [17]:
# First selection pass: fit SelectBins on the training split, then filter both splits.
# NOTE: train_x / test_x are rebound, so this cell is not safe to re-run on its own.
select = SelectBins(
    keep_columns=["ID"],
    date_columns=[],
)
select.fit(train_x, train_y)
train_x, test_x = select.transform(train_x), select.transform(test_x)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0980s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  18 out of  23 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs

In [19]:
# Second selection pass: fit SelectCIFE on the training split, then filter both splits.
# The name `select` is reused, so the SelectBins instance is no longer reachable after this.
# nums_feature=10 presumably caps how many features are kept - TODO confirm against skcredit.
select = SelectCIFE(
    keep_columns=["ID"],
    date_columns=[],
    nums_feature=10,
)
select.fit(train_x, train_y)
train_x, test_x = select.transform(train_x), select.transform(test_x)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0300s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1120s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jo

## 模型训练

In [21]:
# Fit the linear classifier on the selected training features.
lmclassifier = LMClassifier(
    keep_columns=["ID"],
    date_columns=[],
)
lmclassifier.fit(train_x, train_y)

# Score both splits; the value is reported as "ks" in the printed labels
# (presumably the Kolmogorov-Smirnov statistic - confirm against skcredit).
train_ks = lmclassifier.score(train_x, train_y)
test_ks  = lmclassifier.score(test_x,  test_y )
print("train ks {}".format(train_ks))
print("test  ks {}".format(test_ks))

train ks 0.41234
test  ks 0.39388


In [26]:
# Display what the classifier's model() method returns (the saved output is a GLM regression summary).
lmclassifier.model()

0,1,2,3
Dep. Variable:,default.payment.next.month,No. Observations:,22500.0
Model:,GLM,Df Residuals:,22492.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-9869.1
Date:,"Wed, 22 Sep 2021",Deviance:,19738.0
Time:,11:39:07,Pearson chi2:,22200.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.2454,0.018,-68.695,0.000,-1.281,-1.210
PAY_5,0.1584,0.042,3.802,0.000,0.077,0.240
LIMIT_BAL,0.4420,0.045,9.857,0.000,0.354,0.530
PAY_AMT4,0.5919,0.058,10.234,0.000,0.479,0.705
PAY_3,0.2122,0.039,5.381,0.000,0.135,0.289
PAY_0,0.7574,0.023,32.783,0.000,0.712,0.803
PAY_4,0.1679,0.044,3.777,0.000,0.081,0.255
PAY_2,0.1206,0.033,3.614,0.000,0.055,0.186


## 评分卡生成

In [23]:
# Build the scorecard from the fitted discretizer and the fitted classifier.
# BASE / PDO / ODDS look like the usual scorecard scaling parameters (base score,
# points to double the odds, reference odds) - TODO confirm against skcredit.
lmcreditcard = LMCreditcard(
    keep_columns=["ID"],
    date_columns=[],
    discrete=discrete,
    lmclassifier=lmclassifier,
    BASE=500,
    PDO=20,
    ODDS=1,
)
# Kept as the cell's last expression so the scorecard table is rendered as output.
lmcreditcard.show_scorecard()

Unnamed: 0,Column,Bucket,WoE,Coefficients,PartialScore,OffsetScores
0,PAY_5,"(-inf,0]",-0.228131,0.158436,1.042898,535.93372
1,PAY_5,"(0,+inf)",1.486849,0.158436,-6.797114,535.93372
2,PAY_5,[-999999],0.0,0.158436,-0.0,535.93372
0,LIMIT_BAL,"(-inf,40000.0]",0.690275,0.442031,-8.803997,535.93372
1,LIMIT_BAL,"(40000.0,140000.0]",0.178152,0.442031,-2.272213,535.93372
2,LIMIT_BAL,"(140000.0,+inf)",-0.421951,0.442031,5.381706,535.93372
3,LIMIT_BAL,[-999999],0.0,0.442031,-0.0,535.93372
0,PAY_AMT4,"(-inf,0.0]",0.475069,0.591888,-8.113349,535.93372
1,PAY_AMT4,"(0.0,1900.0]",0.06561,0.591888,-1.120511,535.93372
2,PAY_AMT4,"(1900.0,+inf)",-0.332999,0.591888,5.687054,535.93372
