提取最优分箱方式：
- ccx_best_bins
- toad_best_bins
- xgb_best_bins

工具：
- ccx_scorecard
- toad
- xgbcard

方法：
- 等频
- 等距
- 卡方
- 决策树
- 提升树分箱

标准：
- 单调：各箱的坏账率（bad rate）按箱序单调递增或单调递减
- PSI<0.1：训练集与测试集之间的分箱分布 PSI 小于 0.1
- IV最大：在满足以上两条的分箱方式中，取 IV 最大者

- input: x, y, psi=0.1, metric='IV'
- output: best_bins, best_iv

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import toad

In [2]:
# Load the German credit dataset and rename the label column to 'creditability'.
data = pd.read_csv('./data/german_credit.csv')
data = data.rename(columns={'default': 'creditability'})

# Show (n_rows, n_columns); the column count includes the label column.
print(data.shape)
data.head()

(1000, 21)


Unnamed: 0,creditability,account_check_status,duration_in_month,credit_history,purpose,credit_amount,savings,present_emp_since,installment_as_income_perc,personal_status_sex,...,present_res_since,property,age,other_installment_plans,housing,credits_this_bank,job,people_under_maintenance,telephone,foreign_worker
0,0,< 0 DM,6,critical account/ other credits existing (not ...,domestic appliances,1169,unknown/ no savings account,.. >= 7 years,4,male : single,...,4,real estate,67,none,own,2,skilled employee / official,1,"yes, registered under the customers name",yes
1,1,0 <= ... < 200 DM,48,existing credits paid back duly till now,domestic appliances,5951,... < 100 DM,1 <= ... < 4 years,2,female : divorced/separated/married,...,2,real estate,22,none,own,1,skilled employee / official,1,none,yes
2,0,no checking account,12,critical account/ other credits existing (not ...,(vacation - does not exist?),2096,... < 100 DM,4 <= ... < 7 years,2,male : single,...,3,real estate,49,none,own,1,unskilled - resident,2,none,yes
3,0,< 0 DM,42,existing credits paid back duly till now,radio/television,7882,... < 100 DM,4 <= ... < 7 years,2,male : single,...,4,if not A121 : building society savings agreeme...,45,none,for free,1,skilled employee / official,2,none,yes
4,1,< 0 DM,24,delay in paying off in the past,car (new),4870,... < 100 DM,1 <= ... < 4 years,3,male : single,...,4,unknown / no property,53,none,for free,2,skilled employee / official,2,none,yes


In [93]:
# Hold out 25% of the rows as the test split; the seed is fixed so the split is repeatable.
Xtr, Xts, Ytr, Yts = train_test_split(
    data.drop(columns='creditability'),
    data['creditability'],
    test_size=0.25,
    random_state=450,
)

# Re-attach the label so each split is a single frame carrying its own target column.
data_tr = pd.concat([Xtr, Ytr], axis=1)
data_ts = pd.concat([Xts, Yts], axis=1)

# Optional (currently disabled): add a column that tells the two splits apart.
# data_tr['type'] = 'train'
# data_ts['type'] = 'test'

In [94]:
# Screen features on the training split only. With return_drop=True toad also
# returns the dropped columns, grouped by the criterion that removed them
# ('empty', 'iv', 'corr').
selected_train, drop_lst = toad.selection.select(
    data_tr,
    target='creditability',
    empty=0.8,
    iv=0.02,
    corr=0.7,
    return_drop=True,
)

# Keep exactly the same columns in the test split.
selected_test = data_ts[selected_train.columns]

print(selected_train.shape)
drop_lst

(750, 17)


{'empty': array([], dtype=float64),
 'iv': array(['present_res_since', 'credits_this_bank',
        'people_under_maintenance', 'telephone'], dtype=object),
 'corr': array([], dtype=object)}

In [95]:
# Per-feature quality table from toad, with 'creditability' as the target.
# NOTE(review): this is computed on the full `data` (train + test, and any
# index-like column it contains), not on `selected_train` — confirm intended.
quality = toad.quality(data,'creditability')
# Display the first six rows of the table.
quality.head(6)

Unnamed: 0,iv,gini,entropy,unique
account_check_status,0.666012,0.368037,0.545196,4.0
duration_in_month,0.354784,0.406755,0.609659,33.0
credit_amount,0.351455,0.40868,0.610864,921.0
credit_history,0.293234,0.39409,0.580631,5.0
age,0.211197,0.414339,0.610863,53.0
savings,0.19601,0.404838,0.591377,5.0


In [96]:
def PSI_cates(dev_x, val_x):
    """Compute the Population Stability Index (PSI) of a categorical variable.

    PSI = sum over categories of (dev_ratio - val_ratio) * ln(dev_ratio / val_ratio),
    where each ratio is the share of rows falling in that category. A small
    constant (1e-10) is added to every ratio so that a category absent from
    one sample does not produce a division by zero or log(0).

    The sum runs over the union of the categories seen in either sample, so a
    category that appears only in ``val_x`` contributes to the result.
    Missing values (NaN / None) are counted together as one category.

    Args:
        dev_x: 1-D array-like of category values for the development
            (reference) sample.
        val_x: 1-D array-like of category values for the validation sample.

    Returns:
        The PSI as a float (>= 0).

    Raises:
        ValueError: if either sample is empty, since ratios are undefined.
    """
    dev_nrows = len(dev_x)
    val_nrows = len(val_x)
    if dev_nrows == 0 or val_nrows == 0:
        raise ValueError("PSI_cates requires non-empty dev_x and val_x")

    # Factorize both samples together so they share a single category coding.
    combined = pd.concat([pd.Series(dev_x), pd.Series(val_x)], ignore_index=True)
    codes, uniques = pd.factorize(combined)

    # Missing values are coded -1; shifting every code by one gives them slot 0
    # and keeps all codes valid for bincount.
    n_slots = len(uniques) + 1
    dev_cnt = np.bincount(codes[:dev_nrows] + 1, minlength=n_slots)
    val_cnt = np.bincount(codes[dev_nrows:] + 1, minlength=n_slots)

    dev_ratio = dev_cnt / dev_nrows + 1e-10
    val_ratio = val_cnt / val_nrows + 1e-10
    # Slots empty in both samples contribute (eps - eps) * ln(1) == 0.
    return float(np.sum((dev_ratio - val_ratio) * np.log(dev_ratio / val_ratio)))

- 卡方

In [114]:
# Exported binning rules, keyed by binning-method name (e.g. 'chi', 'step3').
bins = {}
# Name of the label column used by the binning cells.
target = 'creditability'

In [115]:
# Chi-square binning: fit on the training split, then bin both splits with it.
combiner = toad.transform.Combiner()
combiner.fit(
    selected_train,
    y=target,
    method='chi',
    min_samples=0.05,
    empty_separate=True,
)
bins['chi'] = combiner.export()
binned_train = combiner.transform(selected_train)
binned_test = combiner.transform(selected_test)

# Keep a feature only when its per-bin bad rate is monotonic (either
# direction) and its train-vs-test PSI is below 0.1.
feats_dict = []
for feat in binned_train.columns:
    if feat != target:
        badrate = binned_train.groupby(feat)[target].mean().sort_index()
        flag_mono = badrate.is_monotonic_increasing or badrate.is_monotonic_decreasing
        flag_psi = PSI_cates(binned_train[feat].values, binned_test[feat].values) < 0.1
        if flag_psi and flag_mono:
            feats_dict.append(feat)

# IV of the surviving features, stored under a column named after the method.
feats_iv = toad.stats.IV(binned_train[feats_dict + [target]], target).T.rename(columns={0: 'chi'})

- 等距

In [116]:
# Equal-width ('step') binning, tried with 3, 4 and 5 bins.
for n_bin in (3, 4, 5):
    combiner.fit(
        selected_train,
        y=target,
        method='step',
        n_bins=n_bin,
        empty_separate=True,
    )
    bins['step{}'.format(n_bin)] = combiner.export()
    binned_train = combiner.transform(selected_train)
    binned_test = combiner.transform(selected_test)

    # Keep a feature only when its per-bin bad rate is monotonic (either
    # direction) and its train-vs-test PSI is below 0.1.
    feats_dict = []
    for feat in binned_train.columns:
        if feat != target:
            badrate = binned_train.groupby(feat)[target].mean().sort_index()
            flag_mono = badrate.is_monotonic_increasing or badrate.is_monotonic_decreasing
            flag_psi = PSI_cates(binned_train[feat].values, binned_test[feat].values) < 0.1
            if flag_psi and flag_mono:
                feats_dict.append(feat)

    # IV for this bin count, outer-joined on feature name onto the running
    # table so features rejected under one setting show up as NaN there.
    feats_iv_1 = toad.stats.IV(binned_train[feats_dict + [target]], target).T.rename(
        columns={0: 'step{}'.format(n_bin)}
    )
    feats_iv = pd.merge(feats_iv, feats_iv_1, left_index=True, right_index=True, how='outer')

In [119]:
feats_iv.fillna(0)

Unnamed: 0,chi,step3,step4,step5
account_check_status,0.585107,0.557684,0.585107,0.585107
age,0.0,0.029788,0.067503,0.0
credit_amount,0.0,0.123476,0.155383,0.136026
credit_history,0.231579,0.155276,0.231579,0.234536
duration_in_month,0.275793,0.0,0.0,0.0
foreign_worker,0.0,0.062412,0.062412,0.062412
housing,0.070562,0.070562,0.070562,0.070562
installment_as_income_perc,0.024604,0.01965,0.024604,0.024604
job,0.034999,0.034999,0.035003,0.035003
other_debtors,0.0,0.071072,0.071072,0.071072


- 等频

- 决策树

In [None]:
# Monotonicity + PSI screening of the binned features.
feats_dict = []
for feat in binned_data.columns:
    # Skip non-feature columns by name rather than by position, so the screen
    # does not silently drop real features when the column order differs.
    if feat in ('creditability', 'type'):
        continue
    badrate = binned_data.groupby(feat)['creditability'].mean().sort_index()
    flag_mono = badrate.is_monotonic_decreasing or badrate.is_monotonic_increasing
    # PSI must be computed on the feature under test (previously it was always
    # computed on 'account_check_status', so every feature got the same flag).
    # NOTE(review): bad rate uses `binned_data` while PSI uses
    # `binned_train`/`binned_test` — confirm all three come from the same
    # combiner fit before relying on this cell.
    flag_psi = PSI_cates(binned_train[feat].values, binned_test[feat].values)<0.1
    if flag_mono and flag_psi:
        feats_dict.append(feat)
# IV of the surviving features (transposed, as in the other binning cells).
feats_iv = toad.stats.IV(binned_data[feats_dict+['creditability']], 'creditability').T

In [None]:
# Apply the (manually adjusted) bin rules to the combiner.
combiner.set_rules(adj_bin)

# Convert feature values into their bin indices.
binned_data = combiner.transform(selected_data)

# Compute WOE.
transer = toad.transform.WOETransformer()

# Map the WOE values back onto the data: use fit_transform on the training set
# and transform on the test set.
data_tr_woe = transer.fit_transform(binned_data, binned_data['creditability'], exclude=['creditability','type'])
data_ts_woe = transer.transform(combiner.transform(selected_test))