In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
# Suppress every warning for the rest of the session. Note that this is a global
# filter: it also hides deprecation warnings and numerical runtime warnings.
warnings.filterwarnings("ignore")

In [2]:
def data_read(path, file_name):
    """Load the credit data file and split it into stratified train/test frames.

    The file is parsed as a header-less, whitespace-delimited table, its 21
    columns are given descriptive names, and the label is shifted from {1, 2}
    to {0, 1} (0 = good customer, 1 = bad customer).

    Args:
        path (str): Directory that contains the data file.
        file_name (str): Name of the data file inside ``path``.

    Returns:
        tuple: ``(data_train, data_test)`` DataFrames holding 80% / 20% of the
        rows, stratified on ``target`` with ``random_state=0``. The training
        part is meant for fitting the encoding, the test part for applying it.
    """
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file the same way.
    df = pd.read_csv(os.path.join(path, file_name), sep=r'\s+', header=None)

    # NOTE(review): the spelling 'svaing_account' is kept unchanged because code
    # outside this function may refer to the column by this exact name.
    columns = ['status_account', 'duration', 'credit_history', 'purpose', 'amount', 'svaing_account', 'present_emp', 'income_rate',
               'personal_status', 'other_debtors', 'residence_info', 'property', 'age', 'inst_plans', 'housing', 'num_credits', 'job',
               'dependents', 'telephone', 'foreign_worker', 'target']
    df.columns = columns  # rename the positional columns

    # Map the label from {1, 2} to {0, 1}: 0 is a good customer, 1 a bad one.
    df['target'] = df['target'] - 1

    # Stratified split so both parts keep the same good/bad ratio.
    data_train, data_test = train_test_split(df, test_size=0.2, random_state=0, stratify=df['target'])

    return data_train, data_test

In [3]:
def cal_advantage(temp, point, method, flag='sel'):
    """Compute the binning metric for a candidate cut point or for a full binning.

    Args:
        temp (DataFrame): Result of the previous binning step. Must contain the
            count columns ``good``, ``bad`` and ``total``, plus ``bin_raw``
            (for ``flag='sel'``) or ``bin`` (for ``flag='gain'``).
        point (int): Cut point on ``bin_raw``; rows with ``bin_raw <= point``
            form the first group and the rest the second. Ignored when
            ``flag='gain'``.
        method (int): Metric to compute. 1: chi-square statistic,
            2: information value (IV), 3: normalized information gain.
        flag (str, optional): ``'sel'`` evaluates the binary split at ``point``;
            ``'gain'`` evaluates the binning described by the ``bin`` column
            (bins numbered ``1 .. temp['bin'].max()``). Defaults to 'sel'.

    Returns:
        float: The metric value. For ``method=2`` the result is NaN when the
        overall good or bad count is zero or missing.

    Raises:
        ValueError: If ``method`` is not 1, 2 or 3, or ``flag`` is not
            ``'sel'`` or ``'gain'``.
    """
    # Fail early and explicitly: previously an unsupported method or flag fell
    # through and surfaced as an UnboundLocalError further down.
    if method not in (1, 2, 3):
        raise ValueError("method must be 1 (chi-merge), 2 (IV) or 3 (entropy), got %r" % (method,))

    if flag == 'sel':
        # Binary split used while searching for the best cut point.
        groups = [temp[temp['bin_raw'] <= point], temp[temp['bin_raw'] > point]]
    elif flag == 'gain':
        # One group per current bin, used to score the binning after a split.
        groups = [temp[temp['bin'] == (ii + 1)] for ii in range(temp['bin'].max())]
    else:
        raise ValueError("flag must be 'sel' or 'gain', got %r" % (flag,))

    # Rows are groups, columns are the good / bad / total counts.
    bin_num = len(groups)
    good_bad_matrix = np.empty((bin_num, 3))
    for ii, group in enumerate(groups):
        good_bad_matrix[ii][0] = group['good'].sum()
        good_bad_matrix[ii][1] = group['bad'].sum()
        good_bad_matrix[ii][2] = group['total'].sum()

    # Good / bad / total counts over all rows.
    total_matrix = np.empty(3)
    total_matrix[0] = temp['good'].sum()
    total_matrix[1] = temp['bad'].sum()
    total_matrix[2] = temp['total'].sum()

    if method == 1:
        # Chi-square: sum over groups and classes of (observed - expected)^2 / expected.
        X2 = 0
        for i in range(bin_num):
            for j in range(2):
                expect = (total_matrix[j] / total_matrix[2]) * good_bad_matrix[i][2]
                X2 = X2 + (good_bad_matrix[i][j] - expect) ** 2 / expect
        M_value = X2

    elif method == 2:
        # Information value; undefined when one class is absent overall.
        if pd.isnull(total_matrix[0]) or pd.isnull(total_matrix[1]) or total_matrix[0] == 0 or total_matrix[1] == 0:
            M_value = np.nan  # np.NaN alias no longer exists in NumPy 2.0
        else:
            IV = 0
            for i in range(bin_num):
                # Difference between the group's share of bad and of good samples.
                weight = good_bad_matrix[i][1] / total_matrix[1] - good_bad_matrix[i][0] / total_matrix[0]
                IV = IV + weight * np.log((good_bad_matrix[i][1] * total_matrix[0]) / (good_bad_matrix[i][0] * total_matrix[1]))
            M_value = IV

    else:
        # method == 3: entropy of the label over all rows ...
        entropy_total = 0
        for j in range(2):
            weight = total_matrix[j] / total_matrix[2]
            entropy_total = entropy_total - weight * (np.log(weight))

        # ... and the conditional entropy given the grouping.
        entropy_cond = 0
        for i in range(bin_num):
            entropy_temp = 0
            for j in range(2):
                entropy_temp = entropy_temp - ((good_bad_matrix[i][j] / good_bad_matrix[i][2]) * np.log(good_bad_matrix[i][j] / good_bad_matrix[i][2]))
            entropy_cond = entropy_cond + good_bad_matrix[i][2] / total_matrix[2] * entropy_temp
        # Normalized information gain.
        M_value = 1 - (entropy_cond / entropy_total)

    return M_value


In [4]:
def best_split(df_temp0, method, bin_num):
    """Find the best cut point inside one bin and split its rows in two.

    Helper of ``select_split_point``. Every candidate cut point is scored with
    ``cal_advantage(..., flag='sel')`` and the first one with the highest
    score is used.

    Args:
        df_temp0 (DataFrame): Rows of the bin to split, with the columns
            ``bin``, ``bad_rate``, ``bin_raw`` and the count columns that
            ``cal_advantage`` reads.
        method (int): Metric selector, 1: chi-merge, 2: IV, 3: entropy.
        bin_num (int): Label of the bin being split; determines how many
            candidate cut points are tried.

    Returns:
        DataFrame: The input rows with a new ``split`` column (1 for rows at or
        below the chosen cut point, 0 otherwise) and ``bin_raw`` renumbered
        from 1 inside each half; the ``split == 0`` half comes first.
    """
    df_temp0 = df_temp0.sort_values(by=['bin', 'bad_rate'])
    point_len = len(df_temp0[df_temp0['bin'] == bin_num])  # number of candidate rows

    bestValue = 0
    bestI = 1
    # Try every cut point; a NaN score never wins because `0 < nan` is False.
    for i in range(1, point_len):
        value = cal_advantage(df_temp0, i, method, flag='sel')
        if bestValue < value:
            bestValue = value
            bestI = i

    # Mark which side of the chosen cut point each row falls on.
    df_temp0['split'] = np.where(df_temp0['bin_raw'] <= bestI, 1, 0)
    newbinDS = df_temp0.drop('bin_raw', axis=1).sort_values(by=['split', 'bad_rate'])

    halves = []
    for side in (0, 1):
        # Explicit copy: assigning a column to a filtered slice is a chained
        # assignment (SettingWithCopyWarning) and is unreliable across versions.
        half = newbinDS[newbinDS['split'] == side].copy()
        half['bin_raw'] = range(1, len(half) + 1)
        halves.append(half)

    return pd.concat(halves, axis=0)

In [5]:
def select_split_point(temp_bin, method):
    """Perform one binary split: pick the bin whose split scores best overall.

    Helper of ``cont_var_bin``. Every bin with more than one row is split with
    ``best_split``; the resulting binning is scored with
    ``cal_advantage(..., flag='gain')`` and the best-scoring one is returned.

    Args:
        temp_bin (DataFrame): Current binning, with a ``bin`` column holding
            integer bin labels and a ``bad_rate`` column used for ordering.
        method (int): Metric selector, 1: chi-merge, 2: IV, 3: entropy.

    Returns:
        tuple: ``(new_bins, value)`` where ``new_bins`` is the binning after the
        chosen split (sorted by ``bin`` and ``bad_rate``; the new bin is
        labelled ``max(bin) + 1``) and ``value`` is its metric rounded to 4
        decimals.

    Raises:
        ValueError: If no bin contains more than one row, so nothing can be split.
    """
    temp_bin = temp_bin.sort_values(by=['bin', 'bad_rate'])
    max_num = max(temp_bin['bin'])  # largest bin label in use

    best_frame = None
    best_value = None
    for i in range(1, max_num + 1):
        df_temp = temp_bin[temp_bin['bin'] == i]
        if df_temp.shape[0] <= 1:
            continue  # a single row cannot be split any further

        # Split bin i; the rows on the split == 1 side get a fresh bin label.
        temp_split = best_split(df_temp, method, i)
        temp_split['bin'] = np.where(temp_split['split'] == 1, max_num + 1, temp_split['bin'])
        # Recombine with the untouched bins and score the resulting binning.
        candidate = pd.concat([temp_bin[temp_bin['bin'] != i], temp_split], axis=0, sort=False)
        value = cal_advantage(candidate, 0, method, flag='gain')

        # NaN-aware "first maximum": sorting a list with NaN keys is
        # order-dependent, so a NaN score could previously beat a valid one.
        if best_frame is None:
            is_better = True
        elif pd.isnull(value):
            is_better = False
        elif pd.isnull(best_value):
            is_better = True
        else:
            is_better = value > best_value
        if is_better:
            best_frame, best_value = candidate, value

    if best_frame is None:
        raise ValueError("no bin contains more than one row; nothing left to split")

    newBins = best_frame.drop('split', axis=1)

    return newBins.sort_values(by=['bin', 'bad_rate']), round(best_value, 4)

In [6]:
def init_equal_bin(x, bin_rate):
    """Build the initial equal-width bins (helper of ``cont_var_bin``).

    The range between a lower and an upper anchor is cut into
    ``int(1 / bin_rate)`` equal-width intervals; the first interval is opened
    to ``-inf`` and the last to ``+inf`` so that outliers fall into the two
    outer bins.

    Args:
        x (Series): Values to bin. Assumed to contain no missing values (the
            caller removes them first).
        bin_rate (float): Width of one bin as a fraction of the anchored range;
            the number of bins is ``int(1 / bin_rate)``.

    Returns:
        DataFrame: One row per bin with columns ``bin_up`` and ``bin_low``; the
        index is ``0 .. n_bins - 1`` and is named ``bin_num``.

    Raises:
        ValueError: If ``bin_rate`` yields fewer than one bin.
    """
    bin_num = int(1 / bin_rate)
    if bin_num < 1:
        raise ValueError("bin_rate must lie in (0, 1] so that at least one bin is created, got %r" % (bin_rate,))

    # Outlier trimming: anchor the range on the closest values beyond the
    # 5th / 95th percentiles instead of the raw extremes.
    upper_tail = x[x > np.percentile(x, 95)]
    lower_tail = x[x < np.percentile(x, 5)]
    # The upper anchor is only trimmed when x has at least 30 distinct values.
    if len(upper_tail) > 0 and len(np.unique(x)) >= 30:
        var_up = upper_tail.min()
    else:
        var_up = x.max()
    if len(lower_tail) > 0:
        var_low = lower_tail.max()
    else:
        var_low = x.min()

    # bin_num + 1 equally spaced edges; the outer two are opened to +/- infinity.
    # Setting both ends unconditionally also covers bin_num == 1, where the
    # single bin previously kept var_up as a finite upper edge.
    dist_bin = (var_up - var_low) / bin_num
    edges = var_low + np.arange(bin_num + 1) * dist_bin
    edges[0] = -np.inf
    edges[-1] = np.inf

    result = pd.DataFrame({'bin_up': edges[1:], 'bin_low': edges[:-1]})
    result.index.name = 'bin_num'

    return result

In [7]:
def limit_min_sample(temp_cont, bin_min_num_0):
    """Merge bins that hold too few samples into a neighbouring bin.

    Helper of ``cont_var_bin``. Bins are visited in index order; a bin whose
    ``total`` is less than or equal to ``bin_min_num_0`` is merged into the
    next bin, or into the previous one if it is the last bin. Counts
    accumulate, so a merged bin is re-checked with its new total when visited.

    Args:
        temp_cont (DataFrame): Initial binning, indexed by bin label, with the
            columns ``good``, ``bad``, ``total``, ``bin_low``, ``bin_up`` and
            ``bad_rate``. The frame is not modified.
        bin_min_num_0 (int): Sample-count threshold; bins with
            ``total <= bin_min_num_0`` are merged.

    Returns:
        DataFrame: The merged binning sorted by ``bad_rate``. If only one bin
        is left it is returned as-is, even when it is below the threshold.
    """
    # Work on a copy so the caller's frame is not partially updated.
    merged = temp_cont.copy()
    for i in temp_cont.index:
        if len(merged) < 2:
            # Nothing left to merge into; looking up a neighbour here would
            # yield NaN and fail on .loc.
            break

        rowdata = merged.loc[i, :]
        if rowdata['total'] > bin_min_num_0:
            continue

        is_last = i == merged.index.max()
        if is_last:
            # Last bin: merge into the closest bin below it.
            ix = merged.index[merged.index < i].max()
        else:
            # Otherwise merge into the closest bin above it.
            ix = merged.index[merged.index > i].min()

        # Add the counts of the small bin to its neighbour.
        for col in ('bad', 'good', 'total'):
            merged.loc[ix, col] = merged.loc[ix, col] + rowdata[col]
        # Extend the neighbour's interval so that it covers the merged bin.
        if is_last:
            merged.loc[ix, 'bin_up'] = rowdata['bin_up']
        else:
            merged.loc[ix, 'bin_low'] = rowdata['bin_low']
        merged = merged.drop(i, axis=0)

    return merged.sort_values(by=['bad_rate'])

In [8]:
def cont_var_bin_map(x, bin_init):
    """Map raw values to bin labels using a table of bin boundaries.

    Used for both the training and the test set. A value falls into bin ``i``
    when ``bin_low < value <= bin_up``. A bin whose lower or upper bound is
    missing is treated as the bin for missing values.

    Args:
        x (Series): Raw values to map; must have a string ``name``.
        bin_init (DataFrame): Bin table indexed by bin label with the columns
            ``bin_low`` and ``bin_up``.

    Returns:
        Series: A copy of ``x`` in which every value covered by a bin is
        replaced by that bin's label, named ``<x.name>_BIN``. Values not
        covered by any bin keep their original value.
    """
    temp = x.copy()
    for i in bin_init.index:
        bin_up = bin_init.loc[i, 'bin_up']
        bin_low = bin_init.loc[i, 'bin_low']
        # Either bound missing marks the missing-value bin (the original check
        # tested bin_up twice and never looked at bin_low).
        if pd.isnull(bin_up) or pd.isnull(bin_low):
            temp[pd.isnull(temp)] = i
        else:
            # Membership is evaluated on the raw values, not on labels that
            # were already written into temp.
            index = (x > bin_low) & (x <= bin_up)
            temp[index] = i
    temp.name = temp.name + '_BIN'

    return temp

In [9]:
def merge_bin(sub, i):
    """Collapse all rows of one bin into a single interval row.

    Args:
        sub (DataFrame): Rows that belong to one bin (e.g. all rows with
            ``bin == 1``), ordered from lowest to highest interval, with the
            columns ``bin_low``, ``bin_up`` and ``total``.
        i (int): Label of the bin.

    Returns:
        DataFrame: A single row with the columns ``bin``, ``bin_low``,
        ``bin_up`` and ``total``: the lower bound of the first row, the upper
        bound of the last row and the summed sample count.
    """
    total = sub['total'].sum()
    lower = sub['bin_low'].iloc[0]   # lower bound of the first interval
    upper = sub['bin_up'].iloc[-1]   # upper bound of the last interval

    # DataFrame.append no longer exists in pandas 2.x. Building one column and
    # transposing keeps the previous layout: a single row with one shared dtype.
    df = pd.DataFrame([i, lower, upper, total]).T
    df.columns = ['bin', 'bin_low', 'bin_up', 'total']

    return df

In [None]:
def cont_var_bin(x, y, method, mmin=5, mmax=10, bin_rate=0.01, stop_limit=0.1, bin_min_num=20):
    """Supervised binning of a continuous variable.

    Steps: (1) set missing values aside, (2) build fine equal-width bins,
    (3) merge bins that hold too few samples, (4) repeatedly apply the best
    binary split until the bin-count or gain limits are reached, (5) append a
    separate bin for missing values if there were any.

    Args:
        x (Series): Variable to bin.
        y (Series): Binary target aligned with ``x`` (0 = good, 1 = bad).
        method (int): Metric selector, 1: chi-merge, 2: IV, 3: entropy.
        mmin (int, optional): Minimum number of bins. Defaults to 5.
        mmax (int, optional): Maximum number of bins. Defaults to 10.
        bin_rate (float, optional): Width of an initial equal-width bin as a
            fraction of the value range. Defaults to 0.01.
        stop_limit (float, optional): Early-stopping threshold on the relative
            gain between two consecutive splits. Defaults to 0.1.
        bin_min_num (int, optional): Sample-count threshold passed to
            ``limit_min_sample`` for merging small initial bins. Defaults to 20.

    Returns:
        tuple: ``(data, gain_value_save0, gain_rate_save0)``. ``data`` is a
        DataFrame indexed by ``bin`` with the columns ``bin_low``, ``bin_up``,
        ``total`` and ``bin``. The other two are lists with the metric value
        and the relative gain of every split, or the scalar 0 when the initial
        bins are already no more than ``mmin`` and no split is performed.
    """
    # Set missing values aside; they get their own bin at the end.
    na_mask = pd.isnull(x)
    df_na = pd.DataFrame({'x': x[na_mask], 'y': y[na_mask]})
    y = y[~na_mask]
    x = x[~na_mask]

    # Initial equal-width bins (no sample-count constraint yet) and the
    # mapping of every value to its initial bin.
    bin_init = init_equal_bin(x, bin_rate)
    bin_map = cont_var_bin_map(x, bin_init)

    df_temp = pd.concat([x, y, bin_map], axis=1)
    # Good / bad frequencies per initial bin.
    df_temp_1 = pd.crosstab(index=df_temp[bin_map.name], columns=y)
    df_temp_1 = df_temp_1.rename(columns={0: 'good', 1: 'bad'})
    # Number of samples per initial bin.
    df_temp_2 = pd.DataFrame(df_temp.groupby(bin_map.name).count().iloc[:, 0])
    df_temp_2.columns = ['total']
    df_temp_all = pd.merge(pd.concat([df_temp_1, df_temp_2], axis=1), bin_init, left_index=True, right_index=True, how='left')

    # Empty initial bins are absent here, so close the gaps: every bin starts
    # where the previous non-empty one ends. A single .loc call is used because
    # the chained form `df.col.loc[...] = value` may not write back to the frame.
    for j in range(df_temp_all.shape[0] - 1):
        cur_label = df_temp_all.index[j]
        next_label = df_temp_all.index[j + 1]
        prev_upper = df_temp_all.loc[cur_label, 'bin_up']
        if df_temp_all.loc[next_label, 'bin_low'] != prev_upper:
            df_temp_all.loc[next_label, 'bin_low'] = prev_upper

    # For continuous variables 'bad_rate' only carries the ordering of the
    # non-empty initial bins (for discrete variables it would be the bad rate).
    df_temp_all['bad_rate'] = df_temp_all.index
    # Merge bins that hold too few samples.
    df_temp_all = limit_min_sample(df_temp_all, bin_min_num)
    # The number of bins cannot reach the number of candidate rows.
    if mmax >= df_temp_all.shape[0]:
        mmax = df_temp_all.shape[0] - 1

    if mmin >= df_temp_all.shape[0]:
        # Already at or below the minimum number of bins: keep every row as a bin.
        gain_value_save0 = 0
        gain_rate_save0 = 0
        df_temp_all['bin'] = np.linspace(1, df_temp_all.shape[0], df_temp_all.shape[0], dtype=int)
        # Explicit copy because rows may be added to `data` below.
        data = df_temp_all[['bin_low', 'bin_up', 'total', 'bin']].copy()
        data.index = data['bin']
    else:
        df_temp_all['bin'] = 1
        df_temp_all['bin_raw'] = range(1, len(df_temp_all) + 1)
        df_temp_all['var'] = df_temp_all.index  # remember the initial bin label
        gain_1 = 1e-10
        gain_rate_save0 = []
        gain_value_save0 = []
        # Constraint: at most mmax bins, i.e. at most mmax - 1 splits.
        for i in range(1, mmax):
            df_temp_all, gain_2 = select_split_point(df_temp_all, method=method)
            gain_rate = gain_2 / gain_1 - 1  # relative gain over the previous split
            gain_value_save0.append(np.round(gain_2, 4))
            if i == 1:
                # The first split has no predecessor; a fixed placeholder is stored.
                gain_rate_save0.append(0.5)
            else:
                gain_rate_save0.append(np.round(gain_rate, 4))
            gain_1 = gain_2
            # Early stopping, only once the bin count lies within [mmin, mmax].
            if df_temp_all['bin'].max() >= mmin and df_temp_all['bin'].max() <= mmax:
                if gain_rate <= stop_limit or pd.isnull(gain_rate):
                    break

        df_temp_all = df_temp_all.rename(columns={'var': 'oldbin'})
        temp_Map1 = df_temp_all.drop(['good', 'bad', 'bad_rate', 'bin_raw'], axis=1)
        temp_Map1 = temp_Map1.sort_values(by=['bin', 'oldbin'])
        # Collapse every final bin to one row (lower, upper, total).
        # DataFrame.append no longer exists in pandas 2.x, so the rows are
        # collected first and concatenated once.
        merged_rows = [merge_bin(temp_Map1[temp_Map1['bin'] == i], i) for i in temp_Map1['bin'].unique()]
        data = pd.concat(merged_rows, ignore_index=True)

        # Renumber the bins in increasing order of their lower bound.
        data = data.sort_values(by='bin_low')
        data = data.drop('bin', axis=1)
        mmax = df_temp_all['bin'].max()
        data['bin'] = range(1, mmax + 1)
        data.index = data['bin']

    # Append the bin for missing values.
    if len(df_na) > 0:
        row_num = data.shape[0] + 1
        data.loc[row_num, 'bin_low'] = np.nan
        data.loc[row_num, 'bin_up'] = np.nan
        data.loc[row_num, 'total'] = df_na.shape[0]
        data.loc[row_num, 'bin'] = data.bin.max() + 1

    return data, gain_value_save0, gain_rate_save0