In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import warnings
# Globally silence every warning from this point on. Note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [2]:
def data_read(path, file_name):
    """Read the whitespace-delimited data file and split it into train/test sets.

    The file is read without a header row, its 21 columns are renamed, and the
    ``target`` column is shifted from the codes 1/2 to 0/1 (0 = good customer,
    1 = bad customer).

    Args:
        path (str): Directory that contains the data file.
        file_name (str): Name of the data file.

    Returns:
        tuple: ``(data_train, data_test)`` DataFrames produced by an 80/20
        split stratified on ``target`` with ``random_state=0``. The training
        part is meant for fitting the encoding rules, the test part is encoded
        with the rules learned on the training part.
    """
    # sep=r'\s+' is the documented equivalent of the deprecated
    # ``delim_whitespace=True`` keyword.
    df = pd.read_csv(os.path.join(path, file_name), sep=r'\s+', header=None)

    # Rename the positional columns. NOTE: the spelling 'svaing_account' is
    # kept unchanged because other code may reference this column name.
    df.columns = [
        'status_account', 'duration', 'credit_history', 'purpose', 'amount',
        'svaing_account', 'present_emp', 'income_rate', 'personal_status',
        'other_debtors', 'residence_info', 'property', 'age', 'inst_plans',
        'housing', 'num_credits', 'job', 'dependents', 'telephone',
        'foreign_worker', 'target',
    ]

    # Map the label from {1, 2} to {0, 1}: 0 = good customer, 1 = bad customer.
    df['target'] = df['target'] - 1

    data_train, data_test = train_test_split(
        df, test_size=0.2, random_state=0, stratify=df['target']
    )

    return data_train, data_test

In [3]:
def _bin_counts(temp, point, flag):
    """Return an (n_bins, 3) float array of good / bad / total counts per bin."""
    if flag == 'sel':
        # Candidate binary split: rows at or below the cut point vs. rows above it.
        groups = [temp[temp['bin_raw'] <= point], temp[temp['bin_raw'] > point]]
    elif flag == 'gain':
        # Evaluate the current binning: one row per bin id 1..max(bin).
        n_bins = int(temp['bin'].max())
        groups = [temp[temp['bin'] == k] for k in range(1, n_bins + 1)]
    else:
        raise ValueError("flag must be 'sel' or 'gain', got %r" % (flag,))

    counts = [[g['good'].sum(), g['bad'].sum(), g['total'].sum()] for g in groups]
    return np.array(counts, dtype=float).reshape(-1, 3)


def _chi_square(counts, totals):
    """Return the chi-square statistic of the bin x (good, bad) table."""
    if totals[2] == 0:
        return np.nan
    x2 = 0.0
    for row in counts:
        for j in range(2):
            expect = totals[j] / totals[2] * row[2]
            # A cell with zero expected count carries no information; skipping
            # it avoids a 0/0 that would turn the whole statistic into NaN.
            if expect > 0:
                x2 += (row[j] - expect) ** 2 / expect
    return x2


def _information_value(counts, totals):
    """Return the information value (IV) of the binning, without smoothing."""
    good_total, bad_total = totals[0], totals[1]
    if pd.isnull(good_total) or pd.isnull(bad_total) or good_total == 0 or bad_total == 0:
        return np.nan
    iv = 0.0
    # No smoothing is applied: a bin without good or without bad samples makes
    # the result infinite (or NaN). The floating-point warnings for those cases
    # are silenced locally instead of relying on a global warnings filter.
    with np.errstate(divide='ignore', invalid='ignore'):
        for good, bad, _ in counts:
            weight = bad / bad_total - good / good_total
            iv += weight * np.log((bad * good_total) / (good * bad_total))
    return iv


def _entropy(probs):
    """Return the Shannon entropy (natural log), treating 0 * log(0) as 0."""
    probs = probs[probs > 0]
    return -np.sum(probs * np.log(probs))


def _entropy_gain(counts, totals):
    """Return the normalized information gain ``1 - H(y | bin) / H(y)``."""
    if totals[2] == 0:
        return np.nan
    entropy_total = _entropy(totals[:2] / totals[2])
    if entropy_total == 0:
        # Only one class present: the gain is undefined.
        return np.nan
    entropy_cond = 0.0
    for row in counts:
        if row[2] > 0:  # an empty bin has zero weight
            entropy_cond += row[2] / totals[2] * _entropy(row[:2] / row[2])
    return 1 - entropy_cond / entropy_total


def cal_advantage(temp, point, method, flag='sel'):
    """Compute the binning metric for a candidate cut point or for a full binning.

    Args:
        temp (DataFrame): Result of the previous binning step. Must contain the
            columns ``good``, ``bad`` and ``total``, plus ``bin_raw`` when
            ``flag='sel'`` or ``bin`` when ``flag='gain'``.
        point: Cut point; with ``flag='sel'`` rows with ``bin_raw <= point``
            form the first bin and the remaining rows the second. Ignored when
            ``flag='gain'``.
        method (int): Metric selector, 1: chi-square (Chi-merge),
            2: information value (IV), 3: normalized information-entropy gain.
        flag (str, optional): ``'sel'`` evaluates a binary split at ``point``;
            ``'gain'`` evaluates the existing bins ``1..max(bin)``.
            Defaults to ``'sel'``.

    Returns:
        float: The metric value. NaN when it is undefined (for IV and entropy:
        when one of the two classes has no samples at all).

    Raises:
        ValueError: If ``flag`` or ``method`` is not one of the supported values
            (Best-KS binning is not implemented).
    """
    counts = _bin_counts(temp, point, flag)

    # Good / bad / total counts over all rows of ``temp``.
    totals = np.array(
        [temp['good'].sum(), temp['bad'].sum(), temp['total'].sum()], dtype=float
    )

    if method == 1:
        return _chi_square(counts, totals)
    if method == 2:
        return _information_value(counts, totals)
    if method == 3:
        return _entropy_gain(counts, totals)
    raise ValueError(
        "method must be 1 (chi-merge), 2 (IV) or 3 (entropy), got %r" % (method,)
    )


In [4]:
def best_split(df_temp0, method, bin_num):
    """Find the best cut point among the candidates and perform one binary split.

    Intermediate helper of ``select_split_point``.

    Args:
        df_temp0 (DataFrame): Result of the previous binning step; must contain
            the columns ``bin``, ``bad_rate`` and ``bin_raw`` plus the count
            columns read by ``cal_advantage``.
        method (int): Metric selector passed to ``cal_advantage``,
            1: chi-merge, 2: IV, 3: information entropy.
        bin_num: Bin id; the number of rows with this id determines how many
            candidate cut points are tried.

    Returns:
        DataFrame: The rows sorted by ``split`` and ``bad_rate``, with a new
        ``split`` column (1 for rows at or below the chosen cut point, else 0)
        and ``bin_raw`` renumbered from 1 within each side of the split; rows
        with ``split == 0`` come first.
    """
    df_temp0 = df_temp0.sort_values(by=['bin', 'bad_rate'])
    point_len = len(df_temp0[df_temp0['bin'] == bin_num])  # number of candidates

    best_value = 0
    best_point = 1
    # Try every candidate cut point and keep the one with the largest metric.
    # A NaN metric never compares greater, so such candidates are ignored.
    for i in range(1, point_len):
        value = cal_advantage(df_temp0, i, method, flag='sel')
        if best_value < value:
            best_value = value
            best_point = i

    # Flag which side of the chosen cut point each row falls on.
    df_temp0['split'] = np.where(df_temp0['bin_raw'] <= best_point, 1, 0)
    df_temp0 = df_temp0.drop('bin_raw', axis=1)
    newbinDS = df_temp0.sort_values(by=['split', 'bad_rate'])

    # Renumber bin_raw inside each side. Explicit copies are taken so the column
    # assignment never targets a filtered view of ``newbinDS``.
    parts = []
    for side in (0, 1):
        part = newbinDS[newbinDS['split'] == side].copy()
        part['bin_raw'] = range(1, len(part) + 1)
        parts.append(part)

    return pd.concat(parts, axis=0)

In [None]:
def select_split_point(temp_bin, method):
    """二叉树分割方式，从候选者中挑选每次的最优切分点，与切分后的指标计算(cont_var_bin函数的中间过程函数)

    Args:
        temp_bin (dataframe): 分箱后的结果
        method (int): 分箱方法选择，1:chi-merge, 2:IV值, 3:信息熵
    """
    temp_bin = temp_bin.sort_values(by=['bin', 'bad_rate'])