## Feature evaluation functions

* coverage

* entropy, iv, ks 

* psi (two samples)

In [None]:
# <api>
import bisect
import random
import math
from joblib import Parallel, delayed

import pandas as pd
import numpy as np
import numexpr as ne
import matplotlib.pyplot as plt

import sklearn.metrics as metrics
from sklearn.utils.validation import column_or_1d
from sklearn import tree
from sklearn.model_selection import cross_val_score

In [None]:
# <api>


def _missing_value(data, columns, continuousDomain, categoricalDomain):
    if columns in continuousDomain:
        col = data[columns]
        mis_val = ne.evaluate("min(col)") - 1
    elif columns in categoricalDomain:
        mis_val = 'NA'
    else:
        raise NameError('columns %s is not in the feature set' % columns)
    return {columns: mis_val}


def _getTrainQuantile(df, columns, fillna=-1, step=0.1):
    length = [x + 1 for x in range(int(1 / step))]

    if length[-1] < 1:
        length += [1]

    q = [x * 0.1 for x in length]
    quantile = [fillna] + [x for x in df[columns].quantile(q).values]
    return {columns: quantile}


def _getBins(data, columns, quantile, fillna=-1):
    data['group'] = np.searchsorted(quantile, data[columns].fillna(fillna))
    return data


def _getPercent(df, columns):
    length = float(len(df))
    value_set = (df['group'].value_counts() / length).to_dict()
    return {columns: value_set}


def _getPercentCategorical(df, columns, fillna='NA', dataset='train', train=None):
    if dataset == 'test' and train is None:
        raise ValueError('Missing train set distribution.')
    n = df[columns].shape[0]
    df = (df[columns].fillna(fillna).value_counts() / float(n)).to_dict()
    return {columns: df}

In [None]:
# <api>
class PSICalculation:
    """Population Stability Index (PSI) of a test sample against a train sample.

    The train side is either a DataFrame (its distribution is computed on
    demand) or a dict of pre-computed statistics with the keys
    ``'missing_value'``, ``'quantile'`` and ``'percentile'``.

    ``calculatePSI`` handles one column per call; to obtain the PSI of every
    column, call it in a loop over the columns. Results are collected in
    ``psi_bycol`` (score per bin / category) and ``psi_total`` (score per column).
    """

    def __init__(self, test_data, train_data, invalid_ftr=None, categoricalDomain=None):
        """
        :param test_data: DataFrame with the sample to evaluate
        :param train_data: reference DataFrame, or dict of pre-computed
                           train statistics (see class docstring)
        :param invalid_ftr: list of column names to leave out
        :param categoricalDomain: names of the categorical columns; when empty
                                  the split is inferred from the dtype of the
                                  test columns (``object`` means categorical)
        :raises NameError: if ``invalid_ftr`` is non-empty and not a list
        :raises ValueError: if a pre-computed ``train_data`` dict is incomplete
        """
        self.test_data = test_data
        self._missing_val_fill = {}
        self._train_quantile = {}
        self.psi_bycol = {}
        self.psi_total = {}
        self.result = {"train": {},
                       "test": {}}

        self._pre_train = isinstance(train_data, dict)
        if self._pre_train:
            self._initialize(train_data)
        else:
            self.train_data = train_data

        # An empty / missing exclusion list is normalised to a fresh list so
        # that no mutable default is shared between instances.
        if not invalid_ftr:
            invalid_ftr = []
        elif not isinstance(invalid_ftr, list):
            raise NameError(
                'invalid feature should be a list, not %s' % type(invalid_ftr))

        if self._pre_train:
            candidates = list(self.train_data['quantile'].keys())
        else:
            candidates = train_data.columns
        self.feature_names = [x for x in candidates
                              if (x in test_data.columns) and (x not in invalid_ftr)]

        if categoricalDomain:
            # A set union also accepts tuples / Index objects, which the
            # former ``invalid_ftr + categoricalDomain`` did not.
            excluded = set(invalid_ftr) | set(categoricalDomain)
            self._categoricalDomain = [
                x for x in categoricalDomain if x not in invalid_ftr]
            self._continuousDomain = [
                x for x in self.feature_names if x not in excluded]
        else:
            self._continuousDomain = []
            self._categoricalDomain = []
            # feature_names already excludes invalid_ftr, so only the dtype
            # decides which domain a column belongs to.
            for columns in self.feature_names:
                if self.test_data[columns].dtypes != object:
                    self._continuousDomain.append(columns)
                else:
                    self._categoricalDomain.append(columns)

    def _initialize(self, train_data):
        """Load pre-computed train statistics from ``train_data`` (a dict).

        :raises ValueError: if one of the keys ``'missing_value'``,
                            ``'quantile'`` or ``'percentile'`` is absent
        """
        if self._pre_train:
            if {'missing_value', 'quantile', 'percentile'} - train_data.keys():
                raise ValueError(
                    'Training data information is illegal.\
                    Please include both percentile and missing value.')
            else:
                self._missing_val_fill.update(train_data['missing_value'])
                self._train_quantile.update(train_data['quantile'])
                self.result['train'].update(train_data['percentile'])
                self.train_data = train_data

    def _getPSI(self, columns, verbose=0):
        """Compute the PSI of one column from the stored train / test shares.

        For every bin the contribution is
        ``(test - train) * ln(test / train)``. When the bin is missing or has
        a zero share on either side the log ratio is undefined and the
        absolute difference of the shares is used instead.

        :param columns: column whose shares are already in ``self.result``
        :param verbose: print the column total when greater than 0
        """
        train_value_set = self.result["train"].get(columns)
        test_value_set = self.result["test"].get(columns)
        psi = {}

        for key in set(list(train_value_set.keys()) + list(test_value_set.keys())):
            test_share = test_value_set.get(key, 0)
            train_share = train_value_set.get(key, 0)
            if test_share == 0 or train_share == 0:
                # Covers a zero train share as well, which used to raise
                # ZeroDivisionError.
                score = abs(test_share - train_share)
            else:
                score = (test_share - train_share) * math.log(
                    test_share / train_share)
            psi[key] = score

        self.psi_bycol.update({columns: psi})
        self.psi_total.update({columns: sum(psi.values())})
        if verbose > 0:
            print(self.psi_total[columns])

    def calculatePSI(self, columns, verbose=0):
        """Compute and store the PSI of a single column.

        :param columns: name of the column to evaluate
        :param verbose: print the resulting PSI when greater than 0
        :raises NameError: if the column is in neither feature domain
        """
        if not self._pre_train:
            self._missing_val_fill.update(_missing_value(
                self.train_data, columns, self._continuousDomain, self._categoricalDomain))
        na_value = self._missing_val_fill.get(columns)

        if columns in self._continuousDomain:
            if not self._pre_train:
                self._train_quantile.update(_getTrainQuantile(
                    self.train_data, columns, fillna=na_value, step=0.1))
                df_train = _getBins(
                    self.train_data, columns, self._train_quantile[columns], fillna=na_value)
                self.result["train"].update(_getPercent(df_train, columns))
            # The test sample is always binned with the train edges.
            df_test = _getBins(self.test_data, columns,
                               self._train_quantile[columns], fillna=na_value)
            self.result["test"].update(_getPercent(df_test, columns))

        elif columns in self._categoricalDomain:
            if not self._pre_train:
                self.result["train"].update(_getPercentCategorical(
                    self.train_data, columns, fillna=na_value, dataset='train'))
            self.result["test"].update(_getPercentCategorical(self.test_data, columns,
                                                              fillna=na_value, dataset='test',
                                                              train=self.result['train']))
        else:
            raise NameError('columns %s not in the feature set' % columns)

        self._getPSI(columns, verbose=verbose)

In [None]:
# <api>

# ver 0.1.5.1
# transform: manual woe replacement added
# bug fix: min_leaf optimization fixed


class WoE:
    """
    Basic functionality for WoE bucketing of continuous and discrete variables
    :param self.bins: DataFrame WoE transformed variable and all related statistics
    :param self.iv: Information Value of the transformed variable
    """

    def __init__(self, qnt_num=16, min_block_size=16, spec_values=None,
                 v_type='c', bins=None, t_type='b'):
        """
        :param qnt_num: Number of buckets (quartiles) for continuous variable split
        :param min_block_size: minimum number of observation in each bucket (
                                continuous variables), incl. optimization restrictions
        :param spec_values: List or Dictionary {'label': value} of special values (
                                frequent items etc.)
        :param v_type: 'c' for continuous variable, 'd' - for discrete
        :param bins: Predefined bucket borders for continuous variable split
        :t_type : Binary 'b' or continous 'c' target variable
        :return: initialized class
        """
        self.__qnt_num = qnt_num  # Num of buckets/quartiles
        self._predefined_bins = None if bins is None else np.array(bins) if type(
            bins) == list else bins  # user bins for continuous variables
        self.type = v_type  # if 'c' variable should be continuous, if 'd' - discrete
        self._min_block_size = min_block_size  # Min num of observation in bucket
        self._gb_ratio = None  # Ratio of good and bad in the sample
        self.bins = None  # WoE Buckets (bins) and related statistics
        self.d_bins = None  # bins for discrete values
        self.c_bins = None
        self.sp_bins = None
        self.df = None  # Training sample DataFrame with initial data and assigned woe
        self.df_sp_values = None
        self.df_disc = None
        self.df_cont = None
        self.qnt_num = None  # Number of quartiles used for continuous part of variable binning
        self.t_type = t_type  # Type of target variable
        self.mis_val = {}
        if type(spec_values) == dict:  # Parsing special values to dict for cont variables
            self.spec_values = {}
            for k, v in spec_values.items():
                if v.startswith('d_'):
                    self.spec_values[k] = v
                else:
                    self.spec_values[k] = 'd_' + v
        else:
            if spec_values is None:
                self.spec_values = {}
            else:
                self.spec_values = {i: 'd_' + str(i) for i in spec_values}

    # Reading data to DataFrame, init DataFrame and spec value preprocessing
    def read_data(self, x, y):
        """
        Validate the inputs and split the sample into special and regular parts
        :param x: one-dimensional predictor
        :param y: target variable with values between 0 and 1
        :return: WoE class with df, df_sp_values, df_cont and sp_bins populated
        """
        # Data quality checks
        try:
            x = pd.Series(column_or_1d(x)).reset_index(drop=True)
        except Exception:
            raise ValueError('1 dimension data should be input')
        y = pd.Series(y).reset_index(drop=True)

        if x.size != y.size:
            raise Exception("Y size doesn't match X size")

        # The sample must contain both outcomes
        bad_total = np.sum(y)
        if bad_total == 0 or bad_total == y.size:
            raise ValueError(
                "There should be BAD and GOOD observations in the sample")

        if np.max(y) > 1 or np.min(y) < 0:
            raise ValueError("Y range should be between 0 and 1")

        # Working frame; 'labels' is the technical bucket id and 'label'
        # the one that is shown to the user
        frame = pd.DataFrame({"X": x, "Y": y, 'order': np.arange(x.size)})
        frame['labels'] = frame['X'].apply(lambda x: '0')
        frame['label'] = frame['labels']
        self.df = frame

        def split_special():
            # special = listed special value, missing value, or -1
            values = self.df['X']
            mask = values.isin(self.spec_values.keys()).values \
                | values.isnull().values | (values == -1).values
            self.df_sp_values = self.df[mask].copy()
            self.df_cont = self.df[np.logical_not(mask)].copy()

        split_special()
        # A single remaining regular value cannot be binned, so it is
        # registered as a special value and the split is repeated
        distinct = self.df_cont['X'].unique()
        if distinct.size == 1:
            only_value = distinct[0]
            self.spec_values.update(
                {only_value: 'd_' + str(only_value) if self.type == 'd' else '0'})
        split_special()

        labelled, sp_bins = self._disc_labels(self.df_sp_values)
        self.df_sp_values = labelled
        self.sp_bins = sp_bins
        return self

    def fit(self, x=None, y=None):
        """
        Fit WoE transformation
        :param x: continuous or discrete predictor
        :param y: binary target variable
        :return: WoE class
        """
        # 当fit不是在optimize中被调用时：
        if x is not None:
            self.read_data(x, y)
            if self.type == 'd':
                if len(self.df_cont['X'].unique()) + len(self.sp_bins) > 100:
                    self.optimize()
                    return self

            if len(self.df_cont) == 0:
                self.df_cont = None
            self.df_disc, self.df_cont = self._split_sample(self.df_cont)
            # # labeling data

            df_disc, d_bins = self._disc_labels(self.df_disc)
            self.df_disc = df_disc if d_bins is not None else self.df_disc
            self.d_bins = d_bins if d_bins is not None else self.d_bins
            df_cont, c_bins = self._cont_labels(self.df_cont)
            self.df_cont = df_cont if c_bins is not None else self.df_cont
            self.c_bins = c_bins if c_bins is not None else self.c_bins

        # getting continuous and discrete values together
        self.df = self.df_sp_values.append(self.df_cont)
        self.df = self.df.append(self.df_disc)
        self.bins = self.sp_bins.append(self.c_bins)
        self.bins = self.bins.append(self.d_bins)
        # calculating woe and other statistics
        self._calc_stat()
        # sorting appropriately for further cutting in transform method
        self.bins.sort_values('bins', inplace=True)
        # returning to original observation order
        self.df.sort_values('order', inplace=True)
        self.df.set_index(self.df['X'].index, inplace=True)
        return self

    def fit_transform(self, x, y):
        """
        Fit WoE transformation and return the result for the training sample
        :param x: continuous or discrete predictor
        :param y: binary target variable
        :return: 'woe' column of the fitted training frame
        """
        self.fit(x, y)
        woe_values = self.df['woe']
        return woe_values

    def _split_sample(self, df):
        if self.type == 'd':
            return df, None
        else:
            return None, df

    def _disc_labels(self, df):
        if df is None:
            return None, None
        df['labels'] = df['X'].apply(
            lambda x: self.spec_values[x] if x in self.spec_values.keys() else 'd_' + str(x))
        df['label'] = df['X'].apply(
            lambda x: self.spec_values[x] if x in self.spec_values.keys() else 'd_' + str(x))
        d_bins = pd.DataFrame({"bins": df['X'].unique()})
        d_bins['labels'] = d_bins['bins'].apply(
            lambda x: self.spec_values[x] if x in self.spec_values.keys() else 'd_' + str(x))
        d_bins['label'] = d_bins['bins'].apply(
            lambda x: self.spec_values[x] if x in self.spec_values.keys() else 'd_' + str(x))
        return df, d_bins

    def _cont_labels(self, df):
        # check whether there is a continuous part
        if df is None:
            return None, None
        # Max buckets num calc
        self.qnt_num = int(np.minimum(
            df['X'].size / self._min_block_size, self.__qnt_num)) + 1
        # cuts - label num for each observation, bins - quartile thresholds
        bins = None
        cuts = None

        if self._predefined_bins is None:
            try:
                cuts, bins = pd.qcut(
                    df["X"], self.qnt_num, retbins=True, labels=False)
                bins = np.append((min(self.df_cont['X']) - 1,), bins[1:-1])
                cuts_show = pd.cut(df['X'].copy(), bins=np.append(
                    bins, (max(self.df_cont['X']),)))
            except ValueError as ex:

                if ex.args[0].startswith('Bin edges must be unique'):
                    self.optimize()
                    return None, None
                    ex.args = (
                        'Please reduce number of bins or encode\
                        frequent items as special values',) + ex.args
                    raise
        else:
            bins = self._predefined_bins
            if bins[0] != min(self.df_cont['X']) - 1:
                bins = np.append((min(self.df_cont['X']) - 1,), bins)
            cuts = pd.cut(df['X'], bins=np.append(bins, (max(self.df_cont['X']),)),
                          labels=np.arange(len(bins)).astype(str))
            cuts_show = pd.cut(df['X'], bins=np.append(
                bins, (max(self.df_cont['X']),)))
        df["labels"] = cuts.astype(str)
        df['label'] = cuts_show.astype(str)

        c_bins = pd.DataFrame(
            {"bins": bins, "labels": np.arange(len(bins)).astype(str),
             'label': cuts_show.cat.categories})
        self.df_cont, self.c_bins = df, c_bins
        return df, c_bins

    # calculating woe and other statistics in this part
    def _calc_stat(self):
        # calculating WoE
        col_names = {'count_nonzero': 'bad', 'size': 'obs'}
        stat = self.df.groupby("labels")['Y'].agg(
            [np.mean, np.count_nonzero, np.size]).rename(columns=col_names).copy()
        if self.t_type != 'b':
            stat['bad'] = stat['mean'] * stat['obs']
        stat['good'] = stat['obs'] - stat['bad']
        t_good = stat['good'].sum()
        t_bad = stat['bad'].sum()

        iv_calc = stat.copy()
        iv_calc['pg'] = iv_calc['good'].apply(
            lambda x: x / t_good if x != 0 else 1 / (t_good + len(iv_calc)))
        iv_calc['pb'] = iv_calc['bad'].apply(
            lambda x: x / t_bad if x != 0 else 1 / (t_good + len(iv_calc)))
        iv_calc['odds_prob'] = iv_calc['pb'] / iv_calc['pg']
        iv_calc['woe'] = iv_calc['odds_prob'].apply(lambda x: np.log(x))
        iv_calc['iv'] = (iv_calc['pb'] - iv_calc['pg']) * iv_calc['woe']
        stat['odds'] = stat['good'] / stat['bad']
        stat['woe'] = iv_calc['woe']
        iv_stat = {'labels': iv_calc.index.values, 'iv': iv_calc['iv'].values}

        iv_stat = pd.DataFrame(data=iv_stat)
        self.iv_stat = iv_stat

        self.iv = iv_stat['iv'].sum()
        # adding stat data to bins
        self.bins = pd.merge(
            stat, self.bins, left_index=True, right_on=['labels'])
        self.bins = pd.merge(self.bins, iv_stat, on=['labels'])
        label_woe = self.bins[['woe', 'labels',
                               'label']].drop_duplicates()
        self.df = pd.merge(self.df, label_woe, left_on=[
                           'labels'], right_on=['labels'])

    def transform(self, x, manual_woe=None):
        """
        Transforms input variable according to previously fitted rule
        :param x: input variable
        :param manual_woe: one can change fitted woe with manual
                            values by providing dict {label: new_woe_value}
        :return: DataFrame with transformed with original and transformed variables
        """
        if not isinstance(x, pd.Series):
            raise TypeError("pandas.Series type expected")

        if self.bins is None:
            raise Exception('Fit the model first, please')
        df = pd.DataFrame({"X": x, 'order': np.arange(x.size)})
        # splitting to discrete and continous pars
        df_sp_values, df_cont = self._split_sample(df)

        # Replacing original with manual woe
        if manual_woe:
            tr_bins = self.bins[['woe', 'labels', 'label']].copy()
            if not type(manual_woe) == dict:
                TypeError("manual_woe should be dict")
            else:
                for key in manual_woe:
                    tr_bins['woe'].mask(tr_bins['labels'] ==
                                        key, manual_woe[key], inplace=True)
        else:
            tr_bins = self.bins
            # function checks existence of special values,
            # raises error if sp do not exist in training set

        def get_sp_label(x_):
            if x_ in self.spec_values.keys():
                return self.spec_values[x_]
            else:
                str_x = 'd_' + str(x_)
                if str_x in list(self.bins['labels']):
                    return str_x
                else:
                    raise ValueError(
                        'Value {} does not exist in the training set'.format(str_x))

        # assigning labels to discrete part
        df_sp_values['labels'] = df_sp_values['X'].apply(get_sp_label)
        df_sp_values['label'] = df_sp_values['X'].apply(get_sp_label)
        # assigning labels to continuous part
        c_bins = self.bins[self.bins['labels'].apply(
            lambda z: not z.startswith('d_'))]

        if not self.type == 'd':
            # if bins[0] != min(self.df['X']) - 1:
            #     bins = np.append((min(self.df['X']) - 1,), bins)
            cuts = pd.cut(df_cont['X'], bins=np.append(
                c_bins["bins"], (max(self.df_cont['X']),)), labels=c_bins["labels"])
            cuts_show = pd.cut(df_cont['X'], bins=np.append(
                c_bins["bins"], (max(self.df_cont['X']),)))
            df_cont['labels'] = cuts.astype(str)
            df_cont['label'] = cuts_show.astype(str)
        # Joining continuous and discrete parts
        df = df_sp_values.append(df_cont)

        # assigning woe
        df = pd.merge(df, tr_bins[['woe', 'labels', 'label']].drop_duplicates(), left_on=[
                      'labels'], right_on=['labels'])
        # returning to original observation order
        df.sort_values('order', inplace=True)
        return df.set_index(x.index)

    def merge(self, label1, label2=None):
        """
        Merge of buckets with given labels
        In case of discrete variable, both labels should be provided.
                    As the result labels will be marget to one bucket.
        In case of continous variable, only label1 should be provided.
                    It will be merged with the next label.
        :param label1: first label to merge
        :param label2: second label to merge
        :return:
        """
        spec_values = self.spec_values.copy()
        c_bins = self.bins[self.bins['labels'].apply(
            lambda x: not x.startswith('d_'))].copy()
        # removing bucket for continuous variable
        if label2 is None and not label1.startswith('d_'):
            c_bins = c_bins[c_bins['labels'] != label1]
        else:
            if not (label1.startswith('d_') and label2.startswith('d_')):
                raise Exception('Labels should be discrete simultaneously')
            for i in self.bins[self.bins['labels'] == label1]['bins']:
                spec_values[i] = label1 + '_' + label2
            bin2 = self.bins[self.bins['labels'] == label2]['bins'].iloc[0]
            spec_values[bin2] = label1 + '_' + label2
        new_woe = WoE(self.__qnt_num, self._min_block_size,
                      spec_values, self.type, c_bins['bins'], self.t_type)
        return new_woe.fit(self.df['X'], self.df['Y'])

    def plot(self, sort_values=True, labels=False):
        """
        Plot WoE transformation and default rates
        :param sort_values: whether to sort discrete variables by woe, continuous by labels
        :param labels: plot labels or intervals for continuous buckets
        :return: plotting object
        """
        bar_width = 0.8
        woe_fig = plt.figure()
        plt.title('Number of Observations and WoE per bucket')
        ax = woe_fig.add_subplot(111)
        ax.set_ylabel('Observations')
        plot_data = self.bins[['labels', 'woe',
                               'obs', 'bins']].copy().drop_duplicates()

        def continuous_mask(frame):
            return frame['labels'].apply(lambda z: not z.startswith('d_'))

        if sort_values:
            if self.type == 'd':
                plot_data.sort_values('woe', inplace=True)
            else:
                # continuous buckets by border, then special values by label
                cont_labels = continuous_mask(plot_data)
                plot_data = pd.concat([
                    plot_data[cont_labels].sort_values('bins'),
                    plot_data[~cont_labels].sort_values('labels')])

        # creating plot labels
        plot_data['plot_bins'] = plot_data['bins'].apply(
            lambda x: '{}'.format(x))
        cont_labels = continuous_mask(plot_data)
        if (not self.type == 'd' and cont_labels.any()
                and self.df_cont is not None and len(self.df_cont) > 0):
            # "left : right" intervals; the right bound is the next
            # *continuous* border (special-value rows are not part of the
            # chain), and the training maximum for the last bucket
            left_bound = plot_data.loc[cont_labels, 'plot_bins']
            right_bound = list(left_bound.iloc[1:]) + \
                [str(max(self.df_cont['X']))]
            plot_data.loc[cont_labels, 'plot_bins'] = [
                lo + ' : ' + hi for lo, hi in zip(left_bound, right_bound)]
        # special / discrete buckets are shown by their label
        plot_data['plot_bins'] = np.where(
            cont_labels, plot_data['plot_bins'], plot_data['labels'])

        # start plotting
        index = np.arange(plot_data.shape[0])
        plt.xticks(index + 0.8 * bar_width,
                   plot_data['labels'] if labels else plot_data['plot_bins'])
        plt.bar(index, plot_data['obs'], bar_width,
                color='b', label='Observations')
        ax2 = ax.twinx()
        ax2.set_ylabel('Weight of Evidence')
        # colour is given once (the former 'bo-' format clashed with color='r')
        ax2.plot(index + bar_width / 2,
                 plot_data['woe'], 'o-', linewidth=4.0, color='r', label='WoE')
        handles1, labels1 = ax.get_legend_handles_labels()
        handles2, labels2 = ax2.get_legend_handles_labels()
        plt.legend(handles1 + handles2, labels1 + labels2)
        woe_fig.autofmt_xdate()
        return woe_fig

    def new_bin(self, criterion='gini', fix_depth=None, max_depth=None,
                cv=3, scoring=None, min_samples_leaf=None):
        """
        WoE bucketing optimization with a decision tree
        :param criterion: binary tree split criteria
        :param fix_depth: use tree of a fixed depth (2^fix_depth buckets)
        :param max_depth: maximum tree depth for a optimum cross-validation search
        :param cv: number of cv buckets
        :param scoring: scorer for cross_val_score
        :param min_samples_leaf: minimum number of observations in each of optimized buckets
        :return: (None, None) when there is no regular sample to split;
                 for a discrete variable (regrouped X with rare categories
                 replaced by "others", list of kept categories);
                 for a continuous variable (relabelled sample, bins frame) -
                 self.df, self.df_cont, self.bins and self.c_bins are updated
        """
        if self.t_type == 'b':
            tree_type = tree.DecisionTreeClassifier
        else:
            tree_type = tree.DecisionTreeRegressor

        m_depth = int(np.log2(self.__qnt_num)) + 1 if max_depth is None else max_depth
        min_samples_leaf = self._min_block_size if min_samples_leaf is None else min_samples_leaf
        cont = self.df_cont
        if cont is None or cont.empty:
            return None, None

        y_train = cont['Y']
        if self.type == 'd':
            # one indicator column per category
            x_train = pd.get_dummies(cont['X'])
        else:
            x_train = cont['X'].values.reshape(cont['X'].shape[0], 1)

        start = 1
        if fix_depth is None:
            # pick the depth with the best cross-validated score
            cv_scores = []
            for depth in range(start, m_depth):
                d_tree = tree_type(
                    criterion=criterion, max_depth=depth, min_samples_leaf=min_samples_leaf)
                scores = cross_val_score(
                    d_tree, x_train, y_train, cv=cv, scoring=scoring)
                cv_scores.append(scores.mean())
            # an empty search range (m_depth <= 1) falls back to depth 1
            best = int(np.argmax(cv_scores)) + start if cv_scores else start
        else:
            best = fix_depth

        final_tree = tree_type(
            criterion=criterion, max_depth=best, min_samples_leaf=min_samples_leaf)
        final_tree.fit(x_train, y_train)

        # for categorical feature's optimization
        if self.type == 'd':
            # indices of the indicator columns the tree actually split on
            opt_bins = final_tree.tree_.feature[final_tree.tree_.feature >= 0]
            cat_f_sort = sorted(cont['X'].unique())
            # Series.tolist() replaces as_matrix(), removed in pandas 1.0
            bins = self.d_bins['bins'].tolist() if self.d_bins is not None else []

            for item in opt_bins:
                bins.append(cat_f_sort[item])
            kept = set(bins)
            cat_f_new = self.df['X'].apply(
                lambda x: "others" if x not in kept else x)

            return cat_f_new, bins

        # for continuous feature's optimization
        lower = min(self.df_cont['X']) - 1
        opt_bins = np.sort(
            final_tree.tree_.threshold[final_tree.tree_.feature >= 0])
        # the first edge always lies just below the observed minimum
        if len(opt_bins) == 0 or opt_bins[0] != lower:
            opt_bins = np.append((lower,), opt_bins)
        edges = np.append(opt_bins, (max(self.df_cont['X']),))
        cuts = pd.cut(cont['X'], bins=edges,
                      labels=np.arange(len(opt_bins)).astype(str))
        cuts_show = pd.cut(cont['X'], edges)
        self.df_cont['labels'] = cuts.astype(str)
        self.df_cont['label'] = cuts_show.astype(str)
        # pd.concat replaces DataFrame.append, removed in pandas 2.0
        self.df = pd.concat([self.df_sp_values, self.df_cont])

        c_bins = pd.DataFrame(
            {"bins": opt_bins, "labels": np.arange(len(opt_bins)).astype(str),
             'label': cuts_show.cat.categories})
        opt_bins = pd.concat([self.d_bins, c_bins])
        self.bins = opt_bins
        self.c_bins = opt_bins
        return self.df, opt_bins

    def optimize(self, criterion='gini', fix_depth=None, max_depth=None, cv=3,
                 scoring='roc_auc', min_samples_leaf=None):
        """
        WoE bucketing optimization
        :param criterion: binary tree split criteria
        :param fix_depth: use tree of a fixed depth (2^fix_depth buckets)
        :param max_depth: maximum tree depth for a optimum cross-validation search
        :param cv: number of cv buckets
        :param scoring: scorer for cross_val_score
        :param min_samples_leaf: minimum number of observations in each of optimized buckets
        :return: WoE class refitted on the optimized split
        """
        search_args = dict(criterion=criterion, fix_depth=fix_depth,
                           max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                           cv=cv, scoring=scoring)
        first, _ = self.new_bin(**search_args)

        if self.type == 'd':
            # categorical: refit on the regrouped predictor returned by new_bin
            return self.fit(first, self.df['Y'])
        # continuous: new_bin already relabelled the sample, so fit only
        # recomputes the statistics
        return self.fit(None, None)

### 覆盖率计算

In [None]:
# <api>
def coverage_sigle_col(df, mi_values=None):
    """Return the coverage (share of non-missing entries) of one column.

    An entry is missing when it is null or equal to one of ``mi_values``.
    Every entry is counted at most once, so repeated values in ``mi_values``
    no longer inflate the missing count.

    :param df: the column to inspect (a pandas Series)
    :param mi_values: additional values that should be treated as missing
    :return: coverage as a float in [0, 1]; NaN for an empty column
    """
    total = len(df)
    if total == 0:
        return float('nan')

    missing = df.isnull()
    mi_values = [] if mi_values is None else list(mi_values)
    if mi_values:
        missing = missing | df.isin(mi_values)
    return 1 - float(missing.sum()) / total

In [None]:
# <api>


def coverage_calc_multiprocess(df, colnames, mi_values=[]):
    """Compute the coverage of several columns in parallel.

    Each column is handed to ``coverage_sigle_col`` through joblib with
    ``n_jobs=-1``.

    :param df: DataFrame holding the columns
    :param colnames: names of the columns to evaluate
    :param mi_values: additional values that should be treated as missing
    :return: DataFrame with the columns ``feature_name`` and ``coverage``,
             sorted by descending coverage
    """
    tasks = (delayed(coverage_sigle_col)(df[ftr], mi_values=mi_values)
             for ftr in colnames)
    coverage_list = Parallel(n_jobs=-1)(tasks)

    table = pd.DataFrame({'feature_name': colnames, 'coverage': coverage_list})
    table = table.sort_values('coverage', ascending=False)
    table = table.reset_index(drop=True)
    return table[['feature_name', 'coverage']]

In [None]:
# <api>


def coverage_calc(df, colnames, mi_values=None):
    """Compute the coverage (share of non-missing entries) of several columns.

    An entry is missing when it is null or equal to one of ``mi_values``.
    Every entry is counted at most once, so repeated values in ``mi_values``
    no longer inflate the missing count.

    :param df: DataFrame holding the columns
    :param colnames: names of the columns to evaluate
    :param mi_values: additional values that should be treated as missing
    :return: DataFrame with the columns ``feature_name`` and ``coverage``,
             sorted by descending coverage; the coverage of an empty
             column is NaN
    """
    colnames = list(colnames)
    mi_values = [] if mi_values is None else list(mi_values)
    result = []

    for name in colnames:
        column = df[name]
        missing = column.isnull()
        if mi_values:
            missing = missing | column.isin(mi_values)
        total = len(column)
        if total:
            result.append(1 - float(missing.sum()) / total)
        else:
            result.append(float('nan'))

    output = pd.DataFrame({'feature_name': colnames, 'coverage': result})
    output = output.sort_values('coverage', ascending=False)
    output = output.reset_index(drop=True)

    return output[['feature_name', 'coverage']]

### 信用分分析

In [None]:
# <api>


def score_analysis_optimize(df, colname, label, method=None,
                            opt_criterion='gini', qnt_num=10,
                            min_block_size=200, max_depth=6,
                            min_samples_leaf=200, cv=5,
                            scoring="roc_auc", bins=None):
    """
    Compute the IV of a score column, print it and plot the WoE result.

    df - dataframe
    colname - score column name, e.g. score, InternetRisk.UMengCreditScore
    label - label column name
    method - 'optimize' (WoE.optimize, decision tree based) or
             'quantile' (WoE.fit); anything else raises ValueError
    opt_criterion, max_depth, min_samples_leaf, cv, scoring - forwarded to
             WoE.optimize
    qnt_num, min_block_size, bins - forwarded to the WoE constructor

    The column is always treated as continuous (v_type='c').
    Returns (bin table, iv).
    """
    woe = WoE(qnt_num=qnt_num, min_block_size=min_block_size,
              v_type='c', bins=bins, t_type='b')
    woe.read_data(df[colname], df[label])

    if method not in ('optimize', 'quantile'):
        raise ValueError('you should choose method from quantile and optimize')

    if method == 'optimize':
        fitted = woe.optimize(criterion=opt_criterion, max_depth=max_depth,
                              min_samples_leaf=min_samples_leaf,
                              cv=cv, scoring=scoring)
    else:
        fitted = woe.fit(df[colname], df[label])

    report_columns = ['mean', 'bad', 'obs', 'good', 'odds',
                      'woe', 'label', 'iv']
    output = fitted.bins[report_columns]

    print(fitted.iv)
    fitted.plot()
    return output, fitted.iv

### 计算IV,不需要区分Categorical和Continuous feature

In [1]:
# <api>


def iv_calc_single_col(df, col_name, label, method=None, opt_criterion='gini',
                       qnt_num=10, min_block_size=200, max_depth=6,
                       min_samples_leaf=200, cv=5, scoring="roc_auc",
                       bins=None, verbose=0, plt_show=0):
    """
    Compute the IV of one feature column (the unit of work run by iv_calc).

    Object-dtype columns are passed to WoE with v_type='d', every other
    dtype with v_type='c'.

    method - 'optimize' (WoE.optimize) or 'quantile' (WoE.fit); anything
             else raises ValueError
    verbose - when > 0, print the bin table and the IV
    plt_show - when > 0, draw the WoE plot and call plt.show()

    Returns the iv attribute of the fitted WoE object.
    """
    feature, target = df[col_name], df[label]
    v_type = 'd' if feature.dtypes == object else 'c'

    woe = WoE(qnt_num=qnt_num, min_block_size=min_block_size,
              v_type=v_type, bins=bins, t_type='b')
    woe.read_data(feature, target)

    if method == 'optimize':
        fitted = woe.optimize(criterion=opt_criterion, max_depth=max_depth,
                              min_samples_leaf=min_samples_leaf,
                              cv=cv, scoring=scoring)
    elif method == 'quantile':
        fitted = woe.fit(feature, target)
    else:
        raise ValueError(
            'you should choose method from quantile and optimize')

    if verbose > 0:
        separator = "===================================="
        print(fitted.bins[['mean', 'bad', 'obs', 'good',
                           'odds', 'woe', 'label', 'iv']])
        print(separator)
        print("feature {0} iv value: {1}".format(col_name, fitted.iv))
        print(separator)
    if plt_show > 0:
        fitted.plot()
        plt.show()
    return fitted.iv

In [3]:
# <api>


def iv_calc(df, colnames, label, method=None, opt_criterion='gini',
            qnt_num=10, min_block_size=200, max_depth=6,
            min_samples_leaf=200, cv=5, scoring="roc_auc",
            bins=None, verbose=0, plt_show=0):
    """
    Calculate the IV of several features in parallel (joblib, n_jobs=-1).
    -------------------------------------------------------------------
    Works for both categorical and continuous features, and for scores.
    Every column is evaluated by iv_calc_single_col on df[[column, label]].
    If the label column is listed in colnames it is dropped from the list.
    params:
        method: how the column is binned, 'quantile' or 'optimize'
            (decision tree based)
        qnt_num & min_block_size: used by the quantile method
        opt_criterion & max_depth & min_samples_leaf & cv & scoring:
            used by the decision tree optimisation
        bins: manually chosen cut points, used with the quantile method
        verbose & plt_show: forwarded to iv_calc_single_col
    Returns a dataframe with columns ['feature_name', 'iv'], sorted by iv in
    descending order with a fresh 0..n-1 index.
    """
    if label in colnames:
        colnames = list(colnames)
        colnames.remove(label)

    # Arguments that are identical for every column.
    shared = dict(label=label, method=method,
                  opt_criterion=opt_criterion,
                  qnt_num=qnt_num,
                  min_block_size=min_block_size,
                  max_depth=max_depth,
                  min_samples_leaf=min_samples_leaf,
                  cv=cv, scoring=scoring,
                  bins=bins, verbose=verbose,
                  plt_show=plt_show)
    job = delayed(iv_calc_single_col)
    iv_list = Parallel(n_jobs=-1)(
        job(df[[ftr, label]], col_name=ftr, **shared) for ftr in colnames)

    report = pd.DataFrame({'feature_name': colnames, 'iv': iv_list})
    report = report.sort_values('iv', ascending=False).reset_index(drop=True)

    return report[['feature_name', 'iv']]

In [None]:
# <api>
def iv_calc_single_col_twosample(df, df2, col_name, label, method=None, opt_criterion='gini',
                       qnt_num=10, min_block_size=200, max_depth=6,
                       min_samples_leaf=200, cv=5, scoring="roc_auc",
                       bins=None, verbose=0, plt_show=0):
    """
    Compute the IV of one feature on two samples (the unit of work run by
    iv_calc_twosample).

    The first sample (df) is binned with the chosen method; the resulting
    cut points are then reused to fit a second WoE on df2.

    method - 'optimize' (WoE.optimize) or 'quantile' (WoE.fit); anything
             else raises ValueError
    verbose - when > 0, print the bin table and the IV of the first sample
    plt_show - when > 0, plot the WoE result of the first sample

    Returns (iv on df, iv on df2).
    """
    train_x, train_y = df[col_name], df[label]
    # Object dtype is handled as discrete ('d'), anything else as continuous ('c').
    v_type = 'd' if train_x.dtypes == object else 'c'

    woe = WoE(qnt_num=qnt_num, min_block_size=min_block_size,
              v_type=v_type, bins=bins, t_type='b')
    woe.read_data(train_x, train_y)

    if method == 'optimize':
        woe2 = woe.optimize(criterion=opt_criterion, max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            cv=cv, scoring=scoring)
    elif method == 'quantile':
        woe2 = woe.fit(train_x, train_y)
    else:
        raise ValueError(
            'you should choose method from quantile and optimize')

    # NOTE(review): the cut points are read from `woe`, not from the returned
    # `woe2` - confirm both refer to the same fitted bins.
    train_bins = list(woe.bins['bins'].values)
    woe3 = WoE(qnt_num=qnt_num, min_block_size=min_block_size,
               v_type=v_type, bins=train_bins, t_type='b')
    woe4 = woe3.fit(df2[col_name], df2[label])

    if verbose > 0:
        separator = "===================================="
        print(woe2.bins[['mean', 'bad', 'obs', 'good',
                         'odds', 'woe', 'label', 'iv']])
        print(separator)
        print("feature {0} iv value: {1}".format(col_name, woe2.iv))
        print(separator)
    if plt_show > 0:
        woe2.plot()
        plt.show()
    return woe2.iv, woe4.iv

In [None]:
# <api>
def iv_calc_twosample(df1,df2, colnames, label, method=None, opt_criterion='gini',
            qnt_num=10, min_block_size=200, max_depth=6,
            min_samples_leaf=200, cv=5, scoring="roc_auc",
            bins=None, verbose=0, plt_show=0):
    """
    Calculate the IV of several features on two samples in parallel
    (joblib, n_jobs=-1).
    -------------------------------------------------------------------
    Works for both categorical and continuous features, and for scores.
    Every column is evaluated by iv_calc_single_col_twosample on
    df1[[column, label]] and df2[[column, label]].
    If the label column is listed in colnames it is dropped from the list.
    params:
        method: how the column is binned, 'quantile' or 'optimize'
            (decision tree based)
        qnt_num & min_block_size: used by the quantile method
        opt_criterion & max_depth & min_samples_leaf & cv & scoring:
            used by the decision tree optimisation
        bins: manually chosen cut points, used with the quantile method
        verbose & plt_show: forwarded to iv_calc_single_col_twosample
    Returns a dataframe with columns ['feature_name', 'iv_train', 'iv_test']
    (df1 -> iv_train, df2 -> iv_test), sorted by iv_train in descending
    order with a fresh 0..n-1 index.
    """
    if label in colnames:
        colnames = list(colnames)
        colnames.remove(label)

    # Arguments that are identical for every column.
    shared = dict(label=label, method=method,
                  opt_criterion=opt_criterion,
                  qnt_num=qnt_num,
                  min_block_size=min_block_size,
                  max_depth=max_depth,
                  min_samples_leaf=min_samples_leaf,
                  cv=cv, scoring=scoring,
                  bins=bins, verbose=verbose,
                  plt_show=plt_show)
    job = delayed(iv_calc_single_col_twosample)
    iv_pairs = Parallel(n_jobs=-1)(
        job(df1[[ftr, label]], df2[[ftr, label]], col_name=ftr, **shared)
        for ftr in colnames)

    iv_train = [pair[0] for pair in iv_pairs]
    iv_test = [pair[1] for pair in iv_pairs]

    report = pd.DataFrame({'feature_name': colnames, 'iv_train': iv_train,'iv_test':iv_test})
    report = report.sort_values('iv_train', ascending=False).reset_index(drop=True)

    return report[['feature_name', 'iv_train','iv_test']]

In [None]:
# <api>


def iv_calc_single_process(df, colnames, label, method=None, opt_criterion='gini',
                           qnt_num=10, min_block_size=200, max_depth=6,
                           min_samples_leaf=200, cv=5, scoring="roc_auc",
                           bins=None, verbose=0, plt_show=0):
    """
    Calculate the IV of several features one after another in this process.
    -------------------------------------------------------------------
    Works for both categorical and continuous features, and for scores.
    Object-dtype columns are passed to WoE with v_type='d', every other
    dtype with v_type='c'.
    params:
        method: how the column is binned, 'quantile' or 'optimize'
            (decision tree based); any other value raises ValueError when
            the first column is processed
        qnt_num & min_block_size: used by the quantile method
        opt_criterion & max_depth & min_samples_leaf & cv & scoring:
            used by the decision tree optimisation
        bins: manually chosen cut points, used with the quantile method
        verbose: when > 0, print the bin table and IV of every column
        plt_show: when > 0, plot the WoE result of every column
    Returns a dataframe with columns ['feature_name', 'iv'], sorted by iv in
    descending order with a fresh 0..n-1 index.
    """
    separator = "===================================="
    result = []
    for name in colnames:
        feature, target = df[name], df[label]
        v_type = 'd' if feature.dtypes == object else 'c'
        woe = WoE(qnt_num=qnt_num, min_block_size=min_block_size,
                  v_type=v_type, bins=bins, t_type='b')
        woe.read_data(feature, target)

        if method == 'optimize':
            fitted = woe.optimize(criterion=opt_criterion, max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf,
                                  cv=cv, scoring=scoring)
        elif method == 'quantile':
            fitted = woe.fit(feature, target)
        else:
            raise ValueError(
                'you should choose method from quantile and optimize')
        result.append(fitted.iv)

        if verbose > 0:
            print(fitted.bins[['mean', 'bad', 'obs', 'good',
                               'odds', 'woe', 'label', 'iv']])
            print(separator)
            print("feature {0} iv value: {1}".format(name, fitted.iv))
            print(separator)
        if plt_show > 0:
            fitted.plot()
            plt.show()

    report = pd.DataFrame({'feature_name': colnames, 'iv': result})
    report = report.sort_values('iv', ascending=False).reset_index(drop=True)

    return report[['feature_name', 'iv']]

### 对Categorical计算IV

In [None]:
# <api>


def iv_calc_categorical(df, colnames, label):
    """
    Calculate the IV of categorical features, one bin per distinct value.

    This func can be replaced by iv_calc with method 'quantile'.
    Null feature values are not handled properly (a null level never
    matches any row), which is a known limitation.

    df - dataframe
    colnames - categorical columns to evaluate
    label - label column name; 1 is counted as bad, 0 as good

    Smoothing: when a level has no good (or no bad) rows, its count is set
    to 1 and the number of distinct levels is added to the corresponding
    total; the enlarged total stays in effect for the remaining levels of
    that column.

    Returns a dataframe with columns ['feature_name', 'iv'], sorted by iv in
    descending order with a fresh 0..n-1 index.
    """
    labels = df[label]
    base_good = float((labels == 0).sum())
    base_bad = float((labels == 1).sum())

    result = []
    for name in colnames:
        # Totals restart from the raw counts for every column.
        good_all, bad_all = base_good, base_bad

        pair = df[[name, label]]
        levels = pair[name].unique()
        n_levels = len(levels)

        iv = 0
        for level in levels:
            level_labels = pair.loc[pair[name] == level, label]
            bad = float((level_labels == 1).sum())
            good = float((level_labels == 0).sum())
            if good == 0:
                good = 1
                good_all += n_levels
            if bad == 0:
                bad = 1
                bad_all += n_levels
            bad_rate = bad / bad_all
            good_rate = good / good_all
            iv += (bad_rate - good_rate) * np.log(bad_rate / good_rate)
        result.append(iv)

    report = pd.DataFrame({'feature_name': colnames, 'iv': result})
    report = report.sort_values('iv', ascending=False).reset_index(drop=True)

    return report[['feature_name', 'iv']]

### 计算PSI

In [None]:
# <api>


def balanceSample(dt, y_true, seed=27):
    """
    Down-sample the larger class so that good and bad rows are equally many.

    dt - dataframe
    y_true - label column name; 1 is bad, 0 is good
    seed - seed passed to random.seed, so the same seed always selects the
           same rows (note: this reseeds the global `random` module)

    The larger class is randomly reduced to the size of the smaller one.
    The previous code shuffled the rows and then discarded the shuffle by
    taking dt[...].head(n), so it always kept the first n rows regardless of
    the seed.

    Returns the bad rows followed by the good rows; rows keep their original
    index labels.
    """
    random.seed(seed)

    bad = dt[dt[y_true] == 1]
    good = dt[dt[y_true] == 0]

    if good.shape[0] >= bad.shape[0]:
        ids = list(range(good.shape[0]))
        random.shuffle(ids)
        # Keep as many randomly chosen good rows as there are bad rows.
        good = good.iloc[ids[:bad.shape[0]]]
    else:
        ids = list(range(bad.shape[0]))
        random.shuffle(ids)
        # Keep as many randomly chosen bad rows as there are good rows.
        bad = bad.iloc[ids[:good.shape[0]]]

    return pd.concat([bad, good], axis=0)


def score_bin_bucket(score_col):
    """
    Map every score to a bucket number.

    The bucket edges are the 11-quantile cut points of the integers
    300..850, with the last edge moved to 851 so that 850 falls into
    bucket 11.

    score_col - iterable of scores; -1 marks a missing score

    Returns a list with one entry per score: 1..11 for in-range scores and
    -1 for the missing marker.
    Raises ValueError for any other score that lands outside the edges.
    """
    _, edges = pd.qcut(range(300, 851), 11, retbins=True)
    edges[-1] = 851.0

    buckets = []
    for score in score_col:
        if score == -1:
            buckets.append(-1)
            continue
        position = bisect.bisect(edges, score)
        if position < 1 or position > 11:
            raise ValueError('Invalid score out of range 300-850')
        buckets.append(position)
    return buckets


def score_psi_calc(df, groupby, base, compare, colnames, target=None, method=None):
    """
    Calculate the PSI of score columns between two groups of one dataframe.

    df - dataframe
    groupby - column whose value identifies the group of a row
    base, compare - the two group values to compare
    colnames - score columns to evaluate
    target - optional label column; when given, both groups are re-balanced
             with balanceSample first
    method: fix or quantile
        fix: buckets come from score_bin_bucket (11 score buckets plus a
             -1 bucket for missing scores). An empty bucket is smoothed by
             setting its count to 1 and adding 11 to that sample's
             denominator; the enlarged denominator stays in effect for the
             remaining buckets of the column (a known rough approximation).
        quantile: delegates to psi_calc.

    Returns a dataframe with columns ['group', 'feature_name', 'psi_score'],
    sorted by psi_score in descending order.
    Raises ValueError for any other method.
    """
    if method == 'quantile':
        return psi_calc(df, groupby, base, compare, colnames, target)
    if method != 'fix':
        raise ValueError('you should choose method from quantile and fix')

    # Plain [] indexing: .iloc does not accept a boolean Series as a mask.
    base_ = df[df[groupby] == base].reset_index()
    compare_ = df[df[groupby] == compare].reset_index()

    # re-balance sample to keep odds = 1
    if target is not None:
        base_ = balanceSample(base_, target)
        compare_ = balanceSample(compare_, target)

    l_base = len(base_)
    l_compare = len(compare_)

    # The 11 score buckets followed by the missing-score bucket.
    buckets = list(range(1, 12)) + [-1]

    result = []
    for name in colnames:
        psi = 0
        l_b = l_base
        l_c = l_compare

        b_labels = score_bin_bucket(base_[name])
        c_labels = score_bin_bucket(compare_[name])

        for bucket in buckets:
            b_ = b_labels.count(bucket)
            c_ = c_labels.count(bucket)
            # Both sides are checked independently; an `elif` on the second
            # check left a zero count in place and caused a division by zero
            # when neither sample had any missing score.
            if b_ == 0:
                b_ = 1
                l_b = l_b + 11
            if c_ == 0:
                c_ = 1
                l_c = l_c + 11
            psi += (b_ / l_b - c_ / l_c) * np.log((b_ / l_b) / (c_ / l_c))
        result.append(psi)

    tag = "{0}-{1}".format(base, compare)

    output = pd.DataFrame(
        {'group': tag, 'feature_name': colnames, 'psi_score': result})
    output = output.sort_values('psi_score', ascending=False)
    output = output.reset_index(drop=True)[
        ['group', 'feature_name', 'psi_score']]

    return output


def score_psi_calc_twosample(base, compare, colnames, target=None, method=None):
    """
    Calculate the PSI of score columns between two separate dataframes.

    base, compare - the two dataframes to compare
    colnames - score columns to evaluate
    target - optional label column; when given, both samples are
             re-balanced with balanceSample first
    method: fix or quantile
        fix: buckets come from score_bin_bucket (11 score buckets plus a
             -1 bucket for missing scores). An empty bucket is smoothed by
             setting its count to 1 and adding 11 to that sample's
             denominator; the enlarged denominator stays in effect for the
             remaining buckets of the column (a known rough approximation).
        quantile: delegates to psi_calc_twosample.

    Returns a dataframe with columns ['feature_name', 'psi_score'], sorted
    by psi_score in descending order.
    Raises ValueError for any other method.
    """
    if method == 'quantile':
        return psi_calc_twosample(base, compare, colnames, target)
    if method != 'fix':
        raise ValueError('you should choose method from quantile and fix')

    base_ = base.reset_index()
    compare_ = compare.reset_index()

    # re-balance sample to keep odds = 1
    if target is not None:
        base_ = balanceSample(base_, target)
        compare_ = balanceSample(compare_, target)

    l_base = len(base_)
    l_compare = len(compare_)

    # The 11 score buckets followed by the missing-score bucket.
    buckets = list(range(1, 12)) + [-1]

    result = []
    for name in colnames:
        psi = 0
        l_b = l_base
        l_c = l_compare

        b_labels = score_bin_bucket(base_[name])
        c_labels = score_bin_bucket(compare_[name])

        for bucket in buckets:
            b_ = b_labels.count(bucket)
            c_ = c_labels.count(bucket)
            # Both sides are checked independently; an `elif` on the second
            # check left a zero count in place and caused a division by zero
            # when neither sample had any missing score.
            if b_ == 0:
                b_ = 1
                l_b = l_b + 11
            if c_ == 0:
                c_ = 1
                l_c = l_c + 11
            psi += (b_ / l_b - c_ / l_c) * np.log((b_ / l_b) / (c_ / l_c))
        result.append(psi)

    output = pd.DataFrame({'feature_name': colnames, 'psi_score': result})
    output = output.sort_values('psi_score', ascending=False)
    output = output.reset_index(drop=True)[
        ['feature_name', 'psi_score']]

    return output

### PSI计算，不需要区分Categorical，Continuous 或者 Score

In [None]:
# <api>


def psi_calc(df, groupby, base, compare, colnames, target=None):
    """
    Calculate PSI between two groups of one dataframe with PSICalculation.
    ------------------------------------------------------------------------
    Can be used on continuous features, categorical features and scores.
    Per the original author's note, PSICalculation fills missing values
    itself and avoids the Laplace issue of the 'fix' score method.

    df - dataframe
    groupby - column whose value identifies the group of a row
    base, compare - the two group values; the base group is passed to
        PSICalculation as train data, the compare group as test data
    colnames - columns to evaluate
    target - optional label column; when given, both groups are re-balanced
        with balanceSample first

    Returns a dataframe with columns ['group', 'feature_name', 'psi_score'],
    sorted by psi_score in descending order with a fresh 0..n-1 index.
    """
    # Plain [] indexing: .iloc does not accept a boolean Series as a mask.
    base_ = df[df[groupby] == base].reset_index()
    compare_ = df[df[groupby] == compare].reset_index()

    # re-balance sample to keep odds = 1
    if target is not None:
        base_ = balanceSample(base_, target)
        compare_ = balanceSample(compare_, target)

    psi = PSICalculation(compare_, base_)

    for name in colnames:
        psi.calculatePSI(name)

    scores = [psi.psi_total[name] for name in colnames]

    # format() also works when the group values are not strings.
    tag = "{0}-{1}".format(base, compare)
    output = pd.DataFrame(
        {'group': tag, 'feature_name': colnames, 'psi_score': scores})
    output = output.sort_values('psi_score', ascending=False)
    output = output.reset_index(drop=True)[
        ['group', 'feature_name', 'psi_score']]

    return output

In [None]:
# <api>
def psi_single_col(base, compare, col_name):
    """
    Return the PSI of one column (the unit of work run by
    psi_calc_twosample_multiprocess).

    base is passed to PSICalculation as train data, compare as test data.
    """
    calculator = PSICalculation(compare, base)
    calculator.calculatePSI(col_name)
    totals = calculator.psi_total
    return totals[col_name]

In [None]:
# <api>


def psi_calc_twosample_multiprocess(base, compare, colnames, target=None):
    """
    Calculate PSI between two dataframes in parallel (joblib, n_jobs=-1).
    ------------------------------------------------------------------------
    Can be used on continuous features, categorical features and scores.
    Every column is evaluated by psi_single_col, which relies on
    PSICalculation.

    base, compare - the two dataframes to compare
    colnames - columns to evaluate
    target - optional label column; when given, both samples are
        re-balanced with balanceSample first

    Returns a dataframe with columns ['feature_name', 'psi_score'], sorted
    by psi_score in descending order with a fresh 0..n-1 index.
    """
    base_, compare_ = base.reset_index(), compare.reset_index()

    # re-balance sample to keep odds = 1
    if target is not None:
        base_ = balanceSample(base_, target)
        compare_ = balanceSample(compare_, target)

    job = delayed(psi_single_col)
    psi_list = Parallel(n_jobs=-1)(
        job(base_, compare_, ftr) for ftr in colnames)

    report = pd.DataFrame({'feature_name': colnames, 'psi_score': psi_list})
    report = report.sort_values('psi_score', ascending=False)
    report = report.reset_index(drop=True)

    return report[['feature_name', 'psi_score']]

In [None]:
# <api>


def psi_calc_twosample(base, compare, colnames, target=None):
    """
    Calculate PSI between two dataframes with PSICalculation.
    ------------------------------------------------------------------------
    Can be used on continuous features, categorical features and scores.

    base, compare - the two dataframes; base is passed to PSICalculation as
        train data, compare as test data
    colnames - columns to evaluate
    target - optional label column; when given, both samples are
        re-balanced with balanceSample first

    Returns a dataframe with columns ['feature_name', 'psi_score'], sorted
    by psi_score in descending order with a fresh 0..n-1 index.
    """
    base_, compare_ = base.reset_index(), compare.reset_index()

    # re-balance sample to keep odds = 1
    if target is not None:
        base_ = balanceSample(base_, target)
        compare_ = balanceSample(compare_, target)

    calculator = PSICalculation(compare_, base_)
    for name in colnames:
        calculator.calculatePSI(name)

    totals = calculator.psi_total
    scores = [totals[name] for name in colnames]

    report = pd.DataFrame({'feature_name': colnames, 'psi_score': scores})
    report = report.sort_values('psi_score', ascending=False)
    report = report.reset_index(drop=True)

    return report[['feature_name', 'psi_score']]

### ks curve and calc

* max ks calc

* prec/rec

In [None]:
# <api>


def greaterThan(a, b):
    """Return 1 when a is strictly greater than b, otherwise 0."""
    if a > b:
        return 1
    return 0

# <api>


def metricsPlot(x, y, xlab, ylab, title):
    """
    Draw y against x with pyplot, then set the title, the axis labels and
    a lower-right legend. The figure is not shown here (no plt.show()).
    """
    plt.plot(x, y)
    plt.title(title)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.legend(loc='lower right')

# <api>


def prec(Y_true, Y_predprob, t):
    """
    Precision of the predictions at threshold t.

    A prediction counts as positive (1) when its value is strictly greater
    than t; the result comes from sklearn's precision_score.
    """
    predicted = np.vectorize(greaterThan)(Y_predprob, t)
    return metrics.precision_score(Y_true, predicted)

# <api>


def rec(Y_true, Y_predprob, t):
    """
    Recall of the predictions at threshold t.

    A prediction counts as positive (1) when its value is strictly greater
    than t; the result comes from sklearn's recall_score.
    """
    predicted = np.vectorize(greaterThan)(Y_predprob, t)
    return metrics.recall_score(Y_true, predicted)

# <api>


def _first_close_index(values, target):
    """Index of the first entry of values that is close to target (np.isclose); the last index when none is."""
    hits = np.flatnonzero(np.isclose(target, values))
    return int(hits[0]) if hits.size else len(values) - 1


def ks_curve(Y_true, Y_predprob, fig_path, low_bound=-0.01, high_bound=1.01, is_score=False):
    """
    Kolmogorov-Smirnov Test.

    Builds the empirical CDFs of the predictions of the good (label 0) and
    bad (label 1) samples, finds their largest gap, saves a chart of both
    curves to fig_path and reports precision / recall at the KS point.

    Y_true - labels, 0 for good and 1 for bad
    Y_predprob - predicted probabilities or scores
    fig_path - where the chart is saved (plt.savefig)
    low_bound, high_bound - x range of the chart
        prob:  [-0.01, 1.01]
        score: [301, 851]
    is_score - when True, predictions and the KS point are mirrored as
        high_bound - value before precision / recall are computed

    Returns (ks statistic, KS point rounded to 2 digits,
             precision rounded to 2 digits, recall rounded to 2 digits).
    Raises ValueError when one of the two classes has no sample.
    """
    df = pd.DataFrame({'Y_truth': Y_true, 'Y_predprob_1': Y_predprob})
    data1 = np.sort(np.asarray(df[df.Y_truth == 0]['Y_predprob_1']))
    data2 = np.sort(np.asarray(df[df.Y_truth == 1]['Y_predprob_1']))
    n1 = len(data1)
    n2 = len(data2)
    if n1 == 0 or n2 == 0:
        raise ValueError('ks_curve needs at least one good (0) and one bad (1) sample')

    data_all = np.concatenate([data1, data2])
    cdf1 = np.searchsorted(data1, data_all, side='right') / (1. * n1)
    cdf2 = np.searchsorted(data2, data_all, side='right') / (1. * n2)

    cdf_abs_dif = np.absolute(cdf1 - cdf2)
    d = np.max(cdf_abs_dif)

    # CDF values of both classes at the first point where the gap is maximal.
    peak = _first_close_index(cdf_abs_dif, d)
    pos1, pos2 = cdf1[peak], cdf2[peak]

    y_1 = np.arange(n1) / float(n1)
    y_2 = np.arange(n2) / float(n2)

    # y_k never reaches 1.0, so a CDF value of 1.0 has no matching entry;
    # the helper then falls back to the last sample instead of failing with
    # an IndexError as the previous lookup did.
    idx1 = _first_close_index(y_1, pos1)
    idx2 = _first_close_index(y_2, pos2)

    x_0 = (data1[idx1] + data2[idx2]) / 2.

    plt.clf()
    plt.figure(figsize=(8, 5))
    plt.plot(data1, y_1, label="good sample")
    plt.plot(data2, y_2, label="bad sample")
    plt.legend(loc='lower right')
    plt.plot([x_0, x_0], [y_1[idx1], y_2[idx2]], linestyle="--")
    plt.scatter([x_0, x_0], [y_1[idx1], y_2[idx2]], 50, color='orange')
    plt.xlim([low_bound, high_bound])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('Probability')
    plt.ylabel('F_n(Probability)')
    plt.title('Kolmogorov - Smirnov Chart')
    plt.savefig(fig_path)

    if is_score:
        Y_predvalue = high_bound - Y_predprob
        x0_predvalue = high_bound - x_0
    else:
        Y_predvalue = Y_predprob
        x0_predvalue = x_0

    threshold = round(x0_predvalue, 2)
    return (d, round(x_0, 2),
            round(prec(Y_true, Y_predvalue, threshold), 2),
            round(rec(Y_true, Y_predvalue, threshold), 2))