# 8.1.0. Score Band Analysis

In [1]:
import pandas as pd
import numpy as np
import pickle
import yaml
from pathlib import Path
from src.utils import calculate_metrics, load_pickle

In [2]:
def ks2(Y_test: pd.DataFrame, Y_scores: pd.DataFrame, predict_column_name: str,
        target: str, n_bins: int = 10, strategy: str = "quantile", tree_kwargs = None) -> pd.DataFrame:
    
    df_aux = pd.DataFrame([Y_test, Y_scores]).T
    
    from sklearn.preprocessing import KBinsDiscretizer
    from sklearn.tree import DecisionTreeClassifier
    
    if strategy == "tree":
        dt = DecisionTreeClassifier(**tree_kwargs).fit(df_aux[[predict_column_name]], df_aux[target])
        print(dt)
        df_aux["bucket"] = dt.apply(df_aux[[predict_column_name]])
        class_values = sorted(df_aux["bucket"].unique())
        n = len(class_values)
        print(n)
        
        def p(new_df, bin_column_name = "risk_tier"):
            new_df.loc[:,bin_column_name] = dt.apply(new_df[[predict_column_name]])
            new_df[bin_column_name] = new_df[bin_column_name].map(dict(zip(class_values,class_labels)))
            return new_df

    else:
        est = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy=strategy)
        Y_scores = df_aux[[predict_column_name]]
        est.fit(Y_scores)
        df_aux["bucket"] = est.transform(Y_scores)

    
    kstable = df_aux.groupby("bucket", as_index=False).agg(
        {predict_column_name: ["min", "max"], target: ["sum", "count"]})
    kstable.columns = ["_".join(a) for a in kstable.columns.to_flat_index()]
    kstable = kstable.rename(
        columns={
            f"{target}_count": "clients",
            f"{target}_sum": "bads"
            })
    kstable["goods"] = (
        kstable["clients"] -
        kstable["bads"]).astype("int")
    kstable["bads_rate"] = round(
        kstable["bads"] / df_aux[df_aux[target] == 1][target].count() * 100,
        3
    )
    kstable["goods_rate"] = round(
        kstable["goods"] /
        df_aux[df_aux[target] == 0][target].count() * 100,
        3
    )
    kstable["cum_bads_rate"] = kstable["bads_rate"].cumsum()
    kstable["cum_goods_rate"] = kstable["goods_rate"].cumsum()
    kstable["KS"] = (
        kstable["cum_goods_rate"] -
        kstable["cum_bads_rate"]).astype("float")
    
    
    return kstable[['bucket_', f'{predict_column_name}_min', f'{predict_column_name}_max', 'clients', 'bads',
       'goods', 'bads_rate', 'goods_rate', 'cum_bads_rate', 'cum_goods_rate',
       'KS']]


## 1. Data Preparation 

In [3]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
    
numeric_features = config["filter_features"]["numerical"]

features = numeric_features
target = config["main"]["target"]
test_data_path = Path.cwd().parent / config["main"]["data_test_path"]

test_df = pd.read_pickle(test_data_path)

model_path = Path.cwd().parent / config["main"]["model_path"]
model = load_pickle(Path.cwd().parent / config["main"]["model_path"])

X_test, Y_test = test_df[features], test_df[target]

test_df["score"] = model.predict_proba(X_test)[:, 1]


### - KS by Bin

In [4]:
predict_column_name = 'score'
ks_df = ks2(
    Y_test=Y_test, 
    Y_scores=test_df[predict_column_name],
    predict_column_name=predict_column_name,
    target=target,
    strategy="quantile",
    n_bins=10
)
ks_df.round(3)



Unnamed: 0,bucket_,score_min,score_max,clients,bads,goods,bads_rate,goods_rate,cum_bads_rate,cum_goods_rate,KS
0,0.0,0.309,0.381,376,35.0,341,5.385,10.673,5.385,10.673,5.288
1,1.0,0.381,0.415,393,51.0,342,7.846,10.704,13.231,21.377,8.146
2,2.0,0.415,0.44,383,61.0,322,9.385,10.078,22.616,31.455,8.839
3,3.0,0.44,0.46,386,67.0,319,10.308,9.984,32.924,41.439,8.515
4,4.0,0.46,0.481,384,67.0,317,10.308,9.922,43.232,51.361,8.129
5,5.0,0.481,0.506,385,75.0,310,11.538,9.703,54.77,61.064,6.294
6,6.0,0.506,0.514,74,13.0,61,2.0,1.909,56.77,62.973,6.203
7,7.0,0.514,0.534,1079,192.0,887,29.538,27.762,86.308,90.735,4.427
8,8.0,0.534,0.649,385,89.0,296,13.692,9.264,100.0,99.999,-0.001


In [9]:
test_df_aux = test_df[test_df["credit_reports__loans_count"]>0]
X_test_aux, Y_test_aux = test_df_aux[features], test_df_aux[target]

predict_column_name = 'score'
ks_df = ks2(
    Y_test=Y_test_aux, 
    Y_scores=test_df_aux[predict_column_name],
    predict_column_name=predict_column_name,
    target=target,
    strategy="quantile",
    n_bins=10
)
ks_df.round(3)

Unnamed: 0,bucket_,score_min,score_max,clients,bads,goods,bads_rate,goods_rate,cum_bads_rate,cum_goods_rate,KS
0,0.0,0.309,0.377,261,16.0,245,3.628,11.311,3.628,11.311,7.683
1,1.0,0.377,0.408,261,36.0,225,8.163,10.388,11.791,21.699,9.908
2,2.0,0.408,0.431,260,34.0,226,7.71,10.434,19.501,32.133,12.632
3,3.0,0.431,0.444,261,46.0,215,10.431,9.926,29.932,42.059,12.127
4,4.0,0.444,0.461,260,45.0,215,10.204,9.926,40.136,51.985,11.849
5,5.0,0.461,0.477,261,46.0,215,10.431,9.926,50.567,61.911,11.344
6,6.0,0.477,0.492,261,49.0,212,11.111,9.788,61.678,71.699,10.021
7,7.0,0.492,0.515,260,48.0,212,10.884,9.788,72.562,81.487,8.925
8,8.0,0.515,0.553,261,57.0,204,12.925,9.418,85.487,90.905,5.418
9,9.0,0.554,0.649,261,64.0,197,14.512,9.095,99.999,100.0,0.001
