# 8.1.0. Score Band Analysis

### Overview
This section presents the analysis of score bands derived from the final model. Score bands help to segment the population based on the risk of default. The segments are created using the score predicted by the model, allowing us to observe the distribution of bad and good loans across different score ranges.

The Score Band Analysis provides valuable insights into the model's performance across different risk segments. It helps in understanding the model's discrimination ability across various thresholds and is crucial for setting appropriate cut-offs based on the companyâ€™s risk appetite.

In [1]:
import pandas as pd
import numpy as np
import pickle
import yaml
from pathlib import Path
from src.utils import calculate_metrics, load_pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
def ks2(Y_test: pd.DataFrame, Y_scores: pd.DataFrame, predict_column_name: str,
        target: str, n_bins: int = 10, strategy: str = "quantile", tree_kwargs = None) -> pd.DataFrame:
    
    df_aux = pd.DataFrame([Y_test, Y_scores]).T
    
    from sklearn.preprocessing import KBinsDiscretizer
    from sklearn.tree import DecisionTreeClassifier
    
    if strategy == "tree":
        dt = DecisionTreeClassifier(**tree_kwargs).fit(df_aux[[predict_column_name]], df_aux[target])
        print(dt)
        df_aux["bucket"] = dt.apply(df_aux[[predict_column_name]])
        class_values = sorted(df_aux["bucket"].unique())
        n = len(class_values)
        print(n)
        
        def p(new_df, bin_column_name = "risk_tier"):
            new_df.loc[:,bin_column_name] = dt.apply(new_df[[predict_column_name]])
            new_df[bin_column_name] = new_df[bin_column_name].map(dict(zip(class_values,class_labels)))
            return new_df

    else:
        est = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy=strategy)
        Y_scores = df_aux[[predict_column_name]]
        est.fit(Y_scores)
        df_aux["bucket"] = est.transform(Y_scores)

    
    kstable = df_aux.groupby("bucket", as_index=False).agg(
        {predict_column_name: ["min", "max"], target: ["sum", "count"]})
    kstable.columns = ["_".join(a) for a in kstable.columns.to_flat_index()]
    kstable = kstable.rename(
        columns={
            f"{target}_count": "clients",
            f"{target}_sum": "bads"
            })
    kstable["goods"] = (
        kstable["clients"] -
        kstable["bads"]).astype("int")
    kstable["bads_rate"] = round(
        kstable["bads"] / df_aux[df_aux[target] == 1][target].count() * 100,
        3
    )
    kstable["goods_rate"] = round(
        kstable["goods"] /
        df_aux[df_aux[target] == 0][target].count() * 100,
        3
    )
    kstable["cum_bads_rate"] = kstable["bads_rate"].cumsum()
    kstable["cum_goods_rate"] = kstable["goods_rate"].cumsum()
    kstable["KS"] = (
        kstable["cum_goods_rate"] -
        kstable["cum_bads_rate"]).astype("float")
    
    
    return kstable[['bucket_', f'{predict_column_name}_min', f'{predict_column_name}_max', 'clients', 'bads',
       'goods', 'bads_rate', 'goods_rate', 'cum_bads_rate', 'cum_goods_rate',
       'KS']]


## 1. Data Preparation 

In [3]:
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)
    
numeric_features = config["filter_features"]["numerical"]

features = numeric_features
target = config["main"]["target"]
test_data_path = Path.cwd().parent / config["main"]["data_test_path"]

test_df = pd.read_pickle(test_data_path)

model_path = Path.cwd().parent / config["main"]["model_path"]
model = load_pickle(Path.cwd().parent / config["main"]["model_path"])

X_test, Y_test = test_df[features], test_df[target]

test_df["score"] = model.predict_proba(X_test)[:, 1]


### - KS by Bin

#### Overall 

In [4]:
predict_column_name = 'score'
ks_df = ks2(
    Y_test=Y_test, 
    Y_scores=test_df[predict_column_name],
    predict_column_name=predict_column_name,
    target=target,
    strategy="quantile",
    n_bins=10
)
ks_df.round(3)

Unnamed: 0,bucket_,score_min,score_max,clients,bads,goods,bads_rate,goods_rate,cum_bads_rate,cum_goods_rate,KS
0,0.0,0.309,0.381,376,35.0,341,5.385,10.673,5.385,10.673,5.288
1,1.0,0.381,0.415,393,51.0,342,7.846,10.704,13.231,21.377,8.146
2,2.0,0.415,0.44,383,61.0,322,9.385,10.078,22.616,31.455,8.839
3,3.0,0.44,0.46,386,67.0,319,10.308,9.984,32.924,41.439,8.515
4,4.0,0.46,0.481,384,67.0,317,10.308,9.922,43.232,51.361,8.129
5,5.0,0.481,0.506,385,75.0,310,11.538,9.703,54.77,61.064,6.294
6,6.0,0.506,0.514,74,13.0,61,2.0,1.909,56.77,62.973,6.203
7,7.0,0.514,0.534,1079,192.0,887,29.538,27.762,86.308,90.735,4.427
8,8.0,0.534,0.649,385,89.0,296,13.692,9.264,100.0,99.999,-0.001


#### Loans Count = 0

In [5]:
test_df_aux = test_df[test_df["credit_reports__loans_count"]==0]
X_test_aux, Y_test_aux = test_df_aux[features], test_df_aux[target]

predict_column_name = 'score'
ks_df = ks2(
    Y_test=Y_test_aux, 
    Y_scores=test_df_aux[predict_column_name],
    predict_column_name=predict_column_name,
    target=target,
    strategy="quantile",
    n_bins=5
)
ks_df.round(3)

Unnamed: 0,bucket_,score_min,score_max,clients,bads,goods,bads_rate,goods_rate,cum_bads_rate,cum_goods_rate,KS
0,0.0,0.327,0.459,248,39.0,209,18.66,20.311,18.66,20.311,1.651
1,1.0,0.46,0.514,990,170.0,820,81.34,79.689,100.0,100.0,0.0


#### Loans Count > 0 

In [6]:
test_df_aux = test_df[test_df["credit_reports__loans_count"]>0]
X_test_aux, Y_test_aux = test_df_aux[features], test_df_aux[target]

predict_column_name = 'score'
ks_df = ks2(
    Y_test=Y_test_aux, 
    Y_scores=test_df_aux[predict_column_name],
    predict_column_name=predict_column_name,
    target=target,
    strategy="quantile",
    n_bins=10
)
ks_df.round(2)

Unnamed: 0,bucket_,score_min,score_max,clients,bads,goods,bads_rate,goods_rate,cum_bads_rate,cum_goods_rate,KS
0,0.0,0.31,0.38,261,16.0,245,3.63,11.31,3.63,11.31,7.68
1,1.0,0.38,0.41,261,36.0,225,8.16,10.39,11.79,21.7,9.91
2,2.0,0.41,0.43,260,34.0,226,7.71,10.43,19.5,32.13,12.63
3,3.0,0.43,0.44,261,46.0,215,10.43,9.93,29.93,42.06,12.13
4,4.0,0.44,0.46,260,45.0,215,10.2,9.93,40.14,51.98,11.85
5,5.0,0.46,0.48,261,46.0,215,10.43,9.93,50.57,61.91,11.34
6,6.0,0.48,0.49,261,49.0,212,11.11,9.79,61.68,71.7,10.02
7,7.0,0.49,0.51,260,48.0,212,10.88,9.79,72.56,81.49,8.92
8,8.0,0.51,0.55,261,57.0,204,12.92,9.42,85.49,90.9,5.42
9,9.0,0.55,0.65,261,64.0,197,14.51,9.1,100.0,100.0,0.0


In [15]:
test_df_aux["risk_bands"] = np.where(
    test_df_aux["score"] < 0.44, "low", np.where(test_df_aux["score"] < 0.49, "medium", np.where(test_df_aux["score"]>=0.49, "high", ""))
)
test_df_aux.groupby(["risk_bands"]).agg({target: ["count", "sum", "mean"]})

Unnamed: 0_level_0,target,target,target
Unnamed: 0_level_1,count,sum,mean
risk_bands,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
high,806,171,0.212159
low,936,112,0.119658
medium,865,158,0.182659
