# Loan Approval Prediction

This notebook analyses the Loan Approval Prediction data from the Kaggle Playground Series (Season 4, Episode 10):

- https://www.kaggle.com/competitions/playground-series-s4e10/data

### Steps:
- Step 0: Imports
- Step 1: Load Data
- Step 2: Plot Data
- Step 3: Information Values

### Models

**TODO:** describe the models used.

### Results

**TODO:** summarise the results.

# Step 0: Imports

In [None]:
#--------------------
# General
import pandas as pd
import numpy as np
import copy
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)
pd.options.display.float_format = '{:,.2f}'.format
#--------------------

#--------------------
# Category Encoder
import category_encoders as ce
#--------------------

#--------------------
# Seaborn
import seaborn as sns
import matplotlib.pyplot as plt
#--------------------

#--------------------
# Models
# XGBoost
from xgboost import XGBClassifier

# LightGBM
from lightgbm import LGBMClassifier
#--------------------

#--------------------
# SHAP Explainability
import shap
shap.initjs()
#--------------------

#--------------------
# Model Performance
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
#--------------------

#--------------------
# Gini by split
def gini_group(y_true=None, y_score=None, group=None, round=2):
    """
    Gini coefficient (2 * AUC - 1) computed separately for each level of `group`.

    Parameters:
        y_true: Labels; must support boolean indexing with `group == level`.
        y_score: Scores; indexed the same way as `y_true`.
        group: Object exposing `.unique()` (e.g. a pd.Series) assigning each row to a level.
        round (int): Number of decimals passed to `DataFrame.round` on the result.

    Returns:
        pd.DataFrame: One row per level, with columns 'group' and 'Gini'.
    """
    def _gini_for(mask):
        # Gini is a linear rescaling of the ROC AUC of the selected rows
        auc = roc_auc_score(y_true=y_true[mask], y_score=y_score[mask])
        return auc*2-1

    levels = group.unique()
    ginis = [_gini_for(group == level) for level in levels]
    return pd.DataFrame({'group': levels, 'Gini': ginis}).round(round)
#--------------------

#--------------------
# ROC Curve
def roc_curve_group(y_true=None, y_score=None, group=None, round=2):
    """
    Calculate the ROC curve for each level of `group`.

    Parameters:
        y_true: Labels; must support boolean indexing with `group == level`.
        y_score: Scores; indexed the same way as `y_true`.
        group: Object exposing `.unique()` (e.g. a pd.Series) assigning each row to a level.
        round (int): Number of decimals passed to `DataFrame.round` on the result.

    Returns:
        pd.DataFrame: The curves of all levels stacked row-wise, with columns
        'Model' (the level), 'fpr' and 'tpr'. Empty when `group` has no levels.
    """
    curves = []
    for this_level in group.unique():
        mask = group == this_level
        fpr, tpr, _ = roc_curve(y_true=y_true[mask], y_score=y_score[mask])
        curves.append(pd.DataFrame({'Model': this_level, 'fpr': fpr, 'tpr': tpr}))

    if not curves:
        # pd.concat rejects an empty list; keep returning an empty frame in that case
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop
    return pd.concat(curves, axis=0).round(round)
#--------------------

# Step 1: Load Data

In [None]:
# Read the training file, then discard the 'id' column
df = pd.read_csv("train.csv")
df = df.drop(columns='id')

# Print the frame summary
df.info()

# Step 2: Plot Data

### Response

### Numerical Features 

### Categorical Features

In [None]:
# Plot Response

# Class counts with explicit column names, so the result does not depend on the
# pandas version (value_counts().reset_index() named its columns differently before pandas 2.0)
df_smry = (
    df['loan_status']
    .value_counts()
    .rename_axis('loan_status')
    .reset_index(name='count')
)
df_smry['loan_status'] = df_smry['loan_status'].map({0: 'Good', 1: 'Bad'})

# Response plot
fig, ax = plt.subplots(figsize=(4, 3))
sns.barplot(data=df_smry, x='loan_status', y='count', ax=ax)
ax.bar_label(ax.containers[0])
ax.set_xlabel('Credit Risk Type')
ax.set_ylabel('Count of Examples')
# Leave headroom above the tallest bar so its label is not clipped
ax.set_ylim(0, df_smry['count'].max() * 1.2)
ax.set_title('Count of Goods and Bads')
plt.show()

# Population default rate (mean of the 0/1 response)
print('---------------')
print('Population Default Rate')
print(df['loan_status'].mean().round(2))
print('---------------')

In [None]:
# Plot Numeric Features

# Numeric columns, with the response removed
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop('loan_status')
num_numeric_cols = len(numeric_cols)

# Grid shape: three panels per row, row count rounded up
cols = 3
rows = -(-num_numeric_cols // cols)

# One flat sequence of axes, filled left to right, top to bottom
fig, axes = plt.subplots(rows, cols, figsize=(10, 3 * rows))
axes = axes.flatten()

# One density per response class for every numeric feature
kde_kwargs = dict(hue='loan_status', fill=True, common_norm=False,
                  cut=0, alpha=0.5, bw_adjust=2.5)
for panel, this_var in zip(axes, numeric_cols):
    sns.kdeplot(data=df, x=this_var, ax=panel, **kde_kwargs)
fig.suptitle('Stratified Density Plots', fontsize=16)

# Panels beyond the last feature have nothing drawn on them, so hide them
for spare_panel in axes[num_numeric_cols:]:
    spare_panel.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Plot Categorical Features

# Every non-numeric column is treated as categorical
categorical_cols = df.select_dtypes(exclude=[np.number]).columns
num_categorical_cols = len(categorical_cols)

# Grid shape: two panels per row, row count rounded up
cols = 2
rows = -(-num_categorical_cols // cols)

# One flat sequence of axes, filled left to right, top to bottom
fig, axes = plt.subplots(rows, cols, figsize=(10, 5 * rows))
axes = axes.flatten()

# One bar chart per categorical feature: mean of the response within each level
for ax, col in zip(axes, categorical_cols):
    df_smry = df.groupby(col)[['loan_status']].mean().reset_index().round(2)

    sns.barplot(data=df_smry, x=col, y='loan_status', ax=ax)
    ax.bar_label(ax.containers[0])
    ax.set(xlabel=col, ylabel='Bad Rate', ylim=(0, 1), title=f'Bad Rate by {col}')
    ax.tick_params(axis='x', rotation=90)

# Panels beyond the last feature have nothing drawn on them, so hide them
for spare_ax in axes[num_categorical_cols:]:
    spare_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Step 3: Information Values

In [147]:
def woe_iv(feature=None, target=None, feature_name=None, num_binns=4):
    """
    Calculate Weight of Evidence (WoE) and Information Value (IV) for a given feature.

    Numeric features are binned at the quantiles of the feature among the rows where
    target == 1; the outermost edges are opened to -inf / +inf so that every non-missing
    value falls into a bin. Non-numeric features are used as they are, one row per level.

    Parameters:
        feature (pd.Series): The feature column for which WoE and IV need to be calculated.
        target (pd.Series): The target column (binary, coded 0 / 1) against which IV is calculated.
        feature_name (str, optional): Label written to the 'feature' column of the table.
            Defaults to `feature.name` when omitted.
        num_binns (int): Number of quantile bins for numeric features. Fewer bins are
            produced when several quantile edges coincide.

    Returns:
        tuple: A tuple containing:
            - tbl (pd.DataFrame): One row per bin / level with columns
              'feature', 'level', 0, 1, '% 0', '% 1', 'woe', 'iv_comp'.
            - iv (float): The Information Value for the feature (sum of 'iv_comp').

    Notes:
        A bin / level with no observations in one of the classes has a zero share, so its
        WoE is infinite (log of zero or division by zero) and this propagates into IV.
    """
    if feature_name is None:
        # Fall back to the Series' own name so the 'feature' column is still informative
        feature_name = feature.name

    # Check if the feature is numeric
    if np.issubdtype(feature.dtype, np.number):
        # Quantile edges of the feature among target == 1 rows
        quantile_points = np.linspace(0, 1, num_binns + 1)
        edges = feature[target == 1].quantile(quantile_points).to_numpy(dtype=float)
        # Tied quantiles give repeated edges, which pd.cut / pd.qcut reject; keep unique ones
        edges = np.unique(edges)
        # Replace the outer edges with +/- infinity so no value falls outside the bins
        bins = np.concatenate(([-np.inf], edges[1:-1], [np.inf]))
        # Bin the feature values
        binned = pd.cut(feature, bins=bins)
    else:
        # Use categorical feature as is
        binned = feature

    # Counts of target = 0 / 1 per bin or level
    tbl = pd.crosstab(index=binned, columns=target)
    # Share of each class that falls into the bin
    tbl['% 0'] = tbl[0] / tbl[0].sum()
    tbl['% 1'] = tbl[1] / tbl[1].sum()
    # Weight of Evidence (WoE)
    tbl['woe'] = np.log(tbl['% 0'] / tbl['% 1'])
    # IV component for each bin
    tbl['iv_comp'] = (tbl['% 0'] - tbl['% 1']) * tbl['woe']

    # Blank the columns' axis name (crosstab sets it from the target) for cleaner output
    tbl.columns.name = ''
    # Name the bin / level axis directly, independent of the Series name or feature_name
    tbl.index.name = 'level'
    tbl = tbl.reset_index()
    # Feature label goes first
    tbl.insert(0, 'feature', feature_name)

    # Total Information Value (IV); skipna=False keeps NaN components visible in the total
    iv = tbl['iv_comp'].sum(skipna=False)

    return tbl, iv

# Extract all features excluding the target column
all_features = df.drop(columns='loan_status').columns.to_list()
num_features = len(all_features)

# WoE table of every feature, stacked with a single concat
# (avoids growing a frame inside a loop, starting from an empty DataFrame)
woe_results = pd.concat(
    [
        woe_iv(feature=df[this_feature],
               target=df['loan_status'],
               feature_name=this_feature,
               num_binns=4)[0]
        for this_feature in all_features
    ]
)

# IV per feature = sum of its bins' IV components, largest first
(
    woe_results
    .groupby('feature')['iv_comp']
    .sum()
    .reset_index()
    .sort_values('iv_comp', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,feature,iv_comp
0,loan_grade,1.24
1,loan_percent_income,1.07
2,loan_int_rate,0.98
3,person_home_ownership,0.6
4,person_income,0.48
5,cb_person_default_on_file,0.23
6,loan_amnt,0.16
7,person_emp_length,0.11
8,loan_intent,0.1
9,person_age,0.0


In [149]:
# Show the dtype of every column in df
df.dtypes

person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
loan_status                     int64
dtype: object