## Loan approval classifier
In this notebook, an XGBoost loan approval classifier for credit applications is trained. Overview of this notebook:
1. Load data
2. Exploratory data analysis
3. Split features, labels, train, validate and test set
4. Train XGBoost
5. Make predictions
6. Export predictions

To build this classifier an online tutorial was consulted: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894.

### Load libraries and helper code

In [None]:
import sys
import random
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import Markdown, display

# matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

# sklearn
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# xgboost classifier
from xgboost import XGBClassifier

# initialize dataset
from helper_functions import *

### 1. Load data
Load one-hot encoded version of German Credit dataset.

In [None]:
# Instantiate the German Credit dataset wrapper, passing the helper module's
# default pre-processing routine as the custom pre-processing step.
gd = GermanDataset(custom_preprocessing=default_preprocessing)

Convert to pandas data frame.

In [None]:
# convert_to_dataframe() returns an indexable result; only its first element
# (the data frame) is used in the rest of the notebook.
converted = gd.convert_to_dataframe()
df_gd = converted[0]

# Preview the first rows
df_gd.head()

In [None]:
# (number of rows, number of columns) of the converted data frame
df_gd.shape

### 2. Exploratory data analysis

Unbalanced dataset

In [None]:
# Histogram of the target column to show how the two credit classes are distributed
fig, ax = plt.subplots(figsize=(6, 3))
ax.hist(df_gd["credit"])
ax.set_title("credit")
plt.show()

In [None]:
# Age bin edges. pd.cut builds right-closed intervals by default, so the edges
# 17, 25, 35, ... yield the groups (17, 25], (25, 35], ... which are labelled
# "18-25", "26-35", ...
bins = [17, 25, 35, 45, 55, 65, 75]
labels = [f"{lower + 1}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]

# Assign every applicant to an age group
b = pd.cut(df_gd['age'], bins=bins, labels=labels, include_lowest=False)

# Count applicants per (credit, age group); unstack/stack with fill_value=0 makes
# sure every combination is present, then re-index by (age group, credit)
df_grouped_age = (
    df_gd.groupby(['credit', b])
    .size()
    .unstack(fill_value=0)
    .stack()
    .reset_index(name='count')
    .groupby(['age', 'credit'])
    .sum()
)
df_grouped_age

In [None]:
# Percentage strings per age group, keyed by the age-group label
bin_default_dict = {}
bin_non_default_dict = {}

for label in labels:

    # credit == 1.0 is counted as default, credit == 0.0 as non-default
    n_default = df_grouped_age.loc[(label, 1.0)]['count']
    n_non_default = df_grouped_age.loc[(label, 0.0)]['count']
    n_total = n_default + n_non_default

    # share of each outcome within the age group, rounded to whole percent
    key = str(label)
    bin_default_dict[key] = f"{(n_default / n_total) * 100:.0f}%"
    bin_non_default_dict[key] = f"{(n_non_default / n_total) * 100:.0f}%"

bin_default_dict

In [None]:
# Palette (also used by the sex plot further down)
color_dark = "#0043CE"
color_light = "#D9D9D9"

# Stacked horizontal bars: one bar per age group, split by credit value
fig, ax = plt.subplots(figsize=(15, 10))
df_grouped_age.unstack().plot(
    kind='barh',
    stacked=True,
    ax=ax,
    color=[color_dark, color_light],
    edgecolor='k',
)

# Axis titles, tick size and limits
ax.set_xlabel("Frequency", fontsize=20, fontweight='bold', color=color_dark)
ax.set_ylabel("Age group", fontsize=20, fontweight='bold', color=color_dark)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.set_xlim([0, 430])  # leaves room right of the bars for the percentage text
ax.xaxis.set_label_coords(0.5, -0.1)
ax.yaxis.set_label_coords(-0.1, 0.5)

# Write the non-default / default percentages to the right of every bar;
# bar i sits at y == i, and the text is placed just past the bar's total length
text_style = dict(ha="center", va="bottom", fontsize=18, fontweight="bold")
for i, label in enumerate(labels):
    x_coord = df_grouped_age.loc[label]['count'].sum()
    y_coord = i

    ax.text(x_coord + 17.5, y_coord + 0.05, bin_non_default_dict[label], color=color_dark, **text_style)
    ax.text(x_coord + 17.5, y_coord - 0.25, bin_default_dict[label], color='#6B6666', **text_style)

# Legend entries follow the column order of the unstacked frame (credit 0.0, then 1.0)
ax.legend(['No default', 'Default'], fontsize=18)

plt.show()

In [None]:
# Count applicants per (credit, sex). The sex column is used as-is, so no binning
# is needed here; this also leaves the age section's `bins`, `labels` and `b`
# untouched, which keeps the age cells re-runnable.
# unstack/stack with fill_value=0 guarantees every (credit, sex) pair is present,
# then the counts are re-indexed by (sex, credit).
df_grouped_sex = (
    df_gd.groupby(['credit', 'sex'])
    .size()
    .unstack(fill_value=0)
    .stack()
    .reset_index(name='count')
    .groupby(['sex', 'credit'])
    .sum()
)
df_grouped_sex

In [None]:
# Percentage strings per sex value, keyed by the string form of the value ("0.0", "1.0")
bin_default_sex_dict = {}
bin_non_default_sex_dict = {}

# values of the sex column to iterate over (also used by the plotting cell below)
labels = [0.0, 1.0]

# iterate through the sex groups to compute default percentage
for label in labels:

    # credit == 1.0 is counted as default, credit == 0.0 as non-default
    n_default = df_grouped_sex.loc[(label, 1.0)]['count']
    n_non_default = df_grouped_sex.loc[(label, 0.0)]['count']
    n_total = n_default + n_non_default

    # share of each outcome within the group, rounded to whole percent
    key = str(label)
    bin_default_sex_dict[key] = f"{(n_default / n_total) * 100:.0f}%"
    bin_non_default_sex_dict[key] = f"{(n_non_default / n_total) * 100:.0f}%"

bin_default_sex_dict

In [None]:
# Display the non-default percentages per sex value
bin_non_default_sex_dict

In [None]:
# Stacked horizontal bars: one bar per sex value, split by credit value
fig, ax = plt.subplots(figsize=(15, 10))
df_grouped_sex.unstack().plot(
    kind='barh',
    stacked=True,
    ax=ax,
    color=[color_dark, color_light],
    edgecolor='k',
)

# Axis titles, tick size and limits
ax.set_xlabel("Frequency", fontsize=20, fontweight='bold', color=color_dark)
ax.set_ylabel("Sex", fontsize=20, fontweight='bold', color=color_dark)
ax.tick_params(axis='both', which='major', labelsize=16)
ax.set_xlim([0, 790])  # leaves room right of the bars for the percentage text
ax.xaxis.set_label_coords(0.5, -0.1)
ax.yaxis.set_label_coords(-0.1, 0.5)

# Bars are drawn in index order (sex 0.0 first, then 1.0).
# NOTE(review): this assumes 0.0 encodes male and 1.0 encodes female - confirm
# against the pre-processing in helper_functions.
ax.set_yticklabels(['male', 'female'])

# Write the non-default / default percentages to the right of every bar;
# bar i sits at y == i, and the text is placed just past the bar's total length
text_style = dict(ha="center", va="bottom", fontsize=18, fontweight="bold")
for i, label in enumerate(labels[:2]):
    x_coord = df_grouped_sex.loc[label]['count'].sum()
    y_coord = i
    key = "%s" % label

    ax.text(x_coord + 35, y_coord + 0.1, bin_non_default_sex_dict[key], color=color_dark, **text_style)
    ax.text(x_coord + 35, y_coord - 0.15, bin_default_sex_dict[key], color='#6B6666', **text_style)

# Legend entries follow the column order of the unstacked frame (credit 0.0, then 1.0)
ax.legend(['No default', 'Default'], fontsize=18)

plt.show()

### 3. Split features, labels, train, validate and test set

In [None]:
# Target is the credit column; every other column is used as a feature
y = df_gd['credit']
X = df_gd.drop(columns='credit')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

### 4. Train XGBoost

Initialize model and specify model parameters

In [None]:
# Classifier trained in the next cell. It uses XGBoost's default hyper-parameters.
# NOTE: hyper-parameters only take effect when passed to this constructor call
# (e.g. XGBClassifier(max_depth=3, n_estimators=100)); a separate, unassigned
# XGBClassifier(...) expression does not configure `xgbc`.
xgbc = XGBClassifier()

# Display the estimator that will actually be fitted
xgbc

In [None]:
# Fit the classifier on the training split
xgbc.fit(X_train, y_train)

### 5. Make predictions
Predict

In [None]:
# Predicted class for every row of the held-out test split
y_hat = xgbc.predict(X_test)

Merging predictions to dataframe

In [None]:
# Collect predicted and true classes in a frame that shares the test set's index
predictions_col = pd.DataFrame(
    {
        'predicted_class': y_hat.tolist(),
        'true_class': y_test.tolist(),
    },
    index=X_test.index,
)

# Absolute difference between prediction and truth: 0 when correct, 1 when wrong
# for 0/1 classes
class_diff = predictions_col['predicted_class'] - predictions_col['true_class']
predictions_col['errors'] = class_diff.abs()

# Attach predictions, true classes and errors to the test features by index
entire_dataset = pd.merge(X_test, predictions_col, left_index=True, right_index=True)
entire_dataset.head()

#### Confusion matrix

In [None]:
predictions = entire_dataset["predicted_class"]
true = entire_dataset["true_class"]

# Confusion matrix with class 1 listed first; normalize='pred' scales every
# column (predicted class) to sum to 1
cm = confusion_matrix(true, predictions, labels=[1, 0], normalize='pred')

# Two-colour gradient used to shade the matrix cells
cmap0 = mpl.colors.LinearSegmentedColormap.from_list(
        'unevently divided', ['#618EC7','#fffde4'])

# Tick labels follow the order given in `labels` above: 'True' -> class 1, 'False' -> class 0
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['True', 'False'])
disp.plot(cmap=cmap0)

# accuracy_score expects (y_true, y_pred); pass the arguments in the documented
# order, consistent with the confusion_matrix call above
print('Acc: ', accuracy_score(true, predictions))

### 6. Export data

In [None]:
# Write test features plus predicted_class / true_class / errors to CSV.
# index=False means the row index of the test split is not written to the file.
entire_dataset.to_csv('./pred_XGBoost.csv', index=False)