## Environment Setup

In [1]:
# import pandas, numpy, seaborn, pyplot
import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
%matplotlib inline

In [2]:
# import train_test_split, RepeatedStratifiedKFold, cross_val_score, LogisticRegression, roc_curve, roc_auc_score,
# confusion_matrix, precision_recall_curve, auc, f_classif, Pipeline, BaseEstimator, TransformerMixin, chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.feature_selection import f_classif
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from scipy.stats import chi2_contingency

## Import Data

In [3]:
# Load the raw credit-risk data set from the CSV file in the working directory.
# low_memory=False is passed through to pandas unchanged.
df = pd.read_csv(
    'credit_risk_dataset.csv',
    low_memory=False,
)

## Data Exploration

In [4]:
# explore the dataset (use the data dictionary as a reference)
# look at the columns, the data type of each column, the range of values in each column, their mean, etc.
# start with the first five rows
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000,5000,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000,3000,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [5]:
# (number of rows, number of columns) of the raw dataset
df.shape

(466285, 74)

In [6]:
# per-column dtype and non-null count
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 74 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           466285 non-null  int64  
 1   member_id                    466285 non-null  int64  
 2   loan_amnt                    466285 non-null  int64  
 3   funded_amnt                  466285 non-null  int64  
 4   funded_amnt_inv              466285 non-null  float64
 5   term                         466285 non-null  object 
 6   int_rate                     466285 non-null  float64
 7   installment                  466285 non-null  float64
 8   grade                        466285 non-null  object 
 9   sub_grade                    466285 non-null  object 
 10  emp_title                    438697 non-null  object 
 11  emp_length                   445277 non-null  object 
 12  home_ownership               466285 non-null  object 
 13 

In [7]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
count,466285.0,466285.0,466285.0,466285.0,466285.0,466285.0,466285.0,466281.0,466285.0,466256.0,...,0.0,0.0,0.0,0.0,0.0,0.0,396009.0,0.0,0.0,0.0
mean,13079730.0,14597660.0,14317.277577,14291.801044,14222.329888,13.829236,432.061201,73277.38,17.218758,0.284678,...,,,,,,,30379.09,,,
std,10893710.0,11682370.0,8286.509164,8274.3713,8297.637788,4.357587,243.48555,54963.57,7.851121,0.797365,...,,,,,,,37247.13,,,
min,54734.0,70473.0,500.0,500.0,0.0,5.42,15.67,1896.0,0.0,0.0,...,,,,,,,0.0,,,
25%,3639987.0,4379705.0,8000.0,8000.0,8000.0,10.99,256.69,45000.0,11.36,0.0,...,,,,,,,13500.0,,,
50%,10107900.0,11941080.0,12000.0,12000.0,12000.0,13.66,379.89,63000.0,16.87,0.0,...,,,,,,,22800.0,,,
75%,20731210.0,23001540.0,20000.0,20000.0,19950.0,16.49,566.58,88960.0,22.78,0.0,...,,,,,,,37900.0,,,
max,38098110.0,40860830.0,35000.0,35000.0,35000.0,26.06,1409.99,7500000.0,39.99,29.0,...,,,,,,,9999999.0,,,


In [8]:
# get a list of the columns whose share of null values is below 80%
# (these are the columns that are kept; every other column is at least 80% null)
df.columns[df.isnull().mean() < 0.8]

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'policy_code', 'application_type', 'acc_now_delinq', 'tot_coll_amt',
       'tot_cur_bal', 'total_rev_hi_lim'],
      dtype='object')

In [9]:
# Keep only the columns whose fraction of null values is below 80%;
# the mostly-empty columns are discarded.
null_share = df.isnull().mean()
df = df.loc[:, null_share < 0.8]
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,1077501,1296599,5000,5000,4975.0,36 months,10.65,162.87,B,B2,...,,Jan-16,0.0,,1,INDIVIDUAL,0.0,,,
1,1077430,1314167,2500,2500,2500.0,60 months,15.27,59.83,C,C4,...,,Sep-13,0.0,,1,INDIVIDUAL,0.0,,,
2,1077175,1313524,2400,2400,2400.0,36 months,15.96,84.33,C,C5,...,,Jan-16,0.0,,1,INDIVIDUAL,0.0,,,
3,1076863,1277178,10000,10000,10000.0,36 months,13.49,339.31,C,C1,...,,Jan-15,0.0,,1,INDIVIDUAL,0.0,,,
4,1075358,1311748,3000,3000,3000.0,60 months,12.69,67.79,B,B5,...,Feb-16,Jan-16,0.0,,1,INDIVIDUAL,0.0,,,


In [10]:
# inspect the remaining columns to decide which redundant and forward-looking columns to drop
# (this cell only displays the column summary; nothing is dropped here)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 56 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           466285 non-null  int64  
 1   member_id                    466285 non-null  int64  
 2   loan_amnt                    466285 non-null  int64  
 3   funded_amnt                  466285 non-null  int64  
 4   funded_amnt_inv              466285 non-null  float64
 5   term                         466285 non-null  object 
 6   int_rate                     466285 non-null  float64
 7   installment                  466285 non-null  float64
 8   grade                        466285 non-null  object 
 9   sub_grade                    466285 non-null  object 
 10  emp_title                    438697 non-null  object 
 11  emp_length                   445277 non-null  object 
 12  home_ownership               466285 non-null  object 
 13 

In [11]:
# Drop the redundant columns.
# TODO: the drop list is not complete yet.
# DataFrame.drop(..., inplace=True) returns None, so its result must not be assigned;
# reassigning `df` to the returned frame also avoids an in-place mutation of a frame
# that was derived from a column selection (SettingWithCopyWarning).
df = df.drop(
    columns=[
        'id',
        'member_id',
        'zip_code',
        'emp_title',
        'verification_status',
        'issue_d',
        'pymnt_plan',
        'url',
        'addr_state',
        'delinq_2yrs',
        'application_type',
        'desc',
    ]
)

In [12]:
# re-explore the dataset after dropping the redundant columns
# (column names, dtypes and non-null counts of what is left)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466285 entries, 0 to 466284
Data columns (total 44 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   loan_amnt                    466285 non-null  int64  
 1   funded_amnt                  466285 non-null  int64  
 2   funded_amnt_inv              466285 non-null  float64
 3   term                         466285 non-null  object 
 4   int_rate                     466285 non-null  float64
 5   installment                  466285 non-null  float64
 6   grade                        466285 non-null  object 
 7   sub_grade                    466285 non-null  object 
 8   emp_length                   445277 non-null  object 
 9   home_ownership               466285 non-null  object 
 10  annual_inc                   466281 non-null  float64
 11  loan_status                  466285 non-null  object 
 12  purpose                      466285 non-null  object 
 13 

## Identify the target variable

In [13]:
# identify the target variable (target column); there is no need to code this, just identify it
# Here the target column is loan_status.

In [14]:
# List the distinct values that occur in the target column.
df.loc[:, 'loan_status'].unique()

array(['Fully Paid', 'Charged Off', 'Current', 'Default',
       'Late (31-120 days)', 'In Grace Period', 'Late (16-30 days)',
       'Does not meet the credit policy. Status:Fully Paid',
       'Does not meet the credit policy. Status:Charged Off'],
      dtype=object)

In [15]:
# create a new column based on the target column that will be our target variable
# NOTE(review): this cell currently only profiles the mean of every numeric feature per
# loan status; the new target column itself is not created here yet -- TODO.
# numeric_only=True restricts the aggregation to the numeric columns: pandas >= 2.0 no
# longer drops the non-numeric (object) columns silently and raises a TypeError instead.
loan_status = df.groupby('loan_status').mean(numeric_only=True)
loan_status

Unnamed: 0_level_0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,inq_last_6mths,mths_since_last_delinq,open_acc,...,recoveries,collection_recovery_fee,last_pymnt_amnt,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
loan_status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Charged Off,14518.758682,14470.241318,14342.342678,15.998834,439.992667,64750.370887,18.21997,0.994585,34.308502,10.999317,...,926.523955,96.189387,454.438219,0.006993,42.481076,1.0,0.003602,127.370317,114522.093018,26195.836948
Current,15177.366474,15172.779584,15164.954547,13.773271,447.410438,74703.80479,18.026668,0.695584,33.349143,11.529832,...,0.0,0.0,457.937174,0.012376,42.782234,1.0,0.005191,184.464786,140668.380119,31438.554947
Default,15284.885817,15284.885817,15279.967293,16.136394,455.537296,65611.8475,19.253089,0.917067,33.953202,11.971154,...,0.0,0.0,477.220733,0.008413,43.317757,1.0,0.00601,132.382064,109985.603194,27184.632678
Does not meet the credit policy. Status:Charged Off,9527.233903,9248.127464,5807.001406,14.597148,305.158804,69525.915033,14.343732,4.672823,25.903433,10.001319,...,579.002047,122.187443,305.497911,0.0,,1.0,0.0,,,
Does not meet the credit policy. Status:Fully Paid,8853.231891,8679.376258,6411.141563,13.978642,287.069225,72145.418271,14.107173,4.00051,27.46679,10.102446,...,0.0,0.0,2178.940287,0.0,,1.0,0.002039,,,
Fully Paid,13214.394226,13169.988064,13072.610934,13.255943,411.086245,73709.612302,15.937997,0.839346,35.339155,10.794407,...,0.0,0.0,7165.937941,0.005469,43.037954,1.0,0.002598,219.721618,143006.253262,29996.098672
In Grace Period,16128.289892,16128.289892,16120.110393,15.827072,486.777092,74033.926462,18.801208,0.831214,32.568089,11.719644,...,0.0,0.0,546.211116,0.01335,43.729143,1.0,0.003179,219.261688,130041.024675,25975.458766
Late (16-30 days),15800.779967,15800.779967,15793.313843,15.964466,477.734885,72390.530665,18.625952,0.904762,30.740469,11.692939,...,0.0,0.0,534.378596,0.021346,42.072193,1.0,0.004926,215.194794,133734.06717,26412.624685
Late (31-120 days),15553.757246,15549.630435,15542.374393,15.947754,465.948607,69316.828822,19.090399,0.880145,32.760579,11.729275,...,0.0,0.0,506.048239,0.011304,43.02838,1.0,0.006377,196.418793,121340.395781,26409.483552


In [16]:
# Build a copy of the data without the original target column; `df` itself is not modified.
df_drop = df.drop(columns=['loan_status'])
df_drop.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,5000,5000,4975.0,36 months,10.65,162.87,B,B2,10+ years,RENT,...,171.62,,Jan-16,0.0,,1,0.0,,,
1,2500,2500,2500.0,60 months,15.27,59.83,C,C4,< 1 year,RENT,...,119.66,,Sep-13,0.0,,1,0.0,,,
2,2400,2400,2400.0,36 months,15.96,84.33,C,C5,10+ years,RENT,...,649.91,,Jan-16,0.0,,1,0.0,,,
3,10000,10000,10000.0,36 months,13.49,339.31,C,C1,10+ years,RENT,...,357.48,,Jan-15,0.0,,1,0.0,,,
4,3000,3000,3000.0,60 months,12.69,67.79,B,B5,1 year,RENT,...,67.79,Feb-16,Jan-16,0.0,,1,0.0,,,


## Split Data

In [17]:
# split data into 80/20 while keeping the distribution of bad loans in test set same as that in the pre-split dataset(X_train, y_train, etc)

In [18]:
# specifically hard copying the training sets to avoid Pandas' SettingWithCopyWarning when we play around with this data later on
# you can refer to this link https://github.com/scikit-learn/scikit-learn/issues/8723
# this is currently an open issue between Pandas and Scikit-Learn teams

In [19]:
# create a helper function to clean up a column whose values are given along with years: assign 0 to NaNs and convert to numeric

In [20]:
# apply to X_train

In [21]:
# confirm our transformation by looking at unique values of this column

### Date columns

In [22]:
# create a function to convert date columns to datetime format and create a new column as a difference between today and the respective date
# details of the function
    # store current month
    # convert to datetime format
    # calculate the difference in months and add to a new column
    # make any resulting -ve values to be equal to the max date
    # drop the original date column
# Note: for the current date, use the same date to maintain uniformity across everyone's code. Use: 2020-08-01

In [23]:
# apply to X_train

In [24]:
# check these new columns

### Term column

In [25]:
# write a function to remove 'months' string from the 'term' column and convert it to numeric 
# use the function on 'term' column of X_Train

## Feature Selection

In [26]:
# first divide training data into categorical and numerical subsets

## Chi-squared statistic for categorical features

In [27]:
# define an empty dictionary to store chi-squared test results

In [28]:
# loop over each column in the training set to calculate chi-statistic with the target variable

In [29]:
# convert the dictionary to a DF

In [30]:
# keep only the top four categorical features

## ANOVA F-Statistic for numerical feature

In [31]:
# since f_classif does not accept missing values, we will do a very crude imputation of missing values

In [32]:
# Calculate F Statistic and corresponding p values

In [33]:
# convert to a DF

In [34]:
# keep only the top 20 features and calculate pair-wise correlations between them
# save the top 20 numerical features in a list

## Pair wise correlations to detect multicollinearity

In [35]:
# calculate pair-wise correlations between them

In [36]:
# drop 2 features based on their multicollinearity with other features

In [37]:
# Define a helper function to drop the 4 categorical features with least p-values for chi squared test, 14 numerical features with least F-Statistic
# and 2 numerical features with high multicollinearity

In [38]:
# apply to X_train

## creating dummy variables
### convert discrete variables to dummy variables

In [39]:
# write a function to create dummy variables

In [40]:
# apply to our final four categorical variables

## Update the test data set with all data cleaning procedures performed so far

In [41]:
# also reindex the dummied test set variables to make sure all the feature columns in the train set are also available in the test set

## WoE Binning/Feature Engineering

In [42]:
# we will analyze both categorical and numerical features based on their categorical/binned WoEs and IVs and then combine some of these binned categories together through a custom python class with fit_transform method

In [43]:
# create copies of the 4 training sets(by the names X_train_prepr, y_train_prepr, etc) to be preprocessed using WoE

## analyze WoEs and IVs of discrete features

In [44]:
# write a function that takes 3 arguments: a dataframe (X_train_prepr), a string (column name), and a dataframe (y_train_prepr)
# the function should return a dataframe as a result

In [45]:
# set the default style of the graphs to the seaborn style. 

In [46]:
# define a function for plotting WoE across categories that takes 2 arguments: a dataframe and a number

In [47]:
# apply these on all four categorical columns

In [48]:
# observe graphs of WOE. If there is a continuous increase in WoE across the different categories then we do not need to combine any features together and should leave all these categories as they are

In [49]:
# if there are no missing values in the grade column leave it as it is, otherwise create a separate and independent category for all Missing values that would never be combined with any other category
# you will come across this scenario when working through features

In [50]:
# define a function to calculate WoE of continuous variables
# this is same as the function we defined earlier for discrete variables
# the only difference is the 2 lines of code that need to be commented out in the function, which results in the df being sorted by the continuous variable's values

In [51]:
# apply this on continuous variables

In [52]:
# fine classing using the cut method if there are a large number of unique values

In [53]:
# don't use the features which have
    # very low IV
    # which have unusually high IV
    # WoE ranges between a very small range, implying low power of differentiating between good and bad loans

In [54]:
# if there is a feature for which most of the values are inside a particular range and very few outside then 
    # create one category for values outside that range
    # apply your approach to all other values(which are inside the range)

In [55]:
# for some of the columns a few values will look out of place (e.g. utilization being greater than 1, which is very rare), so filter those out

In [56]:
# while plotting WOE if you have some doubt in curve you can also zoom on some portion to understand the nature of graph in that area

In [57]:
# if the IV is borderline close to the minimum or maximum ideal threshold, you can proceed without ignoring that feature

## Define Custom Class for WoE Binning/Reengineering

In [58]:
# here we will create a custom scikit-learn class to take care of all binning transformations on any given data set
# this custom class will help us in performing k fold cross validation

In [59]:
# create a list of all the reference categories, i.e. one category from each of the global features

In [60]:
# this custom class will create new categorical dummy features based on the cut-off points that we manually identified based on the WoE plots and IV above

In [61]:
# structure this class so that it also allows a fit_transform method to be implemented on it, thereby allowing you to use it as part of a scikit-learn Pipeline 

## PD Model

In [62]:
# reconfirm shape of the 4 datasets

In [63]:
# define modeling pipeline

In [64]:
# define cross-validation criteria
# RepeatedStratifiedKFold automatically takes care of the class imbalance while splitting

In [65]:
# fit and evaluate the logistic regression pipeline with cross-validation as defined in cv

In [66]:
# print the mean AUROC score and Gini

In [67]:
# fit the pipeline on the whole training set

In [68]:
# create a transformed training set through our WoE_Binning custom class

In [69]:
# store the column names in X_train as a list

In [70]:
# create a summary table of our logistic regression model(name it summary_table)

In [71]:
# create a new column in the dataframe, called 'Coefficients', with row values the transposed coefficients from the 'LogisticRegression' model

In [72]:
# increase the index of every row of the dataframe with 1 to store our model intercept in 1st row

In [73]:
# assign our model intercept to this new row

In [74]:
# sort the dataframe by index

## Prediction Time!

In [75]:
# make predictions on our test set

In [76]:
# get the predicted probabilities(name it y_hat_test_proba)

In [77]:
# select the probabilities of only the positive class (class 1 - default) 

In [78]:
# we will now create a new DF with actual classes and the predicted probabilities
# create a temp y_test DF to reset its index to allow proper concatenation with y_hat_test_proba

In [79]:
# check the shape to make sure the number of rows is same as that in y_test

In [80]:
# rename the columns

In [81]:
# makes the index of one dataframe equal to the index of another dataframe

## Confusion Matrix and AUROC on Test Set

In [82]:
# assign a threshold value to differentiate good with bad

In [83]:
# create a new column for the predicted class based on predicted probabilities and threshold
# we will determine this optimal threshold later in this project

In [84]:
# create the confusion matrix

In [85]:
# get the values required to plot a ROC curve

In [86]:
# plot the ROC curve

In [87]:
# plot a secondary diagonal line, with dashed line style and black color to represent a no-skill classifier

In [88]:
# calculate the Area Under the Receiver Operating Characteristic Curve (AUROC) on our test set

In [89]:
# calculate Gini from AUROC

In [90]:
# draw a PR curve
# calculate the no skill line as the proportion of the positive class

In [91]:
# plot the no skill precision-recall curve

In [92]:
# calculate inputs for the PR curve

In [93]:
# calculate inputs for the PR curve

In [94]:
# plot PR curve

In [95]:
# calculate PR AUC

## Applying the Model - Scorecard Creation

In [96]:
# print the summary_table

In [97]:
# create a new dataframe with one column
# its values are the values from the 'reference_categories' list
# name it 'Feature name'

In [98]:
# create a second column called 'Coefficients' which contains only 0 values

In [99]:
# concatenates two dataframes

In [100]:
# we reset the index of a dataframe

In [101]:
# create a new column called 'Original feature name' which contains the value of the 'Feature name' column up to the colon (':') symbol

In [102]:
# define the min and max thresholds for our scorecard

In [103]:
# calculate the sum of the minimum coefficients of each category within the original feature name

In [104]:
# calculate the sum of the maximum coefficients of each category within the original feature name

In [105]:
# create a new column that holds the calculated score: the coefficient multiplied by the ratio of the difference between
# the maximum & minimum score to the difference between the maximum & minimum sum of coefficients

In [106]:
# update the calculated score of the Intercept (i.e. the default score for each loan)

In [107]:
# round the values of the 'Score - Calculation' column and store them in a new column

In [108]:
# check the min and max possible scores of our scorecard

In [109]:
# both our min and max scores are out by +1
# we need to manually adjust this
# to decide which one we'll evaluate based on the rounding differences of the minimum category within each Original Feature Name

In [110]:
# we can get by deducting 1 from the Intercept

In [111]:
# recheck min and max possible scores

## Calculating credit scores for all observations in the test data set

In [112]:
# create a transformed test set through our WoE_Binning custom class

In [113]:
# insert an Intercept column in its beginning to align with the # of rows in scorecard

In [114]:
# get the list of our final scorecard scores

In [115]:
# check the shapes of test set and scorecard before doing matrix dot multiplication

In [116]:
# we can see that the test set has a few less columns than the rows in scorecard due to the reference categories
# since the reference categories will always be scored as 0 based on the scorecard, it is safe to add these categories to the end of test set with 0 values

In [117]:
# need to reshape scorecard_scores so that it is of proper shape to allow for matrix dot multiplication

In [118]:
# matrix dot multiplication of test set with scorecard scores

## Setting loan approval cut-offs

In [119]:
# calculate Youden's J-Statistic to identify the best threshold

In [120]:
# locate the index of the largest J

In [121]:
# print the best threshold

In [122]:
# this means that based on the Youden's J statistic, this is the ideal probability threshold which minimizes the FPR and maximizes the TPR
# which means all samples with a predicted probability higher than this should be classified as in Default and vice versa
# this ideal threshold might appear to be counterintuitive compared to the default probability threshold of 0.5 but remember that we used the class_weight parameter when fitting our logistic regression model that would have helped us

# we can confirm this by looking at our original confusion matrix with the updated threshold

In [123]:
# update the threshold value

In [124]:
# create a new column for the predicted class based on predicted probabilities and threshold

In [125]:
# create the confusion matrix

In [126]:
# the updated confusion matrix would show a marked improvement in the TPR at marginal cost of lower TNR but at the same time, FNR has improved drastically with a corresponding marginal increase in FPR

# find the corresponding acceptance and rejection rates on the test set at this ideal threshold

In [127]:
# create a new DF comprising of the thresholds from the ROC output

In [128]:
# calculate Score corresponding to each threshold

In [129]:
# define a function called 'n_approved' which assigns a value of 1 if a predicted probability is greater than the parameter p which is a threshold and a value of 0 if it is not
# then it sums the column.
# for any given threshold value the function will return the number of rows with estimated probabilities greater than the threshold

In [130]:
# assuming that all credit applications above a given probability of being 'good' will be approved
# when we apply the 'n_approved' function to a threshold it will return the number of approved applications
# here we calculate the number of approved applications for all thresholds

In [131]:
# then we calculate the number of rejected applications for each threshold
# it is the difference between the total number of applications and the approved applications for that threshold

In [132]:
# approval rate equals the ratio of the approved applications and all applications

In [133]:
# rejection rate equals one minus approval rate

In [134]:
# look at the approval and rejection rates at our ideal threshold

In [135]:
# compare the above rates with the case of the default 0.5 threshold

In [136]:
# you would find that 0.5 threshold would result in a very high rejection rate with a corresponding loss of business

# we will stick with our ideal threshold and the corresponding Credit Score of (fill yourself) and can also monitor the model's performance in production if more data were available