# Libraries

In [370]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn 
import openpyxl
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE

# Data

Loading the data

In [371]:
df = pd.read_csv('Loan_default.csv')

In [372]:
default_count = df['Default'].sum()

print(f"Number of instances where Default is 1: {default_count}")

Number of instances where Default is 1: 29653


# Dealing with class imbalance

Using a binomial model, we assume a 0.5 probability of the outcome variable. As in our sample, we have a roughly 0.12 probability of default, we may consider balancing the sample such that the default probability of 0.5

To do that, we randomly sample from the observations of no-default (downward) such that the default and no-default are matched in size.

## Downsampling

I do admit, this is wholly pulled from ChatGPT 

In [373]:
# Separate the majority and minority classes
majority_class = df[df['Default'] == 0]
minority_class = df[df['Default'] == 1]

# Downsample the majority class to match the number of instances in the minority class
downsampled_majority = majority_class.sample(n=len(minority_class), random_state=9112023)

# Combine the downsampled majority class with the original minority class
balanced_df = pd.concat([downsampled_majority, minority_class])

y = balanced_df['Default']

# Shuffle the rows to mix the classes
df = balanced_df.sample(frac=1, random_state=9112023).reset_index(drop=True)

Verifying that the balanced_df is indeed holding a 50/50 split between default and no-default observations

In [374]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Default
count,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0
mean,40.492008,77943.901241,135176.603463,567.161366,55.446903,2.538917,14.555326,36.018008,0.506626,0.5
std,14.950919,40121.12442,70906.74003,158.973285,34.559096,1.119115,6.58673,16.992073,0.230089,0.500004
min,18.0,15002.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0
25%,27.0,41894.25,74510.0,429.0,25.0,2.0,9.05,24.0,0.31,0.0
50%,39.0,76327.0,138994.0,565.0,53.0,3.0,15.02,36.0,0.51,0.5
75%,53.0,112814.5,198012.0,703.0,85.0,4.0,20.34,48.0,0.71,1.0
max,69.0,149995.0,249993.0,849.0,119.0,4.0,25.0,60.0,0.9,1.0


## Oversampling with SMOTE if we prefer to use it

Important!!! This method conducts encoding and splitting (train/test) first so the next couple of elements need to be omitted. I also did not fully test the compatibility of the later code with SMOTE as I find downsampling more appropriate in this model's case due to the large enough sample even after the downsampling.

In [375]:
# # Separate features (X) and target variable (y)
# X = df.drop('Default', axis=1)
# y = df['Default']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9112023)

# # Combine training and testing sets for encoding
# combined_data = pd.concat([X_train, X_test])

# # Encode categorical variables using OrdinalEncoder
# ordinal_encoder = OrdinalEncoder()
# combined_data_encoded = pd.DataFrame(ordinal_encoder.fit_transform(combined_data.select_dtypes(include=['object'])), columns=combined_data.select_dtypes(include=['object']).columns)

# # Split the combined data back into training and testing sets
# X_train_encoded = combined_data_encoded.iloc[:len(X_train), :]
# X_test_encoded = combined_data_encoded.iloc[len(X_train):, :]

# # Use SMOTE to oversample the minority class in the training set
# smote = SMOTE(random_state=9112023)
# X_train, y_train = smote.fit_resample(X_train_encoded, y_train)

# # Print the counts of the target variable before and after oversampling
# print("Class distribution after SMOTE:", y_train.value_counts())

# Encoding categorical variables as numerical

In [376]:
# Extract numerical and categorical features
numerical_X = df[['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']]
categorical_X = df[['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']]

# Apply OrdinalEncoder to selected categorical variables
ordinal_encoder = OrdinalEncoder()
categorical_X_encoded = pd.DataFrame(ordinal_encoder.fit_transform(categorical_X), columns=categorical_X.columns)

# Convert the entire DataFrame to numeric dtype
categorical_X_encoded = categorical_X_encoded.astype('float')

# Combine numerical and encoded categorical features
X = pd.concat([numerical_X, categorical_X_encoded], axis=1)

# Refreshing the DataFrame (df) such that it includes the encoded features
df = pd.concat([X, df['Default']], axis=1)

In [377]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
count,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0
mean,40.492008,77943.901241,135176.603463,567.161366,55.446903,2.538917,14.555326,36.018008,0.506626,1.462786,1.554345,0.991333,0.48678,0.480626,1.983307,0.479142,0.5
std,14.950919,40121.12442,70906.74003,158.973285,34.559096,1.119115,6.58673,16.992073,0.230089,1.113357,1.112683,0.82495,0.499829,0.499629,1.417557,0.499569,0.500004
min,18.0,15002.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,41894.25,74510.0,429.0,25.0,2.0,9.05,24.0,0.31,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,39.0,76327.0,138994.0,565.0,53.0,3.0,15.02,36.0,0.51,1.0,2.0,1.0,0.0,0.0,2.0,0.0,0.5
75%,53.0,112814.5,198012.0,703.0,85.0,4.0,20.34,48.0,0.71,2.0,3.0,2.0,1.0,1.0,3.0,1.0,1.0
max,69.0,149995.0,249993.0,849.0,119.0,4.0,25.0,60.0,0.9,3.0,3.0,2.0,1.0,1.0,4.0,1.0,1.0


# Splitting into test and train sets

In [378]:
# Defining explanatory and target variables

y = df['Default']  

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9112023)

# Print the shapes of the sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (47444, 16) (47444,)
Testing set shape: (11862, 16) (11862,)


In [379]:
print(X_train.dtypes)

Age                 int64
Income              int64
LoanAmount          int64
CreditScore         int64
MonthsEmployed      int64
NumCreditLines      int64
InterestRate      float64
LoanTerm            int64
DTIRatio          float64
Education         float64
EmploymentType    float64
MaritalStatus     float64
HasMortgage       float64
HasDependents     float64
LoanPurpose       float64
HasCoSigner       float64
dtype: object


## Standardizing Numerical Features

In [380]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

## Fitting the model

With StatsModels to have a neat summary

In [381]:
model = sm.Logit(y_train, sm.add_constant(X_train))
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.595213
         Iterations 5


                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                47444
Model:                          Logit   Df Residuals:                    47427
Method:                           MLE   Df Model:                           16
Date:                Fri, 12 Jan 2024   Pseudo R-squ.:                  0.1413
Time:                        06:59:18   Log-Likelihood:                -28239.
converged:                       True   LL-Null:                       -32886.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0121      0.010     -1.193      0.233      -0.032       0.008
x1            -0.5872      0.011    -55.670      0.000      -0.608      -0.567
x2            -0.3300      0.010    -32.115      0.0

This model with the scikit package for further evaluations

In [382]:
model = LogisticRegression(random_state=9112023)

# Fit the logistic regression model to the training data
result_sklearn = model.fit(X_train, y_train)

# Evaluation of the model

In [383]:
# I'm also scaling X_test to have it done before later steps
X_test = scaler.transform(X_test)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

## Accuracy

In [384]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6762771876580678


## Precision

In [385]:
from sklearn.metrics import precision_score

precision = precision_score(y_test, y_test_pred)
print("Precision:", precision)


Precision: 0.6782202966172305


## Recall

In [386]:
recall = recall_score(y_test, y_test_pred)
print("Recall:", recall)

Recall: 0.680715838769025


## F1 Score

In [387]:
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

F1 Score: 0.6794657762938231


## Area Under the Receiver Operating Characteristic (ROC) Curve (AUC-ROC)

In [388]:
auc_roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC-ROC:", auc_roc)

AUC-ROC: 0.7420071616173065


## Area Under the Precision-Recall Curve (AUC-PR)

In [389]:
precision_curve, recall_curve, _ = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])
auc_pr = auc(recall_curve, precision_curve)
print("AUC-PR:", auc_pr)

AUC-PR: 0.7372814702951697


## Confusion Matrix

In [390]:
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[3952 1931]
 [1909 4070]]
