# Libraries

In [54]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn 
import openpyxl
import scipy
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE

# Data

Loading the data

In [55]:
df = pd.read_csv('Loan_default.csv')

In [56]:
default_count = df['Default'].sum()

print(f"Number of instances where Default is 1: {default_count}")

Number of instances where Default is 1: 29653


# Dealing with class imbalance

Using a binomial model, we assume a 0.5 probability of the outcome variable. As in our sample, we have a roughly 0.12 probability of default, we may consider balancing the sample such that the default probability of 0.5

To do that, we randomly sample from the observations of no-default (downward) such that the default and no-default are matched in size.

## Downsampling

I do admit, this is wholly pulled from ChatGPT 

In [57]:
# Separate the majority and minority classes
majority_class = df[df['Default'] == 0]
minority_class = df[df['Default'] == 1]

# Downsample the majority class to match the number of instances in the minority class
downsampled_majority = majority_class.sample(n=len(minority_class), random_state=9112023)

# Combine the downsampled majority class with the original minority class
balanced_df = pd.concat([downsampled_majority, minority_class])

y = balanced_df['Default']

# Shuffle the rows to mix the classes
df = balanced_df.sample(frac=1, random_state=9112023).reset_index(drop=True)

Verifying that the balanced_df is indeed holding a 50/50 split between default and no-default observations

In [58]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Default
count,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0
mean,40.492008,77943.901241,135176.603463,567.161366,55.446903,2.538917,14.555326,36.018008,0.506626,0.5
std,14.950919,40121.12442,70906.74003,158.973285,34.559096,1.119115,6.58673,16.992073,0.230089,0.500004
min,18.0,15002.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0
25%,27.0,41894.25,74510.0,429.0,25.0,2.0,9.05,24.0,0.31,0.0
50%,39.0,76327.0,138994.0,565.0,53.0,3.0,15.02,36.0,0.51,0.5
75%,53.0,112814.5,198012.0,703.0,85.0,4.0,20.34,48.0,0.71,1.0
max,69.0,149995.0,249993.0,849.0,119.0,4.0,25.0,60.0,0.9,1.0


## Oversampling with SMOTE if we prefer to use it

### Important!!! This method conducts encoding and splitting (train/test) first so the next couple of elements need to be omitted. I also did not fully test the compatibility of the later code with SMOTE as I find downsampling more appropriate in this model's case due to the large enough sample even after the downsampling.

In [59]:
# # Separate features (X) and target variable (y)
# X = df.drop('Default', axis=1)
# y = df['Default']

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9112023)

# # Combine training and testing sets for encoding
# combined_data = pd.concat([X_train, X_test])

# # Encode categorical variables using OrdinalEncoder
# ordinal_encoder = OrdinalEncoder()
# combined_data_encoded = pd.DataFrame(ordinal_encoder.fit_transform(combined_data.select_dtypes(include=['object'])), columns=combined_data.select_dtypes(include=['object']).columns)

# # Split the combined data back into training and testing sets
# X_train_encoded = combined_data_encoded.iloc[:len(X_train), :]
# X_test_encoded = combined_data_encoded.iloc[len(X_train):, :]

# # Use SMOTE to oversample the minority class in the training set
# smote = SMOTE(random_state=9112023)
# X_train, y_train = smote.fit_resample(X_train_encoded, y_train)

# # Print the counts of the target variable before and after oversampling
# print("Class distribution after SMOTE:", y_train.value_counts())

# Encoding categorical variables as numerical

In [60]:
# Extract numerical and categorical features
numerical_X = df[['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']]
categorical_X = df[['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']]

# Apply OrdinalEncoder to selected categorical variables
ordinal_encoder = OrdinalEncoder()
categorical_X_encoded = pd.DataFrame(ordinal_encoder.fit_transform(categorical_X), columns=categorical_X.columns)

# Convert the entire DataFrame to numeric dtype
categorical_X_encoded = categorical_X_encoded.astype('float')

# Combine numerical and encoded categorical features
X = pd.concat([numerical_X, categorical_X_encoded], axis=1)

# Refreshing the DataFrame (df) such that it includes the encoded features
df = pd.concat([X, df['Default']], axis=1)

In [61]:
df.describe()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
count,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0,59306.0
mean,40.492008,77943.901241,135176.603463,567.161366,55.446903,2.538917,14.555326,36.018008,0.506626,1.462786,1.554345,0.991333,0.48678,0.480626,1.983307,0.479142,0.5
std,14.950919,40121.12442,70906.74003,158.973285,34.559096,1.119115,6.58673,16.992073,0.230089,1.113357,1.112683,0.82495,0.499829,0.499629,1.417557,0.499569,0.500004
min,18.0,15002.0,5000.0,300.0,0.0,1.0,2.0,12.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,27.0,41894.25,74510.0,429.0,25.0,2.0,9.05,24.0,0.31,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,39.0,76327.0,138994.0,565.0,53.0,3.0,15.02,36.0,0.51,1.0,2.0,1.0,0.0,0.0,2.0,0.0,0.5
75%,53.0,112814.5,198012.0,703.0,85.0,4.0,20.34,48.0,0.71,2.0,3.0,2.0,1.0,1.0,3.0,1.0,1.0
max,69.0,149995.0,249993.0,849.0,119.0,4.0,25.0,60.0,0.9,3.0,3.0,2.0,1.0,1.0,4.0,1.0,1.0


# Splitting into test and train sets

In [62]:
# Defining explanatory and target variables

y = df['Default']  

# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9112023)

# Print the shapes of the sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (47444, 16) (47444,)
Testing set shape: (11862, 16) (11862,)


In [63]:
print(X_train.dtypes)

Age                 int64
Income              int64
LoanAmount          int64
CreditScore         int64
MonthsEmployed      int64
NumCreditLines      int64
InterestRate      float64
LoanTerm            int64
DTIRatio          float64
Education         float64
EmploymentType    float64
MaritalStatus     float64
HasMortgage       float64
HasDependents     float64
LoanPurpose       float64
HasCoSigner       float64
dtype: object


## Standardizing Numerical Features

In [64]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

## Fitting the model

With StatsModels to have a neat summary

In [65]:
model = sm.Logit(y_train, sm.add_constant(X_train)).fit_regularized(alpha=0.5, L1_wt=0.5)

print(model.summary())

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5952436924147182
            Iterations: 58
            Function evaluations: 58
            Gradient evaluations: 58
                           Logit Regression Results                           
Dep. Variable:                Default   No. Observations:                47444
Model:                          Logit   Df Residuals:                    47427
Method:                           MLE   Df Model:                           16
Date:                Fri, 12 Jan 2024   Pseudo R-squ.:                  0.1413
Time:                        07:02:13   Log-Likelihood:                -28239.
converged:                       True   LL-Null:                       -32886.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------


Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers


This model with the scikit package for further evaluations

In [66]:
model = LogisticRegression(penalty='elasticnet', solver='saga', random_state=9112023, l1_ratio=0.5, C=1.0)

# Fit the logistic regression model to the training data
result_sklearn = model.fit(X_train, y_train)

# Hyperparameter tuning

Discrete search

In [67]:
#step_2 = 0.01
step_3 = 0.2
#step_4 = 1
#step_5 = 10
#step_6 = 100

param_grid_C = {
    'C': np.concatenate([
        #np.arange(0.01, 0.1 + step_2, step_2),
        np.arange(0.1, 1 + step_3, step_3),
        #np.arange(1, 10 + step_4, step_4),
        #np.arange(10, 100 + step_5, step_5),
        #np.arange(100, 1000 + step_6, step_6),
    ]).tolist()
}

param_grid_C

{'C': [0.1,
  0.30000000000000004,
  0.5000000000000001,
  0.7000000000000001,
  0.9000000000000001,
  1.1000000000000003]}

In [68]:

step_3 = 0.2

param_grid_l1_ratio = {
    'l1_ratio': np.concatenate([
        np.arange(0.1, 1 + step_3, step_3),
    ]).tolist()
}

param_grid_l1_ratio

{'l1_ratio': [0.1,
  0.30000000000000004,
  0.5000000000000001,
  0.7000000000000001,
  0.9000000000000001,
  1.1000000000000003]}

In [69]:
merged_param_grid = {}

merged_param_grid.update(param_grid_C)

merged_param_grid.update(param_grid_l1_ratio)

merged_param_grid

{'C': [0.1,
  0.30000000000000004,
  0.5000000000000001,
  0.7000000000000001,
  0.9000000000000001,
  1.1000000000000003],
 'l1_ratio': [0.1,
  0.30000000000000004,
  0.5000000000000001,
  0.7000000000000001,
  0.9000000000000001,
  1.1000000000000003]}

In [70]:
model = LogisticRegression(penalty='elasticnet', solver='saga', random_state=9112023)

# Create the GridSearchCV object
grid_search = GridSearchCV(model, merged_param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_parameters = grid_search.best_params_

# Print the best parameters
print("Best Parameters:", best_parameters)

30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1160, in fit
    self._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.f

Best Parameters: {'C': 0.7000000000000001, 'l1_ratio': 0.1}


In [71]:
best_c_value = best_parameters['C']
formatted_best_c = round(best_c_value, 2)  
print("Formatted Best C:", formatted_best_c)

best_l1_ratio_value = best_parameters['l1_ratio']
formatted_best_l1_ratio = round(best_l1_ratio_value, 2)  
print("Formatted Best l1_ratio:", formatted_best_l1_ratio)


Formatted Best C: 0.7
Formatted Best l1_ratio: 0.1


Continuous search

In [72]:
# best_acc = 0

# for i in range (1, 100):
#     lasso_lambda = i/10
#     model_iter = LogisticRegression(penalty='l1', C=1/lasso_lambda, solver='liblinear', random_state=9112023)
#     model_iter.fit(X_train, y_train)
    
#     y_pred_iter = model_iter.predict(X_test)
#     accuracy_iter = accuracy_score(y_test, y_pred_iter)
#     if accuracy_iter > best_acc:
#         best_acc = accuracy_iter
#         lambda_best = lasso_lambda

# C_best = 1/lambda_best
# print(lambda_best)
# print(C_best)

### Repeating fitting the model with optimal hyperparameters

In [73]:
model = LogisticRegression(penalty='l1', C=formatted_best_c, solver='liblinear', random_state=9112023)
model.fit(X_train, y_train)

# Transforming the test set with the scaler
X_test_scaled = scaler.transform(X_test)

# Making predictions on the scaled test set
y_test_pred = model.predict(X_test_scaled)

# Evaluation of the model

In [74]:
X_test = scaler.transform(X_test)

# Make predictions on the test set
y_test_pred = model.predict(X_test)

## Accuracy

In [75]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", accuracy)


Accuracy: 0.6763614904737818


## Precision

In [76]:
from sklearn.metrics import precision_score

precision = precision_score(y_test, y_test_pred)
print("Precision:", precision)


Precision: 0.6782739086971009


## Recall

In [77]:
recall = recall_score(y_test, y_test_pred)
print("Recall:", recall)

Recall: 0.6808830908178625


## F1 Score

In [78]:
f1 = f1_score(y_test, y_test_pred)
print("F1 Score:", f1)

F1 Score: 0.6795759953259327


## Area Under the Receiver Operating Characteristic (ROC) Curve (AUC-ROC)

In [79]:
auc_roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print("AUC-ROC:", auc_roc)

AUC-ROC: 0.7420095781435945


## Area Under the Precision-Recall Curve (AUC-PR)

In [80]:
precision_curve, recall_curve, _ = precision_recall_curve(y_test, model.predict_proba(X_test)[:, 1])
auc_pr = auc(recall_curve, precision_curve)
print("AUC-PR:", auc_pr)

AUC-PR: 0.737291069848898


## Confusion Matrix

In [81]:
cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[3952 1931]
 [1908 4071]]
