In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [9]:
# Load the dataset
df = pd.read_csv(r"C:\Users\linds\Documents\git\loan_history_almost_clean.csv")
df.head(5)

Unnamed: 0,default,amount_of_loan,credit_rating,years_with_bank,rent_mortgage_own,income,age
0,0,1000,B,2.0,0,19200.0,24
1,1,6500,A,2.0,1,66000.0,28
2,0,2400,A,2.0,0,60000.0,36
3,0,10000,C,3.0,0,62000.0,24
4,1,4000,C,2.0,0,20000.0,28


In [13]:
len(df)

7727

In [11]:
#Check for Null values
df.isnull().values.any()

True

In [12]:
# WHat columns have null values?
df.isnull().sum()

default              0
amount_of_loan       0
credit_rating        0
years_with_bank      1
rent_mortgage_own    0
income               0
age                  0
dtype: int64

In [15]:
# With only one null, it can be dropped
df = df.dropna()

In [16]:
len(df)

7726

## Data must be converted to numerical, mapping will be used in this instance

In [18]:
# map credit rating to a number
int_credit_rating = {"credit_rating": {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 5, "G": 5}}
df = df.replace(int_credit_rating)
df.head()

Unnamed: 0,default,amount_of_loan,credit_rating,years_with_bank,rent_mortgage_own,income,age
0,0,1000,2,2.0,0,19200.0,24
1,1,6500,1,2.0,1,66000.0,28
2,0,2400,1,2.0,0,60000.0,36
3,0,10000,3,3.0,0,62000.0,24
4,1,4000,3,2.0,0,20000.0,28


### Logistic Regression

In [19]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop('default', axis=1), df['default'], test_size=0.2, random_state=42)

# Create a logistic regression model with L1 regularization
lr_model = LogisticRegression(penalty='l1', solver='liblinear')

param_grid = {'C': [0.1, 1, 10, 100],
              'class_weight': [None, 'balanced'],
              'max_iter': [100, 500, 1000]}

# Perform grid search using 5-fold cross-validation
lr_grid = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the training data
lr_grid.fit(X_train, y_train)

# Print the best hyperparameters and corresponding test accuracy
print("Best hyperparameters: ", lr_grid.best_params_)
print("Test accuracy: ", lr_grid.score(X_test, y_test))

Best hyperparameters:  {'C': 0.1, 'class_weight': 'balanced', 'max_iter': 500}
Test accuracy:  0.6319534282018111


In [21]:
# Fit the model on the training set
lr_model.fit(X_train, y_train)

# Predict the classes of the testing set
y_pred = lr_model.predict(X_test)

# Calculate the test accuracy
accuracy = lr_model.score(X_test, y_test)
print(f'Test accuracy: {accuracy:.2f}')

# display the confusion matrix
#
# TP FP
# FN TN
cm = confusion_matrix(y_test, y_pred)
print(cm)

Test accuracy: 0.62
[[589 210]
 [376 371]]
