In [None]:
import pandas as pd
import numpy as np
import joblib
import performance_metrics
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from pickle import load

In [None]:
# Load the processed dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    r"../data/processed_data/complete_data.csv",
    index_col=0,
)


In [None]:
# Split the frame into the feature matrix (everything except TARGET) and a
# 1-D target vector.
x = data.drop(["TARGET"], axis=1)
y = data["TARGET"].to_numpy()

# Load the previously saved scaler and apply it to the features.
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load scaler.pkl from a
# trusted source.
# NOTE(review): if this scaler was fitted on the full dataset, the
# cross-validation below sees test-fold statistics -- confirm how it was fitted.
with open(r"scaler.pkl", "rb") as scaler_file:
    scaler = load(scaler_file)
x_scaled = scaler.transform(x)

# Sanity check: row counts of features and target must match.
x_scaled.shape, y.shape


In [None]:
# 5-fold cross-validation of a logistic regression on the scaled features.
# Produces:
#   results              -- one row of metrics per fold plus an "Average" row
#   confusion_matrix_sum -- element-wise sum of the per-fold confusion matrices
kfold = KFold(n_splits=5, shuffle=True, random_state=663)
model = LogisticRegression(max_iter=1000)

metric_columns = ["accuracy", "precision", "recall", "f1", "auc", "fpr"]

# Pin the label set so every fold yields a confusion matrix with the same
# shape and row/column order, even if a fold happens to miss a class.
class_labels = np.unique(y)
confusion_matrix_sum = np.zeros((len(class_labels), len(class_labels)), dtype=np.int64)

fold_rows = []

for fold, (train_index, test_index) in enumerate(kfold.split(x_scaled), start=1):
    x_train, x_test = x_scaled[train_index], x_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit() re-estimates the model from scratch on this fold's training split
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # rows = true label, columns = predicted label
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    confusion_matrix_sum = confusion_matrix_sum + cm

    # False-positive rate = FP / (FP + TN): false positives divided by the
    # number of actual negatives (row 0), not by the total sample count.
    actual_negatives = cm[0].sum()
    fpr = cm[0, 1] / actual_negatives if actual_negatives else float("nan")

    # NOTE(review): the helper receives hard class predictions, so "auc" is
    # presumably computed from labels rather than probabilities -- confirm.
    metrics = performance_metrics.metrics(y_test, y_pred)
    accuracy, precision, recall, f1, auc_result = metrics.calculate_metrics()
    fold_rows.append([accuracy, precision, recall, f1, auc_result, fpr])

    print(f"Fold {fold} completed")

# Build the table once from the collected rows, then append the column means
# over all folds as the "Average" row.
results = pd.DataFrame(fold_rows, columns=metric_columns, index=range(1, len(fold_rows) + 1))
results.loc["Average"] = results.mean()
results.index.name = "Model"

In [None]:
# Display the metrics as percentages (rounded to 3 decimals before scaling).
results.round(3).mul(100)

In [None]:
# Pooled false-positive rate over all folds: FP / (FP + TN).
# Row 0 of the summed confusion matrix holds the actual negatives
# (rows = true label, columns = predicted label), so the denominator is the
# row-0 total rather than the grand total of all samples.
false_positives = confusion_matrix_sum[0][1]
actual_negatives = confusion_matrix_sum[0].sum()
true_fpr = round(false_positives / actual_negatives * 100, 4)

print(f"The true FPR is {true_fpr}%")

In [None]:
# Render the accumulated confusion matrix as an annotated heatmap.
ax = sns.heatmap(confusion_matrix_sum, annot=True, cmap='Blues', fmt='d')
ax.set(xlabel='Predicted labels', ylabel='True labels', title='Confusion Matrix')

# Centre one class name on each heatmap cell along both axes.
# NOTE(review): confusion_matrix orders classes by sorted label value, so
# index 0 is the smallest label. If TARGET == 1 means "default", these two
# names are in the wrong order -- confirm against the dataset encoding.
class_names = ['Default', 'non-default']
cell_centres = [idx + 0.5 for idx in range(len(class_names))]
ax.set_xticks(cell_centres)
ax.set_xticklabels(class_names)
ax.set_yticks(cell_centres)
ax.set_yticklabels(class_names)

# Show the plot
plt.show()

In [None]:
# Uncomment to persist the model. NOTE: `model` was last fitted on the final CV fold only.
# joblib_file = r"logistic_regression/LogisticRegression_v1.03.pkl"
# joblib.dump(model, joblib_file)
