In [None]:
# NOTE(review): bare `conda` with no sub-command — this looks like an unfinished
# environment-setup command (possibly meant to install the `skopt` dependency that
# the import cell below fails on). Complete it or delete this cell — TODO confirm.
conda 

In [1]:
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from skopt import BayesSearchCV

# custom classes for cross validation and scoring
import cross_validation
import performance_metrics

ModuleNotFoundError: No module named 'skopt'

In [None]:
# Read the processed dataset; the first CSV column becomes the DataFrame index.
complete_data_csv = r"../data/processed_data/complete_data.csv"
data = pd.read_csv(complete_data_csv, index_col=0)


In [None]:
# Split the frame into the feature matrix (every column except TARGET) and the
# TARGET labels as a flat 1-D array.
x = data.drop(columns=["TARGET"])
y = data["TARGET"].to_numpy()

# The original cell called a bare `load(...)` that is never imported (NameError)
# and left the file handle open. Use pickle.load inside a context manager so the
# handle is closed as soon as the scaler has been read.
# NOTE(review): assumes scaler.pkl was written with pickle (not joblib) — confirm.
# Unpickling can execute arbitrary code: only load scaler files you trust.
with open(r"scaler/scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Apply the previously fitted scaler; it is not re-fitted here.
x_scaled = scaler.transform(x)

x_scaled.shape, y.shape


In [None]:
split = 0.8

# Row offsets at which each array is cut: the leading `split` fraction goes to the
# hyperparameter set, the remaining rows to the k-fold set. The cut is positional
# (no shuffling happens here).
x_cut = int(len(x_scaled) * split)
y_cut = int(len(y) * split)

hp_x, kf_x = x_scaled[:x_cut], x_scaled[x_cut:]  # features: hyperparameter / kfold
hp_y, kf_y = y[:y_cut], y[y_cut:]                # labels:   hyperparameter / kfold

hp_x.shape, hp_y.shape, kf_x.shape, kf_y.shape

In [None]:
# Search space handed to BayesSearchCV: C as a (low, high, prior) range with a
# log-uniform prior, the L2 penalty only, and the solver as a list of choices.
param_space = dict(
    C=(1e-3, 1e+2, "log-uniform"),
    penalty=["l2"],
    solver=["liblinear", "saga", "lbfgs", "sag"],
)

model = LogisticRegression(max_iter=1000)

# 25 search iterations, each scored with 5-fold cross validation on the
# hyperparameter set; random_state fixes the search's own randomness.
bay_search = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    n_iter=25,
    cv=5,
    verbose=2,
    random_state=512,
)
bay_search.fit(hp_x, hp_y)

In [None]:
# Persist the best hyperparameters found by the search as JSON so the evaluation
# step can rebuild the estimator without re-running the search.
best_params = bay_search.best_params_
with open("logistic_regression/lr-hyperparameters.json", "w") as json_file:
    json_file.write(json.dumps(best_params))

In [None]:
# load the best hyperparameters written by the search step
with open("logistic_regression/lr-hyperparameters.json", "r") as json_file:
    load_hyperparameters = json.load(json_file)

# The search estimator was created with max_iter=1000, which is not one of the
# searched (and therefore saved) hyperparameters. Re-apply it so the evaluated
# model matches the tuned configuration; a max_iter stored in the JSON, if any,
# takes precedence.
model = LogisticRegression(**{"max_iter": 1000, **load_hyperparameters})

# Shuffled 5-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=663)

# run cross validation for the model on the held-back k-fold set.
# The original passed `loaded_model`, a name that is never defined (NameError);
# the estimator rebuilt above is `model`.
cv = cross_validation.cross_val(kf_x, kf_y)
results, confusion_matrix = cv.run_validation(kfold=kfold, model=model)

In [None]:
# Write the cross-validation scores to CSV so they can be inspected outside the
# notebook.
scores_csv = r"../results/lr-scores.csv"
results.to_csv(scores_csv)

# Display the scores rounded to four decimals and scaled by 100
# (presumably fractions shown as percentages — confirm against cross_val).
rounded_scores = results.round(4)
rounded_scores * 100

In [None]:
# Wrap the confusion matrix in a DataFrame and persist it as CSV so the figure
# can be regenerated without re-running cross validation.
cm_csv = r"../results/lr-cmdata.csv"
cm_df = pd.DataFrame(data=confusion_matrix)
cm_df.to_csv(cm_csv)

In [None]:
# Draw the confusion matrix as an annotated heatmap and write it to disk.
# plt.subplots() returns (Figure, Axes); the original bound the Axes returned by
# plt.subplot() to a variable called `fig`.
fig, ax = plt.subplots()

sns.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='d', ax=ax)

# Set labels, title, and axis ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

# NOTE(review): this labels row/column 0 as 'Default' and 1 as 'non-default';
# confirm that matches the TARGET encoding used to build the matrix.
tick_labels = ['Default', 'non-default']
tick_positions = [0, 1]
# Heatmap cell i spans [i, i + 1], so +0.5 centres each label on its cell.
cell_centres = [pos + 0.5 for pos in tick_positions]
ax.set_xticks(cell_centres)
ax.set_xticklabels(tick_labels)
ax.set_yticks(cell_centres)
ax.set_yticklabels(tick_labels)

# Lay out and save BEFORE showing: the original called plt.show() first, after
# which plt.savefig() can end up writing a new, empty figure instead of this one.
fig.tight_layout()
fig.savefig(r"../results/lr-cm.png")

# Show the plot
plt.show()