# Master Thesis: Bankruptcy Prediction for European Countries
Code written by Marc Zeugin (UZH)

## Load modules

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, PrecisionRecallDisplay, confusion_matrix, RocCurveDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.experimental import enable_iterative_imputer
from sklearnex.ensemble import RandomForestClassifier
from sklearnex.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from skopt.space import Real, Integer
from matplotlib import pyplot as plt
from sklearn.svm import LinearSVC
from skopt import BayesSearchCV
from joblib import dump, Memory
from itertools import product
from tempfile import mkdtemp
from shutil import rmtree
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import warnings
warnings.filterwarnings(action="ignore")
import pyarrow
import glob
import time

Set random seed

In [None]:
# seed NumPy's global random number generator so that NumPy-based randomness is repeatable between runs
np.random.seed(1)

## 1. Setup key variables

In [None]:
# train/test split ratio, default is 0.2
tt_size = 0.2
# number of splits for k-fold cross-validation, default is 5
# NOTE(review): the SVM tuning cells in this file build their own StratifiedKFold(n_splits=3) instead of using k_splits
k_splits = 5
# number of jobs to run in parallel, -1 means all processors (default is 1; set to all processors here)
jobs = -1
# number of iterations (sampled hyperparameter settings) for BayesSearchCV
n_iterations = 40
# scoring metric to optimize, default is average_precision, the value used for the precision-recall curve
scoring_metric = 'average_precision'
# class labels for the classification report and the confusion-matrix tick labels
label = ['Non-Bankrupt', 'Bankrupt']

# absolute path to dataset
# NOTE(review): machine-specific path; adjust before running on another machine
path = 'C:/Users/marczeugin/Documents/Masterthesis/'
# glob pattern (file extension) of the dataset files
ext = '*.csv'

# size (in inches) of the precision-recall curve figure, default is (4, 2)
figure_size = (4, 2)
# dpi of small graphs, default is 100
dpi_low = 100
# dpi of medium graphs, default is 200
dpi_med = 200
# dpi of large graphs, default is 250
dpi_high = 250

# allow a subsample of the total dataset to be used for hyperparameter tuning, default is True
allow_subsample = True
# subsample size as a fraction of the total dataset, default is .1, i.e. 10%
subsample_size = 0.1
# allow the quick overview to run with the subsample
allow_subsample_overview = True
# allow hyperparameter tuning to run with the subsample
allow_subsample_hyperparameter = True

# allow loading a dataset on which SMOTEENN and imputation have already been run, default is True
allow_computed_set = True

Load the dataset variant names and the precomputed feature/label lists

In [None]:
# Display names of the eight dataset variants; the last four are the SMOTEENN counterparts of the first four.
name_list = [
    'complete',
    'without years',
    'without macro',
    'without years and macro',
    'complete SMOTEENN',
    'without years SMOTEENN',
    'without macro SMOTEENN',
    'without years and macro SMOTEENN',
]

# Load the stored feature (X) and label (y) lists and convert each one to a plain Python list.
# NOTE(review): allow_pickle=True unpickles the file contents on load -- only use .npy files from a trusted source.
X_list, X_list_m, y_list, y_list_m = (
    np.load(npy_file, allow_pickle=True).tolist()
    for npy_file in ('X_list.npy', 'X_list_m.npy', 'y_list.npy', 'y_list_m.npy')
)

### 6.7. Hyperparameter tuning for SVM

#### 6.7.1. Without SMOTEENN

In [None]:
# Search space for the linear SVM: regularisation strength C, penalty norm and loss function.
params_list_svmc = {
    'C': Real(1e-6, 1000.0, prior='log-uniform'),
    'penalty': ['l2', 'l1'],
    'loss': ['hinge', 'squared_hinge'],
}
model = LinearSVC(random_state=1, class_weight='balanced', dual=False, tol=1e-5)
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
# error_score=0: a parameter setting whose fit fails is scored 0 instead of aborting the search
grid = BayesSearchCV(
    estimator=model,
    search_spaces=params_list_svmc,
    scoring=scoring_metric,
    cv=kfold,
    error_score=0,
    random_state=1,
    n_iter=n_iterations,
    n_jobs=jobs,
)
fig_cols = 2
fig_rows = 4

# Result collectors; one entry is appended per dataset variant.
svmc_means_list, svmc_names_list, svmc_params_list = [], [], []
svmc_max_mean_list, svmc_best_result_list = [], []
svmc_best_C, svmc_best_penalty, svmc_best_loss = [], [], []

for i in range(len(X_list_m)):
    start = time.perf_counter()
    # element [0] of each variant is the data the search is fitted on
    grid_result = grid.fit(X_list_m[i][0], y_list_m[i][0])
    svmc_cv_results = grid_result.cv_results_
    means = svmc_cv_results['mean_test_score']
    stds = svmc_cv_results['std_test_score']
    params = svmc_cv_results['params']

    # human-readable tick label "<C> <penalty> <loss>" for every evaluated setting
    svmc_names = [
        ' '.join([str(round(setting['C'], 4)), setting['penalty'], setting['loss']])
        for setting in params
    ]

    # positions of settings that repeat an earlier one; only the first occurrence is kept
    index_to_rm = [
        position for position, setting in enumerate(params)
        if setting in params[:position]
    ]
    means = np.delete(means, index_to_rm)
    svmc_names = np.delete(svmc_names, index_to_rm)
    params = np.delete(params, index_to_rm)
    max_mean = np.argmax(means)

    svmc_means_list.append(means)
    svmc_names_list.append(svmc_names)
    svmc_params_list.append(params)
    svmc_max_mean_list.append(max_mean)
    svmc_best_result_list.append(grid_result.best_score_)

    print(f'Best {scoring_metric} of {round(grid_result.best_score_,4)} for {params[max_mean]}')

    end = time.perf_counter()
    print(f'Ran hyperparameter tuning in {end-start:0.4f} seconds')

    # remember the winning hyperparameters for the final model of this variant
    svmc_best = grid_result.best_params_
    svmc_best_C.append(svmc_best["C"])
    svmc_best_penalty.append(svmc_best["penalty"])
    svmc_best_loss.append(svmc_best["loss"])

#### 6.7.2. With SMOTEENN

In [None]:
# Temporary on-disk cache directory for the pipeline used in the SMOTEENN tuning cell.
cachedir = mkdtemp()
# joblib.Memory receives the cache directory through `location`;
# the former `cachedir` keyword is no longer accepted by current joblib releases.
memory = Memory(location=cachedir, verbose=10)

In [None]:
# Search space for the SVM step of the pipeline; the 'model__' prefix routes each
# parameter to the pipeline step named 'model'.
params_list_svmc_smoteenn = {
    'model__C': Real(1e-6, 1000.0, prior='log-uniform'),
    'model__penalty': ['l1', 'l2'],
    'model__loss': ['hinge', 'squared_hinge'],
}
# Resample with SMOTEENN, standardise, then fit the linear SVM.
pipe = Pipeline(
    [
        ('SMOTEENN', SMOTEENN(smote=SMOTE(sampling_strategy='minority', random_state=1),
                              enn=EditedNearestNeighbours(sampling_strategy='all'),
                              random_state=1, n_jobs=jobs)),
        ('scaler', StandardScaler()),
        ('model', LinearSVC(random_state=1, dual=False, tol=1e-5)),
    ],
    memory=memory,
)
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)
# error_score=0: a parameter setting whose fit fails is scored 0 instead of aborting the search
grid = BayesSearchCV(estimator=pipe, search_spaces=params_list_svmc_smoteenn, scoring=scoring_metric, cv=kfold,
                     error_score=0, random_state=1, n_iter=n_iterations, n_jobs=jobs)

try:
    for i in range(len(X_list_m)):
        start = time.perf_counter()
        grid_result = grid.fit(X_list_m[i][0], y_list_m[i][0])
        means = grid_result.cv_results_['mean_test_score']
        stds = grid_result.cv_results_['std_test_score']
        params = grid_result.cv_results_['params']

        # human-readable tick label "<C> <penalty> <loss>" for every evaluated setting
        svmc_names = [
            ' '.join([str(round(setting['model__C'], 4)), setting['model__penalty'], setting['model__loss']])
            for setting in params
        ]

        # first pass: drop settings with a mean score of exactly 0 (the error_score of failed fits)
        index_to_rm = [position for position, score in enumerate(means) if score == 0.0]
        means = np.delete(means, index_to_rm)
        svmc_names = np.delete(svmc_names, index_to_rm)
        params = np.delete(params, index_to_rm)

        # second pass: drop settings that repeat an earlier one; only the first occurrence is kept
        index_to_rm2 = [
            position for position, setting in enumerate(params)
            if setting in params[:position]
        ]
        means = np.delete(means, index_to_rm2)
        svmc_names = np.delete(svmc_names, index_to_rm2)
        params = np.delete(params, index_to_rm2)
        max_mean = np.argmax(means)

        svmc_means_list.append(means)
        svmc_names_list.append(svmc_names)
        svmc_params_list.append(params)
        svmc_max_mean_list.append(max_mean)
        svmc_best_result_list.append(grid_result.best_score_)

        print(f'Best {scoring_metric} of {round(grid_result.best_score_,4)} for {params[max_mean]}')

        end = time.perf_counter()
        print(f'Ran hyperparameter tuning in {end-start:0.4f} seconds')

        # remember the winning hyperparameters for the final model of this variant
        svmc_best_C.append(grid_result.best_params_["model__C"])
        svmc_best_penalty.append(grid_result.best_params_["model__penalty"])
        svmc_best_loss.append(grid_result.best_params_["model__loss"])
finally:
    # always remove the temporary pipeline cache, even when a fit raises,
    # so that failed runs do not leave cache directories behind
    rmtree(cachedir)

In [None]:
# One panel per dataset variant: mean CV score of every evaluated hyperparameter
# setting (grey line, black dots) with the best result highlighted in red.
n_svmc_plots = len(svmc_means_list)
# two panels per row; round the row count up and keep `ax` two-dimensional so that
# odd or very small numbers of variants can still be indexed as ax[row, column]
fig, ax = plt.subplots((n_svmc_plots + 1) // 2, 2, squeeze=False)
fig.set_size_inches((30, 30))
fig.tight_layout(h_pad=16, w_pad=2)
fig.set_dpi(dpi_low)
for i in range(n_svmc_plots):
    tuning_panel = ax[i // 2, i % 2]

    tuning_panel.plot(range(len(svmc_names_list[i])), svmc_means_list[i], color='grey', marker='o', markerfacecolor='black')
    tuning_panel.plot(svmc_max_mean_list[i], svmc_best_result_list[i], marker='o', markerfacecolor='red', markeredgecolor="red")
    # report the number of folds actually used by the search (kfold), not the global k_splits setting
    tuning_panel.set_title(f'Mean {scoring_metric} for SVM with stratified {kfold.get_n_splits()}-fold crossvalidation\n{name_list[i]}')
    tuning_panel.set_xticks(range(len(svmc_names_list[i])))
    tuning_panel.set_xticklabels(svmc_names_list[i], rotation=45, ha='right')
    # the tick labels are "<C> <penalty> <loss>"; no kernel is tuned for the linear SVM
    tuning_panel.set_xlabel('SVM Hyperparameters (C, penalty and loss)')
    tuning_panel.set_ylabel(scoring_metric)

plt.show()

### 6.8. SVM with optimal hyperparameters

In [None]:
# Collectors for the final models and their test-set metrics; one entry per dataset variant.
svmc_models, svmc_test_prediction = [], []
svmc_accuracy, svmc_precision, svmc_recall, svmc_f1 = [], [], [], []
svmc_classification_report = []

for i, variant in enumerate(X_list):
    start = time.perf_counter()
    # element [0] is used for fitting, element [1] for evaluation
    svmc_train_X, svmc_test_X = variant[0], variant[1]

    # refit a linear SVM with the best hyperparameters found for this variant
    svmc = LinearSVC(
        penalty=svmc_best_penalty[i],
        loss=svmc_best_loss[i],
        C=svmc_best_C[i],
        random_state=1,
        class_weight='balanced',
        dual=False,
        tol=1e-5,
    )
    svmc.fit(svmc_train_X, y_list[i][0])
    svmc_test_pred = svmc.predict(svmc_test_X)
    svmc_test_prediction.append(svmc_test_pred)

    svmc_test_y = y_list[i][1]
    svmc_accuracy.append(svmc.score(svmc_test_X, svmc_test_y))
    svmc_precision.append(precision_score(svmc_test_y, svmc_test_pred))
    svmc_recall.append(recall_score(svmc_test_y, svmc_test_pred))
    svmc_f1.append(f1_score(svmc_test_y, svmc_test_pred))

    svmc_classification_report.append(classification_report(svmc_test_y, svmc_test_pred, target_names=label))

    end = time.perf_counter()
    print(f'Ran model training and testing in {end-start:0.4f} seconds')

    svmc_models.append(svmc)

In [None]:
# Precision-recall curve of every fitted SVM on its evaluation data, two panels per row.
svmc_pr = []

fig, ax = plt.subplots(len(svmc_means_list) // 2, 2)
fig.set_size_inches((20, 15))
fig.tight_layout(pad=5.0)
for i in range(len(svmc_means_list)):
    # variant i goes to grid row i // 2, column i % 2
    pr_panel = ax[i // 2, i % 2]

    svmc_display = PrecisionRecallDisplay.from_estimator(
        svmc_models[i], X_list[i][1], y_list[i][1], name='SVM', color='black', ax=pr_panel
    )
    # keep the display object; it is stored with the other outputs later on
    svmc_pr.append(svmc_display)
    pr_panel.set_title(f'SVM Precision-Recall curve {name_list[i]}', fontweight='bold')
    pr_panel.legend(loc='best')
    pr_panel.set_xlabel('Recall')
    pr_panel.set_ylabel('Precision')
    pr_panel.set_xlim(-0.05, 1.05)
    pr_panel.set_xticks([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    pr_panel.set_ylim(0, 1.05)
    pr_panel.set_yticks([0.00, 0.25, 0.50, 0.75, 1.00])
plt.show()

#### Summary

In [None]:
# Line style and colour per dataset variant: black for the first four variants,
# grey for the four SMOTEENN variants, with the same four line styles in each group.
linestyle_list = ['solid', 'dashed', 'dotted', 'dashdot'] * 2
color_list = ['black'] * 4 + ['grey'] * 4

# Draw the precision-recall curves of all variants into one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(len(svmc_means_list)):
    svmc_display = PrecisionRecallDisplay.from_estimator(
        svmc_models[i],
        X_list[i][1],
        y_list[i][1],
        name=f'SVM {name_list[i]}',
        color=color_list[i],
        linestyle=linestyle_list[i],
        ax=ax,
    )
ax.set_title("SVM Combined Precision-Recall curve")
ax.legend(loc='best', prop={'size': 8})
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

In [None]:
# Combined precision-recall curves of the first four variants (indices 0-3) in one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(4):
    svmc_display = PrecisionRecallDisplay.from_estimator(
        svmc_models[i],
        X_list[i][1],
        y_list[i][1],
        name=f'SVM {name_list[i]}',
        color=color_list[i],
        linestyle=linestyle_list[i],
        ax=ax,
    )
ax.set_title("SVM Combined Precision-Recall curve")
ax.legend(loc='best', prop={'size': 8})
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

In [None]:
# Combined precision-recall curves of the variants from index 4 onwards in one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(4, len(svmc_means_list)):
    svmc_display = PrecisionRecallDisplay.from_estimator(
        svmc_models[i],
        X_list[i][1],
        y_list[i][1],
        name=f'SVM {name_list[i]}',
        color=color_list[i],
        linestyle=linestyle_list[i],
        ax=ax,
    )
ax.set_title("SVM Combined Precision-Recall curve")
ax.legend(loc='best', prop={'size': 8})
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

AUROC

In [None]:
# Combined ROC curves of all SVM variants in one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(len(svmc_means_list)):
    svmc_display = RocCurveDisplay.from_estimator(svmc_models[i], X_list[i][1], y_list[i][1], name=f'SVM {name_list[i]}',
                                                  color=color_list[i], linestyle=linestyle_list[i], ax=ax)
ax.set_title("SVM Combined AUROC")
ax.legend(loc='best', prop={'size': 8})
# a ROC curve plots the false positive rate (x) against the true positive rate (y),
# not recall against precision
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

In [None]:
# Combined ROC curves of the first four variants (indices 0-3) in one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(4):
    svmc_display = RocCurveDisplay.from_estimator(svmc_models[i], X_list[i][1], y_list[i][1], name=f'SVM {name_list[i]}',
                                                  color=color_list[i], linestyle=linestyle_list[i], ax=ax)
ax.set_title("SVM Combined AUROC")
ax.legend(loc='best', prop={'size': 8})
# a ROC curve plots the false positive rate (x) against the true positive rate (y),
# not recall against precision
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

In [None]:
# Combined ROC curves of the variants from index 4 onwards in one shared axes.
ax = plt.gca()
fig = plt.gcf()
fig.set_size_inches(8, 4)
fig.set_dpi(dpi_high)
for i in range(4, len(svmc_means_list)):
    svmc_display = RocCurveDisplay.from_estimator(svmc_models[i], X_list[i][1], y_list[i][1], name=f'SVM {name_list[i]}',
                                                  color=color_list[i], linestyle=linestyle_list[i], ax=ax)
ax.set_title("SVM Combined AUROC")
ax.legend(loc='best', prop={'size': 8})
# a ROC curve plots the false positive rate (x) against the true positive rate (y),
# not recall against precision
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_xlim(-0.05, 1.05)
ax.set_ylim(0, 1.05)
plt.show()

Plot confusion matrix for each classifier

In [None]:
# Build one confusion matrix (and a labelled DataFrame version for plotting) per dataset variant.
confusion_matrix_list = []
heat_map_list = []

temp_conf_mat = []
temp_heat_map = []
for i in range(len(svmc_means_list)):
    svmc_true_y = y_list[i][1]
    conf_mat = confusion_matrix(svmc_true_y, svmc_test_prediction[i])
    temp_conf_mat.append(conf_mat)
    # rows are the actual classes, columns the predicted classes
    svmc_class_values = np.unique(svmc_true_y)
    heat = pd.DataFrame(conf_mat, columns=svmc_class_values, index=svmc_class_values)
    heat = heat.rename_axis(index='Actual', columns='Predicted')
    temp_heat_map.append(heat)

# both result lists receive the complete set of SVM matrices as a single entry
confusion_matrix_list.append(temp_conf_mat)
heat_map_list.append(temp_heat_map)

# Heat maps, four panels per row.
fig, ax = plt.subplots(len(svmc_means_list) // 4, 4, figsize=(20, 18))
fig.subplots_adjust(right=2, hspace=0.3)

sns.set(font_scale=1.4)
lc = 'Grey'
lw = 1
for i in range(len(svmc_means_list)):
    # variant i goes to grid row i // 4, column i % 4
    cm_panel = ax[i // 4, i % 4]

    sns.heatmap(
        heat_map_list[-1][i],
        cmap='Greys',
        annot=True,
        annot_kws={'size': 16},
        fmt='g',
        ax=cm_panel,
        xticklabels=label,
        yticklabels=label,
        linecolor=lc,
        linewidths=lw,
    )
    cm_panel.set_title(f'SVM {name_list[i]}', fontweight='bold')
# restore the seaborn font scale for any later plots
sns.set(font_scale=1)
plt.show()

In [None]:
# restore matplotlib's rcParams from its rc file, undoing the style changes made through seaborn (sns.set)
matplotlib.rc_file_defaults()

Save output for model persistence

In [None]:
import os

# make sure the output directory exists; dump() does not create missing directories
os.makedirs('models', exist_ok=True)

# Row labels of the output table, in the order the values are collected below.
index_list = ['svmc_means_list','svmc_params_list','svmc_best_result_list','svmc_best_C','svmc_best_penalty','svmc_best_loss','svmc_accuracy','svmc_precision','svmc_recall','svmc_f1','svmc_classification_report',
                'svmc_test_prediction','svmc_pr','heat_map_svmc']

# One column per fitted model ('Model 1', 'Model 2', ...), derived from the number of
# models actually trained instead of a hard-coded count. The loop variable is an index,
# so the global estimator named `model` is not overwritten.
data = {}
for i in range(len(svmc_models)):
    # each fitted estimator is stored in its own file
    dump(svmc_models[i], f'models/output-svmc{i}.joblib')
    data[f'Model {i + 1}'] = [
        svmc_means_list[i],
        svmc_params_list[i],
        svmc_best_result_list[i],
        svmc_best_C[i],
        svmc_best_penalty[i],
        svmc_best_loss[i],
        svmc_accuracy[i],
        svmc_precision[i],
        svmc_recall[i],
        svmc_f1[i],
        svmc_classification_report[i],
        svmc_test_prediction[i],
        svmc_pr[i],
        heat_map_list[0][i],
    ]

# Table with one row per stored quantity (index named 'Name') and one column per model.
output_df = pd.DataFrame(data, index=pd.Index(index_list, name='Name'))
dump(output_df, 'models/output-svmc.joblib')