# ML models results overview

Librerie varie da installare

In [None]:
#!pip install pandas
#!pip install xlsxwriter

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
# Change plot output format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
from pandas import ExcelWriter

Variabili di gestione files

In [None]:
# Path of the .pkl file holding the results of the different ML model experiments
# (it is handed to pd.read_pickle below), can be changed
DATASET_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\ML_models\ML_model_experiments.pkl"

# True = save the figures and export the summary files under OUTPUT_PATH
to_export = True

# Content of the exported document: 'Rankings' exports per-group means sorted by score,
# any other value (e.g. 'Description') exports per-group mean and standard deviation.
# The lower-cased value is also used as the file name of the exported workbook.
export_content = "Rankings"

# Directory where the report files are written, can be changed
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\ML_models"

Leggo il dataset

In [None]:
# Load the experiments table from the pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
dataset = pd.read_pickle(DATASET_PATH)

Rimuovo eventuali esperimenti duplicati

In [None]:
# Experiments whose six test metrics all coincide are treated as duplicates:
# only the last occurrence is kept and the frame is modified in place.
test_metric_columns = ['Accuracy - test', 'Precision - test', 'Specificity - test',
                       'Recall - test', 'F1-score - test', 'AUC - test']
dataset.drop_duplicates(subset=test_metric_columns, keep='last', inplace=True)

Mostro i risultati ordinati per F1-score

In [None]:
# Show the experiments from best to worst test F1-score; the sorted copy is only
# displayed, `dataset` itself keeps its row order.
dataset.sort_values('F1-score - test', ascending=False)

In [None]:
# Display the experiments table as currently stored (the sort in the previous cell is not saved)
dataset

Plot model type

In [None]:
# Collects the summary tables built by the following cells so that they can be
# exported together as one Excel file with one sheet per table
dataset_list = []

In [None]:
# Test metrics summarised per model type.
# NOTE: every run of this cell appends another table to dataset_list.
keep_columns = ["Model type", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d1 = dataset[keep_columns].groupby(["Model type"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d1 = dataset[keep_columns].groupby(["Model type"]).agg(["mean", "std"])
dataset_list.append(d1.round(3))
d1

In [None]:
# Bar chart of test accuracy, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="Accuracy - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Accuracy_test.pdf", bbox_inches="tight")

In [None]:
# Bar chart of test specificity, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="Specificity - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Specificity_test.pdf", bbox_inches="tight")

In [None]:
# Bar chart of test precision, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="Precision - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Precision_test.pdf", bbox_inches="tight")

In [None]:
# Bar chart of test recall, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="Recall - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Recall_test.pdf", bbox_inches="tight")

In [None]:
# Bar chart of test F1-score, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="F1-score - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/F1-score_test.pdf", bbox_inches="tight")

In [None]:
# Bar chart of test AUC, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="AUC - test", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/AUC_test.pdf", bbox_inches="tight")

In [None]:
# Reshape to long format (one row per experiment and metric) so that all the
# test metrics can be drawn side by side for every model type
results = dataset.melt(
    id_vars=['Model type'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Model type", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Metrics_test.pdf", bbox_inches="tight")

Plot dataset train/test

In [None]:
# Test metrics summarised per train/test dataset.
keep_columns = ["Dataset - train/test", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d2 = dataset[keep_columns].groupby(["Dataset - train/test"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d2 = dataset[keep_columns].groupby(["Dataset - train/test"]).agg(["mean", "std"])
dataset_list.append(d2.round(3))
d2

In [None]:
# Horizontal bar chart of test accuracy, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Accuracy - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_accuracy_test.pdf", bbox_inches="tight")

In [None]:
# Horizontal bar chart of test specificity, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Specificity - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_specificity_test.pdf", bbox_inches="tight")

In [None]:
# Horizontal bar chart of test precision, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Precision - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_precision_test.pdf", bbox_inches="tight")

In [None]:
# Horizontal bar chart of test recall, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Recall - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_recall_test.pdf", bbox_inches="tight")

In [None]:
# Horizontal bar chart of test AUC, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="AUC - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_AUC_test.pdf", bbox_inches="tight")

In [None]:
# Horizontal bar chart of test F1-score, one bar per train/test dataset
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="F1-score - test", y="Dataset - train/test", orient="h", ax=ax)
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_F1-score_test.pdf", bbox_inches="tight")

In [None]:
# Long-format table (one row per experiment and metric) used to draw all the
# test metrics side by side for every train/test dataset
results = dataset.melt(
    id_vars=['Dataset - train/test'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 20))
sns.barplot(data=results, x="value", y="Dataset - train/test", hue="Metric", orient="h", ax=ax)
ax.legend(loc='lower left')

# Insert a line break into the long dataset names so the tick labels take less
# horizontal room and the bars get more of the figure
ylabels = list(dict.fromkeys(results["Dataset - train/test"].to_list()))
ylabels_new = [
    label.replace('filtered_active_bankruptcy_', 'filtered_active_bankruptcy_\n')
    for label in ylabels
]
plt.yticks(np.arange(len(ylabels_new)), ylabels_new)

if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_metrics_test.pdf", bbox_inches="tight")

Plot model type & dataset train/test

In [None]:
# Test metrics summarised per (model type, train/test dataset) pair.
keep_columns = ["Model type", "Dataset - train/test", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d3 = dataset[keep_columns].groupby(["Model type", "Dataset - train/test"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d3 = dataset[keep_columns].groupby(["Model type", "Dataset - train/test"]).agg(["mean", "std"])
dataset_list.append(d3.round(3))
d3

Plot number of components

In [None]:
# Test metrics summarised per number of features.
keep_columns = ["Number of features", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d4 = dataset[keep_columns].groupby(["Number of features"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d4 = dataset[keep_columns].groupby(["Number of features"]).agg(["mean", "std"])
dataset_list.append(d4.round(3))
d4

In [None]:
# Only the experiments whose dimensionality reduction technique is "N.A." are plotted
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
# Long format: one row per experiment and metric
results = no_reduction.melt(
    id_vars=['Number of features'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Number of features", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Number_of_features_metrics.pdf", bbox_inches="tight")

Plot model type & number of components

In [None]:
# Test metrics summarised per (model type, number of features) pair.
keep_columns = ["Model type", "Number of features", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d5 = dataset[keep_columns].groupby(["Model type", "Number of features"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d5 = dataset[keep_columns].groupby(["Model type", "Number of features"]).agg(["mean", "std"])
dataset_list.append(d5.round(3))
d5

Plot dimensionality reduction technique & number of components

In [None]:
# Test metrics summarised per (dimensionality reduction technique, number of features) pair.
keep_columns = ["Dimensionality reduction technique", "Number of features", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d6 = dataset[keep_columns].groupby(["Dimensionality reduction technique", "Number of features"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d6 = dataset[keep_columns].groupby(["Dimensionality reduction technique", "Number of features"]).agg(["mean", "std"])
dataset_list.append(d6.round(3))
d6

Plot dimensionality reduction technique

In [None]:
# Test metrics summarised per dimensionality reduction technique.
keep_columns = ["Dimensionality reduction technique", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d7 = dataset[keep_columns].groupby(["Dimensionality reduction technique"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d7 = dataset[keep_columns].groupby(["Dimensionality reduction technique"]).agg(["mean", "std"])
dataset_list.append(d7.round(3))
d7

In [None]:
# Long-format table (one row per experiment and metric) used to draw all the
# test metrics side by side for every dimensionality reduction technique
results = dataset.melt(
    id_vars=['Dimensionality reduction technique'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Dimensionality reduction technique", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dimensionality_reduction_technique_score_test.pdf", bbox_inches="tight")

Plot imbalanced data technique

In [None]:
# Test metrics summarised per imbalanced data technique.
keep_columns = ["Imbalanced data technique", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d8 = dataset[keep_columns].groupby(["Imbalanced data technique"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d8 = dataset[keep_columns].groupby(["Imbalanced data technique"]).agg(["mean", "std"])
dataset_list.append(d8.round(3))
d8

In [None]:
# Long-format table (one row per experiment and metric) used to draw all the
# test metrics side by side for every imbalanced data technique
results = dataset.melt(
    id_vars=['Imbalanced data technique'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Imbalanced data technique", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Imbalanced_data_technique_score_test.pdf", bbox_inches="tight")

Plot Imbalanced data technique & Dataset train/test

In [None]:
# Test metrics summarised per (imbalanced data technique, train/test dataset) pair.
keep_columns = ["Imbalanced data technique", "Dataset - train/test", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d9 = dataset[keep_columns].groupby(["Imbalanced data technique", "Dataset - train/test"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d9 = dataset[keep_columns].groupby(["Imbalanced data technique", "Dataset - train/test"]).agg(["mean", "std"])
dataset_list.append(d9.round(3))
d9

Plot model & validation metrics

In [None]:
# Validation metrics summarised per model type.
keep_columns = ["Model type", "Accuracy - validation", 'Specificity - validation', "Precision - validation",
                "Recall - validation", "F1-score - validation", "AUC - validation"]
if export_content == "Rankings":
    # Ranking: per-group mean, best validation F1-score first
    d10 = dataset[keep_columns].groupby(["Model type"]).mean().sort_values(by=['F1-score - validation'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d10 = dataset[keep_columns].groupby(["Model type"]).agg(["mean", "std"])
dataset_list.append(d10.round(3))
d10

In [None]:
# Long-format table (one row per experiment and metric) used to draw all the
# validation metrics side by side for every model type
results = dataset.melt(
    id_vars=['Model type'],
    value_vars=['Accuracy - validation', 'Specificity - validation', 'Precision - validation',
                'Recall - validation', 'F1-score - validation', 'AUC - validation'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Model type", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Metrics_validation.pdf", bbox_inches="tight")

Plot model & training time

In [None]:
# Training time summarised per model type.
keep_columns = ["Model type", "Training time"]
if export_content == "Rankings":
    # Ranking: per-group mean, longest training time first
    d11 = dataset[keep_columns].groupby(["Model type"]).mean().sort_values(by=['Training time'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d11 = dataset[keep_columns].groupby(["Model type"]).agg(["mean", "std"])
dataset_list.append(d11.round(3))
d11

In [None]:
# Bar chart of the training time, one bar per model type
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=dataset, x="Model type", y="Training time", ax=ax)
ax.set(ylabel='Training time (s)')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Training_time.pdf", bbox_inches="tight")

Plot train/test split

In [None]:
# Test metrics summarised per train/test split.
keep_columns = ["Train/Test split", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "AUC - test"]
if export_content == "Rankings":
    # Ranking: per-group mean, best F1-score first
    d12 = dataset[keep_columns].groupby(["Train/Test split"]).mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Description: per-group mean and standard deviation. String aliases are used because
    # passing np.mean/np.std to agg() is deprecated in recent pandas versions.
    d12 = dataset[keep_columns].groupby(["Train/Test split"]).agg(["mean", "std"])
dataset_list.append(d12.round(3))
d12

In [None]:
# Long-format table (one row per experiment and metric) used to draw all the
# test metrics side by side for every train/test split
results = dataset.melt(
    id_vars=['Train/Test split'],
    value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test',
                'Recall - test', 'F1-score - test', 'AUC - test'],
    var_name='Metric',
)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Train/Test split", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Train_test_split.pdf", bbox_inches="tight")

Esporto in formato xlsx se richiesto

In [None]:
def save_xls(list_dfs, xls_path, na_replace):
    """Write several pandas DataFrames to one Excel workbook, one sheet each.

    Sheets are named ``sheet0``, ``sheet1``, ... following the order of
    *list_dfs*, and columns 0 to 10 of every sheet are set to width 35.

    Args:
        list_dfs: Iterable of DataFrames to export.
        xls_path: Path of the workbook to create.
        na_replace: When truthy, missing values are written as "N.A.";
            otherwise the to_excel default (empty string) is used.
    """
    na_rep = "N.A." if na_replace else ""
    with ExcelWriter(xls_path, engine="xlsxwriter") as writer:
        for n, df in enumerate(list_dfs):
            sheet_name = f"sheet{n}"
            # sheet_name is passed by keyword: pandas deprecates passing it positionally
            df.to_excel(writer, sheet_name=sheet_name, na_rep=na_rep)
            writer.sheets[sheet_name].set_column(0, 10, 35)

In [None]:
if to_export:
    # Full experiments table
    dataset.to_excel(OUTPUT_PATH + "/ML_model_experiments.xlsx", engine='xlsxwriter')
    # Missing values are written as "N.A." only in the rankings export
    na_rep = export_content == "Rankings"
    # Summary tables, saved in a workbook named after the exported content
    summary_path = OUTPUT_PATH + "/" + export_content.lower() + ".xlsx"
    save_xls(dataset_list, summary_path, na_replace=na_rep)