# ML models results overview

Librerie varie da installare

In [None]:
# Uncomment to install the packages below. In a notebook `%pip install ...` is preferable
# to `!pip install ...` because it targets the environment of the running kernel.
#!pip install pandas
#!pip install xlsxwriter

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
# Change plot output format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
from pandas import ExcelWriter

Variabili di gestione files

In [None]:
# --- Input ---------------------------------------------------------------
# Pickle file holding the results of the different ML model experiments (can be changed)
DATASET_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\ML_models\ML_model_experiments.pkl"

# --- Output --------------------------------------------------------------
# Directory that receives the exported figures and Excel reports (can be changed)
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\ML_models"

# When True, figures and summary files are written under OUTPUT_PATH
to_export = True

# Content of the exported summary workbook: 'Rankings' (group means sorted best-first)
# or 'Description' (mean and standard deviation per group). Default: 'Description'
export_content = "Description"

Leggo il dataset

In [None]:
# Load the experiments table. NOTE: unpickling can execute arbitrary code, so DATASET_PATH
# must point to a file produced by a trusted source.
dataset = pd.read_pickle(DATASET_PATH)

Rimuovo eventuali esperimenti duplicati

In [None]:
# Two rows count as the same experiment when all of their test metrics coincide;
# only the last occurrence of each such group is kept.
test_metric_columns = ['Accuracy - test',
                       'Precision - test',
                       'Specificity - test',
                       'Recall - test',
                       'F1-score - test',
                       'MCC - test',
                       'AUC - test']
dataset = dataset.drop_duplicates(subset=test_metric_columns, keep='last')

In [None]:
# Experiments run without any dimensionality reduction technique
dataset.loc[dataset["Dimensionality reduction technique"] == "N.A."]

Mostro i risultati ordinati per F1-score sul test set

In [None]:
# Best F1-score on the test set first
dataset.sort_values('F1-score - test', ascending=False)

In [None]:
# Optional filter (disabled): uncomment to keep only the experiments whose
# dataset name contains the literal text 'history3'.
#dataset = dataset[dataset["Dataset - train/test"].str.contains('history3', regex=False)]

In [None]:
# Experiments whose dataset name matches the regular expression 'CE|SP'
ce_or_sp_mask = dataset["Dataset - train/test"].str.contains('CE|SP', regex=True)
dataset[ce_or_sp_mask]

Plot model type

In [None]:
# Summary tables are collected here so that they can be exported together
# as the sheets of a single Excel workbook
dataset_list = list()

# Left-to-right order of the model types on the x axis of the plots
keys_order = [
    "Decision Tree",
    "Random Forest",
    "Gradient Boosting",
    "Logistic regression",
    "SVC Classifier",
]

In [None]:
# Summary of the test metrics per model type (experiments without dimensionality reduction).
# NOTE(review): unlike the other summary tables this one omits "MCC - test" - confirm it is intentional.
keep_columns = ["Model type", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Model type"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d1 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d1 = grouped.agg(["mean", "std"])
dataset_list.append(d1.round(2))
d1

In [None]:
# One bar plot per test metric, grouped by model type, restricted to the experiments
# run without dimensionality reduction. The frame is filtered once and the seven
# formerly copy-pasted cells are produced by a single loop.
sns.set_theme(style="white")
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
for metric in ["Accuracy - test", "Specificity - test", "Precision - test", "Recall - test",
               "F1-score - test", "MCC - test", "AUC - test"]:
    fig, ax = plt.subplots(figsize=(9, 6))
    sns.barplot(x="Model type", y=metric, data=no_reduction, order=keys_order, ax=ax)
    if to_export:
        # "Accuracy - test" -> "Accuracy_test.pdf", "F1-score - test" -> "F1-score_test.pdf", ...
        fig.savefig(OUTPUT_PATH + "/Classification_metrics/" + metric.replace(" - ", "_") + ".pdf",
                    bbox_inches="tight")

In [None]:
# Long format (one row per experiment and metric) so that every test metric
# can be drawn side by side for each model type
test_metrics = ['Accuracy - test', 'Specificity - test', 'Precision - test', 'Recall - test',
                'F1-score - test', 'MCC - test', 'AUC - test']
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
results = pd.melt(no_reduction, id_vars=['Model type'], value_vars=test_metrics, var_name='Metric')

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Model type", y="value", hue="Metric", orient="v", order=keys_order, ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Metrics_test.pdf", bbox_inches="tight")

Plot dataset train/test

In [None]:
# Summary of the test metrics per dataset (experiments without dimensionality reduction)
keep_columns = ["Dataset - train/test", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Dataset - train/test"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d2 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d2 = grouped.agg(["mean", "std"])
dataset_list.append(d2.round(2))
d2

In [None]:
# Alphabetical order of the dataset names, used as bar order by the per-dataset plots
dataset_order = sorted(dataset["Dataset - train/test"].drop_duplicates())

# Output file of each per-dataset metric plot (names kept exactly as in the original cells)
dataset_metric_files = {
    "Accuracy - test": "Dataset_accuracy_test.pdf",
    "Specificity - test": "Dataset_specificity_test.pdf",
    "Precision - test": "Dataset_precision_test.pdf",
    "Recall - test": "Dataset_recall_test.pdf",
    "MCC - test": "Dataset_MCC_test.pdf",
    "AUC - test": "Dataset_AUC_test.pdf",
    "F1-score - test": "Dataset_F1-score_test.pdf",
}

# One horizontal bar plot per test metric, grouped by dataset, restricted to the experiments
# run without dimensionality reduction. The frame is filtered once and the seven formerly
# copy-pasted cells are produced by a single loop.
sns.set_theme(style="white")
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
for metric, file_name in dataset_metric_files.items():
    fig, ax = plt.subplots(figsize=(9, 6))
    sns.barplot(x=metric, y="Dataset - train/test", data=no_reduction, orient="h",
                order=dataset_order, ax=ax)
    if to_export:
        fig.savefig(OUTPUT_PATH + "/Classification_metrics/" + file_name, bbox_inches="tight")

In [None]:
# Long format (one row per experiment and metric) so that every test metric
# can be drawn side by side for each dataset
results = pd.melt(dataset[dataset["Dimensionality reduction technique"] == "N.A."], id_vars=['Dataset - train/test'],
                  value_vars=['Accuracy - test', 'Specificity - test', 'Precision - test', 'Recall - test', 'F1-score - test', 'MCC - test', 'AUC - test'],
                  var_name='Metric')
sns.set_theme(style="white")
plt.subplots(figsize=(9, 20))
sns.barplot(x="value", y="Dataset - train/test", hue="Metric", data=results, orient="h", order=dataset_order)
plt.legend(loc='lower left')

# Break the long common prefix onto its own line to leave more room for the bars.
# The labels must follow `dataset_order`, i.e. the order in which the bars are drawn:
# building them from the order of appearance in `results` could label the wrong bars.
ylabels = list(dataset_order)
ylabels_new = [label.replace('filtered_active_bankruptcy_', 'filtered_active_bankruptcy_\n') for label in ylabels]
plt.yticks(np.arange(len(ylabels_new)), ylabels_new)

if to_export:
    plt.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_metrics_test.pdf", bbox_inches="tight")

In [None]:
# Divide datasets in four categories
def dataset_mapping(dataset_name):
    """Return the category label of a dataset, decided by substrings of its name.

    The markers are tested in order and the first one contained in
    ``dataset_name`` wins, so "history" takes precedence over "raw_full",
    which in turn takes precedence over the more generic "raw".
    Names containing no marker fall into "Other type of dataset".
    """
    marker_to_category = (
        ("history", "Datasets based on temporal series"),
        ("raw_full", "Datasets based on all raw values"),
        ("raw", "Datasets based on a subset of raw values"),
        ("small", "Datasets based only on financial estimators"),
        ("big", "Datasets based only on financial estimators"),
    )
    for marker, category in marker_to_category:
        if marker in dataset_name:
            return category
    return "Other type of dataset"

# Replace every dataset name in the long-format table with its category label.
# NOTE: not idempotent - the labels themselves contain the word "raw", so running this
# cell twice on the same `results` would re-map "all raw values" to "a subset of raw values".
dataset_column = "Dataset - train/test"
results[dataset_column] = results[dataset_column].apply(dataset_mapping)

# From here on the bars are ordered by category instead of by dataset name
dataset_order = [
    "Datasets based only on financial estimators",
    "Datasets based on a subset of raw values",
    "Datasets based on all raw values",
    "Datasets based on temporal series",
]

In [None]:
# All test metrics side by side for each dataset category
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="value", y="Dataset - train/test", hue="Metric",
            orient="h", order=dataset_order, ax=ax)
ax.legend(loc='lower left')

if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Dataset_categories_metrics_test.pdf", bbox_inches="tight")

Plot model type & dataset train/test

In [None]:
# Summary of the test metrics per (model type, dataset) pair (experiments without dimensionality reduction)
keep_columns = ["Model type", "Dataset - train/test", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Model type", "Dataset - train/test"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d3 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d3 = grouped.agg(["mean", "std"])
dataset_list.append(d3.round(2))
d3

Plot number of features

In [None]:
# Summary of the test metrics per number of features (experiments without dimensionality reduction)
keep_columns = ["Number of features", "Accuracy - test", 'Specificity - test', "Precision - test",
                "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Number of features"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d4 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d4 = grouped.agg(["mean", "std"])
dataset_list.append(d4.round(2))
d4

In [None]:
# Long format (one row per experiment and metric) so that every test metric
# can be drawn side by side for each number of features
test_metrics = ['Accuracy - test', 'Specificity - test', 'Precision - test', 'Recall - test',
                'F1-score - test', 'MCC - test', 'AUC - test']
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
results = pd.melt(no_reduction, id_vars=['Number of features'], value_vars=test_metrics, var_name='Metric')

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Number of features", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Number_of_features_metrics.pdf", bbox_inches="tight")

Plot model type & number of features

In [None]:
# Summary of the test metrics per (model type, number of features) pair
# (experiments without dimensionality reduction)
keep_columns = ["Model type", "Number of features", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Model type", "Number of features"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d5 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d5 = grouped.agg(["mean", "std"])
dataset_list.append(d5.round(2))
d5

Plot dimensionality reduction technique & number of components

In [None]:
# Summary of the test metrics per (dimensionality reduction technique, number of features) pair,
# computed over all experiments
keep_columns = ["Dimensionality reduction technique", "Number of features", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
grouped = dataset[keep_columns].groupby(["Dimensionality reduction technique", "Number of features"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d6 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d6 = grouped.agg(["mean", "std"])
dataset_list.append(d6.round(2))
d6

Plot dimensionality reduction technique

In [None]:
# Summary of the test metrics per dimensionality reduction technique, computed over all experiments
keep_columns = ["Dimensionality reduction technique", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
grouped = dataset[keep_columns].groupby(["Dimensionality reduction technique"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d7 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d7 = grouped.agg(["mean", "std"])
dataset_list.append(d7.round(2))
d7

Plot imbalanced data technique

In [None]:
# Summary of the test metrics per imbalanced data technique (experiments without dimensionality reduction)
keep_columns = ["Imbalanced data technique", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Imbalanced data technique"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d8 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d8 = grouped.agg(["mean", "std"])
dataset_list.append(d8.round(2))
d8

In [None]:
# Long format (one row per experiment and metric) so that every test metric
# can be drawn side by side for each imbalanced data technique
test_metrics = ['Accuracy - test', 'Specificity - test', 'Precision - test', 'Recall - test',
                'F1-score - test', 'MCC - test', 'AUC - test']
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
results = pd.melt(no_reduction, id_vars=['Imbalanced data technique'], value_vars=test_metrics, var_name='Metric')

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Imbalanced data technique", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Imbalanced_data_technique_score_test.pdf", bbox_inches="tight")

Plot Imbalanced data technique & Dataset train/test

In [None]:
# Summary of the test metrics per (imbalanced data technique, dataset) pair
# (experiments without dimensionality reduction)
keep_columns = ["Imbalanced data technique", "Dataset - train/test", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test",  "MCC - test", "AUC - test"]
# Rows and columns are selected in a single .loc call, as in the other summary cells,
# instead of masking an already column-sliced copy of the frame
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Imbalanced data technique", "Dataset - train/test"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d9 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d9 = grouped.agg(["mean", "std"])
dataset_list.append(d9.round(2))
d9

Plot Imbalanced data technique & Model type

In [None]:
# Summary of the test metrics per (imbalanced data technique, model type) pair
# (experiments without dimensionality reduction)
keep_columns = ["Imbalanced data technique", "Model type", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
# Rows and columns are selected in a single .loc call, as in the other summary cells,
# instead of masking an already column-sliced copy of the frame
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Imbalanced data technique", "Model type"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d10 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d10 = grouped.agg(["mean", "std"])
dataset_list.append(d10.round(2))
d10

Plot model & validation metrics

In [None]:
# Summary of the validation metrics per model type (experiments without dimensionality reduction)
keep_columns = ["Model type", "Accuracy - validation", 'Specificity - validation', "Precision - validation",
                "Recall - validation", "F1-score - validation", "MCC - validation", "AUC - validation"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Model type"])
if export_content == "Rankings":
    # Mean of each metric, best validation F1-score first
    d11 = grouped.mean().sort_values(by=['F1-score - validation'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d11 = grouped.agg(["mean", "std"])
dataset_list.append(d11.round(2))
d11

In [None]:
# Long format (one row per experiment and metric) so that every validation metric
# can be drawn side by side for each model type
validation_metrics = ['Accuracy - validation', 'Specificity - validation', 'Precision - validation',
                      'Recall - validation', 'F1-score - validation',  'MCC - validation', 'AUC - validation']
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
results = pd.melt(no_reduction, id_vars=['Model type'], value_vars=validation_metrics, var_name='Metric')

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Model type", y="value", hue="Metric", orient="v", order=keys_order, ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Metrics_validation.pdf", bbox_inches="tight")

Plot model & training time

In [None]:
# Summary of the training time per model type (experiments without dimensionality reduction)
keep_columns = ["Model type", "Training time"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Model type"])
if export_content == "Rankings":
    # Mean training time, largest first
    d12 = grouped.mean().sort_values(by=['Training time'], ascending=False)
else:
    # Mean and sample standard deviation of the training time. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d12 = grouped.agg(["mean", "std"])
dataset_list.append(d12.round(2))
d12

In [None]:
# Training time per model type (experiments without dimensionality reduction)
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
sns.barplot(data=no_reduction, x="Model type", y="Training time", order=keys_order, ax=ax)
ax.set(ylabel='Training time (s)')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Training_time.pdf", bbox_inches="tight")

Plot train/test split

In [None]:
# Summary of the test metrics per train/test split (experiments without dimensionality reduction)
keep_columns = ["Train/Test split", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test", "MCC - test", "AUC - test"]
no_reduction_mask = dataset["Dimensionality reduction technique"] == "N.A."
grouped = dataset.loc[no_reduction_mask, keep_columns].groupby(["Train/Test split"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d13 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d13 = grouped.agg(["mean", "std"])
dataset_list.append(d13.round(2))
d13

In [None]:
# Long format (one row per experiment and metric) so that every test metric
# can be drawn side by side for each train/test split
test_metrics = ['Accuracy - test', 'Specificity - test', 'Precision - test', 'Recall - test',
                'F1-score - test', 'MCC - test', 'AUC - test']
no_reduction = dataset[dataset["Dimensionality reduction technique"] == "N.A."]
results = pd.melt(no_reduction, id_vars=['Train/Test split'], value_vars=test_metrics, var_name='Metric')

sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(9, 6))
sns.barplot(data=results, x="Train/Test split", y="value", hue="Metric", orient="v", ax=ax)
ax.legend(loc='lower left')
if to_export:
    fig.savefig(OUTPUT_PATH + "/Classification_metrics/Train_test_split.pdf", bbox_inches="tight")

Plot PCA number of components

In [None]:
# PCA experiments run on one specific dataset
is_pca = dataset['Dimensionality reduction technique'] == "PCA"
is_target_dataset = dataset['Dataset - train/test'] == "filtered_active_bankruptcy_raw_full_history4_0.pkl"
selected_dataset = dataset[is_pca & is_target_dataset]

# Nothing is drawn when no experiment matches
if len(selected_dataset) != 0:
    sns.set_theme(style="white")
    fig, ax = plt.subplots(figsize=(9, 6))
    sns.barplot(data=selected_dataset, x="Number of features", y="F1-score - test",
                hue="Model type", orient="v", ax=ax)
    ax.legend(loc='lower left')
    if to_export:
        fig.savefig(OUTPUT_PATH + "/Classification_metrics/PCA_num_features_score_test.pdf", bbox_inches="tight")

In [None]:
# Plot only if not empty
if selected_dataset.shape[0] != 0:
    # One dictionary {component name: importance} per experiment that reports importances
    dict_list = selected_dataset[(selected_dataset["Features importance"] != "N.A.")]["Features importance"].tolist()

    # Accumulate, per component, the total importance and the number of experiments reporting it.
    # Single pass; the accumulators are deliberately not called `sum` / `mean`, which would
    # shadow the builtin `sum` for every later cell of the notebook.
    importance_totals = {}
    importance_counts = {}
    for dictionary in dict_list:
        for key, value in dictionary.items():
            importance_totals[key] = importance_totals.get(key, 0) + value
            importance_counts[key] = importance_counts.get(key, 0) + 1

    # Mean importance of each component
    mean_importance = {key: importance_totals[key] / importance_counts[key] for key in importance_totals}

    # Order the dictionary by value, most important first
    ordered_mean = dict(sorted(mean_importance.items(), key=lambda item: item[1], reverse=True))

    # Take the 20 most important components
    keys = list(ordered_mean.keys())[:20]
    top_keys = set(keys)  # constant-time membership test in the loop below

    # Collect every individual importance value of the 20 selected components
    # (all values, not just the mean, are handed to the bar plot)
    feature_importance = []
    feature_names = []
    for dictionary in dict_list:
        for key, value in dictionary.items():
            if key in top_keys:
                feature_names.append(key)
                feature_importance.append(value)

    # Plot the results
    sns.set_theme(style="white")
    plt.subplots(figsize=(9, 5))
    ax = sns.barplot(x=feature_importance, y=feature_names, order=keys)
    ax.set(xlabel='Component importance')

    if to_export:
        plt.savefig(OUTPUT_PATH + "/Classification_metrics/PCA_component_importance.pdf", bbox_inches="tight")

Plot feature importance

In [None]:
# One dictionary {feature name: importance} per experiment that reports importances
# and was run without dimensionality reduction
dict_list = dataset[(dataset["Features importance"] != "N.A.") & (dataset["Dimensionality reduction technique"] == "N.A.")]["Features importance"].tolist()

# Accumulate, per feature, the total importance and the number of experiments reporting it.
# Single pass; the accumulators are deliberately not called `sum` / `mean`, which would
# shadow the builtin `sum` for every later cell of the notebook.
importance_totals = {}
importance_counts = {}
for dictionary in dict_list:
    for key, value in dictionary.items():
        importance_totals[key] = importance_totals.get(key, 0) + value
        importance_counts[key] = importance_counts.get(key, 0) + 1

# Mean importance of each feature
mean_importance = {key: importance_totals[key] / importance_counts[key] for key in importance_totals}

# Order the dictionary by value, most important first
ordered_mean = dict(sorted(mean_importance.items(), key=lambda item: item[1], reverse=True))

# Take the 50 most important features
keys = list(ordered_mean.keys())[:50]
top_keys = set(keys)  # constant-time membership test in the loop below

# Collect every individual importance value of the 50 selected features
# (all values, not just the mean, are handed to the bar plot)
feature_importance = []
feature_names = []
for dictionary in dict_list:
    for key, value in dictionary.items():
        if key in top_keys:
            feature_names.append(key)
            feature_importance.append(value)

# Plot the results
sns.set_theme(style="white")
plt.subplots(figsize=(9, 15))
ax = sns.barplot(x=feature_importance, y=feature_names, order=keys)
ax.set(xlabel='Feature importance')

if to_export:
    plt.savefig(OUTPUT_PATH + "/Classification_metrics/Feature_importance.pdf", bbox_inches="tight")

In [None]:
# Table with one row per individual importance value of the selected features
feature_dataset = pd.DataFrame({
    "Feature name": feature_names,
    "Feature importance": feature_importance,
})
grouped = feature_dataset.groupby(["Feature name"])
if export_content == "Rankings":
    # Mean importance per feature, most important first
    d14 = grouped.mean().sort_values(by=['Feature importance'], ascending=False)
else:
    # Mean and sample standard deviation per feature, ordered by the mean. The string aliases select
    # pandas' own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d14 = grouped.agg(["mean", "std"]).sort_values(by=[('Feature importance', 'mean')], ascending=False)
dataset_list.append(d14.round(2))
d14

Plot feature importance financial estimators

In [None]:
# Collect only the financial estimators names.
# Raw strings: the pattern contains regex escapes (\+, \(, \/) that are invalid escape
# sequences in a normal string literal and trigger a warning on recent Python versions.
# The resulting pattern text is unchanged.
# NOTE(review): each trailing `*` quantifies only the character before it (e.g. `ROS*` also
# matches names that merely start with "RO"), and `.` matches any character - confirm that
# this is the intended matching.
financial_estimator_names_regex = (
    r"PN/TOTALE DEBITI*|DEB. PREV \+ TRIB/ATTIVO*|TEMPO MEDIO RISCOSSIONE \(TMR\)*|TEMPO MEDIO DI PAGAMENTO \(TMP\)|"
    r"PFN\/EBITDA*|PFN\/PN*|GEARING*|ROS*|WORKING CAPITAL\/NET SALES*|CASH\/CURRENT LIABILITIES*|"
    r"ACCOUNTS RECEIVABLE\/INVENTORY*|EBIT\/INTEREST EXPENSES*|ATT.BR\/ATTIVO*|RICAVI\/ATTIVO*|EBITDA\/TOTALE DEBITI*"
)
financial_estimator_pattern = re.compile(financial_estimator_names_regex, re.IGNORECASE)
# `match` anchors at the start of the name, so a feature is selected when its name begins with one of the alternatives
financial_estimator_keys = [key for key in ordered_mean if financial_estimator_pattern.match(key)]
financial_estimator_key_set = set(financial_estimator_keys)  # constant-time membership tests

# Dictionary {feature name: mean importance} restricted to the financial estimators.
# It depends only on `ordered_mean`, so it is built once instead of once per experiment.
financial_estimator_dictionary = {key: value for key, value in ordered_mean.items()
                                  if key in financial_estimator_key_set}

# Order by feature importance
financial_estimator_dictionary = dict(sorted(financial_estimator_dictionary.items(), key=lambda item: item[1], reverse=True))

# Collect every individual importance value of the financial estimators
# (all values, not just the mean, are handed to the bar plot)
financial_estimator_feature_importance = []
financial_estimator_feature_names = []
for dictionary in dict_list:
    for key, value in dictionary.items():
        if key in financial_estimator_key_set:
            financial_estimator_feature_names.append(key)
            financial_estimator_feature_importance.append(value)

# Plot the results
sns.set_theme(style="white")
plt.subplots(figsize=(9, 15))
ax = sns.barplot(x=financial_estimator_feature_importance, y=financial_estimator_feature_names,
                 order=list(financial_estimator_dictionary.keys()))
ax.set(xlabel='Feature importance')

if to_export:
    plt.savefig(OUTPUT_PATH + "/Classification_metrics/Financial_estimator_importance.pdf", bbox_inches="tight")

In [None]:
# Table with one row per individual importance value of the financial estimators
financial_estimator_feature_dataset = pd.DataFrame({
    "Feature name": financial_estimator_feature_names,
    "Feature importance": financial_estimator_feature_importance,
})
grouped = financial_estimator_feature_dataset.groupby(["Feature name"])
if export_content == "Rankings":
    # Mean importance per feature, most important first
    d15 = grouped.mean().sort_values(by=['Feature importance'], ascending=False)
else:
    # Mean and sample standard deviation per feature, ordered by the mean. The string aliases select
    # pandas' own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d15 = grouped.agg(["mean", "std"])\
        .sort_values(by=[('Feature importance', 'mean')], ascending=False)
dataset_list.append(d15.round(2))
d15

Plot feature importance raw values

In [None]:
# Dictionary {feature name: mean importance} restricted to the features that are not
# financial estimators. It depends only on `ordered_mean`, so it is built once instead
# of once per experiment.
financial_key_set = set(financial_estimator_keys)  # constant-time membership tests
raw_values_dictionary = {key: value for key, value in ordered_mean.items()
                         if key not in financial_key_set}

# Order by feature importance and keep the 50 most important raw values
raw_values_dictionary = dict(sorted(raw_values_dictionary.items(), key=lambda item: item[1], reverse=True))
raw_values_top_keys = list(raw_values_dictionary.keys())[:50]
raw_values_top_key_set = set(raw_values_top_keys)

# Collect every individual importance value of the selected raw values
# (all values, not just the mean, are handed to the bar plot)
raw_values_feature_importance = []
raw_values_feature_names = []
for dictionary in dict_list:
    for key, value in dictionary.items():
        if key in raw_values_top_key_set:
            raw_values_feature_names.append(key)
            raw_values_feature_importance.append(value)

# Plot the results
sns.set_theme(style="white")
plt.subplots(figsize=(9, 15))
ax = sns.barplot(x=raw_values_feature_importance, y=raw_values_feature_names, order=raw_values_top_keys)
ax.set(xlabel='Feature importance')

if to_export:
    plt.savefig(OUTPUT_PATH + "/Classification_metrics/Raw_values_importance.pdf", bbox_inches="tight")

Plot model type, imbalanced data technique, dataset train/test

In [None]:
# Summary of the test metrics per (model type, imbalanced data technique, dataset) triple,
# computed over all experiments
keep_columns = ["Model type", "Imbalanced data technique", "Dataset - train/test", "Accuracy - test", 'Specificity - test',
                "Precision - test", "Recall - test", "F1-score - test",  "MCC - test", "AUC - test"]
grouped = dataset[keep_columns].groupby(["Model type", "Imbalanced data technique", "Dataset - train/test"])
if export_content == "Rankings":
    # Mean of each metric, best F1-score first
    d16 = grouped.mean().sort_values(by=['F1-score - test'], ascending=False)
else:
    # Mean and sample standard deviation of each metric. The string aliases select pandas'
    # own implementations; passing np.mean / np.std is deprecated in recent pandas releases.
    d16 = grouped.agg(["mean", "std"])
dataset_list.append(d16.round(2))
d16

Esporto in formato xlsx se richiesto

In [None]:
# Function in order to generate a unique excel file with multiple sheets
# from multiple pandas datasets
def save_xls(list_dfs, xls_path, na_replace):
    """Write every DataFrame of ``list_dfs`` to its own sheet of one Excel workbook.

    Parameters
    ----------
    list_dfs : iterable of pandas.DataFrame
        Tables to export; the n-th table is written to the sheet named ``sheet<n>``
        (numbering starts at 0).
    xls_path : str
        Path of the workbook to create.
    na_replace : bool
        When true, missing values are written as "N.A."; otherwise they are
        written as empty cells (the ``to_excel`` default).
    """
    # Empty string is the default `na_rep` of `DataFrame.to_excel`
    na_rep = "N.A." if na_replace else ""
    with ExcelWriter(xls_path, engine="xlsxwriter") as writer:
        for n, df in enumerate(list_dfs):
            sheet_name = 'sheet%s' % n
            # `sheet_name` is passed by keyword: positional use is deprecated in recent pandas
            df.to_excel(writer, sheet_name=sheet_name, na_rep=na_rep)
            # Widen columns 0..10 to 35 so that long labels stay readable
            writer.sheets[sheet_name].set_column(0, 10, 35)

In [None]:
if to_export:
    # Full experiments table
    dataset.to_excel(OUTPUT_PATH + "/ML_model_experiments.xlsx", engine='xlsxwriter')
    # Missing values are written as "N.A." only in the rankings workbook
    na_rep = export_content == "Rankings"
    # Summary tables, one sheet each; the workbook is named after the exported content
    save_xls(dataset_list, OUTPUT_PATH + "/" + export_content.lower() + ".xlsx", na_replace=na_rep)