# Evaluation of different supervised classifiers

Librerie varie da installare

In [None]:
#!pip install pandas
#!pip install scikit-learn
#!pip install seaborn
#!pip install imblearn

Inclusione delle librerie utilizzate

In [None]:
from os.path import exists
from datetime import date
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

Variabili di gestione files, parametri del modello e della fase di training

In [None]:
# Path of the input dataset in .pkl format, can be changed
PATH_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\filtered_active_bankruptcy_small.pkl"

# True = standardize the features with StandardScaler, can be changed
to_standardize = True

# True = transform the dataset with PCA into an equivalent one with fewer features, can be changed
reduce_feature = False

# Number of features (PCA components) to keep when reduce_feature is True, can be changed
number_feature = 15

# True = oversample/undersample the least/most populated class (Bankruptcy), can be changed
avoid_imbalanced_training = True

# Either "Oversample" or "Undersample", can be changed.
# It only affects the notebook if avoid_imbalanced_training is True
imbalanced_data_technique = "Oversample"

# True = also use non-financial features ('Legal Form', 'Number of employees') besides the financial indexes
additional_features = True

# A value in [0, 1]: the fraction of records held out as test set (not used during training), can be changed
train_test_split_amount = 0.25

# Random state shared by the resampling, the split and the models to make the results reproducible, can be changed
rnd_state = 25

# Number of concurrent jobs used to speed up certain training phases.
# Specify -1 in order to use all the jobs available, the default one is 1, can be changed
n_jobs = -1

# True = export the current experiments into the dataframe that collects all of them, can be changed
to_export = False

# Directory holding the experiments dataset ("ML_model_experiments.pkl"), can be changed.
# If the file is not present, a new empty dataset is created and written at export time
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\ML_models"

Controllo se esiste il dataset di output

In [None]:
# The experiments dataset is only needed when the results are going to be exported
if to_export:
    experiments_file = OUTPUT_PATH + "/ML_model_experiments.pkl"
    if not exists(experiments_file):
        # No previous experiments: start from an empty dataframe
        output_dataset = pd.DataFrame()
        print("Dataset di output non trovato, creo un nuovo dataset")
    else:
        # Previous experiments found: load them so the new ones can be appended
        output_dataset = pd.read_pickle(experiments_file)
        print("Dataset di output trovato")

Lettura del dataset di input

In [None]:
# Load the pickled input dataset and report its size (rows, columns)
dataset = pd.read_pickle(PATH_DATASET)
n_records, n_columns = dataset.shape
print("Il dataset da utilizzare ha", n_records, "record e", n_columns, "colonne")

Suddivisione del dataset in X e Y, dove X sono le features in ingresso (indicatori finanziari) e Y è la risposta in output (attivo/bancarotta)

In [None]:
# Financial indicators used as input features (X)
X_features_names = ['PN/Totale Debiti',
                    'Deb. Prev + Trib/Attivo',
                    'Tempo medio riscossione (TMR)',
                    'Tempo medio di pagamento (TMP)',
                    'PFN/EBITDA',
                    'PFN/PN',
                    'Gearing',
                    'ROS',
                    'Working capital/net sales',
                    'Cash/Current Liabilities',
                    'Accounts receivable/inventory',
                    'EBIT/interest expenses',
                    'Att.Br/Attivo',
                    'Ricavi/Attivo',
                    'EBITDA/Totale Debiti']

# Optionally add the non-financial features
if additional_features:
    X_features_names.extend(['Legal Form', 'Number of employees'])
    #X_features_names.append('Company Size')

# Column holding the response (Y)
Y_feature_name = 'Legal Status'

# Work on a copy so the original dataset is left untouched
X_dataset = dataset[X_features_names].copy()

if additional_features:
    # One hot encoding: 'Legal Form' is replaced by one indicator column per distinct value
    X_dataset = X_dataset.join(pd.get_dummies(dataset['Legal Form']))
    X_dataset.drop('Legal Form', axis = 1, inplace=True)

    #X_dataset = X_dataset.join(pd.get_dummies(dataset['Company Size']))
    #X_dataset.drop('Company Size', axis = 1, inplace=True)

    # Save the new feature names (the indicator columns replace 'Legal Form')
    X_features_names = X_dataset.columns.to_list()

# Encode the response as 0 (Active) / 1 (Bankruptcy).
# An explicit element-wise mapping is used instead of Series.replace(..., inplace=True):
# replace() relies on the implicit object -> int downcast that pandas 2.2 deprecates, so
# on newer versions the target would not come out as an integer column.
# Values that are not in the mapping are passed through unchanged, as replace() did.
label_codes = {"Active": 0, "Bankruptcy": 1}
Y_dataset = dataset[Y_feature_name].map(lambda status: label_codes.get(status, status))

Esempio di record

In [None]:
pd.concat([X_dataset.head(1), Y_dataset.head(1)], axis=1)

Uso la tecnica di oversampling (SMOTE) o di random undersampling per evitare di allenare un modello con classi sbilanciate

In [None]:
# Rebalance the two classes, either by oversampling with SMOTE or by randomly
# undersampling the majority class.
# NOTE(review): the resampling is applied to the whole dataset, before the train/test
# split, so resampled (possibly synthetic) records can also end up in the test set.
# Consider resampling only the training portion.
if avoid_imbalanced_training:
    if imbalanced_data_technique == "Oversample":
        # n_jobs is intentionally not passed: the parameter is deprecated since
        # imbalanced-learn 0.10 and removed in later releases, where passing it fails
        sm = SMOTE(random_state=rnd_state)
        X_dataset, Y_dataset = sm.fit_resample(X_dataset, Y_dataset)
    elif imbalanced_data_technique == "Undersample":
        undersample = RandomUnderSampler(sampling_strategy='majority', random_state=rnd_state)
        X_dataset, Y_dataset = undersample.fit_resample(X_dataset, Y_dataset)
    else:
        # Fail loudly: silently continuing would train on imbalanced data while the
        # exported experiment record claims that a correction was applied
        raise ValueError(
            "Error: wrong variable value about imbalanced data: "
            f"{imbalanced_data_technique!r} (expected 'Oversample' or 'Undersample')"
        )

Standardizzo i dati contenuti in X

In [None]:
# Standardize the features; the fitted scaler is kept in `scaler`
if to_standardize:
    scaler = StandardScaler().fit(X_dataset)
    X_dataset = scaler.transform(X_dataset)

Applico la SVD del dataset con PCA

In [None]:
# Project the dataset onto `number_feature` principal components.
# The shared random state is passed so that the result is reproducible also when a
# randomized SVD solver is used, like every other stochastic step of the notebook.
if reduce_feature:
    pca = PCA(n_components=number_feature, random_state=rnd_state)
    X_dataset = pca.fit_transform(X_dataset)

Divido i 2 dataset in train e test

In [None]:
# Hold out `train_test_split_amount` of the records as test set; the split is
# stratified on Y and seeded with the shared random state
X_train, X_test, Y_train, Y_test = train_test_split(
    X_dataset,
    Y_dataset,
    test_size=train_test_split_amount,
    random_state=rnd_state,
    stratify=Y_dataset,
)

Confronto vari modelli supervisionati

In [None]:
# Hyperparameter grids: one dict of constructor keyword arguments per experiment
log_reg_params = [{"C": c} for c in (0.01, 0.1, 1, 10)]
dec_tree_params = [{"criterion": criterion} for criterion in ("gini", "entropy")]
rand_for_params = [{"criterion": criterion, "n_jobs": n_jobs} for criterion in ("gini", "entropy")]
kneighbors_params = [{"n_neighbors": k, "n_jobs": n_jobs} for k in (3, 5)]
naive_bayes_params = [{}]
svc_params = [{"C": c} for c in (0.01, 0.1, 1, 10)]

# Models to evaluate, as [display name, estimator class, hyperparameter grid].
# Comment/uncomment an entry to exclude/include the model in the comparison
supervised_models = [
    ["Logistic Regression", LogisticRegression, log_reg_params],
    ["Decision Tree", DecisionTreeClassifier, dec_tree_params],
    ["Random Forest", RandomForestClassifier, rand_for_params],
   # ["K Neighbors", KNeighborsClassifier, kneighbors_params],
   # ["Naive Bayes", GaussianNB, naive_bayes_params],
   # ["Support Vector Machine", SVC, svc_params]
]

In [None]:
# Train every configured model with every hyperparameter set, evaluate it on the
# test set and, if requested, append one record per experiment to output_dataset.

# Not filled by this cell; kept so that code relying on the name keeps working
scores = []

# Display names of the models whose constructor must not receive random_state
models_without_random_state = ("K Neighbors", "Naive Bayes")

if to_export:
    # Values that are identical for every experiment: compute them once.
    # The file name is extracted accepting both "\" and "/" as path separators.
    data_source = PATH_DATASET.replace("\\", "/").rsplit("/", 1)[-1]
    class_counts = Y_dataset.value_counts()

for model_name, Model, params_list in supervised_models:
    for params in params_list:
        # Define model
        if model_name in models_without_random_state:
            model = Model(**params)
        else:
            model = Model(**params, random_state=rnd_state)
        # Train the model
        model.fit(X_train, Y_train)

        Y_predicted = model.predict(X_test)

        # Compute different metrics
        accuracy = accuracy_score(Y_test, Y_predicted)
        precision = precision_score(Y_test, Y_predicted)
        recall = recall_score(Y_test, Y_predicted)
        f1 = f1_score(Y_test, Y_predicted)

        # Scores of the positive class (1 = Bankruptcy) for the ROC curve.
        # Some estimators (e.g. SVC built without probability=True) do not expose
        # predict_proba: fall back to the decision function, which roc_curve accepts too
        if hasattr(model, "predict_proba"):
            Y_probabilities = model.predict_proba(X_test)
            preds = Y_probabilities[:,1]
        else:
            preds = model.decision_function(X_test)
        fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
        roc_auc = metrics.auc(fpr, tpr)

        if to_export:
            # Create a new record
            new_record = pd.DataFrame({"Date": [date.today()],
                                       "Model type": [model_name],
                                       "Model parameters": [model.get_params()],
                                       "Data source": [data_source],
                                       "Features": [X_features_names],
                                       "Standardized": [to_standardize],
                                       "Imbalanced data corrections": [avoid_imbalanced_training],
                                       "Imbalanced data technique": [imbalanced_data_technique],
                                       "Number of active companies used": [class_counts[0]],
                                       "Number of bankruptcy companies used": [class_counts[1]],
                                       "Train/Test split": [train_test_split_amount],
                                       "Random state": [rnd_state],
                                       "Accuracy": [accuracy],
                                       "Precision": [precision],
                                       "Recall": [recall],
                                       "F1-score": [f1],
                                       "AUC": [roc_auc]})
            # Append the new record right away, so the experiments already evaluated
            # are kept in output_dataset even if a later model fails
            output_dataset = pd.concat([output_dataset, new_record], ignore_index=True, axis=0)

        # Friendly print to keep track of the evaluation stage
        print(model_name, params, accuracy)

Esporto i nuovi esperimenti

In [None]:
# Persist the collected experiments (previous ones plus the ones of this run)
if to_export:
    experiments_file = OUTPUT_PATH + "/ML_model_experiments.pkl"
    output_dataset.to_pickle(experiments_file)
    print("Esprimenti aggiunti al dataset")