# Primo prototipo di modello ML: decision tree

Librerie varie da installare

In [None]:
# Optional: uncomment to install the dependencies into the kernel's environment.
# The PyPI distributions are named "scikit-learn" and "imbalanced-learn";
# "sklearn" and "imblearn" are only the import names used by this notebook.
#%pip install pandas
#%pip install scikit-learn
#%pip install seaborn
#%pip install imbalanced-learn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

Variabili di gestione files

In [None]:
# Path of the dataset in .pkl (pickle) format, can be changed.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs as-is on the original machine; point it at your local copy of the file.
PATH_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\filtered_active_bankruptcy_small.pkl"

# True = standardize the feature matrix with StandardScaler, can be changed
to_standardize = False

# True = oversample the least populated class (Bankruptcy) with SMOTE, can be changed
avoid_imbalanced_training = True

# Seed forwarded to the randomized steps of the notebook (e.g. SMOTE and
# train_test_split) so that their results are repeatable, can be changed
rnd_state = 25

Lettura del dataset

In [None]:
# Load the pickled dataset and report how many records and columns it holds.
# Unpickling can execute arbitrary code: only load files from a trusted source.
dataset = pd.read_pickle(PATH_DATASET)
n_records, n_columns = dataset.shape
print("Il dataset da utilizzare ha", n_records, "record e", n_columns, "colonne")

Suddivisione del dataset in X e Y, dove X sono le features in ingresso (indicatori finanziari) e Y è la risposta in output (attivo/bancarotta)

In [None]:
# Input features: the financial-indicator columns fed to the model.
X_features_names = [
    'PN/Totale Debiti',
    'Deb. Prev + Trib/Attivo',
    'Tempo medio riscossione (TMR)',
    'Tempo medio di pagamento (TMP)',
    'PFN/EBITDA',
    'PFN/PN',
    'Gearing',
    'ROS',
    'Working capital/net sales',
    'Cash/Current Liabilities',
    'Accounts receivable/inventory',
    'EBIT/interest expenses',
    'Att.Br/Attivo',
    'Ricavi/Attivo',
    'EBITDA/Totale Debiti',
]

# Response column: the company status to predict.
Y_feature_name = 'Legal Status'

# Split the dataset into the feature matrix (X) and the response (Y).
X_dataset = dataset[X_features_names]
Y_dataset = dataset[Y_feature_name]

Stampo i primi record dei 2 nuovi dataset per chiarezza

In [None]:
# Preview the first rows of the feature matrix.
X_dataset.head()

In [None]:
# Preview the first values of the response column (still the raw labels here).
Y_dataset.head()

Codifico la variabile di risposta (Active/Bankruptcy) in (0/1)

In [None]:
# Encode the response labels as integers: Active -> 0, Bankruptcy -> 1.
# The encoded Series is built from `dataset` with an explicit mapping instead of
# an in-place `replace`, so no Series taken from `dataset` is mutated and the
# result has an integer dtype (`replace` on a string column relies on an
# object-to-int downcast that pandas has deprecated).
legal_status_codes = {"Active": 0, "Bankruptcy": 1}
Y_dataset = dataset[Y_feature_name].map(legal_status_codes)

# Labels missing from the mapping become NaN: fail loudly instead of training on them.
if Y_dataset.isna().any():
    unexpected = dataset.loc[Y_dataset.isna(), Y_feature_name].unique().tolist()
    raise ValueError(f"Unexpected values in '{Y_feature_name}': {unexpected}")

Y_dataset = Y_dataset.astype(int)
Y_dataset.head()

Controllo il numero di record per ciascuna classe

In [None]:
# Count the records in each class to see how imbalanced the response is.
Y_dataset.value_counts()

Uso la tecnica di oversampling SMOTE (che genera campioni sintetici della classe minoritaria) per evitare di allenare il modello su classi sbilanciate

In [None]:
# When enabled, rebalance the classes with SMOTE (seeded with rnd_state).
# NOTE(review): the resampling is applied to the whole dataset, before the
# train/test split performed later in the notebook, so the test split will also
# contain SMOTE-generated samples. Consider splitting first and resampling only
# the training portion so that the evaluation reflects the original data.
if avoid_imbalanced_training:
    sm = SMOTE(random_state=rnd_state)
    X_dataset, Y_dataset = sm.fit_resample(X_dataset, Y_dataset)
# Show the class counts after the (optional) resampling.
Y_dataset.value_counts()

Standardizzo i dati contenuti in X

In [None]:
# When enabled, rescale the features with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later, so test rows contribute to the fitted
# statistics; fitting on the training split only would avoid this.
# NOTE(review): fit_transform presumably returns a NumPy array here
# (scikit-learn's default output), replacing the DataFrame - confirm.
if to_standardize:
    scaler = StandardScaler()
    X_dataset = scaler.fit_transform(X_dataset)

Divido i 2 dataset in train e test

In [None]:
# Hold out 30% of the records for testing; stratifying on Y keeps the class
# proportions equal in both splits, and rnd_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_dataset,
    Y_dataset,
    test_size=0.30,
    stratify=Y_dataset,
    random_state=rnd_state,
)

Creo primo prototipo di decision tree e lo alleno sui dati di train

In [None]:
# Build the first decision-tree prototype and fit it on the training split.
# The tree is seeded with rnd_state like the other randomized steps:
# scikit-learn randomly permutes the candidate features at every split, so an
# unseeded tree can differ from one run to the next on the same data.
decision_tree_classifier = tree.DecisionTreeClassifier(random_state=rnd_state)
decision_tree_classifier.fit(X_train, Y_train)

Guardo come si comporta sui dati di test che il modello non ha mai visto

In [None]:
# Predict the test split and report the fraction of correctly classified records.
Y_predicted = decision_tree_classifier.predict(X_test)
score = accuracy_score(y_true=Y_test, y_pred=Y_predicted)
print("L'accuratezza è", score)

In [None]:
# Confusion matrix on the test split, normalized per true class so that every
# row sums to 1 (rows = true labels, columns = predicted labels).
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_predicted).astype(np.float64)
# keepdims=True keeps the row totals as a column vector of shape (n_classes, 1).
# A flat (n_classes,) vector would broadcast across columns instead, dividing
# element [i, j] by the total of row j rather than row i.
conf_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)

# Draw the normalized matrix as an annotated heatmap on a fixed 0-1 colour scale.
fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, vmin=0.0, vmax=1.0, fmt=".2f", cmap="Blues", ax=ax)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Tick order follows the integer encoding of the response: 0 = Active, 1 = Bankruptcy.
ax.xaxis.set_ticklabels(['Active', 'Bankruptcy'])
ax.yaxis.set_ticklabels(['Active', 'Bankruptcy']);

Stampo in formato testuale il decision tree

In [None]:
# Export the fitted tree as indented text rules, labelled with the feature names.
text_representation = tree.export_text(
    decision_tree_classifier,
    feature_names=X_features_names,
)
print(text_representation)