# Secondo prototipo di modello ML: random forest

Librerie varie da installare

In [None]:
# Uncomment to install the dependencies into the kernel's environment.
# NOTE(review): on PyPI the distributions are named "scikit-learn" and
# "imbalanced-learn"; "sklearn" / "imblearn" are only the import names.
#%pip install pandas
#%pip install scikit-learn
#%pip install seaborn
#%pip install imbalanced-learn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

Variabili di gestione files, parametri del modello e della fase di training

In [None]:
# Path of the pickled dataset (.pkl), can be changed.
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run on another machine.
PATH_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\filtered_active_bankruptcy_big.pkl"

# True = standardize the features with StandardScaler, can be changed
to_standardize = True

# True = rebalance the classes by oversampling the least populated one or
# undersampling the most populated one, can be changed
avoid_imbalanced_training = True

# "Oversample" (SMOTE) or "Undersample" (RandomUnderSampler), can be changed.
# It only affects the notebook if avoid_imbalanced_training is True
imbalanced_data_technique = "Undersample"

# True = also use non-financial features; currently only the one-hot encoded
# 'Legal Form' is added ('Company Size' is commented out in the feature cell)
additional_features = True

# Fraction of the records held out for testing (passed as test_size to
# train_test_split), can be changed
train_test_split_amount = 0.25

# Seed shared by every stochastic step so that runs are reproducible, can be changed
rnd_state = 25

# Number of concurrent jobs used to speed up certain training phases.
# Use -1 to use all the available jobs; the library default is 1, can be changed
n_jobs = 6

Lettura del dataset

In [None]:
# Load the pickled DataFrame and report its size.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
dataset = pd.read_pickle(PATH_DATASET)
n_records, n_columns = dataset.shape
print("Il dataset da utilizzare ha", n_records, "record e", n_columns, "colonne")

Suddivisione del dataset in X e Y, dove X sono le features in ingresso (indicatori finanziari) e Y è la risposta in output (attivo/bancarotta)

In [None]:
# Financial indicators used as model inputs
X_features_names = [
    'PN/Totale Debiti',
    'Deb. Prev + Trib/Attivo',
    'Tempo medio riscossione (TMR)',
    'Tempo medio di pagamento (TMP)',
    'PFN/EBITDA',
    'PFN/PN',
    'Gearing',
    'ROS',
    'Working capital/net sales',
    'Cash/Current Liabilities',
    'Accounts receivable/inventory',
    'EBIT/interest expenses',
    'Att.Br/Attivo',
    'Ricavi/Attivo',
    'EBITDA/Totale Debiti',
]

# Name of the response column
Y_feature_name = 'Legal Status'

if additional_features:
    # Also select the categorical 'Legal Form' column, then swap it for its
    # one-hot encoding (one indicator column per distinct value).
    # 'Company Size' is currently disabled; it would be handled the same way.
    X_features_names.append('Legal Form')
    legal_form_dummies = pd.get_dummies(dataset['Legal Form'])
    X_dataset = (
        dataset[X_features_names]
        .join(legal_form_dummies)
        .drop('Legal Form', axis=1)
    )
    # Keep the names in sync with the encoded columns
    X_features_names = X_dataset.columns.to_list()
else:
    X_dataset = dataset[X_features_names].copy()

Y_dataset = dataset[Y_feature_name].copy()



Stampo i primi record dei 2 nuovi dataset per chiarezza

In [None]:
# Preview the first rows of the feature matrix
X_dataset.head()

In [None]:
# Preview the first values of the response column
Y_dataset.head()

Codifico la variabile di risposta (Active/Bankruptcy) in (0/1)

In [None]:
# Encode the response as integers: Active -> 0, Bankruptcy -> 1.
# An explicit map + cast is used instead of `replace(..., inplace=True)`:
# `replace` relied on pandas silently downcasting the object column to int,
# which newer pandas releases deprecate, and an object-dtype target is not
# accepted by scikit-learn classifiers.
label_encoding = {"Active": 0, "Bankruptcy": 1}

# Fail early, with a readable message, on labels the mapping does not cover
unexpected_labels = (
    set(Y_dataset.unique()) - set(label_encoding) - set(label_encoding.values())
)
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in '{Y_dataset.name}': {sorted(unexpected_labels, key=str)}"
    )

# Values that are already 0/1 are passed through, so re-running the cell is harmless
Y_dataset = Y_dataset.map(lambda label: label_encoding.get(label, label)).astype("int64")
Y_dataset.head()

Controllo il numero di record per ciascuna classe

In [None]:
# Count the records in each class (0 = Active, 1 = Bankruptcy)
Y_dataset.value_counts()

Uso l'oversampling (SMOTE) oppure il random undersampling per evitare di allenare il modello con classi sbilanciate

In [None]:
# Rebalance the two classes so the model is not trained on skewed data.
# NOTE(review): resampling is applied to the whole dataset, i.e. before the
# train/test split, so the test set is resampled too (with SMOTE it will also
# contain synthetic records). Consider resampling only the training portion.
if avoid_imbalanced_training:
    if imbalanced_data_technique == "Oversample":
        # SMOTE's own `n_jobs` argument was deprecated and later removed in
        # imbalanced-learn, so it is not passed; it only affected speed.
        sampler = SMOTE(random_state=rnd_state)
    elif imbalanced_data_technique == "Undersample":
        # Randomly drop records of the majority class only
        sampler = RandomUnderSampler(sampling_strategy='majority', random_state=rnd_state)
    else:
        # Stop here instead of silently training on unbalanced data
        raise ValueError(
            "Error: wrong variable value about imbalanced data: "
            f"{imbalanced_data_technique!r} (expected 'Oversample' or 'Undersample')"
        )
    X_dataset, Y_dataset = sampler.fit_resample(X_dataset, Y_dataset)

# Class counts after the (optional) resampling
Y_dataset.value_counts()

Standardizzo i dati contenuti in X

In [None]:
# Optionally rescale the features with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test-set statistics influence the scaling.
if to_standardize:
    scaler = StandardScaler().fit(X_dataset)
    # From here on X_dataset holds the scaler's output rather than the original frame
    X_dataset = scaler.transform(X_dataset)

Divido i 2 dataset in train e test

In [None]:
# Hold out a test partition; the remaining records are used for training
split_options = {
    "test_size": train_test_split_amount,
    "random_state": rnd_state,
    "stratify": Y_dataset,  # stratify the split on the class labels
}
X_train, X_test, Y_train, Y_test = train_test_split(X_dataset, Y_dataset, **split_options)

Creo un primo prototipo di random forest e lo alleno sui dati di train

In [None]:
# Baseline random forest: only the seed and the parallelism are set explicitly,
# every other hyperparameter keeps the library default
random_forest_classifier = RandomForestClassifier(random_state=rnd_state, n_jobs=n_jobs)
# Fit on the training partition; as the last expression of the cell, the
# fitted estimator is also displayed
random_forest_classifier.fit(X_train, Y_train)

Guardo come si comporta sui dati di test che il modello non ha mai visto

In [None]:
# Predict the held-out records with the baseline forest
Y_predicted = random_forest_classifier.predict(X_test)
# Accuracy of the predictions against the true test labels
score = accuracy_score(Y_test, Y_predicted)

print("L'accuratezza è", score)

Curva ROC

In [None]:
# ROC curve of the baseline forest on the test partition
probs = random_forest_classifier.predict_proba(X_test)
# Column 1 is the probability of the second class (label 1 with the 0/1 encoding)
preds = probs[:, 1]
fpr, tpr, threshold = metrics.roc_curve(Y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
# Dashed diagonal drawn as a reference line
ax.plot([0, 1], [0, 1], 'r--')
ax.set_title('Receiver Operating Characteristic')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
# Only the labelled ROC line appears in the legend
ax.legend(loc='lower right')
plt.show()

Matrice di confusione

In [None]:
# Confusion matrix normalised per true class, so each row sums to 1.
# `labels=[0, 1]` pins the row/column order to the tick labels set below.
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=Y_predicted, labels=[0, 1])
# keepdims=True keeps the totals as a column vector, so every ROW is divided
# by its own total; a flat vector would broadcast across the columns instead
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix = conf_matrix / row_totals

fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, vmin=0.0, vmax=1.0, fmt=".2f", cmap="Blues", ax=ax)

ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Active', 'Bankruptcy'])
ax.yaxis.set_ticklabels(['Active', 'Bankruptcy'])
plt.show()

Mostro l'importanza di ogni feature

In [None]:
# Horizontal bar chart of the importance assigned by the baseline forest to each feature
fig, ax = plt.subplots(figsize=(20, 10))
feat_importances = pd.Series(
    random_forest_classifier.feature_importances_,
    index=X_features_names,  # label the importances with the column names
)
feat_importances.plot.barh(ax=ax)

Provo utilizzando la tecnica del cross-validation score

In [None]:
# Repeat the evaluation with shuffled K-fold cross-validation for 5, 10, 15 and 20 folds.
# NOTE(review): X_dataset / Y_dataset are used as they stand at this point,
# i.e. after any resampling and scaling done on the whole dataset.
start_split = 5
end_split = 21  # exclusive upper bound of the range

# cross_val_score clones the estimator for every fold, so a single unfitted
# template can be shared by all the iterations
random_forest_classifier_cv = RandomForestClassifier(random_state=rnd_state, n_jobs=n_jobs)

for n_split in range(start_split, end_split, 5):
    # Fold generator for this number of splits
    cv = KFold(n_splits=n_split, shuffle=True, random_state=rnd_state)
    # One accuracy value per fold
    scores = cross_val_score(
        random_forest_classifier_cv,
        X_dataset,
        Y_dataset,
        cv=cv,
        scoring='accuracy',
        n_jobs=n_jobs,
    )
    # Report the mean accuracy over the folds
    print("L'accuratezza con", n_split, "split è", scores.mean())

Testiamo diverse random forest con iperparametri diversi (hyperparameter tuning)

In [None]:
# Each list contains all the values of a specific hyperparameter we want to test
random_forest_parameter_criterions = ["entropy", "gini"]
random_forest_parameter_min_samples_splits = [2, 5, 10, 100, 1000]
random_forest_parameter_min_samples_leaves = [1, 2, 5, 10, 100, 1000]

# One (criterion, min_samples_split, min_samples_leaf, accuracy) tuple per configuration
random_forest_scores = []

# NOTE(review): every configuration is scored on the test partition, so the
# best score is an optimistic estimate; consider selecting with
# cross-validation on the training data only.

# Iterate over every combination of the hyperparameters
for criterion in random_forest_parameter_criterions:
    for min_samples_split in random_forest_parameter_min_samples_splits:
        for min_samples_leaf in random_forest_parameter_min_samples_leaves:
            # Create the random forest with the desired hyperparameters
            test_random_forest = RandomForestClassifier(criterion=criterion,
                                                        min_samples_split=min_samples_split,
                                                        min_samples_leaf=min_samples_leaf,
                                                        random_state=rnd_state,
                                                        n_jobs=n_jobs)
            # Train the model
            test_random_forest.fit(X_train, Y_train)
            # Test the model. Dedicated names are used so the baseline's
            # `Y_predicted` and `score` are not overwritten by the sweep.
            tuned_predictions = test_random_forest.predict(X_test)
            # Obtain the accuracy
            tuned_score = accuracy_score(Y_test, tuned_predictions)
            # Add the result to the list
            random_forest_scores.append((criterion, min_samples_split, min_samples_leaf, tuned_score))
            # Print the result (unordered)
            print("Criterion:", criterion,
                  "min_samples_split:", min_samples_split,
                  "min_samples_leaf:", min_samples_leaf,
                  "--- SCORE:", tuned_score)

In [None]:
# Print the result (ordered)
# Rank the evaluated configurations from the best to the worst accuracy.
# The list is sorted in place; the accuracy is the fourth field of each tuple.
random_forest_scores.sort(key=lambda entry: entry[3], reverse=True)

# Print the result (ordered)
for criterion, min_samples_split, min_samples_leaf, score in random_forest_scores:
    report_fields = (
        "Criterion:", criterion,
        "min_samples_split:", min_samples_split,
        "min_samples_leaf:", min_samples_leaf,
        "--- SCORE:", score,
    )
    print(*report_fields)