# Primo prototipo di ML automatico con la libreria TPOT

Librerie da installare

In [None]:
# Uncomment the lines below to install the packages this notebook needs.
#!pip install tpot
#!pip install pandas

Inclusione delle librerie utilizzate

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

Variabili per la gestione dei file e parametri per TPOT

In [None]:
# Path of the dataset in .pkl format, can be changed.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
PATH_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\filtered_active_bankruptcy_raw_full_2.pkl"

# True = also use the non-financial features (legal form, company size,
# number of employees); False = drop them from the feature matrix.
additional_features = False

# Fraction in [0, 1] of the records held out as the test set (i.e. not used
# during training), can be changed.
train_test_split_amount = 0.25

# Seed used to control the randomness of the split and of the TPOT search,
# can be changed.
rnd_state = 25

# Number of concurrent jobs used to speed up certain training phases.
# Use -1 to use all the available jobs; the library default is 1. Can be changed.
n_jobs = 6

# Example of a custom TPOT configuration dictionary (see the TPOT API).
# It is kept as comments, not as a string literal, so this cell produces no
# output. It is not passed to TPOTClassifier anywhere in this notebook, and its
# preprocessor entries reference imblearn samplers.
#
# classifier_config_dict = {
#     # Classifiers
#     'sklearn.ensemble.RandomForestClassifier': {
#         'n_estimators': [100],
#         'criterion': ["gini", "entropy"],
#         'max_features': np.arange(0.05, 1.01, 0.05),
#         'min_samples_split': range(2, 21),
#         'min_samples_leaf': range(1, 21),
#         'bootstrap': [True, False]
#     },
#
#     # Preprocessors
#     'imblearn.over_sampling.RandomOverSampler': {
#     },
#
#     'imblearn.under_sampling.RandomUnderSampler':{
#     }
# }

Lettura del dataset di input

In [None]:
# Load the pickled dataset from the configured path.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# come from a trusted source.
dataset = pd.read_pickle(PATH_DATASET)

# Report how many records (rows) and columns were loaded.
record_count, column_count = dataset.shape[0], dataset.shape[1]
print("Il dataset da utilizzare ha", record_count, "record e", column_count, "colonne")

Suddivisione del dataset in X e Y, dove X sono le features in ingresso e Y è la risposta in output (attivo/bancarotta)

In [None]:
# Columns that never enter the feature matrix: descriptive fields plus the
# response column ("Legal Status"), which is extracted separately as Y below.
removed_columns = [
    "Ragione sociale",
    "Province",
    "Accounting closing date",
    "Legal Status"
]

# Categorical non-financial features; they are one-hot encoded when kept.
categorical_features = ["Legal Form", "Company Size"]

if not additional_features:
    # Financial indexes only: also discard every non-financial feature.
    removed_columns += categorical_features + ["Number of employees"]

# X dataset. drop() returns a new frame, so `dataset` is left untouched and
# this cell can be re-run safely.
X_dataset = dataset.drop(columns=removed_columns)

# Manage additional features
if additional_features:
    # One-hot encode the categorical features directly inside the frame.
    # This avoids an index-based join (which duplicates rows when the index is
    # not unique) and prefixes every dummy column with its source column name,
    # so categories from different features cannot collide.
    X_dataset = pd.get_dummies(X_dataset, columns=categorical_features)

# Save the new feature names
X_features_names = X_dataset.columns.to_list()

# Y dataset: the response column, copied so later edits do not touch `dataset`.
Y_dataset = dataset['Legal Status'].copy()

Stampo i primi record dei 2 nuovi dataset per chiarezza

In [None]:
# Preview the first rows of the feature matrix.
X_dataset.head()

In [None]:
# Preview the first values of the response column.
Y_dataset.head()

Codifico la variabile di risposta (Active/Bankruptcy) in (0/1)

In [None]:
# Encode the response variable: Active -> 0, Bankruptcy -> 1.
status_codes = {"Active": 0, "Bankruptcy": 1}

# Build the encoded series from the raw column instead of mutating Y_dataset
# in place, so re-running this cell always gives the same result.
Y_dataset = dataset['Legal Status'].map(status_codes)

# map() turns every label missing from status_codes into NaN. Fail loudly here
# instead of handing an incomplete target to the train/test split.
unmapped_mask = Y_dataset.isna().to_numpy()
if unmapped_mask.any():
    unexpected_labels = dataset['Legal Status'][unmapped_mask].unique().tolist()
    raise ValueError(f"Unexpected 'Legal Status' values: {unexpected_labels}")

# Explicit integer dtype, so the result does not depend on pandas' implicit
# downcasting of object columns.
Y_dataset = Y_dataset.astype(int)
Y_dataset.head()

Divido i 2 dataset in train e test

In [None]:
# Split features and response into train and test parts. The split is
# stratified on the response, and its size and seed come from the
# configuration variables.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_dataset,
    Y_dataset,
    test_size=train_test_split_amount,
    random_state=rnd_state,
    stratify=Y_dataset,
)

Classificatore TPOT

In [None]:
# Configure the TPOT search. Parallelism and seed come from the configuration
# variables; the metric and the max_time_mins budget are set here.
tpot = TPOTClassifier(
    scoring="balanced_accuracy",
    max_time_mins=10,
    n_jobs=n_jobs,
    random_state=rnd_state,
    verbosity=3,
)

# Run the search on the training split only; the test split stays unseen.
tpot.fit(X_train, Y_train)

Balanced accuracy ottenuta sul test set dal miglior modello trovato da TPOT

In [None]:
# Score the fitted TPOT object on the held-out test split.
# NOTE(review): presumably this reports the metric passed as `scoring`
# ("balanced_accuracy"), not plain accuracy. Confirm against the TPOT docs.
print(tpot.score(X_test, Y_test))

Miglior modello trovato (score maggiore)

In [None]:
# Print the pipeline stored in `fitted_pipeline_` after the TPOT search.
print(tpot.fitted_pipeline_)