# Complete dataset (active + bankrupt) focused on financial estimators and their history

Librerie varie da installare

In [None]:
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np

Variabili di gestione files

In [None]:
# Input pickle files (active and bankrupt companies); adjust to the local setup
PATH_ACTIVE_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\active_out_small_history.pkl"

PATH_BANKRUPT_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\bankruptcy_out_small_history.pkl"

# True = write the resulting datasets (csv and pickle) into OUTPUT_PATH
to_export = True

# Quantile used by the outlier filter (the bounds are derived from the quantile_amount and
# 1 - quantile_amount quantiles); recommended between 0.05 and 0.25
quantile_amount = 0.25

# True = replace missing values with 0.0 and keep every year column
# False = keep only keep_n_year years and remove the records with missing values
replace_na_value = False

# Number of years to be kept (between 2 and 4); only used when replace_na_value is False
keep_n_year = 2

# Directory where the output files are written, can be changed
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load the two input objects from their pickle files (they are used as DataFrames later on).
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
active_dataset = pd.read_pickle(PATH_ACTIVE_DATASET)
bankrupt_dataset = pd.read_pickle(PATH_BANKRUPT_DATASET)

Unisco i 2 dataset

In [None]:
# Stack the active and the bankrupt records into one frame, then turn the old
# per-file index into a regular column so that the rows get a fresh 0..n-1 index.
# NOTE(review): the column inserted by reset_index() shifts every column position
# by one; positional selections on complete_dataset depend on it.
complete_dataset = pd.concat([active_dataset, bankrupt_dataset]).reset_index()

Prelevo solo i campi degli indici finanziari

In [None]:
# Keep only the columns from position 46 onwards (selection is by position, not by name).
# NOTE(review): presumably these are the financial ratio columns; the offset 46 is
# hard-coded, so confirm it whenever the layout of the input files changes.
indexes_dataset = complete_dataset.iloc[: , 46:].copy()

Aggiungo i campi descrittivi

In [None]:
# Descriptive (non-ratio) fields taken from the full dataset
additional_columns = ["Legal Status", "Tax Code Number", "CCIAA Number"]

# Work on a copy so that indexes_dataset itself is left untouched
export_dataset = indexes_dataset.copy()
export_dataset[additional_columns] = complete_dataset.loc[:, additional_columns]

In [None]:
# Reorder (and restrict) the columns: descriptive fields first, then the ratios
leading_columns = [
    'Legal Status',
    'Tax Code Number',
    'CCIAA Number',
]
ratio_columns = [
    'PN/Totale Debiti',
    'Deb. Prev + Trib/Attivo',
    'Tempo medio riscossione (TMR)',
    'Tempo medio di pagamento (TMP)',
    'PFN/EBITDA',
    'PFN/PN',
    'Gearing',
    'ROS',
    'Working capital/net sales',
    'Cash/Current Liabilities',
    'Accounts receivable/inventory',
    'EBIT/interest expenses',
    'Att.Br/Attivo',
    'Ricavi/Attivo',
    'EBITDA/Totale Debiti',
]
export_dataset = export_dataset[leading_columns + ratio_columns]

In [None]:
# Preview the first rows to visually check the columns
export_dataset.head()

Rinomino i campi per coerenza con gli altri dataset

In [None]:
# Lower-case the second word of the two identifier column names
# (per the notebook notes, this matches the naming of the other datasets)
renamed_columns = {
    "Tax Code Number": "Tax code number",
    "CCIAA Number": "CCIAA number",
}
export_dataset = export_dataset.rename(columns=renamed_columns)

Tengo traccia dello storico dei vari bilanci della medesima impresa in un unico record

In [None]:
# One output row per company: the records sharing the same identifiers are
# numbered 1, 2, 3, ... in row order, and their values are spread over
# "<column>_<n>" columns.
# NOTE(review): the numbering follows the current row order; confirm that the
# input is sorted the way the history is meant to be read.
company_keys = ["Tax code number", "CCIAA number", "Legal Status"]
record_number = export_dataset.groupby(company_keys).cumcount() + 1

# Long -> wide: the record number becomes the innermost column level
wide_dataset = export_dataset.set_index(company_keys + [record_number]).unstack()
# Order the columns by record number first
wide_dataset = wide_dataset.sort_index(level=1, axis=1)

# Flatten each (column, record number) pair into a single "<column>_<n>" name
wide_dataset.columns = wide_dataset.columns.map('{0[0]}_{0[1]}'.format)
export_dataset = wide_dataset.reset_index()

# Missing values per column
export_dataset.isna().sum()

In [None]:
# Print the dataset dimensions as (rows, columns)
print(export_dataset.shape)

Gestisco i valori mancanti in base alla strategia scelta (sostituzione oppure rimozione dei record)

In [None]:
# Missing-value strategy, selected by replace_na_value
if replace_na_value:
    # Keep every record and every year, writing 0.0 wherever a value is missing
    export_dataset.fillna(0.0, inplace=True)
else:
    # Years 2..keep_n_year: remove the records that miss any value of that year
    for kept_year in range(2, keep_n_year + 1):
        kept_cols = export_dataset.filter(regex=f"_{kept_year}$", axis=1).columns
        export_dataset.dropna(subset=kept_cols, inplace=True)
    # Remaining years up to 4: remove their columns, if available
    for dropped_year in range(max(2, keep_n_year + 1), 5):
        dropped_cols = export_dataset.filter(regex=f"_{dropped_year}$", axis=1).columns
        export_dataset.drop(columns=dropped_cols, axis=1, inplace=True)

In [None]:
# Count the missing values still present in each column
export_dataset.isna().sum()

Analizzo il dataset completo

In [None]:
# Display floats with 4 decimal places (the option is global, so it also affects later outputs)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)
# Summary statistics, transposed so that each row describes one column
export_dataset.describe().T

Esporto in csv e pickle

In [None]:
if to_export:
    # Size tag: "small" or "big" when both input paths contain it, "mixed" otherwise
    input_paths = (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
    if all("small" in path for path in input_paths):
        size_tag = "_small"
    elif all("big" in path for path in input_paths):
        size_tag = "_big"
    else:
        size_tag = "_mixed"

    # The number of kept years is appended only when missing values were not replaced
    year_tag = "" if replace_na_value else str(keep_n_year)
    dataset_name = "complete_active_bankruptcy" + size_tag + "_history" + year_tag

    # Same base name for both output formats
    export_dataset.to_csv(f"{OUTPUT_PATH}/{dataset_name}.csv")
    export_dataset.to_pickle(f"{OUTPUT_PATH}/{dataset_name}.pkl")
    print("Dataset", dataset_name, "esportato")

Rimuovo gli outlier

In [None]:
# Remove outliers with the IQR rule, applied to every numeric feature.
# A record is kept only if ALL of its numeric features fall inside their own
# [lower bound, upper bound] range; the bounds of each feature are computed on
# the full export_dataset.

# Identifier columns are never treated as features, even when stored as numbers
identifier_columns = ["Tax code number", "CCIAA number", "Legal Status"]
feature_columns = [
    column
    for column in export_dataset.select_dtypes(include=np.number).columns
    if column not in identifier_columns
]

# Start by keeping every record; each feature can only switch records off
records_to_keep = np.ones(len(export_dataset), dtype=bool)
for index_col in feature_columns:
    feature_values = export_dataset[index_col]
    # Lower and upper quantile (first and third quartile when quantile_amount is 0.25)
    Q1 = feature_values.quantile(quantile_amount)
    Q3 = feature_values.quantile(1 - quantile_amount)
    # Spread between the two quantiles
    IQR = Q3 - Q1
    # Lower bound below Q1, upper bound above Q3
    MIN = Q1 - 1.5 * IQR
    MAX = Q3 + 1.5 * IQR
    # Comparisons with NaN are False, so records missing this feature are discarded too
    inside_bounds = (feature_values >= MIN) & (feature_values <= MAX)
    records_to_keep &= inside_bounds.to_numpy()

# With no numeric feature at all, every record is kept
filtered_dataset = export_dataset[records_to_keep]

Analizzo il dataset con outlier rimossi

In [None]:
# Summary statistics of the filtered dataset, transposed so that each row describes one column
filtered_dataset.describe().T

Esporto in csv e pickle

In [None]:
if to_export:
    # Same name as the complete export, with "complete" replaced by "filtered"
    filtered_dataset_name = dataset_name.replace("complete", "filtered")

    # Share of records removed by the outlier filter, as a rounded percentage
    total_records = export_dataset.shape[0]
    if total_records:
        discarded_percentage = round(100 - filtered_dataset.shape[0] / total_records * 100)
    else:
        # Nothing can be discarded from an empty dataset; avoids a ZeroDivisionError
        discarded_percentage = 0

    # The discarded percentage is part of the file name
    filtered_file_name = f"{filtered_dataset_name}_{discarded_percentage}"
    filtered_dataset.to_csv(f"{OUTPUT_PATH}/{filtered_file_name}.csv")
    filtered_dataset.to_pickle(f"{OUTPUT_PATH}/{filtered_file_name}.pkl")
    # Report the name actually used for the files, suffix included
    print("Dataset", filtered_file_name, "esportato")