# Complete dataset (active + bankrupt) focused on raw financial values and their history

Librerie varie da installare

In [None]:
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np

Variabili di configurazione: percorsi dei file e parametri di elaborazione

In [None]:
# Input files (pickled DataFrames), can be changed.
# NOTE(review): these are absolute, user-specific Windows paths; they must be
# edited before the notebook can run on another machine.
PATH_ACTIVE_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\active_raw_history.pkl"

PATH_BANKRUPT_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\bankruptcy_raw_history.pkl"

# True = write the complete and the filtered datasets (CSV + pickle) into OUTPUT_PATH
to_export = True

# Strength of the quantile-based outlier filtering: the bounds of each numeric
# column are derived from its quantile_amount and (1 - quantile_amount) quantiles.
# Recommended between 0.05 and 0.25
quantile_amount = 0.05

# True  = replace every missing value with 0.0 and keep all the year columns
# False = keep only the first keep_n_year years and remove the records that
#         have missing values in the years from 2 to keep_n_year
replace_na_value = False

# Number of years to be kept (between 2 and 4); only used when replace_na_value is False
keep_n_year = 4

# Directory where the output files are written, can be changed
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load the active and the bankrupt datasets from the configured pickle files.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# come from a trusted source.
active_dataset = pd.read_pickle(PATH_ACTIVE_DATASET)
bankrupt_dataset = pd.read_pickle(PATH_BANKRUPT_DATASET)

Unisco i 2 dataset

In [None]:
# Stack the active and the bankrupt records into one frame with a fresh
# 0..n-1 index. ignore_index=True discards the source indexes directly, so no
# helper "index" column has to be created and dropped afterwards (that pair of
# calls failed with a KeyError whenever the source index had a name).
complete_dataset = pd.concat([active_dataset, bankrupt_dataset], ignore_index=True)

# Remove excessive descriptive columns
DESCRIPTIVE_COLUMNS = [
    "Ragione sociale",
    "Province",
    "Vat number",
    "Accounting closing date",
    "Company Size",
    "Legal Form",
]
complete_dataset = complete_dataset.drop(columns=DESCRIPTIVE_COLUMNS)

Tengo traccia dello storico dei vari bilanci per la medesima impresa in unico record

In [None]:
# Columns used as the grouping key: rows sharing these values are merged into one record
company_keys = ["Tax code number", "CCIAA number", "Legal Status"]

# 1-based ordinal of each row within its key group, following the current row order
balance_sheet_number = complete_dataset.groupby(company_keys).cumcount() + 1

# Long -> wide: one row per key group, one (feature, ordinal) column per value
wide_history = complete_dataset.set_index(company_keys + [balance_sheet_number]).unstack()
# Order the columns by the ordinal level so that each ordinal's features sit together
wide_history = wide_history.sort_index(level=1, axis=1)
# Flatten the two-level column labels to "<feature>_<ordinal>"
wide_history.columns = wide_history.columns.map('{0[0]}_{0[1]}'.format)

# Bring the key columns back as regular columns
complete_dataset = wide_history.reset_index()
# Count of missing values per column (displayed as the cell output)
complete_dataset.isna().sum()

In [None]:
# Show the (rows, columns) size of the dataset at this point of the notebook
print(complete_dataset.shape)

Gestisco i valori mancanti in base alla configurazione: li sostituisco con 0 oppure rimuovo i record incompleti e le colonne degli anni in eccesso

In [None]:
def _year_columns(frame, year):
    """Return the labels of the columns of `frame` whose name ends with "_<year>"."""
    return frame.filter(regex=("_"+str(year)+"$"), axis=1).columns


if replace_na_value:
    # Keep every record and every year: missing values simply become 0.0
    complete_dataset.fillna(0.0, inplace=True)
else:
    # Highest year suffix handled by this cell
    last_history_year = 4

    # Years 2..keep_n_year must be fully populated: drop the records that
    # miss any value in one of those years
    for year in range(2, keep_n_year + 1):
        complete_dataset.dropna(subset=_year_columns(complete_dataset, year), inplace=True)

    # The remaining years (up to the 4th) are discarded entirely, if available
    for year in range(max(2, keep_n_year + 1), last_history_year + 1):
        complete_dataset.drop(columns=_year_columns(complete_dataset, year), inplace=True)

Analizzo il dataset completo

In [None]:
def _four_decimals(value):
    """Format a float with four digits after the decimal point."""
    return '%0.4f' % value


# Use the formatter for every float that pandas displays from now on
pd.set_option('display.float_format', _four_decimals)
# Summary statistics, transposed so that each feature is shown as a row
complete_dataset.describe().T

Esporto il dataset completo in csv e pickle

In [None]:
if to_export:
    # When missing values are not replaced, the number of kept years is
    # appended to the file name
    year_suffix = "" if replace_na_value else str(keep_n_year)
    dataset_name = "complete_active_bankruptcy_raw_history" + year_suffix

    # Common prefix of both output files (same directory, same base name)
    complete_export_base = OUTPUT_PATH + "/" + dataset_name
    complete_dataset.to_csv(complete_export_base + ".csv")
    complete_dataset.to_pickle(complete_export_base + ".pkl")
    print("Dataset", dataset_name, "esportato")

Rimuovo possibili outliers

In [None]:
def remove_outliers(dataset, tail_quantile, whisker=1.5):
    """Return the rows of `dataset` whose numeric values all lie within bounds.

    For each numeric column the accepted range is
        [q_low - whisker * spread, q_high + whisker * spread]
    where q_low and q_high are the `tail_quantile` and `1 - tail_quantile`
    quantiles of the column and spread = q_high - q_low. The bounds are always
    computed on the unfiltered `dataset`, so the result does not depend on the
    order of the columns.

    Parameters:
        dataset: DataFrame to filter; it is not modified.
        tail_quantile: quantile used for the lower tail (the upper tail uses
            1 - tail_quantile).
        whisker: multiplier of the inter-quantile spread added on each side.

    Returns:
        A new DataFrame with only the rows that are inside the range for every
        numeric column (bounds included). Non-numeric columns are ignored.
        A missing value never satisfies the range test, so rows with a missing
        value in a numeric column are discarded.
    """
    # Positional mask: True while the row has passed every column checked so far
    keep_row = np.ones(len(dataset), dtype=bool)

    for column_name in dataset.columns:
        column = dataset[column_name]
        # Only numeric columns
        if not np.issubdtype(column.dtype, np.number):
            continue

        lower_quantile = column.quantile(tail_quantile)
        upper_quantile = column.quantile(1 - tail_quantile)
        spread = upper_quantile - lower_quantile

        lower_bound = lower_quantile - whisker * spread
        # The upper bound starts from the UPPER quantile (it used to start
        # from the lower one, which made the accepted range far too narrow)
        upper_bound = upper_quantile + whisker * spread

        # Accumulate the per-column test instead of overwriting the result,
        # so that every numeric column contributes to the final filter
        in_range = (column >= lower_bound) & (column <= upper_bound)
        keep_row &= in_range.to_numpy()

    return dataset[keep_row]


filtered_dataset = remove_outliers(complete_dataset, quantile_amount)

Esporto il dataset filtrato in csv e pickle

In [None]:
if to_export:
    # Same base name as the complete export, with "complete" swapped for "filtered"
    filtered_dataset_name = dataset_name.replace("complete", "filtered")

    # Rounded percentage of records removed by the outlier filter; it becomes
    # part of the output file names
    kept_fraction = filtered_dataset.shape[0] / complete_dataset.shape[0]
    discarded_percentage = round(100 - kept_fraction * 100)

    # Common prefix of both output files
    filtered_export_base = OUTPUT_PATH + "/" + filtered_dataset_name + "_" + str(discarded_percentage)
    filtered_dataset.to_csv(filtered_export_base + ".csv")
    filtered_dataset.to_pickle(filtered_export_base + ".pkl")