# Bankrupt dataset ETL focused on ALL financial raw values

Librerie varie da installare

In [None]:
#!pip install pandas

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import os
from os.path import exists
import glob
import numpy as np
import re

Variabili di gestione files

In [None]:
# Root folder shared by the input and output locations, can be changed
_BASE_DIR = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio"

# Directory containing the bankruptcy companies .csv files
PATH_BANKRUPT = _BASE_DIR + r"\Dataset\Bankruptcy_rielaborati_G"

# Directory where the summary files are written
OUTPUT_PATH = _BASE_DIR + r"\Dataset_output"

# True = export summary file in the OUTPUT_PATH
to_export = True

Leggo i file .csv contenuti in una directory e nelle sue sottodirectory (BANKRUPT)

In [None]:
# Path of each csv file, searched recursively below PATH_BANKRUPT
# Path description: Bankruptcy_rielaborati_G/**/*.csv
csv_files = glob.glob(os.path.join(PATH_BANKRUPT, "**/*.csv"), recursive=True)

# One DataFrame per file. They are concatenated once after the loop, because
# calling pd.concat inside the loop copies every row read so far on each pass.
partial_datasets = []

# Loop over the list of csv files
for f in csv_files:

    # Print the location and filename
    print('Location:', f)
    # os.path.basename works with the separator of the current platform,
    # unlike splitting on a hard-coded backslash
    print('File Name:', os.path.basename(f))

    # Read each csv file; dtype=object keeps every cell as read from the file.
    # NOTE(review): with dtype=object the thousands/decimal options presumably
    # have no effect on the stored values — confirm before relying on them.
    partial_dataset = pd.read_csv(f, sep=";", dtype=object, thousands=",", decimal=".")

    # Collect the partial dataset
    partial_datasets.append(partial_dataset)

# Dataset with all the csv files.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no csv file was found.
input_dataset = pd.concat(partial_datasets) if partial_datasets else pd.DataFrame()

print("Dimensione dataset delle società in bancarotta: ", input_dataset.shape)

Gestisco i nomi delle colonne in modo consistente

In [None]:
# The patterns are raw strings so that "\d" reaches the regex engine as written
# (inside a normal string literal it is an invalid escape sequence).
_UNIT_YEAR_PATTERN = re.compile(
    r"\r\nEUR\r\nYear - \d|\r\nYear - \d|\nEUR\nYear - \d|Year - \d|EUR"
)
_LINE_BREAK_PATTERN = re.compile(r"\r\n|\r|\n")
# The dot is escaped so that only a literal "avail." is matched
_LAST_YEAR_PATTERN = re.compile(r" Last avail\. yr")


def _clean_column_name(name):
    """Return *name* with the unit/year markers removed and line breaks turned into spaces."""
    name = _UNIT_YEAR_PATTERN.sub("", name)
    name = _LINE_BREAK_PATTERN.sub(" ", name)
    return _LAST_YEAR_PATTERN.sub("", name)


# Rename every column in a single pass instead of one DataFrame rename per column
input_dataset.rename(columns=_clean_column_name, inplace=True)

In [None]:
# Combine all columns with the same name.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the frame is transposed,
# grouped by its row labels (the former column names), summed and transposed back.
# NOTE(review): the cells are presumably still text at this point (the files are read
# with dtype=object), so summing two filled same-named cells on one row would
# concatenate them — confirm that same-named columns never overlap on a row.
input_dataset = input_dataset.T.groupby(level=0).sum().T

Rimuovo colonne inutilizzabili o addirittura controproducenti per la creazione di futuri modelli di ML

In [None]:
# Descriptive columns
_descriptive_columns = [
    "Tax code number",
    "CCIAA number",
    "VAT number",
    "ATECO 2007 code",
    "ATECO 2007 description",
    "Employees ",
    "Number of employees",
]

# Summary columns
_summary_columns = [
    "TOTAL ASSETS",
    "TOTAL RECEIVABLES",
    "TOTAL PROVISIONS FOR RISKS AND CHARGES",
    "PROFIT (LOSS) THIRD PARTIES",
    "PROFIT (LOSS) GROUP",
]

# Garbage columns and columns not available in the active dataset
_unavailable_columns = [
    "Unnamed: 0",
    "(Depreciation provision)",
    "Due from comp. under parent companies control",
    "Extraordinary charges",
    "Extraordinary revenues",
    "of which capital gains",
    "of which capital losses",
    "of which taxes previous period",
    "Total depreciation, amortization and writedowns",
    "TOTAL receivables due from shareholders",
    "MINORITY INTERESTS SHAREHOLDERS' FUNDS",
    "TOTAL EXTRAORDINARY REVENUES AND CHARGES",
    "Total personnel costs",
]

# Full list of the columns to remove, in the order of the groups above
removed_columns = _descriptive_columns + _summary_columns + _unavailable_columns

# Remove the listed columns; a KeyError is raised if any of them is missing
input_dataset = input_dataset.drop(columns=removed_columns)

Tutte le colonne in minuscolo, tranne qualche eccezione per essere consistente con gli altri dataset

In [None]:
# Column names that keep their capitalised spelling
columns_map = {
    "legal status": "Legal Status",
    "legal form": "Legal Form",
    "company size": "Company Size",
}

# Lowercase every column name first, then restore the capitalised exceptions
input_dataset = input_dataset.rename(columns=str.lower).rename(columns=columns_map)

Correggo refusi e incongruenze nei nomi di alcune colonne

In [None]:
# Old column name -> replacement column name
columns_map = {
    "due to shar. for loans": "due to shareholders for loans",
    "employees": "number of employees",
    "negative reserves for own shares (+/-)": "negative reserves for own shares ",
    "other companies": "others company",
    "total liabilities and shareholders' funds": "total liabilities and shareholders funds",
    "total shareholders' funds": "total shareholders funds",
    "total financial income and charges": "total financial income and charge",
    "company name": "ragione sociale",
    # Same name as the key, minus the trailing space
    "from fin. receiv. (of which: from subsidiaries, associated, parent cies and cies under parent cies control) ":
        "from fin. receiv. (of which: from subsidiaries, associated, parent cies and cies under parent cies control)",
    "minority interests shareholders' funds": "minority interests shareholders funds",
}

# Columns that are not keys of the map keep their current name
input_dataset = input_dataset.rename(columns=columns_map)

Tutti i valori non disponibili o errati sono rimpiazzati con NaN

In [None]:
# Placeholder strings that stand for a missing or invalid value
_MISSING_VALUE_TOKENS = ("n.a.", "n.d.", "n.s.", "nan", "#VALUE!", "UDATACHI")

# Dictionary with the wrong values, each mapped to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
replace_values = {token: np.nan for token in _MISSING_VALUE_TOKENS}

# Swap every placeholder listed in replace_values for NaN
input_dataset = input_dataset.replace(replace_values)

# Number of missing values per column (a bare expression: only displayed when
# it is the last statement of a notebook cell)
input_dataset.isna().sum()

Rimuovo NaN

In [None]:
# Keep only the rows without any NaN (dropna defaults: rows, how="any")
input_dataset = input_dataset.dropna()

Cast delle colonne del dataset a tipi numerici e descrittivi (stringhe)

In [None]:
# Columns that hold descriptive text; every other column is converted to a number
str_cols = [
    'ragione sociale',
    'province',
    'Legal Form',
    'Legal Status',
    'accounting closing date',
]

for column_name in input_dataset.columns:
    column_values = input_dataset[column_name]
    if column_name not in str_cols:
        # pd.to_numeric raises if a value cannot be parsed as a number
        input_dataset[column_name] = pd.to_numeric(column_values)
        continue
    input_dataset[column_name] = column_values.astype(str)

Imposto la colonna Legal Status a un unico valore (Bankruptcy)

In [None]:
# Overwrite the whole "Legal Status" column with the constant "Bankruptcy"
input_dataset["Legal Status"] = "Bankruptcy"

Sostituisco i valori del campo “Legal Form” con “Società di capitali”, “Società di persone” o “Altro”

In [None]:
# Legal forms classified as "Società di capitali"
societa_capitali_values = ["S.R.L.",
                           "Limited liability company - SRL",
                           "One-person company with limited liability - SRL",
                           "S.R.L. a socio unico",
                           "S.R.L. semplificata",
                           "S.C.A.R.L.",
                           "Cooperative company with limited liability – SCARL",
                           "Limited liability company, simplified",
                           "Società consortile a responsabilita' limitata",
                           "S.P.A.",
                           "S.C.A.R.L.P.A.",
                           "Joint stock company - SPA",
                           "Cooperative company with limited liability by shares - SCARLPA",
                           "S.P.A. a socio unico",
                           "One-person joint stock company - SPA",
                           "Società consortile per azioni",
                           "Consortium by shares",
                           "PICC. S.C.A.R.L.",
                           "Cooperative company with limited liability, small - SCARL",
                           "Limited liability consortium cooperative company",
                           "Società consortile cooperativa a responsabilità limitata",
                           "S.A.P.A.",
                           "S.R.L. a capitale ridotto",
                           "Public limited company - SA",
                           "Limited partnership with shares - SAPA",
                           "S.C.A.R.I.",
                           "Consortium",
                           "Consorzio",
                           "Società di capitali"]

# Dictionary società di capitali: every listed legal form -> "Società di capitali"
dict_capitali = dict.fromkeys(societa_capitali_values, "Società di capitali")

# Legal forms classified as "Società di persone"
societa_persone_values = ["Limited partnership - SAS",
                          "S.N.C.",
                          "Società semplice",
                          "General partnership - SNC",
                          "Partnership - SS",
                          "S.A.S.",
                          "Cooperativa sociale",
                          "Social cooperative company",
                          "Società cooperativa consortile",
                          "Consortium of cooperatives",
                          "Small cooperative company",
                          "Società di persone"]

# Dictionary società di persone: every listed legal form -> "Società di persone"
dict_persone = dict.fromkeys(societa_persone_values, "Società di persone")

# General dictionary. It is deliberately NOT named `dict`: rebinding that name
# hides the builtin type, so any later `dict(...)` call would fail.
legal_form_map = {**dict_capitali, **dict_persone}

# Apply the dictionary; legal forms that are not among its keys become NaN
# after map() and are then labelled "Altro"
input_dataset["Legal Form"] = input_dataset["Legal Form"].map(legal_form_map).fillna("Altro").astype(str)

Aggiungo colonna denominata “Company Size” per distinguere tra “Grandi”, “Medie” e “Piccole” imprese

In [None]:
# Size labels ordered from largest to smallest; this order also decides ties
_SIZE_LABELS = ("Grandi", "Medie", "Piccole")


def _size_tier(value, large_from, medium_from):
    """Return the size label of one indicator given its two lower bounds."""
    if value >= large_from:
        return "Grandi"
    if value >= medium_from:
        return "Medie"
    # Also reached when both comparisons are False (e.g. the value is NaN)
    return "Piccole"


def label_size(row):
    """Return the company size label ("Grandi", "Medie" or "Piccole") for *row*.

    *row* is indexed by "revenues from sales and services" and
    "number of employees". Each of the two indicators votes for one label:
    revenues with bounds 50000000 / 10000000, employees with bounds 250 / 50.
    The label with the most votes is returned; when the two votes differ,
    the larger of the two sizes is returned.
    """
    votes = (
        _size_tier(row["revenues from sales and services"], 50000000, 10000000),
        _size_tier(row["number of employees"], 250, 50),
    )
    # max() returns the first label with the highest vote count, and the
    # labels are scanned from largest to smallest
    return max(_SIZE_LABELS, key=votes.count)

In [None]:
# Compute the size label row by row (axis=1 passes each row to label_size)
input_dataset["Company Size"] = input_dataset.apply(label_size, axis=1)

Riordino colonne come nel dataset delle società in attivo

In [None]:
# Pickle of the active-companies dataset, used only as a reference for the column order
active_pickle_path = OUTPUT_PATH + "/active_raw_full.pkl"

# The reordering is skipped when the reference file does not exist
if exists(active_pickle_path):
    active_dataset = pd.read_pickle(active_pickle_path)
    columns_order = active_dataset.columns.to_list()
    # Selecting by the reference list raises KeyError if a column is missing here
    input_dataset = input_dataset[columns_order]

Esporto in csv e pickle

In [None]:
# Write the dataset next to the other outputs, once as csv and once as pickle
if to_export:
    export_stem = OUTPUT_PATH + "/bankruptcy_raw_full"
    input_dataset.to_csv(export_stem + ".csv")
    input_dataset.to_pickle(export_stem + ".pkl")