# Dataset summaries

L'obiettivo di questo notebook è confrontare diversi dataset riguardanti società attive o in fallimento.

Librerie da installare

In [None]:
# Optional one-time setup: uncomment the lines below to install the
# third-party packages imported by this notebook.
# NOTE(review): the export step calls DataFrame.to_excel on an .xlsx file,
# which presumably also needs an Excel writer engine (e.g. openpyxl) — confirm.
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns

Variabili di configurazione per la gestione dei file

In [None]:
# Folder scanned for the .pkl files of the different datasets
# (edit to match the local setup)
DATASETS_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

# Folder that receives the exported summary report
# (edit to match the local setup)
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_reports"

# Export switch: the summary file is written to OUTPUT_PATH only when True
to_export = False

In [None]:
# Every loaded dataset, stored as a (file name, DataFrame) pair
dataset_list = []

# Collect every .pkl file found directly inside DATASETS_PATH. The list is
# sorted so that the order of the datasets (and therefore of the summary rows
# and of the plots) does not depend on how the file system lists the files.
pkl_files = sorted(glob.glob(os.path.join(DATASETS_PATH, "*.pkl")))

for f in pkl_files:

    # Print the location and filename
    print('Location:', f)
    # os.path.basename honours the path separator of the running platform;
    # splitting on "\\" only isolated the file name on Windows
    filename = os.path.basename(f)
    print('File Name:', filename)

    # Load the dataset from the pkl file.
    # NOTE(security): unpickling can execute arbitrary code, so DATASETS_PATH
    # must only contain files coming from a trusted source.
    dataset = pd.read_pickle(f)

    # Append the (file name, dataset) pair to the list
    dataset_list.append((filename, dataset))

Creo un piccolo dataset per raggruppare le statistiche

In [None]:
# Column-wise accumulators: each list receives exactly one entry per dataset,
# in the same order as dataset_list
filenames = []
sizes = []
amounts_active = []
amounts_bankruptcy = []
amounts_societa_capitali = []
amounts_societa_persone = []
amounts_societa_altro = []
amounts_dimensione_grandi = []
amounts_dimensione_medie = []
amounts_dimensione_piccole = []
amounts_number_features = []

# (column to inspect, value to look for, list that stores the resulting count)
tracked_categories = [
    ("Legal Status", "Active", amounts_active),
    ("Legal Status", "Bankruptcy", amounts_bankruptcy),
    ("Legal Form", "Società di capitali", amounts_societa_capitali),
    ("Legal Form", "Società di persone", amounts_societa_persone),
    ("Legal Form", "Altro", amounts_societa_altro),
    ("Company Size", "Grandi", amounts_dimensione_grandi),
    ("Company Size", "Medie", amounts_dimensione_medie),
    ("Company Size", "Piccole", amounts_dimensione_piccole),
]

# Walk through the (file name, DataFrame) pairs and record their statistics
for dataset_couple in dataset_list:
    filename, dataset = dataset_couple

    # Count the rows matching each tracked value before storing anything, so
    # that a failing lookup leaves every accumulator untouched for this dataset
    category_counts = [
        len(dataset[dataset[column] == value])
        for column, value, _ in tracked_categories
    ]

    # File name and number of records
    filenames.append(filename)
    sizes.append(dataset.shape[0])

    # One count per tracked category, each stored in its own list
    for (_, _, target_list), count in zip(tracked_categories, category_counts):
        target_list.append(count)

    # Number of features = number of columns minus one
    # NOTE(review): presumably the subtracted column is a non-feature one
    # (e.g. the label) — confirm
    amounts_number_features.append(dataset.shape[1] - 1)

In [None]:
# Summary table: one row per loaded dataset file
dataset = pd.DataFrame()

# General information about each dataset
general_columns = {
    "Filenames": filenames,
    "Sizes": sizes,
    "Number of features": amounts_number_features,
}
for column_name, values in general_columns.items():
    dataset[column_name] = values

# Each group contributes its "Amounts ..." columns first and then the matching
# "Ratios ..." columns (amount / number of records, rendered as a percentage
# string with two decimals)
count_groups = [
    # Legal status
    {"Active": amounts_active, "Bankruptcy": amounts_bankruptcy},
    # Legal form
    {
        "Società di capitali": amounts_societa_capitali,
        "Società di persone": amounts_societa_persone,
        "Altro": amounts_societa_altro,
    },
    # Company size
    {
        "Big": amounts_dimensione_grandi,
        "Medium": amounts_dimensione_medie,
        "Small": amounts_dimensione_piccole,
    },
]

for group in count_groups:
    for label, counts in group.items():
        dataset["Amounts " + label] = counts
    for label in group:
        share = dataset["Amounts " + label] / dataset["Sizes"]
        dataset["Ratios " + label] = share.apply("{:.2%}".format)

# Show (at most) the first ten rows of the summary
dataset.head(10)

Plot delle diverse dimensioni

In [None]:
# Horizontal bar chart: one bar per dataset file, bar length = 'Sizes' column
fig, ax = plt.subplots(figsize=(10, 5))
# Render the x-axis tick labels with the '%i' (integer) format
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
# Draw explicitly on the axes created above
sns.barplot(data=dataset, x='Sizes', y='Filenames', ax=ax)
ax.set_title("Dataset sizes comparison")

In [None]:
# Horizontal bar chart: one bar per dataset file, bar length = 'Number of features' column
fig, ax = plt.subplots(figsize=(10, 5))
# Render the x-axis tick labels with the '%i' (integer) format
ax.xaxis.set_major_formatter(FormatStrFormatter('%i'))
# Draw explicitly on the axes created above
sns.barplot(data=dataset, x='Number of features', y='Filenames', ax=ax)
ax.set_title("Dataset number of features comparison")

Esporto il riepilogo come file Excel (solo se `to_export` è `True`)

In [None]:
# Write the summary table to an Excel report, only when exporting is enabled
if to_export:
    # Create the output folder when it is missing, so the export does not fail
    # just because the directory was never created
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # os.path.join inserts the path separator of the running platform instead
    # of a hard-coded "/"
    dataset.to_excel(os.path.join(OUTPUT_PATH, "datasets_summaries.xlsx"))