# Complete dataset (active + bankrupt) estimators

Librerie varie da installare

In [None]:
# Optional setup: uncomment the lines below to install the plotting and
# data-handling packages used by this notebook.
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Variabili di gestione files

In [None]:
# ---- Export switch ---------------------------------------------------------
# When True, the export cells write the summary files into OUTPUT_PATH
to_export = False

# ---- Input files (edit to match your machine) ------------------------------
# Pickle read into `active_dataset`
PATH_ACTIVE_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_filtrato\indexes_active.pkl"
# Pickle read into `bankrupt_dataset`
PATH_BANKRUPT_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_filtrato\indexes_bankruptcy.pkl"

# ---- Output folder (edit to match your machine) ----------------------------
# Folder that receives the exported csv/pickle files
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_filtrato"

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load the two pickled frames in one pass: active companies first, bankrupt second.
# NOTE(review): unpickling can execute arbitrary code; only point these paths at trusted files.
active_dataset, bankrupt_dataset = (
    pd.read_pickle(pickle_path)
    for pickle_path in (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
)

Unisco i 2 dataset

In [None]:
# Stack active and bankrupt rows, then replace the row labels with a fresh
# 0..n-1 index. The previous labels are kept as a regular leading column, so
# later positional column selection must count that extra column.
complete_dataset = pd.concat([active_dataset, bankrupt_dataset]).reset_index()

Analizzo il dataset completo

In [None]:
def _format_float(value):
    """Render a float with four decimal places for DataFrame display."""
    return '%0.4f' % value


# Apply the formatter globally, then show the summary statistics with one row per column
pd.set_option('display.float_format', _format_float)
complete_dataset.describe().T

Prelevo solo i campi degli indici finanziari

In [None]:
# Positional offset from which columns are taken as the financial indexes
first_index_position = 41

# Work on an independent copy so later edits never touch complete_dataset
indexes_dataset = complete_dataset.iloc[:, first_index_position:].copy()

# Pairwise correlation between the selected columns
indexes_corr = indexes_dataset.corr()

In [None]:
sns.set_theme(style="white")

# Boolean mask that hides the upper triangle (diagonal included)
mask = np.triu(np.ones(indexes_corr.shape, dtype=bool))

# Canvas for the correlation matrix
fig, ax = plt.subplots(figsize=(15, 15))

# Diverging colormap so positive and negative correlations read differently
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated lower-triangle heatmap with square cells, drawn on the axes created above
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(indexes_corr, ax=ax, **heatmap_options)

Esporto il dataset completo

In [None]:
# Descriptive columns carried over from the complete dataset
str_cols = [
    'Ragione sociale',
    'Province',
    'Legal Form',
    'Legal Status',
    'Accounting closing date',
]

# Start from a copy of the financial indexes and attach each descriptive
# column, aligned on the shared row index
export_dataset = indexes_dataset.copy()
for str_col in str_cols:
    export_dataset[str_col] = complete_dataset[str_col]

In [None]:
# Reorder columns
export_dataset = export_dataset[['Ragione sociale',
                                 'Province',
                                 'Accounting closing date',
                                 'Legal Form',
                                 'Legal Status',
                                 'PN/Totale Debiti',
                                 'Deb. Prev + Trib/Attivo',
                                 'Tempo medio riscossione (TMR)',
                                 'Tempo medio di pagamento (TMP)',
                                 'PFN/EBITDA',
                                 'PFN/PN',
                                 'Gearing',
                                 'ROS',
                                 'Working capital/net sales',
                                 'Cash/Current Liabilities',
                                 'Accounts receivable/inventory',
                                 'EBIT/interest expenses',
                                 'Att.Br/Attivo',
                                 'Ricavi/Attivo',
                                 'EBITDA/Totale Debiti']]

In [None]:
# Summary statistics of the export frame, one row per column
export_dataset.describe().transpose()

Esporto in csv e pickle

In [None]:
# Write the complete summary as csv and pickle, only when exporting is enabled
if to_export:
    csv_target = OUTPUT_PATH + "/indexes_complete.csv"
    pickle_target = OUTPUT_PATH + "/indexes_complete.pkl"
    export_dataset.to_csv(csv_target)
    export_dataset.to_pickle(pickle_target)

Cerco outliers

In [None]:
# Define numeric features cols
indexes_cols = [ 'PN/Totale Debiti',
                 'Deb. Prev + Trib/Attivo',
                 'Tempo medio riscossione (TMR)',
                 'Tempo medio di pagamento (TMP)',
                 'PFN/EBITDA',
                 'PFN/PN',
                 'Gearing',
                 'ROS',
                 'Working capital/net sales',
                 'Cash/Current Liabilities',
                 'Accounts receivable/inventory',
                 'EBIT/interest expenses',
                 'Att.Br/Attivo',
                 'Ricavi/Attivo',
                 'EBITDA/Totale Debiti']

In [None]:
# One stacked 5-bin histogram per financial index, coloured by legal status
for index_col in indexes_cols:
    hist_fig, hist_ax = plt.subplots()
    sns.histplot(
        data=export_dataset,
        x=index_col,
        hue="Legal Status",
        bins=5,
        multiple="stack",
        ax=hist_ax,
    )

In [None]:
# One horizontal box plot per financial index, with a separate box for each legal status
legal_status = export_dataset["Legal Status"]
for index_col in indexes_cols:
    box_fig, box_ax = plt.subplots()
    sns.boxplot(x=export_dataset[index_col], y=legal_status, ax=box_ax)

Rimuovo outliers

In [None]:
def iqr_bounds(values, whisker=1.5):
    """Return the (lower, upper) Tukey fences of a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric observations; missing values are ignored by ``quantile``.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond each quartile.

    Returns
    -------
    tuple
        ``(Q1 - whisker * IQR, Q3 + whisker * IQR)`` with ``IQR = Q3 - Q1``.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # The upper fence is measured from Q3 (the previous code measured it from Q1,
    # which cut away a large part of the regular upper range).
    return q1 - whisker * iqr, q3 + whisker * iqr


# Fences are computed on the unfiltered data for every index, and a row is kept
# only when it lies inside the fences of ALL indexes. The previous loop rebuilt
# the result from export_dataset on every iteration, so only the filter of the
# last index survived in the dataset that is analysed and exported afterwards.
keep_rows = np.ones(len(export_dataset), dtype=bool)
for index_col in indexes_cols:
    lower, upper = iqr_bounds(export_dataset[index_col])
    # between() is inclusive on both ends; a missing value compares as False,
    # so rows with a NaN in any index column are dropped as well.
    keep_rows &= export_dataset[index_col].between(lower, upper).to_numpy()

filtered_export_dataset = export_dataset[keep_rows]

# Box plots of the filtered data, one figure per financial index
for index_col in indexes_cols:
    plt.figure()
    sns.boxplot(x=filtered_export_dataset[index_col], y=filtered_export_dataset["Legal Status"])

Analizzo il dataset con outlier rimossi

In [None]:
# Summary statistics after outlier removal, one row per column
filtered_export_dataset.describe().transpose()

In [None]:
# Correlate only the financial-index columns. The filtered frame still carries
# the descriptive text columns, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently dropped them).
indexes_corr = filtered_export_dataset[indexes_cols].corr()
sns.set_theme(style="white")

# Generate a mask for the upper triangle (diagonal included)
mask = np.triu(np.ones_like(indexes_corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap on the axes created above, with the mask and square cells
sns.heatmap(indexes_corr, mask=mask, cmap=cmap, vmax=1,
            center=0, annot=True, fmt='.2f',
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

Guardo scatterplot tra le diverse features (a.k.a. indici finanziari)

In [None]:
# Take a subsample of 1000 active companies and 1000 bankrupt companies
sample = filtered_export_dataset.groupby("Legal Status").apply(lambda x: x.sample(1000))

# Remove index in order to print the scatter plot
sample.reset_index(drop=True, inplace=True)

# Draw the scatter plot between features
sns.pairplot(sample, hue="Legal Status", corner=True, vars=indexes_cols)

Esporto in csv e pickle

In [None]:
# Write the outlier-filtered summary as csv and pickle, only when exporting is enabled
if to_export:
    filtered_csv_target = OUTPUT_PATH + "/filtered_indexes_complete.csv"
    filtered_pickle_target = OUTPUT_PATH + "/filtered_indexes_complete.pkl"
    filtered_export_dataset.to_csv(filtered_csv_target)
    filtered_export_dataset.to_pickle(filtered_pickle_target)