# Complete dataset (active + bankrupt) estimators

Librerie varie da installare

In [None]:
# Uncomment the lines below to install the third-party packages this notebook imports.
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Variabili di gestione files

In [None]:
# --- Input files (edit these to point at your own copies) ---
# Pickled dataset of the active companies
PATH_ACTIVE_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\active_out_big.pkl"
# Pickled dataset of the bankrupt companies
PATH_BANKRUPT_DATASET = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output\bankruptcy_out_big.pkl"

# --- Output location ---
# Directory the export cells write their csv / pickle files into
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

# --- Switches ---
# True = the export cells write their files into OUTPUT_PATH
to_export = False
# True = draw the scatter plot between the raw columns (takes time!)
raw_plot = False

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load the two pickled DataFrames in one pass: active companies first, bankrupt ones second.
# NOTE: unpickling can execute arbitrary code, so only point the paths at trusted files.
active_dataset, bankrupt_dataset = (
    pd.read_pickle(pickle_path)
    for pickle_path in (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
)

Unisco i 2 dataset

In [None]:
# Stack the active and bankrupt rows and renumber them 0..n-1.
# reset_index() keeps the previous index as a new leading column, so the
# positional column slices used later in the notebook are unaffected.
complete_dataset = pd.concat([active_dataset, bankrupt_dataset]).reset_index()

Analizzo il dataset completo

In [None]:
def _four_decimals(value):
    """Render a float with exactly four digits after the decimal point."""
    return '%0.4f' % value


# Applies to every DataFrame displayed from here on
pd.set_option('display.float_format', _four_decimals)

# Summary statistics, one row per numeric column
complete_dataset.describe().T

Analizzo le colonne grezze (ossia le colonne che non sono indicatori finanziari calcolati a posteriori)

In [None]:
if raw_plot:
    # Upper bound on the rows drawn per legal status, and a fixed seed so the
    # figure is the same on every run
    RAW_SAMPLE_SIZE = 1000
    RAW_SAMPLE_SEED = 42

    # Take only raw columns with the legal status
    raw_dataset = complete_dataset.iloc[:, 6:42].copy()

    # Take a subsample of up to RAW_SAMPLE_SIZE companies per legal status.
    # min() keeps a status with fewer rows from raising in sample(); iterating the
    # groups (instead of groupby.apply) keeps the "Legal Status" column in the
    # result, and ignore_index=True gives the flat 0..n-1 index the plot needs.
    sample = pd.concat(
        [
            status_rows.sample(n=min(len(status_rows), RAW_SAMPLE_SIZE), random_state=RAW_SAMPLE_SEED)
            for _, status_rows in raw_dataset.groupby("Legal Status")
        ],
        ignore_index=True,
    )

    # Draw the scatter plot between features
    sns.pairplot(sample, hue="Legal Status", corner=True)

Prelevo solo i campi degli indici finanziari

In [None]:
# Column position at which the financial-index columns start
FIRST_INDEX_POSITION = 42

# Independent copy of the index columns only
indexes_dataset = complete_dataset.iloc[:, FIRST_INDEX_POSITION:].copy()

# Pairwise correlation matrix between the indexes
indexes_corr = indexes_dataset.corr()

In [None]:
sns.set_theme(style="white")

# Boolean mask hiding the upper triangle (diagonal included): the correlation
# matrix is symmetric, so that half only repeats the lower one
mask = np.triu(np.ones(indexes_corr.shape, dtype=bool))

# Square canvas for the matrix
f, ax = plt.subplots(figsize=(15, 15))

# Custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Annotated heatmap of the lower triangle, colours centred on zero
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    vmax=1,
    center=0,
    annot=True,
    fmt='.2f',
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)
sns.heatmap(indexes_corr, **heatmap_options)

Esporto il dataset completo

In [None]:
# Descriptive columns carried over next to the financial indexes
str_cols = [
    'Ragione sociale',
    'Province',
    'Legal Form',
    'Legal Status',
    'Accounting closing date',
    'Company Size',
]

# New frame: the index columns plus the descriptive columns appended on the
# right, matched to the rows through the shared index
export_dataset = indexes_dataset.assign(
    **{col: complete_dataset[col] for col in str_cols}
)

In [None]:
# Final column layout: the descriptive columns first, then the financial indexes
leading_cols = [
    'Ragione sociale',
    'Province',
    'Accounting closing date',
    'Legal Form',
    'Legal Status',
    'Company Size',
]
index_order = [
    'PN/Totale Debiti',
    'Deb. Prev + Trib/Attivo',
    'Tempo medio riscossione (TMR)',
    'Tempo medio di pagamento (TMP)',
    'PFN/EBITDA',
    'PFN/PN',
    'Gearing',
    'ROS',
    'Working capital/net sales',
    'Cash/Current Liabilities',
    'Accounts receivable/inventory',
    'EBIT/interest expenses',
    'Att.Br/Attivo',
    'Ricavi/Attivo',
    'EBITDA/Totale Debiti',
]
export_dataset = export_dataset[leading_cols + index_order]

In [None]:
# Summary statistics of the export frame, one row per numeric column
export_dataset.describe().transpose()

Esporto in csv e pickle

In [None]:
if to_export:
    # The file name suffix records which dataset size both input paths mention;
    # anything else (including a small/big mix) is labelled "mixed"
    input_paths = (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
    if all("small" in input_path for input_path in input_paths):
        export_target = OUTPUT_PATH + "/complete_active_bankruptcy_small"
    elif all("big" in input_path for input_path in input_paths):
        export_target = OUTPUT_PATH + "/complete_active_bankruptcy_big"
    else:
        export_target = OUTPUT_PATH + "/complete_active_bankruptcy_mixed"

    # Same frame written twice: csv first, then pickle
    export_dataset.to_csv(export_target + ".csv")
    export_dataset.to_pickle(export_target + ".pkl")

Cerco outliers

In [None]:
# Names of the fifteen numeric feature columns (the financial indexes)
indexes_cols = [
    "PN/Totale Debiti",
    "Deb. Prev + Trib/Attivo",
    "Tempo medio riscossione (TMR)",
    "Tempo medio di pagamento (TMP)",
    "PFN/EBITDA",
    "PFN/PN",
    "Gearing",
    "ROS",
    "Working capital/net sales",
    "Cash/Current Liabilities",
    "Accounts receivable/inventory",
    "EBIT/interest expenses",
    "Att.Br/Attivo",
    "Ricavi/Attivo",
    "EBITDA/Totale Debiti",
]

In [None]:
# 5 x 3 grid: one histogram per financial index
fig, axes = plt.subplots(5, 3, figsize=(20, 25))

# axes.flat walks the grid row by row, so the panels fill left-to-right, top-to-bottom
for index_col, panel in zip(indexes_cols, axes.flat):
    # Stacked 10-bin histogram of the index, coloured by legal status
    sns.histplot(export_dataset, hue="Legal Status", x=index_col, bins=10, multiple="stack", ax=panel)

In [None]:
# 5 x 3 grid sharing the y axis: one boxplot per financial index
fig, axes = plt.subplots(5, 3, figsize=(20, 25), sharey=True)

# ravel() lists the panels row by row, so they fill left-to-right, top-to-bottom
for index_col, panel in zip(indexes_cols, axes.ravel()):
    # Distribution of the index, one box per legal status
    sns.boxplot(x=export_dataset[index_col], y=export_dataset["Legal Status"], ax=panel)

Rimuovo outliers

In [None]:
# 5 x 3 grid sharing the y axis: one boxplot per financial index
fig, axes = plt.subplots(5, 3, figsize=(20, 25), sharey=True)

# Width of the whiskers in IQR units (Tukey's rule)
IQR_FACTOR = 1.5

# Start by keeping every row, then narrow the mask one feature at a time so that
# the final dataset is filtered on ALL features, not only on the last one.
inlier_mask = pd.Series(True, index=export_dataset.index)
for index_col in indexes_cols:
    # First and third quartile, always computed on the unfiltered column
    q1 = export_dataset[index_col].quantile(0.25)
    q3 = export_dataset[index_col].quantile(0.75)
    # Interquartile range: spread of the central half of the values
    iqr = q3 - q1
    # Fences: the lower one hangs below Q1, the upper one above Q3
    lower_bound = q1 - IQR_FACTOR * iqr
    upper_bound = q3 + IQR_FACTOR * iqr
    # between() is inclusive on both ends and False for NaN, so rows with a
    # missing value in any feature are dropped together with the outliers
    inlier_mask &= export_dataset[index_col].between(lower_bound, upper_bound)

# Rows whose every feature lies inside its [lower_bound, upper_bound] range
filtered_export_dataset = export_dataset[inlier_mask].copy()

# Show the boxplots after filtering; axes.flat walks the grid row by row
for index_col, panel in zip(indexes_cols, axes.flat):
    sns.boxplot(x=filtered_export_dataset[index_col], y=filtered_export_dataset["Legal Status"], ax=panel)

Analizzo il dataset con outlier rimossi

In [None]:
# Summary statistics after the outlier filter, one row per numeric column
filtered_export_dataset.describe().transpose()

In [None]:
# Correlate the numeric financial indexes only: filtered_export_dataset also
# carries the descriptive text columns, and DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 instead of silently skipping them.
indexes_corr = filtered_export_dataset[indexes_cols].corr()
sns.set_theme(style="white")

# Generate a mask for the upper triangle (diagonal included): the matrix is
# symmetric, so that half only repeats the lower one
mask = np.triu(np.ones_like(indexes_corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 15))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(indexes_corr, mask=mask, cmap=cmap, vmax=1,
            center=0, annot=True, fmt='.2f',
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

Guardo la correlazione tra le variabili di input e la risposta di output

In [None]:
plt.figure(figsize=(8, 12))

# Numeric encoding of the output value: Active -> 1, Bankruptcy -> 0
status_codes = {"Active": 1, "Bankruptcy": 0}

# Working copy holding the output column followed by the 15 input features
target_dataset = filtered_export_dataset[["Legal Status"] + indexes_cols].copy()
target_dataset["Legal Status"] = target_dataset["Legal Status"].replace(status_codes)

# Correlation between every pair of columns
target_corr = target_dataset.corr()

# Keep only the correlations with the output column, strongest positive first
status_corr_ranked = target_corr[["Legal Status"]].sort_values(by="Legal Status", ascending=False)

# One-column annotated heatmap on the full [-1, 1] scale
heatmap = sns.heatmap(status_corr_ranked, vmin=-1, vmax=1, annot=True, cmap="RdBu")

Guardo scatterplot tra le diverse features (a.k.a. indici finanziari)

In [None]:
# Upper bound on the rows drawn per legal status, and a fixed seed so the
# figure is the same on every run
PAIRPLOT_SAMPLE_SIZE = 1000
PAIRPLOT_SAMPLE_SEED = 42

# Take a subsample of up to PAIRPLOT_SAMPLE_SIZE companies per legal status.
# min() keeps a status left with fewer rows after filtering from raising in
# sample(); iterating the groups (instead of groupby.apply) keeps the
# "Legal Status" column in the result, and ignore_index=True gives the flat
# 0..n-1 index the plot needs.
sample = pd.concat(
    [
        status_rows.sample(n=min(len(status_rows), PAIRPLOT_SAMPLE_SIZE), random_state=PAIRPLOT_SAMPLE_SEED)
        for _, status_rows in filtered_export_dataset.groupby("Legal Status")
    ],
    ignore_index=True,
)

# Draw the scatter plot between features
sns.pairplot(sample, hue="Legal Status", corner=True, vars=indexes_cols)

Esporto in csv e pickle

In [None]:
if to_export:
    # The file name suffix records which dataset size both input paths mention;
    # anything else (including a small/big mix) is labelled "mixed"
    input_paths = (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
    if all("small" in input_path for input_path in input_paths):
        filtered_target = OUTPUT_PATH + "/filtered_active_bankruptcy_small"
    elif all("big" in input_path for input_path in input_paths):
        filtered_target = OUTPUT_PATH + "/filtered_active_bankruptcy_big"
    else:
        filtered_target = OUTPUT_PATH + "/filtered_active_bankruptcy_mixed"

    # Same frame written twice: csv first, then pickle
    filtered_export_dataset.to_csv(filtered_target + ".csv")
    filtered_export_dataset.to_pickle(filtered_target + ".pkl")