# Complete dataset (active + bankrupt) focused on all raw financial values

Librerie varie da installare

In [None]:
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Variabili di gestione files

In [None]:
# Folder that holds the two input pickles; adjust it to the local setup
_DATASET_DIR = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

# Input files: raw financial values of the active and of the bankrupt companies
PATH_ACTIVE_DATASET = _DATASET_DIR + "\\" + "active_raw_full.pkl"
PATH_BANKRUPT_DATASET = _DATASET_DIR + "\\" + "bankruptcy_raw_full.pkl"

# Folder that receives the exported csv/pickle files, can be changed
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

# When True, the export cells write their csv/pickle files under OUTPUT_PATH
to_export = True

# Number of records drawn from each "Legal Status" group to generate the plots
# (recommended at least 10000)
sample_size = 100000

# Strength of the outlier filter: the fences are built from the `quantile_amount`
# and `1 - quantile_amount` quantiles of each feature (recommended between 0.05 and 0.25)
quantile_amount = 0.25

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load both pickled datasets, active companies first and bankrupt ones second.
# NOTE(review): unpickling can execute arbitrary code, so only point these
# paths at files that come from a trusted source.
active_dataset, bankrupt_dataset = (
    pd.read_pickle(pickle_path)
    for pickle_path in (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
)

Unisco i 2 dataset

In [None]:
# Stack the active and the bankrupt records into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index in one step. The former
# reset_index(inplace=True) + drop(columns="index") pair raised KeyError when the
# source index carried a name, and discarded a genuine data column named "index"
# (leaving a stray "level_0" column) when one existed.
complete_dataset = pd.concat([active_dataset, bankrupt_dataset], ignore_index=True)

Analizzo il dataset completo

In [None]:
# Render every float with four decimals in the tables displayed from here on
pd.set_option("display.float_format", "{:0.4f}".format)

# Summary statistics of the complete dataset, transposed so each feature is a row
complete_dataset.describe().transpose()

Prendo sottocampione del dataset per generare i grafici

In [None]:
# Draw up to `sample_size` records from each "Legal Status" group.
# - A group with fewer than `sample_size` rows is taken whole; asking sample() for
#   more rows than the group holds (without replacement) raises ValueError.
# - The fixed seed makes the sample, and every plot built from it, reproducible.
# - Iterating over the groups instead of calling groupby.apply keeps the
#   "Legal Status" column in the result; NOTE(review): apply's treatment of the
#   grouping column differs between pandas versions - confirm if apply is restored.
SAMPLE_SEED = 42

sampled_groups = [
    group.sample(n=min(len(group), sample_size), random_state=SAMPLE_SEED)
    for _, group in complete_dataset.groupby("Legal Status")
]

# Stack the per-group samples on a fresh 0..n-1 index (needed by the plots);
# a dataset without any group simply yields an empty sample.
sample = (
    pd.concat(sampled_groups, ignore_index=True)
    if sampled_groups
    else complete_dataset.iloc[0:0].copy()
)

Guardo come sono correlate le varie voci di bilancio

In [None]:
# Pairwise correlation of the numeric features only. Calling corr() on a frame
# that still contains text columns (e.g. "Legal Status") raises on pandas >= 2.0,
# where non-numeric columns are no longer dropped silently.
corr = sample.select_dtypes(include="number").corr()

sns.set_theme(style="white")

# Mask the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already carries all the information
mask = np.triu(np.ones_like(corr, dtype=bool))

# Explicit figure/axes pair so the heatmap is drawn on a known target
fig, ax = plt.subplots(figsize=(100, 100))

# Custom diverging colormap centred on zero correlation
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1,
            center=0, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)

ax.set_title("Correlation matrix with all the raw features");

Guardo la correlazione tra le variabili di input e la risposta di output

In [None]:
# Work on a copy of the sample where the output label is numeric
# (Active -> 0, Bankruptcy -> 1) and every other column is left untouched.
# map() + astype(float) always produces a numeric column: labels outside the
# mapping become NaN (ignored by corr) instead of leaving a mixed object column.
target_dataset = sample.copy()
target_dataset["Legal Status"] = (
    target_dataset["Legal Status"].map({"Active": 0, "Bankruptcy": 1}).astype(float)
)

# Correlation restricted to numeric columns, so any remaining text column
# cannot make corr() raise
target_corr = target_dataset.select_dtypes(include="number").corr()

# Keep only the column of correlations with the target, strongest positive first
legal_status_corr = target_corr[["Legal Status"]].sort_values(by="Legal Status", ascending=False)

fig, ax = plt.subplots(figsize=(40, 50))
sns.heatmap(legal_status_corr, vmin=-1, vmax=1, annot=True, cmap="RdBu",
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Correlation with Legal Status and all the raw features");

Esporto il dataset completo in csv e pickle

In [None]:
# Persist the unfiltered dataset as csv and as pickle, only when exporting is enabled
if to_export:
    complete_export_base = OUTPUT_PATH + "/complete_active_bankruptcy_raw_full"
    complete_dataset.to_csv(complete_export_base + ".csv")
    complete_dataset.to_pickle(complete_export_base + ".pkl")

Rimuovo possibili outliers

In [None]:
def _remove_outliers(dataset, quantile, whisker=1.5):
    """Return the rows of `dataset` lying inside the quantile fences of every numeric column.

    For each numeric column the accepted range is
    ``[q_low - whisker * spread, q_high + whisker * spread]``, where ``q_low`` and
    ``q_high`` are the `quantile` and ``1 - quantile`` quantiles of that column and
    ``spread = q_high - q_low``. All fences are computed on the unfiltered data.

    A row is kept only when it is inside the range of *all* numeric columns.
    A missing value fails both comparisons, so a row with a NaN in any numeric
    column is dropped. Non-numeric columns do not take part in the filter but
    are preserved in the returned frame.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to filter; it is not modified.
    quantile : float
        Lower quantile used for the fences; the upper one is ``1 - quantile``.
    whisker : float, default 1.5
        Multiple of the inter-quantile spread added on each side.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that passed the filter, with their original index.
    """
    # Positional boolean mask: one flag per row, combined across all features
    keep = np.ones(len(dataset), dtype=bool)
    for column in dataset.select_dtypes(include="number").columns:
        values = dataset[column]
        q_low = values.quantile(quantile)
        q_high = values.quantile(1 - quantile)
        spread = q_high - q_low
        lower_bound = q_low - whisker * spread
        # The upper fence starts from the upper quantile (it used to start from the lower one)
        upper_bound = q_high + whisker * spread
        # Accumulate instead of overwriting, so every feature contributes to the filter
        keep &= ((values >= lower_bound) & (values <= upper_bound)).to_numpy()
    return dataset[keep].copy()


# Remove the records that fall outside the accepted range of at least one feature
filtered_dataset = _remove_outliers(complete_dataset, quantile_amount)

In [None]:
# Summary statistics of the filtered dataset, transposed so each feature is a row
filtered_dataset.describe().transpose()

Esporto il dataset filtrato in csv e pickle

In [None]:
# Persist the filtered dataset as csv and as pickle, only when exporting is enabled
if to_export:
    # Share of records removed by the filter, rounded to a whole percentage;
    # it becomes the suffix of the exported file names
    kept_percentage = len(filtered_dataset) / len(complete_dataset) * 100
    discarded_percentage = round(100 - kept_percentage)
    filtered_export_base = f"{OUTPUT_PATH}/filtered_active_bankruptcy_raw_full_{discarded_percentage}"
    filtered_dataset.to_csv(filtered_export_base + ".csv")
    filtered_dataset.to_pickle(filtered_export_base + ".pkl")