# Complete dataset (active + bankrupt) focused on raw financial values

Librerie da installare (decommentare le righe seguenti se non sono già presenti nell'ambiente)

In [None]:
# Optional one-time setup: uncomment the lines below to install the
# third-party packages used by this notebook (pandas, matplotlib, seaborn).
#!pip install pandas
#!pip install matplotlib
#!pip install seaborn

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Variabili di gestione files

In [None]:
# Directory that holds the input pickles and receives the exported files.
# Change this single value when the data lives somewhere else.
DATASET_DIR = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

# Input pickle files (one for the active and one for the bankrupt companies)
PATH_ACTIVE_DATASET = DATASET_DIR + "\\active_raw.pkl"

PATH_BANKRUPT_DATASET = DATASET_DIR + "\\bankruptcy_raw.pkl"

# True = export the merged dataset (CSV + pickle) into OUTPUT_PATH
to_export = False

# Directory of the desired output files, can be changed
OUTPUT_PATH = DATASET_DIR

Leggo il dataset composto dai 2 file pickle

In [None]:
# Load the two halves of the dataset, active companies first and bankrupt
# companies second, from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only open trusted files.
active_dataset, bankrupt_dataset = (
    pd.read_pickle(pickle_path)
    for pickle_path in (PATH_ACTIVE_DATASET, PATH_BANKRUPT_DATASET)
)

Unisco i 2 dataset

In [None]:
# Stack the active and the bankrupt companies into a single frame.
# reset_index() (without drop=True) renumbers the rows from 0 and keeps each
# row's previous label as a regular column (named "index" for an unnamed index).
frames_to_merge = [active_dataset, bankrupt_dataset]
complete_dataset = pd.concat(frames_to_merge).reset_index()

Analizzo il dataset completo

In [None]:
def _format_four_decimals(value):
    """Render a float with exactly four decimal places."""
    return '%0.4f' % value


# Display every float with 4 decimals (global pandas display option)
pd.set_option('display.float_format', _format_four_decimals)

# Summary statistics, transposed so that each described column becomes a row
complete_dataset.describe().T

Guardo come sono correlate le varie voci di bilancio

In [None]:
# Pairwise correlation between the numeric columns only.
# "Legal Status" holds text labels; since pandas 2.0 DataFrame.corr() no longer
# drops non-numeric columns silently but raises, so they are filtered out
# explicitly. The "index" column left by reset_index() is a row label, not a
# balance-sheet item, so it is excluded as well (ignored if it is absent).
numeric_columns = complete_dataset.select_dtypes(include="number")
corr = numeric_columns.drop(columns="index", errors="ignore").corr()

sns.set_theme(style="white")

# Mask the upper triangle (diagonal included): the matrix is symmetric,
# so the lower triangle already shows every pair once
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure and keep an explicit handle on its axes
fig, ax = plt.subplots(figsize=(30, 30))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1,
            center=0, annot=True, fmt='.2f',
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title("Correlation between the raw financial values");

Guardo la correlazione tra le variabili di input e la risposta di output

In [None]:
# Numeric encoding of the output value
status_codes = {"Active": 0, "Bankruptcy": 1}

# Create a dataset where the output value (Active/Bankruptcy) is a numerical
# value of 0/1 and the input values are simply copied from the original dataset.
# The "index" column left by reset_index() is not a feature and is dropped;
# drop() returns a new frame, so complete_dataset itself is not modified.
target_dataset = complete_dataset.drop(columns="index")

# Series.map is used instead of replace(): replacing text with numbers relies on
# an implicit dtype downcast that pandas 2.2 deprecates. Labels that are missing
# from status_codes become NaN.
target_dataset["Legal Status"] = target_dataset["Legal Status"].map(status_codes)

# Compute correlation
target_corr = target_dataset.corr()

# Correlation of every column with the output value, highest first
status_corr = target_corr[["Legal Status"]].sort_values(by="Legal Status", ascending=False)

# Draw the sorted correlations as a single-column heatmap
fig, ax = plt.subplots(figsize=(8, 12))
heatmap = sns.heatmap(status_corr, vmin=-1, vmax=1, annot=True, cmap="RdBu", ax=ax)
ax.set_title("Correlation with Legal Status (0 = Active, 1 = Bankruptcy)");

Guardo gli scatterplot tra le diverse feature (ovvero le colonne grezze)

In [None]:
# Number of companies drawn per legal status, and the seed that makes the draw
# repeatable across runs
SAMPLE_SIZE_PER_STATUS = 1000
SAMPLE_SEED = 42

# Take a subsample of (up to) 1000 active companies and 1000 bankrupt companies.
# The groups are sampled one by one instead of through groupby().apply():
# pandas 2.2 deprecates apply() operating on the grouping column, and once that
# column is excluded the "Legal Status" values needed for the hue would be lost.
# min() keeps the cell working when a status has fewer rows than requested.
sample = pd.concat(
    [
        group.sample(n=min(len(group), SAMPLE_SIZE_PER_STATUS), random_state=SAMPLE_SEED)
        for _, group in complete_dataset.groupby("Legal Status")
    ]
).reset_index(drop=True)  # fresh 0..n-1 index for plotting

# Draw the scatter plot between features, coloured by legal status. The "index"
# column left by reset_index() is only a row label, so it is not plotted.
sns.pairplot(sample.drop(columns="index", errors="ignore"), hue="Legal Status", corner=True);

Esporto in CSV e pickle (solo se `to_export` è `True`)

In [None]:
# Write the merged dataset to disk only when exporting was requested
if to_export:
    csv_target = OUTPUT_PATH + "/complete_raw.csv"
    pickle_target = OUTPUT_PATH + "/complete_raw.pkl"

    # Same data in both formats: CSV and pickle
    complete_dataset.to_csv(csv_target)
    complete_dataset.to_pickle(pickle_target)