# Active dataset ETL focused on ALL financial raw values

Librerie varie da installare

In [None]:
#!pip install pandas

Inclusione delle librerie utilizzate

In [None]:
import pandas as pd
import os
import glob
import numpy as np
import re

Variabili di gestione dei file

In [None]:
# When True, the summary files are written under OUTPUT_PATH at the end of the run
to_export = True

# Folder holding the active companies .csv files (adjust to your machine)
PATH_ACTIVE = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset\Attive_rielaborate_G"

# Folder receiving the exported output files (adjust to your machine)
OUTPUT_PATH = r"C:\Users\Andre\OneDrive - Università degli Studi di Parma\Tirocinio\Dataset_output"

Leggo i file .csv contenuti in una directory e nelle sue sottodirectory (ACTIVE)

In [None]:
# Path of each csv file
# Path description: Attive_rielaborate_G/2015|2016|2017/*.csv
csv_files = glob.glob(os.path.join(PATH_ACTIVE, "**/*.csv"), recursive=True)

# One DataFrame per csv file; they are concatenated once after the loop so the
# accumulated dataset is not re-copied on every iteration
partial_datasets = []

# Loop over the list of csv files
for f in csv_files:

    # Use os.path instead of splitting on "\\" so the parsing does not depend
    # on the path separator of the operating system
    file_name = os.path.basename(f)
    # The name of the folder containing the file is used as the reference year
    year = os.path.basename(os.path.dirname(f))

    # Print the location and filename
    print('Location:', f)
    print('File Name:', file_name)
    print('Year:', year)

    # Read each csv file; every column is loaded as object (dtype=object)
    partial_dataset = pd.read_csv(f, sep=";", dtype=object, thousands=",", decimal=".")
    partial_dataset["Reference_year"] = year

    # Collect the partial dataset
    partial_datasets.append(partial_dataset)

# Dataset with all the csv files (an empty DataFrame when no csv file is found,
# because pd.concat raises on an empty list)
input_dataset = pd.concat(partial_datasets) if partial_datasets else pd.DataFrame()

print("Dimensione dataset delle società in attivo: ", input_dataset.shape)

Rimuovo colonne inutilizzabili o addirittura controproducenti per la creazione di futuri modelli di ML

In [None]:
# Columns that are unusable (or counterproductive) for the later ML models
removed_columns = [
    # descriptive columns
    "Tax Code Number",
    "CCIAA Number",
    "Vat Number",
    " Ateco 2007 C",
    "Ateco 2007 D",
    "Employees",
    # summary columns
    "TOTAL receivables due from shareholders",
    "Total asset",
    "TOTAL RECEIVABLES",
    "TOTAL PROVISIONS FOR RISKS AND CHARGES",
    "Total personnel costs",
    "Total depreciation, amortization and writedowns",
    "PROFIT (LOSS) THIRD PARTIES",
    "PROFIT (LOSS) GROUP",
    # garbage or not available bankruptcy columns
    "Tax receiv. for prepaid taxes - beyond 12 months\nEUR\n2014",
    "Research and dev. Exp",
    "Depreaction of intangible fixed assets",
    "Total third party assets",
]

# Raises KeyError if any of the listed columns is missing from the dataset
input_dataset.drop(columns=removed_columns, inplace=True)

# All columns whose name contains "Unnamed" (e.g. "Unnamed: X")
input_dataset = input_dataset[input_dataset.columns.drop(list(input_dataset.filter(regex='Unnamed')))]
# All columns ending with the literal suffix ".1" (duplicate).
# The dot is escaped: an unescaped '.' matches ANY character, which would also
# drop every column whose name merely ends in "<something>1".
input_dataset = input_dataset[input_dataset.columns.drop(list(input_dataset.filter(regex=r'\.1$')))]

Rinomino colonne con typo vari

In [None]:
# (header as found in the csv files, corrected header)
header_fixes = [
    ("Legal Status ", "Legal Status"),
    ("TOTAL SHAREHOLDERS' FUNDS", "TOTAL SHAREHOLDERS FUNDS"),
    ("TOTAL LIABILITIES AND SHAREHOLDERS' FUNDS", "TOTAL LIABILITIES AND SHAREHOLDERS FUNDS"),
    ("Due to suppliers - beyond 12 ", "Due to suppliers - beyond 12 months"),
    ("Due to shar. for loans", "Due to shareholders for loans"),
    ("Tax receiv. -  for prepaid taxes", "Tax receiv. for prepaid taxes"),
    ("No of avaiable years", "No of available years"),
    ("Dividend down paymen", "Dividend down payment"),
    ("Receiv. due from assoc. Comp", "Receiv. due from assoc. comp."),
    ("Depreciation, amortization and write-downs of fixed assets", "Depreciation, amortization and writedowns of fixed assets"),
    ("Other operating expense", "Other operating expenses"),
    ("Write-down of fixed assets", "Writedown of fixed assets"),
]

# Mapping from the wrong header to the corrected one
columns_map = {wrong: fixed for wrong, fixed in header_fixes}

# Headers that are not in the mapping are left untouched by rename
input_dataset = input_dataset.rename(columns=columns_map)

Rinomino tutte le colonne che contengono ritorni a capo (\n) e/o la valuta (EUR) alla fine del nome e standardizzo tutti i nomi in minuscolo

In [None]:
# Remove every newline and every "EUR" occurrence from each column name,
# then lowercase the result
cleaned_names = []
for original_name in input_dataset.columns:
    without_noise = re.sub('\n|EUR', '', original_name)
    cleaned_names.append(without_noise.lower())
input_dataset.columns = cleaned_names

Elimino eventuali colonne duplicate

In [None]:
# The renaming steps can make two columns share the same name:
# keep only the first occurrence of each column name
is_first_occurrence = ~input_dataset.columns.duplicated()
input_dataset = input_dataset.loc[:, is_first_occurrence]

Tutti i valori non disponibili o errati sono rimpiazzati con NaN

In [None]:
# Descriptive column names
str_cols = ['ragione sociale',
            'province',
            'legal form',
            'legal status',
            'accounting closing date'
]

# Numeric column names: every column that is not a descriptive one.
# Built by filtering, so a descriptive column missing from the dataset does
# not raise (list.remove would raise ValueError in that case).
not_str_cols = [col for col in input_dataset.columns if col not in str_cols]

# Dictionary with the wrong values, all mapped to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
replace_values = {
    "n.a.": np.nan,
    "n.d.": np.nan,
    "n.s.": np.nan,
    "nan": np.nan,
    "#VALUE!": np.nan,
    "UDATACHI": np.nan
}

input_dataset.replace(replace_values, inplace=True)

# "Active" is replaced with NaN only in the non-descriptive columns;
# the descriptive columns (legal status included) are left as they are
input_dataset[not_str_cols] = input_dataset[not_str_cols].replace(to_replace="Active", value=np.nan)

# Number of missing values per column (displayed as the cell output)
input_dataset.isna().sum()

Converto le colonne in tipi di dato appropriati

In [None]:
# Descriptive columns become strings; every other column is parsed as a number
for col in input_dataset.columns:
    column_values = input_dataset[col]
    if col not in str_cols:
        # Remove the commas before the numeric conversion
        without_commas = column_values.str.replace(',', '')
        input_dataset[col] = pd.to_numeric(without_commas)
        continue
    input_dataset[col] = column_values.astype(str)

Standardizzo la colonna Legal Status

In [None]:
# Every record in this dataset gets the same legal status value
input_dataset = input_dataset.assign(**{"legal status": "Active"})

Rimuovo tutti i record con parte descrittiva non disponibile

In [None]:
# Discard the rows where any descriptive column is missing
input_dataset = input_dataset.dropna(subset=str_cols)
# NOTE(review): the descriptive columns were cast with astype(str) beforehand, so a
# missing company name presumably appears as the literal string "nan" - confirm
has_company_name = input_dataset["ragione sociale"].ne("nan")
input_dataset = input_dataset[has_company_name]

Metto un valore di default a tutti i valori non disponibili

In [None]:
# Every value still missing gets 0 as default
input_dataset = input_dataset.fillna(value=0)

Sostituisco i valori del campo “Legal Form” con “Società di capitali”, “Società di persone” o “Altro”

In [None]:
# Legal form values mapped to "Società di capitali"
societa_capitali_values = ["S.R.L.",
                           "Limited liability company - SRL",
                           "One-person company with limited liability - SRL",
                           "S.R.L. a socio unico",
                           "S.R.L. semplificata",
                           "S.C.A.R.L.",
                           "Cooperative company with limited liability – SCARL",
                           "Limited liability company, simplified",
                           "Società consortile a responsabilita' limitata",
                           "S.P.A.",
                           "S.C.A.R.L.P.A.",
                           "Joint stock company - SPA",
                           "Cooperative company with limited liability by shares - SCARLPA",
                           "S.P.A. a socio unico",
                           "One-person joint stock company - SPA",
                           "Società consortile per azioni",
                           "Consortium by shares",
                           "PICC. S.C.A.R.L.",
                           "Cooperative company with limited liability, small - SCARL",
                           "Limited liability consortium cooperative company",
                           "Società consortile cooperativa a responsabilità limitata",
                           "S.A.P.A.",
                           "S.R.L. a capitale ridotto",
                           "Public limited company - SA",
                           "Limited partnership with shares - SAPA",
                           "S.C.A.R.I.",
                           "Consortium",
                           "Consorzio",
                           "Società di capitali"]

# Dictionary società di capitali
dict_capitali = dict.fromkeys(societa_capitali_values, "Società di capitali")

# Legal form values mapped to "Società di persone"
societa_persone_values = ["Limited partnership - SAS",
                          "S.N.C.",
                          "Società semplice",
                          "General partnership - SNC",
                          "Partnership - SS",
                          "S.A.S.",
                          "Cooperativa sociale",
                          "Social cooperative company",
                          "Società cooperativa consortile",
                          "Consortium of cooperatives",
                          "Small cooperative company",
                          "Società di persone"]

# Dictionary società di persone
dict_persone = dict.fromkeys(societa_persone_values, "Società di persone")

# General dictionary: raw legal form -> category.
# Deliberately NOT named `dict`, which would shadow the builtin for the rest
# of the session.
legal_form_map = {**dict_capitali, **dict_persone}

# Apply the dictionary; values that are not among its keys become "Altro"
input_dataset["legal form"] = input_dataset["legal form"].map(legal_form_map).fillna("Altro").astype(str)

Aggiungo colonna denominata “Company Size” per distinguere tra “Grandi”, “Medie” e “Piccole” imprese

In [None]:
# Define company size label based on some column values
def label_size(row):
    """Return the size label ("Grandi", "Medie" or "Piccole") for one record.

    Two fields of ``row`` each cast one vote:

    * "revenues from sales and services": >= 50000000 votes "Grandi",
      >= 10000000 votes "Medie", anything else votes "Piccole";
    * "number of employees": >= 250 votes "Grandi", >= 50 votes "Medie",
      anything else votes "Piccole".

    The label with most votes is returned; on a tie the larger size wins
    (Grandi before Medie before Piccole).
    """
    # (field name, lower bound for "Grandi", lower bound for "Medie")
    criteria = (
        ("revenues from sales and services", 50000000, 10000000),
        ("number of employees", 250, 50),
    )

    # Insertion order doubles as the tie-break priority used by max() below
    votes = {"Grandi": 0, "Medie": 0, "Piccole": 0}

    for field, large_from, medium_from in criteria:
        if row[field] >= large_from:
            votes["Grandi"] += 1
        elif medium_from <= row[field] < large_from:
            votes["Medie"] += 1
        else:
            votes["Piccole"] += 1

    # max() returns the first key holding the highest count
    return max(votes, key=votes.get)

In [None]:
# Compute the size label row by row and store it in a new column
company_sizes = input_dataset.apply(label_size, axis=1)
input_dataset["company size"] = company_sizes

Rinomino colonne per tenere i nomi consistenti con gli altri dataset

In [None]:
# Final header names for the columns listed below; the other columns are not renamed
columns_map = {
    "ragione sociale": "Ragione sociale",
    "province": "Province",
    "legal form": "Legal Form",
    "legal status": "Legal Status",
    "company size": "Company Size",
    "number of employees": "Number of employees",
    "accounting closing date": "Accounting closing date",
    "reference_year": "Reference_year",
}
input_dataset = input_dataset.rename(columns=columns_map)

Stampo statistiche dataset ottenuto

In [None]:
# Summary statistics, transposed (displayed as the cell output)
input_dataset.describe().transpose()

Esporto in csv e pickle

In [None]:
# Write the final dataset as csv and pickle when exporting is enabled
if to_export:
    # Create the output folder when it is missing, so the export does not fail
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    # os.path.join instead of string concatenation with a hard-coded "/"
    input_dataset.to_csv(os.path.join(OUTPUT_PATH, "active_raw_full.csv"))
    input_dataset.to_pickle(os.path.join(OUTPUT_PATH, "active_raw_full.pkl"))