# Master Thesis: Bankruptcy Prediction for European Countries
Code written by Marc Zeugin (UZH)

## Load modules

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, PrecisionRecallDisplay, confusion_matrix, RocCurveDisplay
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.experimental import enable_iterative_imputer
from sklearnex.ensemble import RandomForestClassifier
from sklearnex.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import IterativeImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from skopt.space import Real, Integer
from matplotlib import pyplot as plt
from skopt import BayesSearchCV
from joblib import dump, Memory
from sklearnex.svm import SVC
from itertools import product
from tempfile import mkdtemp
from shutil import rmtree
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import warnings
warnings.filterwarnings(action="ignore")
import pyarrow
import glob

Set random seed

In [None]:
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1)

## 1. Setup key variables

In [None]:
# Run configuration: split sizes, search budget, data location, figure settings and feature flags.

# determine training/testing ratio, default is 0.2
tt_size = 0.2
# number of splits for k-fold crossvalidation, default is 5
k_splits = 5
# set number of jobs to run in parallel, -1 means all processors, default is 1
jobs = -1
# number of iterations for BayesSearchCV
n_iterations = 40
# select scoring model to optimize, default is average_precision, the value used for the precision-recall curve
scoring_metric = 'average_precision'
# label for classification report
label = ['Non-Bankrupt', 'Bankrupt']

# absolute path to the dataset folder; file names are appended by plain string
# concatenation, so the trailing slash is required
path = 'C:/Users/marczeugin/Documents/Masterthesis/datasets/'
# glob pattern selecting the dataset files inside `path`
ext = '*.csv'

# determine the size (in inches) of the precision-recall curve figure, default is (4, 2)
figure_size = (4, 2)
# set dpi of small graphs, default is 100
dpi_low = 100
# set dpi of medium graphs, default is 200
dpi_med = 200
# set dpi of large graphs, default is 250
dpi_high = 250

# enable subsample of total dataset to be used for hyperparameter tuning, default is True
allow_subsample = True
# set subsample size of total dataset, default is .1, i.e. 10%
subsample_size = 0.1
# enable quick overview to run with subsample
allow_subsample_overview = True
# enable hyperparameter tuning with subsample
allow_subsample_hyperparameter = True

# load the cached feather files (imputation and SMOTEENN already applied) instead of recomputing them
# NOTE(review): the original comment states the default is True, but the flag is set to False here
allow_computed_set = False

## 2. Import data from CSV file

In [None]:
# Read every CSV export found under `path` and stack them into one frame.
# The matches are sorted because glob returns them in arbitrary order; a stable
# row order keeps the seeded train/test splits further down reproducible.
csv_files = sorted(glob.glob(path+ext))
if not csv_files:
    # fail early with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(f'No files matching {ext} found in {path}')

li = []
for file_name in csv_files:
    # both the string 'n.a.' and the number 0 are read as missing values
    df = pd.read_csv(file_name, na_values=['n.a.',0], index_col=False)
    # discard the two trailing columns, then the columns listed below
    df.drop(columns=df.columns[-2:], inplace=True)
    df.drop(columns=['Company name Latin alphabet', 'Quoted', 'Branch', 'OwnData', 'Woco', 'NACE Rev. 2 core code (4 digits)'], inplace=True)
    li.append(df)

bankruptcy_comp_df = pd.concat(li, axis=0, ignore_index=True)
print(bankruptcy_comp_df.shape)

### 2.1. Initial dataset manipulation

#### 2.1.1. Set bankrupt to 1 or 0

In [None]:
# Encode the 'Inactive' flag numerically ("Yes" -> 1, "No" -> 0) and store it as int8.
inactive_codes = {"Yes": 1, "No": 0}
bankruptcy_comp_df = (
    bankruptcy_comp_df
    .replace({'Inactive': inactive_codes})
    .astype({'Inactive': np.int8})
)

#### 2.1.2. Rename bankrupt column

In [None]:
# Rename the target column to 'Bankrupt' and drop the word "growth" from the six GNI columns.
gni_periods = ['last Year', 'Year - 1', 'Year - 2', 'Year - 3', 'Year - 4', 'Year - 5']
column_renames = {'Inactive': 'Bankrupt'}
column_renames.update({f'GNI growth {period}': f'GNI {period}' for period in gni_periods})
bankruptcy_comp_df = bankruptcy_comp_df.rename(columns=column_renames)

### 2.2. Remove from dataset

#### 2.2.1. Remove data based on year

In [None]:
# Keep only firms whose last available year lies strictly between 2000 and 2021.
original_rows = bankruptcy_comp_df.shape[0]
last_year = bankruptcy_comp_df['Last avail. Year']
bankruptcy_comp_df = bankruptcy_comp_df[(last_year > 2000) & (last_year < 2021)]
new_rows = bankruptcy_comp_df.shape[0]
print(f'Removed a total of {original_rows - new_rows} rows, that is {round(100-new_rows/original_rows*100,4)}%')

#### 2.2.2. Remove data based on missing values

In [None]:
# Discard rows in which more than a quarter of the columns are missing.
max_missing_per_row = bankruptcy_comp_df.shape[1]/4
missing_per_row = bankruptcy_comp_df.isnull().sum(axis=1)
bankruptcy_comp_df = bankruptcy_comp_df[missing_per_row <= max_missing_per_row]
new_rows = bankruptcy_comp_df.shape[0]
# original_rows is not reset in this cell, so the figure printed is relative to the count taken earlier
print(f'Removed a total of {original_rows - new_rows} rows, that is {round(100-new_rows/original_rows*100,4)}%')

### 2.3. Display basic information of the dataset

In [None]:
# Report the (rows, columns) shape of the dataset at this point.
print(f'Initial dataset shape: {bankruptcy_comp_df.shape}')

### 2.4. Check dataset quality

In [None]:
# Count missing cells across the whole frame and the number of fully duplicated rows.
missing_values_count = bankruptcy_comp_df.isna().sum().sum()
duplicates = bankruptcy_comp_df.duplicated().sum()

print(f'Total missing values: {missing_values_count}')
print('--'*60)
print(f'Any duplicated values: {duplicates}')

#### 2.4.1. Remove duplicates

In [None]:
# Drop fully duplicated rows (only when the check above found any) and re-count afterwards.
if duplicates > 0:
    bankruptcy_comp_df = bankruptcy_comp_df.drop_duplicates()
    print(f'Dataset shape after removing duplicates: {bankruptcy_comp_df.shape}')
    duplicates = bankruptcy_comp_df.duplicated().sum()
    print(f'Duplicated values left: {duplicates}')

#### 2.4.2. Check missing values in columns

In [None]:
# Missing-value count per column, largest first.
print(bankruptcy_comp_df.isna().sum().sort_values(ascending=False))

#### 2.4.3. Remove columns with more than 50% missing values
This removes 5 columns with more than 99.89% missing values identified in 2.4.2.

In [None]:
# Remove the five EBIT columns from the dataset.
ebit_columns = [f'EBIT {period}' for period in ('Year - 4', 'Year - 3', 'Year - 2', 'Year - 1', 'last Year')]
bankruptcy_comp_df = bankruptcy_comp_df.drop(columns=ebit_columns)

In [None]:
# Re-count missing cells after the EBIT columns are gone; the imputation cells
# below only run when this count is positive.
missing_values_count = bankruptcy_comp_df.isna().sum().sum()
print(f'Total missing values: {missing_values_count}')

#### 2.4.4. Split train and test data

In [None]:
# Separate the feature columns (X) from the binary target column (y).
X = bankruptcy_comp_df.drop(columns=['Bankrupt'], axis=1)
y = bankruptcy_comp_df['Bankrupt']

In [None]:
# NOTE(review): this value is later passed as `test_size` when the tuning
# subsample is carved out of X_train. With tt_size=0.2 and subsample_size=0.1 it
# is ~0.889, i.e. ~11% of the training rows are kept, which is not exactly 10% of
# the full dataset - confirm the formula matches the intended subsample size.
if allow_subsample:
    subsample_size_inversed = (1 - tt_size)/(1 - subsample_size)

# Stratified on y so both parts keep the same bankrupt / non-bankrupt ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=tt_size, random_state=1, stratify=y)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

#### 2.4.5. Impute train data

In [None]:
# Fill missing training values, or load a previously imputed training set from disk.
if allow_computed_set:
    X_train = pd.read_feather(path+'0after_imputation_train.feather')
    # the feather file carries the former row index as an 'index' column
    X_train = X_train.set_index('index')
elif missing_values_count > 0:
    # turn literal 'NaN' strings into real missing values
    X_train = X_train.replace('NaN', np.nan)
    # the first two columns are excluded from imputation
    # (presumably 'Country ISO code' and 'Last avail. Year' - verify against the CSV column order)
    values = X_train.iloc[:,2:].values
    imputer = IterativeImputer(random_state=1, tol=1e-8)
    # fitted on the training rows only; the test-set cell reuses this fitted imputer
    imputer.fit(values)
    imputed_train_values = imputer.transform(values)
    X_train.iloc[:,2:] = imputed_train_values
    print(f'Missing train values: {np.isnan(imputed_train_values).sum()}')
    missing_values = []

In [None]:
print(X_train.shape)
# Cache the imputed training features. Skipped when they were just loaded from
# that same cache, which matches the guard used for the test-set cache.
if not allow_computed_set:
    bankruptcy_comp_df_reset_index = X_train.reset_index()
    bankruptcy_comp_df_reset_index.to_feather(path+'0after_imputation_train.feather')

#### 2.4.6. Impute test data

In [None]:
# Fill missing test values, or load a previously imputed test set from disk.
if allow_computed_set:
    X_test = pd.read_feather(path+'0after_imputation_test.feather')
    # the feather file carries the former row index as an 'index' column
    X_test = X_test.set_index('index')
elif missing_values_count > 0:
    # turn literal 'NaN' strings into real missing values
    X_test = X_test.replace('NaN', np.nan)
    # same column range as for the training set (everything after the first two columns)
    values = X_test.iloc[:,2:].values
    # transform only: `imputer` was fitted on the training rows and is not refitted here
    imputed_test_values = imputer.transform(values)
    X_test.iloc[:,2:] = imputed_test_values
    print(f'Missing test values: {np.isnan(imputed_test_values).sum()}')
    missing_values = []

Save results

In [None]:
print(X_test.shape)
# Cache the imputed test features unless they were loaded from the cache in the first place.
if not allow_computed_set:
    X_test.reset_index().to_feather(path+'0after_imputation_test.feather')

In [None]:
# Shapes of the training features and the training target.
print(X_train.shape, y_train.shape)

In [None]:
# Carve a smaller sample out of the training data and split it again into its
# own train/test parts.
if allow_subsample:
    # keep the "train" side of this split as the subsample; the larger remainder is discarded
    X_sub, _, y_sub, _ = train_test_split(X_train, y_train, test_size=subsample_size_inversed, random_state=1, stratify=y_train)
    
    # NOTE(review): unlike the two splits above, this one is not stratified, so the
    # class ratio of the subsample's train/test parts may drift - confirm this is intended
    X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub, y_sub, test_size=tt_size, random_state=1)
    print(X_train_sub.shape, X_test_sub.shape)

In [None]:
# Frames that the feature-engineering loops below modify in place.
# The list holds references to the objects bound at this point: once a later cell
# rebinds e.g. X_train to a new DataFrame, this list still points at the old one.
# X_train_sub / X_test_sub only exist when allow_subsample is True.
df_list = [X_train, X_test, X_train_sub, X_test_sub]

### 2.5. Create ratios and growths

#### 2.5.1. Create macroeconomic variables

In [None]:
# Suffixes of the raw yearly columns and the shorter suffixes given to derived columns.
variables_ending = ['last Year', 'Year - 1', 'Year - 2', 'Year - 3', 'Year - 4', 'Year - 5']
new_variables_ending = ['t', 't-1', 't-2', 't-3', 't-4']

# Macro level columns and the names of the growth columns derived from them (same order).
macro_variables_name = ['Inflation', 'Deficit GDP', 'Unemployment Rate', 'Reserves', 'Interest Rates', 'CPI', 'GNI']
macro_variables_growth_name = ['Inflation growth', 'Deficit growth', 'Unemployment growth', 'Reserves growth', 'Interest rates growth', 'CPI growth', 'GNI growth']

for frame in df_list:
    # Special case for government deficit: revenue minus expense, replacing both inputs.
    for ending in variables_ending:
        revenue_col = 'Revenue GDP ' + ending
        expense_col = 'Expense GDP ' + ending
        frame['Deficit GDP ' + ending] = frame[revenue_col] - frame[expense_col]
        frame.drop(columns=[revenue_col, expense_col], inplace=True)

    # Relative change from each year to the one before it, for every macro variable.
    for level_name, growth_name in zip(macro_variables_name, macro_variables_growth_name):
        for k, new_ending in enumerate(new_variables_ending):
            present = frame[level_name + " " + variables_ending[k]]
            past = frame[level_name + " " + variables_ending[k+1]]
            frame[growth_name + " " + new_ending] = (present - past)/past

#### 2.5.2. Remove macroeconomic non-growth variables

In [None]:
# Level columns of every macro variable for every year; only the growth columns are kept.
macro_level_columns = [f'{macro} {ending}' for macro, ending in product(macro_variables_name, variables_ending)]
# Give the remaining GDP growth columns the short period suffixes (t, t-1, ...).
gdp_renames = {f'GDP growth {old}': f'GDP growth {new}' for old, new in zip(variables_ending, new_variables_ending)}
# Largest index values of the two name lists; they feed the count printed below.
last_macro_index = len(macro_variables_name) - 1
last_ending_index = len(variables_ending) - 1

for frame in df_list:
    frame.drop(columns=macro_level_columns, inplace=True)
    # the oldest GDP growth year has no short-suffix counterpart and is dropped
    frame.drop(columns=['GDP growth Year - 5'], inplace=True)
    frame.rename(columns=gdp_renames, inplace=True)

    print(f'Removed {12+last_macro_index*last_ending_index+1} columns')

#### 2.5.3. Create financial ratios

In [None]:
# Financial items are only used for the five most recent years, so the oldest suffix is dropped.
# NOTE: this shortens the list on every execution of the cell.
variables_ending = variables_ending[:-1]

# Ratio definitions: quotient[i] = dividend[i] / divider[i], computed for every period.
financial_variables_dividend = ['Net income', 'Current assets', 'Sales', 'Current assets', 'Cash flow', 'Net income', 'Cash & cash equivalent', 'Net income']
financial_variables_divider = ['Total assets', 'Current liabilities', 'Total assets', 'Total assets', 'Total assets', 'Financial expenses',
                               'Current liabilities', 'Sales']
financial_variables_quotient = ['PR1', 'LiR1', 'OR1', 'LiR2', 'CFR1', 'SR1', 'LiR3', 'PR2']

ratio_specs = list(zip(financial_variables_quotient, financial_variables_dividend, financial_variables_divider))
period_pairs = list(zip(variables_ending, new_variables_ending))

for frame in df_list:
    for ratio, numerator, denominator in ratio_specs:
        for ending, new_ending in period_pairs:
            frame[f'{ratio} {new_ending}'] = frame[f'{numerator} {ending}']/frame[f'{denominator} {ending}']

    # Special case for CSR1: (current + non-current liabilities) / total assets
    for ending, new_ending in period_pairs:
        liabilities = frame['Current liabilities ' + ending] + frame['Non-current liabilities ' + ending]
        frame['CSR1 ' + new_ending] = liabilities/frame['Total assets ' + ending]

    # Special case for CFR2: cash flow / (current + non-current liabilities)
    for ending, new_ending in period_pairs:
        liabilities = frame['Current liabilities ' + ending] + frame['Non-current liabilities ' + ending]
        frame['CFR2 ' + new_ending] = frame['Cash flow ' + ending]/liabilities

    # Special case for GR1: relative change of total assets versus the previous year
    # (needs two consecutive years, hence one period fewer than the other ratios)
    for k in range(len(variables_ending)-1):
        assets_now = frame['Total assets ' + variables_ending[k]]
        assets_before = frame['Total assets ' + variables_ending[k+1]]
        frame['GR1 ' + new_variables_ending[k]] = (assets_now - assets_before)/assets_before

#### 2.5.4. Remove financial non-ratio variables

In [None]:
# Raw financial items that only served as inputs to the ratios and are dropped afterwards.
financial_variables_to_remove = ['Net income', 'Current assets', 'Sales', 'Total assets', 'Cash flow', 'Cash & cash equivalent', 'Current liabilities', 
                                 'Non-current liabilities', 'Financial expenses']
# One column per (item, period) combination.
columns_to_remove = [f'{variable} {ending}' for variable in financial_variables_to_remove for ending in variables_ending]

for frame in df_list:
    frame.drop(columns=columns_to_remove, inplace=True)

    # Report the number of columns actually dropped; a product of the last loop
    # indices would under-count (8*4 instead of 9*5).
    print(f'{len(columns_to_remove)} columns deleted')

#### 2.5.5. Remove rows with inf or -inf

In [None]:
# Drop every training row whose CFR2 ratio is +inf or -inf in any of the five
# periods, removing the same index labels from the target so both stay aligned.
# NOTE(review): only the CFR2 columns are checked; other ratio columns are not filtered here.
old_shape = X_train.shape[0]
cfr2_columns = ['CFR2 t', 'CFR2 t-1', 'CFR2 t-2', 'CFR2 t-3', 'CFR2 t-4']
has_infinite_cfr2 = X_train[cfr2_columns].isin([np.inf, -np.inf]).any(axis=1)
infinite_rows = X_train.index[has_infinite_cfr2]
y_train = y_train.drop(infinite_rows, axis=0)
X_train = X_train.drop(infinite_rows, axis=0)
new_shape = X_train.shape[0]
print(f'removed {old_shape - new_shape} new shapes {X_train.shape} & {y_train.shape}')

In [None]:
# Drop every test row whose CFR2 ratio is +inf or -inf in any of the five
# periods, removing the same index labels from the target so both stay aligned.
# NOTE(review): only the CFR2 columns are checked; other ratio columns are not filtered here.
old_shape = X_test.shape[0]
cfr2_columns = ['CFR2 t', 'CFR2 t-1', 'CFR2 t-2', 'CFR2 t-3', 'CFR2 t-4']
has_infinite_cfr2 = X_test[cfr2_columns].isin([np.inf, -np.inf]).any(axis=1)
infinite_rows = X_test.index[has_infinite_cfr2]
y_test = y_test.drop(infinite_rows, axis=0)
X_test = X_test.drop(infinite_rows, axis=0)
new_shape = X_test.shape[0]
print(f'removed {old_shape - new_shape} new shapes {X_test.shape} & {y_test.shape}')

In [None]:
# Drop every row of the subsample's training part whose CFR2 ratio is +inf or
# -inf in any of the five periods, keeping the target aligned.
# NOTE(review): only the CFR2 columns are checked; other ratio columns are not filtered here.
old_shape = X_train_sub.shape[0]
cfr2_columns = ['CFR2 t', 'CFR2 t-1', 'CFR2 t-2', 'CFR2 t-3', 'CFR2 t-4']
has_infinite_cfr2 = X_train_sub[cfr2_columns].isin([np.inf, -np.inf]).any(axis=1)
infinite_rows = X_train_sub.index[has_infinite_cfr2]
y_train_sub = y_train_sub.drop(infinite_rows, axis=0)
X_train_sub = X_train_sub.drop(infinite_rows, axis=0)
new_shape = X_train_sub.shape[0]
print(f'removed {old_shape - new_shape} new shapes {X_train_sub.shape} & {y_train_sub.shape}')

In [None]:
# Drop every row of the subsample's test part whose CFR2 ratio is +inf or -inf
# in any of the five periods, keeping the target aligned.
# NOTE(review): only the CFR2 columns are checked; other ratio columns are not filtered here.
old_shape = X_test_sub.shape[0]
cfr2_columns = ['CFR2 t', 'CFR2 t-1', 'CFR2 t-2', 'CFR2 t-3', 'CFR2 t-4']
has_infinite_cfr2 = X_test_sub[cfr2_columns].isin([np.inf, -np.inf]).any(axis=1)
infinite_rows = X_test_sub.index[has_infinite_cfr2]
y_test_sub = y_test_sub.drop(infinite_rows, axis=0)
X_test_sub = X_test_sub.drop(infinite_rows, axis=0)
new_shape = X_test_sub.shape[0]
print(f'removed {old_shape - new_shape} new shapes {X_test_sub.shape} & {y_test_sub.shape}')

### 2.6. Downcast datatypes
Numeric columns are downcast to a smaller float type to reduce memory usage.

In [None]:
# Downcast the numeric feature columns to the smallest float dtype that fits.
# The list is rebuilt first: the inf filter rebinds X_train & co. to new
# DataFrames, so a list created before it would still reference the old frames.
df_list = [X_train, X_test, X_train_sub, X_test_sub]

for frame in df_list:
    frame.info()
    print('--'*60)
    # everything after the first two columns is treated as numeric feature data
    numeric_columns = frame.columns[2:]
    # Assigning by column label replaces the columns, so the smaller dtype is kept.
    # Writing through .iloc[:, 2:] sets values into the existing float64 columns
    # (pandas >= 2.0), which silently undoes the downcast.
    frame[numeric_columns] = frame[numeric_columns].apply(pd.to_numeric, downcast='float')
    # info() prints its report itself and returns None, so it is not wrapped in print()
    frame.info()

### 2.7. Check highly correlated features
Flagging all highly correlated features in a single pass results in the same features being removed as removing them sequentially; thus the computationally more efficient single-pass approach is chosen.

In [None]:
# Flag every feature whose absolute correlation with an earlier column exceeds 0.9.
# correlated_features collects the column positions within X_train,
# correlated_features_names the matching column names.
correlated_features = []
correlated_features_names = []
# Restrict to numeric columns explicitly: X_train still carries the string column
# 'Country ISO code' here, and DataFrame.corr() raises on non-numeric columns
# in pandas >= 2.0 instead of silently skipping them.
correlation_matrix = X_train.select_dtypes(include='number').corr()
highly_correlated = correlation_matrix.abs().to_numpy() > 0.9

for i, column in enumerate(correlation_matrix.columns):
    # only columns j < i are inspected, so the first member of a correlated pair
    # is kept and each later member is flagged exactly once
    if highly_correlated[i, :i].any():
        correlated_features.append(X_train.columns.get_loc(column))
        correlated_features_names.append(column)

In [None]:
# Names and number of the features flagged as highly correlated.
print(correlated_features_names)
print(len(correlated_features))

### 2.8. Display more detailed infos of the dataset

In [None]:
# Absolute and relative size of both classes in the complete (pre-split) dataset.
class_counts = bankruptcy_comp_df["Bankrupt"].value_counts()
total_firms = len(bankruptcy_comp_df)
print(f'{class_counts[0]} non-bankrupt companies - {round(class_counts[0]/total_firms*100,2)}% of the dataset')
print(f'{class_counts[1]} bankrupt companies - {round(class_counts[1]/total_firms*100,2)}% of the dataset')

#### 2.8.1. Show class distribution

In [None]:
# Bar chart of the two classes with the firm count written above each bar.
class_counts = bankruptcy_comp_df["Bankrupt"].value_counts()

plt.figure(figsize=(8, 5))
sns.countplot(x=bankruptcy_comp_df['Bankrupt'], palette=['#000000','#808080'])
plt.title('Class distribution: non-bankrupt and bankrupt firms', fontsize=14, pad=20)
plt.ticklabel_format(style='plain', axis='y')
plt.yticks(np.arange(0, 2500000, 500000))
plt.ylabel('Number of firms')
# (horizontal position, vertical offset above the bar, class value)
for x_pos, y_offset, class_value in ((-0.19, 10000, 0), (0.83, 20000, 1)):
    plt.text(x=x_pos, y=class_counts[class_value]+y_offset, s=f'n={class_counts[class_value]:,}')
plt.show()

#### 2.8.2. Show distribution for all features of training set

In [None]:
# One 50-bin histogram per column of the training set.
X_train.hist(figsize=(50,50), bins=50, edgecolor='black', color='Grey')
plt.show()

In [None]:
# Horizontal box plots of the training set, drawn on a logarithmic x-axis.
plt.figure(figsize=(50,50))
ax = sns.boxplot(data=X_train, orient='h', palette='Greys')
ax.set_title('Variables Boxplots', fontsize=40)
ax.set(xscale='log')
plt.show()

In [None]:
# Correlation heatmap of the training features.
# Only numeric columns are correlated: X_train still contains the string column
# 'Country ISO code', on which DataFrame.corr() raises in pandas >= 2.0.
corr = X_train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(50,50))
sns.heatmap(corr, ax=ax, cmap='Greys', linewidth=0.1)

In [None]:
# Summary statistics of the training set (shown as the cell's output).
X_train.describe()

#### 2.8.3. Show distribution for all features of testing set

In [None]:
# One 50-bin histogram per column of the test set.
X_test.hist(figsize=(50,50), bins=50, edgecolor='black', color='Grey')
plt.show()

In [None]:
# Horizontal box plots of the test set, drawn on a logarithmic x-axis.
plt.figure(figsize=(50,50))
ax = sns.boxplot(data=X_test, orient='h', palette='Greys')
ax.set_title('Variables Boxplots', fontsize=40)
ax.set(xscale='log')
plt.show()

In [None]:
# Correlation heatmap of the test features.
# Only numeric columns are correlated: X_test still contains the string column
# 'Country ISO code', on which DataFrame.corr() raises in pandas >= 2.0.
corr = X_test.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(50,50))
sns.heatmap(corr, ax=ax, cmap='Greys', linewidth=0.1)

In [None]:
# Summary statistics of the test set (shown as the cell's output).
X_test.describe()

#### 2.8.4. Show distribution of complete dataset

In [None]:
# Stack the processed training and test features back into one frame (training rows first).
bankruptcy_df = pd.concat([X_train, X_test])
print(bankruptcy_df.shape)

Save intermediate results

In [None]:
# Write the combined frame to 'complete_list.npy' in the current working directory.
# NOTE(review): np.save stores the array data only - column names and index are
# presumably lost; confirm that whatever reads this file does not need them.
np.save('complete_list.npy', bankruptcy_df)

List all variables, their mean, and standard deviation

In [None]:
# [name, mean, standard deviation] for every column except the two non-feature columns.
skipped_columns = ('Country ISO code', 'Last avail. Year')
stats_list = [
    [col, bankruptcy_df[col].mean(), bankruptcy_df[col].std()]
    for col in bankruptcy_df.columns
    if col not in skipped_columns
]

print(len(stats_list))

In [None]:
# Emit one LaTeX table row per variable: name & mean & std, rounded to 4 decimals.
for variable, mean_value, std_value in stats_list:
    print(f'{variable} & {round(mean_value,4)} & {round(std_value,4)} & \\\\')

#### 2.8.5. Selected distributions

In [None]:
# Distribution of the most recent GDP growth figure among bankrupt firms.
# The short 't' suffix is only given to the frames in df_list, not to
# bankruptcy_comp_df, so fall back to the original column name when it is absent.
gdp_column = 'GDP growth t' if 'GDP growth t' in bankruptcy_comp_df.columns else 'GDP growth last Year'
# .values (plural) exposes the underlying array; a Series has no .value attribute
GDP_growth_t = bankruptcy_comp_df[gdp_column].loc[bankruptcy_comp_df['Bankrupt'] == 1].values
sns.displot(GDP_growth_t)

#### 2.8.6. Different distribution

In [None]:
# Per country: number of bankrupt firms, number of firms, and the bankrupt share as a percentage string.
print(bankruptcy_comp_df.groupby('Country ISO code').Bankrupt.agg(no_bankrupt=('sum'), no_firms=('count'), bankrupt_percentage=(lambda x: str(round(x.sum()/x.count()*100,4))+"%")))

In [None]:
# Per last available year: number of bankrupt firms, non-bankrupt firms (as a string) and all firms.
print(bankruptcy_comp_df.groupby('Last avail. Year').Bankrupt.agg(no_bankrupt=('sum'), no_non_bankrupt=(lambda x: str(x.count()-x.sum())), no_firms=('count')))

## 3. Creating datasets (Models 1-4)

In [None]:
# Re-attach the target as a 'Bankrupt' column on each feature frame.
X_train['Bankrupt'] = y_train
X_test['Bankrupt'] = y_test
X_train_sub['Bankrupt'] = y_train_sub
X_test_sub['Bankrupt'] = y_test_sub
# These are additional names for the same DataFrame objects, not copies; they keep
# referring to the frames with country, year and target once X_train / X_test are
# rebound to reduced frames further down.
X_train_for_smoteenn = X_train
X_test_for_smoteenn = X_test

### 3.1. Info on training & testing split

In [None]:
# Shapes of the training and test frames, including the re-attached target column.
print(X_train.shape)
print(X_test.shape)

In [None]:
# Reduce all four frames to the model inputs by removing the target and the two
# identifying columns. drop() returns new frames, so the originals are untouched.
non_feature_columns = ['Bankrupt', 'Country ISO code', 'Last avail. Year']
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)
X_train_sub = X_train_sub.drop(columns=non_feature_columns)
X_test_sub = X_test_sub.drop(columns=non_feature_columns)

### 3.2. Creating models 1-4

#### 3.2.1. Complete (Model 1)

In [None]:
# Model 1 (all features): shapes and share of bankrupt firms for the full
# train/test sets and, if enabled, for the subsample's train/test parts.
print(f'Shape of X train: {X_train.shape} | Shape of y train: {y_train.shape} | Percentage bankrupt: {round(y_train.sum()/y_train.count()*100,3)}%')
print(f'Shape of X test: {X_test.shape} | Shape of y test: {y_test.shape} | Percentage bankrupt: {round(y_test.sum()/y_test.count()*100,3)}%')
if allow_subsample:
    print(f'Shape of X_sub: {X_train_sub.shape} | Shape of y_sub: {y_train_sub.shape} | Percentage bankrupt: {round(y_train_sub.sum()/y_train_sub.count()*100,3)}%')
    print(f'Shape of X_sub: {X_test_sub.shape} | Shape of y_sub: {y_test_sub.shape} | Percentage bankrupt: {round(y_test_sub.sum()/y_test_sub.count()*100,3)}%')

In [None]:
# Standardise the features using statistics learned from training data only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Reuse the training mean/std for the test set: refitting on the test data would
# scale the two sets differently and leak test-set statistics into the evaluation.
X_test = scaler.transform(X_test)
if allow_subsample:
    # The subsample gets its own scaler, fitted on its training part, so that
    # `scaler` stays fitted on the full training set.
    scaler_sub = StandardScaler()
    X_train_sub = scaler_sub.fit_transform(X_train_sub)
    X_test_sub = scaler_sub.transform(X_test_sub)

#### 3.2.2. Without time (Model 2)

In [None]:
# Model 2: keep every fifth column, starting with the first.
# NOTE(review): this assumes the features are laid out in consecutive groups of
# five periods (t ... t-4) so that the step keeps the 't' column of each group -
# confirm against the actual column order.
X_y_train = X_train[:,::5]
X_y_test = X_test[:,::5]
print(f'Shape of X train: {X_y_train.shape} | Shape of y train: {y_train.shape} | Percentage bankrupt: {round(y_train.sum()/y_train.count()*100,3)}%')
print(f'Shape of X test: {X_y_test.shape} | Shape of y test: {y_test.shape} | Percentage bankrupt: {round(y_test.sum()/y_test.count()*100,3)}%')
if allow_subsample:
    X_y_train_sub = X_train_sub[:,::5]
    X_y_test_sub = X_test_sub[:,::5]
    print(f'Shape of X_sub: {X_y_train_sub.shape} | Shape of y_sub: {y_train_sub.shape} | Percentage bankrupt: {round(y_train_sub.sum()/y_train_sub.count()*100,3)}%')
    print(f'Shape of X_sub: {X_y_test_sub.shape} | Shape of y_sub: {y_test_sub.shape} | Percentage bankrupt: {round(y_test_sub.sum()/y_test_sub.count()*100,3)}%')

#### 3.2.3. Without macro (Model 3)

In [None]:
# Model 3: drop the first 40 columns and keep the rest.
# NOTE(review): presumably the first 40 columns are the macroeconomic block
# (8 variables x 5 periods) - confirm against the actual column order.
X_m_train = X_train[:,40:]
X_m_test = X_test[:,40:]
print(f'Shape of X train: {X_m_train.shape} | Shape of y train: {y_train.shape} | Percentage bankrupt: {round(y_train.sum()/y_train.count()*100,3)}%')
print(f'Shape of X test: {X_m_test.shape} | Shape of y test: {y_test.shape} | Percentage bankrupt: {round(y_test.sum()/y_test.count()*100,3)}%')
if allow_subsample:
    X_m_train_sub = X_train_sub[:,40:]
    X_m_test_sub = X_test_sub[:,40:]
    print(f'Shape of X_sub: {X_m_train_sub.shape} | Shape of y_sub: {y_train_sub.shape} | Percentage bankrupt: {round(y_train_sub.sum()/y_train_sub.count()*100,3)}%')
    print(f'Shape of X_sub: {X_m_test_sub.shape} | Shape of y_sub: {y_test_sub.shape} | Percentage bankrupt: {round(y_test_sub.sum()/y_test_sub.count()*100,3)}%')

#### 3.2.4. Without time and macro (Model 4)

In [None]:
# Model 4: apply the every-fifth-column selection to the Model 3 matrices.
# NOTE(review): relies on the same column-layout assumptions as Models 2 and 3 - confirm.
X_y_m_train = X_m_train[:,::5]
X_y_m_test = X_m_test[:,::5]
print(f'Shape of X train: {X_y_m_train.shape} | Shape of y train: {y_train.shape} | Percentage bankrupt: {round(y_train.sum()/y_train.count()*100,3)}%')
print(f'Shape of X test: {X_y_m_test.shape} | Shape of y test: {y_test.shape} | Percentage bankrupt: {round(y_test.sum()/y_test.count()*100,3)}%')
if allow_subsample:
    X_y_m_train_sub = X_m_train_sub[:,::5]
    X_y_m_test_sub = X_m_test_sub[:,::5]
    print(f'Shape of X_sub: {X_y_m_train_sub.shape} | Shape of y_sub: {y_train_sub.shape} | Percentage bankrupt: {round(y_train_sub.sum()/y_train_sub.count()*100,3)}%')
    print(f'Shape of X_sub: {X_y_m_test_sub.shape} | Shape of y_sub: {y_test_sub.shape} | Percentage bankrupt: {round(y_test_sub.sum()/y_test_sub.count()*100,3)}%')

## 4. Feature selection

Get index of highly correlated features to a list

In [None]:
# Build one boolean "remove this column" mask per model variant.
X_train_for_smoteenn_remove = X_train_for_smoteenn.drop(columns=['Country ISO code', 'Last avail. Year', 'Bankrupt'], axis=1)
# one flag per feature column, initially "keep"
remove_complete = [False]*X_train_for_smoteenn_remove.shape[1]
for el in correlated_features:
    # NOTE(review): the stored positions refer to a frame that still had two extra
    # columns in front of the features (presumably country and year), hence the
    # shift by 2 - confirm those two columns really come first
    remove_complete[(int(el)-2)] = True
    
# the same slicing that defines Models 2-4 is applied to the mask so that the
# positions line up with each model's feature matrix
remove_complete_y = remove_complete[::5]
remove_complete_m = remove_complete[40:]
remove_complete_y_m = remove_complete_m[::5]

Remove highly correlated features from training set

In [None]:
# Turn each boolean mask into the list of positions flagged for removal.
remove_complete = np.flatnonzero(remove_complete).tolist()
remove_complete_y = np.flatnonzero(remove_complete_y).tolist()
remove_complete_m = np.flatnonzero(remove_complete_m).tolist()
remove_complete_y_m = np.flatnonzero(remove_complete_y_m).tolist()

In [None]:
# Delete the flagged columns from every training matrix (full data and subsample),
# using the index list that matches each model variant.
X_train = np.delete(X_train, remove_complete, axis=1)
X_y_train = np.delete(X_y_train, remove_complete_y, axis=1)
X_m_train = np.delete(X_m_train, remove_complete_m, axis=1)
X_y_m_train = np.delete(X_y_m_train, remove_complete_y_m, axis=1)
X_train_sub = np.delete(X_train_sub, remove_complete, axis=1)
X_y_train_sub = np.delete(X_y_train_sub, remove_complete_y, axis=1)
X_m_train_sub = np.delete(X_m_train_sub, remove_complete_m, axis=1)
X_y_m_train_sub = np.delete(X_y_m_train_sub, remove_complete_y_m, axis=1)

In [None]:
# Delete the same flagged columns from the full-data test matrices.
# NOTE(review): the subsample test matrices (X_test_sub, X_y_test_sub, ...) are not
# reduced in this cell - confirm they are handled elsewhere, otherwise their column
# count no longer matches the reduced subsample training matrices.
X_test = np.delete(X_test, remove_complete, axis=1)
X_y_test = np.delete(X_y_test, remove_complete_y, axis=1)
X_m_test = np.delete(X_m_test, remove_complete_m, axis=1)
X_y_m_test = np.delete(X_y_m_test, remove_complete_y_m, axis=1)

In [None]:
# Number of features left per model variant (length of the first row of each matrix).
print(f'X train number of features: {len(X_train[0])}')
print(f'X y train number of features: {len(X_y_train[0])}')
print(f'X m train number of features: {len(X_m_train[0])}')
print(f'X y m train number of features: {len(X_y_m_train[0])}')

Create model lists

In [None]:
# Display names of the eight model variants (four feature sets, each without and with SMOTEENN).
name_list = ['complete', 'without years', 'without macro', 'without years and macro', 'complete SMOTEENN', 'without years SMOTEENN', 'without macro SMOTEENN', 'without years and macro SMOTEENN']

# Full-data [train, test] pairs per feature set; the targets are identical for all four.
X_list = [[X_train, X_test], [X_y_train, X_y_test], [X_m_train, X_m_test], [X_y_m_train, X_y_m_test]]
y_list = [(y_train, y_test)] * 4

# The *_m lists feed hyperparameter tuning: the subsample when enabled, otherwise
# separate list objects holding the same full-data matrices.
if allow_subsample_hyperparameter:
    X_list_m = [[X_train_sub, X_test_sub], [X_y_train_sub, X_y_test_sub], [X_m_train_sub, X_m_test_sub], [X_y_m_train_sub, X_y_m_test_sub]]
    y_list_m = [(y_train_sub, y_test_sub)] * 4
else:
    X_list_m = [list(pair) for pair in X_list]
    y_list_m = list(y_list)

In [None]:
# When no precomputed set is used, write the complete dataset to disk.
# The index is moved into a regular column first, so it is stored in the file.
if allow_computed_set == False:
    bankruptcy_comp_df_reset_index = bankruptcy_comp_df.reset_index()
    bankruptcy_comp_df_reset_index.to_feather(f'{path}1ready_for_smoteenn.feather')

## 5. SMOTEENN

In [None]:
# Resampler: SMOTE oversampling of the minority class combined with Edited
# Nearest Neighbours cleaning applied to all classes.
sme = SMOTEENN(smote=SMOTE(sampling_strategy='minority', random_state=1), enn=EditedNearestNeighbours(sampling_strategy='all'), random_state=1, n_jobs=jobs)

# Per-country class balance before resampling.
print(X_train_for_smoteenn.groupby('Country ISO code').Bankrupt.agg(no_bankrupt=('sum'), no_firms=('count'), bankrupt_percentage=(lambda x: str(round(x.sum()/x.count()*100,4))+"%")))
print('--'*60)

if allow_computed_set:
    # Reuse the resampled training set written by an earlier run.
    X_train_smoteenn = pd.read_feather(path+'2after_smoteenn.feather')
    X_train_smoteenn = X_train_smoteenn.set_index('index')
    print(X_train_smoteenn.shape)
else:
    countries = X_train_for_smoteenn['Country ISO code'].unique()
    print(countries)
    print('--'*60)

    # One resampled frame per country (three for Italy), concatenated below.
    all_countries = []

    for country in countries:

        if country == 'IT':
            it_df = X_train_for_smoteenn.loc[X_train_for_smoteenn['Country ISO code']=='IT']

            print(it_df.shape)

            X_it = it_df.drop(['Bankrupt', 'Country ISO code'], axis=1)
            y_it = it_df['Bankrupt']

            # Italy is resampled in three stratified parts: one third is split
            # off first, then the remainder is halved.
            # NOTE(review): presumably done to limit memory/runtime - confirm.
            # Branch-local names are used on purpose: the notebook-level
            # X_train / X_test / y_train / y_test hold the model matrices and
            # must not be rebound here.
            X_it_rest, X_it_part3, y_it_rest, y_it_part3 = train_test_split(X_it, y_it, test_size=0.3333, random_state=1, stratify=y_it)
            X_it_part1, X_it_part2, y_it_part1, y_it_part2 = train_test_split(X_it_rest, y_it_rest, test_size=0.5, random_state=1, stratify=y_it_rest)

            # Keep features and target as separate pairs instead of writing the
            # target into the split frames and dropping it again.
            it_parts = [(X_it_part1, y_it_part1), (X_it_part2, y_it_part2), (X_it_part3, y_it_part3)]
            # Shapes of the feature frames only (target column not included).
            print(X_it_part1.shape, X_it_part2.shape, X_it_part3.shape)

            for i, (X_temp, y_temp) in enumerate(it_parts, start=1):
                print(f'{i} started')
                print(X_temp.shape, y_temp.shape)
                X_temp_resampled, y_temp_resampled = sme.fit_resample(X_temp, y_temp)
                temp_comp_df = pd.concat([y_temp_resampled, X_temp_resampled], axis=1)
                # The country column was dropped before resampling; restore it.
                temp_comp_df['Country ISO code']='IT'
                print('IT '+str(i) + ' of 3 done!')
                all_countries.append(temp_comp_df)
        else:
            print(f'{country} started')
            temp_df = X_train_for_smoteenn.loc[X_train_for_smoteenn['Country ISO code']==country]
            X_temp = temp_df.drop(['Bankrupt', 'Country ISO code'], axis=1)
            y_temp = temp_df['Bankrupt']
            X_temp_resampled, y_temp_resampled = sme.fit_resample(X_temp, y_temp)
            temp_comp_df = pd.concat([y_temp_resampled, X_temp_resampled], axis=1)
            # The country column was dropped before resampling; restore it.
            temp_comp_df['Country ISO code']=country
            all_countries.append(temp_comp_df)
            print(f'{country} done!')

    print('--'*60)
    X_train_smoteenn = pd.concat(all_countries)

# Per-country class balance after resampling (or of the loaded set).
print(X_train_smoteenn.groupby('Country ISO code').Bankrupt.agg(no_bankrupt=('sum'), no_firms=('count'), bankrupt_percentage=(lambda x: str(round(x.sum()/x.count()*100,4))+"%")))

In [None]:
# Stack the resampled training rows and the rows of X_test_for_smoteenn into
# one frame (used for the class-distribution plots).
to_concat = [X_train_smoteenn, X_test_for_smoteenn]
bankruptcy_smoteenn_df = pd.concat(to_concat)

# When the resampling was computed in this run, store its result on disk.
# The index is moved into a regular column first, so it is stored in the file.
if allow_computed_set == False:
    bankruptcy_comp_df_reset_index = X_train_smoteenn.reset_index()
    bankruptcy_comp_df_reset_index.to_feather(f'{path}2after_smoteenn.feather')

### 5.1. Splitting into training and testing set

In [None]:
# Columns that are not model features: the target plus two identifier columns.
non_feature_columns = ['Bankrupt', 'Country ISO code', 'Last avail. Year']

# Features and target of the resampled training set.
y_train_resampled = X_train_smoteenn['Bankrupt']
X_train_resampled = X_train_smoteenn.drop(columns=non_feature_columns)

# Features and target of the test set (taken from X_test_for_smoteenn).
y_test_resampled = X_test_for_smoteenn['Bankrupt']
X_test_resampled = X_test_for_smoteenn.drop(columns=non_feature_columns)

### 5.2. Creating datasets (Models 5-8)

#### 5.2.1. Complete SMOTEENN (Model 5)

In [None]:
# Report shapes and the share of bankrupt firms before scaling.
print(f'Shape of X train: {X_train_resampled.shape} | Shape of y train: {y_train_resampled.shape} | Percentage bankrupt training: {round(y_train_resampled.sum()/y_train_resampled.count()*100,3)}%')
print(f'Shape of X test: {X_test_resampled.shape} | Shape of y test: {y_test_resampled.shape} | Percentage bankrupt testing: {round(y_test_resampled.sum()/y_test_resampled.count()*100,3)}%')
# Fit the scaler on the resampled training features only, then apply those
# same fitted statistics to the test features. Refitting on the test set
# would scale it with its own statistics, putting train and test on
# different scales and letting test-set information into preprocessing.
# NOTE(review): `scaler` is defined outside this cell - presumably the
# StandardScaler imported at the top of the notebook; confirm.
X_train_resampled = scaler.fit_transform(X_train_resampled)
X_test_resampled = scaler.transform(X_test_resampled)

#### 5.2.2. Without time (Model 6)

In [None]:
# Keep every fifth column of the complete matrices, starting at column 0.
# NOTE(review): presumably each variable occupies five consecutive yearly
# columns, so this keeps one year per variable - confirm.
X_y_train_resampled, X_y_test_resampled = (
    X_train_resampled[:, ::5],
    X_test_resampled[:, ::5],
)
print(f'Shape of X train: {X_y_train_resampled.shape} | Shape of y train: {y_train_resampled.shape} | Percentage bankrupt: {round(y_train_resampled.sum()/y_train_resampled.count()*100,3)}%')
print(f'Shape of X test: {X_y_test_resampled.shape} | Shape of y test: {y_test_resampled.shape} | Percentage bankrupt: {round(y_test_resampled.sum()/y_test_resampled.count()*100,3)}%')

#### 5.2.3. Without macro (Model 7)

In [None]:
# Discard the first 40 columns of the complete matrices.
# NOTE(review): presumably those 40 columns are the macroeconomic variables -
# confirm against the column order of the dataset.
X_m_train_resampled, X_m_test_resampled = (
    X_train_resampled[:, 40:],
    X_test_resampled[:, 40:],
)
print(f'Shape of X train: {X_m_train_resampled.shape} | Shape of y train: {y_train_resampled.shape} | Percentage bankrupt: {round(y_train_resampled.sum()/y_train_resampled.count()*100,3)}%')
print(f'Shape of X test: {X_m_test_resampled.shape} | Shape of y test: {y_test_resampled.shape} | Percentage bankrupt: {round(y_test_resampled.sum()/y_test_resampled.count()*100,3)}%')

#### 5.2.4. Without time and macro (Model 8)

In [None]:
# Starting from the matrices without the first 40 columns, keep every fifth
# column (beginning at column 0).
X_y_m_train_resampled, X_y_m_test_resampled = (
    X_m_train_resampled[:, ::5],
    X_m_test_resampled[:, ::5],
)
print(f'Shape of X train: {X_y_m_train_resampled.shape} | Shape of y train: {y_train_resampled.shape} | Percentage bankrupt: {round(y_train_resampled.sum()/y_train_resampled.count()*100,3)}%')
print(f'Shape of X test: {X_y_m_test_resampled.shape} | Shape of y test: {y_test_resampled.shape} | Percentage bankrupt: {round(y_test_resampled.sum()/y_test_resampled.count()*100,3)}%')

### 5.3. Remove features with high correlation

In [None]:
# Drop the flagged columns from the resampled training matrices, using the
# same index lists that are applied to the non-resampled matrices.
X_train_resampled, X_y_train_resampled, X_m_train_resampled, X_y_m_train_resampled = (
    np.delete(matrix, columns, axis=1)
    for matrix, columns in (
        (X_train_resampled, remove_complete),
        (X_y_train_resampled, remove_complete_y),
        (X_m_train_resampled, remove_complete_m),
        (X_y_m_train_resampled, remove_complete_y_m),
    )
)

# Report how many columns remain in each matrix (length of row 0).
print(
    f'X train resampled number of features: {len(X_train_resampled[0])}',
    f'X y train resampled number of features: {len(X_y_train_resampled[0])}',
    f'X m train resampled number of features: {len(X_m_train_resampled[0])}',
    f'X y m train resampled number of features: {len(X_y_m_train_resampled[0])}',
    sep='\n',
)

In [None]:
# Drop the same flagged columns from the matching test matrices so train and
# test keep an identical column layout.
X_test_resampled, X_y_test_resampled, X_m_test_resampled, X_y_m_test_resampled = (
    np.delete(matrix, columns, axis=1)
    for matrix, columns in (
        (X_test_resampled, remove_complete),
        (X_y_test_resampled, remove_complete_y),
        (X_m_test_resampled, remove_complete_m),
        (X_y_m_test_resampled, remove_complete_y_m),
    )
)

### 5.4. Append to X_list & y_list

In [None]:
# Add the four SMOTEENN variants as [train, test] pairs after the existing
# entries of X_list.
X_list.extend([
    [X_train_resampled, X_test_resampled],
    [X_y_train_resampled, X_y_test_resampled],
    [X_m_train_resampled, X_m_test_resampled],
    [X_y_m_train_resampled, X_y_m_test_resampled],
])
# All four variants share the same targets; add one (train, test) tuple each.
y_list.extend((y_train_resampled, y_test_resampled) for _ in range(4))

### 5.5. Display class distribution after SMOTEENN

In [None]:
# Bar chart of the number of firms per class in the post-SMOTEENN frame.
bar_colors = ['#000000', '#808080']
plt.figure(figsize=(8, 5))
sns.countplot(x=bankruptcy_smoteenn_df['Bankrupt'], palette=bar_colors)
plt.title('Class distribution: non-bankrupt and bankrupt firms', fontsize=14, pad=20)
plt.ticklabel_format(style='plain', axis='y')
plt.yticks(np.arange(0, 2500000, 500000))
plt.ylabel('Number of firms')

# Annotate each bar with its count, placed slightly above the bar top.
class_counts = bankruptcy_smoteenn_df["Bankrupt"].value_counts()
n_non_bankrupt = class_counts[0]
n_bankrupt = class_counts[1]
plt.text(x=-0.19, y=n_non_bankrupt+10000, s=f'n={n_non_bankrupt:,}')
plt.text(x=0.83, y=n_bankrupt+20000, s=f'n={n_bankrupt:,}')
plt.show()

### 5.6. Display combined class distribution

In [None]:
# Side-by-side class distributions: the complete dataset on the left, the
# post-SMOTEENN dataset on the right, drawn on a shared y-axis.
bar_colors = ['#000000', '#808080']
y_ticks = np.arange(0, 2500000, 500000)

fig, ax = plt.subplots(1, 2, figsize=(16,8), sharey=True)
fig.tight_layout(pad=6.0)
left_ax, right_ax = ax

sns.countplot(x=bankruptcy_comp_df["Bankrupt"], palette=bar_colors, ax=left_ax)
sns.countplot(x=bankruptcy_smoteenn_df['Bankrupt'], palette=bar_colors, ax=right_ax)

left_ax.set_title('Class distribution: non-bankrupt and bankrupt firms', fontsize=18, pad=20)
left_ax.set_ylabel('Number of firms', fontsize=12)
left_ax.set_yticks(y_ticks)
left_ax.set_yticklabels(y_ticks, fontsize=12)

right_ax.set_title('Class distribution: non-bankrupt and bankrupt firms\nwith SMOTEENN', fontsize=18, pad=20)
right_ax.set_ylabel('Number of firms', fontsize=12)
right_ax.set_yticks(y_ticks)
right_ax.set_yticklabels(y_ticks, fontsize=12)
# NOTE(review): `labelbottom` is passed to the y-axis here; presumably meant
# to re-enable the tick labels that sharey=True hides on the right panel -
# confirm it has that effect with the installed matplotlib version.
right_ax.yaxis.set_tick_params(labelbottom=True)
fig.show()

Save lists

In [None]:
def _pack_pairs(pairs):
    """Return an (n, 2) object array whose cells hold the items of each pair.

    The dataset lists are ragged (train and test parts differ in length, and
    the variants differ in width), so handing the nested list directly to
    np.save relies on implicit ragged-to-object conversion, which newer NumPy
    releases reject with a ValueError. Filling a pre-allocated object array
    avoids that conversion while keeping the pair layout, so both
    `loaded[i][0]` and `loaded[i, 0]` address the training part.
    """
    packed = np.empty((len(pairs), 2), dtype=object)
    for row, (train_part, test_part) in enumerate(pairs):
        packed[row, 0] = train_part
        packed[row, 1] = test_part
    return packed


# Object arrays are stored via pickle; reading them back requires
# np.load(..., allow_pickle=True).
np.save('X_list.npy', _pack_pairs(X_list))
np.save('X_list_m.npy', _pack_pairs(X_list_m))
np.save('y_list.npy', _pack_pairs(y_list))
np.save('y_list_m.npy', _pack_pairs(y_list_m))