In [55]:
import numpy as np
import pandas as pd

In [56]:
# load dataset
df = pd.read_csv('data.csv')

In [57]:
# see how many observations and features are there
df.shape

(6819, 96)

In [58]:
# check if we should impute any data
df.isna().sum().sum() # no imputation necessary

0

In [59]:
# get a glimple of the dataset
df.sample(3)

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
1499,0,0.504997,0.545519,0.548263,0.610956,0.610876,0.99901,0.797398,0.809315,0.303454,...,0.797125,0.001459,0.623606,0.610955,0.840175,0.284586,0.031054,0.569196,1,0.020006
1464,0,0.55711,0.608373,0.592055,0.611914,0.611914,0.999095,0.797534,0.809416,0.303514,...,0.832448,0.000421,0.623885,0.611912,0.844421,0.285362,0.02687,0.565508,1,0.019295
3454,0,0.644518,0.659507,0.690508,0.627459,0.627459,0.999425,0.797974,0.809833,0.303591,...,0.866871,0.555479,0.623305,0.627457,0.843716,0.275535,0.026796,0.565183,1,0.123572


In [60]:
# find dependent and independent vaiables
target = 'Bankrupt?'
all_feats = df.columns.to_list()
if target in all_feats: all_feats.remove(target)

In [61]:
# check for imbalance
df[target].value_counts(normalize=True) # the dataset is clearly imbalances

0    0.967737
1    0.032263
Name: Bankrupt?, dtype: float64

In [62]:
# divide the data into depdendet and independent variables
Xv = df.drop(target, axis=1)
yv = df[target]

In [63]:
X = Xv.values
y = yv.values

In [None]:
# SMOTE combined with Under Sampling will be used to balance the dataset

In [18]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [29]:
from imblearn.pipeline import Pipeline

In [64]:
# divide the dataset into train and validate in order to build 
# a basic clasyfication model
skf = StratifiedKFold(n_splits=2)
for train_index, valid_index in skf.split(X,y):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

In [65]:
# a CatBoostClassifier was chosen as our basic model
from catboost import CatBoostClassifier

In [66]:
# some basic parameters
params = {'depth': 10, 'n_estimators': 100, 'silent': True}

In [25]:
basic_cat_boost = CatBoostClassifier(**params)

In [67]:
# training the model and predicting values on the validation dataset
basic_cat_boost.fit(X_train, y_train)
y_pred = basic_cat_boost.predict(X_valid)

In [32]:
# checking the cross validation score of ROC curve
cross_val_score(basic_cat_boost, X, y, cv=2, scoring='roc_auc').mean()

0.9236478434486783

In [28]:
# checking the cross validation score on balanced datasets
# constructed for k neighbors
from sklearn.model_selection import RepeatedStratifiedKFold

In [31]:
neighbors = list(range(1,8))
for k in neighbors:
    model = basic_cat_boost
    over = SMOTE(sampling_strategy=0.1, k_neighbors=k)
    under = RandomUnderSampler(sampling_strategy=0.5)
    steps = [('o', over), ('u', under), ('m', model)]

    pipeline = Pipeline(steps=steps)
    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, random_state=1)
    score = cross_val_score(pipeline, X, y, scoring='roc_auc', n_jobs=-1)
    score = np.mean(score)

    print(f'{k}->{score}')

1->0.9329259320755977
2->0.9278264588123986
3->0.9318555671819164
4->0.9289077834656441
5->0.9262623930392213
6->0.9307058288794623
7->0.9319413472724349


In [37]:
# k = 1 was chosen
over = SMOTE(sampling_strategy=0.1, k_neighbors=1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_smote, y_smote = pipeline.fit_resample(X,y)

In [38]:
# comparison of value counts of the balanced and imbalanced dataset
from collections import Counter

In [39]:
Counter(y)

Counter({1: 220, 0: 6599})

In [40]:
Counter(y_smote)

Counter({0: 1318, 1: 659})

In [50]:
X_smote.shape

(1977, 95)

In [51]:
# construct the new dataset
dff  = pd.DataFrame(X_smote, columns=all_feats)
dff[target] = y_smote

In [68]:
dff.to_csv('data_balanced.csv')