In [1]:
import pytorch_tabnet
import pandas as pd
import numpy as np
import sklearn.metrics
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
pd.options.display.max_rows = 96
df = pd.read_csv("data.csv")
X = df.drop('Bankrupt?', axis=1).to_numpy()
y = df.loc[:,'Bankrupt?'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 1)

In [3]:
# define the model
clf1_nopreproc = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":10, # how to use learning rate scheduler
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='entmax' # "sparsemax"
                      )

# fit the model
clf1_nopreproc.fit(
    x_train,y_train,
    eval_set=[(x_train, y_train), (x_test, y_test)],
    eval_name=['train', 'test'],
    eval_metric=['accuracy', 'balanced_accuracy'],
    max_epochs=1000 , patience=50,
    batch_size=256, virtual_batch_size=128,
    num_workers=0,
    weights = {0: 1, 1: 7},
    drop_last=False
)



epoch 0  | loss: 0.48154 | train_accuracy: 0.9681  | train_balanced_accuracy: 0.49962 | test_accuracy: 0.96188 | test_balanced_accuracy: 0.49924 |  0:00:22s
epoch 1  | loss: 0.32994 | train_accuracy: 0.9681  | train_balanced_accuracy: 0.49962 | test_accuracy: 0.96261 | test_balanced_accuracy: 0.49962 |  0:00:37s
epoch 2  | loss: 0.29035 | train_accuracy: 0.96884 | train_balanced_accuracy: 0.5     | test_accuracy: 0.96334 | test_balanced_accuracy: 0.5     |  0:00:51s
epoch 3  | loss: 0.24728 | train_accuracy: 0.96884 | train_balanced_accuracy: 0.5     | test_accuracy: 0.96334 | test_balanced_accuracy: 0.5     |  0:01:01s
epoch 4  | loss: 0.23532 | train_accuracy: 0.96884 | train_balanced_accuracy: 0.5     | test_accuracy: 0.96334 | test_balanced_accuracy: 0.5     |  0:01:15s
epoch 5  | loss: 0.22896 | train_accuracy: 0.96884 | train_balanced_accuracy: 0.5     | test_accuracy: 0.96334 | test_balanced_accuracy: 0.5     |  0:01:31s
epoch 6  | loss: 0.24284 | train_accuracy: 0.96884 | train



In [4]:
ypred_test = clf1_nopreproc.predict(x_test)
print(sklearn.metrics.accuracy_score(y_test, ypred_test))
print(sklearn.metrics.f1_score(y_test, ypred_test))
print(sklearn.metrics.precision_score(y_test, ypred_test))
print(sklearn.metrics.recall_score(y_test, ypred_test))

0.9523460410557185
0.45378151260504207
0.391304347826087
0.54


In [None]:
cv = RepeatedKFold(n_splits = 5, n_repeats = 1, random_state = 1)
n_scores = cross_validate(clf1_nopreproc, X, y, scoring = ['accuracy', 'precision', 'recall', 'f1'], fit_params = {
    'max_epochs': 1000 ,
    'patience': 50,
    'batch_size': 256,
    'virtual_batch_size': 128,
    'num_workers': 0,
    'weights': {0: 1, 1: 6.75},
    'drop_last': False
},
    cv = cv, n_jobs = -1)

for k in n_scores:
    print('{}: {:.3f} ({:.3f})'.format(k, n_scores[k].mean(), n_scores[k].std()))

In [None]:
'weights': {0: 1, 1: 6}
fit_time: 848.301 (1.275)
score_time: 0.129 (0.015)
test_accuracy: 0.959 (0.004)
test_precision: 0.353 (0.031)
test_recall: 0.324 (0.052)
test_f1: 0.336 (0.038)

'weights': {0: 1, 1: 6.5}
fit_time: 791.901 (0.861)
score_time: 0.095 (0.016)
test_accuracy: 0.962 (0.004)
test_precision: 0.390 (0.039)
test_recall: 0.336 (0.081)
test_f1: 0.358 (0.058)

'weights': {0: 1, 1: 6.25}
fit_time: 942.896 (2.039)
score_time: 0.140 (0.036)
test_accuracy: 0.959 (0.005)
test_precision: 0.359 (0.053)
test_recall: 0.327 (0.089)
test_f1: 0.331 (0.054)

'weights': {0: 1, 1: 6.75}
fit_time: 979.978 (4.718)
score_time: 0.131 (0.039)
test_accuracy: 0.959 (0.001)
test_precision: 0.349 (0.018)
test_recall: 0.302 (0.087)
test_f1: 0.315 (0.055)

In [7]:
#Создали класс, реализующий ансамль XGB-моделей, обученных на случайных сбалансированных подвыборках
class ResampledEnsemble(sklearn.base.BaseEstimator):

    def __init__(self, n_estimators = 5, zeros_ratio = 1, scale_pos_weight = 1, power = 1, ensemble_quantile = None):
        self.n_estimators = n_estimators
        self.classes_ = np.array([0,1])
        self.zeros_ratio = zeros_ratio
        self.scale_pos_weight = scale_pos_weight
        self.power = power
        #добавляем вместо среднего арифметического квантиль
        self.ensemble_quantile = ensemble_quantile

    def fit(self, X, y):
        self.estimators = []
        for i in range(self.n_estimators):
            ones = y == 1
            num_ones = ones.sum()
            zeros_indexes = np.where(np.logical_not(ones))[0]
            zeros_subsample = np.random.choice(zeros_indexes, size=int(num_ones*self.zeros_ratio), replace=False, p=None)
            final_indexes = np.concatenate([np.where(ones)[0], zeros_subsample])
            model = TabNetClassifier(optimizer_fn=torch.optim.Adam,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":10, # how to use learning rate scheduler
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='entmax' # "sparsemax"
                      )
            model.fit(X[final_indexes,:], y[final_indexes],
                      max_epochs=1000 , patience=50,
    batch_size=256, virtual_batch_size=128,
    num_workers=0,
    weights = {0: 1, 1: self.scale_pos_weight},
    drop_last=False)
            self.estimators.append(model)
            #print("Fitting of model number {} is finished".format(i))
            if model.classes_[0] > model.classes_[1]:
                raise Exception("Wrong order of classes")

    def predict_proba(self, X):
        results = np.zeros([self.n_estimators, X.shape[0], 2]) #количество строк в иксе, количество столбцов по числу классов [0] и [1]
        for i in range(self.n_estimators):
            results[i,:,:] = self.estimators[i].predict_proba(X)
        if self.ensemble_quantile is None:
            ensemble_results = np.mean(results**self.power, axis = 0)
        else:
            ensemble_results = np.quantile(results**self.power, self.ensemble_quantile, axis = 0)
        row_sums =  np.sum(ensemble_results, axis = 1)
        return ensemble_results/row_sums[:, None]

    def predict(self, X):
        res_classes = np.zeros(X.shape[0])
        results = self.predict_proba(X)
        for i in range(X.shape[0]):
            res_classes[i] = self.classes_[np.argmax(results[i,:])]
        return res_classes

In [8]:
model = ResampledEnsemble(zeros_ratio = 20, scale_pos_weight = 55, power = 0.001, ensemble_quantile = 0.9)
cv = RepeatedKFold(n_splits = 5, n_repeats = 1, random_state = 1)
n_scores = cross_validate(model, X, y, scoring = ['accuracy', 'precision', 'recall', 'f1'],
    cv = cv, n_jobs = 4)

for k in n_scores:
    print('{}: {:.3f} ({:.3f})'.format(k, n_scores[k].mean(), n_scores[k].std()))

fit_time: 2948.245 (394.815)
score_time: 0.820 (0.265)
test_accuracy: 0.959 (0.002)
test_precision: 0.378 (0.026)
test_recall: 0.420 (0.052)
test_f1: 0.397 (0.033)


In [None]:
# model = ResampledEnsemble(zeros_ratio = 20, scale_pos_weight = 55, power = 0.001, ensemble_quantile = 0.01)
fit_time: 2814.031 (284.226)
score_time: 0.507 (0.063)
test_accuracy: 0.956 (0.002)
test_precision: 0.352 (0.021)
test_recall: 0.439 (0.035)
test_f1: 0.390 (0.013)

# model = ResampledEnsemble(zeros_ratio = 20, scale_pos_weight = 55, power = 0.001, ensemble_quantile = 0.9)
fit_time: 2948.245 (394.815)
score_time: 0.820 (0.265)
test_accuracy: 0.959 (0.002)
test_precision: 0.378 (0.026)
test_recall: 0.420 (0.052)
test_f1: 0.397 (0.033)