In [1]:
import pandas as pd
import numpy as np
import sklearn.base
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
import imblearn
from collections import Counter
import hyperopt
import matplotlib.pyplot as plt
from sklearn.ensemble import StackingClassifier

In [2]:
# Raise the number of rows pandas prints before truncating the output.
pd.set_option("display.max_rows", 96)
# Load the dataset into a DataFrame.
df = pd.read_csv("data.csv")
# Show the column dtypes: X is the input data (95 columns), y is the output (1 column).
print(df.dtypes)

Bankrupt?                                                     int64
 ROA(C) before interest and depreciation before interest    float64
 ROA(A) before interest and % after tax                     float64
 ROA(B) before interest and depreciation after tax          float64
 Operating Gross Margin                                     float64
 Realized Sales Gross Margin                                float64
 Operating Profit Rate                                      float64
 Pre-tax net Interest Rate                                  float64
 After-tax net Interest Rate                                float64
 Non-industry income and expenditure/revenue                float64
 Continuous interest rate (after tax)                       float64
 Operating Expense Rate                                     float64
 Research and development expense rate                      float64
 Cash flow rate                                             float64
 Interest-bearing debt interest rate            

In [3]:
# Ensemble of XGBoost models, each trained on a random class-balanced subsample.
class XGBoostEnsemble(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """Ensemble of ``XGBClassifier`` models for binary labels encoded as 0/1.

    Every member is trained on all rows with ``y == 1`` plus a random subsample
    (drawn without replacement) of the remaining rows. Member probabilities are
    raised to ``power`` and combined by mean or quantile, then re-normalised so
    each row sums to one.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of XGBoost models to train.
    zeros_ratio : float, default=1
        Number of non-positive rows sampled per model, expressed as a multiple
        of the number of positive rows. Capped at the number of available
        non-positive rows.
    scale_pos_weight : float, default=1
        Forwarded to ``XGBClassifier(scale_pos_weight=...)``.
    power : float, default=1
        Exponent applied to each member's probabilities before aggregation.
    ensemble_quantile : float or None, default=None
        If ``None`` the members are averaged; otherwise this quantile of the
        members' (powered) probabilities is used instead of the mean.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed / generator for the row subsampling. ``None`` keeps using numpy's
        global random state.

    Attributes
    ----------
    classes_ : ndarray of shape (2,)
        Always ``[0, 1]``; set by ``fit``.
    estimators : list
        The fitted ``XGBClassifier`` members; set by ``fit``.
    """

    def __init__(self, n_estimators = 100, zeros_ratio = 1, scale_pos_weight = 1, power = 1, ensemble_quantile = None, random_state = None):
        # Only hyper-parameters are stored here so that sklearn's clone() and
        # check_is_fitted() behave correctly; fitted state is created in fit().
        self.n_estimators = n_estimators
        self.zeros_ratio = zeros_ratio
        self.scale_pos_weight = scale_pos_weight
        self.power = power
        # A quantile may be used instead of the arithmetic mean.
        self.ensemble_quantile = ensemble_quantile
        self.random_state = random_state

    def _rng(self):
        """Return the object whose ``choice`` method is used for subsampling."""
        if self.random_state is None:
            # Module-level functions draw from numpy's global state.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """Train ``n_estimators`` models on balanced random subsamples.

        ``X`` is indexed as ``X[rows, :]``, so it must support that form of
        indexing (e.g. a 2-D numpy array). Returns ``self``.

        Raises
        ------
        Exception
            If a fitted member reports its classes in descending order.
        """
        y = np.asarray(y)
        rng = self._rng()
        self.classes_ = np.array([0, 1])

        # These do not depend on the iteration, so compute them once.
        ones_indexes = np.flatnonzero(y == 1)
        zeros_indexes = np.flatnonzero(y != 1)
        # Sampling without replacement cannot draw more rows than exist.
        subsample_size = min(int(len(ones_indexes) * self.zeros_ratio), len(zeros_indexes))

        self.estimators = []
        for _ in range(self.n_estimators):
            zeros_subsample = rng.choice(zeros_indexes, size=subsample_size, replace=False)
            final_indexes = np.concatenate([ones_indexes, zeros_subsample])
            model = XGBClassifier(scale_pos_weight = self.scale_pos_weight)
            model.fit(X[final_indexes, :], y[final_indexes])
            self.estimators.append(model)
            if model.classes_[0] > model.classes_[1]:
                raise Exception("Wrong order of classes")
        return self

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of aggregated class probabilities."""
        # Size by the models actually fitted, not by the current n_estimators
        # parameter, which may have been changed after fit().
        results = np.zeros([len(self.estimators), X.shape[0], 2])
        for i, model in enumerate(self.estimators):
            results[i, :, :] = model.predict_proba(X)
        powered = results ** self.power
        if self.ensemble_quantile is None:
            ensemble_results = np.mean(powered, axis = 0)
        else:
            ensemble_results = np.quantile(powered, self.ensemble_quantile, axis = 0)
        # Re-normalise: after the power / quantile step rows need not sum to 1.
        return ensemble_results / ensemble_results.sum(axis = 1, keepdims = True)

    def predict(self, X):
        """Return the most probable class per row as a float array of 0.0 / 1.0."""
        results = self.predict_proba(X)
        # Float dtype is kept for compatibility with the previous return value.
        return self.classes_[np.argmax(results, axis = 1)].astype(float)

In [4]:
# Ensemble of LogisticRegressionCV models, each trained on a random
# class-balanced subsample of rows and a random subset of columns.
class LassoEnsemble(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """Ensemble of ``LogisticRegressionCV`` models for binary labels encoded as 0/1.

    Every member is trained on all rows with ``y == 1`` plus a random subsample
    (without replacement) of the remaining rows, restricted to a random subset
    of feature columns. Member probabilities are raised to ``power`` and
    combined by mean or quantile, then re-normalised so each row sums to one.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of models to train.
    zeros_ratio : float, default=1
        Number of non-positive rows sampled per model, as a multiple of the
        number of positive rows. Capped at the number of available rows.
    scale_pos_weight : float, default=1
        Weight of class 1 in ``class_weight={0: 1, 1: scale_pos_weight}``.
    power : float, default=1
        Exponent applied to each member's probabilities before aggregation.
    ensemble_quantile : float or None, default=None
        If ``None`` the members are averaged; otherwise this quantile of the
        members' (powered) probabilities is used instead of the mean.
    column_probability : float, default=1
        Probability with which each column is independently kept for a member.
        Must satisfy ``0 < column_probability <= 1``.
    penalty : str, default='l2'
        Forwarded to ``LogisticRegressionCV(penalty=...)``. NOTE: despite the
        class name the default is the L2 penalty; pass ``'l1'`` for a lasso
        penalty.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed / generator for row and column sampling and for the members'
        solver. ``None`` keeps using numpy's global random state.

    Attributes
    ----------
    classes_ : ndarray of shape (2,)
        Always ``[0, 1]``; set by ``fit``.
    estimators : list
        The fitted members; set by ``fit``.
    estimators_columns : list of ndarray
        Column indexes used by the corresponding member; set by ``fit``.
    """

    def __init__(self, n_estimators = 100, zeros_ratio = 1, scale_pos_weight = 1, power = 1, ensemble_quantile = None, column_probability = 1, penalty = 'l2', random_state = None):
        # Only hyper-parameters are stored here so that sklearn's clone() and
        # check_is_fitted() behave correctly; fitted state is created in fit().
        self.n_estimators = n_estimators
        self.zeros_ratio = zeros_ratio
        self.scale_pos_weight = scale_pos_weight
        self.power = power
        # A quantile may be used instead of the arithmetic mean.
        self.ensemble_quantile = ensemble_quantile
        # Randomisation over feature columns.
        self.column_probability = column_probability
        self.penalty = penalty
        self.random_state = random_state

    def _rng(self):
        """Return the object whose sampling methods are used during ``fit``."""
        if self.random_state is None:
            # Module-level functions draw from numpy's global state.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def _sample_columns(self, rng, n_features):
        """Draw a non-empty random subset of column indexes."""
        outcome = rng.binomial(n = 1, p = self.column_probability, size = n_features)
        while outcome.sum() == 0:  # no column selected: draw again
            outcome = rng.binomial(n = 1, p = self.column_probability, size = n_features)
        return np.where(outcome == 1)[0]

    def fit(self, X, y):
        """Train ``n_estimators`` models on balanced random subsamples.

        ``X`` is indexed with ``np.ix_(rows, columns)``, so it must support
        that form of indexing (e.g. a 2-D numpy array). Returns ``self``.

        Raises
        ------
        ValueError
            If ``column_probability`` is outside ``(0, 1]`` or ``X`` has no
            columns (the column re-draw loop could never terminate).
        Exception
            If a fitted member reports its classes in descending order.
        """
        if not 0 < self.column_probability <= 1:
            raise ValueError("column_probability must be in the interval (0, 1]")
        if X.shape[1] == 0:
            raise ValueError("X must have at least one column")

        y = np.asarray(y)
        rng = self._rng()
        self.classes_ = np.array([0, 1])

        # These do not depend on the iteration, so compute them once.
        ones_indexes = np.flatnonzero(y == 1)
        zeros_indexes = np.flatnonzero(y != 1)
        # Sampling without replacement cannot draw more rows than exist.
        subsample_size = min(int(len(ones_indexes) * self.zeros_ratio), len(zeros_indexes))

        self.estimators = []
        self.estimators_columns = []
        for _ in range(self.n_estimators):
            columns = self._sample_columns(rng, X.shape[1])
            self.estimators_columns.append(columns)
            zeros_subsample = rng.choice(zeros_indexes, size = subsample_size, replace = False)
            final_indexes = np.concatenate([ones_indexes, zeros_subsample])
            # Seed the solver only when the ensemble itself is seeded, so the
            # unseeded behaviour (and random stream) stays as before.
            model_seed = None if self.random_state is None else rng.randint(np.iinfo(np.int32).max)
            model = LogisticRegressionCV(penalty = self.penalty, solver = "saga", class_weight = {0: 1, 1: self.scale_pos_weight}, max_iter = 100000, random_state = model_seed)
            model.fit(X[np.ix_(final_indexes, columns)], y[final_indexes])
            self.estimators.append(model)
            if model.classes_[0] > model.classes_[1]:
                raise Exception("Wrong order of classes")
        return self

    def predict_proba(self, X):
        """Return an ``(n_samples, 2)`` array of aggregated class probabilities."""
        # Size by the models actually fitted, not by the current n_estimators
        # parameter, which may have been changed after fit().
        results = np.zeros([len(self.estimators), X.shape[0], 2])
        for i, (model, columns) in enumerate(zip(self.estimators, self.estimators_columns)):
            results[i, :, :] = model.predict_proba(X[:, columns])
        powered = results ** self.power
        if self.ensemble_quantile is None:
            ensemble_results = np.mean(powered, axis = 0)
        else:
            ensemble_results = np.quantile(powered, self.ensemble_quantile, axis = 0)
        # Re-normalise: after the power / quantile step rows need not sum to 1.
        return ensemble_results / ensemble_results.sum(axis = 1, keepdims = True)

    def predict(self, X):
        """Return the most probable class per row as a float array of 0.0 / 1.0."""
        results = self.predict_proba(X)
        # Float dtype is kept for compatibility with the previous return value.
        return self.classes_[np.argmax(results, axis = 1)].astype(float)

In [5]:
# Ensemble of SVM models, each trained on a random class-balanced subsample
# of rows and a random subset of columns.
class SVMEnsemble(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """Voting ensemble of ``svm.SVC`` models for binary labels encoded as 0/1.

    Every member is trained on all rows with ``y == 1`` plus a random subsample
    (without replacement) of the remaining rows, restricted to a random subset
    of feature columns. A row is predicted as class 1 when the number of
    members voting 1 is at least ``ensemble_quantile`` times the number of
    members.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of models to train.
    zeros_ratio : float, default=1
        Number of non-positive rows sampled per model, as a multiple of the
        number of positive rows. Capped at the number of available rows.
    scale_pos_weight : float, default=1
        Weight of class 1 in ``class_weight={0: 1, 1: scale_pos_weight}``.
    ensemble_quantile : float, default=0.5
        Fraction of member votes required to predict class 1.
    column_probability : float, default=1
        Probability with which each column is independently kept for a member.
        Must satisfy ``0 < column_probability <= 1``.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed / generator for row and column sampling. ``None`` keeps using
        numpy's global random state.

    Attributes
    ----------
    classes_ : ndarray of shape (2,)
        Always ``[0, 1]``; set by ``fit``.
    estimators : list
        The fitted members; set by ``fit``.
    estimators_columns : list of ndarray
        Column indexes used by the corresponding member; set by ``fit``.
    """

    def __init__(self, n_estimators = 100, zeros_ratio = 1, scale_pos_weight = 1, ensemble_quantile = 0.5, column_probability = 1, random_state = None):
        # Only hyper-parameters are stored here so that sklearn's clone() and
        # check_is_fitted() behave correctly; fitted state is created in fit().
        self.n_estimators = n_estimators
        self.zeros_ratio = zeros_ratio
        self.scale_pos_weight = scale_pos_weight
        # Vote fraction used instead of a plain majority.
        self.ensemble_quantile = ensemble_quantile
        # Randomisation over feature columns.
        self.column_probability = column_probability
        self.random_state = random_state

    def _rng(self):
        """Return the object whose sampling methods are used during ``fit``."""
        if self.random_state is None:
            # Module-level functions draw from numpy's global state.
            return np.random
        if isinstance(self.random_state, np.random.RandomState):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def _sample_columns(self, rng, n_features):
        """Draw a non-empty random subset of column indexes."""
        outcome = rng.binomial(n = 1, p = self.column_probability, size = n_features)
        while outcome.sum() == 0:  # no column selected: draw again
            outcome = rng.binomial(n = 1, p = self.column_probability, size = n_features)
        return np.where(outcome == 1)[0]

    def fit(self, X, y):
        """Train ``n_estimators`` models on balanced random subsamples.

        ``X`` is indexed with ``np.ix_(rows, columns)``, so it must support
        that form of indexing (e.g. a 2-D numpy array). Returns ``self``.

        Raises
        ------
        ValueError
            If ``column_probability`` is outside ``(0, 1]`` or ``X`` has no
            columns (the column re-draw loop could never terminate).
        Exception
            If a fitted member reports its classes in descending order.
        """
        if not 0 < self.column_probability <= 1:
            raise ValueError("column_probability must be in the interval (0, 1]")
        if X.shape[1] == 0:
            raise ValueError("X must have at least one column")

        y = np.asarray(y)
        rng = self._rng()
        self.classes_ = np.array([0, 1])

        # These do not depend on the iteration, so compute them once.
        ones_indexes = np.flatnonzero(y == 1)
        zeros_indexes = np.flatnonzero(y != 1)
        # Sampling without replacement cannot draw more rows than exist.
        subsample_size = min(int(len(ones_indexes) * self.zeros_ratio), len(zeros_indexes))

        self.estimators = []
        self.estimators_columns = []
        for _ in range(self.n_estimators):
            columns = self._sample_columns(rng, X.shape[1])
            self.estimators_columns.append(columns)
            zeros_subsample = rng.choice(zeros_indexes, size = subsample_size, replace = False)
            final_indexes = np.concatenate([ones_indexes, zeros_subsample])
            model = svm.SVC(class_weight = {0: 1, 1: self.scale_pos_weight})
            model.fit(X[np.ix_(final_indexes, columns)], y[final_indexes])
            self.estimators.append(model)
            if model.classes_[0] > model.classes_[1]:
                raise Exception("Wrong order of classes")
        return self

    def predict(self, X):
        """Return an int array of 0/1 labels decided by thresholded voting."""
        # Size by the models actually fitted, not by the current n_estimators
        # parameter, which may have been changed after fit().
        results = np.zeros([len(self.estimators), X.shape[0]])
        for i, (model, columns) in enumerate(zip(self.estimators, self.estimators_columns)):
            results[i, :] = model.predict(X[:, columns])
        # Labels are 0/1, so the column sum is the number of votes for class 1.
        res_classes = (results.sum(axis = 0) >= results.shape[0] * self.ensemble_quantile).astype(int)
        return res_classes

In [6]:
# (name, estimator) pairs used as the first-level models of the stacking classifier.
base_learners = [
    (
        'XGBoost',
        XGBoostEnsemble(
            n_estimators=20,
            zeros_ratio=20,
            scale_pos_weight=55,
            ensemble_quantile=0.5,
        ),
    ),
    (
        'Lasso',
        LassoEnsemble(
            n_estimators=100,
            zeros_ratio=2,
            column_probability=0.05,
            ensemble_quantile=0.87,
        ),
    ),
    (
        'SVM',
        SVMEnsemble(
            ensemble_quantile=0.6,
            column_probability=0.01,
        ),
    ),
]

In [8]:
# Target vector and feature matrix as plain numpy arrays.
y = df['Bankrupt?'].to_numpy()
X = df.drop(columns='Bankrupt?').to_numpy()

# Stack the base learners under a logistic-regression meta-model.
model = StackingClassifier(
    estimators=base_learners,
    final_estimator=sklearn.linear_model.LogisticRegression(),
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)
# Outer 5-fold evaluation, run on 4 parallel jobs.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
n_scores = cross_validate(
    model,
    X,
    y,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=cv,
    n_jobs=4,
)

# Report every collected value as "mean (std)" across the folds.
for k in n_scores:
    print(f'{k}: {n_scores[k].mean():.3f} ({n_scores[k].std():.3f})')

fit_time: 806.983 (123.234)
score_time: 5.201 (0.979)
test_accuracy: 0.968 (0.004)
test_precision: 0.516 (0.067)
test_recall: 0.281 (0.111)
test_f1: 0.352 (0.092)


In [9]:
# Target vector and feature matrix as plain numpy arrays.
y = df['Bankrupt?'].to_numpy()
X = df.drop(columns='Bankrupt?').to_numpy()

# Stack the base learners under a decision-tree meta-model (depth limited to 20).
model = StackingClassifier(
    estimators=base_learners,
    final_estimator=sklearn.tree.DecisionTreeClassifier(max_depth=20),
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)
# Outer 5-fold evaluation, run on 4 parallel jobs.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
n_scores = cross_validate(
    model,
    X,
    y,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=cv,
    n_jobs=4,
)

# Report every collected value as "mean (std)" across the folds.
for k in n_scores:
    print(f'{k}: {n_scores[k].mean():.3f} ({n_scores[k].std():.3f})')

fit_time: 841.684 (146.338)
score_time: 6.118 (1.220)
test_accuracy: 0.954 (0.004)
test_precision: 0.302 (0.037)
test_recall: 0.332 (0.042)
test_f1: 0.316 (0.038)


In [None]:
# Target vector and feature matrix as plain numpy arrays.
y = df['Bankrupt?'].to_numpy()
X = df.drop(columns='Bankrupt?').to_numpy()

# Stack the base learners under an XGBoost meta-model with default settings.
model = StackingClassifier(
    estimators=base_learners,
    final_estimator=XGBClassifier(),
    cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=1),
)
# Outer 5-fold evaluation, run on 4 parallel jobs.
cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=1)
n_scores = cross_validate(
    model,
    X,
    y,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=cv,
    n_jobs=4,
)

# Report every collected value as "mean (std)" across the folds.
for k in n_scores:
    print(f'{k}: {n_scores[k].mean():.3f} ({n_scores[k].std():.3f})')

KeyboardInterrupt: 