In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
import time
import sklearn
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
import optuna
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt

from warnings import filterwarnings
filterwarnings("ignore")

In [20]:
data=pd.read_csv('data/data_final.csv')

## Отбор признаков

In [14]:
#Выясним есть ли корреляция с целевым признаком
corr_matrix = data.corr(method='kendall')
corr_matrix['hasRegionalSupport'].sort_values(ascending=False)

hasRegionalSupport            1.000000
okogu_code_1                  0.269164
share_grants                  0.257334
transformed_meanSum_grants    0.256736
okfs_code_0                   0.236082
                                ...   
mainOkved_code_4             -0.059856
opf_code_5                   -0.079098
mainOkved_version            -0.111389
minjustStatus_0              -0.114969
okogu_code_4                 -0.178215
Name: hasRegionalSupport, Length: 159, dtype: float64

Корреляции с целевым признаком нет, рассмотрим пары сильноскоррелированных признаков

In [46]:
data.corr().abs().unstack().sort_values(ascending=False).drop_duplicates().head(20)

egrulStatus                           egrulStatus                             1.000000
okogu_code                            okogu_code_1                            0.981792
okogu_code_4                          okogu_code_3                            0.978794
share_contracts                       transformed_meanSum_contracts           0.967945
share_grants                          transformed_meanSum_grants              0.966411
okfs_code_3                           okfs_code                               0.899441
transformed_meanSum_fedSubsidies      share_fedSubsidies                      0.880486
okfs_code_0                           okogu_code_1                            0.845291
                                      okogu_code                              0.839811
transformed_meanSum_contracts         transformed_incomeTotal                 0.799178
transformed_incomeTotal               share_contracts                         0.773554
okfs_code_3                           okogu

In [22]:
data = data.drop(['transformed_meanSum_grants','transformed_meanSum_fedSubsidies',\
    'transformed_meanSum_contracts','transformed_incomeTotal','okogu_code','okfs_code','minjustStatus_0',\
        'okogu_code_3','okfs_code_3','statusDetail_shortName_Ликвидируется','okfs_code_0','okpo'],axis=1)

### Построение модели

In [23]:
y = data['hasRegionalSupport']
X = data.drop('hasRegionalSupport',axis=1)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y, test_size=0.3, random_state=0)

In [24]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(239962, 146) (239962,)
(102842, 146) (102842,)


### 1. Логическая регрессия

In [25]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
#print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(y_test,y_pred)))
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     99549
           1       0.00      0.00      0.00      3293

    accuracy                           0.97    102842
   macro avg       0.48      0.50      0.49    102842
weighted avg       0.94      0.97      0.95    102842



Такие метрики, думаю, получились из-за несбалансированности целевого признака.

Проведем обогащение синтетическими данными через SMOTE

In [26]:
sm = SMOTE(random_state=0)
X_train_s, y_train_s = sm.fit_resample(X_train, y_train)

print('Train shape before oversampling:', X_train.shape) 
print('Class balance before oversampling: \n', y_train.value_counts(), sep='')
print('-'*40)
print('Train shape after oversampling:', X_train_s.shape)
print('Class balance after oversampling: \n', y_train_s.value_counts(), sep='')

Train shape before oversampling: (239962, 146)
Class balance before oversampling: 
hasRegionalSupport
0    232279
1      7683
Name: count, dtype: int64
----------------------------------------
Train shape after oversampling: (464558, 146)
Class balance after oversampling: 
hasRegionalSupport
0    232279
1    232279
Name: count, dtype: int64


Значение метрик крайне низкое. Надо отметить, что т.к. распределение целевой переменной в тестовой выборке осталось несбалансированным, поэтому метрика accuracy может показывать себя плохо. Ориентируемся на метрику f1. Так же по условию задачи нам важно охватить как можно больше объектов класса 1, значит обращаем внимание на метрику RECALL при выборе итоговой модели.

Напишем функцию для обучения и оценки классификаторов:

In [27]:
def predictor(model):
    # засекаем время
    tic = time.perf_counter()
    
    # обучаем модель 
    model.fit(X_train_s, y_train_s)
    
    # делаем предсказания с помощью модели
    y_train_pred = model.predict(X_train_s)
    y_pred = model.predict(X_test)

    # метрики
    train_recall = metrics.recall_score(y_train_s,y_train_pred)
    f1_train = metrics.f1_score(y_train_s,y_train_pred)
    test_precision = metrics.precision_score(y_test,y_pred)
    test_recall = metrics.recall_score(y_test,y_pred)
    f1_test = metrics.f1_score(y_test,y_pred)
    accuracy_test = metrics.accuracy_score(y_test,y_pred)
    
    #список с метриками по моделям
    list_model.append([str(model),test_precision,test_recall,f1_test,accuracy_test])
    
    # выводим отчет по метрикам и потраченному времени
    print(f"train: recall= {train_recall: 0.2f}, f1 = {f1_train: 0.2f}")
    #print(f"test: recall= {test_recall: 0.2f}, f1 = {f1_test: 0.2f}, accuracy = {accuracy_test: 0.2f}")
    print(metrics.classification_report(y_test,y_pred))
    #print(confusion_matrix(y_test, y_pred))
    toc = time.perf_counter()
    print(f"Processed in {toc - tic: 0.4f} seconds")

In [28]:
# Создадим список для сбора информации по основным метрикам рассмотренных ниже моделей
list_model = []

In [29]:
LR_model = LogisticRegression(random_state=0)
predictor(LR_model)

train: recall=  0.84, f1 =  0.82
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     99549
           1       0.08      0.54      0.14      3293

    accuracy                           0.78    102842
   macro avg       0.53      0.66      0.51    102842
weighted avg       0.95      0.78      0.85    102842

Processed in  7.1119 seconds


Далее будем сразу применять оптимизацию для поиска лучших параметров для модели.

In [30]:
def optuna_lr(trial):
    # задаем пространства поиска гиперпараметров
    penalty = trial.suggest_categorical('penalty', ['l2','none'])
    solver = trial.suggest_categorical('solver', ['lbfgs', 'sag'])
    C = trial.suggest_float('C', 0.01, 1.0)

    # создаем модель
    model = LogisticRegression(penalty=penalty,
                               solver=solver,
                               C=C,
                               random_state=0)
  
    # обучаем модель
    model.fit(X_train_s, y_train_s)
    score = metrics.f1_score(y_train_s, model.predict(X_train_s))

    return score

In [31]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="LogisticRegression", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_lr, n_trials=20)
# выводим результаты на обучающей выборке

print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

[I 2023-11-07 00:15:10,669] A new study created in memory with name: LogisticRegression


[I 2023-11-07 00:16:10,809] Trial 0 finished with value: 0.6338619964523923 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.4547967457107647}. Best is trial 0 with value: 0.6338619964523923.
[I 2023-11-07 00:17:07,055] Trial 1 finished with value: 0.6338619964523923 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.4804302696552668}. Best is trial 0 with value: 0.6338619964523923.
[I 2023-11-07 00:18:03,218] Trial 2 finished with value: 0.6338619964523923 and parameters: {'penalty': 'l2', 'solver': 'sag', 'C': 0.8239562916680173}. Best is trial 0 with value: 0.6338619964523923.
[I 2023-11-07 00:18:09,023] Trial 3 finished with value: 0.8181952238906586 and parameters: {'penalty': 'none', 'solver': 'lbfgs', 'C': 0.7601891893218617}. Best is trial 3 with value: 0.8181952238906586.
[I 2023-11-07 00:19:05,586] Trial 4 finished with value: 0.6338619964523923 and parameters: {'penalty': 'none', 'solver': 'sag', 'C': 0.9607470865557789}. Best is trial 3 with value: 0.818

Наилучшие значения гиперпараметров {'penalty': 'l2', 'solver': 'lbfgs', 'C': 0.8301064436773966}
f1_score на обучающем наборе: 0.82
CPU times: total: 13min 59s
Wall time: 6min 14s


In [32]:
LR_model_best = LogisticRegression(penalty='l2', solver='lbfgs',C=0.20, random_state=0)
predictor(LR_model)

train: recall=  0.84, f1 =  0.82
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     99549
           1       0.08      0.54      0.14      3293

    accuracy                           0.78    102842
   macro avg       0.53      0.66      0.51    102842
weighted avg       0.95      0.78      0.85    102842

Processed in  6.2792 seconds


### 2. Древо решений

In [33]:
DR_model = DecisionTreeClassifier(random_state=0)
predictor(DR_model)

train: recall=  1.00, f1 =  1.00
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     99549
           1       0.28      0.37      0.32      3293

    accuracy                           0.95    102842
   macro avg       0.63      0.67      0.65    102842
weighted avg       0.96      0.95      0.95    102842

Processed in  8.3259 seconds


In [34]:
def optuna_dt(trial):
  # задаем пространства поиска гиперпараметров
  criterion = trial.suggest_categorical('criterion',['gini','entropy'])
  max_depth = trial.suggest_int('max_depth', 5, 10, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 5, 1)

  # создаем модель
  model = DecisionTreeClassifier (criterion=criterion,
                                  max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf,
                                  random_state=0)
  # обучаем модель
  model.fit(X_train_s, y_train_s)
  score = metrics.f1_score(y_train_s, model.predict(X_train_s))

  return score

In [35]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="DecisionTreeClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_dt, n_trials=20)

print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

[I 2023-11-07 00:21:39,669] A new study created in memory with name: DecisionTreeClassifier
[I 2023-11-07 00:21:43,496] Trial 0 finished with value: 0.8977872022269485 and parameters: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8977872022269485.
[I 2023-11-07 00:21:47,265] Trial 1 finished with value: 0.8977671141449898 and parameters: {'criterion': 'entropy', 'max_depth': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8977872022269485.
[I 2023-11-07 00:21:51,723] Trial 2 finished with value: 0.9206945940012335 and parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.9206945940012335.
[I 2023-11-07 00:21:54,626] Trial 3 finished with value: 0.882113408847159 and parameters: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.9206945940012335.
[I 2023-11-07 00:21:59,046] Trial 4 finished with value: 0.9206458060658268 and parameters: {'

Наилучшие значения гиперпараметров {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 3}
f1_score на обучающем наборе: 0.92
CPU times: total: 1min 19s
Wall time: 1min 19s


In [36]:
DR_model_best = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_leaf=2,random_state=0)
predictor(DR_model)

train: recall=  1.00, f1 =  1.00
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     99549
           1       0.28      0.37      0.32      3293

    accuracy                           0.95    102842
   macro avg       0.63      0.67      0.65    102842
weighted avg       0.96      0.95      0.95    102842

Processed in  8.1762 seconds


### 3. Бэггинг

In [37]:
BC_model = BaggingClassifier(random_state=0)
predictor(BC_model)

train: recall=  1.00, f1 =  1.00
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     99549
           1       0.40      0.30      0.34      3293

    accuracy                           0.96    102842
   macro avg       0.69      0.64      0.66    102842
weighted avg       0.96      0.96      0.96    102842

Processed in  57.6805 seconds


### 4. Случайный лес

In [38]:
RFC_model = RandomForestClassifier(random_state=0)
predictor(RFC_model)

train: recall=  1.00, f1 =  1.00
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     99549
           1       0.46      0.28      0.35      3293

    accuracy                           0.97    102842
   macro avg       0.72      0.64      0.67    102842
weighted avg       0.96      0.97      0.96    102842

Processed in  85.0665 seconds


In [39]:
from sklearn import ensemble

def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  n_estimators = trial.suggest_int('n_estimators', 100, 300, 50)
  max_depth = trial.suggest_int('max_depth', 5, 10, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 5, 1)

  # создаем модель
  model = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=0)
  # обучаем модель
  model.fit(X_train_s, y_train_s)
  score = metrics.f1_score(y_train_s, model.predict(X_train_s))

  return score
  

In [40]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="RandomForestClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=10)

# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

[I 2023-11-07 00:25:29,892] A new study created in memory with name: RandomForestClassifier
[I 2023-11-07 00:26:06,133] Trial 0 finished with value: 0.920476037202766 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.920476037202766.
[I 2023-11-07 00:27:04,998] Trial 1 finished with value: 0.9094820626909423 and parameters: {'n_estimators': 200, 'max_depth': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.920476037202766.
[I 2023-11-07 00:28:18,795] Trial 2 finished with value: 0.9216258739787805 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.9216258739787805.
[I 2023-11-07 00:28:41,962] Trial 3 finished with value: 0.8965444023481411 and parameters: {'n_estimators': 100, 'max_depth': 6, 'min_samples_leaf': 5}. Best is trial 2 with value: 0.9216258739787805.
[I 2023-11-07 00:29:31,870] Trial 4 finished with value: 0.9159836108741218 and parameters: {'n_estim

Наилучшие значения гиперпараметров {'n_estimators': 200, 'max_depth': 10, 'min_samples_leaf': 4}
f1_score на обучающем наборе: 0.92
CPU times: total: 9min 12s
Wall time: 9min 16s


In [41]:
RFC_model_best = RandomForestClassifier(**study.best_params,random_state=0)
predictor(RFC_model)

train: recall=  1.00, f1 =  1.00
              precision    recall  f1-score   support

           0       0.98      0.99      0.98     99549
           1       0.46      0.28      0.35      3293

    accuracy                           0.97    102842
   macro avg       0.72      0.64      0.67    102842
weighted avg       0.96      0.97      0.96    102842

Processed in  82.7265 seconds


### 5. Дополнительные деревья

In [42]:
ETC_model = ExtraTreesClassifier(n_estimators=200, max_depth=15, min_samples_leaf=5,random_state=0)
predictor(ETC_model)

train: recall=  0.96, f1 =  0.94
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     99549
           1       0.16      0.57      0.25      3293

    accuracy                           0.89    102842
   macro avg       0.57      0.74      0.60    102842
weighted avg       0.96      0.89      0.92    102842

Processed in  124.6071 seconds


### 6. Адаптивный бустинг (AdaBoost)

In [43]:
ABC_model = AdaBoostClassifier(learning_rate=0.5,random_state=0)
predictor(ABC_model)

train: recall=  0.92, f1 =  0.91
              precision    recall  f1-score   support

           0       0.99      0.91      0.94     99549
           1       0.17      0.59      0.27      3293

    accuracy                           0.90    102842
   macro avg       0.58      0.75      0.61    102842
weighted avg       0.96      0.90      0.92    102842

Processed in  48.1124 seconds


### 7. Градиентный бустинг (Gradient Boosting)

In [44]:
GBC_model = GradientBoostingClassifier(learning_rate=0.1,random_state=0)
predictor(GBC_model)

train: recall=  0.93, f1 =  0.93
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     99549
           1       0.20      0.59      0.30      3293

    accuracy                           0.91    102842
   macro avg       0.59      0.75      0.63    102842
weighted avg       0.96      0.91      0.93    102842

Processed in  142.6401 seconds


### 8. Экстремальное повышение градиента (XGBoost)

In [45]:
XGB_model = XGBClassifier(random_state=0)
predictor(XGB_model)

train: recall=  0.97, f1 =  0.98
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     99549
           1       0.40      0.40      0.40      3293

    accuracy                           0.96    102842
   macro avg       0.69      0.69      0.69    102842
weighted avg       0.96      0.96      0.96    102842

Processed in  9.3049 seconds


### 9. Повышение градиента на основе гистограммы

In [46]:
XGBC_model = HistGradientBoostingClassifier(learning_rate=0.1,random_state=0)
predictor(XGBC_model)

train: recall=  0.97, f1 =  0.97
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     99549
           1       0.32      0.49      0.39      3293

    accuracy                           0.95    102842
   macro avg       0.65      0.73      0.68    102842
weighted avg       0.96      0.95      0.95    102842

Processed in  16.7821 seconds


In [47]:
from sklearn import ensemble

def optuna_rf(trial):
  # задаем пространства поиска гиперпараметров
  learning_rate = trial.suggest_float('learning_rate', 0.1, 1)
  max_depth = trial.suggest_int('max_depth', 5, 10, 1)
  min_samples_leaf = trial.suggest_int('min_samples_leaf', 15, 25, 1)

  # создаем модель
  model = HistGradientBoostingClassifier (learning_rate=learning_rate,
                                 max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=0)
  # обучаем модель
  model.fit(X_train_s, y_train_s)
  score = metrics.f1_score(y_train_s, model.predict(X_train_s))

  return score
  

In [48]:
%%time
# cоздаем объект исследования
# можем напрямую указать, что нам необходимо максимизировать метрику direction="maximize"
study = optuna.create_study(study_name="HistGradientBoostingClassifier", direction="maximize")
# ищем лучшую комбинацию гиперпараметров n_trials раз
study.optimize(optuna_rf, n_trials=10)

# выводим результаты на обучающей выборке
print("Наилучшие значения гиперпараметров {}".format(study.best_params))
print("f1_score на обучающем наборе: {:.2f}".format(study.best_value))

[I 2023-11-07 00:41:50,802] A new study created in memory with name: HistGradientBoostingClassifier
[I 2023-11-07 00:42:03,031] Trial 0 finished with value: 0.9661504092110068 and parameters: {'learning_rate': 0.1970760271321534, 'max_depth': 5, 'min_samples_leaf': 19}. Best is trial 0 with value: 0.9661504092110068.
[I 2023-11-07 00:42:09,146] Trial 1 finished with value: 0.9623391219696693 and parameters: {'learning_rate': 0.8620462375726238, 'max_depth': 8, 'min_samples_leaf': 19}. Best is trial 0 with value: 0.9661504092110068.
[I 2023-11-07 00:42:16,833] Trial 2 finished with value: 0.9742678379246906 and parameters: {'learning_rate': 0.6176284879142232, 'max_depth': 7, 'min_samples_leaf': 25}. Best is trial 2 with value: 0.9742678379246906.
[I 2023-11-07 00:42:22,272] Trial 3 finished with value: 0.9669610490195024 and parameters: {'learning_rate': 0.6647396792332775, 'max_depth': 10, 'min_samples_leaf': 19}. Best is trial 2 with value: 0.9742678379246906.
[I 2023-11-07 00:42:32,

Наилучшие значения гиперпараметров {'learning_rate': 0.25480944050604853, 'max_depth': 9, 'min_samples_leaf': 23}
f1_score на обучающем наборе: 0.98
CPU times: total: 6min 13s
Wall time: 1min 24s


In [49]:
XGBC_model_best = HistGradientBoostingClassifier(**study.best_params,random_state=0)
predictor(XGBC_model_best)

train: recall=  0.97, f1 =  0.98
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     99549
           1       0.39      0.43      0.41      3293

    accuracy                           0.96    102842
   macro avg       0.68      0.71      0.69    102842
weighted avg       0.96      0.96      0.96    102842

Processed in  12.6526 seconds


10. Метод ближайших соседей

In [50]:
kNN_model = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
predictor(kNN_model)

train: recall=  1.00, f1 =  0.97
              precision    recall  f1-score   support

           0       0.98      0.92      0.95     99549
           1       0.18      0.52      0.27      3293

    accuracy                           0.91    102842
   macro avg       0.58      0.72      0.61    102842
weighted avg       0.96      0.91      0.93    102842

Processed in  464.4241 seconds


### Промежуточный итог


In [51]:
metrics_df = pd.DataFrame(
    data = list_model,
    columns= ['model_name','precision','recall','f1','accuracy'])

metrics_df

Unnamed: 0,model_name,precision,recall,f1,accuracy
0,LogisticRegression(random_state=0),0.078108,0.535682,0.136337,0.782686
1,LogisticRegression(random_state=0),0.078108,0.535682,0.136337,0.782686
2,DecisionTreeClassifier(random_state=0),0.28312,0.369268,0.320506,0.949865
3,DecisionTreeClassifier(random_state=0),0.28312,0.369268,0.320506,0.949865
4,BaggingClassifier(random_state=0),0.397217,0.303371,0.344008,0.962953
5,RandomForestClassifier(random_state=0),0.464516,0.284239,0.352675,0.96659
6,RandomForestClassifier(random_state=0),0.464516,0.284239,0.352675,0.96659
7,"ExtraTreesClassifier(max_depth=15, min_samples...",0.162292,0.574552,0.253093,0.891416
8,"AdaBoostClassifier(learning_rate=0.5, random_s...",0.172659,0.590647,0.267207,0.896268
9,GradientBoostingClassifier(random_state=0),0.199442,0.586395,0.297649,0.911388


Показатели метрик слишком малы.

### Рассмотрим ансамбли ансамблей. Выберем все модели, показавшие recall выше 0.55. 

In [52]:
estimators = [('et', ExtraTreesClassifier(n_estimators=200, max_depth=15, min_samples_leaf=5,random_state=0)),
              ('ab', AdaBoostClassifier(learning_rate=0.5,random_state=0)),
              ('gb', GradientBoostingClassifier(learning_rate=0.1,random_state=0))
              ]

11. Стек оценок с итоговым классификатором (Stacking)

In [53]:
SC_model = StackingClassifier(estimators=estimators)
predictor(SC_model)

train: recall=  0.95, f1 =  0.94
              precision    recall  f1-score   support

           0       0.98      0.93      0.96     99549
           1       0.20      0.55      0.29      3293

    accuracy                           0.92    102842
   macro avg       0.59      0.74      0.63    102842
weighted avg       0.96      0.92      0.93    102842

Processed in  1419.6196 seconds


12. Жесткий классификатор голосования

In [54]:
from sklearn.ensemble import VotingClassifier

VCh_model = VotingClassifier(estimators=estimators, voting='hard')
predictor(VCh_model)

train: recall=  0.94, f1 =  0.93
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     99549
           1       0.19      0.59      0.29      3293

    accuracy                           0.91    102842
   macro avg       0.59      0.75      0.62    102842
weighted avg       0.96      0.91      0.93    102842

Processed in  315.3704 seconds


13. Средневзвешенные вероятности (классификатор мягкого голосования)

In [55]:
VCs_model = VotingClassifier(estimators=estimators, voting='soft')
predictor(VCs_model)

train: recall=  0.95, f1 =  0.94
              precision    recall  f1-score   support

           0       0.99      0.92      0.95     99549
           1       0.19      0.59      0.29      3293

    accuracy                           0.91    102842
   macro avg       0.59      0.75      0.62    102842
weighted avg       0.96      0.91      0.93    102842

Processed in  313.8726 seconds


14. Подведем итоги по  всем построенным моделям.

In [56]:
metrics_df = pd.DataFrame(
    data = list_model,
    columns= ['model_name','precision','recall','f1','accuracy'])

metrics_df

Unnamed: 0,model_name,precision,recall,f1,accuracy
0,LogisticRegression(random_state=0),0.078108,0.535682,0.136337,0.782686
1,LogisticRegression(random_state=0),0.078108,0.535682,0.136337,0.782686
2,DecisionTreeClassifier(random_state=0),0.28312,0.369268,0.320506,0.949865
3,DecisionTreeClassifier(random_state=0),0.28312,0.369268,0.320506,0.949865
4,BaggingClassifier(random_state=0),0.397217,0.303371,0.344008,0.962953
5,RandomForestClassifier(random_state=0),0.464516,0.284239,0.352675,0.96659
6,RandomForestClassifier(random_state=0),0.464516,0.284239,0.352675,0.96659
7,"ExtraTreesClassifier(max_depth=15, min_samples...",0.162292,0.574552,0.253093,0.891416
8,"AdaBoostClassifier(learning_rate=0.5, random_s...",0.172659,0.590647,0.267207,0.896268
9,GradientBoostingClassifier(random_state=0),0.199442,0.586395,0.297649,0.911388


Если при обучении моделей на тренировочной выборке показатели матрик показывают хороший результат, то на тестовой выборке метрики слишком низкие из-за несбалансированности целевой переменной. Ясно, что для построения качественной модели необходимо дополнение в стартовые данные. 

Хорошей модели мы не получили, но для выполнения задания подготовки модели для продакшена выберем модель с большим значением метрики f1. Это модель HistGradientBoostingClassifier с подобранными гиперпараметрами.

In [57]:
import pickle
# Производим сериализацию обученной модели
model = pickle.dumps(XGBC_model_best)

print(type(model))
print(type(XGBC_model_best))

# Сохранение выбранной обученной модели в файл pickle
with open("XGBC_model_best.pkl", "wb") as output:
    pickle.dump(XGBC_model_best, output)

<class 'bytes'>
<class 'sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier'>
