In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from  sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing  import LabelEncoder
from sklearn import linear_model 
from sklearn import tree 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from scipy.stats import zscore

Подготовалтиваем данные для моделей

In [3]:
df_balances=pd.read_csv('../data/processed/bank_fin_processed.csv')

In [4]:
X = df_balances.drop(['deposit'], axis=1)
y = df_balances['deposit']
 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42, test_size = 0.33)
select_best_estimator = SelectKBest(score_func=f_classif, k=15).fit(X_train, y_train)
X_train = pd.DataFrame(select_best_estimator.transform(X_train), columns = list(select_best_estimator.get_feature_names_out()))
X_test = pd.DataFrame(select_best_estimator.transform(X_test), columns = list(select_best_estimator.get_feature_names_out()))

In [5]:
#Нормализация
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [6]:
def test_model(clf, X_test, y_test):
    print("accuracy: ", metrics.accuracy_score(y_test, clf.predict(X_test)))
    print("f1 score: ", metrics.f1_score(y_test, clf.predict(X_test)))
    print("precision: ", metrics.precision_score(y_test, clf.predict(X_test)))
    print("recall: ", metrics.recall_score(y_test, clf.predict(X_test)))

In [7]:
# обучаем логистическую регрессию и рассчитайте метрики качества

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver="sag", random_state=42).fit(X_train, y_train)
test_model(clf, X_test, y_test)

accuracy:  0.8031430934656741
f1 score:  0.7811158798283262
precision:  0.8229974160206718
recall:  0.7432905484247374


In [8]:
# обучаем решающие деревья, настраиваем максимальную глубину

clf = DecisionTreeClassifier(criterion = 'entropy', random_state=42)
clf.fit(X_train, y_train)

print("train accuracy: ", metrics.accuracy_score(y_train, clf.predict(X_train)))
print("test metrics: ")
test_model(clf, X_test, y_test)

train accuracy:  0.9997283346916599
test metrics: 
accuracy:  0.7515853322304935
f1 score:  0.7327202610501334
precision:  0.7453228726614364
recall:  0.720536756126021


Наблюдаем сильное переобучение.  
Подберем наилучшее значение гиперпараметра - глубину дерева

for depth in range(2, 10):
    clf = DecisionTreeClassifier(max_depth=depth, criterion = 'entropy', random_state=42)
    clf.fit(X_train, y_train)

    print(f"MX_DEPTH={depth}")
    print("train accuracy: ", metrics.accuracy_score(y_train, clf.predict(X_train)))
    print("test metrics: ")
    test_model(clf, X_test, y_test)
    print("\n"*3)

In [9]:
# подбераем оптимальные параметры с помощью gridsearch
svc = DecisionTreeClassifier()
parameters = {'min_samples_split': [2, 5, 7, 10], 'max_depth':[3, 5, 7], "random_state": [42]}
clf = GridSearchCV(svc, parameters)

In [10]:
clf.fit(X_train, y_train)
print(test_model(clf, X_test, y_test))
print("Лучшие параметры: ", clf.best_params_)

accuracy:  0.8149986214502344
f1 score:  0.8116755543081673
precision:  0.7820443482963764
recall:  0.8436406067677946
None
Лучшие параметры:  {'max_depth': 7, 'min_samples_split': 2, 'random_state': 42}


#### Решение задачи классификации: ансамбли моделей и построение прогноза

In [11]:
# обучаем на данных случайный лес
clf = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf = 5, max_depth = 10, random_state = 42)
clf.fit(X_train, y_train)
test_model(clf, X_test, y_test)

accuracy:  0.8235456299972429
f1 score:  0.8159861989649224
precision:  0.8044217687074829
recall:  0.8278879813302217


In [12]:
# используем для классификации градиентный бустинг и сравниваемЧ качество со случайным лесом
# обучаем на  данных случайный лес
clf = GradientBoostingClassifier(n_estimators = 300, min_samples_leaf = 5, max_depth = 5, random_state = 42, learning_rate=0.05)
clf.fit(X_train, y_train)
test_model(clf, X_test, y_test)

accuracy:  0.8249241797628895
f1 score:  0.8161019403417318
precision:  0.8102357676825762
recall:  0.8220536756126021


In [13]:

clf1 = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf = 5, max_depth = 10, random_state = 42)
clf2 = GradientBoostingClassifier(n_estimators = 300, min_samples_leaf = 5, max_depth = 5, random_state = 42, learning_rate=0.05)
clf3 = LogisticRegression(solver="sag", random_state=42).fit(X_train, y_train)

estimators = [("rf", clf1), ("gbc", clf2), ("lr", clf3)]
main_clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())
main_clf.fit(X_train, y_train)
test_model(main_clf, X_test, y_test)

accuracy:  0.8238213399503722
f1 score:  0.8136482939632546
precision:  0.8134110787172012
recall:  0.8138856476079347


In [14]:
# оцениваем, какие признаки демонстрируют наибольшую  важность в модели градиентного бустинга
clf = GradientBoostingClassifier(n_estimators = 300, min_samples_leaf = 5, max_depth = 5, random_state = 42, learning_rate=0.05)
clf.fit(X_train, y_train)
df_balances_feature_imp = pd.Series(clf.feature_importances_, index=X_train.columns.values)
print(df_balances_feature_imp)

balance             0.070068
housing             0.053103
loan                0.008972
duration            0.510826
campaign            0.021073
pdays               0.051025
previous            0.013983
job_student         0.006209
contact_unknown     0.080405
month_mar           0.035356
month_may           0.010407
month_oct           0.021709
month_sep           0.009282
poutcome_success    0.106889
poutcome_unknown    0.000694
dtype: float64


In [15]:
# Реализуем оптимизацию гиперпараметров с помощью Optuna

import optuna

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200, 1)
    max_depth = trial.suggest_int('max_depth', 10, 30, 1)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 2, 10, 1)

    clf = RandomForestClassifier(n_estimators = n_estimators,
                                    min_samples_leaf = min_samples_leaf,
                                    max_depth = max_depth, random_state = 42, criterion = 'gini')
    clf.fit(X_train, y_train)

    return metrics.f1_score(y_test, clf.predict(X_test))

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-06-17 16:48:22,628] A new study created in memory with name: no-name-223ba16d-a7a7-4fba-b7ce-e4be61c1d1dd
[I 2024-06-17 16:48:23,150] Trial 0 finished with value: 0.8161281098084072 and parameters: {'n_estimators': 143, 'max_depth': 19, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.8161281098084072.
[I 2024-06-17 16:48:23,748] Trial 1 finished with value: 0.8157516527737856 and parameters: {'n_estimators': 174, 'max_depth': 25, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8161281098084072.
[I 2024-06-17 16:48:24,203] Trial 2 finished with value: 0.81566161760503 and parameters: {'n_estimators': 118, 'max_depth': 25, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8161281098084072.
[I 2024-06-17 16:48:24,578] Trial 3 finished with value: 0.8121387283236994 and parameters: {'n_estimators': 115, 'max_depth': 11, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8161281098084072.
[I 2024-06-17 16:48:24,984] Trial 4 finished with value: 0.8146651270207852

In [16]:
best_params = study.best_params
clf = RandomForestClassifier(random_state = 42, **best_params).fit(X_train, y_train)
test_model(clf, X_test, y_test)

accuracy:  0.8293355390129584
f1 score:  0.8228898426323319
precision:  0.8074115665356542
recall:  0.838973162193699


Результат был улучшен по сравнению с предыдущими  моделями

### Альтернативный метод оптимизации гиперпараметров
#### Применим популярный метод - Hyperopt
Hyperopt использует в основном алгоритмы, основанные на процессах оценки, такие как Tree-structured Parzen Estimator (TPE) и случайный поиск.
Код для его установки можно запустить в слудующей ячейке

In [17]:
# !pip install hyperopt

In [18]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

# Определение пространства поиска гиперпараметров
space = {
    'n_estimators': hp.choice('n_estimators', range(100, 201)),  # границы, как в Optuna
    'max_depth': hp.choice('max_depth', range(10, 31)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(2, 11))
}

# Функция для оптимизации
def objective(params):
    clf = RandomForestClassifier(**params, random_state=42, criterion='gini')
    clf.fit(X_train, y_train)
    score = metrics.f1_score(y_test, clf.predict(X_test))
    return {'loss': -score, 'status': STATUS_OK, 'params': params}

# Запуск оптимизации
trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

print("Лучшие параметры:")
print(best)

100%|██████████| 50/50 [00:25<00:00,  1.94trial/s, best loss: -0.822419216471261]
Лучшие параметры:
{'max_depth': 1, 'min_samples_leaf': 0, 'n_estimators': 84}


In [19]:
best = trials.best_trial["result"]["params"]

In [20]:
clf = RandomForestClassifier(random_state = 42, **best).fit(X_train, y_train)
test_model(clf, X_test, y_test)

accuracy:  0.8287841191066998
f1 score:  0.822419216471261
precision:  0.8065058889512058
recall:  0.838973162193699


Также в коде выше представлен дополнительный меотд подбора гиперпараметров - Hyperopt
В данном случае он показал себя хуже чем популярный метод - Optuna
Метрики итоговой модели прииспользовании Optuna:
accuracy:  0.8293355390129584
f1 score:  0.823193373321908
precision:  0.8063794068270845
recall:  0.8407234539089848

Метрики итоговой модели прииспользовании Hyperopt:
accuracy:  0.8287841191066998
f1 score:  0.822520720205773
precision:  0.8061624649859944
recall:  0.8395565927654609

Таким образом, немного лучше себя показывает перебор параметров с помощью алгоритма Optuna