<h1> Импорт библиотек </h1>

In [1]:
import pandas as pd
import numpy as np
import joblib
import os

from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import VotingClassifier

<h1> Загрузка данных </h1>

In [3]:
# Load the raw training data.
data = pd.read_csv('../data/raw/train.csv')

# Hold out 20% of the rows for validation. The split is stratified on the
# target column 'Exited' and seeded so that it is reproducible.
data_train, val_train = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['Exited']
)

# Persist each split to its OWN file. The previous version wrote both frames
# to the same path ('../processed'), so the validation split silently
# overwrote the training split.
# NOTE(review): '../data/processed' is assumed to be the intended location
# (sibling of '../data/raw') — adjust if the project layout differs.
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)
data_train.to_csv(os.path.join(processed_dir, 'train.csv'), index=False)
val_train.to_csv(os.path.join(processed_dir, 'val.csv'), index=False)

**Разбиваем данные, выделяем признаки**

In [4]:
# Columns removed from the feature matrices (includes the target 'Exited').
non_feature_columns = ['id', 'CustomerId', 'Surname', 'Exited']

# Training split: feature matrix and target.
X_train = data_train.drop(columns=non_feature_columns)
y_train = data_train['Exited']

# Validation split, used only to measure the metric on held-out rows.
X_val = val_train.drop(columns=non_feature_columns)
y_val = val_train['Exited']

# Categorical features (passed to CatBoost by name).
cat_features = ['Geography', 'Gender']

<h1> Обучаем CatBoostClassifier </h1>

In [7]:
# Detect whether a GPU is available
import subprocess

def detect_catboost_task_type():
    """Return the CatBoost ``task_type`` to use: ``'GPU'`` or ``'CPU'``.

    The check runs ``nvidia-smi``. If the command runs and exits with
    status 0, ``'GPU'`` is returned. If the executable cannot be started
    (``OSError``, e.g. it is not installed) or it fails
    (``subprocess.SubprocessError``, e.g. non-zero exit status), ``'CPU'``
    is returned.

    Only these expected failures are caught, so ``KeyboardInterrupt`` and
    programming errors are no longer silently swallowed by a bare ``except``.
    """
    try:
        # Only the exit status matters; the command output is discarded.
        subprocess.check_output(['nvidia-smi'], stderr=subprocess.DEVNULL)
    except (OSError, subprocess.SubprocessError):
        return 'CPU'
    return 'GPU'

task_type = detect_catboost_task_type()

In [11]:
# Hyperparameter grid to search over: 3 * 3 * 2 * 3 = 54 combinations.
param_grid_catboost = dict(
    learning_rate=[0.05, 0.2, 0.6],
    depth=[4, 6, 8],
    n_estimators=[200, 1000],
    l2_leaf_reg=[2, 5, 10],
)

# Base estimator for the search: silent training, categorical columns passed
# by name, and the device chosen by detect_catboost_task_type().
cat_model = CatBoostClassifier(
    task_type=task_type,
    cat_features=cat_features,
    verbose=0,
)

**Запуск модели**

In [None]:
# Exhaustive search over param_grid_catboost, scored by ROC AUC with
# 15-fold cross-validation (every grid point is fitted 15 times).
grid_search_catboost = GridSearchCV(
    cat_model,
    param_grid_catboost,
    scoring='roc_auc',
    cv=15,
    verbose=1,
)

grid_search_catboost.fit(X_train, y_train)

**Поиск лучших параметров и выбор лучшей модели**

In [None]:
# Report the winning hyperparameters and the estimator refitted with them.
for label, value in (
    ("Лучшие параметры: ", grid_search_catboost.best_params_),
    ("Лучшая модель: ", grid_search_catboost.best_estimator_),
):
    print(label, value)

**На валидации измеряем roc_auc**

In [None]:
# The fitted-search attribute is `best_estimator_` (trailing underscore);
# `best_estimator` does not exist and raised AttributeError.
best_catboost = grid_search_catboost.best_estimator_

# Probability of the positive class (column 1) on the held-out split.
best_catboost_pred = best_catboost.predict_proba(X_val)[:, 1]
roc_auc_catb = roc_auc_score(y_val, best_catboost_pred)

# Print the metric computed above (the old code referenced an undefined `roc_auc`).
print(f'ROC_AUC на отложенной выборке для CatBoostClassifier: {roc_auc_catb:.4f}')

In [None]:
# Save the best CatBoost model found by the grid search.
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_catboost, '../models/best_catboost.pkl')

<h1> Обучаем XGBClassifier </h1>

In [15]:
from xgboost import XGBClassifier

# One-hot encode the training features (XGBoost is fed dummy columns here
# instead of raw categorical strings). get_dummies returns a new frame, so no
# explicit copy of X_train is needed.
X_train_xgb = pd.get_dummies(X_train)
y_train_xgb = y_train.copy()

# Encode the validation features, then align them to the TRAINING columns:
# encoding the two splits independently can yield different column sets or
# order when a category is missing from one split. Columns absent in
# validation are filled with 0; validation-only columns are dropped.
X_val_xgb = pd.get_dummies(X_val).reindex(columns=X_train_xgb.columns, fill_value=0)
y_val_xgb = y_val.copy()

In [14]:
# XGBoost estimator with library-default settings; the grid below overrides
# learning_rate, n_estimators and max_depth during the search.
xgb_model = XGBClassifier()

# Hyperparameter grid: 4 * 6 * 8 = 192 combinations.
params = dict(
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    n_estimators=[30, 50, 70, 100, 120, 200],
    max_depth=[5, 7, 9, 11, 15, 20, 25, 50],
)

**Запускаем модель**

In [None]:
# Exhaustive search over `params`, scored by ROC AUC with 15-fold CV.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring='roc_auc',
    cv=15,
    verbose=1
)

# Fit on the one-hot encoded training data (fixes the `X_trrain_xgb` typo,
# which raised NameError).
grid_search_xgb.fit(X_train_xgb, y_train_xgb)

In [None]:
# Estimator refitted with the best hyperparameters found by the search.
best_xgb = grid_search_xgb.best_estimator_

# Probability of the positive class (column 1) on the held-out split.
best_xgb_pred = best_xgb.predict_proba(X_val_xgb)[:, 1]
roc_auc_xgb = roc_auc_score(y_val_xgb, best_xgb_pred)

# Print the metric computed above (the old code referenced an undefined `roc_auc`).
print(f'ROC_AUC на отложенной выборке для XGBClassifier: {roc_auc_xgb:.4f}')

In [None]:
# Save the best XGBoost model. The previous version dumped `best_catboost`
# into best_xgb.pkl, so the XGBoost model was never saved.
# joblib.dump does not create missing directories, so ensure the folder exists.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_xgb, '../models/best_xgb.pkl')

<h1>Обучаем ансамбль</h1>

- **Создаем ансамбль catboost'ов**
- **Усредняем предсказания**

**Создаем модели**

In [18]:
def _make_catboost(learning_rate, depth):
    """Build one ensemble member.

    All members share iterations=100, random_state=42, verbose=0,
    cat_features and task_type; only learning_rate and depth vary.
    """
    return CatBoostClassifier(
        learning_rate=learning_rate,
        depth=depth,
        iterations=100,
        random_state=42,
        verbose=0,
        cat_features=cat_features,
        task_type=task_type,
    )


# NOTE(review): model1 and model6 are configured identically
# (learning_rate=0.1, depth=6) — confirm whether that is intended.
model1 = _make_catboost(0.1, 6)
model2 = _make_catboost(0.2, 6)
model3 = _make_catboost(0.1, 3)
model4 = _make_catboost(0.3, 5)
model5 = _make_catboost(0.4, 4)

model6 = _make_catboost(0.1, 6)
model7 = _make_catboost(0.1, 4)
model8 = _make_catboost(0.15, 7)
model9 = _make_catboost(0.25, 3)
model10 = _make_catboost(0.35, 5)

**Строим ансамбль и запускаем**

In [None]:
# Ensemble members in order; they are registered as 'cat1' ... 'cat10'.
ensemble_members = [
    model1, model2, model3, model4, model5,
    model6, model7, model8, model9, model10,
]

ensemble = VotingClassifier(
    estimators=[
        (f'cat{position}', member)
        for position, member in enumerate(ensemble_members, start=1)
    ],
    # 'soft' averages predicted probabilities; 'hard' would use majority vote.
    voting='soft',
)

ensemble.fit(X_train, y_train)

**Измеряем roc_auc ансамбля на валидации**

In [None]:
# Averaged probability of the positive class (column 1) on the held-out split.
ensemble_pred = ensemble.predict_proba(X_val)[:, 1]
roc_auc_ensemble = roc_auc_score(y_val, ensemble_pred)

# Print the metric computed above (the old code referenced an undefined `roc_auc`).
print(f'ROC_AUC на отложенной выборке для Ensemble: {roc_auc_ensemble:.4f}')

In [None]:
# Save the fitted ensemble MODEL. The previous version dumped `ensemble_pred`
# (the array of validation predictions) instead of the estimator.
# joblib.dump does not create missing directories, so ensure the folder exists.
os.makedirs('../models', exist_ok=True)
joblib.dump(ensemble, '../models/best_ensemble.pkl')

<h1>Находим лучшую модель</h1>

In [None]:
# Pick the (name, ROC AUC) pair with the highest validation ROC AUC.
# Fixes the `labmda` typo, which made the whole cell a SyntaxError.
best_model = max([
    ("CatBoost", roc_auc_catb),
    ("XGBClassifier", roc_auc_xgb),
    ("Ensemble", roc_auc_ensemble)
], key=lambda x: x[1])

print(f"Best model: {best_model[0]} with ROC-AUC: {best_model[1]:.4f}")