# Описание задачи

Ссылка на данные: https://ods.ai/competitions/mtsmlcup

Можно ли составить хотя бы приблизительное представление о человеке, обладая информацией о сайтах, которые он посещает? Можно ли по таким цифровым следам пользователя (на каких сайтах с каких IP он сидел, сколько раз заходил, какое у него устройство) понять, кто этот пользователь? Мужчина или женщина? 

Действительно, в Digital-рекламе часто сегмент включает себя пол. Эта задача особенно актуальна для рекламных DSP-площадок, которые в OpenRTB запросах получают такие данные с частотой 200 000 запросов в секунду со всех сайтов, размещающих рекламу за деньги.

В этой части проекта осуществляем обучение базовой модели, подбор оптимальных гиперпараметров, а также уменьшаем переобучение. 

Описание данных:
- user_id – ID пользователя;
- age – Возраст пользователя;
- is_male – Пол пользователя : мужчина (1-Да, 0-Нет);
- 0-49 - массив скрытых факторов для каждого пользователя.

Target - is_male.

# Import

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
recall_score, log_loss, f1_score

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import optuna

import warnings
warnings.filterwarnings('ignore')

RAND = 10
N_FOLDS = 5

In [2]:
data_path = 'data/df.csv'

In [3]:
df = pd.read_csv(data_path)

In [5]:
df.head()

Unnamed: 0,age,is_male,user_id,0,1,2,3,4,5,6,...,40,41,42,43,44,45,46,47,48,49
0,41.0,0,99002,-0.018215,0.056048,0.036799,0.052251,-0.013346,-0.023392,0.043092,...,0.052035,-0.121878,0.055076,-0.015839,0.018824,-0.094934,0.015466,0.011858,0.013139,-0.054794
1,41.0,1,29286,0.008094,0.001213,-0.000524,0.001148,0.003961,-0.002873,-0.00093,...,0.002693,-0.004197,0.006693,0.001544,0.003873,0.00077,0.003408,0.003392,-0.001343,0.000868
2,53.0,1,353838,0.02118,0.068266,0.096674,0.065289,-0.009294,0.058854,-0.021472,...,0.059732,0.021463,0.042012,0.043192,0.019687,-0.022575,0.016078,0.011212,0.055767,-0.080996
3,24.0,1,159197,0.044245,-0.079523,-0.012748,0.010571,-0.007838,-0.033496,0.035646,...,0.033311,-0.013801,0.081155,-0.040711,0.018557,-0.029355,0.005674,0.014074,-0.030055,0.023635
4,33.0,0,32977,0.014237,0.048163,-0.003383,0.007215,0.029555,-0.014801,0.010186,...,0.02004,-0.001924,0.028181,0.003136,0.006599,0.040024,0.002026,0.02501,-0.015105,0.013612


In [6]:
def get_metrics(y_test: pd.Series, 
                y_pred: np.ndarray, 
                y_score: np.ndarray, 
                name: str) -> pd.DataFrame:
    """
    Cоздает pd.DataFrame с основными метриками
    """
    df_metrics = pd.DataFrame()
    
    df_metrics['model'] = [name]
    
    # Основные метрики для задачи классификации
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['Logloss'] = log_loss(y_test, y_score)
    
    return df_metrics

In [7]:
def check_overfitting(y_train,
                      y_test,
                      X_train=None,
                      X_test=None,
                      model=None,
                      y_train_proba=None,
                      y_test_proba=None):
    """
    Проверяет на overfitting
    """
    if model is None:
        value_train = roc_auc_score(y_train, y_train_proba)
        value_test = roc_auc_score(y_test, y_test_proba)
    else:
        y_pred_train = model.predict_proba(X_train)[:, 1]
        y_pred_test = model.predict_proba(X_test)[:, 1]
        value_train = roc_auc_score(y_train, y_pred_train)
        value_test = roc_auc_score(y_test, y_pred_test)

    print(f'roc_auc_score train: %.3f' % value_train)
    print(f'roc_auc_score test: %.3f' % value_test)
    print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

# Modeling

In [8]:
X = df.drop(['user_id', 'age', 'is_male'], axis=1)
y = df['is_male']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=RAND)

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.16,
                                                    shuffle=True,
                                                    random_state=RAND)

## LightGBM

In [9]:
clf = LGBMClassifier(random_state=RAND)
eval_set = [(X_val, y_val)]

clf.fit(X_train_,
        y_train_,
        eval_metric="auc",
        eval_set=eval_set,
        early_stopping_rounds=100,
        verbose=False)

In [10]:
y_pred = clf.predict(X_train_)
y_score = clf.predict_proba(X_train_)
metrics = get_metrics(y_train_,
                      y_pred,
                      y_score,
                      name='LightTrain')
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676


In [11]:
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
metrics = metrics.append(get_metrics(y_test, 
                                     y_pred, 
                                     y_score, 
                                     name='LightTest'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623


In [12]:
check_overfitting(y_train=y_train_, 
                  y_test=y_test, 
                  X_train=X_train_, 
                  X_test=X_test, 
                  model=clf)

roc_auc_score train: 0.932
roc_auc_score test: 0.786
delta = 18.5 %


**Вывод:** На тренировочных данных LGBMClassifier в baseline показал хорошие результаты. однако на тестовых данных основные метрики значительно ухудшились, что говорит о присутствии переобучения. 

## Catboost

In [13]:
clf = CatBoostClassifier(random_state=RAND, eval_metric="AUC")

clf.fit(X_train_,
        y_train_,
        eval_set=eval_set,
        early_stopping_rounds=100, 
        use_best_model=True,
        verbose=False)

<catboost.core.CatBoostClassifier at 0x7f85e0024c70>

In [14]:
y_pred = clf.predict(X_train_)
y_score = clf.predict_proba(X_train_)
metrics = metrics.append(
    get_metrics(y_train_, y_pred, y_score, name='CatTrain'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393


In [15]:
y_pred = clf.predict(X_test)
y_score = clf.predict_proba(X_test)
metrics = metrics.append(
    get_metrics(y_test, y_pred, y_score, name='CatTest'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393
0,CatTest,0.715094,0.802328,0.706845,0.724638,0.715631,0.535439


In [16]:
check_overfitting(y_train=y_train_, 
                  y_test=y_test, 
                  X_train=X_train_, 
                  X_test=X_test, 
                  model=clf)

roc_auc_score train: 0.949
roc_auc_score test: 0.802
delta = 18.3 %


**Вывод:** На тренировочных данных CatBoostClassifier в baseline отработал лучше, чем LGBMClassifier. Но также присутствует достаточно сильное переобучение. 
Для дальнейшей работы, я выбираю CatBoostClassifier и буду улучшать характеристики полученные им на выходе. 

# Tuning

In [17]:
grid = {
    'n_estimators': [1000],
    'learning_rate': np.linspace(0.01, 0.1, 5),
    'boosting_type' : ['Ordered', 'Plain'],
    'max_depth': list(range(3, 12)),
    'l2_leaf_reg': np.logspace(-5, 2, 5),
    'random_strength': list(range(10, 50, 5)),
    'bootstrap_type': ["Bayesian", "Bernoulli", "MVS", "No"],
    'border_count': [128, 254],
    'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"],
    'random_state': [RAND]
}

model = CatBoostClassifier(eval_metric="AUC", silent=True)
grid_search_result = model.randomized_search(grid,
                                             X=X_train,
                                             y=y_train, 
                                             verbose=False)


bestTest = 0.7842724225
bestIteration = 657

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8077205856
bestIteration = 397

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8051721289
bestIteration = 979

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.7770220608
bestIteration = 999

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8108067084
bestIteration = 930

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Training on fold [0/3]

bestTest = 0.8056508806
bestIteration = 778

Training on fold [1/3]

bestTes

In [18]:
cat_best = grid_search_result['params']

In [19]:
cat_grid = CatBoostClassifier(**cat_best,
                             eval_metric='AUC')
cat_grid.fit(X_train_,
             y_train_,
             eval_set=eval_set,
             early_stopping_rounds=100,
             verbose=False)

<catboost.core.CatBoostClassifier at 0x7f85e00249d0>

In [20]:
y_pred = cat_grid.predict(X_test)
y_score = cat_grid.predict_proba(X_test)
metrics = metrics.append(
    get_metrics(y_test, y_pred, y_score, name='CatGrid'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393
0,CatTest,0.715094,0.802328,0.706845,0.724638,0.715631,0.535439
0,CatGrid,0.717736,0.805724,0.70821,0.730359,0.719114,0.537844


In [21]:
check_overfitting(y_train=y_train_, 
                  y_test=y_test, 
                  X_train=X_train_, 
                  X_test=X_test, 
                  model=cat_grid)

roc_auc_score train: 0.877
roc_auc_score test: 0.806
delta = 8.9 %


**Вывод:** После подбора гиперпараметров все основные метрики, включая ROC_AUC улучшились. Но переобучение по прежнему пресутствует. 

# Holdout

In [22]:
# ваш код
def cross_validation_cat(X_train: pd.DataFrame,
                         y_train: pd.Series,
                         X_test: pd.DataFrame,
                         y_test: pd.Series,
                         clf,
                         params: dict,
                         cat_features: list = None,
                         eval_metric: str = None,
                         early_stop: bool = False,
                         early_stopping_rounds: int = 100,
                         num_folds: int = 5,
                         random_state: int = 10,
                         shuffle: bool = True):
    """
    Получает результаты при помощи кросс-валидации для задачи Классиификации
    """
    folds = StratifiedKFold(n_splits=num_folds, 
                            random_state=random_state, 
                            shuffle=shuffle)
    score_oof = []
    pred_test = []
    pred_prob_test = []


    for fold, (train_index, 
               test_index) in enumerate(folds.split(X_train, y_train)):
        X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

        model = clf(**params)

        if early_stop == True:
            if eval_metric is None:
                model.fit(X_train_,
                          y_train_,
                          eval_set=[(X_val, y_val)],
                          cat_features=cat_features,
                          silent=True,
                          early_stopping_rounds=early_stopping_rounds)
            else:
                model.fit(X_train_,
                          y_train_,
                          eval_set=[(X_val, y_val)],
                          eval_metric=eval_metric,
                          silent=True,
                          cat_features=cat_features,
                          early_stopping_rounds=early_stopping_rounds)
        else:
            model.fit(X_train_, y_train_, cat_features=cat_features)

        y_pred_val = model.predict_proba(X_val)[:, 1]
        y_pred = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)

        print(
            "Fold:", fold + 1,
            "AUC SCORE %.3f" % roc_auc_score(y_val, y_pred_val))
        print("---")

        # oof list
        score_oof.append(roc_auc_score(y_val, y_pred_val))
        # holdout list
        pred_test.append(y_pred)
        pred_prob_test.append(y_pred_prob)

    return score_oof, pred_test, pred_prob_test

In [23]:
score_oof, pred_test, pred_prob_test = cross_validation_cat(
    X_train,
    y_train,
    X_test,
    y_test,
    early_stop=True,
    early_stopping_rounds=100,
    num_folds=N_FOLDS,
    random_state=RAND,
    clf=CatBoostClassifier,
    params=cat_best)

Fold: 1 AUC SCORE 0.811
---
Fold: 2 AUC SCORE 0.815
---
Fold: 3 AUC SCORE 0.791
---
Fold: 4 AUC SCORE 0.805
---
Fold: 5 AUC SCORE 0.797
---


In [24]:
fin_test_pred = stats.mode(np.column_stack(pred_test), axis=1)[0]
fin_test_pred_prob = np.mean(pred_prob_test, axis=0)

print(f'ROC-AUC mean OOF = {np.mean(score_oof)}')
print(f'ROC-AUC HOLDOUT = {roc_auc_score(y_test, fin_test_pred_prob[:, 1])}')

ROC-AUC mean OOF = 0.803802800791982
ROC-AUC HOLDOUT = 0.8105923110533095


**Вывод:** После обучения алгоритма с использованием кросс-валидации, ROC-AUC на тестовой выборке стал еще лучше. 

In [25]:
metrics = metrics.append(
    get_metrics(y_test, fin_test_pred, fin_test_pred_prob, name='CatGridVal'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393
0,CatTest,0.715094,0.802328,0.706845,0.724638,0.715631,0.535439
0,CatGrid,0.717736,0.805724,0.70821,0.730359,0.719114,0.537844
0,CatGridVal,0.723208,0.810592,0.713178,0.736842,0.724817,0.530456


**Вывод:** После проделанных манипуляций, метрики, по сравнению с Baseline, улучшились. 

# Stacking

In [26]:
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()

In [27]:
def crossval_predict(model: str,
                     X_train: pd.DataFrame, 
                     y_train: pd.Series, 
                     X_test: pd.DataFrame, 
                     y_test: pd.Series,
                     name: str, 
                     best_param=None) -> pd.DataFrame:
    """
    Получает новые мета-признаки при помощи кросс-валидации
    """
    folds = StratifiedKFold(n_splits=N_FOLDS)
    pred_val = []
    pred_prob_val = []
    
    for fold, (train_index, test_index) in enumerate(folds.split(X_train, y_train)):
        X_train_, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_, y_val = y_train.iloc[train_index], y_train.iloc[test_index]
        if model == 'LGBMClassifier':
            if best_param == None:
                clf = LGBMClassifier(random_state=RAND)
                clf.fit(X_train_,
                        y_train_, 
                        eval_metric="auc", 
                        eval_set=[(X_val, y_val)], 
                        verbose=False,
                        early_stopping_rounds=100)
            else:
                clf = LGBMClassifier(**best_param)
                clf.fit(X_train_,
                        y_train_, 
                        eval_metric="auc", 
                        eval_set=[(X_val, y_val)], 
                        verbose=False, 
                        early_stopping_rounds=100)
        elif model == 'CatBoostClassifier': 
            if best_param == None:
                clf = CatBoostClassifier(random_state=RAND, eval_metric="AUC")
                clf.fit(X_train_,
                        y_train_,
                        eval_set=[(X_val, y_val)], 
                        use_best_model=True, 
                        silent=True,
                        early_stopping_rounds=100)
            else:
                clf = CatBoostClassifier(**best_param, eval_metric="AUC")
                clf.fit(X_train_,
                        y_train_,
                        eval_set=[(X_val, y_val)], 
                        use_best_model=True, 
                        silent=True, 
                        early_stopping_rounds=100)
        else:
            clf.fit(X_train_, y_train_)
            
        y_pred_val = clf.predict(X_val)
        y_pred_prob_val = clf.predict_proba(X_val)

        print("Fold:", fold + 1, 
              "ROC-AUC SCORE %.3f" % roc_auc_score(y_val, y_pred_prob_val[:, 1]))
        print("---")

        # holdout list
        pred_val.append(y_pred_val)
        pred_prob_val.append(y_pred_prob_val)

    if model == 'CatBoostClassifier':
        clf.fit(X_train, y_train, silent=True)
    else:
        clf.fit(X_train, y_train)
    
    meta_X[name] = np.concatenate(pred_prob_val)[:,1]
    meta_X_test[name] = clf.predict_proba(X_test)[:,1]
            
    return meta_X, meta_X_test

## LightGBM baseline

In [28]:
meta_X, meta_X_test = crossval_predict(model='LGBMClassifier', 
                                       X_train=X_train, 
                                       y_train=y_train, 
                                       X_test=X_test, 
                                       y_test=y_test, 
                                       name='lgb_1')

Fold: 1 ROC-AUC SCORE 0.797
---
Fold: 2 ROC-AUC SCORE 0.787
---
Fold: 3 ROC-AUC SCORE 0.790
---
Fold: 4 ROC-AUC SCORE 0.794
---
Fold: 5 ROC-AUC SCORE 0.794
---


## LightGBM tuning 1

In [29]:
def objective_lgb_1(trial, X, y, N_FOLDS, random_state):
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.021633553993414788]),
        #"learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 100, 70000, step=100),
        "reg_alpha": trial.suggest_int("reg_alpha", 0, 100),
        "reg_lambda": trial.suggest_int("reg_lambda", 0, 100),
        "min_split_gain": trial.suggest_int("min_split_gain", 0, 20),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "subsample_freq": trial.suggest_categorical("subsample_freq", [1]),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "random_state": trial.suggest_categorical("random_state", [random_state])
    }
    
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    cv_predicts = np.empty(N_FOLDS)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, "auc")
        model = LGBMClassifier(**lgb_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric="auc",
                  early_stopping_rounds=100,
                  callbacks=[pruning_callback],
                  verbose=False)

        preds = model.predict_proba(X_test)[:, 1]
        cv_predicts[idx] = roc_auc_score(y_test, preds)

    return np.mean(cv_predicts)

In [30]:
study_lgb_1 = optuna.create_study(direction="maximize", study_name="tun_1")
func = lambda trial: objective_lgb_1(trial, 
                                     X_train, 
                                     y_train, 
                                     N_FOLDS=N_FOLDS, 
                                     random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgb_1.optimize(func, n_trials=20, show_progress_bar=True)

[32m[I 2023-04-16 17:01:43,870][0m A new study created in memory with name: tun_1[0m


  0%|          | 0/20 [00:00<?, ?it/s]

In [31]:
meta_X, meta_X_test = crossval_predict(model='LGBMClassifier', 
                                       X_train=X_train, 
                                       y_train=y_train, 
                                       X_test=X_test, 
                                       y_test=y_test, 
                                       name='lgb_2', 
                                       best_param=study_lgb_1.best_params)

Fold: 1 ROC-AUC SCORE 0.772
---
Fold: 2 ROC-AUC SCORE 0.773
---
Fold: 3 ROC-AUC SCORE 0.776
---
Fold: 4 ROC-AUC SCORE 0.778
---
Fold: 5 ROC-AUC SCORE 0.776
---


## LightGBM tuning 2

In [32]:
study_lgb_2 = optuna.create_study(direction="maximize", study_name="tun_2")
func = lambda trial: objective_lgb_1(trial, 
                                     X_train, 
                                     y_train, 
                                     N_FOLDS=N_FOLDS, 
                                     random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgb_2.optimize(func, n_trials=20, show_progress_bar=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [33]:
meta_X, meta_X_test = crossval_predict(model='LGBMClassifier', 
                                       X_train=X_train, 
                                       y_train=y_train, 
                                       X_test=X_test, 
                                       y_test=y_test, 
                                       name='lgb_2', 
                                       best_param=study_lgb_2.best_params)

Fold: 1 ROC-AUC SCORE 0.776
---
Fold: 2 ROC-AUC SCORE 0.776
---
Fold: 3 ROC-AUC SCORE 0.783
---
Fold: 4 ROC-AUC SCORE 0.782
---
Fold: 5 ROC-AUC SCORE 0.779
---


## LightGBM tuning 3

In [34]:
study_lgb_3 = optuna.create_study(direction="maximize", study_name="tun_3")
func = lambda trial: objective_lgb_1(trial, 
                                   X_train, 
                                   y_train, 
                                   N_FOLDS=N_FOLDS, 
                                   random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study_lgb_3.optimize(func, n_trials=20, show_progress_bar=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [35]:
meta_X, meta_X_test = crossval_predict(model='LGBMClassifier', 
                                       X_train=X_train, 
                                       y_train=y_train, 
                                       X_test=X_test, 
                                       y_test=y_test, 
                                       name='lgb_3', 
                                       best_param=study_lgb_3.best_params)

Fold: 1 ROC-AUC SCORE 0.787
---
Fold: 2 ROC-AUC SCORE 0.784
---
Fold: 3 ROC-AUC SCORE 0.788
---
Fold: 4 ROC-AUC SCORE 0.787
---
Fold: 5 ROC-AUC SCORE 0.786
---


## CatBoost tuning 

In [36]:
meta_X, meta_X_test = crossval_predict(model='CatBoostClassifier', 
                                       X_train=X_train, 
                                       y_train=y_train, 
                                       X_test=X_test, 
                                       y_test=y_test, 
                                       name='cat_1', 
                                       best_param=cat_best)

Fold: 1 ROC-AUC SCORE 0.805
---
Fold: 2 ROC-AUC SCORE 0.796
---
Fold: 3 ROC-AUC SCORE 0.805
---
Fold: 4 ROC-AUC SCORE 0.803
---
Fold: 5 ROC-AUC SCORE 0.803
---


## Final meta model

In [37]:
final_clf = LogisticRegression(random_state=RAND, class_weight='balanced')
final_clf.fit(meta_X, y_train)

In [38]:
y_pred_final = final_clf.predict(meta_X)
y_pred_proba_final = final_clf.predict_proba(meta_X)

metrics = metrics.append(
    get_metrics(y_train, y_pred_final, y_pred_proba_final, name='StakingTrain'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393
0,CatTest,0.715094,0.802328,0.706845,0.724638,0.715631,0.535439
0,CatGrid,0.717736,0.805724,0.70821,0.730359,0.719114,0.537844
0,CatGridVal,0.723208,0.810592,0.713178,0.736842,0.724817,0.530456
0,StakingTrain,0.712642,0.795742,0.724432,0.709912,0.717099,0.54489


In [39]:
y_pred_final = final_clf.predict(meta_X_test)
y_pred_proba_final = final_clf.predict_proba(meta_X_test)

metrics = metrics.append(
    get_metrics(y_test, y_pred_final, y_pred_proba_final, name='StakingTest'))
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LightTrain,0.853605,0.932076,0.850097,0.866615,0.858277,0.414676
0,LightTest,0.697547,0.786481,0.69061,0.704043,0.697262,0.551623
0,CatTrain,0.874719,0.948916,0.874374,0.881765,0.878054,0.378393
0,CatTest,0.715094,0.802328,0.706845,0.724638,0.715631,0.535439
0,CatGrid,0.717736,0.805724,0.70821,0.730359,0.719114,0.537844
0,CatGridVal,0.723208,0.810592,0.713178,0.736842,0.724817,0.530456
0,StakingTrain,0.712642,0.795742,0.724432,0.709912,0.717099,0.54489
0,StakingTest,0.716038,0.806738,0.716389,0.705187,0.710744,0.530879


In [40]:
check_overfitting(y_train=y_train,
                  y_test=y_test,
                  y_train_proba=final_clf.predict_proba(meta_X)[:, 1],
                  y_test_proba=final_clf.predict_proba(meta_X_test)[:, 1])

roc_auc_score train: 0.794
roc_auc_score test: 0.814
delta = 2.4 %


**Вывод:** Таким образом. после применения стейкинга, метрики остались примерно на том же уровне, но переобучение стало меньше. 