In [69]:
import warnings

import numpy as np
import optuna
import pandas as pd

from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor, early_stopping, log_evaluation

from sklearn.ensemble import StackingClassifier, StackingRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import roc_auc_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from get_metrics import get_metrics_classification, get_metrics_regression

# NOTE(review): this silences every warning, including library deprecation
# notices — consider narrowing the filter when upgrading dependencies.
warnings.filterwarnings("ignore");

# Directory the prepared CSV splits are read from.
PATH = '../MTS ML Cup/'
# Seed passed as random_state to the CV splitters and models below.
RAND = 42
# Number of cross-validation folds used for tuning and stacking.
N_FOLDS = 3

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

Прочтем файлы с данными

In [4]:
# Feature matrices. Below, X_train feeds the Optuna searches and the stacking
# models; X_train_ / X_val are used to fit the tuned boosters with early
# stopping; X_test is used only for the reported metrics.
# NOTE(review): presumably X_train_ and X_val are a split of X_train — confirm
# against the notebook that produced these files.
X_train = pd.read_csv(PATH+'X_train.csv')
X_train_ = pd.read_csv(PATH+'X_train_.csv')
X_val = pd.read_csv(PATH+'X_val.csv')
X_test = pd.read_csv(PATH+'X_test.csv')

# Target frames matching the feature matrices above; the `is_male` and `age`
# columns are the ones read later in this notebook.
y_train = pd.read_csv(PATH+'y_train.csv')
y_train_ = pd.read_csv(PATH+'y_train_.csv')
y_val = pd.read_csv(PATH+'y_val.csv')
y_test = pd.read_csv(PATH+'y_test.csv')

На всякий случай проверим данные на наличие пропусков

In [72]:
# Print the total number of missing cells for each loaded frame, in load order.
data_list = [
    X_train, X_train_, X_val, X_test,   # features
    y_train, y_train_, y_val, y_test,   # targets
]

for data in data_list:
    print(data.isna().to_numpy().sum())

0
0
0
0
0
0
0
0


# Классификация. Определение пола

Функция для проверки на переобучение (для классификации)

In [5]:
def check_overfitting(model, X_train, y_train, X_test, y_test):
    """
    Print train and test ROC AUC of a fitted classifier and their relative gap.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_train, y_train : features and true labels of the training sample
    X_test, y_test : features and true labels of the held-out sample

    Column 1 of ``predict_proba`` is used as the score. Three lines are
    printed (train score, test score, gap as a percentage of the test score);
    nothing is returned.

    NOTE: a function with the same name is defined again in the regression
    part of this notebook and replaces this one once that cell has run.
    """
    score_train = model.predict_proba(X_train)[:, 1]
    score_test = model.predict_proba(X_test)[:, 1]
    value_train = roc_auc_score(y_train, score_train)
    value_test = roc_auc_score(y_test, score_test)

    # Format everything through the f-string: the original mixed f-string
    # interpolation with %-formatting in the same literal.
    metric_name = roc_auc_score.__name__
    print(f'{metric_name} train: {value_train:.3f}')
    print(f'{metric_name} test: {value_test:.3f}')
    print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

## LightGBM

Попробуем улучшить значения метрик baseline (roc-auc 0.768) с помощью optuna. Сначала подберем лучшее количество базовых алгоритмов, потом зафиксируем его. Затем подберем лучшее значение шага и так же зафиксируем его. И потом с подобранными количеством базовых алгоритмов и шагом будем крутить остальные параметры.

In [6]:
# Optuna objective for the gender classifier
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    """
    Optuna objective: mean out-of-fold ROC AUC of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna trial used to sample hyperparameters and report for pruning
    X : feature frame (rows are selected with ``.iloc``)
    y : binary target aligned with ``X`` (rows are selected with ``.iloc``)
    N_FOLDS : number of stratified CV folds
    random_state : seed for the CV splitter and the model

    Returns
    -------
    float
        ROC AUC averaged over the folds.

    NOTE: a function with the same name is defined again in the regression
    part of this notebook and replaces this one once that cell has run.
    """
    # n_estimators and learning_rate were chosen in earlier passes (over
    # [1000, 1500, 2000, 2500, 3000, 4000] and suggest_float(0.001, 0.3)).
    # They stay as single-choice suggestions so that study.best_params still
    # contains the full parameter set used to build the final model.
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [2000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.09142235679746477]),
        "num_leaves": trial.suggest_int("num_leaves", 15, 40, step=5),
        "min_child_samples": trial.suggest_int("min_child_samples", 15, 40, step=5),
        "objective": trial.suggest_categorical("objective", ["binary"]),
        "random_state": trial.suggest_categorical("random_state", [random_state])
    }

    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)

    fold_scores = np.empty(N_FOLDS)
    for idx, (fit_idx, holdout_idx) in enumerate(cv.split(X, y)):
        # Fold-local names, so the notebook-level X_train / X_test are not shadowed.
        X_fit, X_holdout = X.iloc[fit_idx], X.iloc[holdout_idx]
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, "auc")
        model = LGBMClassifier(**lgb_params)
        # Early stopping and log silencing are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` fit arguments are deprecated in
        # LightGBM 3.3 and removed in 4.0.
        model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_holdout, y_holdout)],
                  eval_metric="auc",
                  callbacks=[early_stopping(stopping_rounds=100, verbose=False),
                             log_evaluation(period=0),
                             pruning_callback])

        probas = model.predict_proba(X_holdout)[:, 1]
        fold_scores[idx] = roc_auc_score(y_holdout, probas)

    return np.mean(fold_scores)

In [7]:
%%time
study = optuna.create_study(direction="maximize", study_name="LGB_01")
func = lambda trial: objective_lgb(
    trial, X_train, y_train.is_male, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=7, show_progress_bar=True)

[32m[I 2023-04-17 12:21:38,771][0m A new study created in memory with name: LGB_01[0m


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))


CPU times: user 46min 36s, sys: 5min 3s, total: 51min 40s
Wall time: 6min 36s


Лучшие значения параметров

In [9]:
# Hyperparameters of the best trial in the classification study.
study.best_params

{'n_estimators': 2000,
 'learning_rate': 0.09142235679746477,
 'num_leaves': 40,
 'min_child_samples': 15,
 'objective': 'binary',
 'random_state': 42}

Обучим модель

In [10]:
%%time
eval_set = [(X_val, y_val.is_male)]
ratio = float(np.sum(y_train_.is_male == 0)) / np.sum(y_train_.is_male == 1)

lgbm_clf = LGBMClassifier(**study.best_params, scale_pos_weight=ratio)

lgbm_clf.fit(X_train_,
             y_train_.is_male,
             eval_set=eval_set,
             verbose=False,
             early_stopping_rounds=100);

CPU times: user 2min 8s, sys: 10.9 s, total: 2min 19s
Wall time: 18.5 s


Проверим факт переобучения

In [11]:
# Compare ROC AUC on the data the classifier was fitted on with the test split.
check_overfitting(lgbm_clf,
                  X_train_,
                  y_train_.is_male,
                  X_test,
                  y_test.is_male)

roc_auc_score train: 0.997
roc_auc_score test: 0.906
delta = 10.1 %


Сделаем предикт

In [12]:
# Class labels and class probabilities for the test split.
predict = lgbm_clf.predict(X_test)
proba = lgbm_clf.predict_proba(X_test)

Таблица с метриками

In [13]:
# Start the classification metrics table with the tuned LightGBM row.
metrics = get_metrics_classification(y_test=y_test.is_male,
                                     y_pred=predict,
                                     y_score=proba,
                                     name='LGBMClassifier_Optuna')
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LGBMClassifier_Optuna,0.818113,0.9056,0.8299,0.814836,0.822299,0.391573


## CatBoost

Попробуем улучшить значения метрик baseline (roc-auc 0.858) с помощью Random Search. Подбирать будем аналогичным предыдущему пункту способом (сначала n_estimators, затем learning_rate, а потом все остальное)

In [14]:
%%time
grid = {
#     "n_estimators": [1000, 1500, 2000, 2500, 3000, 4000],
    "n_estimators": [4000],
#     "learning_rate": np.logspace(-3, -0.9, 10),
    "learning_rate" : [0.12589254117941673],
    "boosting_type" : ['Ordered', 'Plain'],
    "max_depth": list(range(3, 17)),
    "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "No"],
    'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"],
    "random_state": [RAND]

}

model = CatBoostClassifier(loss_function="Logloss",
                           eval_metric="AUC",
                           silent=True)
grid_search_result = model.randomized_search(grid,
                                             X=X_train_,
                                             y=y_train_.is_male,
                                             plot=False);


bestTest = 0.8664721863
bestIteration = 3996

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	loss: 0.8664722	best: 0.8664722 (0)	total: 42.7s	remaining: 6m 23s

bestTest = 0.94325345
bestIteration = 3998

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
1:	loss: 0.9432535	best: 0.9432535 (1)	total: 2m 9s	remaining: 8m 38s

bestTest = 0.9203525563
bestIteration = 3999

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
2:	loss: 0.9203526	best: 0.9432535 (1)	total: 3m 18s	remaining: 7m 43s

bestTest = 0.8681434651
bestIteration = 3999

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
3:	loss: 0.8681435	best: 0.9432535 (1)	total: 4m 39s	remaining: 6m 

Лучшие значения параметров

In [15]:
# Parameter combination selected by the CatBoost randomized search.
grid_search_result['params']

{'depth': 12,
 'random_seed': 42,
 'iterations': 4000,
 'learning_rate': 0.12589254117941673,
 'grow_policy': 'Lossguide',
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS'}

Обучим модель

In [16]:
%%time
ctbst_clf = CatBoostClassifier(**grid_search_result['params'],
                               scale_pos_weight=ratio,
                               loss_function='Logloss',
                               eval_metric='AUC')
ctbst_clf.fit(X_train_,
              y_train_.is_male,
              eval_set=eval_set,
              verbose=False,
              early_stopping_rounds=100);

CPU times: user 13min 37s, sys: 1min 15s, total: 14min 53s
Wall time: 2min 43s


<catboost.core.CatBoostClassifier at 0x7fdb499fb7c0>

Проверим факт переобучения

In [17]:
# Compare ROC AUC on the data the classifier was fitted on with the test split.
check_overfitting(ctbst_clf,
                  X_train_,
                  y_train_.is_male,
                  X_test,
                  y_test.is_male)

roc_auc_score train: 1.000
roc_auc_score test: 0.914
delta = 9.5 %


Сделаем предикт

In [18]:
# Class labels and class probabilities for the test split.
predict = ctbst_clf.predict(X_test)
proba = ctbst_clf.predict_proba(X_test)

Таблица с метриками

In [19]:
# Add the CatBoost row to the classification metrics table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the row again.
metrics = pd.concat([metrics,
                     get_metrics_classification(y_test=y_test.is_male,
                                                y_pred=predict,
                                                y_score=proba,
                                                name='CatBoostClassifier_RandomSearch')])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LGBMClassifier_Optuna,0.818113,0.9056,0.8299,0.814836,0.822299,0.391573
0,CatBoostClassifier_RandomSearch,0.829832,0.913606,0.837051,0.832596,0.834817,0.375223


## Stacking

Попробуем улучшить результат с помощью стекинга. В качестве базовых алгоритмов возьмем Tuned LGBMClassifier, Tuned CatBoostClassifier и Baseline RandomForestClassifier. Мета-алгоритмом будет логистическая регрессия.

In [20]:
# Base learners: the two tuned boosters from above plus a default random forest
# with balanced class weights.
# NOTE(review): scikit-learn stacking presumably refits copies of these
# estimators inside `fit`, without the early stopping used above — confirm.
estimators = [('lgb', lgbm_clf),
              ('ctb', ctbst_clf),
              ('rf', RandomForestClassifier(class_weight='balanced',
                                            random_state=RAND))]

# Unshuffled stratified folds used to produce the meta-features.
cv = StratifiedKFold(n_splits=N_FOLDS)

# Logistic regression is the meta-learner on top of the base predictions.
meta = StackingClassifier(
    estimators=estimators,
    cv=cv,
    final_estimator=LogisticRegression(random_state=RAND, class_weight='balanced'))

Обучим модель

In [21]:
# The stack is fitted on X_train (not X_train_): no separate validation set is passed.
meta.fit(X_train, y_train.is_male);

Проверим факт переобучения

In [22]:
# Score the train side on X_train, the data `meta` was actually fitted on
# (the regression stack later in the notebook is checked the same way).
check_overfitting(meta,
                  X_train,
                  y_train.is_male,
                  X_test,
                  y_test.is_male)

roc_auc_score train: 0.999
roc_auc_score test: 0.924
delta = 8.1 %


Сделаем предикт

In [23]:
# Class labels and class probabilities for the test split.
predict = meta.predict(X_test)
proba = meta.predict_proba(X_test)

Таблица с метриками

In [24]:
# Add the stacking row to the classification metrics table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the row again.
metrics = pd.concat([metrics,
                     get_metrics_classification(y_test=y_test.is_male,
                                                y_pred=predict,
                                                y_score=proba,
                                                name='StackingClassifier')])

metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,LGBMClassifier_Optuna,0.818113,0.9056,0.8299,0.814836,0.822299,0.391573
0,CatBoostClassifier_RandomSearch,0.829832,0.913606,0.837051,0.832596,0.834817,0.375223
0,StackingClassifier,0.844792,0.92426,0.854426,0.843128,0.848739,0.359731


## Выводы

После подбора гиперпараметров значения метрик значительно улучшились:
- Baseline LGB (roc-auc=0.768) -> Optuna LGB (roc-auc=0.906)
- Baseline CatBoost (roc-auc=0.858) -> Random Search CatBoost (roc-auc=0.914)

Также удалось немного улучшить результаты, используя стекинг (roc-auc=0.924)

# Регрессия. Определение возраста

Функция для проверки на переобучение (для регрессии)

In [9]:
def check_overfitting(model, X_train, y_train, X_test, y_test, metric_fun=None):
    """
    Print a model's train and test scores and their relative gap.

    This definition replaces the classification helper of the same name
    defined earlier in the notebook, so it accepts both call styles:

    - ``metric_fun`` given: ``model.predict`` output is scored with it
      (regression usage);
    - ``metric_fun`` omitted: column 1 of ``model.predict_proba`` is scored
      with ``roc_auc_score`` (the earlier classification behaviour), so the
      classification cells still work after this cell has been executed.

    Parameters
    ----------
    model : fitted estimator
    X_train, y_train : features and true targets of the training sample
    X_test, y_test : features and true targets of the held-out sample
    metric_fun : callable ``(y_true, y_pred) -> number``, optional

    Three lines are printed (train score, test score, gap as a percentage of
    the test score); nothing is returned.
    """
    if metric_fun is None:
        metric_fun = roc_auc_score

        def predict_scores(X):
            return model.predict_proba(X)[:, 1]
    else:
        predict_scores = model.predict

    value_train = metric_fun(y_train, predict_scores(X_train))
    value_test = metric_fun(y_test, predict_scores(X_test))

    # Format everything through the f-string: the original mixed f-string
    # interpolation with %-formatting in the same literal.
    metric_name = metric_fun.__name__
    print(f'{metric_name} train: {value_train:.3f}')
    print(f'{metric_name} test: {value_test:.3f}')
    print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

## LightGBM

Попробуем улучшить значения метрик baseline (mae 7.519) с помощью optuna. Сначала подберем лучшее количество базовых алгоритмов, потом зафиксируем его. Затем подберем лучшее значение шага и так же зафиксируем его. И потом с подобранными количеством базовых алгоритмов и шагом будем крутить остальные параметры.

In [26]:
# Optuna objective for the age regressor
def objective_lgb(trial, X, y, N_FOLDS, random_state=RAND):
    """
    Optuna objective: mean out-of-fold MAE of an LGBMRegressor.

    Parameters
    ----------
    trial : optuna trial used to sample hyperparameters and report for pruning
    X : feature frame (rows are selected with ``.iloc``)
    y : numeric target aligned with ``X`` (rows are selected with ``.iloc``)
    N_FOLDS : number of CV folds
    random_state : seed for the CV splitter and the model

    Returns
    -------
    float
        Mean absolute error averaged over the folds.

    NOTE: this definition replaces the classification objective of the same
    name defined earlier in the notebook.
    """
    # n_estimators and learning_rate were chosen in earlier passes (over
    # [1000, 1200, 1500, 1700, 2000, 2300] and suggest_float(0.001, 0.3)).
    # They stay as single-choice suggestions so that study.best_params still
    # contains the full parameter set used to build the final model.
    lgb_params = {
        "n_estimators": trial.suggest_categorical("n_estimators", [2000]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.2902529161087996]),
        "num_leaves": trial.suggest_int("num_leaves", 10, 30, step=5),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 50, step=5),
        "objective": trial.suggest_categorical("objective", ["mae"]),
        "random_state": trial.suggest_categorical("random_state", [random_state])
    }

    cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)

    fold_scores = np.empty(N_FOLDS)
    for idx, (fit_idx, holdout_idx) in enumerate(cv.split(X, y)):
        # Fold-local names, so the notebook-level X_train / X_test are not shadowed.
        X_fit, X_holdout = X.iloc[fit_idx], X.iloc[holdout_idx]
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]

        pruning_callback = optuna.integration.LightGBMPruningCallback(
            trial, "l1")
        model = LGBMRegressor(**lgb_params)
        # Early stopping and log silencing are passed as callbacks: the
        # `early_stopping_rounds` / `verbose` fit arguments are deprecated in
        # LightGBM 3.3 and removed in 4.0.
        model.fit(X_fit,
                  y_fit,
                  eval_set=[(X_holdout, y_holdout)],
                  eval_metric="mae",
                  callbacks=[early_stopping(stopping_rounds=100, verbose=False),
                             log_evaluation(period=0),
                             pruning_callback])

        preds = model.predict(X_holdout)
        fold_scores[idx] = mean_absolute_error(y_holdout, preds)

    return np.mean(fold_scores)

In [27]:
%%time
study = optuna.create_study(direction="minimize", study_name="LGB_02")
func = lambda trial: objective_lgb(
    trial, X_train, y_train.age, N_FOLDS=N_FOLDS, random_state=RAND)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(func, n_trials=7, show_progress_bar=True)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7.0), HTML(value='')))


CPU times: user 46min 4s, sys: 5min 42s, total: 51min 47s
Wall time: 6min 34s


Лучшие значения параметров

In [28]:
# Hyperparameters of the best trial in the regression study.
study.best_params

{'n_estimators': 2000,
 'learning_rate': 0.2902529161087996,
 'num_leaves': 30,
 'min_child_samples': 20,
 'objective': 'mae',
 'random_state': 42}

Обучим модель

In [29]:
%%time
eval_set = [(X_val, y_val.age)]

lgbm_reg = LGBMRegressor(**study.best_params)

lgbm_reg.fit(X_train_,
             y_train_.age,
             eval_set=eval_set,
             verbose=False,
             early_stopping_rounds=100);

CPU times: user 2min 13s, sys: 14.9 s, total: 2min 27s
Wall time: 19.6 s


Проверим факт переобучения

In [30]:
# Compare MAE on the data the regressor was fitted on with the test split.
check_overfitting(lgbm_reg,
                  X_train_,
                  y_train_.age,
                  X_test,
                  y_test.age,
                  metric_fun=mean_absolute_error)

mean_absolute_error train: 3.297
mean_absolute_error test: 5.303
delta = 37.8 %


Сделаем предикт

In [31]:
# Age predictions for the test split.
predict = lgbm_reg.predict(X_test)

Таблица с метриками

In [32]:
# Start the regression metrics table (this rebinds `metrics`, replacing the
# classification table) with the tuned LightGBM row.
metrics = get_metrics_regression(y_test=y_test.age,
                                 y_pred=predict,
                                 X_test=X_test,
                                 name='LGBMRegressor_Optuna')

# Display with the model name as index, rounded to 3 decimals.
round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LGBMRegressor_Optuna,5.303,65.682,8.104,0.207,0.477,-2.893,14.804,14.458


## CatBoost

Попробуем улучшить значения метрик baseline (mae 6.613) с помощью Random Search. Подбирать будем аналогичным предыдущему пункту способом (сначала n_estimators, затем learning_rate, а потом все остальное)

In [45]:
%%time
grid = {
#     "n_estimators": [1000, 1200, 1500, 1700, 2000, 2300],
    "n_estimators": [2300],
#     "learning_rate": np.logspace(-3, -0.9, 10),
    "learning_rate" : [0.12589254117941673],
    "boosting_type" : ['Ordered', 'Plain'],
    "max_depth": list(range(3, 17)),
    "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "No"],
    'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"],
    "random_state": [RAND]

}

model = CatBoostRegressor(loss_function="MAE",
                          eval_metric="MAE",
                          silent=True)
grid_search_result = model.randomized_search(grid,
                                             X=X_train_,
                                             y=y_train_.age,
                                             plot=False);


bestTest = 6.940566737
bestIteration = 2299

0:	loss: 6.9405667	best: 6.9405667 (0)	total: 19.1s	remaining: 2m 51s

bestTest = 5.263099178
bestIteration = 2299

1:	loss: 5.2630992	best: 5.2630992 (1)	total: 1m 3s	remaining: 4m 15s

bestTest = 5.944241067
bestIteration = 2299

2:	loss: 5.9442411	best: 5.2630992 (1)	total: 1m 37s	remaining: 3m 47s

bestTest = 6.696874157
bestIteration = 2299

3:	loss: 6.6968742	best: 5.2630992 (1)	total: 2m 18s	remaining: 3m 28s

bestTest = 5.018859413
bestIteration = 2299

4:	loss: 5.0188594	best: 5.0188594 (4)	total: 3m 53s	remaining: 3m 53s

bestTest = 4.839146924
bestIteration = 2299

5:	loss: 4.8391469	best: 4.8391469 (5)	total: 5m 32s	remaining: 3m 41s
Estimating final quality...
Training on fold [0/3]

bestTest = 5.056171003
bestIteration = 2299

Training on fold [1/3]

bestTest = 5.03675986
bestIteration = 2298

Training on fold [2/3]

bestTest = 4.988859112
bestIteration = 2299

CPU times: user 1h 12min 4s, sys: 4min 37s, total: 1h 16min 42s
Wa

Лучшие значения параметров

In [46]:
# Parameter combination selected by the CatBoost randomized search.
grid_search_result['params']

{'depth': 12,
 'random_seed': 42,
 'iterations': 2300,
 'learning_rate': 0.12589254117941673,
 'grow_policy': 'Lossguide',
 'boosting_type': 'Plain',
 'bootstrap_type': 'MVS'}

Обучим модель

In [47]:
%%time
ctbst_reg = CatBoostRegressor(**grid_search_result['params'],
                              loss_function='MAE',
                              eval_metric='MAE')
ctbst_reg.fit(X_train_,
              y_train_.age,
              eval_set=eval_set,
              verbose=False,
              early_stopping_rounds=100);

CPU times: user 9min 31s, sys: 45 s, total: 10min 16s
Wall time: 1min 53s


<catboost.core.CatBoostRegressor at 0x7ff450f19b80>

Проверим факт переобучения

In [48]:
# Compare MAE on the data the regressor was fitted on with the test split.
check_overfitting(ctbst_reg,
                  X_train_,
                  y_train_.age,
                  X_test,
                  y_test.age,
                  metric_fun=mean_absolute_error)

mean_absolute_error train: 4.043
mean_absolute_error test: 5.458
delta = 25.9 %


Сделаем предикт

In [49]:
# Age predictions for the test split.
predict = ctbst_reg.predict(X_test)

Таблица с метриками

In [50]:
# Add the CatBoost row to the regression metrics table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the row again.
metrics = pd.concat([metrics,
                     get_metrics_regression(y_test=y_test.age,
                                            y_pred=predict,
                                            X_test=X_test,
                                            name='CatBoostRegressor_RandomSearch')])

# Display with the model name as index, rounded to 3 decimals.
round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LGBMRegressor_Optuna,5.303,65.682,8.104,0.207,0.477,-2.893,14.804,14.458
CatBoostRegressor_RandomSearch,5.458,68.453,8.274,0.209,0.455,-2.624,15.099,14.881


## Stacking

Попробуем улучшить результат с помощью стекинга. В качестве базовых алгоритмов возьмем Tuned LGBMRegressor, Tuned CatBoostRegressor, Baseline LGBMRegressor и Baseline CatBoostRegressor. Мета-алгоритмом будет LinearRegression.

In [64]:
# Base learners: the two tuned boosters from above plus default-parameter
# LightGBM and CatBoost regressors.
# NOTE(review): scikit-learn stacking presumably refits copies of these
# estimators inside `fit`, without the early stopping used above — confirm.
estimators = [('lgb_tuned', lgbm_reg),
              ('lgb_baseline', LGBMRegressor(random_state=RAND)),
              ('ctb_tuned', ctbst_reg),
              ('ctb_baseline', CatBoostRegressor(random_state=RAND))]

# Unshuffled folds used to produce the meta-features.
cv = KFold(n_splits=N_FOLDS)

# Linear regression is the meta-learner on top of the base predictions.
meta = StackingRegressor(
    estimators=estimators,
    cv=cv,
    final_estimator=LinearRegression())

Обучим модель

In [65]:
# The stack is fitted on X_train (not X_train_): no separate validation set is passed.
meta.fit(X_train, y_train.age);

Проверим факт переобучения

In [68]:
# Compare MAE on X_train (the data `meta` was fitted on) with the test split.
check_overfitting(meta,
                  X_train,
                  y_train.age,
                  X_test,
                  y_test.age,
                  metric_fun=mean_absolute_error)

mean_absolute_error train: 3.613
mean_absolute_error test: 5.217
delta = 30.7 %


Сделаем предикт

In [70]:
# Age predictions for the test split.
predict = meta.predict(X_test)

Таблица с метриками

In [71]:
# Add the stacking row to the regression metrics table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# NOTE: re-running this cell adds the row again.
metrics = pd.concat([metrics,
                     get_metrics_regression(y_test=y_test.age,
                                            y_pred=predict,
                                            X_test=X_test,
                                            name='StackingRegressor')])

# Display with the model name as index, rounded to 3 decimals.
round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,MAE,MSE,RMSE,RMSLE,R2 adjusted,MPE_%,MAPE_%,WAPE_%
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LGBMRegressor_Optuna,5.303,65.682,8.104,0.207,0.477,-2.893,14.804,14.458
CatBoostRegressor_RandomSearch,5.458,68.453,8.274,0.209,0.455,-2.624,15.099,14.881
StackingRegressor,5.217,61.354,7.833,0.201,0.512,-4.588,14.839,14.224


## Выводы

После подбора гиперпараметров значения метрик стали лучше:
- Baseline LGB (mae=7.519) -> Optuna LGB (mae=5.303)
- Baseline CatBoost (mae=6.613) -> Random Search CatBoost (mae=5.458)

Также удалось немного улучшить результаты с помощью стекинга (mae=5.217)