In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve

from catboost import CatBoostClassifier

from xgboost import XGBClassifier

import optuna

import yaml

import joblib

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    df_metrics['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    df_metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

In [65]:
def get_metrics_in_dict(y_test, y_pred, y_score):
    """Метрики для задачи классификации"""
    metrics = {}
    
    metrics['Accuracy'] = float(round(accuracy_score(y_test, y_pred), 3))
    metrics['ROC_AUC'] = float(round(roc_auc_score(y_test, y_score[:, 1]), 3))
    metrics['Precision'] = float(round(precision_score(y_test, y_pred, 
                                                       zero_division=0), 3))
    metrics['Recall'] = float(round(recall_score(y_test, y_pred, 
                                                 zero_division=0), 3))
    metrics['f1'] = float(round(f1_score(y_test, y_pred, zero_division=0), 3))
    metrics['Logloss'] = float(round(log_loss(y_test, y_score), 3))
    
    return metrics

In [4]:
def open_file(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file)
    
def save_file(file_path, data):       
    with open(file_path, 'w') as file:
        yaml.dump(data, file)

In [5]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

preproc = config["preprocessing"]
train = config['train']['win_predictor']

In [6]:
ensemble_params = open_file(train['params'])

In [30]:
win_predictor_metrics = open_file(train['metrics'])

# Baseline

В данном блоке мы строим и обучаем стекинг моделей, предсказывающих победителя закупок. Параметры для моделей подбираются с помощью байесовского оптимизатора   

Результатом этого блока являются файл с моделями, лучшими параметрами и метриками качества.

In [10]:
df_train = pd.read_csv(preproc['train_data'])
df_train = df_train.set_index('index')

df_train[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,141936,1,406903.35,10960,40062,0,[ 0.12142407 -0.0336704 -0.00717449 -0.101091...,2,62.0_26,1,0.0,3
1,75199,1,299821.66,7160,40062,0,[ 0.02812369 0.02294252 0.0104262 0.006939...,3,26.2_65,4,0.0,3
2,97840,1,366250.0,8259,40062,0,[ 0.08153069 0.03020425 -0.01425114 -0.042313...,2,62.0_72,5,0.0,3
3,64052,1,85013.0,6350,40062,0,[ 7.34494067e-02 3.06627049e-02 -1.24606798e-...,2,58.2_46,6,0.0,3
4,432364,0,13554.0,9788,5495,1,[ 0.09296399 0.19767287 -0.05634406 -0.004251...,7,drug_77,1,0.0,2


In [11]:
df_test = pd.read_csv(preproc['test_data'])
df_test = df_test.set_index('index')

df_test[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,596065,0,53874.0,9582,2940,1,[ 0.12289912 0.24247403 -0.06769952 -0.004744...,3,drug_77,1,1.0,2
1,467821,1,7500000.0,9591,7538,1,[ 0.05361487 0.01407206 -0.00556184 -0.023678...,9,38.2_77,1,0.0,4
2,88928,1,281370.35,7831,11016,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,5
3,88928,1,281370.35,7831,574,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,8
4,88928,1,281370.35,7831,7749,0,[ 0.07224658 0.01950507 0.01632774 0.025729...,8,26.2_68,3,0.0,8


In [12]:
df_train = df_train.astype(preproc['change_type_columns'])
df_test = df_test.astype(preproc['change_type_columns'])

In [13]:
df_train = df_train.drop(columns=train['drop_columns'])
df_test = df_test.drop(columns=train['drop_columns'])

In [14]:
df_train[:5]

Unnamed: 0_level_0,forsmallbiz,price,supplier,is_winner,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,406903.35,40062,0,2,62.0_26,1,0.0,3
1,1,299821.66,40062,0,3,26.2_65,4,0.0,3
2,1,366250.0,40062,0,2,62.0_72,5,0.0,3
3,1,85013.0,40062,0,2,58.2_46,6,0.0,3
4,0,13554.0,5495,1,7,drug_77,1,0.0,2


In [18]:
df_test[:5]

Unnamed: 0_level_0,forsmallbiz,price,supplier,is_winner,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,53874.0,2940,1,3,drug_77,1,1.0,2
1,1,7500000.0,7538,1,9,38.2_77,1,0.0,4
2,1,281370.35,11016,0,8,26.2_68,3,0.0,5
3,1,281370.35,574,0,8,26.2_68,3,0.0,8
4,1,281370.35,7749,0,8,26.2_68,3,0.0,8


In [19]:
X = df_train.drop('is_winner', axis=1)
Y = df_train['is_winner']

In [20]:
x_test = df_test.drop('is_winner', axis=1)
y_test = df_test['is_winner']

In [66]:
metrics = pd.DataFrame()

In [67]:
x_train_, x_val_, y_train_, y_val_ = train_test_split(X, Y,
                                                      test_size=train['test_size'],
                                                      stratify=Y,
                                                      random_state=train['random_state'])

ratio = y_val_[y_val_ == 0].shape[0] / y_val_[y_val_ == 1].shape[0]

## Catboost

In [68]:
model = CatBoostClassifier(random_state=train['random_state'], 
                           scale_pos_weight=ratio,
                           eval_metric='AUC',
                           cat_features=train['cat_features'])

model.fit(x_train_, y_train_,
          eval_set=[(x_val_, y_val_)],
          early_stopping_rounds=100,
          verbose=0)

<catboost.core.CatBoostClassifier at 0x7f2500701cc0>

In [69]:
y_pred = model.predict(x_train_)
y_score = model.predict_proba(x_train_)

metrics = pd.concat([metrics, get_metrics(y_train_, y_pred, y_score, 
                                          'Catboost_baseline_train')])

In [70]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Catboost_baseline_test')])

In [71]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_baseline_train,0.874073,0.936535,0.851756,0.770227,0.808942,0.310368
0,Catboost_baseline_test,0.648642,0.795729,0.495167,0.864295,0.629617,1.498652


In [72]:
win_predictor_metrics['basic_metrics']['catboost'] = get_metrics_in_dict(
    y_test, y_pred, y_score)

## XGboost

In [73]:
model = XGBClassifier(scale_pos_weight=ratio, 
                      random_state=train['random_state'],
                      tree_method='hist',
                      enable_categorical=True,
                      n_jobs=-1)

model.fit(x_train_, y_train_,
          eval_set=[(x_val_, y_val_)],
          eval_metric='auc',
          early_stopping_rounds=100,
          verbose=0)

In [74]:
y_pred = model.predict(x_train_)
y_score = model.predict_proba(x_train_)

metrics = pd.concat([metrics, get_metrics(y_train_, y_pred, y_score, 
                                          'Xgboost_baseline_train')])

In [75]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Xgboost_baseline_test')])

In [76]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_baseline_train,0.874073,0.936535,0.851756,0.770227,0.808942,0.310368
0,Catboost_baseline_test,0.648642,0.795729,0.495167,0.864295,0.629617,1.498652
0,Xgboost_baseline_train,0.875594,0.927699,0.8692,0.754039,0.807534,0.322288
0,Xgboost_baseline_test,0.648594,0.779512,0.495018,0.845074,0.624325,1.132961


In [77]:
win_predictor_metrics['basic_metrics']['xgboost'] = get_metrics_in_dict(
    y_test, y_pred, y_score)

# Tune

## Catboost

In [17]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series) -> float:
    """
    Функция для оптимизации гиперпараметров CatBoostClassifier с помощью Optuna.

    Parameters
    ----------
    trial : optuna.Trial
        Объект Trial из Optuna для выборки и хранения параметров.
    x : pd.DataFrame
        Матрица признаков.
    y : pd.Series
        Вектор целевой переменной.

    Returns
    -------
    float
        Среднее значение метрики AUC-ROC для скользящего контроля.

    """

    # Установка гиперпараметров для подбора
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.0787449098272658]),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 100),
        'border_count': trial.suggest_categorical('border_count', [128]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'random_state': train['random_state']  # предполагается, что `train` - глобальная переменная
    }

    # Выполнение скользящего контроля
    cv_pred = np.empty(train['N_FOLDS'])
    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])
    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]
        
        ratio = y_train_[y_train_ == 0].shape[0] / y_train_[y_train_ == 1].shape[0]
        
        model = CatBoostClassifier(
            scale_pos_weight=ratio,
            cat_features=['supplier', 'reg_code'],
            verbose=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  verbose=0)

        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)

    return np.mean(cv_pred)

In [20]:
func = lambda trial: objective(trial, X, Y)

study = optuna.create_study(direction="maximize")
study.optimize(func, show_progress_bar=True, n_trials=10, n_jobs=6)

[32m[I 2023-04-15 00:51:28,528][0m A new study created in memory with name: no-name-7c9e2f7f-cd62-416a-928f-0fe9d90dfcaa[0m


  0%|          | 0/10 [00:00<?, ?it/s]




KeyboardInterrupt



In [19]:
ensemble_params['catboost'] = study.best_params

In [20]:
save_file(train['params'], ensemble_params)

## XGboost

In [17]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, **kwargs) -> float:
    """
    This function defines the objective function for an Optuna study to tune hyperparameters
    for a LightGBM binary classification model. 

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.

    Returns:
        float: The mean of the cross-validation AUC-ROC scores for the given set of hyperparameters.
    """
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.21783913980394612]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        'max_bin': trial.suggest_int('max_bin', 0, 100),
        "gamma": trial.suggest_int("gamma", 0, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 20),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1e2, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5,1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
        'random_state': train['random_state'],
    }

    cv_pred = np.empty(train['N_FOLDS'])
    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        ratio = y_train_[y_train_ == 0].shape[0] / \
            y_train_[y_train_ == 1].shape[0]
        
        pruning = optuna.integration.XGBoostPruningCallback(trial, 'validation_0-auc')

        
        model = XGBClassifier(
            scale_pos_weight=ratio,
            tree_method='hist',
            enable_categorical=True,
            verbosity=0,
            n_jobs=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  eval_metric='auc',
                  callbacks=[pruning], 
                  verbose=0)

        
        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
        
    return (np.mean(cv_pred))

In [18]:
func = lambda trial: objective(trial, X, Y)

study = optuna.create_study(direction="maximize")
study.optimize(func, show_progress_bar=True, n_trials=20, n_jobs=6)

[32m[I 2023-04-15 11:48:06,008][0m A new study created in memory with name: no-name-975d2bc5-34db-4261-9813-941468bd0f7f[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2023-04-15 11:51:25,278][0m Trial 3 finished with value: 0.8834798005032682 and parameters: {'n_estimators': 1000, 'learning_rate': 0.21783913980394612, 'max_depth': 8, 'max_bin': 53, 'gamma': 0, 'min_child_weight': 19, 'reg_alpha': 2.2690964684340553, 'reg_lambda': 0.006443915023953365, 'subsample': 0.8934072342383397, 'colsample_bytree': 0.8548968906017982, 'colsample_bylevel': 0.5876878665580205, 'colsample_bynode': 0.9643922141507364}. Best is trial 3 with value: 0.8834798005032682.[0m
[32m[I 2023-04-15 11:53:27,200][0m Trial 2 finished with value: 0.8810104464737272 and parameters: {'n_estimators': 1000, 'learning_rate': 0.21783913980394612, 'max_depth': 10, 'max_bin': 41, 'gamma': 18, 'min_child_weight': 20, 'reg_alpha': 7.027691563063873, 'reg_lambda': 0.647302864736654, 'subsample': 0.6664594762113506, 'colsample_bytree': 0.5857238651146067, 'colsample_bylevel': 0.586925528195432, 'colsample_bynode': 0.5429006751595139}. Best is trial 3 with value: 0.883479800503268

In [19]:
ensemble_params['xgboost'] = study.best_params

In [31]:
save_file(train['params'], ensemble_params)

# Train on best params

In [78]:
models = {}

In [79]:
metrics = pd.DataFrame()

## Catboost

In [80]:
params = ensemble_params['catboost']

In [81]:
ratio = y_test[y_test == 0].shape[0] / y_test[y_test == 1].shape[0]

In [82]:
model = CatBoostClassifier(scale_pos_weight=ratio,
                           cat_features=train['cat_features'],
                           **params)

model.fit(X, Y, verbose=0)

<catboost.core.CatBoostClassifier at 0x7f2500703c70>

In [83]:
y_pred = model.predict(X)
y_score = model.predict_proba(X)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'Catboost_train')])

predictions_train = y_score[:,1]

In [84]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Catboost_test')])

predictions_test = y_score[:,1]

In [85]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872004,0.933844,0.851423,0.763416,0.805021,0.311773
0,Catboost_test,0.651094,0.797918,0.497188,0.864174,0.631216,1.37657


In [86]:
win_predictor_metrics['best_metrics']['catboost'] = get_metrics_in_dict(
    y_test, y_pred, y_score)

In [37]:
models['catboost'] = model

## XGboost

In [87]:
params = ensemble_params['xgboost']

In [88]:
model = XGBClassifier(scale_pos_weight=ratio, 
                      tree_method='hist',
                      enable_categorical=True,
                      n_jobs=-1,
                      **params)

model.fit(X, Y, verbose=0)

In [89]:
y_pred = model.predict(X)
y_score = model.predict_proba(X)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'XGboost_train')])

predictions_train = np.vstack((predictions_train, y_score[:, 1]))

In [90]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'XGboost_test')])

predictions_test = np.vstack((predictions_test, y_score[:, 1]))

In [91]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872004,0.933844,0.851423,0.763416,0.805021,0.311773
0,Catboost_test,0.651094,0.797918,0.497188,0.864174,0.631216,1.37657
0,XGboost_train,0.856609,0.911035,0.817549,0.753981,0.784479,0.344465
0,XGboost_test,0.633444,0.78583,0.483067,0.868112,0.620727,1.270424


In [92]:
win_predictor_metrics['best_metrics']['xgboost'] = get_metrics_in_dict(
    y_test, y_pred, y_score)

In [43]:
models['xgboost'] = model

## Stacking

In [93]:
model = GaussianNB()
model.fit(predictions_train.T, Y)

In [94]:
y_pred = model.predict(predictions_train.T)
y_score = model.predict_proba(predictions_train.T)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'Ensemble_train')])

In [95]:
y_pred = model.predict(predictions_test.T)
y_score = model.predict_proba(predictions_test.T)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Ensemble_test')])

In [96]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872004,0.933844,0.851423,0.763416,0.805021,0.311773
0,Catboost_test,0.651094,0.797918,0.497188,0.864174,0.631216,1.37657
0,XGboost_train,0.856609,0.911035,0.817549,0.753981,0.784479,0.344465
0,XGboost_test,0.633444,0.78583,0.483067,0.868112,0.620727,1.270424
0,Ensemble_train,0.867511,0.928073,0.843632,0.757645,0.79833,0.544029
0,Ensemble_test,0.643313,0.799288,0.490877,0.86882,0.627322,2.918712


In [97]:
win_predictor_metrics['ensemble_metrics'] = get_metrics_in_dict(
    y_test, y_pred, y_score)

In [99]:
save_file(train['metrics'], win_predictor_metrics)

In [48]:
models['Naive_bayes'] = model

In [49]:
joblib.dump(models, train['models'])

['/home/sergey/projects/zakupki/models/models/ensemble_models.joblib']