In [37]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss, precision_recall_curve

from catboost import CatBoostClassifier

from xgboost import XGBClassifier

import optuna

import yaml

import joblib

import warnings
warnings.filterwarnings('ignore')

In [38]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred, zero_division=0)
    df_metrics['Recall'] = recall_score(y_test, y_pred, zero_division=0)
    df_metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

In [39]:
def open_file(file_path):
    with open(file_path, 'r') as file:
        return yaml.safe_load(file)
    
def save_file(file_path, data):       
    with open(file_path, 'w') as file:
        yaml.dump(data, file)

In [40]:
config_path = "../config/params.yaml"
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

preproc = config["preprocessing"]
train = config['train']

In [41]:
ensemble_params = open_file(train['ensemble_params'])

# Baseline

In [6]:
df_train = pd.read_csv(preproc['train_data'])
df_train = df_train.set_index('index')

df_train[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,248042,0,60706.6,3513,15708,0,[ 1.59887652e-02 2.36658968e-02 -4.08423709e-...,11,21.2_33,6,0.0,3
1,569110,1,42138.0,6570,15708,1,[ 0.07180544 0.01209602 0.00167402 0.044857...,11,20.5_53,3,0.0,3
2,254021,1,65000.1,3377,15708,0,[ 0.02361675 0.01336928 -0.00032915 -0.005782...,5,21.2_33,6,0.0,3
3,447931,1,341584.8,9732,15708,0,[ 3.30867594e-02 4.09576579e-02 -5.67706811e-...,3,21.2_77,5,0.0,3
4,395694,1,133452.0,9589,15708,0,[ 0.10526875 0.11765645 -0.01359718 -0.009087...,9,21.2_77,2,0.0,3


In [7]:
df_test = pd.read_csv(preproc['test_data'])
df_test = df_test.set_index('index')

df_test[:5]

Unnamed: 0_level_0,purchase,forsmallbiz,price,customer,supplier,is_winner,vectorized_tokens,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,63456,0,290000.0,6151,4841,1,[ 1.48227703e-02 2.26276631e-02 1.17342828e-...,7,10.8_45,1,0.0,2
1,41232,0,160044.4,3932,560,1,[ 5.70505795e-02 -1.79046954e-04 2.02757507e-...,8,19.2_34,1,1.0,2
2,120554,0,145530.0,9076,43003,0,[ 8.90472124e-02 2.44880769e-01 -6.71891250e-...,6,drug_92,2,0.0,4
3,120554,0,145530.0,9076,3109,1,[ 8.90472124e-02 2.44880769e-01 -6.71891250e-...,6,drug_92,2,1.0,4
4,594728,1,369175.6,9582,3796,1,[ 1.12817446e-01 1.13674459e-01 -2.37492026e-...,1,10.8_77,1,1.0,5


In [8]:
df_train = df_train.astype(preproc['change_type_columns'])
df_test = df_test.astype(preproc['change_type_columns'])

In [9]:
df_train = df_train.drop(columns=train['drop_columns_winner'])
df_test = df_test.drop(columns=train['drop_columns_winner'])

In [10]:
df_train[:5]

Unnamed: 0_level_0,forsmallbiz,price,supplier,is_winner,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,60706.6,15708,0,11,21.2_33,6,0.0,3
1,1,42138.0,15708,1,11,20.5_53,3,0.0,3
2,1,65000.1,15708,0,5,21.2_33,6,0.0,3
3,1,341584.8,15708,0,3,21.2_77,5,0.0,3
4,1,133452.0,15708,0,9,21.2_77,2,0.0,3


In [11]:
df_test[:5]

Unnamed: 0_level_0,forsmallbiz,price,supplier,is_winner,month,reg_code,purchase_size,flag_won,n_unique_okpd2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0,290000.0,4841,1,7,10.8_45,1,0.0,2
1,0,160044.4,560,1,8,19.2_34,1,1.0,2
2,0,145530.0,43003,0,6,drug_92,2,0.0,4
3,0,145530.0,3109,1,6,drug_92,2,1.0,4
4,1,369175.6,3796,1,1,10.8_77,1,1.0,5


In [12]:
X = df_train.drop('is_winner', axis=1)
Y = df_train['is_winner']

In [13]:
x_test = df_test.drop('is_winner', axis=1)
y_test = df_test['is_winner']

In [14]:
metrics = pd.DataFrame()

In [15]:
x_train_, x_val_, y_train_, y_val_ = train_test_split(X, Y,
                                                      test_size=train['test_size'],
                                                      stratify=Y,
                                                      random_state=train['random_state'])

ratio = y_val_[y_val_ == 0].shape[0] / y_val_[y_val_ == 1].shape[0]

## Catboost

In [16]:
model = CatBoostClassifier(random_state=train['random_state'], 
                           scale_pos_weight=ratio,
                           eval_metric='AUC',
                           cat_features=train['cat_features'])

model.fit(x_train_, y_train_,
          eval_set=[(x_val_, y_val_)],
          early_stopping_rounds=100,
          verbose=0)

<catboost.core.CatBoostClassifier at 0x7f5e99c44130>

In [17]:
y_pred = model.predict(x_train_)
y_score = model.predict_proba(x_train_)

metrics = pd.concat([metrics, get_metrics(y_train_, y_pred, y_score, 
                                          'Catboost_baseline_train')])

In [18]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Catboost_baseline_test')])

In [19]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_baseline_train,0.875218,0.937443,0.852581,0.77312,0.810909,0.309346
0,Catboost_baseline_test,0.646183,0.791117,0.49326,0.860302,0.627016,1.518133


## XGboost

In [27]:
model = XGBClassifier(scale_pos_weight=ratio, 
                      random_state=train['random_state'],
                      tree_method='hist',
                      enable_categorical=True,
                      n_jobs=-1)

model.fit(x_train_, y_train_,
          eval_set=[(x_val_, y_val_)],
          eval_metric='auc',
          early_stopping_rounds=100,
          verbose=0)

In [28]:
y_pred = model.predict(x_train_)
y_score = model.predict_proba(x_train_)

metrics = pd.concat([metrics, get_metrics(y_train_, y_pred, y_score, 
                                          'Xgboost_baseline_train')])

In [29]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Xgboost_baseline_test')])

In [30]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_baseline_train,0.875218,0.937443,0.852581,0.77312,0.810909,0.309346
0,Catboost_baseline_test,0.646183,0.791117,0.49326,0.860302,0.627016,1.518133
0,Lightgbm_baseline_train,0.862341,0.915192,0.84349,0.739435,0.788042,0.340734
0,Lightgbm_baseline_test,0.638516,0.778636,0.486931,0.851188,0.619481,1.188303
0,Xgboost_baseline_train,0.876117,0.92861,0.869934,0.754907,0.808349,0.320952
0,Xgboost_baseline_test,0.648523,0.774397,0.495028,0.833529,0.621155,1.136132


# Tune

## Catboost

In [17]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series) -> float:

    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.0787449098272658]),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 100),
        'border_count': trial.suggest_categorical('border_count', [128]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'random_state': train['random_state']
    }

    cv_pred = np.empty(train['N_FOLDS'])
    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]
        
        ratio = y_train_[y_train_ == 0].shape[0] / \
            y_train_[y_train_ == 1].shape[0]
        
        model = CatBoostClassifier(
            scale_pos_weight=ratio,
            cat_features=['supplier', 'reg_code'],
            verbose=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  verbose=0)

        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
    return (np.mean(cv_pred))

In [20]:
func = lambda trial: objective(trial, X, Y)

study = optuna.create_study(direction="maximize")
study.optimize(func, show_progress_bar=True, n_trials=10, n_jobs=6)

[32m[I 2023-04-15 00:51:28,528][0m A new study created in memory with name: no-name-7c9e2f7f-cd62-416a-928f-0fe9d90dfcaa[0m


  0%|          | 0/10 [00:00<?, ?it/s]




KeyboardInterrupt



In [19]:
ensemble_params['catboost'] = study.best_params

In [20]:
save_file(train['ensemble_params'], ensemble_params)

## XGboost

In [17]:
def objective(trial: optuna.Trial, x: pd.DataFrame, y: pd.Series, **kwargs) -> float:
    """
    This function defines the objective function for an Optuna study to tune hyperparameters
    for a LightGBM binary classification model. 

    Args:
        trial (optuna.Trial): A trial corresponding to a set of hyperparameters.
        x (pd.DataFrame): The features to be used for training and validation.
        y (pd.Series): The target variable for training and validation.

    Returns:
        float: The mean of the cross-validation AUC-ROC scores for the given set of hyperparameters.
    """
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 1),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.21783913980394612]),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        'max_bin': trial.suggest_int('max_bin', 0, 100),
        "gamma": trial.suggest_int("gamma", 0, 20),
        "min_child_weight": trial.suggest_int("min_child_weight", 2, 20),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 1e2, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 1e2, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5,1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
        'random_state': train['random_state'],
    }

    cv_pred = np.empty(train['N_FOLDS'])
    cv = StratifiedKFold(n_splits=train['N_FOLDS'], shuffle=True, random_state=train['random_state'])

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        ratio = y_train_[y_train_ == 0].shape[0] / \
            y_train_[y_train_ == 1].shape[0]
        
        pruning = optuna.integration.XGBoostPruningCallback(trial, 'validation_0-auc')

        
        model = XGBClassifier(
            scale_pos_weight=ratio,
            tree_method='hist',
            enable_categorical=True,
            verbosity=0,
            n_jobs=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  eval_metric='auc',
                  callbacks=[pruning], 
                  verbose=0)

        
        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
        
    return (np.mean(cv_pred))

In [18]:
func = lambda trial: objective(trial, X, Y)

study = optuna.create_study(direction="maximize")
study.optimize(func, show_progress_bar=True, n_trials=20, n_jobs=6)

[32m[I 2023-04-15 11:48:06,008][0m A new study created in memory with name: no-name-975d2bc5-34db-4261-9813-941468bd0f7f[0m


  0%|          | 0/20 [00:00<?, ?it/s]

[32m[I 2023-04-15 11:51:25,278][0m Trial 3 finished with value: 0.8834798005032682 and parameters: {'n_estimators': 1000, 'learning_rate': 0.21783913980394612, 'max_depth': 8, 'max_bin': 53, 'gamma': 0, 'min_child_weight': 19, 'reg_alpha': 2.2690964684340553, 'reg_lambda': 0.006443915023953365, 'subsample': 0.8934072342383397, 'colsample_bytree': 0.8548968906017982, 'colsample_bylevel': 0.5876878665580205, 'colsample_bynode': 0.9643922141507364}. Best is trial 3 with value: 0.8834798005032682.[0m
[32m[I 2023-04-15 11:53:27,200][0m Trial 2 finished with value: 0.8810104464737272 and parameters: {'n_estimators': 1000, 'learning_rate': 0.21783913980394612, 'max_depth': 10, 'max_bin': 41, 'gamma': 18, 'min_child_weight': 20, 'reg_alpha': 7.027691563063873, 'reg_lambda': 0.647302864736654, 'subsample': 0.6664594762113506, 'colsample_bytree': 0.5857238651146067, 'colsample_bylevel': 0.586925528195432, 'colsample_bynode': 0.5429006751595139}. Best is trial 3 with value: 0.883479800503268

In [19]:
ensemble_params['xgboost'] = study.best_params

In [31]:
save_file(train['ensemble_params'], ensemble_params)

# Train on best params

In [42]:
models = {}

In [43]:
metrics = pd.DataFrame()

## Catboost

In [44]:
params = ensemble_params['catboost']

In [45]:
ratio = y_test[y_test == 0].shape[0] / y_test[y_test == 1].shape[0]

In [46]:
model = CatBoostClassifier(scale_pos_weight=ratio,
                           cat_features=train['cat_features'],
                           **params)

model.fit(X, Y, verbose=0)

<catboost.core.CatBoostClassifier at 0x7fc1cebf1ba0>

In [47]:
y_pred = model.predict(X)
y_score = model.predict_proba(X)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'Catboost_train')])

predictions_train = y_score[:,1]

In [48]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Catboost_test')])

predictions_test = y_score[:,1]

In [49]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872432,0.933877,0.849931,0.766777,0.806216,0.312037
0,Catboost_test,0.647078,0.796187,0.494011,0.862839,0.628296,1.360022


In [50]:
models['catboost'] = model

## XGboost

In [51]:
params = ensemble_params['xgboost']

In [52]:
model = XGBClassifier(scale_pos_weight=ratio, 
                      tree_method='hist',
                      enable_categorical=True,
                      n_jobs=-1,
                      **params)

model.fit(X, Y, verbose=0)

In [53]:
y_pred = model.predict(X)
y_score = model.predict_proba(X)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'XGboost_train')])

predictions_train = np.vstack((predictions_train, y_score[:, 1]))

In [54]:
y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'XGboost_test')])

predictions_test = np.vstack((predictions_test, y_score[:, 1]))

In [55]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872432,0.933877,0.849931,0.766777,0.806216,0.312037
0,Catboost_test,0.647078,0.796187,0.494011,0.862839,0.628296,1.360022
0,XGboost_train,0.856545,0.910755,0.817614,0.753586,0.784296,0.344722
0,XGboost_test,0.63191,0.78302,0.481911,0.863167,0.618506,1.295346


In [56]:
models['xgboost'] = model

## Stacking

In [58]:
model = GaussianNB()
model.fit(predictions_train.T, Y)

In [59]:
y_pred = model.predict(predictions_train.T)
y_score = model.predict_proba(predictions_train.T)

metrics = pd.concat([metrics, get_metrics(Y, y_pred, y_score, 
                                          'Ensemble_train')])

In [60]:
y_pred = model.predict(predictions_test.T)
y_score = model.predict_proba(predictions_test.T)

metrics = pd.concat([metrics, get_metrics(y_test, y_pred, y_score, 
                                          'Ensemble_test')])

In [61]:
metrics

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,Catboost_train,0.872432,0.933877,0.849931,0.766777,0.806216,0.312037
0,Catboost_test,0.647078,0.796187,0.494011,0.862839,0.628296,1.360022
0,XGboost_train,0.856545,0.910755,0.817614,0.753586,0.784296,0.344722
0,XGboost_test,0.63191,0.78302,0.481911,0.863167,0.618506,1.295346
0,Ensemble_train,0.867735,0.928137,0.843105,0.759074,0.798886,0.540572
0,Ensemble_test,0.641529,0.79644,0.489567,0.867534,0.625917,2.906066


In [62]:
models['Naive_bayes'] = model

In [63]:
joblib.dump(models, train['ensemble_models'])

['../models/models/ensemble_models.joblib']