In [97]:
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold  # k-фолдная валидация
import copy
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from collections import Counter
from sklearn.metrics import accuracy_score

In [154]:
target= 'target'

X = df.drop(target, axis=1)
y = df['target']
cat_features = list(X.select_dtypes(include=['object']).columns)

### Модель Саtboost + Kfold

In [165]:
class CvModel:
    def __init__(self, clf):
        self.models = []
        self.clf = clf
        self.scores = []
        
    def fit(self, X, y, cv=3):
        self.models = []
        self.scores = []
        
        kf = KFold(n_splits=cv, shuffle=True)
        for train_index, val_index in kf.split(X):
            now_model = copy.deepcopy(self.clf)
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
           
            train_dataset = Pool(data=X_train, label=y_train, cat_features=cat_features)
            eval_dataset = Pool(data=X_val, label=y_val, cat_features=cat_features)
            
            now_model.fit(train_dataset, 
                          eval_set=eval_dataset,
                          early_stopping_rounds=150,
                          verbose=100)
            
            acc = accuracy_score(now_model.predict(X_val), y_val)
            print(f'Fold {i}/{cv} acc = {acc}')
            
            self.scores.append(acc)
            self.models.append(now_model)
        print('Mean acc = {}'.format(np.mean(self.scores)))
            
    def predict_proba(self, X):
        if len(self.models) == 0:
            raise Exception("There is not fitted model")
            
        res = [model.predict_proba(X) for model in self.models]
        return np.sum(res, axis=0)
    
    def predict(self, X):
        if len(self.models) == 0:
            raise Exception("There is not fitted model")
            
        preds = np.concatenate([model.predict(X).reshape(-1, 1) for model in self.models], axis=1)
#         print(preds.shape)
        res = [Counter(el).most_common()[0][0] for el in preds]
        return res


params_cat = {
    "n_estimators": 1000, #default кол-во деревьев, n_estimators, num_boost_round
    "learning_rate": 0.0276832670537493, #default
    "depth": 8,#default
    "l2_leaf_reg": 6, #default лямбда при L2
    'colsample_bylevel': 0.5782566005857097,
    'auto_class_weights':"SqrtBalanced",
    'boosting_type': 'Plain',
     'bootstrap_type': 'MVS',
    "use_best_model": True,
    "cat_features": cat_features,
    "task_type": 'CPU',# есть ещё GPU
    "loss_function": 'Logloss', #default
    "eval_metric": 'AUC', #для отслеживания остановки
    'random_seed':42,
}


In [166]:
clf = CatBoostClassifier(**params_cat)
cv_model = CvModel(clf)

In [167]:
cv_model.fit( X, y, cv=3)

0:	test: 0.8684020	best: 0.8684020 (0)	total: 73.7ms	remaining: 1m 13s
100:	test: 0.9648975	best: 0.9648975 (100)	total: 7.76s	remaining: 1m 9s
200:	test: 0.9676774	best: 0.9676774 (200)	total: 16s	remaining: 1m 3s
300:	test: 0.9690638	best: 0.9690638 (300)	total: 23.7s	remaining: 55s
400:	test: 0.9696346	best: 0.9696346 (400)	total: 31.1s	remaining: 46.5s
500:	test: 0.9702333	best: 0.9702492 (499)	total: 38.7s	remaining: 38.6s
600:	test: 0.9704638	best: 0.9704638 (600)	total: 46.8s	remaining: 31.1s
700:	test: 0.9707238	best: 0.9707366 (695)	total: 55.4s	remaining: 23.6s
800:	test: 0.9708770	best: 0.9708856 (793)	total: 1m 4s	remaining: 16s
900:	test: 0.9710350	best: 0.9710761 (886)	total: 1m 12s	remaining: 8.02s
999:	test: 0.9711681	best: 0.9711789 (993)	total: 1m 22s	remaining: 0us

bestTest = 0.9711789149
bestIteration = 993

Shrink model to first 994 iterations.
Fold price_category/3 acc = 0.9143283582089552
0:	test: 0.9039454	best: 0.9039454 (0)	total: 70.6ms	remaining: 1m 10s
100

### OPTUNA + Kfold

In [137]:
!pip install optuna catboost -q

In [138]:
import optuna

In [151]:
def objective(trial, return_models=False):
    cv = 3
    scores, models = [], []
    
    kf = KFold(n_splits=cv, shuffle=True)
    for train_index, val_index in kf.split(X):
            train_data = X.iloc[train_index], y.iloc[train_index]
            valid_data = X.iloc[val_index], y.iloc[val_index]
            
            # Подаем trials для перебора
            model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
            scores.append(accuracy_score(y_pred, valid_data[1]))
            models.append(model)
            break
         

    result = np.mean(scores)
    
    if return_models:
        return result, models
    else:
        return result
    
def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val
    
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.03),
        "depth": trial.suggest_int("depth", 3, 9),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced", "None"]),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "eval_metric": "AUC",
    }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        
    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,)

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=5,)

    y_pred = clf.predict(X_val)
    return clf, y_pred

study = optuna.create_study(direction="maximize")
study.optimize(objective,
               n_trials=100,
               n_jobs = -1,
               show_progress_bar=True,)

In [147]:
print("Best trial: score {}, params {}".format(study.best_trial.value, study.best_trial.params))

[I 2023-11-14 15:56:18,660] A new study created in memory with name: no-name-8d35020d-6075-4311-b693-d8a14b767ece


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-11-14 15:56:20,378] Trial 0 finished with value: 0.8405970149253731 and parameters: {'learning_rate': 0.02863958971825184, 'depth': 4, 'l2_leaf_reg': 42, 'colsample_bylevel': 0.6655680433660395, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.8405970149253731.
[I 2023-11-14 15:56:20,632] Trial 2 finished with value: 0.844776119402985 and parameters: {'learning_rate': 0.01828142341355038, 'depth': 4, 'l2_leaf_reg': 38, 'colsample_bylevel': 0.5828740116968846, 'auto_class_weights': 'Balanced', 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.57156672547144}. Best is trial 2 with value: 0.844776119402985.
[I 2023-11-14 15:56:21,915] Trial 4 finished with value: 0.8441791044776119 and parameters: {'learning_rate': 0.014177016167583942, 'depth': 3, 'l2_leaf_reg': 8, 'colsample_bylevel': 0.45583225141869094, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': '

[I 2023-11-14 15:56:48,334] Trial 24 finished with value: 0.8605970149253731 and parameters: {'learning_rate': 0.019786909373705055, 'depth': 9, 'l2_leaf_reg': 13, 'colsample_bylevel': 0.49412573493248635, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.481125880891328}. Best is trial 3 with value: 0.8877611940298508.
[I 2023-11-14 15:56:49,724] Trial 26 finished with value: 0.8602985074626865 and parameters: {'learning_rate': 0.020164489209105345, 'depth': 9, 'l2_leaf_reg': 12, 'colsample_bylevel': 0.5472263628937841, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.341255391096741}. Best is trial 3 with value: 0.8877611940298508.
[I 2023-11-14 15:56:49,989] Trial 25 finished with value: 0.8668656716417911 and parameters: {'learning_rate': 0.019695130909160023, 'depth': 9, 'l2_leaf_reg': 31, 'colsample_bylevel': 0.4998232537758507, 'auto_class_we

[I 2023-11-14 15:57:13,436] Trial 45 finished with value: 0.8802985074626866 and parameters: {'learning_rate': 0.025377669824287956, 'depth': 5, 'l2_leaf_reg': 37, 'colsample_bylevel': 0.44240250014698357, 'auto_class_weights': 'None', 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9905519736200141}. Best is trial 33 with value: 0.8892537313432836.
[I 2023-11-14 15:57:14,659] Trial 49 finished with value: 0.8280597014925373 and parameters: {'learning_rate': 0.028702415061758992, 'depth': 4, 'l2_leaf_reg': 42, 'colsample_bylevel': 0.5361377914049194, 'auto_class_weights': 'None', 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.846504300308953}. Best is trial 33 with value: 0.8892537313432836.
[I 2023-11-14 15:57:14,752] Trial 48 finished with value: 0.8462686567164179 and parameters: {'learning_rate': 0.028673596936158306, 'depth': 4, 'l2_leaf_reg': 43, 'colsample_bylevel': 0.5371826826089141, 'auto_class_weights': 'None', 'boosting_type'

[I 2023-11-14 15:57:42,198] Trial 69 finished with value: 0.8549253731343284 and parameters: {'learning_rate': 0.029248101068932, 'depth': 8, 'l2_leaf_reg': 15, 'colsample_bylevel': 0.45088357559552567, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 33 with value: 0.8892537313432836.
[I 2023-11-14 15:57:44,023] Trial 64 finished with value: 0.888955223880597 and parameters: {'learning_rate': 0.02898880091574831, 'depth': 6, 'l2_leaf_reg': 10, 'colsample_bylevel': 0.6012815433620056, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 33 with value: 0.8892537313432836.
[I 2023-11-14 15:57:46,618] Trial 71 finished with value: 0.862089552238806 and parameters: {'learning_rate': 0.028994817870552126, 'depth': 7, 'l2_leaf_reg': 22, 'colsample_bylevel': 0.6066742531532294, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_tem

[I 2023-11-14 15:58:17,231] Trial 94 finished with value: 0.8626865671641791 and parameters: {'learning_rate': 0.029836470650063052, 'depth': 8, 'l2_leaf_reg': 6, 'colsample_bylevel': 0.589851819218691, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 90 with value: 0.8961194029850746.
[I 2023-11-14 15:58:19,177] Trial 93 finished with value: 0.8898507462686567 and parameters: {'learning_rate': 0.027839530404761118, 'depth': 8, 'l2_leaf_reg': 7, 'colsample_bylevel': 0.5845064714699258, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 90 with value: 0.8961194029850746.
[I 2023-11-14 15:58:20,709] Trial 95 finished with value: 0.8773134328358209 and parameters: {'learning_rate': 0.02782980529968912, 'depth': 8, 'l2_leaf_reg': 9, 'colsample_bylevel': 0.6310956787742105, 'auto_class_weights': 'SqrtBalanced', 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 90 with 

### Optuna (без фолдов)

In [None]:
def objective(trial, return_models=False):

    train_data = X_train, y_train
    valid_data = X_valid, y_valid
            
    # Подаем trials для перебора
    model, y_pred = fit_catboost(trial, train_data, valid_data) # Определили выше
    result = roc_auc_score(valid_data[1],y_pred)
    
    
    if return_models:
        return result, model
    else:
        return result

def fit_catboost(trial, train, val):
    X_train, y_train = train
    X_val, y_val = val
    
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 3, 9), #default = 6
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50), #default = 3
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8), #default = 1
        "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["SqrtBalanced", "Balanced", "None"]),#default = None
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),#default = Ordered
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]), #default = Bayesian
        "eval_metric": "AUC",
    }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)#default = 1
        
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)#default = 1
        
    clf = CatBoostClassifier(
        **param,
        thread_count=-1,
        random_seed=42,
        cat_features=cat_features,)

    clf.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=200,
        plot=False,
        early_stopping_rounds=5,)

    y_pred = clf.predict_proba(X_val)[:,1]
    return clf, y_pred

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective,
               n_trials=100,
               n_jobs = -1,
               show_progress_bar=True,)