# Import

In [1]:
import json
import warnings
from typing import Dict

import joblib
import numpy as np
import pandas as pd
import yaml
from scipy import stats

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, \
recall_score, log_loss, f1_score

RAND = 10

warnings.filterwarnings('ignore')

In [2]:
# Load the project configuration and keep the two sections this notebook uses.
config_path = '../config/params.yml'

# The context manager closes the file handle that a bare open() call would leak.
# NOTE(review): yaml.FullLoader can build more than plain scalars/containers;
# if params.yml could ever come from an untrusted source, use yaml.safe_load.
with open(config_path, encoding='utf-8') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

preproc = config['preprocessing']
training = config['train']

In [3]:
# Read the dataset from the CSV path stored under preproc['df_path_proc'].
df = pd.read_csv(preproc['df_path_proc'])

In [4]:
# Remove the 'Unnamed: 0' column (presumably an index written out by to_csv).
# Reassigning instead of mutating in place, and ignoring an already-missing
# column, keeps this cell safe to re-run without a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Show the first rows as a quick visual check of the loaded frame.
df.head()

Unnamed: 0,age,is_male,user_id,0,1,2,3,4,5,6,...,40,41,42,43,44,45,46,47,48,49
0,41.0,0,99002,0.01146,-0.045297,-0.000106,0.038355,-0.049731,0.017894,0.037578,...,-0.031913,-0.028657,0.082529,-0.013785,0.049856,0.002158,-0.053036,0.036244,-0.106226,0.068259
1,41.0,1,29286,-0.001348,-0.003459,0.003018,0.001728,0.000264,0.005089,0.001692,...,0.000378,-0.00268,-0.002831,0.007567,0.002133,0.00499,-0.004669,-0.002094,0.002885,-0.001464
2,53.0,1,353838,-0.021405,-0.00458,0.031048,0.022001,-0.018898,0.004544,-0.0279,...,-0.081318,0.125404,0.16998,-0.075722,0.119304,-0.038135,0.033725,-0.048156,0.013396,0.010371
3,24.0,1,159197,0.069789,-0.005299,0.089089,0.019686,0.035566,0.043845,-0.054492,...,-0.049698,0.054714,0.046483,-0.078137,-0.042038,-0.064442,-0.070671,-0.011174,-0.072168,0.008845
4,33.0,0,32977,0.004787,0.008874,0.015001,0.002139,-0.002624,-0.00112,-0.004672,...,-0.016129,-0.02755,0.012608,0.030987,0.015132,0.021481,-0.016787,-0.014854,0.014066,0.000676


In [6]:
def get_metrics(y_test: np.array, y_pred: np.array, y_proba: np.array) -> dict:
    """
    Build a dictionary with the main classification metrics.

    :param y_test: ground-truth labels
    :param y_pred: predicted class labels
    :param y_proba: predicted class probabilities; column 1 is used as the
        score for ROC-AUC, the full array is used for log-loss
    :return: dict with keys 'accuracy', 'roc_auc', 'precision', 'recall',
        'f1' and 'logloss', each rounded to 3 decimal places
    """
    # Compute the raw values first (in the order they are reported) ...
    raw_scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba[:, 1]),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'logloss': log_loss(y_test, y_proba),
    }

    # ... then apply the same rounding to every metric.
    return {name: round(value, 3) for name, value in raw_scores.items()}

# Modeling. Catboost

In [7]:
# Features: every column except those listed under training['drop_model'].
X = df.drop(columns=training['drop_model'])
# Target: the column named by training['target_column'].
y = df[training['target_column']]

# Shuffled hold-out split; the test share comes from the preprocessing
# section of the config, the seed from the training section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=preproc['test_size'],
    shuffle=True,
    random_state=training['random_state'],
)


In [8]:
# Baseline CatBoost classifier: only the seed and the evaluation metric are
# set explicitly.
# NOTE(review): the seed is read from the `preprocessing` config section here,
# while the train/test split and the CV helper read it from `train` — confirm
# both sections hold the same value.
clf = CatBoostClassifier(random_state=preproc['random_state'], eval_metric="AUC")

# NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
# no validation metric to monitor — confirm early stopping is effective here.
clf.fit(X_train,
        y_train,
        early_stopping_rounds=training['early_stopping_rounds'], 
        verbose=False)

<catboost.core.CatBoostClassifier at 0x7f8bb800ae50>

# Tuning

In [9]:
# Hyperparameter search space handed to CatBoost's randomized search.
# NOTE(review): some value combinations may be rejected by CatBoost (e.g.
# 'Ordered' boosting together with a non-symmetric grow policy) — confirm
# against the CatBoost docs before widening the number of sampled candidates.
grid = {
    'n_estimators': [1000],
    'learning_rate': np.linspace(0.01, 0.1, 5),
    'boosting_type' : ['Ordered', 'Plain'],
    'max_depth': list(range(3, 12)),
    'l2_leaf_reg': np.logspace(-5, 2, 5),
    'random_strength': list(range(10, 50, 5)),
    'bootstrap_type': ["Bayesian", "Bernoulli", "MVS", "No"],
    'border_count': [128, 254],
    'grow_policy': ["SymmetricTree", "Depthwise", "Lossguide"],
    'random_state': [RAND]
}

# Randomized search over `grid` using only the training split; the number of
# sampled candidates and the CV settings are left at the library defaults.
# The result's 'params' entry is read in a later cell.
model = CatBoostClassifier(eval_metric="AUC", silent=True)
grid_search_result = model.randomized_search(grid,
                                             X=X_train,
                                             y=y_train, 
                                             verbose=False)


bestTest = 0.7921391847
bestIteration = 549

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8026251198
bestIteration = 411

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8066151853
bestIteration = 971

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.7796334383
bestIteration = 211

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.

bestTest = 0.8092687698
bestIteration = 817

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
Training on fold [0/3]

bestTest = 0.8036360884
bestIteration = 715

Training on fold [1/3]

bestTes

In [10]:
# Keep the 'params' entry of the search result: it is written to disk and
# passed to the cross-validation helper in later cells.
cat_best = grid_search_result['params']

In [11]:
# NOTE: despite its name, `best_params` holds the output file path taken from
# the config, not the parameters themselves (those are in `cat_best`).
best_params = training['params_path']

# Serialize the selected hyperparameters as JSON.
with open(best_params, 'w') as f:
    json.dump(cat_best, f)

# Holdout

In [12]:
# ваш код
def cross_validation_cat(X_train: pd.DataFrame,
                         y_train: pd.Series,
                         X_test: pd.DataFrame,
                         y_test: pd.Series,
                         clf,
                         params: dict,
                         cat_features: list = None,
                         eval_metric: str = None,
                         early_stop: bool = False,
                         early_stopping_rounds: int = training['early_stopping_rounds'],
                         num_folds: int = training['n_folds'],
                         random_state: int = training['random_state'],
                         shuffle: bool = True):
    """
    Run stratified K-fold cross-validation for a classification model and
    collect per-fold hold-out predictions.

    For every fold a fresh model is built from ``clf`` and ``params``, fitted
    on the training part of the fold, scored with ROC-AUC on the validation
    part, and used to predict ``X_test``.

    :param X_train: training features; split into folds by position
    :param y_train: training target, used for fitting and stratification
    :param X_test: hold-out features predicted by every fold model
    :param y_test: hold-out target; not used inside the function, kept so
        existing callers do not break
    :param clf: model class (not an instance), called as ``clf(**params)``
    :param params: keyword arguments for the model constructor; not mutated
    :param cat_features: categorical feature spec forwarded to ``fit``
    :param eval_metric: optional evaluation metric; passed to the model
        constructor because CatBoost's ``fit`` has no ``eval_metric`` argument
    :param early_stop: when truthy, the validation fold is passed as
        ``eval_set`` together with ``early_stopping_rounds`` (training is
        silent in that case). Note that the same fold is then also used for
        the reported AUC
    :param early_stopping_rounds: patience forwarded to ``fit`` when
        ``early_stop`` is enabled
    :param num_folds: number of stratified folds
    :param random_state: seed for the fold shuffling; ignored when
        ``shuffle`` is False
    :param shuffle: whether to shuffle before splitting into folds
    :return: tuple ``(score_oof, pred_test, pred_prob_test)`` — lists with one
        entry per fold: validation ROC-AUC, predicted labels for ``X_test``
        and predicted probabilities for ``X_test``
    """
    # StratifiedKFold raises ValueError if a seed is given while shuffle=False,
    # so only forward the seed when it is actually used.
    folds = StratifiedKFold(n_splits=num_folds,
                            random_state=random_state if shuffle else None,
                            shuffle=shuffle)

    # Copy so the caller's dict is never modified.
    model_params = dict(params)
    if eval_metric is not None:
        model_params['eval_metric'] = eval_metric

    score_oof = []
    pred_test = []
    pred_prob_test = []

    for fold, (train_index, val_index) in enumerate(folds.split(X_train, y_train)):
        X_train_, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model = clf(**model_params)

        fit_kwargs = {'cat_features': cat_features}
        if early_stop:
            fit_kwargs.update(eval_set=[(X_val, y_val)],
                              silent=True,
                              early_stopping_rounds=early_stopping_rounds)
        model.fit(X_train_, y_train_, **fit_kwargs)

        y_pred_val = model.predict_proba(X_val)[:, 1]
        y_pred = model.predict(X_test)
        y_pred_prob = model.predict_proba(X_test)

        # Compute the fold score once and reuse it for logging and storage.
        fold_auc = roc_auc_score(y_val, y_pred_val)

        print(
            "Fold:", fold + 1,
            "AUC SCORE %.3f" % fold_auc)
        print("---")

        # oof list
        score_oof.append(fold_auc)
        # holdout list
        pred_test.append(y_pred)
        pred_prob_test.append(y_pred_prob)

    return score_oof, pred_test, pred_prob_test

In [13]:
# Cross-validate CatBoost with the tuned parameters (`cat_best`), using early
# stopping on each validation fold; fold count, patience and seed come from
# the `train` section of the config.
score_oof, pred_test, pred_prob_test = cross_validation_cat(
    X_train,
    y_train,
    X_test,
    y_test,
    early_stop=True,
    early_stopping_rounds=training['early_stopping_rounds'],
    num_folds=training['n_folds'],
    random_state=training['random_state'],
    clf=CatBoostClassifier,
    params=cat_best)

Fold: 1 AUC SCORE 0.809
---
Fold: 2 AUC SCORE 0.811
---
Fold: 3 AUC SCORE 0.793
---
Fold: 4 AUC SCORE 0.806
---
Fold: 5 AUC SCORE 0.797
---


In [14]:
# Majority vote over the per-fold hard labels. Depending on the SciPy version,
# stats.mode(...)[0] comes back as a column vector or as a flat array; ravel
# makes the result 1-D in both cases so downstream metrics get a label vector.
fin_test_pred = np.ravel(stats.mode(np.column_stack(pred_test), axis=1)[0])
# Average the per-fold class probabilities element-wise.
fin_test_pred_prob = np.mean(pred_prob_test, axis=0)

print(f'ROC-AUC mean OOF = {np.mean(score_oof)}')
print(f'ROC-AUC HOLDOUT = {roc_auc_score(y_test, fin_test_pred_prob[:, 1])}')

ROC-AUC mean OOF = 0.8033355401587284
ROC-AUC HOLDOUT = 0.8113342237139753


In [15]:
model_path = training['model_path']

# Persist the tuned estimator. `model` is the classifier that
# randomized_search was run on; per the CatBoost docs it is refit with the
# best-found parameters by default (refit=True). Previously this cell saved
# `clf`, the untuned baseline, which matches neither the stored
# hyperparameters nor the reported metrics.
joblib.dump(model, model_path)

['../../../../Курс_по_DS/Pet_project/Production/Models/model_cat.joblib']

In [16]:
metrics_path = training['metrics_path']

# Hold-out metrics of the fold ensemble: majority-voted labels and averaged
# probabilities from the cross-validation models, written out as JSON.
metrics = get_metrics(y_test, fin_test_pred, fin_test_pred_prob)
with open(metrics_path, 'w') as f:
    json.dump(metrics, f)

In [17]:
# Read the metrics back from disk to verify the file round-trips, then
# display them as the cell output.
with open(metrics_path) as json_file:
    metrics = json.load(json_file)
metrics

{'accuracy': 0.722,
 'roc_auc': 0.811,
 'precision': 0.71,
 'recall': 0.743,
 'f1': 0.726,
 'logloss': 0.53}