In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss

from tqdm.notebook import tqdm

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

RAND=42
N_FOLDS = 5

In [3]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

# Baseline models

In [4]:
df = pd.read_csv("eda.csv")
df[:5]

Unnamed: 0,id,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_7_4,enc_paym_8_4,enc_paym_9_4,enc_paym_11_4,enc_paym_20_4,enc_paym_24_4,enc_loans_account_holder_type_4,enc_loans_credit_status_4,enc_loans_credit_type_4,enc_loans_account_cur_4
0,0,10,7,9,2,10,8,8,16,4,...,0.0,3.0,3.0,4.0,4.0,4.0,1.0,3.0,4.0,1.0
1,1,14,12,6,1,8,14,11,11,2,...,3.0,3.0,3.0,4.0,4.0,4.0,1.0,2.0,4.0,1.0
2,2,3,12,9,4,8,1,11,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,15,12,9,2,10,8,8,17,1,...,0.0,0.0,0.0,4.0,4.0,4.0,1.0,2.0,1.0,1.0
4,4,1,12,9,4,8,1,11,12,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.shape

(237887, 184)

In [6]:
df = df.set_index("id")

In [7]:
X = df.drop(columns=["flag"])
Y = df["flag"]

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, 
                                                    test_size=0.2, 
                                                    stratify=Y, 
                                                    random_state=RAND)

In [9]:
model = LGBMClassifier(class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

In [10]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [11]:
res = get_metrics(y_train, y_pred_tr, y_score_tr, name="lgbm_train")
res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))

  res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))


In [12]:
res

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.742009,0.854297,0.090882,0.81322,0.163492,0.510198
0,lgbm_test,0.730359,0.74579,0.070055,0.627119,0.12603,0.524206


In [13]:
weight = y_train[y_train==0].shape[0] / y_train[y_train==1].shape[0]

model = CatBoostClassifier(scale_pos_weight=weight, random_state=RAND, verbose=0)
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f25f0215540>

In [14]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [15]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))


In [16]:
res

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.742009,0.854297,0.090882,0.81322,0.163492,0.510198
0,lgbm_test,0.730359,0.74579,0.070055,0.627119,0.12603,0.524206
0,catboost_train,0.880915,0.972948,0.202288,0.965254,0.334479,0.316045
0,catboost_test,0.849321,0.715211,0.084622,0.39322,0.139272,0.360949


In [17]:
model = LogisticRegression(class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [19]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))
res

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.742009,0.854297,0.090882,0.81322,0.163492,0.510198
0,lgbm_test,0.730359,0.74579,0.070055,0.627119,0.12603,0.524206
0,catboost_train,0.880915,0.972948,0.202288,0.965254,0.334479,0.316045
0,catboost_test,0.849321,0.715211,0.084622,0.39322,0.139272,0.360949
0,LG_train,0.665733,0.74014,0.062415,0.697627,0.114578,0.604187
0,LG_test,0.664067,0.726959,0.060257,0.673898,0.110623,0.606836


**Выводы**  
Среди выбранных baseline-моделей catboost показал большое переобучение, что можно нивилировать, подобрав гиперпараметры; lightgbm показал меньше переобучение и большое значение ROC-AUC на тесте; логистическая регрессия показала слишком низкие метрики и на трейне и тест, что может говорить о том, что линейная модель не подходит для данной задачи.
  
Так как в нашем случае матрица объект-признаки имеет большую размерность, lightgbm является более подходящей моделью для обучения на стационарном устройстве, так как сам алгоритм работает быстрее.

# Tune

In [20]:
import optuna
import scipy.stats

In [25]:
def objective_gpt(trial: optuna.Trial, x, y):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [400]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
#         'learning_rate': trial.suggest_categorical('learning_rate', [0.0002073959465051668]),
#         'max_bin': trial.suggest_int('max_bin', 100, 1000),
#         'num_leaves': trial.suggest_int('num_leaves', 10, 500),
#         'max_depth': trial.suggest_int('max_depth', 3, 20),
#         'min_child_samples': trial.suggest_int('min_child_samples', 100, 70000, step=100),
#         'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
#         'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
#         'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
#         'objective': trial.suggest_categorical('objective', ['binary']),
#         'metric': trial.suggest_categorical('metric', ['auc']),
#         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
#         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
#         'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'random_state': trial.suggest_categorical('random_state', [RAND]),
    }

    cv_pred = np.empty(N_FOLDS)
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x.iloc[train_idx], x.iloc[test_idx]
        y_train_, y_val_ = y.iloc[train_idx], y.iloc[test_idx]

        ratio = y_train_[y_train_ == 0].shape[0] / \
            y_train_[y_train_ == 1].shape[0]

        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            scale_pos_weight=ratio,
            n_jobs=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=0)

        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
    return (np.mean(cv_pred))

In [None]:
func = lambda trial: objective_gpt(trial, x_train, y_train)

study = optuna.create_study(direction="maximize")
study.optimize(func, n_trials=10, show_progress_bar=True, n_jobs=-1)

[32m[I 2023-03-17 23:21:43,643][0m A new study created in memory with name: no-name-91821bd2-9fd6-44e6-8f6e-2c51ac6ac780[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),


In [None]:
study.optimize(func, n_trials=50, show_progress_bar=True, n_jobs=-1)

In [24]:
study.best_params

{'n_estimators': 800,
 'learning_rate': 0.0002073959465051668,
 'max_bin': 379,
 'num_leaves': 190,
 'max_depth': 19,
 'min_child_samples': 200,
 'lambda_l1': 0,
 'lambda_l2': 16,
 'min_split_gain': 0.005772196326744969,
 'objective': 'binary',
 'metric': 'auc',
 'feature_fraction': 0.924434636052918,
 'bagging_fraction': 0.8746833718519047,
 'bagging_freq': 6,
 'random_state': 42}