In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss

from tqdm.notebook import tqdm

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import joblib

RAND=42
N_FOLDS = 5

In [3]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

# Feature engineering

In [4]:
filepath = "../data/processed/train_data_proc.pq"
df = pd.read_parquet(filepath)
df[:5]

Unnamed: 0_level_0,rn,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,pre_loans_outstanding,...,enc_paym_21_4,enc_paym_22_4,enc_paym_23_4,enc_paym_24_4,enc_loans_account_holder_type_4,enc_loans_credit_status_4,enc_loans_credit_type_4,enc_loans_account_cur_4,pclose_flag_4,fclose_flag_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10,7,9,2,10,8,8,16,4,2,...,3.0,3.0,3.0,4.0,1.0,3.0,4.0,1.0,0.0,0.0
1,14,12,6,1,8,14,11,11,2,3,...,3.0,3.0,3.0,4.0,1.0,2.0,4.0,1.0,0.0,0.0
2,3,12,9,4,8,1,11,1,1,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,12,9,2,10,8,8,17,1,2,...,3.0,3.0,3.0,4.0,1.0,2.0,1.0,1.0,0.0,1.0
4,1,12,9,4,8,1,11,12,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
df.shape

(237887, 234)

In [6]:
X = df.drop(columns=["flag"])
Y = df["flag"]

In [7]:
mms = MinMaxScaler()
X_scaled = mms.fit_transform(X)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, 
                                                    stratify=Y, 
                                                    random_state=RAND)

# Baseline models

In [8]:
# Создаем и обучаем бейслайн LightGBM
model = LGBMClassifier(class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

In [9]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [10]:
res = get_metrics(y_train, y_pred_tr, y_score_tr, name="lgbm_train")
res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))

  res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))


In [11]:
res

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.740343,0.851627,0.089575,0.804064,0.161192,0.511775
0,lgbm_test,0.732629,0.757902,0.072856,0.649729,0.13102,0.519947


In [12]:
ratio = y_train[y_train==0].shape[0] / y_train[y_train==1].shape[0]

# Создаем и обучаем бейслайн Catboost
model = CatBoostClassifier(scale_pos_weight=ratio, random_state=RAND, verbose=0)
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f5c2059bc70>

In [13]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [14]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))


In [15]:
res

Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.740343,0.851627,0.089575,0.804064,0.161192,0.511775
0,lgbm_test,0.732629,0.757902,0.072856,0.649729,0.13102,0.519947
0,catboost_train,0.878624,0.971476,0.198942,0.962066,0.329706,0.321011
0,catboost_test,0.852159,0.728649,0.090722,0.417344,0.149044,0.35739


In [16]:
# Создаем и обучаем линейную регрессию
model = LogisticRegression(class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [18]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))
res

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.740343,0.851627,0.089575,0.804064,0.161192,0.511775
0,lgbm_test,0.732629,0.757902,0.072856,0.649729,0.13102,0.519947
0,catboost_train,0.878624,0.971476,0.198942,0.962066,0.329706,0.321011
0,catboost_test,0.852159,0.728649,0.090722,0.417344,0.149044,0.35739
0,LG_train,0.667388,0.74173,0.062852,0.69873,0.11533,0.603157
0,LG_test,0.672307,0.740945,0.063355,0.693767,0.116106,0.599816


**Выводы**  
Среди выбранных baseline-моделей catboost показал большое переобучение, что можно нивилировать, подобрав гиперпараметры; lightgbm показал меньше переобучение и большое значение ROC-AUC на валидации; логистическая регрессия показала слишком низкие метрики и на трейне и на валидации, что может говорить о том, что линейная модель не подходит для данной задачи.
  
Так как в нашем случае матрица объект-признаки имеет большую размерность, lightgbm является более подходящей моделью для обучения на стационарном устройстве, так как сам алгоритм работает быстрее.

In [14]:
df = df.drop("pre_loans_total_overdue", axis=1)
df = df.set_index('id')

X = df.groupby(level=0).last()

for i in range(2, n+1):
    X = X.join(df.iloc[:,1:].groupby(level=0).nth(-i), rsuffix=f'_{i}', how='left')
    
X = X.fillna(0)

X_scaled = mms.transform(X)

KeyError: "['pre_loans_total_overdue'] not found in axis"

# Tune

In [9]:
import optuna
import scipy.stats as stats

In [12]:
def objective_gpt(trial: optuna.Trial, x, y):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 3),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.035745438823873034]),
        'max_bin': trial.suggest_int('max_bin', 100, 500),
        'num_leaves': trial.suggest_int('num_leaves', 10, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 70000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
        'objective': trial.suggest_categorical('objective', ['binary']),
        'metric': trial.suggest_categorical('metric', ['auc']),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'random_state': trial.suggest_categorical('random_state', [RAND]),
    }

    cv_pred = np.empty(N_FOLDS)
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x[train_idx, :], x[test_idx, :]
        y_train_, y_val_ = y[train_idx], y[test_idx]

#         ratio = y_train_[y_train_ == 0].shape[0] / \
#             y_train_[y_train_ == 1].shape[0]

        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight="balanced",
            n_jobs=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=0)

        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
    return (np.mean(cv_pred))

In [13]:
func = lambda trial: objective_gpt(trial, x_train, y_train.values)

study = optuna.create_study(direction="maximize")
study.optimize(func, n_trials=10, show_progress_bar=True, n_jobs=6)

[32m[I 2023-05-29 17:09:42,518][0m A new study created in memory with name: no-name-af6196ec-1eb6-47a5-8d7d-3c2aa556eb9e[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction'





In [15]:
study.optimize(func, n_trials=40, show_progress_bar=True, n_jobs=6)

  self._init_valid()


  0%|          | 0/40 [00:00<?, ?it/s]

  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),




In [16]:
study.best_params

{'n_estimators': 1000,
 'learning_rate': 0.03574543882387304,
 'max_bin': 158,
 'num_leaves': 421,
 'max_depth': 16,
 'min_child_samples': 200,
 'lambda_l1': 84,
 'lambda_l2': 45,
 'min_split_gain': 0.0016972064854697496,
 'objective': 'binary',
 'metric': 'auc',
 'feature_fraction': 0.2057318339393228,
 'bagging_fraction': 0.8935140286078869,
 'bagging_freq': 5,
 'random_state': 42}

In [19]:
# Посчитаем микро- и макро- усреднение roc-auc 
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
roc_aucs = []
y_pred_hold = []
y_score_hold = []

for fold, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    x_train_, x_val_ = x_train[train_idx, :], x_train[test_idx, :]
    y_train_, y_val_ = y_train.values[train_idx], y_train.values[test_idx]

#     ratio = y_train_[y_train_ == 0].shape[0] / \
#         y_train_[y_train_ == 1].shape[0]

    model = LGBMClassifier(**study.best_params, class_weight='balanced', n_jobs=-1)
    model.fit(x_train_, y_train_,
              eval_metric='auc',
              eval_set=[(x_val_, y_val_)],
              early_stopping_rounds=100,
              verbose=0)

    y_score = model.predict_proba(x_val_)

    roc_aucs.append(roc_auc_score(y_val_, y_score[:, 1]))

    y_pred_hold.append(model.predict(x_test))
    y_score_hold.append(model.predict_proba(x_test))

y_pred = stats.mode(np.column_stack(y_pred_hold), axis=1)[0]
y_score = np.column_stack([np.mean(np.column_stack(y_score_hold)[:, ::2], axis=1),
                           np.mean(np.column_stack(y_score_hold)[:, 1::2], axis=1)])





















  y_pred = stats.mode(np.column_stack(y_pred_hold), axis=1)[0]


In [20]:
print(np.mean(roc_aucs))
print(roc_auc_score(y_test, y_score[:, 1]))

0.7587526699357604
0.7652572609410099


In [21]:
# Обучаем модель для сохранения
model = LGBMClassifier(**study.best_params, class_weight='balanced', n_jobs=-1)
model.fit(x_train, y_train)



In [23]:
# Создаем словарь с моделью и MinMaxScaler'ом
models = {}

models['lightgbm'] = model
models['mms'] = mms

In [24]:
# Сохраняем словарь с моделями в joblib файл
models_path = '../models.joblib'
joblib.dump(models, models_path)

['../models.joblib']