In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, \
                            recall_score, f1_score, log_loss

from tqdm.notebook import tqdm

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import joblib


RAND=42
N_FOLDS = 5

In [2]:
def get_metrics(y_test, y_pred, y_score, name = "Default"):
    """Метрики для задачи классификации"""
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Accuracy'] = accuracy_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['Logloss'] = log_loss(y_test, y_score)

    return df_metrics

# Feature engineering

In [3]:
filepath = "../data/transformed/train_data/train_data_0.pq.0"
X_0 = pd.read_parquet(filepath).set_index("id")
X_0[:5]

Unnamed: 0_level_0,enc_paym_23_2,pre_maxover2limit_2,pre_loans6090_1,enc_loans_credit_status_5,pre_fterm_11,pre_loans_outstanding_2,enc_loans_credit_type_6,pre_till_pclose_1,is_zero_util_1,pre_loans90_13,...,enc_paym_1_3,enc_loans_account_holder_type_2,pre_maxover2limit_11,pre_till_pclose_5,pre_since_opened_0,pre_since_opened_11,pre_since_opened_14,enc_paym_17_2,pre_pterm_16,rn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,1,0,1,6,0,...,0,0,0,0,0,0,0,0,0,10
1,0,0,0,0,1,2,0,1,10,0,...,3,0,0,0,0,0,3,0,0,14
2,0,0,0,0,0,1,0,2,1,0,...,1,0,1,0,0,0,0,0,0,3
3,0,0,0,0,1,1,0,5,8,0,...,1,0,0,0,0,0,0,0,3,15
4,0,0,0,0,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [37]:
filepath = "../data/transformed/train_data/train_data_0.pq.1"
X_1 = pd.read_parquet(filepath).set_index("id")
X_1[:5]

Unnamed: 0_level_0,pre_since_confirmed_12,pre_loans3060_8,pre_loans_credit_limit_17,pre_since_confirmed_1,enc_loans_account_holder_type_6,pre_loans5_6,is_zero_over2limit_0,pre_util_19,enc_paym_9_1,pre_loans5_13,...,pre_loans530_11,pre_maxover2limit_1,pclose_flag_1,enc_paym_0_0,enc_paym_22_1,is_zero_loans6090_1,enc_paym_8_1,pre_till_fclose_13,pre_over2limit_1,rn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
125000,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
125001,0,0,0,0,0,3,0,0,2,0,...,0,0,0,0,0,2,2,0,0,3
125002,0,0,0,0,0,9,0,0,0,0,...,0,0,1,0,0,9,0,0,0,9
125003,0,0,0,0,0,6,0,0,0,0,...,0,0,1,0,0,6,0,0,0,6
125004,0,0,0,0,0,5,0,0,0,0,...,0,0,1,0,0,5,0,0,0,5


In [38]:
X = pd.concat([X_0, X_1])

In [41]:
X.shape

(250000, 420)

In [39]:
target_path = "../data/raw/train_target.csv"
Y = pd.read_csv(target_path, nrows=250000, index_col="id")
Y[:5]

Unnamed: 0_level_0,flag
id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [40]:
Y.shape

(250000, 1)

In [42]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, 
                                                    stratify=Y, 
                                                    random_state=RAND)

# Baseline models

In [43]:
# Создаем и обучаем бейслайн LightGBM
model = LGBMClassifier(n_estimators=1000, class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [44]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [45]:
res = get_metrics(y_train, y_pred_tr, y_score_tr, name="lgbm_train")
res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))
res

  res = res.append(get_metrics(y_test, y_pred, y_score, name="lgbm_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.91342,0.987436,0.259622,0.971392,0.409735,0.238035
0,lgbm_test,0.87766,0.679152,0.079809,0.280543,0.124266,0.299737


In [46]:
ratio = y_train[y_train==0].shape[0] / y_train[y_train==1].shape[0]

# Создаем и обучаем бейслайн Catboost
model = CatBoostClassifier(scale_pos_weight=ratio, random_state=RAND, verbose=0)
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f60eb97d960>

In [47]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [48]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))
res

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, name="catboost_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, name="catboost_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.91342,0.987436,0.259622,0.971392,0.409735,0.238035
0,lgbm_test,0.87766,0.679152,0.079809,0.280543,0.124266,0.299737
0,catboost_train,0.97118,0.842539,0.997647,0.068531,0.128252,0.106406
0,catboost_test,0.96886,0.7533,0.083333,0.000646,0.001283,0.125227


In [49]:
model = RandomForestClassifier(random_state=RAND, verbose=0)
model.fit(x_train, y_train)

  model.fit(x_train, y_train)


In [50]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [51]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))
res

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.91342,0.987436,0.259622,0.971392,0.409735,0.238035
0,lgbm_test,0.87766,0.679152,0.079809,0.280543,0.124266,0.299737
0,catboost_train,0.97118,0.842539,0.997647,0.068531,0.128252,0.106406
0,catboost_test,0.96886,0.7533,0.083333,0.000646,0.001283,0.125227
0,LG_train,0.999465,0.999992,0.998851,0.983837,0.991287,0.025353
0,LG_test,0.96886,0.705033,0.222222,0.002586,0.005112,0.200392


In [52]:
# Создаем и обучаем линейную регрессию
model = LogisticRegression(class_weight='balanced', random_state=RAND)
model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [53]:
y_pred_tr = model.predict(x_train)
y_score_tr = model.predict_proba(x_train)

y_pred = model.predict(x_test)
y_score = model.predict_proba(x_test)

In [54]:
res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))
res

  res = res.append(get_metrics(y_train, y_pred_tr, y_score_tr, "LG_train"))
  res = res.append(get_metrics(y_test, y_pred, y_score, "LG_test"))


Unnamed: 0,model,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
0,lgbm_train,0.91342,0.987436,0.259622,0.971392,0.409735,0.238035
0,lgbm_test,0.87766,0.679152,0.079809,0.280543,0.124266,0.299737
0,catboost_train,0.97118,0.842539,0.997647,0.068531,0.128252,0.106406
0,catboost_test,0.96886,0.7533,0.083333,0.000646,0.001283,0.125227
0,LG_train,0.999465,0.999992,0.998851,0.983837,0.991287,0.025353
0,LG_test,0.96886,0.705033,0.222222,0.002586,0.005112,0.200392
0,LG_train,0.69241,0.761734,0.068636,0.711492,0.125196,0.592374
0,LG_test,0.68754,0.746294,0.065984,0.691661,0.120475,0.60003


**Выводы**  
Среди выбранных baseline-моделей catboost показал большое переобучение, что можно нивилировать, подобрав гиперпараметры; lightgbm показал меньше переобучение и большое значение ROC-AUC на валидации; логистическая регрессия показала слишком низкие метрики и на трейне и на валидации, что может говорить о том, что линейная модель не подходит для данной задачи.
  
Так как в нашем случае матрица объект-признаки имеет большую размерность, lightgbm является более подходящей моделью для обучения на стационарном устройстве, так как сам алгоритм работает быстрее.

# Tune

In [8]:
import optuna
import scipy.stats as stats

In [9]:
def objective_gpt(trial: optuna.Trial, x, y):
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 3),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.01752291849636367]),
        'max_bin': trial.suggest_int('max_bin', 10, 500),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 100, 70000, step=100),
        'lambda_l1': trial.suggest_int('lambda_l1', 0, 100),
        'lambda_l2': trial.suggest_int('lambda_l2', 0, 100),
        'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
        'objective': trial.suggest_categorical('objective', ['binary']),
        'metric': trial.suggest_categorical('metric', ['auc']),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'random_state': trial.suggest_categorical('random_state', [RAND]),
    }

    cv_pred = np.empty(N_FOLDS)
    cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

    for fold, (train_idx, test_idx) in enumerate(cv.split(x, y)):
        x_train_, x_val_ = x[train_idx, :], x[test_idx, :]
        y_train_, y_val_ = y[train_idx], y[test_idx]

#         ratio = y_train_[y_train_ == 0].shape[0] / \
#             y_train_[y_train_ == 1].shape[0]

        pruning = optuna.integration.LightGBMPruningCallback(trial, 'auc')

        model = LGBMClassifier(
            class_weight="balanced",
            n_jobs=-1,
            **params
        )
        model.fit(x_train_, y_train_,
                  eval_metric='auc',
                  eval_set=[(x_val_, y_val_)],
                  early_stopping_rounds=100,
                  callbacks=[pruning],
                  verbose=0)

        y_pred = model.predict(x_val_)
        y_proba = model.predict_proba(x_val_)[:, 1]

        cv_pred[fold] = roc_auc_score(y_val_, y_proba)
    return (np.mean(cv_pred))

In [10]:
func = lambda trial: objective_gpt(trial, x_train, y_train.values)

study = optuna.create_study(direction="maximize")
study.optimize(func, n_trials=10, show_progress_bar=True, n_jobs=4)

[32m[I 2023-06-07 19:11:33,709][0m A new study created in memory with name: no-name-46a24ead-ed5b-4d9b-8764-8dec1a3d0a97[0m
  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),









/home/sergey/virtual_env/lib/python3.10/site-packages/optuna/trial/_trial.py:490: User

In [None]:
study.optimize(func, n_trials=10, show_progress_bar=True, n_jobs=4)

  self._init_valid()


  0%|          | 0/10 [00:00<?, ?it/s]

  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),


[32m[I 2023-06-07 19:29:10,529][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m


  'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-4, 1e-1),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.2, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.2, 1.0),
/ho

In [20]:
study.best_params

{'n_estimators': 1000, 'learning_rate': 0.01752291849636367}

In [27]:
# Посчитаем микро- и макро- усреднение roc-auc 
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
roc_aucs = []
y_pred_hold = []
y_score_hold = []

for fold, (train_idx, test_idx) in enumerate(cv.split(x_train, y_train)):
    x_train_, x_val_ = x_train[train_idx, :], x_train[test_idx, :]
    y_train_, y_val_ = y_train.values[train_idx], y_train.values[test_idx]

#     ratio = y_train_[y_train_ == 0].shape[0] / \
#         y_train_[y_train_ == 1].shape[0]

    model = LGBMClassifier(**study.best_params, class_weight='balanced', n_jobs=-1)
    model.fit(x_train_, y_train_,
              eval_metric='auc',
              eval_set=[(x_val_, y_val_)],
              early_stopping_rounds=100,
              verbose=0)

    y_score = model.predict_proba(x_val_)

    roc_aucs.append(roc_auc_score(y_val_, y_score[:, 1]))

    y_pred_hold.append(model.predict(x_test))
    y_score_hold.append(model.predict_proba(x_test))

y_pred = stats.mode(np.column_stack(y_pred_hold), axis=1)[0]
y_score = np.column_stack([np.mean(np.column_stack(y_score_hold)[:, ::2], axis=1),
                           np.mean(np.column_stack(y_score_hold)[:, 1::2], axis=1)])





















  y_pred = stats.mode(np.column_stack(y_pred_hold), axis=1)[0]


In [28]:
print(np.mean(roc_aucs))
print(roc_auc_score(y_test, y_score[:, 1]))

0.756945243801509
0.7553409536969106


In [29]:
params = study.best_params

joblib.dump(params, "../data/params.joblib")

['../data/params.joblib']

# Out-of-core training

In [4]:
from scrypt import out_of_core_train

In [5]:
train_path = "../data/transformed/train_data/"
target_path = "../data/raw/train_target.csv"

# params = joblib.load("../data/params.joblib")

In [6]:
model = out_of_core_train(train_path, target_path)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


  0%|          | 0/11 [00:00<?, ?it/s]

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.clas

In [7]:
# Создаем словарь с моделью и MinMaxScaler'ом
models = {}

models['lightgbm'] = model

In [7]:
# Сохраняем словарь с моделями в joblib файл
models_path = '../data/models.joblib'
joblib.dump(models, models_path)

['../data/models.joblib']