In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import warnings
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')
import lightgbm as lgb
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


**Кредитный скоринг** является важной банковской задачей.

Стандартный подход заключается в построении классических моделей машинного обучения, таких как логистическая регрессия и градиентный бустинг на табличных данных, в том числе с использованием *агрегирования* из некоторых последовательных данных, таких как истории транзакций клиентов.


### Описание полей:
- id - identifier of the application
- <span style = 'color : red'> **flag** </span> - target (целевая переменная)
- pre_since_opened - days from credit opening date to data collection date
- pre_since_confirmed - days from credit information confirmation date till data collection date
- pre_pterm - planned number of days from credit opening date to closing date
- pre_fterm - actual number of days from credit opening date to closing date
- pre_till_pclose - planned number of days from data collection date until loan closing date
- pre_till_fclose - actual number of days from data collection date until loan closing date
- pre_loans_credit_limit - credit limit
- pre_loans_next_pay_summ - amount of the next loan payment
- pre_loans_outstanding - outstanding loan amount
- pre_loans_total_overdue - current overdue amount
- pre_loans_max_overdue_sum - maximum overdue amount
- pre_loans_credit_cost_rate - total cost of credit
- pre_loans5 - number of delinquencies of up to 5 days
- pre_loans530 - number of delinquencies from 5 to 30 days
- pre_loans3060 - number of delinquencies from 30 to 60 days
- pre_loans6090 - number of delinquencies from 60 to 90 days
- pre_loans90 - number of delinquencies of more than 90 days
- is_zero_loans_5 - flag: no delinquencies of up to 5 days
- is_zero_loans_530 - flag: no delinquencies of 5 to 30 days
- is_zero_loans_3060 - flag: no delinquencies of 30 to 60 days
- is_zero_loans_6090 - flag: no delinquencies of 60 to 90 days
- is_zero_loans90 - flag: no delinquencies of more than 90 days
- pre_util - ratio of outstanding loan amount to credit limit
- pre_over2limit - ratio of currently overdue debt to credit limit
- pre_maxover2limit - ratio of maximum overdue debt to credit limit
- is_zero_util - flag: ratio of outstanding loan amount to credit limit equals 0
- is_zero_over2limit - flag: ratio of current overdue debt to credit limit equals 0
- is_zero_maxover2limit - flag: ratio of maximum overdue debt to credit limit equals 0
- **<span style = 'color : blue'> enc_paym_{0…n} </span>** - monthly payment statuses of the last n months
- enc_loans_account_holder_type - type of relation to the loan
- enc_loans_credit_status - credit status
- enc_loans_account_cur - currency of the loan
- enc_loans_credit_type - credit type
- pclose_flag - flag: planned number of days from opening date to closing date of the loan
- fclose_flag - flag: actual number of days from credit opening date to closing date undefined

# Open files

In [2]:
df_train = pd.read_parquet('P03_train.pq')
df_test = pd.read_parquet('P03_test.pq')
combine = [df_train, df_test]

In [3]:
df_train.to_csv('df_train.csv', index=False)

In [4]:
df_train.head()

Unnamed: 0,id,flag,pre_since_opened,pre_since_confirmed,pre_pterm,pre_fterm,pre_till_pclose,pre_till_fclose,pre_loans_credit_limit,pre_loans_next_pay_summ,...,enc_paym_21,enc_paym_22,enc_paym_23,enc_paym_24,enc_loans_account_holder_type,enc_loans_credit_status,enc_loans_credit_type,enc_loans_account_cur,pclose_flag,fclose_flag
0,41794,0,3,9,16,9,15,14,16,3,...,3,3,3,4,1,3,4,1,0,0
1,160704,0,9,6,0,1,0,4,1,3,...,3,3,3,4,1,2,3,1,0,0
2,41771,0,12,16,14,8,10,11,10,2,...,3,3,3,4,1,2,3,1,0,1
3,160727,0,7,9,17,16,14,8,14,3,...,3,3,3,4,1,2,4,1,0,0
4,160729,0,9,9,4,8,1,11,10,3,...,3,3,3,4,1,2,3,1,1,1


In [5]:
df_train.enc_loans_account_holder_type

0         1
1         1
2         1
3         1
4         1
         ..
174995    1
174996    1
174997    1
174998    1
174999    1
Name: enc_loans_account_holder_type, Length: 175000, dtype: int64

In [6]:

def safe_div(a, b):
    return np.where(b == 0, 0, a / b)

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    money_cols = [
        'pre_loans_credit_limit',
        'pre_loans_next_pay_summ',
        'pre_loans_outstanding'
    ]

    for c in money_cols:
        if c in df.columns:
            df[f'log_{c}'] = np.log1p(df[c])

    df['utilization'] = safe_div(
        df['pre_loans_outstanding'],
        df['pre_loans_credit_limit']
    )

    df['next_payment_ratio'] = safe_div(
        df['pre_loans_next_pay_summ'],
        df['pre_loans_credit_limit']
    )

    df['term_diff'] = df['pre_fterm'] - df['pre_pterm']
    df['early_close_flag'] = (df['term_diff'] < 0).astype(int)


    delinquency_cols = [
        'pre_loans5',
        'pre_loans530',
        'pre_loans3060',
        'pre_loans6090',
        'pre_loans90'
    ]

    df['total_delinquencies'] = df[delinquency_cols].sum(axis=1)

    df['ever_30_plus'] = (
        df[['pre_loans3060', 'pre_loans6090', 'pre_loans90']]
        .sum(axis=1) > 0
    ).astype(int)

    df['worst_delinquency'] = (
        (df['pre_loans90'] > 0) * 4 +
        (df['pre_loans6090'] > 0) * 3 +
        (df['pre_loans3060'] > 0) * 2 +
        (df['pre_loans530'] > 0) * 1
    )

    paym_cols = sorted(
        [c for c in df.columns if c.startswith('enc_paym_')],
        key=lambda x: int(x.split('_')[-1])
    )

    if paym_cols:
        paym = df[paym_cols]

        weights = np.linspace(1.0, 0.3, len(paym_cols))
        df['paym_decay'] = (paym * weights).sum(axis=1) / weights.sum()

        df['paym_recent_max'] = paym.iloc[:, -3:].max(axis=1)

        df['paym_bad_share'] = (paym >= 3).mean(axis=1)

        df['paym_roll_mean6'] = paym.iloc[:, -6:].mean(axis=1)


    df['util_x_delinquency'] = df['utilization'] * df['total_delinquencies']
    df['nextpay_x_delinquency'] = df['next_payment_ratio'] * df['total_delinquencies']
    df['paym_x_worst'] = df.get('paym_decay', 0) * df['worst_delinquency']

    flag_cols = [c for c in df.columns if c.startswith('is_zero_')]
    for c in flag_cols:
        df[c] = df[c].astype(int)

    return df


In [7]:
df_train1 = (feature_engineering(df_train))
df_test1 = (feature_engineering(df_test))

In [32]:
df_test1.shape

(75000, 77)

In [8]:
df_train1.shape

(175000, 78)

In [9]:
X = df_train1.drop(columns=['id', 'flag'])
y = df_train1['flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
def objective(trial):
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "class_weight": "balanced",
        "verbosity": -1,
        "seed": 42,
        "force_row_wise": True
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []


    categorical_features = [
        col for col in X.columns if col.startswith('enc_') or col.endswith('_flag') or X[col].nunique() < 10
    ]
    X[categorical_features] = X[categorical_features].astype('category')

    for train_index, valid_index in skf.split(X, y):
        X_train = X.iloc[train_index]
        X_test = X.iloc[valid_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[valid_index]

        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        valid_data = lgb.Dataset(X_test, label=y_test, categorical_feature=categorical_features, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=2000,
            valid_sets=[valid_data],
            valid_names=["valid"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False)
            ]
        )

        preds = model.predict(X_test, num_iteration=model.best_iteration)
        auc = roc_auc_score(y_test, preds)
        auc_scores.append(auc)

    return np.mean(auc_scores)

In [11]:
sampler = optuna.samplers.TPESampler(
    multivariate=True,
    seed=42
)


study = optuna.create_study(
    direction="maximize",
    sampler=sampler
)

study.optimize(objective, n_trials=50)

print("Best AUC:", study.best_value)
print("Best params:", study.best_params)

[I 2026-01-26 13:55:29,683] A new study created in memory with name: no-name-40ffe376-271d-4111-9f9e-9191782f70e9
[I 2026-01-26 13:56:11,287] Trial 0 finished with value: 0.7097869361868793 and parameters: {'learning_rate': 0.015355286838886862, 'num_leaves': 245, 'max_depth': 16, 'min_child_samples': 64, 'feature_fraction': 0.6624074561769746, 'bagging_fraction': 0.662397808134481, 'bagging_freq': 1, 'lambda_l1': 4.330880728874676, 'lambda_l2': 3.005575058716044}. Best is trial 0 with value: 0.7097869361868793.
[I 2026-01-26 13:56:31,198] Trial 1 finished with value: 0.7108453404899766 and parameters: {'learning_rate': 0.04170553216181044, 'num_leaves': 24, 'max_depth': 20, 'min_child_samples': 85, 'feature_fraction': 0.6849356442713105, 'bagging_fraction': 0.6727299868828402, 'bagging_freq': 2, 'lambda_l1': 1.5212112147976886, 'lambda_l2': 2.6237821581611893}. Best is trial 1 with value: 0.7108453404899766.
[I 2026-01-26 13:57:11,832] Trial 2 finished with value: 0.7109183908049564 a

Best AUC: 0.7126424836584881
Best params: {'learning_rate': 0.010957094328361092, 'num_leaves': 27, 'max_depth': 10, 'min_child_samples': 25, 'feature_fraction': 0.6303335400803862, 'bagging_fraction': 0.7762335301514522, 'bagging_freq': 7, 'lambda_l1': 1.7798511802159651, 'lambda_l2': 2.3711021922848037}


In [12]:
params = dict(**study.best_params)
categorical_features = [
    c for c in X.columns
    if c.startswith("enc_") or c.endswith("_flag")
]

for c in categorical_features:
    X[c] = X[c].astype("category")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
models = []
auc_scores = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y), 1):
    print(f"Fold {fold}")

    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    train_data = lgb.Dataset(
        X_train,
        label=y_train,
        categorical_feature=categorical_features
    )

    valid_data = lgb.Dataset(
        X_valid,
        label=y_valid,
        categorical_feature=categorical_features,
        reference=train_data
    )

    model = lgb.train(
        params,
        train_data,
        num_boost_round=3000,
        valid_sets=[valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False)
        ]
    )

    preds = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_preds[valid_idx] = preds

    fold_auc = roc_auc_score(y_valid, preds)
    auc_scores.append(fold_auc)

    print(f"Fold AUC: {fold_auc:.5f}")
    models.append(model)

print("------------------------------------")
print(f"Mean AUC: {np.mean(auc_scores):.5f}")



Fold 1
Fold AUC: 0.70454
Fold 2
Fold AUC: 0.71295
Fold 3
Fold AUC: 0.72263
Fold 4
Fold AUC: 0.70144
Fold 5
Fold AUC: 0.71645
------------------------------------
Mean AUC: 0.71160


In [52]:
df_test1 = (feature_engineering(df_test))
test = df_test1.drop(['id'], axis = 1)
categorical_features_test = [
    c for c in test.columns if c.startswith("enc_") or c.endswith("_flag") or test[c].nunique() < 10
]
categorical_features_test.append('pre_loans530')
for c in categorical_features_test:
    test[c] = test[c].astype("category")

In [53]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 76 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   pre_since_opened               75000 non-null  int64   
 1   pre_since_confirmed            75000 non-null  int64   
 2   pre_pterm                      75000 non-null  int64   
 3   pre_fterm                      75000 non-null  int64   
 4   pre_till_pclose                75000 non-null  int64   
 5   pre_till_fclose                75000 non-null  int64   
 6   pre_loans_credit_limit         75000 non-null  int64   
 7   pre_loans_next_pay_summ        75000 non-null  category
 8   pre_loans_outstanding          75000 non-null  category
 9   pre_loans_total_overdue        75000 non-null  category
 10  pre_loans_max_overdue_sum      75000 non-null  category
 11  pre_loans_credit_cost_rate     75000 non-null  int64   
 12  pre_loans5                     7

In [54]:
train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_features)

final_model = lgb.train(
    params,
    train_data,
    num_boost_round=3000,
    valid_sets=[train_data],
    callbacks=[lgb.early_stopping(stopping_rounds=200)]
)


In [55]:
cols_diff = X.select_dtypes('category').columns.difference(test.select_dtypes('category').columns)
print(cols_diff)


Index([], dtype='object')


In [56]:
len(test.select_dtypes('category').columns) 

54

In [57]:

test_preds_proba = final_model.predict(test, num_iteration=final_model.best_iteration)

threshold = 0.3
test_preds_label = (test_preds_proba >= threshold).astype(int)

df_test1["pred_proba"] = test_preds_proba
df_test1["pred_label"] = test_preds_label


In [58]:
submission_int = pd.DataFrame(
    {
        "id": df_test1["id"],
        "flag": df_test1["pred_label"]
    }
)
submission_proba = pd.DataFrame(
    {
        "id": df_test1["id"],
        "flag": df_test1["pred_proba"]
    }
)

In [59]:
submission_int.head()

Unnamed: 0,id,flag
0,225096,0
1,220040,0
2,26535,0
3,77272,0
4,193584,0


In [60]:
submission_proba.head()

Unnamed: 0,id,flag
0,225096,0.01074
1,220040,0.053932
2,26535,0.069408
3,77272,0.181092
4,193584,0.021252


In [61]:
submission_int.to_csv('submission_int.csv', index=False)
submission_proba.to_csv('submission_proba.csv', index=False)

In [62]:
modellr = LogisticRegression(random_state=42)
X1 = df_train.drop(columns=['id', 'flag'])
y1 = df_train['flag']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)
modellr.fit(X_train1, y_train1)
accuracy = modellr.score(X_test1, y_test1)
print("Accuracy:", accuracy)
print("AUC:", roc_auc_score(y_test1, modellr.predict_proba(X_test1)[:, 1]))
print(classification_report(y_test1, modellr.predict(X_test1)))

Accuracy: 0.9693714285714286
AUC: 0.6653454929358746
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     33928
           1       0.00      0.00      0.00      1072

    accuracy                           0.97     35000
   macro avg       0.48      0.50      0.49     35000
weighted avg       0.94      0.97      0.95     35000

