In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import pandas as pd
import numpy as np
from scipy import special, stats

from optbinning import OptimalBinning

from sklearn.model_selection import StratifiedKFold, train_test_split
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import sklearn as sci_kit

from lightgbm import LGBMClassifier
import lightgbm as lgb

import optuna

from itertools import combinations

from tqdm.auto import tqdm

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from shap import TreeExplainer

# import random

# import cloudpickle

import warnings
warnings.filterwarnings('ignore')

In [3]:
print('Pandas version:', pd.__version__)
print('Numpy version:', np.__version__)
# print('Scikit-learn version:', sci_kit.__version__)
# print('LightGBM version:', lgb.__version__)
# print('Optuna version:', optuna.__version__)

Pandas version: 2.2.3
Numpy version: 2.2.5


# Configuration

In [4]:
class CONFIG:
    TARGET_COLUMN = 'target'
    TEST_SIZE = 0.25
    TRAIN_SIZE = 1 - TEST_SIZE

    RANDOM_STATE = 42
    N_FOLDS = 5

# Additional functions

## PSI

In [5]:
def count_psi(current, previous):
    return np.sum((current - previous) * np.log(current / previous))

def custom_psi_check(df_orig, col, year_current=2017, year_previous=2016):
    df = df_orig.copy()
    df.set_index('month_period', inplace=True)
    psi_array = []
    for month in range(1, 13):
        ref = f'{year_previous}-{month:02d}'
        tag = f'{year_current}-{month:02d}'
        ref_tag_index = [ref, tag]
        df_r = df.loc[ref_tag_index]
        if df_r[col].dtype.name != 'category':
            quantiles = [-np.inf, ] + sorted(list(map(float, set(df_r.loc[ref][col].quantile([0.2, 0.4, 0.6, 0.8]).values))))+ [np.inf]
            df_r[f'{col}_cut'] = pd.cut(df_r[col], bins=quantiles)
            df_r[f'{col}_cut'] = df_r[f'{col}_cut'].cat.add_categories(['Missing']).fillna('Missing')
        else:
            if df_r[col].isna().sum() > 0:
                df_r[f'{col}_cut'] = df_r[col].astype('object').fillna('Missing').astype('category')
            else:
                df_r[f'{col}_cut'] = df_r[col].copy()
        
        df_r = df_r.reset_index()
        count_table = df_r.groupby(['month_period', f'{col}_cut']).count().unstack(-1)[col]
        devided_table = count_table.div(count_table.sum(axis=1), axis=0)
        devided_table.fillna(0, inplace=True)
        devided_table.replace(0, 1e-7, inplace=True)
        devided_table.sort_index(inplace=True)
        last_row = devided_table.iloc[0, :]
        psi_array.append(float(devided_table.apply(lambda x: count_psi(last_row, x), axis=1).loc[tag]))
    return psi_array
    

## Correlation

In [6]:
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2, p_value, _, _ = stats.chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2_corrected = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corr = r - ((r - 1) ** 2) / (n - 1)
    k_corr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2_corrected / min(r_corr - 1, k_corr - 1)), p_value

def check_corr(df, mode='pearson'):
    if mode == 'pearson':
        df_ = df.select_dtypes('number')
        correlation_matrix = pd.DataFrame(
            np.ones((len(df_.columns), len(df_.columns))), 
            index=df_.columns, 
            columns=df_.columns
        )
        for (x, X), (y, Y) in combinations(df_.items(), 2):
            mask = (~X.isna()) & (~Y.isna())
            Xc = X[mask]
            Yc = Y[mask]
            r_obs, p_value = stats.pearsonr(Xc, Yc)
            if p_value < 0.05:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = r_obs
            else:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = 0
        return correlation_matrix
    elif mode == 'spearman':
        df_ = df.select_dtypes('number')
        correlation_matrix = pd.DataFrame(
            np.ones((len(df_.columns), len(df_.columns))), 
            index=df_.columns, 
            columns=df_.columns
        )
        for (x, X), (y, Y) in combinations(df_.items(), 2):
            mask = (~X.isna()) & (~Y.isna())
            Xc = X[mask]
            Yc = Y[mask]
            r_obs, p_value = stats.spearmanr(Xc, Yc)
            if p_value < 0.05:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = r_obs
            else:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = 0
        return correlation_matrix
    elif mode == 'cramer_v':
        df_ = df.select_dtypes(['object', 'category'])
        correlation_matrix = pd.DataFrame(
            np.ones((len(df_.columns), len(df_.columns))), 
            index=df_.columns, 
            columns=df_.columns
        )
        for (x, X), (y, Y) in combinations(df_.items(), 2):
            mask = (~X.isna()) & (~Y.isna())
            Xc = X[mask]
            Yc = Y[mask]
            cramer_v, p_value = cramers_v(Xc, Yc)
            if p_value < 0.05:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = cramer_v
            else:
                correlation_matrix.loc[x, y] = correlation_matrix.loc[y, x] = 0
        return correlation_matrix
    

## Shap explainer

In [7]:
def _shap_importances(model, X):
    explainer = TreeExplainer(
        model,
        feature_perturbation='tree_path_dependent',
    )
    coefs = explainer.shap_values(X)

    if isinstance(coefs, list):
        coefs = list(map(lambda x: np.abs(x).mean(0), coefs))
        coefs = np.sum(coefs, axis=0)
    else:
        coefs = np.abs(coefs).mean(0)

    return coefs


# Data

In [8]:
data = pd.read_parquet('loans_individual_data.parquet')

In [9]:
data['issue_d'] = pd.to_datetime(data['issue_d'], format='%b-%Y')

In [10]:
data['id'] = data['id'].astype(int)

data['earliest_cr_line'] = pd.to_datetime(data['earliest_cr_line'], format='%b-%Y')
data['last_pymnt_d'] = pd.to_datetime(data['last_pymnt_d'], format='%b-%Y')
data['last_credit_pull_d'] = pd.to_datetime(data['last_credit_pull_d'], format='%b-%Y')

data['int_rate%']= pd.to_numeric(data['int_rate'].str.strip('%'))
data['revol_util%'] = pd.to_numeric(data['revol_util'].str.strip('%'))

columns_to_drop = [
    'Unnamed: 0', 'title', 'zip_code', 'pymnt_plan', 'emp_title', 'int_rate', 'revol_util', 'url', 
    'fico_range_high', 'fico_range_low', 'grade', 'sub_grade', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv'
]
data.drop(columns=columns_to_drop, inplace=True)

data['term_36_months'] = np.where(data['term'] == '36 months',1,0)
data.drop(columns=['term', ], inplace=True)

data['emp_length'].fillna('', inplace=True)
data['emp_length'] = pd.to_numeric(data['emp_length'].str.replace('<', '', regex=False).str[:2].str.strip(), errors='coerce')

data = data.apply(lambda col: col.str.strip().astype('category') if col.dtypes == 'object' else col)

loan_status_mapping = {
    'Fully Paid': 0,
    'Current': 0,
    'In Grace Period': 0,
    'Late (16-30 days)': 0,
    'Late (31-120 days)': 1,
    'Charged Off': 1,
    'Default': 1
}

data[CONFIG.TARGET_COLUMN] = data['loan_status'].map(loan_status_mapping)
data.drop(columns=['loan_status', ], inplace=True)

In [11]:
data['month_period'] = data['issue_d'].dt.to_period('M').astype(str)

In [12]:
fig = make_subplots(specs=[[{"secondary_y": True, }, ], ])
shape_df = data.groupby('month_period')[CONFIG.TARGET_COLUMN].count().reset_index()
target_df = data.groupby('month_period')[CONFIG.TARGET_COLUMN].mean().reset_index()

target_plot = px.line(x=target_df['month_period'], y=target_df[CONFIG.TARGET_COLUMN], markers=True, title='Target')
shape_plot = px.bar(x=shape_df['month_period'], y=shape_df[CONFIG.TARGET_COLUMN], opacity=0.5, title='Shape')

target_plot.update_traces(yaxis='y2')

fig.add_traces(target_plot.data + shape_plot.data)
fig.update_layout(title='Overview', yaxis=dict(title='Count of loans', ), yaxis2=dict(title='Target', ))

fig.show()

## Train/Test split

In [13]:
dev_data = data[data['issue_d'] < '2017-01-01']
oot_data = data[data['issue_d'] >= '2017-01-01']

In [14]:
train_data, test_data = train_test_split(dev_data, test_size=CONFIG.TEST_SIZE, random_state=CONFIG.RANDOM_STATE, stratify=dev_data[CONFIG.TARGET_COLUMN])
print(f'Train DR: {round(train_data[CONFIG.TARGET_COLUMN].mean(), 4)}, Test DR: {round(test_data[CONFIG.TARGET_COLUMN].mean(), 4)}')

Train DR: 0.1834, Test DR: 0.1834


In [15]:
CV_folds = list(StratifiedKFold(n_splits=CONFIG.N_FOLDS, shuffle=True, random_state=CONFIG.RANDOM_STATE).split(train_data, train_data[CONFIG.TARGET_COLUMN]))

In [16]:
all_features = list(data.columns)
all_features.remove(CONFIG.TARGET_COLUMN)
all_features.remove('id')
all_features.remove('month_period')
all_features.remove('issue_d')
all_features.remove('earliest_cr_line')
all_features.remove('last_pymnt_d')
all_features.remove('last_credit_pull_d')
all_features.remove('revol_util%')
all_features.remove('int_rate%')
all_features.remove('sec_app_fico_range_low')
all_features.remove('sec_app_fico_range_high')
all_features.remove('sec_app_earliest_cr_line')
all_features.remove('sec_app_inq_last_6mths')
all_features.remove('sec_app_mort_acc')
all_features.remove('sec_app_open_acc')
all_features.remove('sec_app_revol_util')
all_features.remove('sec_app_open_act_il')
all_features.remove('sec_app_num_rev_accts')
all_features.remove('sec_app_chargeoff_within_12_mths')
all_features.remove('sec_app_collections_12_mths_ex_med')
all_features.remove('revol_bal_joint')
all_features.remove('policy_code')
all_features.remove('application_type')
all_features.remove('annual_inc_joint')
all_features.remove('dti_joint')
all_features.remove('verification_status_joint')
all_features.remove('hardship_status')
all_features.remove('deferral_term')
all_features.remove('hardship_amount')
all_features.remove('hardship_start_date')
all_features.remove('hardship_end_date')
all_features.remove('payment_plan_start_date')
all_features.remove('hardship_length')
all_features.remove('hardship_dpd')
all_features.remove('hardship_loan_status')
all_features.remove('orig_projected_additional_accrued_interest')
all_features.remove('hardship_payoff_balance_amount')
all_features.remove('hardship_last_payment_amount')
all_features.remove('next_pymnt_d')
all_features.remove('tax_liens')
all_features.remove('hardship_flag')
all_features.remove('hardship_type')
all_features.remove('hardship_reason')
all_features.remove('debt_settlement_flag')
all_features.remove('last_fico_range_low')
all_features.remove('last_fico_range_high')
all_features.remove('last_pymnt_amnt')
all_features.remove('total_rec_prncp')
all_features.remove('recoveries')
all_features.remove('collection_recovery_fee')


# One-factor analysis

In [17]:
stats_df = pd.DataFrame()
iv_dict = {feature: 0 for feature in all_features}
imp_dict = {feature: 0 for feature in all_features}

for train_idx, val_idx in CV_folds:
    for feature in tqdm(all_features):
        model = LGBMClassifier(max_depth=3, importance_type='gain', n_estimators=1, random_state=CONFIG.RANDOM_STATE, verbose=-1, n_jobs=16, boosting_type='gbdt')
        model.fit(train_data[ [feature, ] ].iloc[train_idx], train_data[CONFIG.TARGET_COLUMN].iloc[train_idx])
        imp_dict[feature] += model.booster_.feature_importance(importance_type='gain')
        if train_data[feature].count() > 0:
            if train_data[feature].dtype in ['object', 'category']:
                optb = OptimalBinning(name=feature, dtype="categorical", min_prebin_size=0.05, monotonic_trend='auto_asc_desc', solver="cp", divergence="iv")
            else:
                optb = OptimalBinning(name=feature, dtype="numerical", min_prebin_size=0.05, monotonic_trend='auto_asc_desc', solver="cp", divergence="iv")
        
        optb.fit(train_data[feature].iloc[train_idx], train_data[CONFIG.TARGET_COLUMN].iloc[train_idx])
        x_bins = optb.splits
        
        if len(x_bins) > 0:
            if train_data[feature].dtype in ['object', 'category']:
                optb_cv = OptimalBinning(name=feature, dtype="categorical", min_prebin_size=0.05, monotonic_trend='auto_asc_desc', solver="cp", divergence="iv", user_splits=x_bins)
            else:
                optb_cv = OptimalBinning(name=feature, dtype="numerical", min_prebin_size=0.05, monotonic_trend='auto_asc_desc', solver="cp", divergence="iv", user_splits=x_bins)
            optb_cv.fit(train_data[feature].iloc[val_idx], train_data[CONFIG.TARGET_COLUMN].iloc[val_idx])
            iv_dict[feature] += optb_cv.binning_table.build()['IV'].loc['Totals']
        else:
            iv_dict[feature] += 0

iv_dict = {f: iv_dict[f] / CONFIG.N_FOLDS for f in all_features}
imp_dict = {f: imp_dict[f] / CONFIG.N_FOLDS for f in all_features}

stats_df = pd.DataFrame(index=all_features)
for feature in all_features:
    stats_df.loc[feature, 'imp'] = imp_dict[feature]
    stats_df.loc[feature, 'iv'] = iv_dict[feature] 

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

In [18]:
tmp_features_to_analyze = list(stats_df[(stats_df['iv'] > stats_df['iv'].quantile(0.5)) & (stats_df['imp'] > stats_df['imp'].quantile(0.5))].index)

In [19]:
resp_dict_psi = {}
for col in tqdm(tmp_features_to_analyze):
    resp_dict_psi[col] = custom_psi_check(data[[col, 'month_period']], col)

to_delete_by_psi = []
for col in tqdm(tmp_features_to_analyze):
    if np.quantile(resp_dict_psi[col], 0.75) > 0.2:
        to_delete_by_psi.append(col)

for col in to_delete_by_psi:
    tmp_features_to_analyze.remove(col)

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

In [20]:
pearson_corr_df = check_corr(train_data[tmp_features_to_analyze], mode='pearson')
spearman_corr_df = check_corr(train_data[tmp_features_to_analyze], mode='spearman')
cramer_v_df = check_corr(train_data[tmp_features_to_analyze], mode='cramer_v')

pearson_corr_df.to_csv('artifacts/pearson_corr_df.csv')
spearman_corr_df.to_csv('artifacts/spearman_corr_df.csv')
cramer_v_df.to_csv('artifacts/cramer_v_df.csv')

In [21]:
X = train_data[tmp_features_to_analyze].copy()

In [22]:
reward_dict = {feature: 0 for feature in tmp_features_to_analyze}

def mape(x, y):
    return np.abs(x - y) / max(x, y)

for (x, X), (y, Y) in tqdm(combinations(X.items(), 2)):
    x_iv = stats_df.loc[x, 'iv']
    y_iv = stats_df.loc[y, 'iv']
    
    x_imp = stats_df.loc[x, 'imp']
    y_imp = stats_df.loc[y, 'imp']

    voting_x = pd.Series([y_iv < x_iv, y_imp < x_imp])
    voting_y = pd.Series([x_iv < y_iv, x_imp < y_imp])

    reward = pd.Series([mape(x_iv, y_iv), mape(x_imp, y_imp)])

    reward_dict[x] += np.sum(voting_x * reward)
    reward_dict[y] += np.sum(voting_y * reward)

M = len(tmp_features_to_analyze)
for feature in tmp_features_to_analyze:
    reward_dict[feature] /= (M - 1)

0it [00:00, ?it/s]

In [23]:
X = train_data[tmp_features_to_analyze].copy()

In [24]:
to_drop_by_strength = set()

for (x, X), (y, Y) in tqdm(combinations(X.items(), 2)):
    if pd.api.types.is_numeric_dtype(X) and pd.api.types.is_numeric_dtype(Y):
        if (abs(pearson_corr_df.loc[x, y]) >= 0.5) or (abs(spearman_corr_df.loc[x, y]) >= 0.5):
            if reward_dict[x] > reward_dict[y]:
                to_drop_by_strength.add(y)
            else:
                to_drop_by_strength.add(x)

    if pd.api.types.is_categorical_dtype(X) and pd.api.types.is_categorical_dtype(Y):
        if cramer_v_df.loc[x, y] >= 0.5:
            if reward_dict[x] > reward_dict[y]:
                to_drop_by_strength.add(y)
            else:
                to_drop_by_strength.add(x)

for feature in to_drop_by_strength:
    tmp_features_to_analyze.remove(feature)


0it [00:00, ?it/s]

# Modeling

In [25]:
X_train, y_train = train_data[tmp_features_to_analyze].copy(), train_data[CONFIG.TARGET_COLUMN].copy()

lgbm_fi_df = pd.DataFrame()
shap_fi_df = pd.DataFrame()

In [26]:
const_params = {
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'extra_seed': CONFIG.RANDOM_STATE,
    'drop_seed': CONFIG.RANDOM_STATE,
    'random_state': CONFIG.RANDOM_STATE,
    'n_jobs': -1,
    'verbosity': -1,
    'deterministic': True,
    'force_col_wise': True,
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'importance_type': 'gain',
}

In [27]:
def objective_logloss(trial):
    global lgbm_fi_df, shap_fi_df

    dist_params = {
        'num_leaves': trial.suggest_int('num_leaves', 7, 24),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'cat_l2': trial.suggest_float('cat_l2', 1e-3, 100.0, log=True),
        'cat_smooth': trial.suggest_float('cat_smooth', 1e-3, 100.0, log=True),
    }

    unified_params = const_params | dist_params
    unified_params['objective'] = 'binary'

    auc_dict = {'train': [], 'val': []}

    for train_idx, val_idx in CV_folds:
        model = LGBMClassifier(**unified_params)
        model.fit(
            X_train.iloc[train_idx], 
            y_train.iloc[train_idx],
            eval_set=[(X_train.iloc[val_idx], y_train.iloc[val_idx])],
            eval_metric='auc',
            eval_names='val',
        )

        gain_importance = pd.DataFrame(np.array([list(model.feature_importances_), ]), columns=model.feature_name_)
        # shap_importance = pd.DataFrame(np.array([list(_shap_importances(model, X_train.iloc[val_idx])), ]), columns=model.feature_name_)

        lgbm_fi_df = pd.concat([lgbm_fi_df, gain_importance])
        # shap_fi_df = pd.concat([shap_fi_df, shap_importance])

        auc_dict['val'].append(np.mean(model.evals_result_['val']['auc']))

        print(f'AUC val: {round(np.mean(model.evals_result_["val"]["auc"]), 4)}')

    model = LGBMClassifier(**unified_params)
    model.fit(
        X_train,
        y_train
    )
    
    y_pred = model.predict_proba(X_train)[:, 1]

    gini_train = 2 * roc_auc_score(y_train, y_pred) - 1
    gini_val = 2 * np.mean(auc_dict['val']) - 1
    
    print(f'Gini train: {round(gini_train, 4)}, Gini val: {round(gini_val, 4)}')

    return (1 - gini_val) + 5 * abs(gini_val - gini_train)

In [28]:
sampler = optuna.samplers.TPESampler(seed=CONFIG.RANDOM_STATE)
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.HyperbandPruner(min_resource=10),
    sampler=sampler,
    study_name='lgbm_logloss'
)
study.optimize(objective_logloss, n_trials=10)

[I 2025-05-12 03:32:09,060] A new study created in memory with name: lgbm_logloss


AUC val: 0.8091
AUC val: 0.8087
AUC val: 0.806
AUC val: 0.8106
AUC val: 0.8088


[I 2025-05-12 03:33:16,156] Trial 0 finished with value: 0.5454666336707223 and parameters: {'num_leaves': 13, 'colsample_bytree': 0.9753571532049581, 'subsample': 0.8659969709057025, 'reg_alpha': 0.0024430162614261434, 'reg_lambda': 2.5361081166471375e-07, 'cat_l2': 0.0060252157362038605, 'cat_smooth': 0.0019517224641449498}. Best is trial 0 with value: 0.5454666336707223.


Gini train: 0.6499, Gini val: 0.6173
AUC val: 0.8106
AUC val: 0.8105
AUC val: 0.8079
AUC val: 0.812
AUC val: 0.8104


[I 2025-05-12 03:34:39,567] Trial 1 finished with value: 0.5875098024574263 and parameters: {'num_leaves': 22, 'colsample_bytree': 0.8005575058716043, 'subsample': 0.8540362888980227, 'reg_alpha': 1.5320059381854043e-08, 'reg_lambda': 5.3602947287282925, 'cat_l2': 14.528246637516036, 'cat_smooth': 0.011526449540315618}. Best is trial 0 with value: 0.5454666336707223.


Gini train: 0.6622, Gini val: 0.6205
AUC val: 0.8082
AUC val: 0.8077
AUC val: 0.8051
AUC val: 0.8099
AUC val: 0.8081


[I 2025-05-12 03:35:36,289] Trial 2 finished with value: 0.5056675583810986 and parameters: {'num_leaves': 10, 'colsample_bytree': 0.5917022549267169, 'subsample': 0.6521211214797689, 'reg_alpha': 0.00052821153945323, 'reg_lambda': 7.71800699380605e-05, 'cat_l2': 0.028585493941961918, 'cat_smooth': 1.1462107403425035}. Best is trial 2 with value: 0.5056675583810986.


Gini train: 0.6399, Gini val: 0.6156
AUC val: 0.8079
AUC val: 0.8075
AUC val: 0.805
AUC val: 0.8097
AUC val: 0.8078


[I 2025-05-12 03:36:33,374] Trial 3 finished with value: 0.4968607797697424 and parameters: {'num_leaves': 9, 'colsample_bytree': 0.6460723242676091, 'subsample': 0.6831809216468459, 'reg_alpha': 0.00012724181576752517, 'reg_lambda': 0.11656915613247415, 'cat_l2': 0.009962513222055111, 'cat_smooth': 0.3725393839578886}. Best is trial 3 with value: 0.4968607797697424.


Gini train: 0.6376, Gini val: 0.6152
AUC val: 0.8094
AUC val: 0.8094
AUC val: 0.8066
AUC val: 0.811
AUC val: 0.8094


[I 2025-05-12 03:37:37,961] Trial 4 finished with value: 0.5664264888308088 and parameters: {'num_leaves': 17, 'colsample_bytree': 0.5232252063599989, 'subsample': 0.8037724259507192, 'reg_alpha': 3.425445902633376e-07, 'reg_lambda': 3.850031979199519e-08, 'cat_l2': 55.51721685244726, 'cat_smooth': 67.32248920775338}. Best is trial 3 with value: 0.4968607797697424.


Gini train: 0.6553, Gini val: 0.6183
AUC val: 0.8101
AUC val: 0.8101
AUC val: 0.8074
AUC val: 0.8116
AUC val: 0.8101


[I 2025-05-12 03:38:48,723] Trial 5 finished with value: 0.6150876936119847 and parameters: {'num_leaves': 21, 'colsample_bytree': 0.6523068845866853, 'subsample': 0.5488360570031919, 'reg_alpha': 0.01439120761572808, 'reg_lambda': 9.148975058772307e-05, 'cat_l2': 0.00407559644007287, 'cat_smooth': 0.2991469302130216}. Best is trial 3 with value: 0.4968607797697424.


Gini train: 0.6667, Gini val: 0.6197
AUC val: 0.8071
AUC val: 0.8065
AUC val: 0.8039
AUC val: 0.8086
AUC val: 0.8068


[I 2025-05-12 03:39:46,370] Trial 6 finished with value: 0.48414585908226915 and parameters: {'num_leaves': 7, 'colsample_bytree': 0.954660201039391, 'subsample': 0.6293899908000085, 'reg_alpha': 0.009176996354542699, 'reg_lambda': 6.388511557344611e-06, 'cat_l2': 0.3984190594434688, 'cat_smooth': 0.5414413211338525}. Best is trial 6 with value: 0.48414585908226915.


Gini train: 0.6326, Gini val: 0.6132
AUC val: 0.8088
AUC val: 0.808
AUC val: 0.8057
AUC val: 0.8102
AUC val: 0.8084


[I 2025-05-12 03:40:48,062] Trial 7 finished with value: 0.4984275653579118 and parameters: {'num_leaves': 10, 'colsample_bytree': 0.9847923138822793, 'subsample': 0.8875664116805573, 'reg_alpha': 2.8542399074977594, 'reg_lambda': 1.1309571585271492, 'cat_l2': 0.9761125443110454, 'cat_smooth': 40.67908494359547}. Best is trial 6 with value: 0.48414585908226915.


Gini train: 0.6395, Gini val: 0.6165
AUC val: 0.8075
AUC val: 0.807
AUC val: 0.8044
AUC val: 0.8091
AUC val: 0.8072


[I 2025-05-12 03:41:41,649] Trial 8 finished with value: 0.4886484412210448 and parameters: {'num_leaves': 8, 'colsample_bytree': 0.5979914312095727, 'subsample': 0.522613644455269, 'reg_alpha': 8.471746987003668e-06, 'reg_lambda': 3.148441347423712e-05, 'cat_l2': 0.022737628102536857, 'cat_smooth': 13.921548533046499}. Best is trial 6 with value: 0.48414585908226915.


Gini train: 0.6346, Gini val: 0.6141
AUC val: 0.809
AUC val: 0.8089
AUC val: 0.8063
AUC val: 0.8106
AUC val: 0.8088


[I 2025-05-12 03:42:41,883] Trial 9 finished with value: 0.5347042739418595 and parameters: {'num_leaves': 13, 'colsample_bytree': 0.6404672548436904, 'subsample': 0.7713480415791243, 'reg_alpha': 1.8548894229694903e-07, 'reg_lambda': 0.16587190283399628, 'cat_l2': 0.0023591373063477136, 'cat_smooth': 85.98737339212276}. Best is trial 6 with value: 0.48414585908226915.


Gini train: 0.6479, Gini val: 0.6175


In [30]:
best_params = study.best_params | const_params

model = LGBMClassifier(**best_params)
model.fit(
    X_train,
    y_train
)

print(roc_auc_score(oot_data[CONFIG.TARGET_COLUMN], model.predict_proba(oot_data[tmp_features_to_analyze])[:, 1]))

0.804767618642444
