In [1]:
import os, sys, warnings, logging
os.environ["LIGHTGBM_DISABLE_STD_LOG"] = "1"
logging.getLogger('lightgbm').setLevel(logging.ERROR)

import lightgbm as lgb

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,  RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, accuracy_score


In [2]:
DATA_PATH = "zbiór_8.csv"
RANDOM_SEED = 1
CORR_THRESHOLD = 0.77 #przełożenie na liczbę kolumn w modelu: 0.8 -50, 0,7 - 38, 0.75 - 38, 0,77 -45

In [3]:
df = pd.read_csv(DATA_PATH)

df_train, df_tmp = train_test_split(df, train_size=0.7, stratify=df["default"], random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, stratify=df_tmp["default"], random_state=RANDOM_SEED)

X_train, y_train = df_train.drop(columns="default"), df_train["default"]
X_val, y_val = df_val.drop(columns="default"), df_val["default"]
X_test, y_test = df_test.drop(columns="default"), df_test["default"]

In [4]:
class ColumnNamePreservingPipeline(Pipeline):
    def transform(self, X):
        X_transformed = super().transform(X)
        if isinstance(X_transformed, np.ndarray):
            if hasattr(self.steps[-1][1], "get_feature_names_out"):
                cols = self.steps[-1][1].get_feature_names_out(X.columns)
            else:
                cols = X.columns
            X_transformed = pd.DataFrame(X_transformed, columns=cols, index=X.index)
        return X_transformed


class ColumnNamePreservingCT(ColumnTransformer):
    def fit(self, X, y=None):
        super().fit(X, y)
        self.feature_names_out_ = self._get_feature_names_out()
        return self
    
    def transform(self, X):
        X_transformed = super().transform(X)
        if not hasattr(self, 'feature_names_out_'):
            self.feature_names_out_ = self._get_feature_names_out()
        return pd.DataFrame(X_transformed, columns=self.feature_names_out_, index=X.index)
    
    def _get_feature_names_out(self):
        cols = []
        for name, trans, cols_in in self.transformers_:
            if trans == 'drop':
                continue
            if hasattr(trans, 'get_feature_names_out'):
                cols.extend(trans.get_feature_names_out(cols_in))
            else:
                if hasattr(cols_in, '__len__') and not isinstance(cols_in, str):
                    cols.extend(cols_in)
                else:
                    cols.append(cols_in)
        return cols

In [5]:
class UnimportantDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        zero_percentage = (X[:] == 0).sum() / len(X)

        cols_many_zeros = zero_percentage[zero_percentage > 0.5].index.tolist()

        cols_many_zeros = [col for col in cols_many_zeros 
                        if not col.startswith('wsk') 
                        and col != 'pkdKod' ]
        manual_cols=['szczegolnaFormaPrawna_Symbol', 'RP_przeplywy_operacyjne', 'RP_amortyzacja', 'wsk_pokrycie_zob_kr_gotowkowe_2', 'wsk_zadluzenia_gotowki_2', 'wsk_akt_generowania_got_2', 'wsk_pokrycie_wyd_fin_gotowkowe_2', 'wsk_zysk_CF_operacyjny', 'schemat_wsk_bilans']
        
        cols_to_drop = cols_many_zeros + manual_cols
        self.keep_ = [c for c in X.columns if c not in cols_to_drop]
        return self

    def transform(self, X):
        return X[self.keep_].copy()
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CategoricalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute, missing_label="MISSING"):
        self.cols_to_impute = cols_to_impute
        self.missing_label = missing_label

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_to_impute:
            X[col] = X[col].replace(["", " ", None, 0], np.nan)
            X[col] = X[col].fillna(self.missing_label)
        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.cols_to_impute if input_features is None else input_features)

class RareCategoryGrouper(BaseEstimator, TransformerMixin):
    def __init__(self, min_count=15, rare_label="RARE"):
        self.min_count = min_count
        self.rare_label = rare_label
        self.frequent_categories_ = {}
        self.categorical_cols_ = None

    def fit(self, X, y=None):
        X = X.copy()

        # automatyczna detekcja kolumn kategorycznych
        self.categorical_cols_ = [
            col for col in X.columns 
            if X[col].dtype == "object" or str(X[col].dtype).startswith("category")
        ]

        for col in self.categorical_cols_:
            counts = X[col].value_counts(dropna=False)
            frequent = counts[counts >= self.min_count].index
            self.frequent_categories_[col] = set(frequent)

        return self

    def transform(self, X):
        X = X.copy()
        for col in self.categorical_cols_:
            frequent = self.frequent_categories_[col]
            X[col] = X[col].apply(
                lambda v: v if v in frequent else self.rare_label
            )
        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.categorical_cols_ if input_features is None else input_features)


        
    
class NumericalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def fit(self, X, y=None):
        X = X.copy()


        self.medians_ = {}
        for col in self.cols_to_impute:
            median_val = X.loc[X[col] != 0, col].median()
            self.medians_[col] = median_val

 
        wsk_cols = [c for c in X.columns if c.startswith('wsk')]
        self.drop_ = []
        self.fill_ = []
        for col in wsk_cols:
            pct_missing = ((X[col] == 0) | X[col].isna() | (X[col] == np.inf) | (X[col] == -np.inf)).mean() * 100
            if pct_missing > 20:
                self.drop_.append(col)  
            else:
                self.fill_.append(col)  

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        X = X.copy()

        for col, median_val in self.medians_.items():
            if col in X.columns:
                X[col] = X[col].replace([0, np.inf, -np.inf], np.nan)
                X[col] = X[col].fillna(median_val)

        for col in self.fill_:
            if col in X.columns:
                median_val = X[col].median()
                X[col] = X[col].replace([0, np.inf, -np.inf], np.nan)
                X[col] = X[col].fillna(median_val)

        X = X.drop(columns=self.drop_, errors='ignore')

        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

In [6]:
class OutlierCapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.no_cap_cols = []
        self.moderate_iqr_cols = []
        self.percentile_cols = []

        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_1_5 = Q1 - 1.5 * IQR
            upper_1_5 = Q3 + 1.5 * IQR
            
            below = (X[col] < lower_1_5).sum()
            above = (X[col] > upper_1_5).sum()
            perc_outliers = (below + above) / len(X[col])
            
            if perc_outliers == 0:
                self.no_cap_cols.append(col)
            elif perc_outliers <= 0.1:
                self.moderate_iqr_cols.append(col)
            else:
                self.percentile_cols.append(col)
        return self

    def transform(self, X):
        for col in self.moderate_iqr_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower=lower, upper=upper)

        for col in self.percentile_cols:
            lower = X[col].quantile(0.01)
            upper = X[col].quantile(0.99)
            X[col] = X[col].clip(lower=lower, upper=upper)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y=None):
        Xc = X.copy()
        Xc = Xc[Xc.columns.sort_values()]  
        while True:
            corr = Xc.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

            # znajdź wszystkie pary z korelacją > threshold
            to_drop_pairs = [
                (col1, col2)
                for col1 in upper.columns
                for col2 in upper.index
                if pd.notnull(upper.loc[col1, col2]) and upper.loc[col1, col2] > self.threshold
            ]

            if not to_drop_pairs:
                break

            # zliczamy częstotliwość występowania kolumn w parach do usunięcia
            freq = {}
            for col1, col2 in to_drop_pairs:
                freq[col1] = freq.get(col1, 0) + 1
                freq[col2] = freq.get(col2, 0) + 1

            # wybieramy kolumnę, która występuje najczęściej
            max_freq = max(freq.values())
            max_cols = [col for col, f in freq.items() if f == max_freq]
            col_to_drop = sorted(max_cols)[-1]  # ostatnia alfabetycznie

            Xc = Xc.drop(columns=[col_to_drop])

        self.keep_ = Xc.columns.tolist()
        return self

    def transform(self, X):
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # wykrycie mocno skośnych kolumn (>2)
        skew_values = X.skew()
        self.cols_ = skew_values[skew_values > 2].index.tolist()
        # zachowanie kolejności kolumn
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_:
            min_val = X[col].min()
            X[col] = X[col] + abs(min_val) + 1
            X[col] = np.log(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CorrDropper2(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.to_drop_ = []
        self.keep_ = []

    def fit(self, X, y=None):
        # liczymy korelacje tylko dla kolumn numerycznych
        corr_matrix = X.corr(numeric_only=True).abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        # kolumny, które zachowujemy
        self.keep_ = [col for col in X.columns if col not in self.to_drop_]
        return self

    def transform(self, X):
        X_reduced = X.drop(columns=self.to_drop_, errors='ignore')
        return X_reduced

    def get_feature_names_out(self, input_features=None):
        # jeśli są zachowane kolumny, zwróć je
        if hasattr(self, 'keep_') and self.keep_:
            return self.keep_
        return input_features
 
class MakeCategory(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in self.cat_cols:
            X[col] = X[col].astype(str)
            X[col] = X[col].astype('category')
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

class MakeNumber(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            if col not in self.cat_cols:
                X[col] = X[col].astype('float')
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)

class DataFrameWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, transformer):
        self.transformer = transformer
        
    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        self.feature_names_ = X.columns  # po prostu weź kolumny wejściowe
        return self
        
    def transform(self, X):
        X_transformed = self.transformer.transform(X)
        return X_transformed

In [7]:
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", UnimportantDropper()),
    ("imp_num", NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy', 'koszty_operacyjne'
])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("corr2_num", CorrDropper2(threshold=CORR_THRESHOLD))
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", UnimportantDropper()),
    ("imp_cat", CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"]))
    ,("group", RareCategoryGrouper()) #zakomentować - wtedy nie będzie grupowania
])

preprocess = ColumnNamePreservingCT([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", DataFrameWrapper(preprocess)),
    ("make_category", MakeCategory(cat_cols=cat_cols)),
    ("make_number", MakeNumber(cat_cols=cat_cols))
])

pipeline

0,1,2
,steps,"[('preprocess', ...), ('make_category', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer,ColumnNamePre..._wsk_rzis'])])

0,1,2
,cols_to_impute,"['Aktywa', 'Aktywa_trwale', ...]"

0,1,2
,threshold,0.8

0,1,2
,threshold,0.77

0,1,2
,cols_to_impute,['formaWlasnosci_Symbol']
,missing_label,'MISSING'

0,1,2
,min_count,15
,rare_label,'RARE'

0,1,2
,cat_cols,"['formaWlasnosci_Symbol', 'pkdKod', ...]"

0,1,2
,cat_cols,"['formaWlasnosci_Symbol', 'pkdKod', ...]"


In [8]:
#pipeline.fit(X_train, y_train)

X_train_t = pipeline.fit_transform(X_train)
X_val_t = pipeline.transform(X_val)
X_test_t = pipeline.transform(X_test)

X_train_t



Unnamed: 0,Aktywa_obrotowe,Aktywa_trwale,Inwestycje_krotkoterminowe,Kapital_podstawowy,Zobowiazania_krotkoterminowe,Zysk_netto,koszty_finansowe,koszty_operacyjne,koszty_operacyjne_pozostale,pozostale_przychody_oper,...,wsk_udzial_kap_obrotowego_w_fin,wsk_udzial_kap_wlasnego_aktywa_2,wsk_udzial_kapitalu_obrotowego_netto,wsk_udzial_zob_biez_sprzedaz_2,wsk_zast_kapitalu_obcego,wsk_zob_s_aktywa_rzeczowe,wsk_zwrot_aktywa_trwale,formaWlasnosci_Symbol,pkdKod,schemat_wsk_rzis
904,11.974238,12.292380,12.448512,10.310318,11.755491,16.791094,8.803867,12.681135,2.130610,8.966156,...,1.235752,0.811675,1.846653,12.996352,0.722927,-0.095168,9.054784,214.0,4729,SFJIN
1715,16.130252,13.215447,15.433125,15.645856,13.716577,16.931542,10.514321,13.199309,10.914976,12.356730,...,1.799647,0.864747,1.903711,12.996352,0.529942,-0.324372,9.052688,216.0,4799,SFJMA
1449,15.564927,13.884762,15.486363,11.002783,13.719264,16.870231,8.852157,16.539893,9.521676,10.046704,...,14.856211,0.860564,1.878375,12.996352,0.573953,-0.576143,9.052342,214.0,4120,SFJIN
894,11.687206,14.414306,12.272276,12.899322,11.511715,16.790206,11.211353,11.650630,9.189311,9.633266,...,1.019764,0.624237,1.767749,12.996352,1.331829,0.919805,9.050904,214.0,6820,SFJIN
1575,15.524757,16.152049,15.037374,8.535230,14.635356,16.836314,13.196719,14.751823,11.149226,12.356730,...,1.531837,0.817778,1.783136,12.996350,1.331829,0.383612,10.117578,215.0,4110,SFJIN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,12.873967,12.292380,12.660987,9.618535,12.280958,16.794649,8.874922,13.330541,7.521464,10.346482,...,0.913509,0.658700,1.860195,12.996352,0.782622,0.272360,9.148354,214.0,6920,SFJMA
2480,12.876155,8.792550,12.674257,11.290294,12.664975,16.792349,8.803867,14.204238,8.922801,9.459222,...,1.255201,0.817443,1.922477,12.996352,0.733107,0.521269,10.558268,214.0,6499,SFJMI
748,12.125775,12.292380,12.668166,8.525360,11.904273,16.791224,7.967935,7.408974,8.252326,8.990192,...,414.274390,0.776414,1.836577,12.995196,0.753304,-0.434245,9.049726,214.0,6312,SFJIN
2866,14.832781,8.702788,12.360195,12.206278,13.803035,16.795474,9.652625,14.233689,2.348514,9.941957,...,0.673162,0.784110,1.872417,12.996350,0.752272,0.550354,9.381692,216.0,4647,SFJIN


In [9]:
y_train_t = y_train
y_val_t = y_val
y_test_t = y_test

### Establishing baseline for model

In [10]:
model = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED,verbose=-1)
model.fit(
    X_train_t, y_train_t,
    eval_set=[(X_val_t, y_val_t)]
)

y_val_pred_proba = model.predict_proba(X_val_t)[:,1]
y_val_pred_class = model.predict(X_val_t)

auc = roc_auc_score(y_val_t, y_val_pred_proba)

pr_auc = average_precision_score(y_val_t, y_val_pred_proba)

def ks_stat(y_true, y_proba):
    y_true = np.array(y_true)
    y_proba = np.array(y_proba)
    pos = y_proba[y_true==1]
    neg = y_proba[y_true==0]
    pos_cdf = np.searchsorted(np.sort(pos), np.sort(np.concatenate([pos, neg])), side='right') / len(pos)
    neg_cdf = np.searchsorted(np.sort(neg), np.sort(np.concatenate([pos, neg])), side='right') / len(neg)
    return np.max(np.abs(pos_cdf - neg_cdf))

ks = ks_stat(y_val_t, y_val_pred_proba)

ll = log_loss(y_val_t, y_val_pred_proba)

print(f"AUC: {auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")
print(f"KS: {ks:.4f}")
print(f"Log-loss: {ll:.4f}")

AUC: 0.7126
PR-AUC: 0.2464
KS: 0.3198
Log-loss: 0.3296


In [11]:
# Trwa ze dwie minuty

# Niestety prowadzi do przeuczenia


import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):

    params = {
        "objective": "binary",
        "metric": "auc",
        "random_state": RANDOM_SEED,
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
    }

    model = lgb.LGBMClassifier(**params,verbose=-1)

    auc = cross_val_score(
        model,
        X_train_t,
        y_train_t,
        cv=3,
        scoring="roc_auc",
        n_jobs=-1
    ).mean()

    return auc


study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_auc_bo"
)

study.optimize(objective, n_trials=150, show_progress_bar=True)

print("\nBest AUC:", study.best_value)
print("Best params:", study.best_params)

optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_slice(study)

from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice

plot_optimization_history(study).show()
plot_param_importances(study).show()
plot_slice(study).show()

[I 2025-11-16 16:15:06,034] A new study created in memory with name: lgbm_auc_bo


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-11-16 16:15:11,463] Trial 0 finished with value: 0.6752283657284271 and parameters: {'reg_alpha': 0.06769929781461793, 'reg_lambda': 0.001092473870893199, 'num_leaves': 131, 'max_depth': 0, 'learning_rate': 0.12261306621609136, 'n_estimators': 819, 'subsample': 0.960402332396897, 'colsample_bytree': 0.6210783848420788, 'min_child_samples': 32}. Best is trial 0 with value: 0.6752283657284271.
[I 2025-11-16 16:15:15,504] Trial 1 finished with value: 0.6910280093377773 and parameters: {'reg_alpha': 7.640966500683598e-06, 'reg_lambda': 4.110788776062083e-05, 'num_leaves': 156, 'max_depth': 7, 'learning_rate': 0.020407722603927366, 'n_estimators': 195, 'subsample': 0.8320464617683628, 'colsample_bytree': 0.714639286343538, 'min_child_samples': 49}. Best is trial 1 with value: 0.6910280093377773.
[I 2025-11-16 16:15:16,394] Trial 2 finished with value: 0.6556436368462566 and parameters: {'reg_alpha': 7.766796594488082e-06, 'reg_lambda': 0.00012161093944796791, 'num_leaves': 228, 'max

In [12]:
model = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED,verbose=-1)

param_dist = {
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [3, 5, 7, 9, -1],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0, 0.01, 0.1]
}

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,         
    scoring='roc_auc',
    cv=3,                
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

rs.fit(X_train_t, y_train_t)

print("Best AUC:", rs.best_score_)
print("Best params:", rs.best_params_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best AUC: 0.6873931686869424
Best params: {'subsample': 1.0, 'reg_lambda': 0.01, 'reg_alpha': 0, 'num_leaves': 63, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.01, 'colsample_bytree': 0.6}


In [13]:
model_bo = lgb.LGBMClassifier(
    **study.best_params,
    objective="binary",
    metric="auc",
    random_state=RANDOM_SEED,
    verbose=-1
)
model_rs = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED, **rs.best_params_,verbose=-1) 

In [14]:
def ks_stat(y_true, y_proba):
    y_true = np.array(y_true)
    y_proba = np.array(y_proba)
    pos = y_proba[y_true==1]
    neg = y_proba[y_true==0]
    pos_cdf = np.searchsorted(np.sort(pos), np.sort(np.concatenate([pos, neg])), side='right') / len(pos)
    neg_cdf = np.searchsorted(np.sort(neg), np.sort(np.concatenate([pos, neg])), side='right') / len(neg)
    return np.max(np.abs(pos_cdf - neg_cdf))

def results(model, X_train_t, y_train_t,X_val_t, y_val_t):
    model.fit(X_train_t, y_train_t, eval_set=[(X_val_t, y_val_t)])
    
    y_train_pred_proba = model.predict_proba(X_train_t)[:,1]
    y_train_pred_class = model.predict(X_train_t)
    
    train_auc = roc_auc_score(y_train_t, y_train_pred_proba)
    train_pr_auc = average_precision_score(y_train_t, y_train_pred_proba)
    train_ks = ks_stat(y_train_t, y_train_pred_proba)
    train_ll = log_loss(y_train_t, y_train_pred_proba)
    
    y_val_pred_proba = model.predict_proba(X_val_t)[:,1]
    y_val_pred_class = model.predict(X_val_t)
    
    val_auc = roc_auc_score(y_val_t, y_val_pred_proba)
    val_pr_auc = average_precision_score(y_val_t, y_val_pred_proba)
    val_ks = ks_stat(y_val_t, y_val_pred_proba)
    val_ll = log_loss(y_val_t, y_val_pred_proba)
    
    print("----- TRAIN -----")
    print(f"AUC: {train_auc:.4f}")
    print(f"PR-AUC: {train_pr_auc:.4f}")
    print(f"KS: {train_ks:.4f}")
    print(f"Log-loss: {train_ll:.4f}\n")
    
    print("----- VALIDATION -----")
    print(f"AUC: {val_auc:.4f}")
    print(f"PR-AUC: {val_pr_auc:.4f}")
    print(f"KS: {val_ks:.4f}")
    print(f"Log-loss: {val_ll:.4f}")


In [15]:
results(model_rs, X_train_t, y_train_t,X_val_t, y_val_t)

----- TRAIN -----
AUC: 0.9941
PR-AUC: 0.9619
KS: 0.9313
Log-loss: 0.1530

----- VALIDATION -----
AUC: 0.7304
PR-AUC: 0.2165
KS: 0.3532
Log-loss: 0.3427


In [16]:
results(model_bo, X_train_t, y_train_t,X_val_t, y_val_t)

----- TRAIN -----
AUC: 0.8862
PR-AUC: 0.5310
KS: 0.5958
Log-loss: 0.2370

----- VALIDATION -----
AUC: 0.7738
PR-AUC: 0.2937
KS: 0.4639
Log-loss: 0.3124


#### optuna - overfiting ale lepsze wyniki (oprócz log loss)
#### rs - mniejszy overfiting ale słabsze wyniki

### faworyt BO  0.77 z grupowaniem
#### ----- TRAIN -----
##### AUC: 0.8994
##### PR-AUC: 0.5654
##### KS: 0.6284
##### Log-loss: 0.2380

#### ----- VALIDATION -----
##### AUC: 0.7548
##### PR-AUC: 0.2816
##### KS: 0.4123
##### Log-loss: 0.3124

### BO 0.77 bez grupowania
#### ----- TRAIN -----
##### AUC: 0.9725
##### PR-AUC: 0.7988
##### KS: 0.8366
##### Log-loss: 0.2097

##### ----- VALIDATION -----
##### AUC: 0.7409
##### PR-AUC: 0.2393
##### KS: 0.3751
##### Log-loss: 0.3198