In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,  RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, accuracy_score
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
DATA_PATH = "zbiór_8.csv"
RANDOM_SEED = 1
CORR_THRESHOLD = 0.8

In [3]:
df = pd.read_csv(DATA_PATH)

df_train, df_tmp = train_test_split(df, train_size=0.7, stratify=df["default"], random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, stratify=df_tmp["default"], random_state=RANDOM_SEED)

X_train, y_train = df_train.drop(columns="default"), df_train["default"]
X_val, y_val = df_val.drop(columns="default"), df_val["default"]
X_test, y_test = df_test.drop(columns="default"), df_test["default"]

In [4]:
class ColumnNamePreservingPipeline(Pipeline):
    def transform(self, X):
        X_transformed = super().transform(X)
        if isinstance(X_transformed, np.ndarray):
            if hasattr(self.steps[-1][1], "get_feature_names_out"):
                cols = self.steps[-1][1].get_feature_names_out(X.columns)
            else:
                cols = X.columns
            X_transformed = pd.DataFrame(X_transformed, columns=cols, index=X.index)
        return X_transformed


class ColumnNamePreservingCT(ColumnTransformer):
    def fit(self, X, y=None):
        super().fit(X, y)
        self.feature_names_out_ = self._get_feature_names_out()
        return self
    
    def transform(self, X):
        X_transformed = super().transform(X)
        if not hasattr(self, 'feature_names_out_'):
            self.feature_names_out_ = self._get_feature_names_out()
        return pd.DataFrame(X_transformed, columns=self.feature_names_out_, index=X.index)
    
    def _get_feature_names_out(self):
        cols = []
        for name, trans, cols_in in self.transformers_:
            if trans == 'drop':
                continue
            if hasattr(trans, 'get_feature_names_out'):
                cols.extend(trans.get_feature_names_out(cols_in))
            else:
                if hasattr(cols_in, '__len__') and not isinstance(cols_in, str):
                    cols.extend(cols_in)
                else:
                    cols.append(cols_in)
        return cols

In [5]:
class UnimportantDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        zero_percentage = (X[:] == 0).sum() / len(X)

        cols_many_zeros = zero_percentage[zero_percentage > 0.5].index.tolist()

        cols_many_zeros = [col for col in cols_many_zeros 
                        if not col.startswith('wsk') 
                        and col != 'pkdKod' ]
        manual_cols=['szczegolnaFormaPrawna_Symbol', 'RP_przeplywy_operacyjne', 'RP_amortyzacja', 'wsk_pokrycie_zob_kr_gotowkowe_2', 'wsk_zadluzenia_gotowki_2', 'wsk_akt_generowania_got_2', 'wsk_pokrycie_wyd_fin_gotowkowe_2', 'wsk_zysk_CF_operacyjny', 'schemat_wsk_bilans']
        
        cols_to_drop = cols_many_zeros + manual_cols
        self.keep_ = [c for c in X.columns if c not in cols_to_drop]
        return self

    def transform(self, X):
        return X[self.keep_].copy()
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CategoricalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute, missing_label="MISSING"):
        self.cols_to_impute = cols_to_impute
        self.missing_label = missing_label

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_to_impute:
            X[col] = X[col].replace(["", " ", None, 0], np.nan)
            X[col] = X[col].fillna(self.missing_label)
        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.cols_to_impute if input_features is None else input_features)
    
        
    
class NumericalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def fit(self, X, y=None):
        X = X.copy()


        self.medians_ = {}
        for col in self.cols_to_impute:
            median_val = X.loc[X[col] != 0, col].median()
            self.medians_[col] = median_val

 
        wsk_cols = [c for c in X.columns if c.startswith('wsk')]
        self.drop_ = []
        self.fill_ = []
        for col in wsk_cols:
            pct_missing = ((X[col] == 0) | X[col].isna() | (X[col] == np.inf) | (X[col] == -np.inf)).mean() * 100
            if pct_missing > 20:
                self.drop_.append(col)  
            else:
                self.fill_.append(col)  

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        X = X.copy()

        for col, median_val in self.medians_.items():
            if col in X.columns:
                X[col] = X[col].replace([0, np.inf, -np.inf], np.nan)
                X[col] = X[col].fillna(median_val)

        for col in self.fill_:
            if col in X.columns:
                median_val = X[col].median()
                X[col] = X[col].replace([0, np.inf, -np.inf], np.nan)
                X[col] = X[col].fillna(median_val)

        X = X.drop(columns=self.drop_, errors='ignore')

        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

In [6]:
class OutlierCapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.no_cap_cols = []
        self.moderate_iqr_cols = []
        self.percentile_cols = []

        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_1_5 = Q1 - 1.5 * IQR
            upper_1_5 = Q3 + 1.5 * IQR
            
            below = (X[col] < lower_1_5).sum()
            above = (X[col] > upper_1_5).sum()
            perc_outliers = (below + above) / len(X[col])
            
            if perc_outliers == 0:
                self.no_cap_cols.append(col)
            elif perc_outliers <= 0.1:
                self.moderate_iqr_cols.append(col)
            else:
                self.percentile_cols.append(col)
        return self

    def transform(self, X):
        for col in self.moderate_iqr_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower=lower, upper=upper)

        for col in self.percentile_cols:
            lower = X[col].quantile(0.01)
            upper = X[col].quantile(0.99)
            X[col] = X[col].clip(lower=lower, upper=upper)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y=None):
        Xc = X.copy()
        Xc = Xc[Xc.columns.sort_values()]  
        while True:
            corr = Xc.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

            # znajdź wszystkie pary z korelacją > threshold
            to_drop_pairs = [
                (col1, col2)
                for col1 in upper.columns
                for col2 in upper.index
                if pd.notnull(upper.loc[col1, col2]) and upper.loc[col1, col2] > self.threshold
            ]

            if not to_drop_pairs:
                break

            # zliczamy częstotliwość występowania kolumn w parach do usunięcia
            freq = {}
            for col1, col2 in to_drop_pairs:
                freq[col1] = freq.get(col1, 0) + 1
                freq[col2] = freq.get(col2, 0) + 1

            # wybieramy kolumnę, która występuje najczęściej
            max_freq = max(freq.values())
            max_cols = [col for col, f in freq.items() if f == max_freq]
            col_to_drop = sorted(max_cols)[-1]  # ostatnia alfabetycznie

            Xc = Xc.drop(columns=[col_to_drop])

        self.keep_ = Xc.columns.tolist()
        return self

    def transform(self, X):
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # wykrycie mocno skośnych kolumn (>2)
        skew_values = X.skew()
        self.cols_ = skew_values[skew_values > 2].index.tolist()
        # zachowanie kolejności kolumn
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_:
            min_val = X[col].min()
            X[col] = X[col] + abs(min_val) + 1
            X[col] = np.log(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CorrDropper2(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.to_drop_ = []
        self.keep_ = []

    def fit(self, X, y=None):
        # liczymy korelacje tylko dla kolumn numerycznych
        corr_matrix = X.corr(numeric_only=True).abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        # kolumny, które zachowujemy
        self.keep_ = [col for col in X.columns if col not in self.to_drop_]
        return self

    def transform(self, X):
        X_reduced = X.drop(columns=self.to_drop_, errors='ignore')
        return X_reduced

    def get_feature_names_out(self, input_features=None):
        # jeśli są zachowane kolumny, zwróć je
        if hasattr(self, 'keep_') and self.keep_:
            return self.keep_
        return input_features
 
class MakeCategory(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in self.cat_cols:
            X[col] = X[col].astype(str)
            X[col] = X[col].astype('category')
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

class MakeNumber(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            if col not in self.cat_cols:
                X[col] = X[col].astype('float')
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)

class DataFrameWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, transformer):
        self.transformer = transformer
        
    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        self.feature_names_ = X.columns  # po prostu weź kolumny wejściowe
        return self
        
    def transform(self, X):
        X_transformed = self.transformer.transform(X)
        return X_transformed

In [7]:
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", UnimportantDropper()),
    ("imp_num", NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy', 'koszty_operacyjne'
])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("corr2_num", CorrDropper2(threshold=CORR_THRESHOLD))
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", UnimportantDropper()),
    ("imp_cat", CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"]))
])

preprocess = ColumnNamePreservingCT([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", DataFrameWrapper(preprocess)),
    ("make_category", MakeCategory(cat_cols=cat_cols)),
    ("make_number", MakeNumber(cat_cols=cat_cols))
])

pipeline

0,1,2
,steps,"[('preprocess', ...), ('make_category', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformer,ColumnNamePre..._wsk_rzis'])])

0,1,2
,cols_to_impute,"['Aktywa', 'Aktywa_trwale', ...]"

0,1,2
,threshold,0.8

0,1,2
,cols_to_impute,['formaWlasnosci_Symbol']
,missing_label,'MISSING'

0,1,2
,cat_cols,"['formaWlasnosci_Symbol', 'pkdKod', ...]"

0,1,2
,cat_cols,"['formaWlasnosci_Symbol', 'pkdKod', ...]"


In [8]:
#pipeline.fit(X_train, y_train)

X_train_t = pipeline.fit_transform(X_train)
X_val_t = pipeline.transform(X_val)
X_test_t = pipeline.transform(X_test)



In [9]:
y_train_t = y_train
y_val_t = y_val
y_test_t = y_test

### Establishing baseline for model

In [10]:
model = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED)
model.fit(
    X_train_t, y_train_t,
    eval_set=[(X_val_t, y_val_t)]
)

y_val_pred_proba = model.predict_proba(X_val_t)[:,1]
y_val_pred_class = model.predict(X_val_t)

auc = roc_auc_score(y_val_t, y_val_pred_proba)

pr_auc = average_precision_score(y_val_t, y_val_pred_proba)

def ks_stat(y_true, y_proba):
    y_true = np.array(y_true)
    y_proba = np.array(y_proba)
    pos = y_proba[y_true==1]
    neg = y_proba[y_true==0]
    pos_cdf = np.searchsorted(np.sort(pos), np.sort(np.concatenate([pos, neg])), side='right') / len(pos)
    neg_cdf = np.searchsorted(np.sort(neg), np.sort(np.concatenate([pos, neg])), side='right') / len(neg)
    return np.max(np.abs(pos_cdf - neg_cdf))

ks = ks_stat(y_val_t, y_val_pred_proba)

ll = log_loss(y_val_t, y_val_pred_proba)

print(f"AUC: {auc:.4f}")
print(f"PR-AUC: {pr_auc:.4f}")
print(f"KS: {ks:.4f}")
print(f"Log-loss: {ll:.4f}")

[LightGBM] [Info] Number of positive: 200, number of negative: 1900
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10890
[LightGBM] [Info] Number of data points in the train set: 2100, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.095238 -> initscore=-2.251292
[LightGBM] [Info] Start training from score -2.251292
AUC: 0.6393
PR-AUC: 0.1852
KS: 0.2307
Log-loss: 0.3124


model = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED)

param_dist = {
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [3, 5, 7, 9, -1],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0, 0.01, 0.1]
}

rs = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,         
    scoring='roc_auc',
    cv=3,                
    verbose=2,
    random_state=RANDOM_SEED,
    n_jobs=-1
)

rs.fit(X_train_t, y_train_t)

print("Best AUC:", rs.best_score_)
print("Best params:", rs.best_params_)

In [None]:
# Trwa ze dwie minuty

# Niestety prowadzi do przeuczenia


import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):

    params = {
        "objective": "binary",
        "metric": "auc",
        "random_state": RANDOM_SEED,
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 1500),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
    }

    model = lgb.LGBMClassifier(**params)

    auc = cross_val_score(
        model,
        X_train_t,
        y_train_t,
        cv=3,
        scoring="roc_auc",
        n_jobs=-1
    ).mean()

    return auc


study = optuna.create_study(
    direction="maximize",
    study_name="lgbm_auc_bo"
)

study.optimize(objective, n_trials=50, show_progress_bar=True)

print("\nBest AUC:", study.best_value)
print("Best params:", study.best_params)

optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_param_importances(study)
optuna.visualization.plot_slice(study)

from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice

plot_optimization_history(study)
plot_param_importances(study)
plot_slice(study)

In [40]:
model_bo = lgb.LGBMClassifier(
    **study.best_params,
    objective="binary",
    metric="auc",
    random_state=RANDOM_SEED
)
model_rs = lgb.LGBMClassifier(objective='binary', random_state=RANDOM_SEED, **rs.best_params_) #zakomentować -wtedy pójdzie BO

In [45]:
def ks_stat(y_true, y_proba):
    y_true = np.array(y_true)
    y_proba = np.array(y_proba)
    pos = y_proba[y_true==1]
    neg = y_proba[y_true==0]
    pos_cdf = np.searchsorted(np.sort(pos), np.sort(np.concatenate([pos, neg])), side='right') / len(pos)
    neg_cdf = np.searchsorted(np.sort(neg), np.sort(np.concatenate([pos, neg])), side='right') / len(neg)
    return np.max(np.abs(pos_cdf - neg_cdf))

def results(model, X_train_t, y_train_t,X_val_t, y_val_t):
    model.fit(X_train_t, y_train_t, eval_set=[(X_val_t, y_val_t)])
    
    y_train_pred_proba = model.predict_proba(X_train_t)[:,1]
    y_train_pred_class = model.predict(X_train_t)
    
    train_auc = roc_auc_score(y_train_t, y_train_pred_proba)
    train_pr_auc = average_precision_score(y_train_t, y_train_pred_proba)
    train_ks = ks_stat(y_train_t, y_train_pred_proba)
    train_ll = log_loss(y_train_t, y_train_pred_proba)
    
    y_val_pred_proba = model.predict_proba(X_val_t)[:,1]
    y_val_pred_class = model.predict(X_val_t)
    
    val_auc = roc_auc_score(y_val_t, y_val_pred_proba)
    val_pr_auc = average_precision_score(y_val_t, y_val_pred_proba)
    val_ks = ks_stat(y_val_t, y_val_pred_proba)
    val_ll = log_loss(y_val_t, y_val_pred_proba)
    
    print("----- TRAIN -----")
    print(f"AUC: {train_auc:.4f}")
    print(f"PR-AUC: {train_pr_auc:.4f}")
    print(f"KS: {train_ks:.4f}")
    print(f"Log-loss: {train_ll:.4f}\n")
    
    print("----- VALIDATION -----")
    print(f"AUC: {val_auc:.4f}")
    print(f"PR-AUC: {val_pr_auc:.4f}")
    print(f"KS: {val_ks:.4f}")
    print(f"Log-loss: {val_ll:.4f}")


In [46]:
results(model_rs, X_train_t, y_train_t,X_val_t, y_val_t)

[LightGBM] [Info] Number of positive: 200, number of negative: 1900
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10890
[LightGBM] [Info] Number of data points in the train set: 2100, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.095238 -> initscore=-2.251292
[LightGBM] [Info] Start training from score -2.251292
----- TRAIN -----
AUC: 0.9605
PR-AUC: 0.8222
KS: 0.7745
Log-loss: 0.1931

----- VALIDATION -----
AUC: 0.6706
PR-AUC: 0.1584
KS: 0.3284
Log-loss: 0.3611


In [47]:
results(model_bo, X_train_t, y_train_t,X_val_t, y_val_t)

[LightGBM] [Info] Number of positive: 200, number of negative: 1900
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000740 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10890
[LightGBM] [Info] Number of data points in the train set: 2100, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.095238 -> initscore=-2.251292
[LightGBM] [Info] Start training from score -2.251292
----- TRAIN -----
AUC: 0.9806
PR-AUC: 0.8506
KS: 0.8792
Log-loss: 0.2023

----- VALIDATION -----
AUC: 0.7144
PR-AUC: 0.2474
KS: 0.3999
Log-loss: 0.3311


#### optuna - overfiting ale lepsze wyniki (oprócz log loss)
#### rs - mniejszy overfiting ale słabsze wyniki