In [71]:
# =====================================
# IMPORTY
# =====================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [72]:

# =====================================
# PARAMETRY
# =====================================
DATA_PATH = "zbiór_8.csv"
RANDOM_SEED = 1
CORR_THRESHOLD = 0.8
N_BINS = 4
EPS = 1e-6



In [73]:

# =====================================
# WCZYTANIE + SPLIT
# =====================================
df = pd.read_csv(DATA_PATH)

df_train, df_tmp = train_test_split(df, train_size=0.7, stratify=df["default"], random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, stratify=df_tmp["default"], random_state=RANDOM_SEED)

X_train, y_train = df_train.drop(columns="default"), df_train["default"]
X_val, y_val = df_val.drop(columns="default"), df_val["default"]
X_test, y_test = df_test.drop(columns="default"), df_test["default"]

In [74]:
# -------------------------------------
# WRAPPERY DO ZACHOWANIA NAZW KOLUMN
# -------------------------------------
class ColumnNamePreservingPipeline(Pipeline):
    def transform(self, X):
        X_transformed = super().transform(X)
        if isinstance(X_transformed, np.ndarray):
            if hasattr(self.steps[-1][1], "get_feature_names_out"):
                cols = self.steps[-1][1].get_feature_names_out(X.columns)
            else:
                cols = X.columns
            X_transformed = pd.DataFrame(X_transformed, columns=cols, index=X.index)
        return X_transformed

class ColumnNamePreservingCT(ColumnTransformer):
    def transform(self, X):
        X_transformed = super().transform(X)
        cols = []
        for name, trans, cols_in in self.transformers_:
            if trans == 'drop':
                continue
            if hasattr(trans, 'get_feature_names_out'):
                cols.extend(trans.get_feature_names_out(cols_in))
            else:
                cols.extend(cols_in)
        return pd.DataFrame(X_transformed, columns=cols, index=X.index)


In [None]:

# =====================================
# TRANSFORMATORY
# =====================================

class UnimportantDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        zero_percentage = (X[:] == 0).sum() / len(X)

        cols_many_zeros = zero_percentage[zero_percentage > 0.5].index.tolist()

        cols_many_zeros = [col for col in cols_many_zeros 
                        if not col.startswith('wsk') 
                        and col != 'pkdKod' ]
        manual_cols=['szczegolnaFormaPrawna_Symbol', 'RP_przeplywy_operacyjne', 'RP_amortyzacja', 'wsk_pokrycie_zob_kr_gotowkowe_2', 'wsk_zadluzenia_gotowki_2', 'wsk_akt_generowania_got_2', 'wsk_pokrycie_wyd_fin_gotowkowe_2', 'wsk_zysk_CF_operacyjny', 'schemat_wsk_bilans']
        
        cols_to_drop = cols_many_zeros + manual_cols
        self.keep_ = [c for c in X.columns if c not in cols_to_drop]
        
        
        return self

    def transform(self, X):
        return X[self.keep_].copy()
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CategoricalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        
        self.cols_to_impute = cols_to_impute
        self.modes_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        
        for col in self.cols_to_impute:
            if X[col].notna().sum() > 0:
                self.modes_[col] = X[col].mode()[0]
            else:
                self.modes_[col] = None  # Na wypadek gdyby cała kolumna była NaN
        return self

    def transform(self, X):
        X = X.copy()
        for col, mode_val in self.modes_.items():
            if mode_val is not None:
                X[col] = X[col].replace(0, np.nan)
                X[col] = X[col].fillna(mode_val)
                
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
        
        

class NumericalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def fit(self, X, y=None):
        self.medians_ = {}
        missing_rates = {}

        wsk_cols = [c for c in X.columns if c.startswith("wsk")]
        for c in wsk_cols:
            r = (X[c].isna().mean() + (X[c] == 0).mean())
            missing_rates[c] = r

        self.low_ = [c for c, r in missing_rates.items() if r < 0.05]
        self.mid_ = [c for c, r in missing_rates.items() if 0.05 <= r < 0.20]
        self.drop_ = [c for c, r in missing_rates.items() if r >= 0.20]

        for col in (self.low_ + self.mid_ + self.cols_to_impute):
            if col in X.columns:
                self.medians_[col] = X[col].replace(0, np.nan).median()

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        X = X.copy()
        X = X.drop(columns=self.drop_, errors="ignore")
        for col in (self.low_ + self.mid_ + self.cols_to_impute):
            if col in X.columns:
                X[col] = X[col].replace(0, np.nan).fillna(self.medians_[col])
        return X
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # wykrycie mocno skośnych kolumn (>2)
        skew_values = X.skew()
        self.cols_ = skew_values[skew_values > 2].index.tolist()
        # zachowanie kolejności kolumn
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        X[self.cols_] = np.log1p(X[self.cols_])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)




class OutlierCapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.no_cap_cols = []
        self.moderate_iqr_cols = []
        self.percentile_cols = []

        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 =X[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_1_5 = Q1 - 1.5 * IQR
            upper_1_5 = Q3 + 1.5 * IQR
            
            below = (X[col] < lower_1_5).sum()
            above = (X[col] > upper_1_5).sum()
            perc_outliers = (below + above) / len(X[col])
            
            if perc_outliers == 0:
                self.no_cap_cols.append(col)
            elif perc_outliers <= 0.1:
                self.moderate_iqr_cols.append(col)
            else:
                self.percentile_cols.append(col)
        return self

    def transform(self, X):
        for col in self.moderate_iqr_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower=lower, upper=upper)

        for col in self.percentile_cols:
            lower = X[col].quantile(0.01)
            upper = X[col].quantile(0.99)
            X[col] = X[col].clip(lower=lower, upper=upper)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)



class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y=None):
        Xc = X.copy()
        Xc = Xc[Xc.columns.sort_values()]  

        while True:
            corr = Xc.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

            # znajdź wszystkie pary z korelacją > threshold
            to_drop_pairs = [
                (col1, col2)
                for col1 in upper.columns
                for col2 in upper.index
                if pd.notnull(upper.loc[col1, col2]) and upper.loc[col1, col2] > self.threshold
            ]

            if not to_drop_pairs:
                break

            # zliczamy częstotliwość występowania kolumn w parach do usunięcia
            freq = {}
            for col1, col2 in to_drop_pairs:
                freq[col1] = freq.get(col1, 0) + 1
                freq[col2] = freq.get(col2, 0) + 1

            # wybieramy kolumnę, która występuje najczęściej
            max_freq = max(freq.values())
            max_cols = [col for col, f in freq.items() if f == max_freq]
            col_to_drop = sorted(max_cols)[-1]  # ostatnia alfabetycznie

            Xc = Xc.drop(columns=[col_to_drop])

        self.keep_ = Xc.columns.tolist()
        return self

    def transform(self, X):
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
    
class Binner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.bin_edges_ = {}
        for col in X.columns:
            if X[col].nunique() >= 4:
                try:
                    _, bins = pd.qcut(X[col], q=N_BINS, retbins=True, duplicates='drop')
                    self.bin_edges_[col] = bins
                except:
                    pass
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col, bins in self.bin_edges_.items():
            X[col] = pd.cut(X[col], bins=bins, include_lowest=True)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)

class WOETransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        self.maps_ = {}
        tmp = pd.concat([X, y], axis=1)
        for col in X.columns:
            grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
            grp['good'] = grp['count'] - grp['sum']
            grp['bad_dist'] = grp['sum'] / grp['sum'].sum()
            grp['good_dist'] = grp['good'] / grp['good'].sum()
            grp['woe'] = np.log((grp['good_dist'] + EPS) / (grp['bad_dist'] + EPS))
            self.maps_[col] = grp['woe'].to_dict()
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.columns:
            X[col] = X[col].map(self.maps_[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)


In [76]:
# =====================================
# PIPELINE
# =====================================
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", UnimportantDropper()),
    ("imp_num", NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy'
])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("bin", Binner()),
    ("woe", WOETransformer())
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", UnimportantDropper()),
    ("imp_cat", CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"])),
    ("woe_cat", WOETransformer())
])

preprocess = ColumnNamePreservingCT([
    
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", preprocess)
])

In [77]:
# =====================================
# FIT + TRANSFORM
# =====================================
pipeline.fit(X_train, y_train)

X_train_t = pipeline.transform(X_train)
X_val_t = pipeline.transform(X_val)
X_test_t = pipeline.transform(X_test)


In [78]:
X_train_t = X_train_t[sorted(X_train_t.columns)]
X_train_t

Unnamed: 0,Aktywa,Aktywa_obrotowe,Inwestycje_krotkoterminowe,Kapital_podstawowy,Naleznosci_krotkoterminowe,Zobowiazania_krotkoterminowe,Zobowiazania_rezerwy,Zysk_netto,formaWlasnosci_Symbol,koszty_finansowe,...,wsk_sytuacji_fin,wsk_udzial_kap_obrotowego_w_fin,wsk_udzial_kap_wlasnego_aktywa_2,wsk_udzial_kapitalu_obrotowego_netto,wsk_udzial_zob_biez_sprzedaz_2,wsk_zadluzenia,wsk_zast_kapitalu_obcego,wsk_zast_kapitalu_podstawowego,wsk_zob_s_aktywa_rzeczowe,wsk_zysk_op_zobowiazania
904,-0.186298,-0.221121,-0.107850,-0.122239,-0.346115,-0.165405,-0.166655,-0.520419,0.022720,-0.184865,...,-0.346115,0.229218,0.292275,0.045393,-0.294452,0.497577,0.192725,-0.175144,0.222685,0.046281
1715,0.552066,0.744436,0.787256,0.387764,0.587985,0.387764,0.418790,0.977528,-0.379485,0.450677,...,0.450677,0.229218,0.277674,0.349215,0.045393,0.357550,0.387764,-0.200986,0.115831,0.000000
1449,0.552066,0.744436,0.787256,0.005249,-0.158605,0.387764,0.418790,0.977528,0.022720,-0.998443,...,0.450677,0.091767,0.292275,0.349215,0.556385,0.257144,0.387764,0.068171,0.115831,0.845077
894,0.448388,-0.475001,-0.459531,0.387764,-0.158605,-0.165405,0.418790,-0.520419,0.022720,0.450677,...,-0.346115,0.111917,-0.412064,-0.240698,0.556385,-0.689105,-0.459531,0.068171,-0.549915,0.845077
1575,0.552066,0.744436,0.787256,-0.122239,0.587985,0.387764,0.418790,0.977528,-0.048527,0.450677,...,-0.095864,0.229218,0.292275,-0.240698,0.045393,0.497577,-0.459531,0.068171,0.480286,0.845077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,-0.186298,-0.221121,-0.107850,-0.122239,-0.158605,-0.140759,-0.166655,0.393695,0.022720,-0.998443,...,-0.346115,0.111917,-0.412064,0.349215,0.556385,0.497577,0.000000,-0.175144,0.480286,0.845077
2480,-0.186298,0.344839,-0.107850,0.005249,0.242502,-0.140759,-0.166655,-0.080634,0.022720,-0.184865,...,0.450677,0.229218,0.292275,-0.064604,0.556385,0.497577,0.192725,-0.175144,-0.549915,0.000000
748,-0.186298,-0.221121,-0.107850,-0.122239,-0.346115,-0.165405,-0.166655,-0.080634,0.022720,0.068171,...,-0.346115,0.091767,0.292275,0.045393,-0.126040,0.497577,0.000000,-0.200986,0.115831,-0.428068
2866,0.552066,0.744436,-0.459531,0.387764,0.242502,0.387764,0.418790,0.977528,-0.379485,0.450677,...,0.450677,0.111917,0.292275,0.349215,0.045393,0.497577,0.000000,-0.175144,-0.549915,0.046281


In [79]:
# Lista kolumn w kolejności alfabetycznej
columns_sorted = sorted(X_train_t.columns.to_list())

# Wyświetlenie
print(columns_sorted)

['Aktywa', 'Aktywa_obrotowe', 'Inwestycje_krotkoterminowe', 'Kapital_podstawowy', 'Naleznosci_krotkoterminowe', 'Zobowiazania_krotkoterminowe', 'Zobowiazania_rezerwy', 'Zysk_netto', 'formaWlasnosci_Symbol', 'koszty_finansowe', 'koszty_operacyjne', 'koszty_operacyjne_pozostale', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'schemat_wsk_rzis', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_koszty_odsetkowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_3', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_przychody', 'wsk_liczba_dni_istnienia', 'wsk_marza_ebitda_3', 'wsk_mnoznik_kap_wl', 'wsk_ogolnego_zadluzenia_1', 'wsk_plynnosc_biez_1', 'wsk_pokrycie_aktywow_kap_stalym', 'wsk_pokrycie_wyd_fin_gotowkowe_1', 'wsk_pokrycie_zob_kr_gotowkowe_1', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_rotacja_naleznosci', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struk

In [80]:
list1= ['Aktywa', 'Aktywa_trwale', 'Inwestycje_krotkoterminowe', 'Kapital_podstawowy', 'Naleznosci_krotkoterminowe', 'Zobowiazania_krotkoterminowe', 'Zobowiazania_rezerwy', 'Zysk_netto', 'default', 'formaWlasnosci_Symbol', 'koszty_finansowe', 'koszty_operacyjne', 'koszty_operacyjne_pozostale', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'przychody', 'przychody_sprzedazy', 'schemat_wsk_rzis', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_3', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_przychody', 'wsk_liczba_dni_istnienia', 'wsk_marza_ebitda_3', 'wsk_ogolnego_zadluzenia_1', 'wsk_plynnosc_biez_2', 'wsk_pokrycie_aktywow_tr_kapitalem_st', 'wsk_pokrycie_wyd_fin_gotowkowe_1', 'wsk_pokrycie_zob_kr_gotowkowe_1', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_stopa_marzy_brutto', 'wsk_stopa_zysku_sprzedaz', 'wsk_struktura_kap_wlasnego_1', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struktury_finansowania', 'wsk_sytuacji_fin', 'wsk_udzial_kap_obrotowego_w_fin', 'wsk_udzial_kapitalu_obrotowego_netto', 'wsk_udzial_zob_biez_sprzedaz_2', 'wsk_zast_kapitalu_obcego', 'wsk_zast_kapitalu_podstawowego', 'wsk_zob_s_aktywa_rzeczowe', 'wsk_zwrot_aktywa_trwale', 'zysk_netto', 'zysk_operacyjny']
list2=columns_sorted
len( list(set(list1) ^ set(list2))) #powinno być ~0

23

In [81]:
list(set(list1) ^ set(list2))

['wsk_stopa_marzy_brutto',
 'wsk_zadluzenia',
 'wsk_pokrycie_aktywow_kap_stalym',
 'wsk_zysk_op_zobowiazania',
 'wsk_struktury_kapitalu',
 'zysk_operacyjny',
 'wsk_pokrycie_aktywow_tr_kapitalem_st',
 'wsk_rotacja_naleznosci',
 'wsk_struktura_kap_wlasnego_1',
 'Aktywa_trwale',
 'wsk_mnoznik_kap_wl',
 'wsk_udzial_kap_wlasnego_aktywa_2',
 'wsk_ebitda_koszty_odsetkowe_3',
 'wsk_zwrot_aktywa_trwale',
 'Aktywa_obrotowe',
 'wsk_plynnosc_biez_1',
 'zysk_netto',
 'default',
 'przychody_sprzedazy',
 'przychody',
 'wsk_plynnosc_biez_2',
 'wsk_stopa_zysku_sprzedaz',
 'wsk_ebitda_3']

In [82]:
intersection_with_list1 = list((set(list1) ^ set(list2)) & set(list1))
intersection_with_list1

['wsk_pokrycie_aktywow_tr_kapitalem_st',
 'zysk_netto',
 'default',
 'wsk_stopa_marzy_brutto',
 'przychody_sprzedazy',
 'przychody',
 'wsk_plynnosc_biez_2',
 'wsk_stopa_zysku_sprzedaz',
 'wsk_struktura_kap_wlasnego_1',
 'wsk_zwrot_aktywa_trwale',
 'Aktywa_trwale',
 'zysk_operacyjny']

In [83]:
intersection_with_list2 = list((set(list1) ^ set(list2)) & set(list2))
intersection_with_list2

['wsk_zadluzenia',
 'wsk_ebitda_3',
 'wsk_rotacja_naleznosci',
 'wsk_pokrycie_aktywow_kap_stalym',
 'wsk_zysk_op_zobowiazania',
 'wsk_mnoznik_kap_wl',
 'wsk_udzial_kap_wlasnego_aktywa_2',
 'wsk_struktury_kapitalu',
 'wsk_ebitda_koszty_odsetkowe_3',
 'Aktywa_obrotowe',
 'wsk_plynnosc_biez_1']

In [84]:
#TODO  rozbieżność rozmiarowa względem orginalnego kodu

In [85]:
# =====================================
# DEBUG WRAPPER
# =====================================
class DebugTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, transformer, name=None, show_columns_for=None):
        # UWAGA: przypisuj dokładnie to, co przyszło w argumentach
        self.transformer = transformer
        self.name = name
        self.show_columns_for = show_columns_for  # NIE zmieniamy tutaj, nawet jeśli to None!

    def fit(self, X, y=None):
        name = self.name or self.transformer.__class__.__name__
        self.transformer.fit(X, y)
        print(f"[{name} - fit] columns: {X.shape[1]}")
        return self

    def transform(self, X):
        name = self.name or self.transformer.__class__.__name__
        X_trans = self.transformer.transform(X)
        print(f"[{name} - transform] columns: {X_trans.shape[1]}")

        # Dodatkowe logowanie kolumn tylko dla wybranych transformerów
        if self.show_columns_for and name in self.show_columns_for:
            cols = getattr(X_trans, "columns", None)
            if cols is not None:
                print(f"[{name} - columns]: {list(cols)}")
            else:
                print(f"[{name} - no column names available]")
        return X_trans

    def get_feature_names_out(self, input_features=None):
        if hasattr(self.transformer, "get_feature_names_out"):
            return self.transformer.get_feature_names_out(input_features)
        return input_features

# =====================================
# PIPELINES
# =====================================
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", DebugTransformer(UnimportantDropper(), "drop_num")),
    ("imp_num", DebugTransformer(NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy'
    ]), "imp_num")),
    ("log", DebugTransformer(LogTransformer(), "log")),
    ("cap", DebugTransformer(OutlierCapper(), "cap")),
    ("corr", DebugTransformer(HighCorrelationDropper(), "corr", show_columns_for=["corr"])),
    ("bin", DebugTransformer(Binner(), "bin")),
    ("woe", DebugTransformer(WOETransformer(), "woe"))
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", DebugTransformer(UnimportantDropper(), "drop_cat")),
    ("imp_cat", DebugTransformer(CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"]), "imp_cat")),
    ("woe_cat", DebugTransformer(WOETransformer(), "woe_cat"))
])

preprocess = ColumnNamePreservingCT([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", preprocess)
])


In [86]:
pipeline.fit(X_train, y_train)

[drop_num - fit] columns: 216
[drop_num - transform] columns: 153
[imp_num - fit] columns: 153
[imp_num - transform] columns: 82
[log - fit] columns: 82
[log - transform] columns: 82
[cap - fit] columns: 82
[cap - transform] columns: 82
[corr - fit] columns: 82
[corr - transform] columns: 47
[corr - columns]: ['Aktywa', 'Aktywa_obrotowe', 'Inwestycje_krotkoterminowe', 'Kapital_podstawowy', 'Naleznosci_krotkoterminowe', 'Zobowiazania_krotkoterminowe', 'Zobowiazania_rezerwy', 'Zysk_netto', 'koszty_finansowe', 'koszty_operacyjne', 'koszty_operacyjne_pozostale', 'podatek_dochodowy', 'pozostale_przychody_oper', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_koszty_odsetkowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_3', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_przychody', 'wsk_liczba_dni_istnienia', 'wsk_marza_ebitda_3', 'wsk_mnoznik_kap_wl', 'wsk_ogolnego_zadluzenia_1', 'wsk_p