In [1]:
# =====================================
# IMPORTY
# =====================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:

# =====================================
# PARAMETRY
# =====================================
DATA_PATH = "zbiór_8.csv"
RANDOM_SEED = 1
CORR_THRESHOLD = 0.8
N_BINS = 4
EPS = 1e-6



In [3]:

# =====================================
# WCZYTANIE + SPLIT
# =====================================
df = pd.read_csv(DATA_PATH)

df_train, df_tmp = train_test_split(df, train_size=0.7, stratify=df["default"], random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, stratify=df_tmp["default"], random_state=RANDOM_SEED)

X_train, y_train = df_train.drop(columns="default"), df_train["default"]
X_val, y_val = df_val.drop(columns="default"), df_val["default"]
X_test, y_test = df_test.drop(columns="default"), df_test["default"]

In [4]:
# -------------------------------------
# WRAPPERY DO ZACHOWANIA NAZW KOLUMN
# -------------------------------------
class ColumnNamePreservingPipeline(Pipeline):
    def transform(self, X):
        X_transformed = super().transform(X)
        if isinstance(X_transformed, np.ndarray):
            if hasattr(self.steps[-1][1], "get_feature_names_out"):
                cols = self.steps[-1][1].get_feature_names_out(X.columns)
            else:
                cols = X.columns
            X_transformed = pd.DataFrame(X_transformed, columns=cols, index=X.index)
        return X_transformed

class ColumnNamePreservingCT(ColumnTransformer):
    def transform(self, X):
        X_transformed = super().transform(X)
        cols = []
        for name, trans, cols_in in self.transformers_:
            if trans == 'drop':
                continue
            if hasattr(trans, 'get_feature_names_out'):
                cols.extend(trans.get_feature_names_out(cols_in))
            else:
                cols.extend(cols_in)
        return pd.DataFrame(X_transformed, columns=cols, index=X.index)


In [5]:

# =====================================
# TRANSFORMATORY
# =====================================

class UnimportantDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        zero_percentage = (X[:] == 0).sum() / len(X)

        cols_many_zeros = zero_percentage[zero_percentage > 0.5].index.tolist()

        cols_many_zeros = [col for col in cols_many_zeros 
                        if not col.startswith('wsk') 
                        and col != 'pkdKod' ]
        manual_cols=['szczegolnaFormaPrawna_Symbol', 'RP_przeplywy_operacyjne', 'RP_amortyzacja', 'wsk_pokrycie_zob_kr_gotowkowe_2', 'wsk_zadluzenia_gotowki_2', 'wsk_akt_generowania_got_2', 'wsk_pokrycie_wyd_fin_gotowkowe_2', 'wsk_zysk_CF_operacyjny', 'schemat_wsk_bilans']
        
        cols_to_drop = cols_many_zeros + manual_cols
        self.keep_ = [c for c in X.columns if c not in cols_to_drop]
        
        return self

    def transform(self, X):
        return X[self.keep_].copy()
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CategoricalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        
        self.cols_to_impute = cols_to_impute
        self.modes_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        
        for col in self.cols_to_impute:
            if X[col].notna().sum() > 0:
                self.modes_[col] = X[col].mode()[0]
            else:
                self.modes_[col] = None  # Na wypadek gdyby cała kolumna była NaN
        return self

    def transform(self, X):
        X = X.copy()
        for col, mode_val in self.modes_.items():
            if mode_val is not None:
                X[col] = X[col].replace(0, np.nan)
                X[col] = X[col].fillna(mode_val)
                
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
        
        

class NumericalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def fit(self, X, y=None):
        self.medians_ = {}
        groups = {
        '<5%': [],
        '5-20%': [],
        '20-50%': [],
        '>50%': []
        }

        for col in self.cols_to_impute:
            median_val = X.loc[X[col] != 0, col].median()
            X[col] = X[col].replace(0, np.nan)     
            X[col].fillna(median_val, inplace=True) 

        wsk_cols = [col for col in X.columns if col.startswith('wsk')]
        percent_missing = (
        ((X[wsk_cols].isna() | (X[wsk_cols] == 0) | (X[wsk_cols] == np.inf)).sum() / len(X)) * 100
        )

        for col, pct in percent_missing.items():
            if pct < 5:
                groups['<5%'].append(col)
            elif pct < 20:
                groups['5-20%'].append(col)
            elif pct < 50:
                groups['20-50%'].append(col)
            else:
                groups['>50%'].append(col)

        for group in ['<5%', '5-20%']:
            for col in groups[group]:
                if col in X.columns:
                    median_value = X[col].median()
                    X.replace([np.inf, -np.inf], np.nan, inplace=True)
                    X[col].fillna(median_value, inplace=True)

        self.drop_ = groups['20-50%'] + groups['>50%']

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        X = X.copy()
        X = X.drop(columns=self.drop_, errors="ignore")
        return X
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    

class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # wykrycie mocno skośnych kolumn (>2)
        skew_values = X.skew()
        self.cols_ = skew_values[skew_values > 2].index.tolist()
        # zachowanie kolejności kolumn
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_:
            min_val = X[col].min()
            X[col] = X[col] + abs(min_val) + 1
            X[col] = np.log(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)


class OutlierCapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.no_cap_cols = []
        self.moderate_iqr_cols = []
        self.percentile_cols = []

        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_1_5 = Q1 - 1.5 * IQR
            upper_1_5 = Q3 + 1.5 * IQR
            
            below = (X[col] < lower_1_5).sum()
            above = (X[col] > upper_1_5).sum()
            perc_outliers = (below + above) / len(X[col])
            
            if perc_outliers == 0:
                self.no_cap_cols.append(col)
            elif perc_outliers <= 0.1:
                self.moderate_iqr_cols.append(col)
            else:
                self.percentile_cols.append(col)
        return self

    def transform(self, X):
        for col in self.moderate_iqr_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower=lower, upper=upper)

        for col in self.percentile_cols:
            lower = X[col].quantile(0.01)
            upper = X[col].quantile(0.99)
            X[col] = X[col].clip(lower=lower, upper=upper)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)



class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y=None):
        Xc = X.copy()
        Xc = Xc[Xc.columns.sort_values()]  
        while True:
            corr = Xc.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

            # znajdź wszystkie pary z korelacją > threshold
            to_drop_pairs = [
                (col1, col2)
                for col1 in upper.columns
                for col2 in upper.index
                if pd.notnull(upper.loc[col1, col2]) and upper.loc[col1, col2] > self.threshold
            ]

            if not to_drop_pairs:
                break

            # zliczamy częstotliwość występowania kolumn w parach do usunięcia
            freq = {}
            for col1, col2 in to_drop_pairs:
                freq[col1] = freq.get(col1, 0) + 1
                freq[col2] = freq.get(col2, 0) + 1

            # wybieramy kolumnę, która występuje najczęściej
            max_freq = max(freq.values())
            max_cols = [col for col, f in freq.items() if f == max_freq]
            col_to_drop = sorted(max_cols)[-1]  # ostatnia alfabetycznie

            Xc = Xc.drop(columns=[col_to_drop])

        self.keep_ = Xc.columns.tolist()
        return self

    def transform(self, X):
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
    
class Binner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.bin_edges_ = {}
        for col in X.columns:
            if X[col].nunique() >= 4:
                try:
                    _, bins = pd.qcut(X[col], q=N_BINS, retbins=True, duplicates='drop')
                    self.bin_edges_[col] = bins
                except:
                    pass
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col, bins in self.bin_edges_.items():
            X[col] = pd.cut(X[col], bins=bins, include_lowest=True)
        print(X)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)

class WOETransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        self.maps_ = {}
        tmp = pd.concat([X, y], axis=1)
        for col in X.columns:
            grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
            grp['good'] = grp['count'] - grp['sum']
            grp['bad_dist'] = grp['sum'] / grp['sum'].sum()
            grp['good_dist'] = grp['good'] / grp['good'].sum()
            grp['woe'] = np.log((grp['good_dist'] + EPS) / (grp['bad_dist'] + EPS))
            self.maps_[col] = grp['woe'].to_dict()
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in X.columns:
            X[col] = X[col].map(self.maps_[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)


In [15]:
# =====================================
# PIPELINE
# =====================================
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", UnimportantDropper()),
    ("imp_num", NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy'
])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("bin", Binner()),
    ("woe", WOETransformer())
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", UnimportantDropper()),
    ("imp_cat", CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"])),
    ("woe_cat", WOETransformer())
])

preprocess = ColumnNamePreservingCT([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", preprocess)
])

In [17]:
# =====================================
# FIT + TRANSFORM
# =====================================
pipeline.fit(X_train, y_train)

X_train_t = pipeline.transform(X_train)
# X_val_t = pipeline.transform(X_val)
# X_test_t = pipeline.transform(X_test)


       Aktywa_obrotowe     Aktywa_trwale    Inwestycje_krotkoterminowe  \
904   (11.757, 12.875]  (11.234, 12.292]              (12.416, 12.674]   
1715  (14.194, 17.849]  (13.201, 16.152]              (13.069, 15.963]   
1449  (14.194, 17.849]  (13.201, 16.152]              (13.069, 15.963]   
894    (8.101, 11.757]  (13.201, 16.152]  (12.209000000000001, 12.416]   
1575  (14.194, 17.849]  (13.201, 16.152]              (13.069, 15.963]   
...                ...               ...                           ...   
920   (11.757, 12.875]  (11.234, 12.292]              (12.416, 12.674]   
2480  (12.875, 14.194]   (8.282, 11.234]              (12.416, 12.674]   
748   (11.757, 12.875]  (11.234, 12.292]              (12.416, 12.674]   
2866  (14.194, 17.849]   (8.282, 11.234]  (12.209000000000001, 12.416]   
578   (11.757, 12.875]   (8.282, 11.234]              (12.416, 12.674]   

     Kapital_podstawowy    Kapital_wlasny  Zobowiazania_krotkoterminowe  \
904     (8.525, 10.821]    (10.72, 1

In [83]:
X_train_t = X_train_t[sorted(X_train_t.columns)]
X_train_t

Unnamed: 0,Aktywa_obrotowe,Aktywa_trwale,Inwestycje_krotkoterminowe,Kapital_podstawowy,Kapital_wlasny,Zobowiazania_krotkoterminowe,Zysk_netto,formaWlasnosci_Symbol,koszty_finansowe,koszty_operacyjne,...,wsk_struktury_kapitalu,wsk_sytuacji_fin,wsk_udzial_kap_obrotowego_w_fin,wsk_udzial_kap_wlasnego_aktywa_2,wsk_udzial_kapitalu_obrotowego_netto,wsk_udzial_zob_biez_sprzedaz_2,wsk_zast_kapitalu_obcego,wsk_zast_kapitalu_podstawowego,wsk_zob_s_aktywa_rzeczowe,wsk_zwrot_aktywa_trwale
904,-0.221121,-0.294452,-0.107850,-0.009966,-0.208217,-0.165405,-0.520419,0.022720,,0.045023,...,0.552066,-0.329114,0.243977,0.191054,0.115831,0.517254,0.000000,-0.175144,0.217220,0.387764
1715,0.744436,0.625090,0.787256,0.387764,1.086840,0.387764,0.977528,-0.379485,0.450677,0.000000,...,0.357550,0.450677,0.243977,0.299385,0.387764,0.115831,0.663468,-0.160694,0.115831,0.387764
1449,0.744436,0.625090,0.787256,0.005249,1.086840,0.387764,0.977528,0.022720,-0.998443,0.703214,...,0.357550,0.450677,0.091767,0.299385,0.387764,0.517254,0.663468,0.022306,0.115831,0.387764
894,-0.475001,0.625090,-0.459531,0.387764,-0.208217,-0.165405,-0.520419,0.022720,0.450677,-0.475001,...,-0.702340,-0.329114,0.165451,-0.362904,-0.329114,0.517254,-0.459531,0.387764,-0.549915,0.852646
1575,0.744436,,0.787256,-0.009966,1.086840,0.387764,0.977528,-0.048527,0.450677,0.703214,...,0.552066,-0.105892,0.243977,0.191054,-0.329114,0.115831,-0.459531,0.022306,0.450677,-0.043440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,-0.221121,-0.294452,-0.107850,-0.009966,-0.208217,-0.140759,0.393695,0.022720,-0.998443,0.045023,...,0.552066,-0.329114,0.165451,-0.362904,0.387764,0.517254,0.000000,-0.175144,0.450677,-0.043440
2480,0.344839,-0.294452,,0.005249,-0.208217,-0.165405,-0.520419,0.022720,,0.045023,...,0.552066,0.450677,0.243977,0.191054,-0.064604,0.517254,0.000000,-0.175144,-0.549915,-0.043440
748,-0.221121,-0.294452,-0.107850,-0.236388,-0.208217,-0.165405,-0.080634,0.022720,0.068171,0.000000,...,0.552066,-0.329114,0.091767,0.191054,0.115831,-0.126040,0.000000,-0.160694,0.115831,-0.408659
2866,0.744436,-0.294452,-0.459531,0.387764,1.086840,0.387764,0.977528,-0.379485,0.450677,0.045023,...,0.552066,0.450677,0.165451,0.191054,0.387764,0.115831,0.000000,-0.175144,-0.549915,-0.043440


In [8]:
# Lista kolumn w kolejności alfabetycznej
columns_sorted = sorted(X_train_t.columns.to_list())

# Wyświetlenie
print(columns_sorted)

['Aktywa_obrotowe', 'Aktywa_trwale', 'Inwestycje_krotkoterminowe', 'Kapital_podstawowy', 'Kapital_wlasny', 'Zobowiazania_krotkoterminowe', 'Zysk_netto', 'formaWlasnosci_Symbol', 'koszty_finansowe', 'koszty_operacyjne', 'koszty_operacyjne_pozostale', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'przychody', 'przychody_sprzedazy', 'schemat_wsk_rzis', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_koszty_odsetkowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_3', 'wsk_fin_majatku_kapitalem', 'wsk_kapital_do_aktywa', 'wsk_koszty_przychody', 'wsk_liczba_dni_istnienia', 'wsk_marza_ebitda_3', 'wsk_mnoznik_kap_wl', 'wsk_plynnosc_biez_2', 'wsk_pokrycie_aktywow_kap_stalym', 'wsk_pokrycie_wyd_fin_gotowkowe_1', 'wsk_pokrycie_zob_kr_gotowkowe_1', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_rotacja_naleznosci', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_st

In [9]:
list1= ['Aktywa_obrotowe', 'Aktywa_trwale', 'Inwestycje_krotkoterminowe', 'Kapital_podstawowy', 'Kapital_wlasny', 'Zobowiazania_krotkoterminowe', 'Zysk_netto', 'default', 'formaWlasnosci_Symbol', 'koszty_finansowe', 'koszty_operacyjne', 'koszty_operacyjne_pozostale', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'przychody', 'przychody_sprzedazy', 'schemat_wsk_rzis', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_koszty_odsetkowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_3', 'wsk_fin_majatku_kapitalem', 'wsk_kapital_do_aktywa', 'wsk_koszty_przychody', 'wsk_liczba_dni_istnienia', 'wsk_marza_ebitda_3', 'wsk_mnoznik_kap_wl', 'wsk_plynnosc_biez_2', 'wsk_pokrycie_aktywow_kap_stalym', 'wsk_pokrycie_wyd_fin_gotowkowe_1', 'wsk_pokrycie_zob_kr_gotowkowe_1', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_rotacja_naleznosci', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struktury_finansowania', 'wsk_struktury_kapitalu', 'wsk_sytuacji_fin', 'wsk_udzial_kap_obrotowego_w_fin', 'wsk_udzial_kap_wlasnego_aktywa_2', 'wsk_udzial_kapitalu_obrotowego_netto', 'wsk_udzial_zob_biez_sprzedaz_2', 'wsk_zast_kapitalu_obcego', 'wsk_zast_kapitalu_podstawowego', 'wsk_zob_s_aktywa_rzeczowe', 'wsk_zwrot_aktywa_trwale']
list2=columns_sorted
len( list(set(list1) ^ set(list2))) #powinno być ~0

1

In [10]:
list(set(list1) ^ set(list2))

['default']

In [11]:
intersection_with_list1 = list((set(list1) ^ set(list2)) & set(list1))
intersection_with_list1

['default']

In [12]:
intersection_with_list2 = list((set(list1) ^ set(list2)) & set(list2))
intersection_with_list2

[]

In [13]:
#TODO  rozbieżność rozmiarowa względem orginalnego kodu