In [1]:
# =====================================
# IMPORTY
# =====================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer



In [2]:

# =====================================
# PARAMETERS
# =====================================
# Path of the CSV file that the loading cell reads with pd.read_csv.
DATA_PATH = "zbiór_8.csv"
# random_state passed to both train_test_split calls.
RANDOM_SEED = 1
# Default `threshold` of HighCorrelationDropper.
CORR_THRESHOLD = 0.8
# Number of quantiles requested from pd.qcut inside Binner.fit.
N_BINS = 4
# Added to both distributions inside the WOE log ratio in WOETransformer.fit.
EPS = 1e-6



In [3]:

# =====================================
# LOAD + SPLIT
# =====================================
df = pd.read_csv(DATA_PATH)

# Carve off 70 % of the rows for training, then halve the remainder into
# validation and test. Both splits are stratified on the "default" column
# and share the same seed.
df_train, df_tmp = train_test_split(
    df,
    train_size=0.7,
    stratify=df["default"],
    random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
    df_tmp,
    test_size=0.5,
    stratify=df_tmp["default"],
    random_state=RANDOM_SEED,
)

# Separate the feature columns from the "default" target for every partition.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (part.drop(columns="default"), part["default"])
    for part in (df_train, df_val, df_test)
)

In [4]:
# -------------------------------------
# WRAPPERY DO ZACHOWANIA NAZW KOLUMN
# -------------------------------------
class ColumnNamePreservingPipeline(Pipeline):
    """Pipeline whose ``transform`` hands back a DataFrame instead of an ndarray.

    Results that are not ``np.ndarray`` instances are returned untouched.
    """

    def transform(self, X):
        """Run the parent ``transform`` and re-wrap ndarray output.

        Column labels come from the last step's ``get_feature_names_out``
        (called with ``X.columns``) when that method exists, otherwise from
        ``X.columns`` itself. The index of ``X`` is reused for the result.
        """
        result = super().transform(X)
        if not isinstance(result, np.ndarray):
            return result

        final_step = self.steps[-1][1]
        if hasattr(final_step, "get_feature_names_out"):
            names = final_step.get_feature_names_out(X.columns)
        else:
            names = X.columns
        return pd.DataFrame(result, columns=names, index=X.index)

class ColumnNamePreservingCT(ColumnTransformer):
    """ColumnTransformer whose ``transform`` returns a labelled DataFrame."""

    def transform(self, X):
        """Run the parent ``transform`` and wrap the result in a DataFrame.

        Labels are gathered from ``self.transformers_`` in order, skipping
        entries whose transformer equals ``'drop'``. A transformer exposing
        ``get_feature_names_out`` is asked for its names (given its input
        column selection); otherwise the input selection itself is used.
        """

        def names_for(fitted, selected):
            # Prefer the transformer's own report of its output names.
            if hasattr(fitted, 'get_feature_names_out'):
                return fitted.get_feature_names_out(selected)
            return selected

        stacked = super().transform(X)

        names = []
        for _label, fitted, selected in self.transformers_:
            if fitted == 'drop':
                continue
            names.extend(names_for(fitted, selected))

        return pd.DataFrame(stacked, columns=names, index=X.index)


In [5]:

# =====================================
# TRANSFORMATORY
# =====================================
class UnimportantDropper(BaseEstimator, TransformerMixin):
    """Drop columns whose combined share of missing and zero values is >= 0.20."""

    def fit(self, X, y=None):
        """Record in ``keep_`` the columns that stay below the 0.20 share."""

        def empty_share(column):
            # Fraction of NaNs plus fraction of entries that equal zero once
            # coerced to numeric (non-numeric entries coerce to NaN).
            missing = column.isna().mean()
            zeros = (pd.to_numeric(column, errors='coerce') == 0).mean()
            return missing + zeros

        self.keep_ = [
            name for name in X.columns
            if not (empty_share(X[name]) >= 0.20)
        ]
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the columns kept at fit time."""
        kept = X[self.keep_]
        return kept.copy()

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        if hasattr(self, 'keep_'):
            return self.keep_
        return input_features

class RuleBasedImputer(BaseEstimator, TransformerMixin):
    """Median/mode imputer driven by the missing-or-zero rate of ``wsk*`` columns.

    For every column whose name starts with ``"wsk"`` the rate
    ``isna().mean() + (== 0).mean()`` is computed at fit time:

    * rate < 0.05          -> ``low_``  (imputed with the median)
    * 0.05 <= rate < 0.20  -> ``mid_``  (imputed with the median)
    * rate >= 0.20         -> ``drop_`` (removed in ``transform``)

    Columns listed in ``cols_to_impute`` are imputed the same way when they
    are present. Zeros are treated as missing: they are replaced by NaN
    before the median is taken and before filling.
    ``"formaWlasnosci_Symbol"`` (when present) is filled with its mode.

    Parameters
    ----------
    cols_to_impute : list of str
        Extra column names to median-impute; names absent from ``X`` are
        skipped silently.
    """

    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def _impute_targets(self):
        """Ordered, de-duplicated list of columns that receive a median fill."""
        candidates = self.low_ + self.mid_ + list(self.cols_to_impute)
        # dict.fromkeys keeps first-seen order while removing repeats, so a
        # column named both in low_/mid_ and cols_to_impute is handled once.
        return list(dict.fromkeys(candidates))

    def fit(self, X, y=None):
        """Learn rate buckets, per-column medians and the categorical mode."""
        self.medians_ = {}

        missing_rates = {
            c: X[c].isna().mean() + (X[c] == 0).mean()
            for c in X.columns
            if c.startswith("wsk")
        }

        self.low_ = [c for c, r in missing_rates.items() if r < 0.05]
        self.mid_ = [c for c, r in missing_rates.items() if 0.05 <= r < 0.20]
        self.drop_ = [c for c, r in missing_rates.items() if r >= 0.20]

        for col in self._impute_targets():
            if col in X.columns:
                self.medians_[col] = X[col].replace(0, np.nan).median()

        # Always define mode_forma_ so transform never hits AttributeError.
        # An all-NaN column has an empty mode(); None then means
        # "nothing to fill with".
        self.mode_forma_ = None
        if "formaWlasnosci_Symbol" in X.columns:
            modes = X["formaWlasnosci_Symbol"].mode()
            if not modes.empty:
                self.mode_forma_ = modes.iloc[0]

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``drop_`` removed and gaps filled."""
        X = X.copy()
        X = X.drop(columns=self.drop_, errors="ignore")

        for col in self._impute_targets():
            # A column that was absent at fit time has no learned median;
            # leave it untouched instead of raising KeyError.
            if col in X.columns and col in self.medians_:
                X[col] = X[col].replace(0, np.nan).fillna(self.medians_[col])

        mode_value = getattr(self, "mode_forma_", None)
        if mode_value is not None and "formaWlasnosci_Symbol" in X.columns:
            X["formaWlasnosci_Symbol"] = X["formaWlasnosci_Symbol"].fillna(mode_value)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        return getattr(self, 'keep_', input_features)

class LogTransformer(BaseEstimator, TransformerMixin):
    """Apply ``log1p`` to columns whose absolute skewness exceeds 2."""

    def fit(self, X, y=None):
        """Store all input columns in ``keep_`` and the skewed ones in ``cols_``."""
        self.keep_ = X.columns.tolist()
        abs_skew = X.skew().abs()
        self.cols_ = [name for name, value in abs_skew.items() if value > 2]
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``cols_`` clipped at 0 and log1p-ed."""
        out = X.copy()
        # Negative values are clipped to 0 first, so log1p maps them to 0.
        non_negative = out[self.cols_].clip(lower=0)
        out[self.cols_] = np.log1p(non_negative)
        return out

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        if hasattr(self, 'keep_'):
            return self.keep_
        return input_features

class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip every column to ``[Q1 - 3*IQR, Q3 + 3*IQR]`` learned at fit time."""

    def fit(self, X, y=None):
        """Compute the (lower, upper) clipping bounds per column."""

        def fence(series):
            # Quartile-based fence: three inter-quartile ranges either side.
            first_q = series.quantile(0.25)
            third_q = series.quantile(0.75)
            spread = third_q - first_q
            return (first_q - 3 * spread, third_q + 3 * spread)

        self.bounds_ = {name: fence(X[name]) for name in X.columns}
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each fitted column clipped to its bounds."""
        capped = X.copy()
        for name in self.bounds_:
            lower, upper = self.bounds_[name]
            capped[name] = capped[name].clip(lower, upper)
        return capped

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        if hasattr(self, 'keep_'):
            return self.keep_
        return input_features

class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    """Greedily drop columns until no absolute pairwise correlation reaches ``threshold``.

    Parameters
    ----------
    threshold : float, default=CORR_THRESHOLD
        Dropping stops once the largest remaining absolute off-diagonal
        correlation is strictly below this value.
    """

    def __init__(self, threshold=CORR_THRESHOLD):
        self.threshold = threshold

    def fit(self, X, y=None):
        """Decide which columns to keep and store them in ``keep_``.

        On each round the column holding the largest absolute correlation
        (first one on ties) is removed.
        """
        # Each entry of DataFrame.corr() comes from one pair of columns only,
        # so removing a column never changes the remaining entries. The matrix
        # is therefore computed once and shrunk, not recomputed every round.
        corr = X.corr().abs()

        # Zero the diagonal on an explicit writable copy. Writing through
        # `corr.values` fails once pandas Copy-on-Write makes it read-only.
        values = corr.to_numpy(copy=True)
        np.fill_diagonal(values, 0.0)
        corr = pd.DataFrame(values, index=corr.index, columns=corr.columns)

        dropped = set()
        # `not corr.empty` lets an input with no correlatable columns pass
        # through untouched instead of crashing in idxmax().
        while not corr.empty:
            column_max = corr.max()
            if column_max.max() < self.threshold:
                break
            worst = column_max.idxmax()
            corr = corr.drop(index=worst, columns=worst)
            dropped.add(worst)

        self.keep_ = [c for c in X.columns if c not in dropped]
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the columns kept at fit time."""
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        return getattr(self, 'keep_', input_features)

class Binner(BaseEstimator, TransformerMixin):
    """Quantile-bin columns with at least 4 distinct values.

    Bin edges are learned with ``pd.qcut`` (``q=N_BINS``, duplicate edges
    dropped) at fit time and applied with ``pd.cut`` at transform time.
    Columns with fewer than 4 distinct values, or for which binning fails,
    are passed through unchanged.
    """

    def fit(self, X, y=None):
        """Learn per-column bin edges into ``bin_edges_``.

        Columns whose binning raised are recorded in ``skipped_`` (column ->
        repr of the exception) so the skip is inspectable rather than silent.
        """
        self.bin_edges_ = {}
        self.skipped_ = {}
        for col in X.columns:
            if X[col].nunique() < 4:
                continue
            try:
                _, edges = pd.qcut(X[col], q=N_BINS, retbins=True, duplicates='drop')
                edges = np.array(edges, dtype=float)
            except Exception as exc:  # best-effort: leave the column unbinned
                self.skipped_[col] = repr(exc)
                continue
            # Open the outer bins. Otherwise any value outside the training
            # min/max seen at transform time falls outside every bin and
            # pd.cut turns it into NaN.
            edges[0] = -np.inf
            edges[-1] = np.inf
            self.bin_edges_[col] = edges
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with fitted columns replaced by interval bins."""
        X = X.copy()
        for col, bins in self.bin_edges_.items():
            X[col] = pd.cut(X[col], bins=bins, include_lowest=True)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        return getattr(self, 'keep_', input_features)

class WOETransformer(BaseEstimator, TransformerMixin):
    """Replace each category/bin by its Weight of Evidence.

    For every column and every group ``g`` of that column::

        woe(g) = log((good_dist(g) + EPS) / (bad_dist(g) + EPS))

    where ``bad`` is the sum of ``y`` in the group, ``good`` is
    ``count - bad``, and each ``*_dist`` is the group's share of the column
    total.
    """

    def fit(self, X, y):
        """Learn a ``{group: woe}`` mapping per column into ``maps_``.

        ``y`` must be a named Series; it is concatenated to ``X`` by index.
        """
        self.maps_ = {}
        tmp = pd.concat([X, y], axis=1)
        for col in X.columns:
            # observed=False is spelled out to pin the current behaviour for
            # categorical columns: empty categories stay in the table and get
            # log(EPS / EPS) = 0. Presumably relying on the implicit default is
            # what triggers the warning repeated in the fit cell's output.
            grp = tmp.groupby(col, observed=False)[y.name].agg(['sum', 'count'])
            grp['good'] = grp['count'] - grp['sum']
            grp['bad_dist'] = grp['sum'] / grp['sum'].sum()
            grp['good_dist'] = grp['good'] / grp['good'].sum()
            grp['woe'] = np.log((grp['good_dist'] + EPS) / (grp['bad_dist'] + EPS))
            self.maps_[col] = grp['woe'].to_dict()
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column mapped to float WOE values.

        Values that had no group at fit time (including NaN) map to NaN.
        """
        X = X.copy()
        for col in X.columns:
            # Mapping a categorical column can yield a categorical result;
            # cast so the output is always plain float.
            X[col] = X[col].map(self.maps_[col]).astype(float)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return ``keep_`` once fitted, otherwise echo ``input_features``."""
        return getattr(self, 'keep_', input_features)


In [6]:

# =====================================
# PIPELINE
# =====================================
# Columns are routed by dtype: numeric ones go through the full numeric
# chain, everything else through the categorical chain.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

# Categorical branch: rule-based imputation followed by WOE encoding.
cat_pipeline = ColumnNamePreservingPipeline(steps=[
    ("imp_cat", RuleBasedImputer(cols_to_impute=[])),
    ("woe_cat", WOETransformer()),
])

# Numeric branch.
# NOTE(review): RuleBasedImputer skips names that are absent from X. Several
# names below differ in case from column names visible in the notebook's
# printed output (e.g. 'Przychody' vs 'przychody', 'Zysk_netto' vs
# 'zysk_netto') -- confirm them against the dataset's actual header.
num_pipeline = ColumnNamePreservingPipeline(steps=[
    ("drop", UnimportantDropper()),
    ("impute", RuleBasedImputer(cols_to_impute=[
        'Aktywa',
        'Kapital_wlasny',
        'Zysk_netto',
        'Zysk_strata_brutto',
        'Przychody',
        'Koszty_operacyjne',
        'Zobowiazania_dl',
        'Zobowiazania_krt',
    ])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("bin", Binner()),
    ("woe", WOETransformer()),
])

preprocess = ColumnNamePreservingCT(transformers=[
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

pipeline = ColumnNamePreservingPipeline(steps=[
    ("preprocess", preprocess),
])

In [7]:
# =====================================
# FIT + TRANSFORM
# =====================================
# Fit on the training partition only, then apply the fitted pipeline to the
# train, validation and test partitions in that order.
pipeline.fit(X_train, y_train)

X_train_t, X_val_t, X_test_t = (
    pipeline.transform(features) for features in (X_train, X_val, X_test)
)


  adjusted = values - mean
  diff_b_a = subtract(b, a)
  diff_b_a = subtract(b, a)
  lerp_interpolation = asanyarray(add(a, diff_b_a * t, out=out))
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
  grp = tmp.groupby(col)[y.

In [8]:
# Column names of the transformed training frame, in alphabetical order
columns_sorted = sorted(X_train_t.columns)

# Display them
print(columns_sorted)

['Aktywa_obrotowe', 'Kapital_podstawowy', 'Kapital_wlasny', 'Naleznosci_krotkoterminowe', 'Zobowiazania_rezerwy', 'formaWlasnosci_Symbol', 'koszty_operacyjne', 'pkdKod', 'przychody', 'schemat_wsk_bilans', 'schemat_wsk_rzis', 'szczegolnaFormaPrawna_Symbol', 'wsk_akt_generowania_got_1', 'wsk_cykl_konwersji_gotowki', 'wsk_ebitda_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_ebitda_zobowiazan_odsetki_4', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_operacyjne', 'wsk_liczba_dni_istnienia', 'wsk_plynnosc_biez_1', 'wsk_plynnosc_szybka_2', 'wsk_pokrycie_wyd_fin_gotowkowe_1', 'wsk_pokrycie_zob_kr_gotowkowe_1', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_poziom_kosztow_operacyjnych', 'wsk_stopa_zysku_sprzedaz', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struktury_finansowania', 'wsk_struktury_kapitalu', 'wsk_sytuacji_fin', 'wsk_udzial_kap_obrotowego_w_fin', 'wsk_udzial_kap_wlasnego_aktywa_2', 'wsk_udzial_kapitalu_obrotowego_netto', 'wsk_udzial_zob_biez_sprzedaz_2', 'wsk_zadluzeni

In [None]:
# Reference column list to compare against (presumably the output of the
# original code mentioned in the TODO below -- confirm).
# NOTE(review): it contains 'default', which is dropped from X before the
# pipeline runs, so it can never appear in columns_sorted.
list1 = [
    'Aktywa_trwale',
    'Kapital_podstawowy',
    'Kapital_wlasny',
    'Naleznosci_krotkoterminowe',
    'Srodki_pieniezne',
    'Zobowiazania_krotkoterminowe',
    'Zobowiazania_rezerwy',
    'default',
    'formaWlasnosci_Symbol',
    'koszty_finansowe',
    'koszty_operacyjne',
    'koszty_operacyjne_pozostale',
    'pkdKod',
    'podatek_dochodowy',
    'pozostale_przychody_oper',
    'przychody',
    'przychody_sprzedazy',
    'schemat_wsk_rzis',
    'wsk_akt_generowania_got_1',
    'wsk_cykl_konwersji_gotowki',
    'wsk_ebitda_koszty_finansowe_3',
    'wsk_ebitda_zobowiazan_3',
    'wsk_ebitda_zobowiazan_odsetki_4',
    'wsk_fin_majatku_kapitalem',
    'wsk_liczba_dni_istnienia',
    'wsk_marza_ebitda_3',
    'wsk_mnoznik_kap_wl',
    'wsk_plynnosc_szybka_2',
    'wsk_pokrycie_wyd_fin_gotowkowe_1',
    'wsk_pokrycie_zob_kr_gotowkowe_1',
    'wsk_poziom_kapitalu_obrotowego_netto',
    'wsk_poziom_kosztow_operacyjnych',
    'wsk_rent_operacyjna',
    'wsk_stopa_marzy_brutto',
    'wsk_stopa_zysku_sprzedaz',
    'wsk_struktura_kap_obcego_s',
    'wsk_struktura_kap_wlasnego_s_1',
    'wsk_struktura_kap_wlasnego_s_2',
    'wsk_struktury_finansowania',
    'wsk_struktury_kapitalu',
    'wsk_sytuacji_fin',
    'wsk_udzial_kap_obrotowego_w_fin',
    'wsk_udzial_kap_wlasnego_aktywa_2',
    'wsk_udzial_kapitalu_obrotowego_netto',
    'wsk_udzial_zob_biez_sprzedaz_2',
    'wsk_zast_kapitalu_obcego',
    'wsk_zast_kapitalu_podstawowego',
    'wsk_zast_kapitalu_stalego',
    'wsk_zob_s_aktywa_rzeczowe',
    'wsk_zwrot_aktywa_mat',
    'wsk_zwrot_aktywa_trwale',
    'zysk_brutto',
    'zysk_netto',
    'zysk_operacyjny',
]
list2 = columns_sorted
# Length of the longer list minus the number of shared names; should be ~0
max(len(list1), len(list2)) - len(set(list1).intersection(list2))

19

In [None]:
# TODO: size (column-count) discrepancy relative to the original code