In [537]:
# =====================================
# IMPORTY
# =====================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [538]:

# =====================================
# PARAMETRY
# =====================================
DATA_PATH = "zbiór_8.csv"
RANDOM_SEED = 1
CORR_THRESHOLD = 0.8
N_BINS = 5
EPS = 1e-6
IV_THRESHOLD = 0.1



In [539]:

# =====================================
# WCZYTANIE + SPLIT
# =====================================
df = pd.read_csv(DATA_PATH)

df_train, df_tmp = train_test_split(df, train_size=0.7, stratify=df["default"], random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_tmp, test_size=0.5, stratify=df_tmp["default"], random_state=RANDOM_SEED)

X_train, y_train = df_train.drop(columns="default"), df_train["default"]
X_val, y_val = df_val.drop(columns="default"), df_val["default"]
X_test, y_test = df_test.drop(columns="default"), df_test["default"]

In [540]:
# -------------------------------------
# WRAPPERY DO ZACHOWANIA NAZW KOLUMN
# -------------------------------------
class ColumnNamePreservingPipeline(Pipeline):
    def transform(self, X):
        X_transformed = super().transform(X)
        if isinstance(X_transformed, np.ndarray):
            if hasattr(self.steps[-1][1], "get_feature_names_out"):
                cols = self.steps[-1][1].get_feature_names_out(X.columns)
            else:
                cols = X.columns
            X_transformed = pd.DataFrame(X_transformed, columns=cols, index=X.index)
        return X_transformed

class ColumnNamePreservingCT(ColumnTransformer):
    def transform(self, X):
        X_transformed = super().transform(X)
        cols = []
        for name, trans, cols_in in self.transformers_:
            if trans == 'drop':
                continue
            if hasattr(trans, 'get_feature_names_out'):
                cols.extend(trans.get_feature_names_out(cols_in))
            else:
                cols.extend(cols_in)
        return pd.DataFrame(X_transformed, columns=cols, index=X.index)


In [541]:
# =====================================
# TRANSFORMATORY
# =====================================

class UnimportantDropper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        zero_percentage = (X[:] == 0).sum() / len(X)

        cols_many_zeros = zero_percentage[zero_percentage > 0.5].index.tolist()

        cols_many_zeros = [col for col in cols_many_zeros 
                        if not col.startswith('wsk') 
                        and col != 'pkdKod' ]
        manual_cols=['szczegolnaFormaPrawna_Symbol', 'RP_przeplywy_operacyjne', 'RP_amortyzacja', 'wsk_pokrycie_zob_kr_gotowkowe_2', 'wsk_zadluzenia_gotowki_2', 'wsk_akt_generowania_got_2', 'wsk_pokrycie_wyd_fin_gotowkowe_2', 'wsk_zysk_CF_operacyjny', 'schemat_wsk_bilans']
        
        cols_to_drop = cols_many_zeros + manual_cols
        self.keep_ = [c for c in X.columns if c not in cols_to_drop]
        
        return self

    def transform(self, X):
        return X[self.keep_].copy()
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
class CategoricalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        
        self.cols_to_impute = cols_to_impute
        self.modes_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        
        for col in self.cols_to_impute:
            if X[col].notna().sum() > 0:
                self.modes_[col] = X[col].mode()[0]
            else:
                self.modes_[col] = None  # Na wypadek gdyby cała kolumna była NaN
        return self

    def transform(self, X):
        X = X.copy()
        for col, mode_val in self.modes_.items():
            if mode_val is not None:
                X[col] = X[col].replace(0, np.nan)
                X[col] = X[col].fillna(mode_val)
                
        return X
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
        
        

class NumericalRuleBasedImputer(BaseEstimator, TransformerMixin):
    def __init__(self, cols_to_impute):
        self.cols_to_impute = cols_to_impute

    def fit(self, X, y=None):
        self.medians_ = {}
        groups = {
        '<5%': [],
        '5-20%': [],
        '20-50%': [],
        '>50%': []
        }

        for col in self.cols_to_impute:
            median_val = X.loc[X[col] != 0, col].median()
            X[col] = X[col].replace(0, np.nan)     
            X[col].fillna(median_val, inplace=True) 

        wsk_cols = [col for col in X.columns if col.startswith('wsk')]
        percent_missing = (
        ((X[wsk_cols].isna() | (X[wsk_cols] == 0) | (X[wsk_cols] == np.inf)).sum() / len(X)) * 100
        )

        for col, pct in percent_missing.items():
            if pct < 5:
                groups['<5%'].append(col)
            elif pct < 20:
                groups['5-20%'].append(col)
            elif pct < 50:
                groups['20-50%'].append(col)
            else:
                groups['>50%'].append(col)

        for group in ['<5%', '5-20%']:
            for col in groups[group]:
                if col in X.columns:
                    median_value = X[col].median()
                    X.replace([np.inf, -np.inf], np.nan, inplace=True)
                    X[col].fillna(median_value, inplace=True)

        self.drop_ = groups['20-50%'] + groups['>50%']

        self.keep_ = [c for c in X.columns if c not in self.drop_]
        return self

    def transform(self, X):
        X = X.copy()
        X = X.drop(columns=self.drop_, errors="ignore")
        return X
    
    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    


In [542]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # wykrycie mocno skośnych kolumn (>2)
        skew_values = X.skew()
        self.cols_ = skew_values[skew_values > 2].index.tolist()
        # zachowanie kolejności kolumn
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.cols_:
            min_val = X[col].min()
            X[col] = X[col] + abs(min_val) + 1
            X[col] = np.log(X[col])
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)


class OutlierCapper(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.no_cap_cols = []
        self.moderate_iqr_cols = []
        self.percentile_cols = []

        for col in X.columns:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_1_5 = Q1 - 1.5 * IQR
            upper_1_5 = Q3 + 1.5 * IQR
            
            below = (X[col] < lower_1_5).sum()
            above = (X[col] > upper_1_5).sum()
            perc_outliers = (below + above) / len(X[col])
            
            if perc_outliers == 0:
                self.no_cap_cols.append(col)
            elif perc_outliers <= 0.1:
                self.moderate_iqr_cols.append(col)
            else:
                self.percentile_cols.append(col)
        return self

    def transform(self, X):
        for col in self.moderate_iqr_cols:
            Q1 = X[col].quantile(0.25)
            Q3 = X[col].quantile(0.75)
            IQR = Q3 - Q1
            lower = Q1 - 1.5 * IQR
            upper = Q3 + 1.5 * IQR
            X[col] = X[col].clip(lower=lower, upper=upper)

        for col in self.percentile_cols:
            lower = X[col].quantile(0.01)
            upper = X[col].quantile(0.99)
            X[col] = X[col].clip(lower=lower, upper=upper)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)



class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold

    def fit(self, X, y=None):
        Xc = X.copy()
        Xc = Xc[Xc.columns.sort_values()]  
        while True:
            corr = Xc.corr().abs()
            upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))

            # znajdź wszystkie pary z korelacją > threshold
            to_drop_pairs = [
                (col1, col2)
                for col1 in upper.columns
                for col2 in upper.index
                if pd.notnull(upper.loc[col1, col2]) and upper.loc[col1, col2] > self.threshold
            ]

            if not to_drop_pairs:
                break

            # zliczamy częstotliwość występowania kolumn w parach do usunięcia
            freq = {}
            for col1, col2 in to_drop_pairs:
                freq[col1] = freq.get(col1, 0) + 1
                freq[col2] = freq.get(col2, 0) + 1

            # wybieramy kolumnę, która występuje najczęściej
            max_freq = max(freq.values())
            max_cols = [col for col, f in freq.items() if f == max_freq]
            col_to_drop = sorted(max_cols)[-1]  # ostatnia alfabetycznie

            Xc = Xc.drop(columns=[col_to_drop])

        self.keep_ = Xc.columns.tolist()
        return self

    def transform(self, X):
        return X[self.keep_].copy()

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
    


In [543]:
class Binner(BaseEstimator, TransformerMixin):
    def __init__(self, n_bins=5):
        self.n_bins = n_bins

    def fit(self, X, y=None):
        self.bin_edges_ = {}
        self.keep_ = []
        
        for col in X.columns:
            s = X[col].replace([np.inf, -np.inf], np.nan).dropna()

            if s.nunique() >= 2:  # minimalnie 2 unikalne wartości potrzebne do podziału
                for q in range(self.n_bins, 1, -1):  # próbuj mniejszą liczbę binów, jeśli qcut się nie uda
                    try:
                        _, bins = pd.qcut(s, q=q, retbins=True, duplicates='drop')
                        # Rozszerz biny, żeby objęły cały zakres (i ewentualne outliery)
                        bins[0] = -np.inf
                        bins[-1] = np.inf
                        self.bin_edges_[col] = bins
                        break
                    except Exception:
                        continue
            # zapamiętaj kolumny, żeby zachować kolejność feature’ów
            self.keep_.append(col)

        return self

    def transform(self, X):
        X = X.copy()
        for col, bins in self.bin_edges_.items():
            # zamień infy na nan (będą obsłużone przez cut)
            X[col] = X[col].replace([np.inf, -np.inf], np.nan)
            X[col] = pd.cut(X[col], bins=bins, include_lowest=True)
        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)


class WOETransformer(BaseEstimator, TransformerMixin):
    """
    Transformer obliczający Weight of Evidence (WOE) dla zmiennych kategorycznych.
    - Obsługuje nowe, wcześniej niewidziane kategorie w transform().
    - Dla nieznanych kategorii przypisuje losową wartość WOE z rozkładu znanych.
    """

    def __init__(self, random_state=None, warn_on_unknown=True):
        """
        Parametry:
        ----------
        random_state : int | None
            Losowe ziarno dla powtarzalności wyników.
        warn_on_unknown : bool, default=True
            Czy ostrzegać o nowych kategoriach przy transformacji.
        """
        self.random_state = random_state
        self.warn_on_unknown = warn_on_unknown
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        self.maps_ = {}
        tmp = pd.concat([X, y], axis=1)
        for col in X.columns:
            grp = tmp.groupby(col)[y.name].agg(['sum', 'count'])
            grp['good'] = grp['count'] - grp['sum']
            grp['bad_dist'] = grp['sum'] / grp['sum'].sum()
            grp['good_dist'] = grp['good'] / grp['good'].sum()
            grp['woe'] = np.log((grp['good_dist'] + EPS) / (grp['bad_dist'] + EPS))
            self.maps_[col] = grp['woe'].to_dict()
        self.keep_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        rng = np.random.default_rng(self.random_state)

        for col in X.columns:
            mapping = self.maps_[col]
            known_woe_values = np.array(list(mapping.values()))
            
            mapped = X[col].map(mapping)
            unknown_mask = mapped.isna()

            if unknown_mask.any():
                if self.warn_on_unknown:
                    unknown_values = X.loc[unknown_mask, col].unique()
                    warnings.warn(
                        f"WOETransformer: kolumna '{col}' zawiera {len(unknown_values)} nieznanych kategorii: {unknown_values}. "
                        f"Zastąpiono losowymi wartościami z rozkładu znanych WOE."
                    )

                # Losujemy wartości WOE dla nieznanych kategorii
                random_values = rng.choice(known_woe_values, size=unknown_mask.sum(), replace=True)
                mapped.loc[unknown_mask] = random_values

            X[col] = mapped

        return X

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)



class IVSelectorDropper(BaseEstimator, TransformerMixin):
    """
    Selekcja cech na podstawie wartości IV (Information Value).
    Zakłada, że kolumny zawierają już wartości WOE.
    Zwraca zawsze pandas.DataFrame z zachowaniem kolumn.
    """

    def __init__(self, threshold=IV_THRESHOLD):
        self.threshold = threshold

    def fit(self, X, y):
        # Konwersja do DataFrame/Series dla bezpieczeństwa
        X = pd.DataFrame(X)
        y = pd.Series(y)

        n_event = np.sum(y == 1)
        n_non = np.sum(y == 0)

        ivs = {}
        for col in X.columns:
            iv_sum = 0.0
            vals = np.unique(X[col])
            for val in vals:
                # udział event i non-event
                p_event = np.sum((X[col] == val) & (y == 1)) / n_event
                p_non_event = np.sum((X[col] == val) & (y == 0)) / n_non
                woe_val = float(val)  # zakładamy, że val to WOE
                iv_sum += (p_non_event - p_event) * woe_val
            ivs[col] = iv_sum

        # Series z IV i lista kolumn do zachowania
        self.ivs_ = pd.Series(ivs).sort_values(ascending=False)
        self.keep_ = self.ivs_[self.ivs_ >= self.threshold].index.tolist()

        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        # wybór tylko kolumn z IV >= threshold
        X_selected = X[self.keep_]
        # upewniamy się, że wszystkie wartości numeryczne
        X_selected = X_selected.apply(pd.to_numeric, errors='coerce')
        return X_selected  # zawsze DataFrame

    def get_feature_names_out(self, input_features=None):
        return getattr(self, 'keep_', input_features)
    
    
class CorrDropper2(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.to_drop_ = []
        self.keep_ = []

    def fit(self, X, y=None):
        # liczymy korelacje tylko dla kolumn numerycznych
        corr_matrix = X.corr(numeric_only=True).abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        self.to_drop_ = [column for column in upper.columns if any(upper[column] > self.threshold)]
        # kolumny, które zachowujemy
        self.keep_ = [col for col in X.columns if col not in self.to_drop_]
        return self

    def transform(self, X):
        X_reduced = X.drop(columns=self.to_drop_, errors='ignore')
        
        return X_reduced

    def get_feature_names_out(self, input_features=None):
        # jeśli są zachowane kolumny, zwróć je
        if hasattr(self, 'keep_') and self.keep_:
            return self.keep_
        return input_features


In [544]:
# =====================================
# PIPELINE
# =====================================
cat_cols = ['formaWlasnosci_Symbol', 'pkdKod', 'schemat_wsk_rzis'] 
num_cols = [col for col in X_train.columns if col not in cat_cols]

num_pipeline = ColumnNamePreservingPipeline([
    ("drop_num", UnimportantDropper()),
    ("imp_num", NumericalRuleBasedImputer(cols_to_impute=[
            'Aktywa', 'Aktywa_trwale', 'Aktywa_obrotowe', 'Naleznosci_krotkoterminowe',
            'Inwestycje_krotkoterminowe', 'Srodki_pieniezne', 'Kapital_wlasny', 'Kapital_podstawowy',
            'Zysk_netto', 'Zobowiazania_rezerwy', 'Zobowiazania_krotkoterminowe',
            'przychody_sprzedazy', 'zysk_sprzedazy', 'pozostale_przychody_oper', 'koszty_operacyjne_pozostale',
            'zysk_operacyjny', 'koszty_finansowe', 'zysk_brutto', 'podatek_dochodowy'
])),
    ("log", LogTransformer()),
    ("cap", OutlierCapper()),
    ("corr", HighCorrelationDropper()),
    ("bin", Binner()),
    ("woe_num", WOETransformer()),
    ("IV_selector_num", IVSelectorDropper(threshold=IV_THRESHOLD)),
    ("corr2_num", CorrDropper2(threshold=CORR_THRESHOLD))
])

cat_pipeline = ColumnNamePreservingPipeline([
    ("drop_cat", UnimportantDropper()),
    ("imp_cat", CategoricalRuleBasedImputer(cols_to_impute=["formaWlasnosci_Symbol"])),
    ("woe_cat", WOETransformer()),
    ("IV_selector_cat", IVSelectorDropper(threshold=IV_THRESHOLD)),
    ("corr2_cat", CorrDropper2(threshold=CORR_THRESHOLD))
])

preprocess = ColumnNamePreservingCT([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols) 
])

pipeline = ColumnNamePreservingPipeline([
    ("preprocess", preprocess)
    
])

pipeline

In [545]:
# =====================================
# FIT + TRANSFORM
# =====================================
pipeline.fit(X_train, y_train)

X_train_t = pipeline.transform(X_train)
X_val_t = pipeline.transform(X_val)
X_test_t = pipeline.transform(X_test)



In [546]:
X_train_t = X_train_t[sorted(X_train_t.columns)]
X_train_t

Unnamed: 0,Aktywa_obrotowe,Inwestycje_krotkoterminowe,Kapital_wlasny,Zysk_netto,formaWlasnosci_Symbol,koszty_operacyjne,pkdKod,podatek_dochodowy,pozostale_przychody_oper,wsk_akt_generowania_got_1,...,wsk_koszty_przychody,wsk_mnoznik_kap_wl,wsk_plynnosc_biez_2,wsk_poziom_kapitalu_obrotowego_netto,wsk_rent_operacyjna,wsk_struktura_kap_wlasnego_s_2,wsk_struktury_finansowania,wsk_udzial_zob_biez_sprzedaz_2,wsk_zob_s_aktywa_rzeczowe,wsk_zwrot_aktywa_trwale
904,-0.294451,-0.107630,-0.249811,-0.127189,0.022720,-0.203598,-1.557395,-0.249811,-0.439949,0.146603,...,0.210624,0.210624,0.278299,-0.375301,0.552065,0.056542,-0.176096,0.313656,0.243976,-0.054067
1715,0.508715,0.798225,1.044536,1.044536,-0.379485,0.115831,7.875719,0.854782,0.597146,0.693142,...,0.552065,0.313656,0.085825,1.364515,0.313656,0.056542,0.744435,0.597146,0.000000,-0.054067
1449,0.508715,0.798225,1.044536,1.044536,0.022720,0.798225,-0.253189,0.854782,0.514324,0.693142,...,0.552065,0.313656,0.508715,1.364515,0.313656,0.056542,0.744435,0.597146,0.000000,-0.054067
894,-0.294451,-0.535242,0.146603,-0.535242,0.022720,-0.203598,0.616589,-0.249811,-0.104191,0.146603,...,-0.179860,-0.757364,0.508715,-0.375301,0.313656,-0.725233,-0.294451,0.313656,-0.725233,-0.105892
1575,0.508715,0.798225,1.044536,1.044536,-0.048527,0.210624,-0.379478,0.854782,0.597146,0.313656,...,-0.179860,0.210624,0.278299,1.364515,0.313656,0.387763,-0.294451,-0.131028,0.508715,0.027946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
920,-0.054067,-0.107630,-0.249811,0.510202,0.022720,0.210624,0.552047,-0.188996,0.514324,0.693142,...,0.552065,0.210624,0.278299,-0.179860,0.313656,0.387763,0.744435,0.597146,0.387763,0.027946
2480,-0.054067,-0.535242,0.146603,-0.127189,0.022720,0.210624,-0.864906,-0.249811,-0.104191,0.146603,...,0.210624,0.210624,-0.027328,0.744435,0.027946,0.387763,-0.294451,0.313656,0.508715,0.027946
748,-0.294451,-0.107630,-0.249811,-0.127189,0.022720,-0.535242,-0.641764,-0.249811,-0.439949,0.146603,...,-0.399939,0.210624,0.278299,-0.375301,-0.316220,0.387763,-0.176096,-0.054067,0.000000,-0.516689
2866,0.508715,-0.107630,0.552065,0.510202,-0.379485,0.210624,-0.641674,0.779516,0.514324,0.146603,...,0.210624,0.210624,0.508715,1.364515,0.552065,0.387763,0.744435,-0.131028,0.508715,0.027946


In [547]:
X_train_t.isna().sum()[X_train_t.isna().sum() > 0]

Series([], dtype: int64)

In [548]:
X_train_t[X_train_t.isna().any(axis=1)]


Unnamed: 0,Aktywa_obrotowe,Inwestycje_krotkoterminowe,Kapital_wlasny,Zysk_netto,formaWlasnosci_Symbol,koszty_operacyjne,pkdKod,podatek_dochodowy,pozostale_przychody_oper,wsk_akt_generowania_got_1,...,wsk_koszty_przychody,wsk_mnoznik_kap_wl,wsk_plynnosc_biez_2,wsk_poziom_kapitalu_obrotowego_netto,wsk_rent_operacyjna,wsk_struktura_kap_wlasnego_s_2,wsk_struktury_finansowania,wsk_udzial_zob_biez_sprzedaz_2,wsk_zob_s_aktywa_rzeczowe,wsk_zwrot_aktywa_trwale


In [549]:
X_val_t = X_val_t[sorted(X_val_t.columns)]
X_val_t.isna().sum()[X_val_t.isna().sum() > 0]


Series([], dtype: int64)

In [550]:
X_test_t = X_test_t[sorted(X_test_t.columns)]
X_test_t.isna().sum()[X_test_t.isna().sum() > 0]

Series([], dtype: int64)

In [551]:
# Lista kolumn w kolejności alfabetycznej
columns_sorted = sorted(X_train_t.columns.to_list())

# Wyświetlenie
print(columns_sorted)

['Aktywa_obrotowe', 'Inwestycje_krotkoterminowe', 'Kapital_wlasny', 'Zysk_netto', 'formaWlasnosci_Symbol', 'koszty_operacyjne', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'wsk_akt_generowania_got_1', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_przychody', 'wsk_mnoznik_kap_wl', 'wsk_plynnosc_biez_2', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struktury_finansowania', 'wsk_udzial_zob_biez_sprzedaz_2', 'wsk_zob_s_aktywa_rzeczowe', 'wsk_zwrot_aktywa_trwale']


In [552]:
list1= ['Aktywa_obrotowe', 'Inwestycje_krotkoterminowe', 'Kapital_wlasny', 'Zysk_netto', 'formaWlasnosci_Symbol', 'koszty_operacyjne', 'pkdKod', 'podatek_dochodowy', 'pozostale_przychody_oper', 'wsk_akt_generowania_got_1', 'wsk_ebitda_3', 'wsk_ebitda_aktywa_3', 'wsk_ebitda_koszty_finansowe_3', 'wsk_ebitda_zobowiazan_3', 'wsk_fin_majatku_kapitalem', 'wsk_koszty_przychody', 'wsk_mnoznik_kap_wl', 'wsk_plynnosc_biez_2', 'wsk_poziom_kapitalu_obrotowego_netto', 'wsk_rent_operacyjna', 'wsk_struktura_kap_wlasnego_s_2', 'wsk_struktury_finansowania', 'wsk_udzial_zob_biez_sprzedaz_2', 'wsk_zob_s_aktywa_rzeczowe', 'wsk_zwrot_aktywa_trwale']
list2=columns_sorted
len( list(set(list1) ^ set(list2))) #powinno być ~0

0

In [553]:
list(set(list1) ^ set(list2))

[]

In [554]:
intersection_with_list1 = list((set(list1) ^ set(list2)) & set(list1))
intersection_with_list1

[]

In [555]:
intersection_with_list2 = list((set(list1) ^ set(list2)) & set(list2))
intersection_with_list2

[]

## To tak żeby szybko zobaczyć perspektywy

In [556]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
)
from IPython.display import display, HTML

# ---------------------------------------------------------
# 1️⃣ Model pipeline z regresją logistyczną
# ---------------------------------------------------------

model_pipeline = ColumnNamePreservingPipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(
        solver="lbfgs",            # stabilny, szybki, dobrze działa z regularizacją
        class_weight="balanced",   # automatyczne ważenie klas (1/freq)
        penalty="l2",              # L2 regularization (domyślna)
        C=0.4,                     # siła regularyzacji (mniejsza wartość = mocniejsza kara za złożony model)
        max_iter=500,              # więcej iteracji, żeby solver się zbiegał
        random_state=2,
        n_jobs=-1                  # równoległość (jeśli masz dużą liczbę zmiennych)
    ))
])

# ---------------------------------------------------------
# 2️⃣ Fit
# ---------------------------------------------------------

model_pipeline.fit(X_train, y_train)

# ---------------------------------------------------------
# 3️⃣ Predykcje
# ---------------------------------------------------------

y_pred_train = model_pipeline.predict(X_train)
y_proba_train = model_pipeline.predict_proba(X_train)[:, 1]

y_pred_val = model_pipeline.predict(X_val)
y_proba_val = model_pipeline.predict_proba(X_val)[:, 1]

# ---------------------------------------------------------
# 4️⃣ Funkcja do liczenia metryk
# ---------------------------------------------------------

def compute_metrics(y_true, y_pred, y_proba):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, zero_division=0),
        "ROC AUC": roc_auc_score(y_true, y_proba)
    }

train_metrics = compute_metrics(y_train, y_pred_train, y_proba_train)
val_metrics = compute_metrics(y_val, y_pred_val, y_proba_val)

# ---------------------------------------------------------
# 5️⃣ Tabela porównawcza
# ---------------------------------------------------------

results_df = pd.DataFrame([train_metrics, val_metrics], index=["Train", "Validation"]).T
results_df["Diff (Train - Val)"] = results_df["Train"] - results_df["Validation"]
results_df = results_df.round(2)

styled = (
    results_df.style
    .background_gradient(cmap="RdYlGn", axis=1)  # kolory: zielony=lepiej, czerwony=gorzej
    .format("{:.2f}")
    .set_caption(" Porównanie jakości modelu: Train vs Validation")
)

display(styled)


Unnamed: 0,Train,Validation,Diff (Train - Val)
Accuracy,0.73,0.55,0.19
Precision,0.24,0.14,0.11
Recall,0.84,0.7,0.14
F1 Score,0.38,0.23,0.15
ROC AUC,0.87,0.64,0.23
