In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from featurewiz_polars import Sulov_MRMR, Polars_DateTimeTransformer, Polars_CategoricalEncoder
from featurewiz_polars import Polars_MissingTransformer, YTransformer, Polars_ColumnEncoder
import polars as pl
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import time
from sklearn.pipeline import Pipeline
import pdb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [5]:
#mrmr = Sulov_MRMR(corr_threshold=0.7, verbose=0)
X_pipeline = Pipeline([
    ('datetime_transformer', Polars_DateTimeTransformer(datetime_features=[])), # Specify your datetime columns
    ('cat_transformer', Polars_CategoricalEncoder(encoding_type='ordinal', categorical_features='auto')),
    ('nan_transformer', Polars_MissingTransformer(strategy="median")),
    ('ytransformer', YTransformer()),
    ])
Y_pipeline = Pipeline([
    ('featurewiz', Sulov_MRMR(corr_threshold=0.7, verbose=0)),
    ])
##    Usage missing value fillers
feature_selection = Pipeline([
        ('X_pipeline', X_pipeline),
        ('Y_pipeline', Y_pipeline)
    ])
model_type = 'classification'
if model_type == 'Regression':
    model = RandomForestRegressor(n_estimators=100, random_state=99)
else:
    model = RandomForestClassifier(n_estimators=100, random_state=99)

In [8]:
class Classx( TransformerMixin): # Class name 
    def __init__(self, model=model, 
            model_type=model_type, encoding_type='target', 
            imputation_strategy='mean', corr_threshold = 0.7,
            verbose = 0):
        self.model = model
        self.model_type = model_type.lower()
        self.encoding_type = encoding_type.lower()
        self.imputation_strategy = imputation_strategy.lower()
        self.corr_threshold = corr_threshold
        self.feature_selection = feature_selection
        self.y_encoder = Polars_ColumnEncoder()

    def fit(self, X, y):
        self.feature_selection.fit(X,y)
        self.y_encoder.fit(y)
        return self

    def transform(self, X, y=None):
        if y is None:
            return self.feature_selection.transform(X)
        else:
            Xt = self.feature_selection.transform(X)
            yt = self.y_encoder.transform(y)
            return Xt, yt

    def fit_transform(self, X, y):
        self.fit(X, y)
        Xt = self.transform(X)
        yt = self.y_encoder.transform(y)
        return Xt, yt
testp = Classx()

In [10]:
datapath = "../../data_sets/"
filename = "heart.csv"

In [11]:
df = pl.read_csv(datapath+filename, null_values='NULL', try_parse_dates=True)#.sample(1000)
print('Loaded data...', df.shape)
target = 'target' # Replace with your target column name
model_type = 'Classification'
if target not in df.columns:
    print(f"Error: Target column '{target}' not found in the CSV file.")
    exit()
predictors = [x for x in df.columns if x!=target]
X = df[predictors]
y = df[target]
print('Data dimensions (rows x cols) = %d dims' %(int(X.shape[0]*X.shape[1])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.dtype

Loaded data... (303, 14)
Data dimensions (rows x cols) = 3939 dims


Int64

In [12]:
testp.fit(X, y)
Xt, yt = testp.transform(X_train,y_train)
print(type(Xt))
print(type(yt))

Model type: Classification
SULOV selected Features (13): ['thal', 'cp', 'exang', 'oldpeak', 'ca', 'thalach', 'chol', 'slope', 'trestbps', 'restecg', 'age', 'sex', 'fbs']

Recursive XGBoost selected Features (5): ['age', 'ca', 'cp', 'sex', 'thal']
<class 'polars.dataframe.frame.DataFrame'>
<class 'polars.series.series.Series'>


In [13]:
model.fit(Xt, yt)

In [14]:
base_importances = model.feature_importances_
tier_thresholds = np.percentile(base_importances, [33, 66])
tier_thresholds

array([0.18329003, 0.20593133])

In [15]:
# Stratify features into importance tiers
all_features = sorted(X.columns)
tiers = {
    'high': [f for f, imp in zip(all_features, base_importances) 
            if imp >= tier_thresholds[1]],
    'medium': [f for f, imp in zip(all_features, base_importances) 
            if tier_thresholds[0] <= imp < tier_thresholds[1]],
    'low': [f for f, imp in zip(all_features, base_importances) 
            if imp < tier_thresholds[0]]
}
tiers['high']

['age', 'ca']

In [17]:
dict(zip(all_features, base_importances))

{'age': 0.34969239500850263,
 'ca': 0.21793527042737812,
 'chol': 0.18459100264434322,
 'cp': 0.06510352448220108,
 'exang': 0.18267780743757508}