In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from featurewiz_polars import Sulov_MRMR, Polars_DateTimeTransformer, Polars_CategoricalEncoder
from featurewiz_polars import Polars_MissingTransformer, YTransformer, Polars_ColumnEncoder
import polars as pl
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import time
from sklearn.pipeline import Pipeline
import pdb
import copy
from featurewiz_polars import print_classification_metrics, print_regression_metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

Imported featurewiz_polars 0.1.1. Use the following syntax:
 >> from featurewiz_polars import Featurewiz_MRMR, Featurewiz_MRMR_Model
 >> wiz = Featurewiz_MRMR(model_type='Classification')
 >> X_transformed, y_transformed = wiz.fit_transform(X_train, y_train)
 >> X_test_transformed = wiz.transform(X_test)
 >> print(wiz.selected_features)
    


In [2]:
# Input location: CSV file name and the directory (relative to this notebook)
# that holds it. The two are joined by plain string concatenation downstream,
# so `datapath` must keep its trailing slash.
filename = "ames_train.csv"
datapath = "../../../documents/ram/data_sets/"

In [3]:
# Load the raw CSV with Polars. The strings 'NULL' and 'NA' are read as missing
# values, and rows/values that fail to parse are tolerated (ignore_errors=True).
df = pl.read_csv(
    datapath + filename,
    null_values=['NULL', 'NA'],
    try_parse_dates=True,
    infer_schema_length=10000,
    ignore_errors=True,
)  # chain .sample(1000) here for a quick smoke run
print('Loaded data...', df.shape)

target = 'SalePrice' # Replace with your target column name
model_type = 'Regression'
if target not in df.columns:
    # Raise instead of print + exit(): inside a Jupyter kernel exit() shuts the
    # kernel down, whereas an exception stops the run and keeps state inspectable.
    raise ValueError(f"Error: Target column '{target}' not found in the CSV file.")

# Every column except the target is a candidate predictor.
predictors = [x for x in df.columns if x != target]
X = df[predictors]
y = df[target]
print('Data dimensions (rows x cols) = %d dims' % (X.shape[0] * X.shape[1]))

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.dtype

Loaded data... (1460, 81)
Data dimensions (rows x cols) = 116800 dims


Int64

In [4]:
# Normalise the task type once so every branch below agrees on it. Previously
# the pipeline branch compared case-insensitively while the model branch
# compared to the exact string 'Regression', so e.g. 'regression' produced a
# regression pipeline paired with a classifier.
_is_classification = model_type.lower() == 'classification'

# Preprocessing steps shared by both task types.
_x_steps = [
    ('datetime_transformer', Polars_DateTimeTransformer(datetime_features=[])), # Specify your datetime columns
    ('cat_transformer', Polars_CategoricalEncoder(encoding_type='ordinal', categorical_features='auto')),
    ('nan_transformer', Polars_MissingTransformer(strategy="median")),
]
# You need to transform y in case of classification problems ##
if _is_classification:
    _x_steps.append(('ytransformer', YTransformer()))
X_pipeline = Pipeline(_x_steps)

# Feature-selection stage (SULOV + MRMR) applied after preprocessing.
Y_pipeline = Pipeline([
    ('featurewiz', Sulov_MRMR(model_type=model_type, corr_threshold=0.7, verbose=0)),
    ])

# Full chain: preprocessing followed by feature selection.
feature_selection = Pipeline([
        ('X_pipeline', X_pipeline),
        ('Y_pipeline', Y_pipeline)
    ])

# Downstream estimator used to evaluate the selected features.
if _is_classification:
    model = RandomForestClassifier(n_estimators=100, random_state=99)
else:
    model = RandomForestRegressor(n_estimators=100, random_state=99)

In [5]:
class Classx(TransformerMixin): # Class name
    """Wrap the notebook's feature-selection pipeline behind fit/transform.

    The wrapper delegates X handling to the module-level ``feature_selection``
    pipeline and, for classification tasks only, encodes ``y`` with a
    ``Polars_ColumnEncoder``.

    Parameters
    ----------
    model : estimator
        Stored on the instance; not used by any method of this class.
    model_type : str
        Task type; lower-cased on storage. ``'classification'`` enables
        target encoding, any other value leaves ``y`` untouched.
    encoding_type, imputation_strategy : str
        Lower-cased and stored; not used by any method of this class.
    corr_threshold : float
        Stored; not used by any method of this class.
    verbose : int
        Stored; not used by any method of this class.

    Notes
    -----
    The defaults for ``model`` and ``model_type`` are captured from the
    notebook globals when the class statement runs, and every instance shares
    the same global ``feature_selection`` pipeline object (fitting one instance
    refits the pipeline for all of them).
    """

    def __init__(self, model=model,
            model_type=model_type, encoding_type='target',
            imputation_strategy='mean', corr_threshold = 0.7,
            verbose = 0):
        self.model = model
        self.model_type = model_type.lower()
        self.encoding_type = encoding_type.lower()
        self.imputation_strategy = imputation_strategy.lower()
        self.corr_threshold = corr_threshold
        # Previously accepted but silently dropped.
        self.verbose = verbose
        self.feature_selection = feature_selection
        self.y_encoder = Polars_ColumnEncoder()

    def _is_classification(self):
        """Return True when this instance was configured for classification."""
        return self.model_type == 'classification'

    def _encode_target(self, y):
        """Encode ``y`` for classification; return it unchanged otherwise."""
        if self._is_classification():
            return self.y_encoder.transform(y)
        return y

    def fit(self, X, y):
        """Fit the feature-selection pipeline and, for classification, the y encoder.

        Returns ``self`` so calls can be chained.
        """
        self.feature_selection.fit(X, y)
        # The encoder is only ever applied for classification targets, so
        # there is nothing to learn from a regression target.
        if self._is_classification():
            self.y_encoder.fit(y)
        return self

    def transform(self, X, y=None):
        """Transform ``X``; when ``y`` is given, return ``(Xt, yt)`` instead.

        Branches on the instance's own ``model_type`` rather than the notebook
        global, so an explicitly passed ``model_type`` is honoured.
        """
        Xt = self.feature_selection.transform(X)
        if y is None:
            return Xt
        return Xt, self._encode_target(y)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed pair ``(Xt, yt)``."""
        return self.fit(X, y).transform(X, y)
testp = Classx()

# Then with the featurewiz Polars edition

In [6]:
# Learn the feature selection on the training split only.
testp.fit(X_train, y_train)

# Training split: transformed features together with the (possibly encoded) target.
Xt, yt = testp.transform(X_train, y_train)
# Test split: features only, using what was learned on the training split.
Xtt = testp.transform(X_test)

# Show which container types the wrapper hands back.
for transformed in (Xt, yt):
    print(type(transformed))

Model type: Regression
SULOV selected Features (78): ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr', 'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig', 'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType

# Model trained on the Polars-selected feature set

In [7]:
# Train on an independent copy so the template `model` object stays unfitted;
# scikit-learn's fit() returns the estimator itself, which allows the chaining.
modely = copy.deepcopy(model).fit(Xt, yt)

# Predictions for the held-out split, using the selected features only.
y_predy = modely.predict(Xtt)

# Performance of the featurewiz Polars feature set

In [8]:
# Choose the report that matches the task type.
if model_type.lower() == 'classification':
    _report_metrics = print_classification_metrics
else:
    _report_metrics = print_regression_metrics

# Series.to_numpy() already yields a 1-D array, so the former
# .to_pandas().ravel() detour (pandas deprecates Series.ravel and warns) is
# no longer needed.
_report_metrics(y_test.to_numpy(), y_predy, verbose=1)

    RMSE = 28089.508
    Norm RMSE = 32%
    MAE = 17550.220
    WAPE = 10%, Bias = 0.6%
    MAPE = 1079%
    R-Squared = 90%


  print_regression_metrics(y_test.to_pandas().ravel(), y_predy, verbose=1)


In [9]:
# NOTE(review): `disto` is not defined in any cell visible above, so evaluating
# it raises NameError (see the recorded output) and halts a "Run All" here.
# Presumably a deliberate stop marker before the experimental cells below;
# confirm, and replace with an explicit marker or delete the cell.
disto

NameError: name 'disto' is not defined

In [None]:
import polars as pl
import numpy as np
from itertools import combinations
from scipy.stats import chi2_contingency
from collections import defaultdict
from polars import selectors as cs
from typing import List, Dict
import pdb
class FeatureSelector:
    """Pairwise feature-association helper for redundancy pruning.

    Produces a long-format frame with one row per feature pair and the columns
    ``feature_a``, ``feature_b`` and ``correlation`` (absolute Pearson
    correlation for numeric/numeric pairs, Cramér's V for every other pair),
    and decides which members of strongly associated pairs should be dropped
    based on per-feature scores.

    Parameters
    ----------
    corr_threshold : float, default 0.7
        Minimum association a pair needs before ``_adaptive_removal`` will
        consider dropping one of its members.
    mis_ratio_threshold : float, default 0.7
        A feature is a removal candidate only if its score is below this
        fraction of its partner's score.
    mis_floor_fraction : float, default 0.5
        A feature is a removal candidate only if its score is also below this
        fraction of the best score overall.
    """

    # Explicit dtypes so that empty pair frames can still be concatenated
    # with populated ones.
    _PAIR_SCHEMA = {
        "feature_a": pl.Utf8,
        "feature_b": pl.Utf8,
        "correlation": pl.Float64,
    }
    # Name of the helper column used while counting category co-occurrences.
    _COUNT_COL = "__pair_count__"

    def __init__(self, corr_threshold=0.7, mis_ratio_threshold=0.7,
                 mis_floor_fraction=0.5):
        self.corr_threshold = corr_threshold
        self.mis_ratio_threshold = mis_ratio_threshold
        self.mis_floor_fraction = mis_floor_fraction

    def _empty_pairs(self) -> pl.DataFrame:
        """Return a zero-row pair frame with the standard typed schema."""
        return pl.DataFrame(schema=self._PAIR_SCHEMA)

    def _calculate_correlations(self, X: pl.DataFrame, features: List[str],
                               numeric_cols: List[str], cat_cols: List[str]) -> pl.DataFrame:
        """Return all pairwise associations as one long-format frame.

        Numeric/numeric pairs come from ``_get_numeric_correlations`` (over
        ``numeric_cols``); every other pair of ``features`` comes from
        ``_get_categorical_correlations``. The two frames are stacked
        vertically.
        """
        numeric_corrs = self._get_numeric_correlations(X, numeric_cols)
        cat_corrs = self._get_categorical_correlations(X, features, numeric_cols, cat_cols)
        return pl.concat([numeric_corrs, cat_corrs])

    def _get_numeric_correlations(self, X: pl.DataFrame, numeric_cols: List[str]) -> pl.DataFrame:
        """Absolute pairwise correlation of ``numeric_cols``, in long format.

        Both orientations of every pair are returned, i.e. ``(a, b)`` and
        ``(b, a)``; self-pairs are excluded.
        """
        if not numeric_cols:
            return self._empty_pairs()

        return (
            X.select(numeric_cols)
            .corr()
            .pipe(self._matrix_to_pairs)
            # Cast so the dtype always matches the shared pair schema.
            .with_columns(correlation=pl.col("correlation").cast(pl.Float64).abs())
        )

    def _get_categorical_correlations(self, X: pl.DataFrame, features: List[str],
                                     numeric_cols: List[str], cat_cols: List[str]) -> pl.DataFrame:
        """Cramér's V for every pair of ``features`` that is not numeric/numeric.

        Features with fewer than two distinct values are skipped entirely.
        Pairs are emitted once, with ``feature_a`` sorting before
        ``feature_b``. ``cat_cols`` is accepted for interface compatibility
        but not consulted.
        """
        numeric = set(numeric_cols)
        # Distinct counts are computed once per feature rather than once per pair.
        candidates = [f for f in sorted(set(features)) if X[f].n_unique() >= 2]
        n_rows = X.height

        pairs = []
        for f1, f2 in combinations(candidates, 2):
            # Numeric/numeric pairs are handled by _get_numeric_correlations.
            if f1 in numeric and f2 in numeric:
                continue

            # Contingency table of co-occurrence counts. Only the two columns
            # of interest plus a single helper value column are pivoted;
            # without an explicit `values` column pivot would spread every
            # remaining column of X into the table.
            table = (
                X.select(pl.col(f1), pl.col(f2), pl.lit(1).alias(self._COUNT_COL))
                .pivot(on=f2, index=f1, values=self._COUNT_COL, aggregate_function="len")
                .drop(f1)
                .fill_null(0)
                .to_numpy()
            )

            # Element 0 of the result is the chi-square statistic (element 3
            # is the array of expected frequencies).
            chi2 = chi2_contingency(table)[0]
            phi2 = chi2 / n_rows
            # Shape is taken from the pure count table, so the index column
            # is not counted as a category.
            n_r, n_c = table.shape
            cramers_v = np.sqrt(phi2 / min(n_r - 1, n_c - 1))

            pairs.append((f1, f2, float(abs(cramers_v))))

        if not pairs:
            return self._empty_pairs()
        return pl.DataFrame(pairs, schema=self._PAIR_SCHEMA, orient="row")

    def _adaptive_removal(self, corr_matrix: pl.DataFrame, mis_scores: Dict[str, float]) -> set:
        """Return the set of features to drop from strongly associated pairs.

        For each row of ``corr_matrix`` whose ``correlation`` is at least
        ``corr_threshold``, a member of the pair is marked for removal when
        its score is below ``mis_ratio_threshold`` times its partner's score
        AND below ``mis_floor_fraction`` times the highest score in
        ``mis_scores``. Pairs that mention a feature without a score are
        ignored.

        Scores are assumed to be non-negative (so the ratio test can be
        written as a multiplication, which also avoids dividing by zero).
        """
        if not mis_scores:
            return set()

        floor = self.mis_floor_fraction * max(mis_scores.values())
        # Rows with a null correlation are dropped by the filter.
        strong_pairs = corr_matrix.filter(pl.col("correlation") >= self.corr_threshold)

        to_remove = set()
        for row in strong_pairs.iter_rows(named=True):
            corr = row["correlation"]
            if corr != corr:  # NaN association: nothing can be concluded
                continue
            a, b = row["feature_a"], row["feature_b"]
            mis_a, mis_b = mis_scores.get(a), mis_scores.get(b)
            if mis_a is None or mis_b is None:
                continue
            if mis_a < self.mis_ratio_threshold * mis_b and mis_a < floor:
                to_remove.add(a)
            if mis_b < self.mis_ratio_threshold * mis_a and mis_b < floor:
                to_remove.add(b)
        return to_remove

    def _matrix_to_pairs(self, corr_matrix: pl.DataFrame) -> pl.DataFrame:
        """Reshape a square correlation matrix into long-format pair rows.

        The matrix is expected to have one column per feature, with rows in
        the same order as the columns. Self-pairs are dropped.
        """
        return (
            corr_matrix
            # Label each row with the feature it belongs to.
            .with_columns(feature_a=pl.Series(corr_matrix.columns))
            .unpivot(index="feature_a", variable_name="feature_b", value_name="correlation")
            .filter(pl.col("feature_a") != pl.col("feature_b"))
            .unique(maintain_order=True)
            .select("feature_a", "feature_b", "correlation")
        )

In [None]:
import polars as pl
import numpy as np
from scipy.stats import chi2_contingency

# Test Data Generation
def generate_test_data(n=1000) -> pl.DataFrame:
    """Build a deterministic synthetic frame with known feature relationships.

    Columns:
      * ``num1``, ``num2`` - the same standard-normal signal plus small
        independent noise (highly correlated with each other);
      * ``num3`` - independent uniform noise;
      * ``cat1`` - labels A/B/C drawn with probabilities 0.4/0.4/0.2;
      * ``cat2`` - a one-to-one relabelling of ``cat1`` (A->X, B->Y, C->Z);
      * ``cat3`` - a single constant value.

    Seeds NumPy's global random state with 42, so repeated calls return the
    same data.
    """
    np.random.seed(42)

    # The random draws happen in a fixed order so the seeded output is stable.
    shared_signal = np.random.normal(0, 1, n)
    columns = {
        "num1": shared_signal + np.random.normal(0, 0.1, n),
        "num2": shared_signal + np.random.normal(0, 0.1, n),
        "num3": np.random.uniform(0, 1, n),
    }

    labels = np.random.choice(["A", "B", "C"], n, p=[0.4, 0.4, 0.2])
    columns["cat1"] = labels
    # Deterministic relabelling: cat2 carries exactly the information of cat1.
    columns["cat2"] = np.select([labels == "A", labels == "B"], ["X", "Y"], default="Z")
    # Single-valued column (should be filtered out as low cardinality).
    columns["cat3"] = np.full(n, "Constant")

    return pl.DataFrame(columns)


# Test Case 1: Basic Functionality
def test_correlation_calculation():
    """Check that pair associations on the synthetic data come out as designed."""
    numeric_cols = ["num1", "num2", "num3"]
    cat_cols = ["cat1", "cat2", "cat3"]

    selector = FeatureSelector(corr_threshold=0.6)
    corr_matrix = selector._calculate_correlations(
        X=generate_test_data(),
        features=numeric_cols + cat_cols,
        numeric_cols=numeric_cols,
        cat_cols=cat_cols,
    )

    # (feature_a, feature_b) tuples of every pair at or above the threshold.
    strong_pairs = (
        corr_matrix
        .filter(pl.col("correlation") >= selector.corr_threshold)
        .select(pl.col("feature_a", "feature_b"))
        .rows()
    )

    # num1 and num2 share the same underlying signal.
    assert ("num1", "num2") in strong_pairs, "Missing numeric correlation"

    # cat2 is a relabelling of cat1.
    assert ("cat1", "cat2") in strong_pairs, "Missing categorical association"

    # The constant column must never be reported as the first member of a pair.
    assert "cat3" not in corr_matrix["feature_a"].to_list(), \
        "Low cardinality feature not filtered"

    print("Correlation calculation tests passed!")


# Run tests
test_correlation_calculation()

In [None]:
# Test Case 2: Adaptive Removal
def test_adaptive_removal():
    """Check which members of associated pairs get marked for removal.

    The selector drops a feature when its score is below 0.7x its partner's
    score and below 0.5x the best score overall. The mock scores are chosen so
    that num1 and cat1 meet both cut-offs (the previous values 0.8 and 0.7 met
    neither, so the assertions could not hold).
    """
    selector = FeatureSelector()

    # Mock MIS scores (num2 > num1, cat2 > cat1). Best score is 1.2, so the
    # absolute cut-off is 0.6.
    mis_scores = {
        "num1": 0.5,   # < 0.7 * 1.2 and < 0.6
        "num2": 1.2,
        "num3": 0.3,   # also weak relative to num1, but only weakly correlated with it
        "cat1": 0.4,   # < 0.7 * 1.0 and < 0.6
        "cat2": 1.0
    }

    # Rows are tuples, so the orientation is stated explicitly.
    corr_matrix = pl.DataFrame([
        ("num1", "num2", 0.85),
        ("cat1", "cat2", 0.72),
        ("num1", "num3", 0.15)
    ], schema=["feature_a", "feature_b", "correlation"], orient="row")

    to_remove = selector._adaptive_removal(corr_matrix, mis_scores)

    # num1 should be removed (correlated with better num2)
    assert "num1" in to_remove, "num1 not marked for removal"

    # cat1 should be removed (correlated with better cat2)
    assert "cat1" in to_remove, "cat1 not marked for removal"

    # num3 should NOT be removed (low correlation)
    assert "num3" not in to_remove, "num3 incorrectly marked"

    # The stronger member of each pair must survive.
    assert "num2" not in to_remove, "num2 incorrectly marked"
    assert "cat2" not in to_remove, "cat2 incorrectly marked"

    print("Adaptive removal tests passed!")

test_adaptive_removal()