[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AutoViML/featurewiz_polars/blob/main/examples/fw_polars_vs_featurewiz_test.ipynb)

In [None]:
#!pip install featurewiz --ignore-installed

In [1]:
from featurewiz import FeatureWiz

Imported lazytransform v1.18. 

Imported featurewiz 0.6.1. Use the following syntax:
    >>> wiz = FeatureWiz(feature_engg = '', nrows=None, transform_target=True,
        		category_encoders="auto", auto_encoders='VAE', ae_options={},
        		add_missing=False, imbalanced=False, verbose=0)
    >>> X_train_selected, y_train = wiz.fit_transform(X_train, y_train)
    >>> X_test_selected = wiz.transform(X_test)
    >>> selected_features = wiz.features
    


In [2]:
# Classic (pandas-based) featurewiz selector, configured with a correlation
# limit of 0.70, no feature engineering and no auto-encoder.
wiz = FeatureWiz(
    feature_engg='',
    nrows=None,
    transform_target=True,
    corr_limit=0.70,
    category_encoders="auto",
    auto_encoders='',
    ae_options={},
    add_missing=False,
    imbalanced=False,
    verbose=0,
)

featurewiz is given 0.7 as correlation limit...
    Skipping feature engineering since no feature_engg input...
    final list of category encoders given: ['onehot', 'label']
    final list of scalers given: []


In [3]:
import copy
import time

import numpy as np
import pandas as pd
import polars as pl
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
#!pip install git+https://github.com/AutoViML/featurewiz_polars.git

In [4]:
from featurewiz_polars import FeatureWiz, polars_train_test_split, print_classification_metrics, print_regression_metrics

Imported featurewiz_polars 0.3.0. Use the following syntax:
 >> from featurewiz_polars import FeatureWiz, FeatureWiz_Model
    


# Load the CSV file into a Polars DataFrame

In [6]:
# Input location: the CSV file name and the directory that holds it.
# The two strings are concatenated as-is when the file is read, so the
# directory keeps its trailing slash.
filename = "baby_weight.csv"
datapath = "../data/"

In [7]:
# Read the CSV into a Polars DataFrame; the strings 'NULL' and 'NA' are read
# as missing values. Append .sample(1000) to the call for a quick trial run.
df = pl.read_csv(
    datapath + filename,
    null_values=['NULL', 'NA'],
    try_parse_dates=True,
    infer_schema_length=10000,
    ignore_errors=True,
)
print('Loaded data...', df.shape)

target = 'bwt'  # Replace with your target column name
model_type = 'Regression'

# Raise instead of calling exit(): inside a notebook exit() asks the kernel to
# shut down rather than stopping this cell with a readable error.
if target not in df.columns:
    raise ValueError(f"Error: Target column '{target}' not found in the CSV file.")

# Every column except the target is a predictor.
predictors = [x for x in df.columns if x != target]
X = df[predictors]
y = df[target]
print('Data dimensions (rows x cols) = %d dims' %(int(X.shape[0]*X.shape[1])))

Loaded data... (1236, 8)
Data dimensions (rows x cols) = 8652 dims


In [8]:
### Split with the Polars-aware helper shipped with featurewiz_polars instead of
### scikit-learn's train_test_split, because the featurewiz_polars selector
### below is fitted on Polars objects rather than pandas ones.
X_train, X_test, y_train, y_test = polars_train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Cell output: dtype of the training target.
y_train.dtype

Int64

# Setup Featurewiz-Polars

In [11]:
# Polars-edition selector. NOTE: `FeatureWiz` here is the featurewiz_polars
# class; its import shadows the classic featurewiz class of the same name,
# which was already instantiated as `wiz` earlier in the notebook.
mrmr = FeatureWiz(model_type=model_type, corr_limit=0.7,
        verbose=0, category_encoders='ordinal', imputation_strategy="median")

# Pick the downstream estimator used to score both feature sets. The check is
# case-insensitive so it agrees with the metric cells, which branch on
# model_type.lower(); anything other than regression gets a classifier.
if model_type.lower() == 'regression':
    model = RandomForestRegressor(n_estimators=100, random_state=99)
else:
    model = RandomForestClassifier(n_estimators=100, random_state=99)

# First run with featurewiz classic edition

In [12]:
# Classic featurewiz is fed pandas objects, so convert the Polars split first.
X_train_pd = X_train.to_pandas()
# This cell rebinds y_train to whatever fit_transform returns, so on a re-run
# y_train may no longer be a Polars Series; convert only when it still is one.
y_train_pd = y_train.to_pandas() if isinstance(y_train, pl.Series) else y_train
X_train_selected, y_train = wiz.fit_transform(X_train_pd, y_train_pd)
X_test_selected = wiz.transform(X_test.to_pandas())
selected_features = wiz.features
print('featurewiz selected features: ', selected_features)

Loaded input data. Shape = (989, 7)
#### Starting featurewiz transform for train data ####
    Regression models don't need targets to be transformed to numeric...
    Single_Label Regression problem 
Shape of dataset: (989, 7). Now we classify variables into different types...
Time taken to define data pipeline = 1 second(s)
No model input given...
Lazy Transformer Pipeline created...
    Time taken to fit dataset = 1 second(s)
    Time taken to transform dataset = 1 second(s)
    Shape of transformed dataset: (989, 7)
    Single_Label Regression problem 
Starting SULOV with 7 features...
Completed SULOV. All 7 features selected
Performing recursive XGBoost feature selection from 7 features...
    time taken to run entire featurewiz = 1 second(s)
Recursive XGBoost selected 4 features...
#### Starting featurewiz transform for test data ####
Loaded input data. Shape = (247, 7)
#### Starting lazytransform for test data ####
    Time taken to transform dataset = 1 second(s)
    Shape of t

# Then run with featurewiz polars edition

In [13]:
# Fit the Polars selector on the training split, then transform the training
# data (together with its target) and the test data with the fitted selector.
mrmr.fit(X_train, y_train)
Xt, yt = mrmr.transform(X_train, y_train)
Xtt = mrmr.transform(X_test)
# Report the container types returned for the transformed features and target.
for transformed in (Xt, yt):
    print(type(transformed))


Featurewiz Polars started. Model type: Regression
SULOV selected Features (7): ['age', 'case', 'gestation', 'height', 'parity', 'smoke', 'weight']

--- Run 1 started ---

--- Run 2 started ---

--- Run 3 started ---

Recursive XGBoost selected Features (2): ['smoke', 'gestation']

Featurewiz-Polars feature selection with XGBoost estimator completed.
Time taken  = 2.1 seconds
<class 'polars.dataframe.frame.DataFrame'>
<class 'pandas.core.series.Series'>


# Train the same model on the two different feature sets

In [14]:
# Model trained on the classic featurewiz feature set. A deep copy keeps the
# shared `model` template unfitted so the other feature set starts from the
# same configuration. (`copy` is imported with the other imports.)
modelx = copy.deepcopy(model)
modelx.fit(X_train_selected, y_train)
y_predx = modelx.predict(X_test_selected)

In [15]:
# Model trained on the featurewiz_polars feature set: copy the shared `model`
# template and fit the copy in one step (fit returns the fitted estimator).
modely = copy.deepcopy(model).fit(Xt, yt)
y_predy = modely.predict(Xtt)

# Performance of the classic featurewiz feature set

In [16]:
# Ground-truth test targets as a 1-D NumPy array. Converting the Polars Series
# directly avoids the pandas round-trip and Series.ravel(), which is
# deprecated in recent pandas releases.
y_true = y_test.to_numpy()
if model_type.lower() == 'classification':
    print_classification_metrics(y_true, y_predx, verbose=1)
else:
    print_regression_metrics(y_true, y_predx, verbose=1)

    RMSE = 18.097
    Norm RMSE = 91%
    MAE = 14.483
    WAPE = 12%, Bias = -0.0%
    MAPE = 1291%
	Hint: high MAPE: try np.log(y) instead of (y).
    R-Squared = 17%


# Performance of the featurewiz-polars feature set

In [17]:
# Ground-truth test targets as a 1-D NumPy array. Converting the Polars Series
# directly avoids the pandas round-trip and Series.ravel(), which is
# deprecated in recent pandas releases.
y_true = y_test.to_numpy()
if model_type.lower() == 'classification':
    print_classification_metrics(y_true, y_predy, verbose=1)
else:
    print_regression_metrics(y_true, y_predy, verbose=1)

    RMSE = 18.198
    Norm RMSE = 91%
    MAE = 14.219
    WAPE = 12%, Bias = -1.6%
    MAPE = 1292%
	Hint: high MAPE: try np.log(y) instead of (y).
    R-Squared = 16%


# In this specific dataset, Featurewiz-Polars nearly matches classic featurewiz on RMSE (18.198 vs. 18.097) while selecting fewer features (2 vs. 4)