[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AutoViML/featurewiz_polars/blob/main/examples/fw_polars_vs_featurewiz_test.ipynb)

In [None]:
# Uncomment the next line to install classic featurewiz (e.g. on Colab):
# %pip install featurewiz --ignore-installed

In [2]:
from featurewiz import FeatureWiz

Imported lazytransform v1.18. 

Imported featurewiz 0.6.1. Use the following syntax:
    >>> wiz = FeatureWiz(feature_engg = '', nrows=None, transform_target=True,
        		category_encoders="auto", auto_encoders='VAE', ae_options={},
        		add_missing=False, imbalanced=False, verbose=0)
    >>> X_train_selected, y_train = wiz.fit_transform(X_train, y_train)
    >>> X_test_selected = wiz.transform(X_test)
    >>> selected_features = wiz.features
    


In [3]:
# Options for the classic featurewiz selector, spelled out one per line.
# feature_engg and auto_encoders are left empty.
fw_options = dict(
    feature_engg='',
    nrows=None,
    transform_target=True,
    category_encoders="auto",
    auto_encoders='',
    ae_options={},
    add_missing=False,
    imbalanced=False,
    verbose=0,
)
wiz = FeatureWiz(**fw_options)

featurewiz is given 0.9 as correlation limit...
    Skipping feature engineering since no feature_engg input...
    final list of category encoders given: ['onehot', 'label']
    final list of scalers given: []


In [4]:
import polars as pl
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import time
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [5]:
# Uncomment the next line to install featurewiz_polars from GitHub (e.g. on Colab):
# %pip install git+https://github.com/AutoViML/featurewiz_polars.git

In [6]:
from featurewiz_polars import Featurewiz_MRMR

Imported featurewiz_polars 0.1.4. Use the following syntax:
 >> from featurewiz_polars import Featurewiz_MRMR, Featurewiz_MRMR_Model
 >> wiz = Featurewiz_MRMR(model_type='Classification')
 >> X_transformed, y_transformed = wiz.fit_transform(X_train, y_train)
 >> X_test_transformed = wiz.transform(X_test)
 >> print(wiz.selected_features)
    


# Load the file into a Polars DataFrame

In [7]:
import os

# Directory holding the input CSV. The default is the original local path;
# set the FEATUREWIZ_DATA_DIR environment variable to point elsewhere
# (e.g. on Colab) without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because the path is
# later concatenated directly with `filename`.
datapath = os.path.join(
    os.environ.get("FEATUREWIZ_DATA_DIR", "../../../documents/ram/data_sets/"), ""
)
filename = "ames_train.csv"

In [8]:
# Read the CSV into a Polars DataFrame; 'NULL' and 'NA' are treated as missing.
# Append .sample(1000) to the read_csv call for a quick run on a subset.
df = pl.read_csv(
    datapath + filename,
    null_values=['NULL', 'NA'],
    try_parse_dates=True,
    infer_schema_length=10000,
    ignore_errors=True,
)
print('Loaded data...', df.shape)
target = 'SalePrice' # Replace with your target column name
model_type = 'Regression'
if target not in df.columns:
    # Raise instead of calling exit(): in a notebook exit() shuts down the
    # kernel, whereas an exception stops this cell and shows the message.
    raise ValueError(f"Error: Target column '{target}' not found in the CSV file.")
# Every column except the target is a candidate predictor.
predictors = [x for x in df.columns if x != target]
X = df[predictors]
y = df[target]
print('Data dimensions (rows x cols) = %d dims' % (int(X.shape[0] * X.shape[1])))
# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train.dtype

Loaded data... (1460, 81)
Data dimensions (rows x cols) = 116800 dims


Int64

# Setup Featurewiz-Polars

In [9]:
# Polars-based feature selector, configured for the problem type chosen above.
mrmr = Featurewiz_MRMR(model_type=model_type, corr_threshold=0.7,
        verbose=0, encoding_type='ordinal', imputation_strategy="median")

# Base estimator used to compare the two feature sets. The comparison is
# case-insensitive so it agrees with the metrics cells, which test
# model_type.lower().
if model_type.lower() == 'regression':
    model = RandomForestRegressor(n_estimators=100, random_state=99)
else:
    model = RandomForestClassifier(n_estimators=100, random_state=99)

# First run with featurewiz classic edition

In [10]:
# Classic featurewiz works on pandas objects, so the Polars splits are converted.
# This cell rebinds y_train to the target returned by fit_transform, which is
# presumably no longer a Polars Series. Convert only while it still is one, so
# re-running the cell does not fail on a missing .to_pandas().
y_train_pd = y_train.to_pandas() if isinstance(y_train, pl.Series) else y_train
X_train_selected, y_train = wiz.fit_transform(X_train.to_pandas(), y_train_pd)
X_test_selected = wiz.transform(X_test.to_pandas())
selected_features = wiz.features
print('featurewiz selected features: ', selected_features)

Loaded input data. Shape = (1168, 80)
#### Starting featurewiz transform for train data ####
    Regression models don't need targets to be transformed to numeric...
    Single_Label Regression problem 
Shape of dataset: (1168, 80). Now we classify variables into different types...
Time taken to define data pipeline = 1 second(s)
No model input given...
Lazy Transformer Pipeline created...
    Time taken to fit dataset = 2 second(s)
    Time taken to transform dataset = 2 second(s)
    Shape of transformed dataset: (1168, 303)
    Single_Label Regression problem 
Starting SULOV with 287 features...
    there are no null values in dataset...
    there are no null values in target column...
Completed SULOV. 274 features selected
Performing recursive XGBoost feature selection from 274 features...
    time taken to run entire featurewiz = 7 second(s)
Recursive XGBoost selected 94 features...
#### Starting featurewiz transform for test data ####
Loaded input data. Shape = (292, 80)
#### Sta

# Then run with featurewiz polars edition

In [11]:
# Fit the Polars-based selector on the training split, then apply it to both splits.
mrmr.fit(X_train, y_train)
Xt, yt = mrmr.transform(X_train, y_train)
Xtt = mrmr.transform(X_test)
# Show the container types returned for the transformed features and target.
for transformed in (Xt, yt):
    print(type(transformed))


Featurewiz Polars started. Model type: Regression
SULOV selected Features (78): ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation', 'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr', 'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig', 'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning', 'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold', 'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'PavedDrive', 'PoolArea', 'RoofMatl', 'RoofSt

# Train two copies of the same model, one on each feature set

In [15]:
import copy

# Train an independent copy of the base estimator on the classic featurewiz
# feature set (fit() returns the fitted estimator), then predict on the test split.
modelx = copy.deepcopy(model).fit(X_train_selected, y_train)
y_predx = modelx.predict(X_test_selected)

In [16]:
# Train a second independent copy of the base estimator on the featurewiz-polars
# feature set (fit() returns the fitted estimator), then predict on the test split.
modely = copy.deepcopy(model).fit(Xt, yt)
y_predy = modely.predict(Xtt)

# Performance of the classic featurewiz feature set

In [17]:
from featurewiz import print_classification_metrics, print_regression_metrics

# Ground truth as a 1-D NumPy array. Series.ravel() is deprecated in
# pandas >= 2.2; to_numpy() is the documented replacement.
y_true = y_test.to_pandas().to_numpy()
# Score the model trained on the classic featurewiz feature set.
if model_type.lower() == 'classification':
    print_classification_metrics(y_true, y_predx, verbose=1)
else:
    print_regression_metrics(y_true, y_predx, verbose=1)

    RMSE = 29515.600
    Norm RMSE = 34%
    MAE = 17635.649
    WAPE = 10%, Bias = 0.9%
    MAPE = 1065%
    R-Squared = 89%


# Performance of the featurewiz-polars feature set

In [18]:
# Ground truth as a 1-D NumPy array. Series.ravel() is deprecated in
# pandas >= 2.2; to_numpy() is the documented replacement.
y_true = y_test.to_pandas().to_numpy()
# Score the model trained on the featurewiz-polars feature set.
if model_type.lower() == 'classification':
    print_classification_metrics(y_true, y_predy, verbose=1)
else:
    print_regression_metrics(y_true, y_predy, verbose=1)

    RMSE = 28946.596
    Norm RMSE = 33%
    MAE = 17390.609
    WAPE = 10%, Bias = 0.6%
    MAPE = 1052%
    R-Squared = 89%


# On this specific dataset, Featurewiz-Polars narrowly beats classic featurewiz on RMSE (28,947 vs. 29,516) with fewer features