In [3]:
import sys
# Extend the module search path with the sibling ../src directory so that
# project-local modules can be imported by later cells (presumably this is
# where feat_selection_wrapper lives — confirm). The path is relative, so it
# is resolved against the kernel's current working directory.
sys.path.append("../src")

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

In [5]:
# Load a regression data set (California housing) and unpack the returned
# bundle into the feature matrix, the target vector and the feature names.
data = fetch_california_housing()
X, y = data["data"], data["target"]
col_names = data["feature_names"]

In [6]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(20640, 8)

In [7]:
# Feature names taken from the data set bundle; used below as column labels for X.
col_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [8]:
# Peek at the first five target values.
y[:5]

array([4.526, 3.585, 3.521, 3.413, 3.422])

In [9]:
# Wrap the raw feature matrix in a labelled DataFrame and, in the same chain,
# append a deliberately redundant feature: the square root of MedInc, which
# is highly correlated with MedInc itself.
df = pd.DataFrame(X, columns=col_names).assign(
    MedInc_Sqrt=lambda frame: np.sqrt(frame["MedInc"])
)

# Pairwise Pearson correlation matrix over all columns (displayed as cell output).
df.corr()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_Sqrt
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,0.018415
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,0.015266
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.084303
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.015569
MedInc_Sqrt,0.984329,-0.132797,0.326688,-0.06691,0.018415,0.015266,-0.084303,-0.015569,1.0


## Testing the univariate feature selection wrapper

In [10]:
from feat_selection_wrapper import UnivariateFeatureSelection

# Configure the project-local selector wrapper for a regression problem scored
# with "f_regression".
# NOTE(review): n_features=0.1 is presumably interpreted as a fraction of the
# features to keep (the transformed output below has 1 of the 8 columns) —
# confirm against the wrapper's implementation.
ufs = UnivariateFeatureSelection(
    n_features=0.1, problem_type="regression", scoring="f_regression"
)

# Fit the selector on the full data, then reduce X to the selected column(s).
ufs.fit(X, y)
X_transformed = ufs.transform(X)

In [12]:
# Shape of the selector's output; compare with X.shape to see how many columns were kept.
X_transformed.shape

(20640, 1)