In [1]:
from sklearn import datasets, preprocessing, feature_selection
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, SelectFromModel, RFE
from sklearn.linear_model import LassoCV
from sklearn.tree import DecisionTreeRegressor
from itertools import compress

import pandas as pd
import numpy as np

# Dataset

In [42]:
# Synthetic classification dataset used by every selector below.
# shuffle=False keeps the columns in generation order (per the
# make_classification docs: informative, redundant, repeated, then noise),
# which is what lets the columns be labelled by group afterwards.
# random_state pins the draw so Restart & Run All reproduces the same data.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=5,
    n_redundant=5,
    n_repeated=5,
    shuffle=False,
    random_state=42,
)

In [3]:
# Five numbered labels (1..5) for each of the four feature groups, in the
# order informative -> redundant -> repeated -> useless.
columns = [
    f"{group}_{number}"
    for group in ("informative", "redundant", "repeated", "useless")
    for number in range(1, 6)
]
columns

['informative_1',
 'informative_2',
 'informative_3',
 'informative_4',
 'informative_5',
 'redundant_1',
 'redundant_2',
 'redundant_3',
 'redundant_4',
 'redundant_5',
 'repeated_1',
 'repeated_2',
 'repeated_3',
 'repeated_4',
 'repeated_5',
 'useless_1',
 'useless_2',
 'useless_3',
 'useless_4',
 'useless_5']

In [4]:
# Wrap the raw feature matrix in a DataFrame so columns can be referred to by name.
data = pd.DataFrame(
    data=X,
    columns=columns,
)

In [5]:
# Preview the labelled feature table (pandas truncates long frames in the display).
data

Unnamed: 0,informative_1,informative_2,informative_3,informative_4,informative_5,redundant_1,redundant_2,redundant_3,redundant_4,redundant_5,repeated_1,repeated_2,repeated_3,repeated_4,repeated_5,useless_1,useless_2,useless_3,useless_4,useless_5
0,0.802999,1.255175,-0.098105,0.399569,0.305318,0.935659,1.379703,-0.675126,0.583458,0.884465,-0.098105,0.399569,1.379703,0.935659,0.399569,0.448046,0.336377,-2.549690,-0.832616,0.517242
1,-0.185667,-0.218252,-1.375825,0.236369,2.331106,1.990546,-1.641326,1.581460,0.238155,1.982855,-1.375825,0.236369,-1.641326,1.990546,0.236369,0.148682,2.653195,0.696716,0.765764,-0.248576
2,0.898255,0.819221,-1.630470,2.097751,1.184311,2.043041,1.897965,0.504202,-0.369160,2.333359,-1.630470,2.097751,1.897965,2.043041,2.097751,-0.156380,0.383988,0.857657,-0.714347,0.335459
3,2.406300,1.468227,-1.406436,1.807410,0.478188,2.658498,3.308027,-1.468629,0.773034,2.682706,-1.406436,1.807410,3.308027,2.658498,1.807410,-0.061912,0.993351,0.275979,0.720344,1.099823
4,1.375543,0.037575,-0.694078,-0.729483,1.493594,2.416935,-1.089479,-0.471859,1.658212,1.969603,-0.694078,-0.729483,-1.089479,2.416935,-0.729483,0.074575,-0.254410,1.443872,0.665089,1.556708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-2.223432,0.764830,2.679562,0.063040,-0.240599,-2.628341,-1.054075,2.785288,-1.931665,-2.398925,2.679562,0.063040,-1.054075,-2.628341,0.063040,-0.675014,-0.519490,1.234113,0.118845,1.169173
996,-1.111459,-0.262279,0.332087,0.728942,0.924754,-0.287938,-1.230624,2.545504,-1.380895,-0.064331,0.332087,0.728942,-1.230624,-0.287938,0.728942,0.408324,-0.048969,-0.518667,-0.458633,0.724117
997,-0.198228,1.570894,-0.018970,1.601360,2.065496,1.454199,0.310800,2.446547,-0.802313,1.663108,-0.018970,1.601360,0.310800,1.454199,1.601360,-1.628022,1.715094,-0.920438,0.879215,-1.233535
998,-1.491248,1.199736,1.584191,1.238172,0.229313,-1.402390,0.335221,2.615298,-1.968784,-1.020948,1.584191,1.238172,0.335221,-1.402390,1.238172,-0.828398,-1.051151,-0.689622,0.317751,0.405725


# Feature Selection

## Removing attributes with low variance

In [6]:
# Filter method: drop features whose variance falls below a fixed threshold.
#
# The filter must see the data on its ORIGINAL scale. StandardScaler rescales
# every non-constant column to unit variance, so thresholding the standardized
# matrix at 0.25 could never remove a feature.
scaler = preprocessing.StandardScaler()
X_normalized = scaler.fit_transform(X)  # still computed, but not fed to the variance filter

# 0.25 = p * (1 - p) at p = 0.5, i.e. the Bernoulli variance formula.
selector_variance = VarianceThreshold(threshold=(.5 * (1 - .5)))
new_data = selector_variance.fit_transform(X)

# Show the surviving columns under their names (rich display instead of print).
data.iloc[:, selector_variance.get_support(indices=True)]

     informative_1  informative_2  informative_3  informative_4  \
0         0.802999       1.255175      -0.098105       0.399569   
1        -0.185667      -0.218252      -1.375825       0.236369   
2         0.898255       0.819221      -1.630470       2.097751   
3         2.406300       1.468227      -1.406436       1.807410   
4         1.375543       0.037575      -0.694078      -0.729483   
..             ...            ...            ...            ...   
995      -2.223432       0.764830       2.679562       0.063040   
996      -1.111459      -0.262279       0.332087       0.728942   
997      -0.198228       1.570894      -0.018970       1.601360   
998      -1.491248       1.199736       1.584191       1.238172   
999      -1.309098       3.356987       1.629234      -0.613950   

     informative_5  redundant_1  redundant_2  redundant_3  redundant_4  \
0         0.305318     0.935659     1.379703    -0.675126     0.583458   
1         2.331106     1.990546    -1.641326   

## Univariate F-test (SelectKBest)

In [43]:
# Univariate filter: score each feature on its own against the target and keep
# the five best-scoring ones.
# y holds class labels, so the ANOVA F-test (f_classif) is the matching scorer;
# f_regression assumes a continuous target. For a two-class target both yield
# the same F statistics, but f_classif stays correct with more classes.
selector = SelectKBest(score_func=feature_selection.f_classif, k=5)
new_data = selector.fit_transform(X, y)

# Show the selected columns under their names (rich display instead of print).
data.iloc[:, selector.get_support(indices=True)]

     informative_1  informative_3  redundant_2  redundant_5  repeated_1
0         0.802999      -0.098105     1.379703     0.884465   -0.098105
1        -0.185667      -1.375825    -1.641326     1.982855   -1.375825
2         0.898255      -1.630470     1.897965     2.333359   -1.630470
3         2.406300      -1.406436     3.308027     2.682706   -1.406436
4         1.375543      -0.694078    -1.089479     1.969603   -0.694078
..             ...            ...          ...          ...         ...
995      -2.223432       2.679562    -1.054075    -2.398925    2.679562
996      -1.111459       0.332087    -1.230624    -0.064331    0.332087
997      -0.198228      -0.018970     0.310800     1.663108   -0.018970
998      -1.491248       1.584191     0.335221    -1.020948    1.584191
999      -1.309098       1.629234     1.589034    -1.169031    1.629234

[1000 rows x 5 columns]


## Linear Regression and regularization

In [44]:
# Embedded method: rank features by the magnitude of their Lasso coefficients
# and keep at most five of them.
# NOTE(review): LassoCV is a regressor fitted on class labels here - confirm
# that treating the labels as a numeric target is intended.
clf = LassoCV()

# SelectFromModel fits its own clone of `clf` (exposed afterwards as
# `sfm.estimator_`), so a separate clf.fit() call is redundant.
# max_features=5 keeps only the five largest |coef|; threshold=0.01 additionally
# discards coefficients that are effectively zero. This replaces stepping the
# threshold up by 0.005 until <= 5 features remain, which could overshoot and
# return fewer than five when several coefficients fell inside the same step.
sfm = SelectFromModel(clf, threshold=0.01, max_features=5)
sfm.fit(X, y)

new_data = sfm.transform(X)
n_rows, n_features = new_data.shape

# Show the selected columns under their names (rich display instead of print).
data.iloc[:, sfm.get_support(indices=True)]

     informative_1  informative_2  informative_3  informative_4  informative_5
0         0.802999       1.255175      -0.098105       0.399569       0.305318
1        -0.185667      -0.218252      -1.375825       0.236369       2.331106
2         0.898255       0.819221      -1.630470       2.097751       1.184311
3         2.406300       1.468227      -1.406436       1.807410       0.478188
4         1.375543       0.037575      -0.694078      -0.729483       1.493594
..             ...            ...            ...            ...            ...
995      -2.223432       0.764830       2.679562       0.063040      -0.240599
996      -1.111459      -0.262279       0.332087       0.728942       0.924754
997      -0.198228       1.570894      -0.018970       1.601360       2.065496
998      -1.491248       1.199736       1.584191       1.238172       0.229313
999      -1.309098       3.356987       1.629234      -0.613950       0.074545

[1000 rows x 5 columns]


## RFE - Recursive feature elimination

In [45]:
# Wrapper method: repeatedly fit the tree and discard the least important
# feature (step=1) until five remain.
# random_state is fixed because the dataset contains exact duplicate columns:
# duplicates produce tied splits, which the tree breaks at random, so without
# a seed the selected set can change from run to run.
# NOTE(review): a regression tree is fitted on class labels here - confirm
# that this is intended rather than a DecisionTreeClassifier.
estimator = DecisionTreeRegressor(random_state=0)
selector_rfe = RFE(estimator, n_features_to_select=5, step=1)
new_data = selector_rfe.fit_transform(X, y)

# Show the selected columns under their names (rich display instead of print).
data.iloc[:, selector_rfe.get_support(indices=True)]

     informative_1  informative_2  informative_3  informative_4  redundant_1
0         0.802999       1.255175      -0.098105       0.399569     0.935659
1        -0.185667      -0.218252      -1.375825       0.236369     1.990546
2         0.898255       0.819221      -1.630470       2.097751     2.043041
3         2.406300       1.468227      -1.406436       1.807410     2.658498
4         1.375543       0.037575      -0.694078      -0.729483     2.416935
..             ...            ...            ...            ...          ...
995      -2.223432       0.764830       2.679562       0.063040    -2.628341
996      -1.111459      -0.262279       0.332087       0.728942    -0.287938
997      -0.198228       1.570894      -0.018970       1.601360     1.454199
998      -1.491248       1.199736       1.584191       1.238172    -1.402390
999      -1.309098       3.356987       1.629234      -0.613950    -1.242118

[1000 rows x 5 columns]
