# Feature Selection Sample

In [4]:
import pandas as pd

In [5]:
from sklearn.datasets import load_boston
# Load the Boston housing bunch and assemble features and target into one frame.
# NOTE(review): load_boston is deprecated/removed in newer scikit-learn releases —
# confirm the pinned scikit-learn version before re-running.
boston_bunch = load_boston()
df = pd.DataFrame(boston_bunch.data, columns=boston_bunch.feature_names).assign(
    target=boston_bunch.target
)


In [6]:
# Preview the first rows to sanity-check the assembled frame.
df.head() 

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [14]:
# Print the column dtypes and non-null counts of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  target   506 non-null    float64
dtypes: float64(14)
memory usage: 55.5 KB


# Filtering Methods

### 1-Pearson correlation coefficient

In [8]:
# Pearson correlation of every feature with the target, strongest positive first.
# The target's own entry is removed by label rather than by position, so the
# result does not depend on where the self-correlation lands after sorting
# (e.g. when another column is perfectly correlated with the target).
corr = df.corr()["target"].drop("target").sort_values(ascending=False)
# The sign is irrelevant for relevance, so compare magnitudes
abs_corr = corr.abs()
# Arbitrary cut-off: keep features whose absolute correlation exceeds 0.4
relevant_features = abs_corr[abs_corr > 0.4]
relevant_features

RM         0.695360
NOX        0.427321
TAX        0.468536
INDUS      0.483725
PTRATIO    0.507787
LSTAT      0.737663
Name: target, dtype: float64

### 2- Thresholding Numerical Feature Variance

In [4]:
# Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
# Load the iris dataset and unpack its feature matrix and target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [6]:
# Selector that removes low-variance features, using a variance threshold of 0.5
thresholder = VarianceThreshold(threshold=0.5)
# Fit on the features, then keep only the columns that pass the threshold
features_high_variance = thresholder.fit(features).transform(features)
# Peek at the first three rows of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [7]:
# View variances for each feature
# (the chained call refits the thresholder on `features` before reading variances_)
thresholder.fit(features).variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [8]:
# Dimensions of the iris feature matrix
iris.data.shape

(150, 4)

In [9]:
# Column names of the iris feature matrix
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [10]:
# Class labels of the iris target
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


### Univariate Feature Selection
Univariate Feature Selection is a statistical method used to select the features that have the strongest relationship with their corresponding labels. Using the SelectKBest method, we can decide which metric to use to evaluate our features and the number K of best features we want to keep. Different types of scoring functions are available depending on our needs:

Classification: chi2, f_classif, mutual_info_classif
Regression: f_regression, mutual_info_regression

### 3-chi-square

If the features are categorical, calculate a chi-square ($\chi^2$) statistic between each feature and the target vector:

In [11]:
# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Load data
iris = load_iris()
features = iris.data
target = iris.target
# chi2 is meant for non-negative, categorical/count-like inputs, so build an
# integer (truncated) copy for this test. It is kept under its own name:
# overwriting `features` would silently pass the truncated values on to the
# later cells that treat the features as quantitative.
features_categorical = features.astype(int)
# Select two features with highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit_transform(features_categorical, target)
# Show results
print("Original number of features:", features_categorical.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


### 4-ANOVA

If the features are quantitative, compute the ANOVA F-value between each feature and the target vector:

In [12]:
# Score each feature by its ANOVA F-value against the target and keep the best two
fvalue_selector = SelectKBest(score_func=f_classif, k=2)
features_kbest = fvalue_selector.fit(features, target).transform(features)
# Report how many columns went in and how many were kept
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


Instead of selecting a specific number of features, we can also use
SelectPercentile to select the top n percent of features:

In [13]:
# Load library
from sklearn.feature_selection import SelectPercentile
# Keep the top 75% of features ranked by ANOVA F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit(features, target).transform(features)
# Report how many columns went in and how many were kept
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


# Wrapper Methods

### recursive feature elimination (RFE)

In [9]:
#RFE(recursive feature elimination)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
# input and output features
X = df.drop(columns="target")
y = df["target"]
# defining model to build
lin_reg = LinearRegression()
# Create the RFE model and select 6 attributes, removing one feature per step.
# n_features_to_select is passed by keyword because newer scikit-learn releases
# no longer accept it as a positional argument.
rfe = RFE(lin_reg, n_features_to_select=6, step=1)
rfe.fit(X, y)
# Names of the columns flagged as selected by the fitted selector's boolean mask
selected_features = X.columns[rfe.support_].tolist()
# Summarize the selection. Each line is its own f-string: previously the
# estimator line was a plain string, so the literal "{rfe.estimator_}" was printed.
print(f"Number of selected features: {rfe.n_features_}")
print(f"Mask: {rfe.support_}")
print(f"Selected Features: {selected_features}")
print(f"Estimator : {rfe.estimator_}")



Number of selected features: 6
Mask: [False False False  True  True  True False  True False False  True False
  True]
Selected Features: ['CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT'] 
Estimator : {rfe.estimator_}


### Use scikit-learn’s RFECV to conduct recursive feature elimination (RFE) using cross-validation (CV).

In [14]:
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
# Silence the harmless scipy warning whose message starts with "internal gelsd"
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
# Synthetic regression problem: 10,000 samples, 100 features, 2 of them informative
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)
# Plain linear regression as the estimator used for elimination
ols = linear_model.LinearRegression()
# Eliminate one feature per step, scoring with negative mean squared error under cross-validation
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
# Fit the selector and display the feature matrix reduced to the kept columns
rfecv.fit_transform(features, target)

array([[ 0.00850799,  0.22141345,  0.7031277 , ..., -0.10738769,
         0.71832105, -0.1595391 ],
       [-1.07500204, -0.81480548,  2.56148527, ..., -0.91773159,
        -0.6688925 , -1.68400704],
       [ 1.37940721,  0.95713976, -1.77039484, ..., -1.17442094,
        -0.19582889,  0.58847495],
       ...,
       [-0.80331656, -0.55700395, -1.60648007, ..., -0.20756972,
        -1.10567169,  0.56935412],
       [ 0.39508844, -2.27367489, -1.34564911, ..., -0.61308782,
         0.27774093,  0.68749393],
       [-0.55383035, -0.48159973,  0.82880112, ...,  0.3816852 ,
         0.61610294,  0.8233291 ]])

In [15]:
# Number of features the fitted RFECV selector kept
rfecv.n_features_

9

In [16]:
# Boolean mask over the input features; True marks a feature that was kept
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False,  True,  True,  True, False, False, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

# Embedded Methods

In [13]:
from sklearn.linear_model import Lasso
# Fit a Lasso model with default settings on X and y
lasso = Lasso()
lasso.fit(X, y)
# Keep the columns whose coefficient is non-zero
kept_cols = X.columns[lasso.coef_ != 0].tolist()
kept_cols

['CRIM', 'ZN', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']