### Feature Selection - Dropping constant features
In this step we remove constant features, i.e. columns that hold the same value for every sample.
Such features carry no information and therefore do not help in solving the problem statement.

##### Variance Threshold
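Feature selector that removes all low-variance features. Features whose variance is not above the chosen threshold are removed; with the default threshold of 0 this drops only the features that have the same value in every sample (constant features).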

This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.

In [11]:
# Import pandas to create DataFrame 
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split  

# Toy feature matrix: columns A and B vary, columns C and D are constant.
X = pd.DataFrame(
    {
        "A": [1, 2, 4, 1, 2, 4],
        "B": [4, 5, 6, 7, 8, 9],
        "C": [0, 0, 0, 0, 0, 0],
        "D": [1, 1, 1, 1, 1, 1],
    }
)

# Target values, one per row of X.
y = np.array([4, 7, 8, 10, 12, 1])

# Split into train and test parts; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train

Unnamed: 0,A,B,C,D
4,2,8,0,1
0,1,4,0,1
3,1,7,0,1
5,4,9,0,1


In [12]:
# DYNAMIC
from sklearn.feature_selection import VarianceThreshold

def dropping_constant_features(X_train, X_test, threshold=0.0):
    """Drop low-variance columns, deciding on the training split only.

    A ``VarianceThreshold`` selector is fitted on ``X_train``. Columns the
    selector does not support are removed from ``X_train``, and the same
    columns are removed from ``X_test`` so both frames keep an identical
    feature set. A one-line summary of the dropped columns is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; the selector is fitted on this frame.
    X_test : pandas.DataFrame
        Test features; must contain the columns kept from ``X_train``.
    threshold : float, default 0.0
        Passed to ``VarianceThreshold(threshold=...)``. With 0, only
        columns that are constant in ``X_train`` are dropped.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, features_low_variance)`` where
        the last item is the list of dropped column labels, in the column
        order of ``X_train``.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(X_train)

    # Positional boolean mask over X_train's columns (True = keep). Working
    # with the mask instead of labels keeps the selection correct even when
    # two columns share the same label.
    keep_mask = selector.get_support()
    features_low_variance = X_train.columns[~keep_mask].tolist()
    print(f'Features with low variance : {features_low_variance}. Amount : {len(features_low_variance)} out of {len(X_train.columns)} total features')

    X_train_selected = X_train.loc[:, keep_mask]

    # The test split is reduced to the columns chosen on the training split;
    # its own variances are deliberately not inspected.
    if X_test.columns.equals(X_train.columns):
        X_test_selected = X_test.loc[:, keep_mask]
    else:
        # Column layout differs (e.g. reordered): fall back to label lookup.
        X_test_selected = X_test[X_train.columns[keep_mask]]

    return X_train_selected, X_test_selected, features_low_variance


# threshold=0 drops only the columns that are constant in the training split;
# the same columns are then removed from the test split.
X_train, X_test, features_low_variance = dropping_constant_features(
    X_train,
    X_test,
    threshold=0,
)
X_train.head()

Features with low variance : ['C', 'D']. Amount : 2 out of 4 total features


Unnamed: 0,A,B
4,2,8
0,1,4
3,1,7
5,4,9


In [13]:
# Show the test split after selection; it should have the same columns as X_train.
X_test.head()

Unnamed: 0,A,B
2,4,6
1,2,5


In [9]:
# Column labels that dropping_constant_features reported as low-variance and removed.
features_low_variance

['C', 'D']