# Dimensionality Reduction Using Feature Selection

In [2]:
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

In [10]:
# Load the iris dataset bundled with scikit-learn
iris = datasets.load_iris()

# Split the loaded dataset into the feature matrix and the target vector
feature, target = iris.data, iris.target

In [11]:
# Create a thresholder configured with a variance cut-off of 0.5
thresholder = VarianceThreshold(threshold=0.5)

In [12]:
# Learn the per-feature variances, then keep only the high-variance columns
thresholder.fit(feature)
features_high_variance = thresholder.transform(feature)

In [13]:
# Peek at the first five rows of the reduced feature matrix
features_high_variance[:5]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

In [14]:
# Fit on the original features and show the variance computed for each column
thresholder.fit(feature)
thresholder.variances_

array([0.68112222, 0.18871289, 3.09550267, 0.57713289])

In [15]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature matrix with StandardScaler
scaler = StandardScaler()
features_std = scaler.fit(feature).transform(feature)

In [16]:
# Compute the variance of each standardized feature. They all come out as 1,
# so a variance threshold cannot distinguish features after standardization.
selector = VarianceThreshold()
selector.fit(features_std)
selector.variances_

array([1., 1., 1., 1.])

In [17]:
from sklearn.feature_selection import VarianceThreshold
# Build a small binary feature matrix where:
#   feature 0 is 0 in 80% of the rows
#   feature 1 is 1 in 80% of the rows
#   feature 2 is 0 in 60% of the rows and 1 in 40%
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]
# Threshold at p * (1 - p) with p = 0.75, the variance of a binary feature
# whose majority value covers 75% of the rows
thresholder = VarianceThreshold(threshold=0.75 * (1 - 0.75))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [18]:
import pandas as pd
import numpy as np
# Create feature matrix in which columns 0 and 1 are highly correlated
features = np.array([
    [1, 1, 1],
    [2, 2, 0],
    [3, 3, 1],
    [4, 4, 0],
    [5, 5, 1],
    [6, 6, 0],
    [7, 7, 1],
    [8, 7, 0],
    [9, 7, 1],
])
# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)
# Absolute pairwise correlations between the feature columns
corr_matrix = dataframe.corr().abs()
# Keep only the strict upper triangle (k=1 skips the diagonal) so each pair of
# features is inspected once; everything else becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# Collect the labels of columns whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
# Drop those columns by label (not by position) and preview the result
dataframe.drop(columns=to_drop).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [19]:
# Signed pairwise correlation matrix of the feature columns
# (no absolute value is taken here)
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [20]:
# Show the upper-triangle view of the absolute correlation matrix;
# entries on and below the diagonal are NaN
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [21]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Reload iris and pull out the target vector
iris = load_iris()
target = iris.target
# Treat the measurements as categorical by casting the feature matrix to integers
features = iris.data.astype(int)
# Keep the two features with the largest chi-squared statistic against the target
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)
# Report the number of columns before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [22]:
# Selector that keeps the two features with the highest ANOVA F-values
fvalue_selector = SelectKBest(score_func=f_classif, k=2)

In [24]:
# Score the features against the target and keep the selected columns
fvalue_selector.fit(features, target)
features_kbest = fvalue_selector.transform(features)
# Report the number of columns before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [25]:
from sklearn.feature_selection import SelectPercentile
# Keep the top 75% of features, ranked by F-value
fvalue_selector = SelectPercentile(score_func=f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)
# Report the number of columns before and after selection
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


In [26]:
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
# Silence a harmless warning matching "^internal gelsd" raised from scipy
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
# Generate a synthetic regression problem: 10,000 samples, 100 features, of which
# only 2 are informative. Only the feature matrix and target vector are returned.
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)
# Linear regression model used as the estimator inside the elimination loop
ols = linear_model.LinearRegression()
# Recursive feature elimination with cross-validation: one feature per step,
# scored by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
# Fit, then reduce the feature matrix to the selected columns
rfecv.fit(features, target).transform(features)

array([[ 0.00850799,  0.7031277 ],
       [-1.07500204,  2.56148527],
       [ 1.37940721, -1.77039484],
       ...,
       [-0.80331656, -1.60648007],
       [ 0.39508844, -1.34564911],
       [-0.55383035,  0.82880112]])

In [27]:
# Number of features selected by the fitted RFECV
rfecv.n_features_

2

In [28]:
# Boolean mask over the features: True marks the features that were selected
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [29]:
# Rank of every feature, from best (1) to worst; the selected features share rank 1
rfecv.ranking_

array([91, 89, 74, 93,  9,  1, 39, 63, 52, 47, 68, 94, 82, 18, 43, 64, 45,
       67, 21, 41, 31, 55, 44, 79,  7, 32, 99, 33, 50, 11, 37, 15, 14,  3,
       80, 60, 83, 19, 86,  1, 49, 95, 84, 26, 23, 77,  4, 66, 22, 61, 73,
       38,  6, 92, 17, 59, 24, 51, 85,  8, 75, 72, 10, 71, 54, 20, 30, 46,
       65, 13, 57,  5, 16, 58, 90, 34, 88, 97,  2, 69, 12, 25, 62, 27, 53,
       81, 96, 48, 56, 98, 29, 35, 70, 28, 76, 87, 40, 78, 36, 42])