In [2]:

# 10.1 Thresholding Numerical Feature Variance

from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold

# Load the iris dataset and pull out the feature matrix and target vector
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Fit a variance-based selector configured with a threshold of 0.5
vrc = VarianceThreshold(threshold=0.5).fit(features)

# Apply the fitted selector to obtain the reduced feature matrix
features_high_variance = vrc.transform(features)

# Display the first three observations of the reduced matrix
features_high_variance[:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [3]:
# 10.2 Thresholding Binary Feature Variance

# Binary feature matrix where:
#   column 0 is 80% class 0
#   column 1 is 80% class 1
#   column 2 is 60% class 0 and 40% class 1
features = [
    [0, 1, 0],
    [0, 1, 1],
    [0, 1, 0],
    [0, 1, 1],
    [1, 0, 0],
]

# The variance cutoff handed to the selector is the product 0.75 * 0.25
variance_cutoff = 0.75 * 0.25
vrc = VarianceThreshold(threshold=variance_cutoff)

# Fit on the matrix and display the columns that survive the cutoff
vrc.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [6]:

# 10.3 Handling Highly Correlated Features
import numpy as np
import pandas as pd

# Small feature matrix in which columns 0 and 1 are almost identical
features = np.array(
    [[1, 1, 1],
     [2, 2, 0],
     [3, 3, 1],
     [4, 4, 0],
     [5, 5, 1],
     [6, 6, 0],
     [7, 7, 1],
     [8, 7, 0],
     [9, 7, 1]])
dataframe = pd.DataFrame(features)

# Absolute pairwise correlations between the feature columns
corr_matrix = dataframe.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair of columns is examined once. The mask is built with the builtin
# `bool`: the `np.bool` alias was removed in NumPy 1.24 and raises
# AttributeError there.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Labels of the columns whose correlation with an earlier column exceeds 0.95
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop by label: `to_drop` holds column labels, so indexing
# `dataframe.columns` positionally with them is only correct for the default
# integer column index.
features_uncorrelated = dataframe.drop(columns=to_drop)
features_uncorrelated.head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [7]:
# 10.4 Removing Irrelevant Features for Classification

# Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Load the iris data
iris = load_iris()
target = iris.target

# Cast the measurements to integers so they can be treated as categorical
features = iris.data.astype(int)

# Selector that keeps the two features with the highest chi-squared statistics
chi2_selector = SelectKBest(score_func=chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)

# Report the feature count before and after selection
n_features_original = features.shape[1]
n_features_reduced = features_kbest.shape[1]
print("Original number of features:", n_features_original)
print("Reduced number of features:", n_features_reduced)

Original number of features: 4
Reduced number of features: 2


In [8]:

# 10.5 Recursively Eliminating Features

# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
# Silence warnings raised from scipy whose message starts with "internal gelsd"
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd",
)

# Synthetic regression problem: 10000 samples and 100 features, of which
# 2 are informative; returns the feature matrix and the target vector
features, target = make_regression(
    n_samples=10000,
    n_features=100,
    n_informative=2,
    random_state=1,
)

# Ordinary least squares estimator used to rank the features
ols = linear_model.LinearRegression()

# Recursive feature elimination with cross-validation, removing one feature
# per step and scoring candidates by negative mean squared error
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)

# Display the feature matrix restricted to the selected features
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 , -0.09009697],
       [-1.07500204,  2.56148527, -0.83288561],
       [ 1.37940721, -1.77039484, -0.19323117],
       ...,
       [-0.80331656, -1.60648007,  0.16783823],
       [ 0.39508844, -1.34564911, -0.23055945],
       [-0.55383035,  0.82880112,  0.05952898]])