In [None]:
# Chapter 9. Dimensionality Reduction Using Feature Extraction

In [7]:
# 9.1 Reducing Features Using Principal Components

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets

# load the data
digits = datasets.load_digits()

# # Standardize the feature matrix
features = StandardScaler().fit_transform(digits.data)

# # Create a PCA that will retain 99% of variance
pca = PCA(n_components=0.99, whiten=True)

# Conduct PCA
features_pca = pca.fit_transform(features)

print("original number of features: ", features.shape[1])
print("Reduced number of features: ", features_pca.shape[1])


original number of features:  64
Reduced number of features:  54


In [9]:
# 9.2 Reducing Features When Data Is Linearly Inseparable

from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles

# Create linearly inseparable data
features, _ = make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)

#Apply kernel PCA with radius basis function (RBF) kernel
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
features_kpca =  kpca.fit_transform(features)

print("Original number of features: ", features.shape[1])
print("Reduced number of features: ", features_kpca.shape[1])

Original number of features:  2
Reduced number of features:  1


In [None]:
# 9.3 Reducing Features by Maximizing Class Separability

from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# load iris flower dataset
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create and run an LDA, then use it to transform the features
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)

# print the number of features
print("Original number of features: ", features.shape[1])
print("Reduced number of features: ", features_lda.shape[1])

Original number of features:  4
Reduced number of features:  1


In [2]:
lda.explained_variance_ratio_

array([0.9912126])

In [3]:
# Run  LinearDiscriminantAnalysis with n_components set to None

# create and run LDA
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features, target)

# create array of explained variance ratios
lda_var_ratios = lda.explained_variance_ratio_

# create function
def select_n_component(var_ratio, goal_var: float) -> int:
    
    total_variance = 0
    
    # initial number of features
    n_component = 0
    
    for explained_variance in var_ratio:
        total_variance += explained_variance
        
        n_component += 1
        
        if total_variance >= goal_var:
            break
    
    return n_component

select_n_component(lda_var_ratios,0.95)
    

1

In [1]:
# 9.4 Reducing Features Using Matrix Factorization
# Use nonnegative matrix factorization (NMF)

from sklearn.decomposition import NMF
from sklearn import datasets

# load data
digits = datasets.load_digits()

# load feature matrix
features = digits.data

# create and fir and apply NMF
nmf = NMF(n_components=10, random_state=4)
features_nmf = nmf.fit_transform(features)

print("original number of features: ", features.shape[1])
print("reduced number of features: ", features_nmf.shape[1])

original number of features:  64
reduced number of features:  10


In [30]:
# 9.5 Reducing Features on Sparse Data

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np

# load the data
digits = datasets.load_digits()

# Standardize feature matrix
features = StandardScaler().fit_transform(digits.data)

# make sparse matrix
features_sparse = csr_matrix(features)

# create a TSVD
tsvd = TruncatedSVD(n_components=10)

features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)

print("original number of features: ", features_sparse.shape[1])
print("reduced number of features: ", features_sparse_tsvd.shape[1])

original number of features:  64
reduced number of features:  10


In [31]:
features_sparse_tsvd

array([[-1.9142176 , -0.95450096, -3.94603194, ...,  1.49702093,
         0.13070244, -0.82291922],
       [-0.58897591,  0.92463595,  3.92476521, ...,  0.55603383,
         1.07432705,  0.09376282],
       [-1.30203469, -0.31718982,  3.0233352 , ...,  1.15353154,
         0.77918653, -1.10457132],
       ...,
       [-1.02259596, -0.14791073,  2.46997036, ...,  0.52912405,
         2.05476542, -2.03812374],
       [-1.07605884, -0.38090814, -2.4554995 , ...,  0.76512564,
         1.08186336, -0.34064668],
       [ 1.25769755, -2.22758764,  0.2836077 , ..., -1.20408527,
         0.82285946, -1.82851047]])

In [39]:
tsvd.explained_variance_ratio_

array([1.20339161e-01, 9.56105440e-02, 8.44441489e-02, 6.49840791e-02,
       4.86015488e-02, 4.21411987e-02, 3.94208280e-02, 3.38938092e-02,
       2.99822101e-02, 2.93200255e-02, 2.78180546e-02, 2.57705509e-02,
       2.27530332e-02, 2.22717974e-02, 2.16522943e-02, 1.91416661e-02,
       1.77554709e-02, 1.63806927e-02, 1.59646017e-02, 1.48919119e-02,
       1.34796957e-02, 1.27193137e-02, 1.16583735e-02, 1.05764660e-02,
       9.75315947e-03, 9.44558990e-03, 8.63013827e-03, 8.36642854e-03,
       7.97693248e-03, 7.46471371e-03, 7.25582151e-03, 6.91911245e-03,
       6.53908536e-03, 6.40792574e-03, 5.91384112e-03, 5.71162405e-03,
       5.23636803e-03, 4.81807586e-03, 4.53719260e-03, 4.23162753e-03,
       4.06053070e-03, 3.97084808e-03, 3.56493303e-03, 3.40787181e-03,
       3.27835335e-03, 3.11032007e-03, 2.88575294e-03, 2.76489264e-03,
       2.59174941e-03, 2.34483006e-03, 2.18256858e-03, 2.03597635e-03,
       1.95512426e-03, 1.83318499e-03, 1.67946387e-03, 1.61236062e-03,
      

In [40]:
tsvd.explained_variance_ratio_[0:3].sum()

np.float64(0.30039385393457296)

In [42]:
# Create and run a TSVD with one less than number of features
tvsd = TruncatedSVD(n_components=features_sparse.shape[1]-1)
features_tsvd = tsvd.fit(features)

tsvd_var_ratio = tsvd.explained_variance_ratio_

def select_n_components(var_ratio, goal_var):
    
    total_variance = 0
    n_components = 0
    
    for explained_variance in var_ratio:
        total_variance += explained_variance
        n_components += 1
        
        if total_variance >= goal_var:
            break
    
    return n_components


select_n_components(tsvd_var_ratio, 0.95)

40

In [22]:
tsvd_var_ratio.size

10