In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
%matplotlib notebook 
from sklearn import datasets
import sklearn
import matplotlib as mpl
from sklearn.model_selection import train_test_split

Reducing dimensionality does lose some information (just like
compressing an image to JPEG can degrade its quality), so even
though it will speed up training, it may also make your system
perform slightly worse. It also makes your pipelines a bit more
complex and thus harder to maintain. So you should first try to train
your system with the original data before considering using
dimensionality reduction if training is too slow. In some cases, however,
reducing the dimensionality of the training data may filter out
some noise and unnecessary details and thus result in higher
performance (but in general it won’t; it will just speed up training).

Apart from speeding up training, dimensionality reduction is also extremely useful
for data visualization (or DataViz). Reducing the number of dimensions down to two
(or three) makes it possible to plot a high-dimensional training set on a graph and
often gain some important insights by visually detecting patterns, such as clusters.

# Manifold Learning
The Swiss roll is an example of a 2D manifold. Put simply, a 2D manifold is a 2D
shape that can be bent and twisted in a higher-dimensional space. More generally, a
d-dimensional manifold is a part of an n-dimensional space (where d < n) that locally
resembles a d-dimensional hyperplane. In the case of the Swiss roll, d = 2 and n = 3: it
locally resembles a 2D plane, but it is rolled in the third dimension.

# PCA
Principal Component Analysis (PCA) is by far the most popular dimensionality
reduction algorithm. First it identifies the hyperplane that lies closest to the data, and then
it projects the data onto it.


## Using Scikit-Learn
Scikit-Learn’s PCA class implements PCA using SVD decomposition just like we did
before. The following code applies PCA to reduce the dimensionality of the dataset
down to two dimensions (note that it automatically takes care of centering the data):


In [None]:
from sklearn.decomposition import PCA

# Project the dataset onto its first two principal components.
# The result is named X2D because a Python identifier cannot begin with a
# digit (a name such as `2D` is a SyntaxError).
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

# Choosing the Right Number of Dimensions
Instead of arbitrarily choosing the number of dimensions to reduce down to, it is
generally preferable to choose the number of dimensions that add up to a sufficiently
large portion of the variance (e.g., 95%). Unless, of course, you are reducing
dimensionality for data visualization—in that case you will generally want to reduce the
dimensionality down to 2 or 3.

In [None]:
# When n_components is a float between 0 and 1.0 it is interpreted as the
# explained variance ratio to preserve (here 95%), not as a component count.
pca = PCA(n_components=0.95)
# Fit on X and return X projected onto the retained components.
X_reduced = pca.fit_transform(X)

In [2]:
# Load the iris dataset; X holds the feature matrix and y is a binary
# float target that is 1.0 where the class index equals 2 and 0.0 elsewhere.
iris = datasets.load_iris()
X = iris.data
y = (iris.target == 2).astype(np.float64)

In [3]:
# Display the iris feature matrix.
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
# Display the binary target (1.0 where the iris class index is 2, else 0.0).
y

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [11]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): this scaler is instantiated but never fitted or applied in the
# cells shown; the PCA cells below operate on the raw X. Confirm whether
# scaling was intended.
scaler = StandardScaler()

In [6]:
from sklearn.decomposition import PCA

In [28]:
# No n_components is given, so no reduction is requested here; the outputs
# below show 4 components for the 4 input features.
pca = PCA()

In [29]:
# NOTE(review): despite its name, X2D is not two-dimensional here -- pca was
# built with PCA(), so the projection keeps every component.
X2D = pca.fit_transform(X)

In [30]:
# Display the projected data (one column per retained principal component).
X2D

array([[-2.68412563e+00,  3.19397247e-01, -2.79148276e-02,
        -2.26243707e-03],
       [-2.71414169e+00, -1.77001225e-01, -2.10464272e-01,
        -9.90265503e-02],
       [-2.88899057e+00, -1.44949426e-01,  1.79002563e-02,
        -1.99683897e-02],
       [-2.74534286e+00, -3.18298979e-01,  3.15593736e-02,
         7.55758166e-02],
       [-2.72871654e+00,  3.26754513e-01,  9.00792406e-02,
         6.12585926e-02],
       [-2.28085963e+00,  7.41330449e-01,  1.68677658e-01,
         2.42008576e-02],
       [-2.82053775e+00, -8.94613845e-02,  2.57892158e-01,
         4.81431065e-02],
       [-2.62614497e+00,  1.63384960e-01, -2.18793179e-02,
         4.52978706e-02],
       [-2.88638273e+00, -5.78311754e-01,  2.07595703e-02,
         2.67447358e-02],
       [-2.67275580e+00, -1.13774246e-01, -1.97632725e-01,
         5.62954013e-02],
       [-2.50694709e+00,  6.45068899e-01, -7.53180094e-02,
         1.50199245e-02],
       [-2.61275523e+00,  1.47299392e-02,  1.02150260e-01,
      

In [33]:
# Display the explained variance ratio of each principal component.
pca.explained_variance_ratio_

array([0.92461872, 0.05306648, 0.01710261, 0.00521218])

In [34]:
# Running total of the per-component explained variance ratios.
cumsum = pca.explained_variance_ratio_.cumsum()

In [35]:
# Display the cumulative explained variance ratios.
cumsum

array([0.92461872, 0.97768521, 0.99478782, 1.        ])

In [40]:
# argmax on the boolean mask gives the index of the first True, i.e. the first
# component at which the cumulative ratio reaches 95%; +1 converts that
# zero-based index into a component count.
d = (cumsum >= 0.95).argmax() + 1

In [41]:
# Display the number of components needed to reach the 95% threshold.
d

2

In [43]:
# A float n_components asks PCA to keep enough components to preserve that
# share of the variance (95% here) instead of a fixed component count.
pca = PCA(n_components=0.95)
# Fit on X and return X projected onto the retained components.
X_reduced = pca.fit_transform(X)

In [44]:
# Display the data projected onto the components retained for 95% variance.
X_reduced

array([[-2.68412563,  0.31939725],
       [-2.71414169, -0.17700123],
       [-2.88899057, -0.14494943],
       [-2.74534286, -0.31829898],
       [-2.72871654,  0.32675451],
       [-2.28085963,  0.74133045],
       [-2.82053775, -0.08946138],
       [-2.62614497,  0.16338496],
       [-2.88638273, -0.57831175],
       [-2.6727558 , -0.11377425],
       [-2.50694709,  0.6450689 ],
       [-2.61275523,  0.01472994],
       [-2.78610927, -0.235112  ],
       [-3.22380374, -0.51139459],
       [-2.64475039,  1.17876464],
       [-2.38603903,  1.33806233],
       [-2.62352788,  0.81067951],
       [-2.64829671,  0.31184914],
       [-2.19982032,  0.87283904],
       [-2.5879864 ,  0.51356031],
       [-2.31025622,  0.39134594],
       [-2.54370523,  0.43299606],
       [-3.21593942,  0.13346807],
       [-2.30273318,  0.09870885],
       [-2.35575405, -0.03728186],
       [-2.50666891, -0.14601688],
       [-2.46882007,  0.13095149],
       [-2.56231991,  0.36771886],
       [-2.63953472,

Yet another option is to plot the explained variance as a function of the number of
dimensions (simply plot cumsum; see Figure 8-8). There will usually be an elbow in the
curve, where the explained variance stops growing fast. You can think of this as the
intrinsic dimensionality of the dataset. In the MNIST example that this passage refers to
(not the 4-feature iris data used above), you can see that reducing the
dimensionality down to about 100 dimensions wouldn’t lose too much explained
variance.

# Incremental PCA
One problem with the preceding implementation of PCA is that it requires the whole
training set to fit in memory in order for the SVD algorithm to run. Fortunately,
Incremental PCA (IPCA) algorithms have been developed: you can split the training
set into mini-batches and feed an IPCA algorithm one mini-batch at a time. This is
useful for large training sets, and also to apply PCA online (i.e., on the fly, as new
instances arrive).
The following code splits the MNIST dataset into 100 mini-batches (using NumPy’s
array_split() function) and feeds them to Scikit-Learn’s IncrementalPCA class to
reduce the dimensionality of the MNIST dataset down to 154 dimensions (just like
before). Note that you must call the partial_fit() method with each mini-batch
rather than the fit() method with the whole training set:

In [None]:
from sklearn.decomposition import IncrementalPCA

# Fit PCA incrementally: split the training set into mini-batches and update
# the model with one batch at a time via partial_fit().
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
mini_batches = np.array_split(X_mnist, n_batches)
for X_batch in mini_batches:
    inc_pca.partial_fit(X_batch)

# Project the full dataset with the incrementally fitted model.
X_mnist_reduced = inc_pca.transform(X_mnist)

Alternatively, you can use NumPy’s memmap class, which allows you to manipulate a
large array stored in a binary file on disk as if it were entirely in memory; the class
loads only the data it needs in memory, when it needs it. Since the IncrementalPCA
class uses only a small part of the array at any given time, the memory usage remains
under control. This makes it possible to call the usual fit() method, as you can see
in the following code:

In [None]:
# Size each mini-batch so the data is processed in n_batches chunks.
batch_size = m // n_batches

# Map the on-disk binary file as a read-only (m, n) float32 array.
X_mm = np.memmap(
    filename,
    dtype="float32",
    mode="readonly",
    shape=(m, n),
)

# With batch_size set, the regular fit() method can be called on the
# memory-mapped array.
inc_pca = IncrementalPCA(
    n_components=154,
    batch_size=batch_size,
)
inc_pca.fit(X_mm)

# Randomized PCA 
Use this technique for very large data sets.

In [None]:
# Request the randomized SVD solver while keeping 154 components.
# NOTE(review): no random_state is passed, so results may differ between
# runs -- consider fixing a seed for reproducibility.
rnd_pca = PCA(n_components=154, svd_solver="randomized")

# Fit on the MNIST data and return its reduced representation.
X_reduced = rnd_pca.fit_transform(X_mnist)

# t-SNE
t-Distributed Stochastic Neighbor Embedding (t-SNE) reduces dimensionality
while trying to keep similar instances close and dissimilar instances apart. It is
mostly used for visualization, in particular to visualize clusters of instances in
high-dimensional space (e.g., to visualize the MNIST images in 2D).
