In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

import os

%matplotlib inline

In [3]:
# Figures saved by this notebook go to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(os.path.join(PROJECT_ROOT_DIR, "images"), CHAPTER_ID)
# Create the folder up front; exist_ok keeps re-running this cell harmless.
os.makedirs(name=IMAGES_PATH, exist_ok=True)

In [4]:
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current pyplot figure to ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        File extension; also forwarded to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Forwarded to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [3]:
from sklearn.decomposition import PCA

In [10]:
# Load MNIST through Keras as (inputs, labels) pairs for the train and test splits.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [11]:
# Flatten each sample into a single row: inputs become (n_samples, n_features);
# the -1 lets NumPy infer the second dimension. Labels are reshaped the same way.
x_train = x_train.reshape(x_train.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)
x_test = x_test.reshape(x_test.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

In [13]:
# Peek at the flattened training matrix (the repr of a large array is abbreviated).
x_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

Perform PCA without reducing dimensionality, then compute the minimum number of dimensions required to preserve 95% of the training set's variance.

In [17]:
# Keep every component (n_components left at its default) so that
# explained_variance_ratio_ covers the full spectrum. With n_components=2 the
# cumulative sum only contains two values; if they never reach 0.95, np.argmax
# over the all-False mask returns 0 and d is silently reported as 1.
pca = PCA()
# NOTE: fit() returns the fitted estimator itself, not projected data. The X2D
# name is kept only so that any later cell referring to it keeps working.
X2D = pca.fit(x_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
# Index of the first component at which the cumulative ratio reaches 95%,
# plus one to convert the 0-based index into a number of dimensions.
d = np.argmax(cumsum >= 0.95) + 1

A better way is to set n_components to a float between 0.0 and 1.0 (here 0.95): PCA then chooses the smallest number of dimensions that preserves that fraction of the variance.

In [20]:
# A float n_components asks PCA to keep as many components as are needed to
# preserve that fraction of the variance (here 95%).
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(x_train)

In [25]:
# Shape of the flattened training set before reduction: (n_samples, n_features).
x_train.shape

(60000, 784)

In [22]:
# 154 features remain out of the original 784, i.e. less than 20% of the
# original dimensionality.
# NOTE(review): the actual memory saving also depends on dtype (x_train is
# uint8; the PCA output is presumably floating point) -- compare .nbytes to confirm.
X_reduced.shape

(60000, 154)

PCA for compression and decompression

In [27]:
# 154 is the number of components that PCA(n_components=0.95) selected above.
pca = PCA(n_components = 154)
x_reduced = pca.fit_transform(x_train) # compression down to 154 dimensions
x_recovered = pca.inverse_transform(x_reduced) # decompression back to 784 dimensions (an approximation of x_train)

In [29]:
# Compressed representation: one row per sample, 154 components each.
x_reduced.shape

(60000, 154)

In [30]:
# The reconstruction has the same shape as the original flattened training set.
x_recovered.shape

(60000, 784)

Randomized PCA

In [32]:
# The randomized solver is stochastic: pin random_state so the projection is
# reproducible across kernel restarts. To use the full SVD approach instead,
# change the svd_solver hyperparameter to 'full'.
rnd_pca = PCA(n_components=154, svd_solver='randomized', random_state=42)
x_reduced = rnd_pca.fit_transform(x_train)

In [33]:
# Randomized PCA output: one row per sample, 154 components each.
x_reduced.shape

(60000, 154)

Incremental PCA: split the training set into mini-batches and feed them to the IncrementalPCA (IPCA) algorithm one mini-batch at a time.

In [34]:
from sklearn.decomposition import IncrementalPCA

In [36]:
# Number of chunks the training set is split into (60000 rows -> 600 per chunk).
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
# Fit incrementally, one chunk at a time.
# NOTE(review): each chunk presumably needs at least n_components rows -- confirm
# against the IncrementalPCA documentation before lowering the chunk size.
for x_batch in np.array_split(x_train, n_batches):
    inc_pca.partial_fit(x_batch)
# Project the whole training set with the incrementally fitted model.
x_reduced = inc_pca.transform(x_train)

Kernel PCA: often good at preserving clusters of instances after projection, and it can even unroll datasets that lie close to a twisted manifold (e.g. the "Swiss roll").

In [37]:
from sklearn.decomposition import KernelPCA

# Kernel PCA builds a dense n_samples x n_samples kernel matrix. On all 60,000
# training rows that is 60000**2 float64 values (~29 GB) before the
# eigendecomposition even starts, so fit on a subset instead.
# NOTE: x_reduced now has N_KPCA_SAMPLES rows; pair it with
# y_train[:N_KPCA_SAMPLES] when labels are needed.
N_KPCA_SAMPLES = 10000
# Scale the uint8 pixel values from [0, 255] to [0, 1]. On raw intensities the
# squared distances are so large that exp(-gamma * dist**2) underflows to 0 for
# gamma=0.04, which makes the kernel matrix degenerate to (almost) the identity.
x_kpca = x_train[:N_KPCA_SAMPLES] / 255.0

rbf_pca = KernelPCA(n_components=2, kernel='rbf', gamma=0.04)
x_reduced = rbf_pca.fit_transform(x_kpca)