In [1]:
import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt

In [2]:
#Using the MNIST dataset for dimensionality reduction
mnist = fetch_openml('mnist_784')

X, y = mnist['data'], mnist['target']

In [3]:
#Creating training and testing dataset and shuffling them 
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Principal Component Analysis (PCA)

In [4]:
#Computing PCA using SVD Decomposition
pca = PCA(n_components = 2)
pca.fit(X_train)

PCA(n_components=2)

In [5]:
#Viewing the min number of dimensions needed to get 95% variance
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d

1

In [6]:
#Computing PCA for 1D since its need to get 95% variance
pca = PCA(n_components = 0.95)
X_reduced = pca.fit_transform(X_train)

In [7]:
#Viewing the number of dimensions in the new fitted model
len(pca.explained_variance_ratio_)

154

In [8]:
#Viewing the reconstruction error for the model
X_recovered = pca.inverse_transform(X_reduced)

mean_squared_error(X_train, X_recovered)

217.79366111265557

# Incremental PCA

In [9]:
#Splitting the MNIST dataset into 100 mini-batches for incremental learning and using Incremental PCA Model
n_batches = 100

pca = IncrementalPCA(n_components = 154) #Since we got dimensionality reduction to 154 for PCA
for X_batch in np.array_split(X_train, n_batches):
  pca.partial_fit(X_batch)  

In [None]:
#Reducing dimensions of the dataset
X_reduced = pca.fit_transform(X_train)

# Randomized PCA

In [None]:
#Using the PCA model with randomized approximation of n principal components
pca = PCA(n_components = 154, svd_solver="randomized")
X_reduced = pca.fit_transform(X_train) 

In [None]:
#Choosing the best Kernel for the reduction using Grid Search Cross Validation
classifier = Pipeline([
          ("kpca", KernelPCA(n_components=2)),
          ("log_reg", LogisticRegression())
])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X_train, y_train)

In [None]:
#Viewing the best grid parameters
grid_search.best_params_

# Locally Linear Embedding (LLE)

In [None]:
#Using Local Linear Embedding to reduce dimensions
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X_train)