# Dimensionality Reduction on MNIST

In [1]:
from sklearn.datasets import fetch_openml

# Download MNIST (OpenML dataset "mnist_784", version 1) as plain arrays
# rather than a DataFrame: X holds the features, y the labels.
X, y = fetch_openml(
    "mnist_784",
    version=1,
    return_X_y=True,
    as_frame=False,
)
X.shape

(70000, 784)

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 10,000 samples for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=42
)
for features in (x_train, x_test):
    print(features.shape)

(60000, 784)
(10000, 784)


In [3]:
from sklearn.ensemble import RandomForestClassifier
import time

# Baseline: a 100-tree random forest trained on the full, unreduced feature set.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
forest.fit(x_train, y_train)
print(f'Training without dimensionality reduction took {time.perf_counter() - start}')

Training without dimensionality reduction took 33.20436406135559


## Dimensionality Reduction

In [4]:
from sklearn.decomposition import PCA

# Same forest configuration as the baseline so that only the input features differ.
light_forest = RandomForestClassifier(n_estimators=100, random_state=42)
# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction (here 95%) of the training-set variance.
pca = PCA(n_components=0.95, random_state=42)

# Time the projection and the training separately. perf_counter() is monotonic
# and high-resolution, unlike time.time(), which follows the adjustable wall clock.
start = time.perf_counter()
light_x = pca.fit_transform(x_train)
print(f'Dimensionality reduction took {time.perf_counter() - start}')
print(light_x.shape)

# NOTE: in the recorded run, fitting on the reduced features was slower than
# fitting on the raw pixels (~93 s vs ~33 s), so fewer features did not
# translate into faster training for this model.
start = time.perf_counter()
light_forest.fit(light_x, y_train)
print(f'Training with dimensionality reduction took {time.perf_counter() - start}')

Dimensionality reduction took 5.272422790527344
(60000, 154)
Training with dimensionality reduction took 93.1807656288147


## Test

In [5]:
# Project the test set with the PCA fitted on the training data (transform
# only, no refit) so both splits share the same components.
light_x_test = pca.transform(x_test)

# Mean accuracy of each forest on the held-out test set.
baseline_accuracy = forest.score(x_test, y_test)
reduced_accuracy = light_forest.score(light_x_test, y_test)
print(f'Without dimensionality reduction: {baseline_accuracy}')
print(f'With dimensionality reduction: {reduced_accuracy}')

Without dimensionality reduction: 0.9674
With dimensionality reduction: 0.9469
