# Speeding up supervised learning with PCA

In [2]:
from sklearn import datasets

# Fetch version 1 of the `mnist_784` dataset from OpenML as a
# (features, labels) pair, then display the feature matrix dimensions.
X, y = datasets.fetch_openml(
    name='mnist_784',
    version=1,
    return_X_y=True,
    parser='auto',
)
X.shape


(70000, 784)

In [3]:
from sklearn import linear_model
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing

model = pipeline.make_pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(solver='sag', max_iter=10)
)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

%time model.fit(X_train, y_train)


CPU times: user 12.3 s, sys: 344 ms, total: 12.6 s
Wall time: 12.7 s




In [5]:
from sklearn import metrics

# Evaluate the fitted pipeline on the held-out test split and report the
# fraction of correctly classified samples as a percentage.
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'{accuracy:.2%} accuracy')


90.72% accuracy


In [10]:
from sklearn import decomposition

model = pipeline.make_pipeline(
    decomposition.PCA(n_components=20),
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression()
)

%time model.fit(X_train, y_train)


CPU times: user 1min, sys: 4.28 s, total: 1min 5s
Wall time: 9.06 s


In [11]:
# Score the PCA-based pipeline on the same held-out test split so the result
# is directly comparable with the baseline accuracy.
y_pred = model.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'{accuracy:.2%} accuracy')


87.84% accuracy


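PCA thus provides a way to trade accuracy for training speed. There is little reason to believe that PCA improves a model's accuracy. Note that the two pipelines above also use different solver settings (`sag` with `max_iter=10` versus the defaults), so the timings are only a rough comparison.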