# Principal Component Analysis (PCA) using scikit-learn
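Specifies how to standardize the MNIST digits, reduce them with PCA so that 95% of the variance is retained, and then fit a logistic regression on the reduced data.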

In [1]:
# Import libraries
import pandas as pd
from matplotlib import pyplot as plt
import sklearn


In [2]:
# Load the MNIST digits (OpenML dataset 'mnist_784', version 1).
# cache=True keeps the downloaded copy on disk so later runs can reuse it.
from sklearn.datasets import fetch_openml

mnist = fetch_openml(name='mnist_784', version=1, cache=True)

In [3]:
# Hold out 1/7 of the samples for testing; the remaining 6/7 are used for training.
from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run.
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data,
    mnist.target,
    test_size=1 / 7.0,  # proportion of the original data that goes to the test set
    random_state=0,
)

In [4]:
# Standardize the data
# PCA is affected by feature scale, so every feature is first rescaled to
# unit scale (mean = 0, variance = 1).
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only.
scaler = StandardScaler().fit(train_img)

# Rescale both the training set and the test set with those training statistics.
train_img, test_img = scaler.transform(train_img), scaler.transform(test_img)

In [5]:
# Set up PCA
# Passing a fraction as n_components asks scikit-learn to pick the minimum
# number of components such that 95% of the variance is retained.
from sklearn.decomposition import PCA

# Create the (not yet fitted) model
pca = PCA(n_components=0.95)  # <------- The important bit

In [6]:
# Learn the principal components from the training images only (never from the test set!)
pca.fit(X=train_img)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [7]:
# Report how many principal components the fitted PCA kept in order to
# retain the requested 95% of the variance.
pca.n_components_

327

In [8]:
# Project both the training and the test set onto the components learned from the training set.
# NOTE: train_img / test_img are overwritten here, so re-running this cell on its own
# would feed already-reduced data back into pca; re-run the notebook from the top instead.
train_img, test_img = pca.transform(train_img), pca.transform(test_img)

## Optional: Run logistic regression on our reduced (and therefore much faster to train on) data set

In [9]:
# Set up logistic regression for the PCA-transformed data
from sklearn.linear_model import LogisticRegression

# Create the (not yet trained) model.
# Only the solver is overridden; every other parameter keeps its default.
# 'lbfgs' was chosen because the default solver was incredibly slow.
# NOTE(review): max_iter stays at its default; consider raising it if lbfgs
# reports a convergence warning during fitting.
logisticRegr = LogisticRegression(solver='lbfgs')

In [10]:
# Fit the classifier on the PCA-transformed training images and their labels
logisticRegr.fit(X=train_img, y=train_lbl)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Predict labels for new, unseen images.
# predict handles several observations in one call; here, the first ten test images.
logisticRegr.predict(test_img[:10])

array(['0', '4', '1', '2', '4', '4', '7', '1', '1', '7'], dtype=object)