### Import Libraries

In [6]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

### Load the Data

In [7]:
# Location of the dataset, relative to the notebook's working directory.
csv_path = "data/MNISTonly0_1.csv"

# Load the full table; the final column ('label') holds the target, the rest are features.
df = pd.read_csv(csv_path)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Without Pipeline

In [8]:
# Hold out a test set: every column except the last is a feature, 'label' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]],
    df['label'],
    random_state=0,
)

# Standardize: the scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# PCA: fitted on the (scaled) training split only; a fractional n_components asks
# for as many components as needed to reach that share of explained variance.
pca = PCA(n_components=0.90, random_state=0).fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

# Train the classifier on the reduced training data.
clf = LogisticRegression().fit(X_train, y_train)

# Report the classifier's score on the held-out split.
print(clf.score(X_test, y_test))

0.996


### With Pipeline
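A pipeline leaves less room for error: every step is fit on the training data and then applied to the test data automatically, so no transform can be forgotten or applied out of order.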

In [11]:
# Same split as before: all columns but the last are inputs, 'label' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    df[df.columns[:-1]],
    df['label'],
    random_state=0,
)

# Chain scaling, PCA and the classifier into one estimator so that a single
# fit/score call runs every step in order.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=0.90, random_state=0)),
        ('clf', LogisticRegression()),
    ]
)
pipe.fit(X_train, y_train)

# Report the pipeline's score on the held-out split (raw, untransformed test data goes in).
print(pipe.score(X_test, y_test))

0.996


### Visualize Pipeline

In [12]:
# Ask scikit-learn to render estimators as HTML diagrams instead of plain-text reprs.
# `set_config` is imported with the rest of the dependencies in the notebook's import cell.
set_config(display='diagram')

# Bare last expression so Jupyter shows the pipeline's rich (diagram) representation.
pipe