### Scikit Learn Pipeline

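Machine learning is rarely about applying a **single algorithm**: a typical workflow chains several steps (here scaling, dimensionality reduction, and classification), and a scikit-learn `Pipeline` bundles them into one estimator.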

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

**Dataset**

In [2]:
# Location of the dataset, relative to this notebook.
mnist_csv = '../Data/MNIST.csv'

# Read the whole file into a DataFrame.
df = pd.read_csv(mnist_csv)

# Preview the first rows (bare last expression -> rich table display).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


**Without** Pipeline

In [3]:
# Separate the features from the target.
# 'Unnamed: 0' (see the df.head() header) is not one of the 784 pixel columns;
# it appears to be the row index that was saved with the CSV, so it is
# excluded from the features. errors='ignore' keeps this working if the
# column is absent (e.g. the file was loaded with index_col=0).
X = df.drop(columns=['Unnamed: 0', 'label'], errors='ignore')
y = df['label']

# Split the dataset into train set and test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Standardize data: fit the scaler on the training set only, then apply the
# same transformation to the test set. Stage-suffixed names keep the raw split
# intact, so re-running any part of this cell never double-transforms the data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA, keeping enough components to explain 90% of the variance.
pca = PCA(n_components=0.90, random_state=0)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Apply Logistic Regression on the reduced features.
model = LogisticRegression()
model.fit(X_train_pca, y_train)

# Model Performance: mean accuracy on the held-out test set.
print(f'Score : {model.score(X_test_pca, y_test)*100:.2f}%')

Score : 99.70%


**With** Pipeline

In [4]:
# Fresh train/test partition of the raw features; the seed matches the manual
# workflow, so both approaches are evaluated on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Declare the stages in execution order: scale -> reduce -> classify.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.90, random_state=0)),
    ('logistic', LogisticRegression()),
]
pipe = Pipeline(steps=steps)

# A single call fits each transformer on the training data, then the classifier.
pipe.fit(X_train, y_train)

# Report held-out accuracy as a percentage.
accuracy = pipe.score(X_test, y_test)
print(f'Score : {accuracy*100:.2f}%')

Score : 99.70%


**Visualize** Pipeline

In [5]:
# Ask scikit-learn to render estimators as an HTML diagram rather than a
# plain-text repr. (`set_config` comes from the imports cell at the top of the
# notebook, so every dependency is declared in one place.)
set_config(display='diagram')

# A bare last expression lets Jupyter show the pipeline's rich representation.
pipe