#Pipelines in Machine Learning

In [40]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [41]:
iris=datasets.load_iris()

In [42]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [43]:
X_train,X_test,y_train,y_test=train_test_split(iris.data,iris.target,test_size=0.3,random_state=42)

In [44]:
pipeline_lr=Pipeline([('scale1',StandardScaler()),('pca1',PCA(n_components=2)),('lr',LogisticRegression())])

In [45]:
pipeline_dt=Pipeline([('scale2',StandardScaler()),('pca2',PCA(n_components=2)),('dt',DecisionTreeClassifier())])

In [46]:
pipeline_rf=Pipeline([('scale3',StandardScaler()),('pca3',PCA(n_components=2)),('rf',RandomForestClassifier())])

In [47]:
pipelines=[pipeline_lr,pipeline_dt,pipeline_rf]
pipeline_dict={0:'Logistic Regression',1:'Decision Tree',2:'Random Forest'}

In [48]:
best_pipeline=""
best_classifier=0
best_accuracy=0.0

In [49]:
for i in pipelines:
  i.fit(X_train,y_train)

In [50]:
for i,model in enumerate(pipelines):
  print("{} Accuracy is {}".format(pipeline_dict[i],model.score(X_test,y_test)))

Logistic Regression Accuracy is 0.9111111111111111
Decision Tree Accuracy is 0.9555555555555556
Random Forest Accuracy is 0.9555555555555556


In [51]:
for i,model in enumerate(pipelines):
  if model.score(X_test,y_test)>best_accuracy:
    best_accuracy=model.score(X_test,y_test)
    best_classifier=i
    best_pipeline=model
print("Classifier that has best accuracy is {}".format(pipeline_dict[best_classifier]))

Classifier that has best accuracy is Decision Tree
