In [41]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [42]:
iris_df =load_iris()
iris_df

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [43]:
len(iris_df)

7

In [44]:
X_train, X_test, y_train, y_test =train_test_split(iris_df.data,
                                                  iris_df.target, 
                                                  test_size=0.3, 
                                                  random_state=42)

In [45]:
#Pipeline Creation
##1. Data preprocessing by using Standard Scaler
### 2. Reduce Dimesnion using PCA
##Apply classifier

pipeline_lr =Pipeline([('scalar1', StandardScaler()), ('pca1', PCA(n_components=2)),
                      ('lr_classifier', LogisticRegression(random_state=0))])


In [46]:
pipeline_dt=Pipeline([('scalar2', StandardScaler()), 
                       ('pca2', PCA(n_components=2)),
                     ('dt_classifier', DecisionTreeClassifier())])

In [47]:
pipeline_rf =Pipeline([('scalar3', StandardScaler()),
                      ('pca3', PCA(n_components=2)),
                      ('rf_classifier', RandomForestClassifier())])

In [48]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf]

In [49]:
pipelines

[Pipeline(steps=[('scalar1', StandardScaler()), ('pca1', PCA(n_components=2)),
                 ('lr_classifier', LogisticRegression(random_state=0))]),
 Pipeline(steps=[('scalar2', StandardScaler()), ('pca2', PCA(n_components=2)),
                 ('dt_classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('scalar3', StandardScaler()), ('pca3', PCA(n_components=2)),
                 ('rf_classifier', RandomForestClassifier())])]

In [50]:
#initialize three variable
best_acuracy=0.0
best_classifier =0
best_pipeline =  ""

In [51]:
#dictionary of pipelines and classifier types for ease
#of reference

pipe_dict = {
    0: 'Logistic Regression', 1: 'Decision Tree',
    2: 'Random Forest'
}

#fir the pipelines
for pipe in pipelines:
    pipe.fit (X_train, y_train)

In [52]:
for i, model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],
                                       model.score(X_test, y_test)))

Logistic Regression Test Accuracy: 0.9111111111111111
Decision Tree Test Accuracy: 0.9555555555555556
Random Forest Test Accuracy: 0.9555555555555556


In [53]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test):
        best_accuracy =model.score(X_test, y_test)
        best_pipeline =model
        best_classifier=i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Random Forest


### Pipelines perform Hyperparameter Tuning using Grid SerachCV

In [54]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [58]:
#Create  a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

#create a dictionary with candidate learning algo and their paramters

grid_param = [
    {'classifier':[LogisticRegression()],
    'classifier__penalty': ['l2','l1'],
    'classifier__C':np.logspace(0,4,10)},
    
    {'classifier': [LogisticRegression()],
    'classifier__penalty': ['l2'],
     'classifier__C':np.logspace(0,4,10),
    'classifier__solver':['newton-cg','saga', 'sag', 'liblinear' ]
    },
    
    {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators':[10,100,1000],
        'classifier__max_depth': [5,8,15,25,30,None],
        'classifier__min_samples_leaf': [1,2,5,10,14,100],
        'classifier__max_leaf_nodes': [2,5,10]
    }
     
]

# Create a gridsearch of the pipeline , the fit the model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)
best_model =gridsearch.fit(X_train, y_train)



In [59]:
print(best_model.best_estimator_)
print ('The mean accuracy of the model is:', best_model.score(X_test, y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='sag'))])
The mean accuracy of the model is: 1.0


In [61]:
print(best_model.best_estimator_)

Pipeline(steps=[('classifier', LogisticRegression(solver='sag'))])


In [62]:
best_model.score(X_train, y_train)

0.9809523809523809

In [63]:
best_model.score(X_test, y_test)

1.0