In [1]:
import warnings
warnings.filterwarnings("ignore")

In [23]:
## Importing required modules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import sklearn.externals
import joblib
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [5]:
df1 = load_iris()
df1.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df1.data, df1.target, test_size=0.3, random_state=0)

In [30]:
# Making the pipeline
pipleline_lr = Pipeline([
    
    
    ('scaler1', StandardScaler()),
    ('pca1', PCA(n_components=2)),
    ('lr_classifier', LogisticRegression(random_state=0))
])

pipleline_dt = Pipeline([
    
    
    ('scaler2', StandardScaler()),
    ('pca2', PCA(n_components=2)),
    ('dt_classifier', DecisionTreeClassifier())
])

pipleline_rf = Pipeline([
    
    
    ('scaler3', StandardScaler()),
    ('pca3', PCA(n_components=2)),
    ('rf_classifier', RandomForestClassifier())
])

In [31]:
# Make a list of pipelines
pipelines = [ pipleline_lr, pipleline_dt, pipleline_rf ]

In [32]:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""

In [33]:
# Dictionary of pipelines and classifier for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [34]:
for i, model in enumerate(pipelines):
    print('{} Test accuracy: {}'.format(pipe_dict[i], model.score(X_test, y_test)))

Logistic Regression Test accuracy: 0.8666666666666667
Decision Tree Test accuracy: 0.9111111111111111
RandomForest Test accuracy: 0.9111111111111111


In [35]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test)>best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
        
print('classifier with the best accuracy: {}'.format(pipe_dict[best_classifier]))

classifier with the best accuracy: Decision Tree


# Performing Hyper Parameters Tunning

In [43]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create dictionary with candidate learning algorithm and their hyperparameters
grid_param = [
        {'classifier': [LogisticRegression()],
         'classifier__penalty': ['12', '11'],
         'classifier__C': np.logspace(0, 4, 10)
         },
        
        {'classifier': [LogisticRegression()],
         'classifier__penalty': ['12'],
         'classifier__C': np.logspace(0, 4, 10),
         'classifier__solver': ['newton-cg', 'saga', 'sag', 'liblinear'] ## This solvers don't allow penalty
         },
        
        {'classifier': [RandomForestClassifier()],
         'classifier__n_estimators': [10, 100, 1000],
         'classifier__max_depth': [5, 8, 15, 25, 30, None],
         'classifier__min_samples_leaf': [1, 2, 5, 10, 15, 100],
         'classifier__max_leaf_nodes': [2, 5, 10]
         }
    ]
    
# Create a grid search of the pipeline, then fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1) # fit grid search
best_model = gridsearch.fit(X_train, y_train)

In [44]:
LogisticRegression().get_params().keys()

dict_keys(['C', 'class_weight', 'dual', 'fit_intercept', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_jobs', 'penalty', 'random_state', 'solver', 'tol', 'verbose', 'warm_start'])

In [46]:
print(best_model.best_estimator_)
print('The mean accuracy of the model:', best_model.score(X_test, y_test))

Pipeline(steps=[('classifier',
                 RandomForestClassifier(max_depth=30, max_leaf_nodes=5,
                                        min_samples_leaf=5, n_estimators=10))])
The mean accuracy of the model: 0.9777777777777777
