# Demo Pipeline 2

## Example: choose best classifier

This notebook show how to implement a simple pipeline that let to choose the best classifier.  
We compare Logistic regression, Decision Tree and Random Forest performance on Iris dataset.

### Load libraries

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



### Load data

In [2]:
iris_df = load_iris()
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

### Split data

In [13]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size =0.8, random_state = 0)

### Create pipelines

Pipeline Creation:
1. Data Preprocessing by using Standard Scaler
2. Reduce Dimension using PCA
3. Apply Classifier

#### Logistic Regression pipeline

In [5]:
pipeline_lr = Pipeline([
                        ('scalar1', StandardScaler()),
                        ('pca1', PCA(n_components=2)),
                        ('lr_classifier', LogisticRegression(random_state=0))])
pipeline_lr

Pipeline(memory=None,
         steps=[('scalar1',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca1',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('lr_classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=0,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

#### Decission Tree pipeline

In [6]:
pipeline_dt = Pipeline([
                        ('scalar2', StandardScaler()),
                        ('pca2', PCA(n_components=2)),
                        ('dt_classifier', DecisionTreeClassifier())])

pipeline_dt

Pipeline(memory=None,
         steps=[('scalar2',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca2',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('dt_classifier',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitte

#### Random Forest pipeline

In [7]:
pipeline_randomforest = Pipeline([
                                  ('scalar3', StandardScaler()),
                                  ('pca3', PCA(n_components=2)),
                                   ('rf_classifier', RandomForestClassifier())])

pipeline_randomforest

Pipeline(memory=None,
         steps=[('scalar3',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca3',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf_classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators

#### Pipeline of pipelines

In [8]:
pipelines = [pipeline_lr, pipeline_dt,pipeline_randomforest]

### Comparison

In [9]:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression',
             1: 'Decision Tree',
             2: 'RandomForest'}

#### Fit each model (through the pipeline)

In [10]:
# Fit the pipeline
for pipe in pipelines:
  pipe.fit(X_train, y_train)

#### Get each model accuracy

In [15]:
for i, model in enumerate(pipelines):
  print("{} Test Accuracy: {}".format(pipe_dict[i], model.score(X_test, y_test)))
  if model.score(X_test, y_test) > best_accuracy:
    best_accuracy = model.score(X_test, y_test)
    best_pipeline = model
    best_classifier = i

Logistic Regression Test Accuracy: 0.825
Decision Tree Test Accuracy: 0.8916666666666667
RandomForest Test Accuracy: 0.8916666666666667


In [17]:
print("Classifier with best accuracy: {} with {} of accuracy".format(pipe_dict[best_classifier], best_accuracy))

Classifier with best accuracy: Decision Tree with 0.8916666666666667 of accuracy
