# Basic Implementation of Pipeline

In [1]:
from pyforest import*

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [3]:
iris_df=load_iris()
X_train,X_test,y_train,y_test=train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)
pipeline_lr=Pipeline([
                     ('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)), 
                     ('imputer', SimpleImputer(strategy='most_frequent')),
                     ('lr_classifier',LogisticRegression(random_state=0))
                    ])
model = pipeline_lr.fit(X_train, y_train)
model.score(X_test,y_test)

0.8666666666666667

###### Use this in pipeline if there are any categorical variables

('onehot', OneHotEncoder(handle_unknown='ignore')),

## Stacking Multiple Pipelines to Find the Model with the Best Accuracy

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

##### We build different pipelines for each algorithm and the fit to see which performs better

In [72]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)), 
                     ('lr_classifier',LogisticRegression())])
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])
pipeline_svm = Pipeline([('scalar3', StandardScaler()),
                      ('pca3', PCA(n_components=2)),
                      ('clf', svm.SVC())])
pipeline_knn=Pipeline([('scalar4',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('knn_classifier',KNeighborsClassifier())])
pipeline_sgd=Pipeline([('scalar5',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('sgd_classifier',SGDClassifier())])
pipeline_rf=Pipeline([('scalar6',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])
pipeline_nb=Pipeline([('scalar7',StandardScaler()),
                     ('knn_classifier',GaussianNB())])


In [5]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_svm, pipeline_knn, pipeline_sgd, pipeline_rf, pipeline_nb]

NameError: name 'pipeline_dt' is not defined

In [6]:
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Support Vector Machine',3:'K Nearest Neighbor',
             4: 'Stochastic Gradient descent', 5: 'Random Forrest', 6: 'Naive Bayes'
            }

In [7]:
for pipe in pipelines:
  pipe.fit(X_train, y_train)
for i,model in enumerate(pipelines):
    print("{} Test Accuracy:{}".format(pipe_dict[i],model.score(X_test,y_test)))

NameError: name 'pipelines' is not defined

##### Now you can see the Best accuracy.
But for suppose SVM gave good accuracy and you want tune the model

In [76]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
pipe = make_pipeline((RandomForestClassifier()))

In [77]:
grid_param = [
{"randomforestclassifier": [RandomForestClassifier()],
"randomforestclassifier__n_estimators":[10,100,1000],            
 "randomforestclassifier__max_depth":[5,8,15,25,30,None],        
 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
"randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) 
best_model = gridsearch.fit(X_train,y_train)
best_model.score(X_test,y_test)

0.9777777777777777

In [86]:
best_model.best_params_

{'svc': SVC(C=1, coef0=0.001, gamma='auto', kernel='linear'),
 'svc__C': 1,
 'svc__coef0': 0.001,
 'svc__degree': 3,
 'svc__gamma': 'auto',
 'svc__kernel': 'linear'}

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
pipe = make_pipeline((SVC()))

In [9]:
grid_param={'svc':[SVC()],
    'svc__kernel':('linear', 'poly', 'rbf', 'sigmoid'),
      'svc__C':np.arange(1,42,10),
      'svc__degree':np.arange(3,6),   
      'svc__coef0':np.arange(0.001,3,0.5),
      'svc__gamma': ('auto', 'scale')}
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) 
best_model = gridsearch.fit(X_train,y_train)
best_model.score(X_test,y_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0.9777777777777777

In [10]:
best_model.best_params_

{'svc': SVC(C=1, coef0=0.001, gamma='auto', kernel='linear'),
 'svc__C': 1,
 'svc__coef0': 0.001,
 'svc__degree': 3,
 'svc__gamma': 'auto',
 'svc__kernel': 'linear'}