# sklearn pipeline

解决问题：

1、每次训练时都要对 *训练集* 和 *测试集* 重复做同样的数据清洗 => 把通用部分模块化，减少代码量

2、机器学习流程更直观

In [2]:
import numpy as np
import pandas as pd
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## 使用pipeline对手写数字进行PCA降维（数据清洗） + 预测

In [5]:
# The two pipeline stages: a PCA reducer and a logistic-regression
# classifier, both constructed without arguments.
pca, logistic = decomposition.PCA(), linear_model.LogisticRegression()

In [25]:
# Load the digits dataset, then pull out the feature matrix and the
# target labels in one step.
digits = datasets.load_digits()
X_digits, labels = digits.data, digits.target

In [8]:
# Chain the reducer and the classifier. The step names ('pca', 'logistic')
# are the prefixes used by the grid-search parameter keys further down.
pipe = Pipeline([
    ('pca', pca),
    ('logistic', logistic),
])

In [10]:
# Display the dimensions of the feature matrix as the cell output.
X_digits.shape

(1797, 64)

pipeline 参数：网格搜索时用 `步骤名__参数名`（双下划线）的形式指定 pipeline 中某一步骤的超参数，例如下文的 `pca__n_components` 和 `logistic__C`

In [21]:
# Candidate values explored by the grid search below.
# Number of principal components to keep in the 'pca' step.
n_components = [20, 40, 64]
# Three log-spaced values from 1e-4 to 1e4 for the 'logistic' step's C.
Cs = np.logspace(-4, 4, num=3)

In [23]:
# Fit the standalone PCA on the full feature matrix; the returned estimator
# is echoed as the cell output.
# NOTE(review): the grid search below is fitted separately on the pipeline,
# so this fit looks exploratory only -- confirm it is still needed.
pca.fit(X_digits)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [28]:
# Search over every combination of the candidate values. Each key has the
# form `<step name>__<parameter>` so the value is routed to that pipeline step.
estimator = GridSearchCV(
    pipe,
    param_grid={
        'pca__n_components': n_components,
        'logistic__C': Cs,
    },
)

In [29]:
# Run the grid search on the digits features and labels; the fitted search
# object is echoed as the cell output.
estimator.fit(X_digits, labels)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'logistic__C': array([1.e-04, 1.e+00, 1.e+04]), 'pca__n_components': [20, 40, 64]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
# Display the pipeline selected by the grid search.
estimator.best_estimator_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

## 自定义transformer

In [33]:
# Display the pipeline selected by the grid search.
# NOTE(review): this repeats the previous cell under the custom-transformer
# heading; presumably a placeholder for an example not yet written -- confirm.
estimator.best_estimator_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

## FeatureUnion