In [1]:
# This cell prepares everything used below: it splits the data, fits a
# min/max scaler on the training part, and imports the SVM classifier.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Load the breast-cancer dataset and split it into training and test sets.
cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# Compute the minimum and maximum from the training data only.
scaler = MinMaxScaler().fit(x_train)

In [2]:
# Rescale the training data with the scaler fitted above.
x_train_scaled = scaler.transform(x_train)

# Train the SVM on the scaled training data (fit returns the estimator itself).
svm = SVC(gamma='auto').fit(x_train_scaled, y_train)

# Rescale the test set with the same scaler, then score the model on it.
x_test_scaled = scaler.transform(x_test)
svm.score(x_test_scaled, y_test)

0.951048951048951

本章介绍如何使用 Pipeline 类来简化构建变换和模型链的过程，以及如何将 Pipeline 与 GridSearchCV 结合起来。

# 6.1用预处理进行参数选择

In [4]:
# WARNING: this grid search contains a subtle pitfall. x_train_scaled was
# produced by a scaler fitted on ALL of the training data, and only afterwards
# is it split into folds by the cross-validation inside GridSearchCV. The
# held-out part of every fold has therefore already leaked into the modelling
# process, the folds no longer behave like genuinely new data, and the
# cross-validation result is overly optimistic.
# Do not use this approach in practice.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(x_train_scaled, y_train)

# Best cross-validation score, test-set score and best parameters, one per line.
print(
    grid.best_score_,
    grid.score(x_test_scaled, y_test),
    grid.best_params_,
    sep='\n',
)

0.9812206572769953
0.972027972027972
{'C': 1, 'gamma': 1}


在交叉验证的过程中，应该在进行任何预处理之前完成数据集的划分。任何从数据集中提取信息的处理过程都应该仅应用于数据集的训练部分，因此任何交叉验证都应该位于处理过程的“最外层循环”。

pipeline类最常见的用例是将预处理步骤（比如数据缩放）与一个监督模型（比如分类器）连接在一起

# 6.2构建管道

In [3]:
# Workflow: scale the data with MinMaxScaler, then train an SVM (no grid
# search yet). The pipeline reduces the amount of code needed for
# "preprocessing + classification"; its main advantage is that the whole
# pipeline is a single estimator that can be used in cross_val_score or
# GridSearchCV.
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ('svm', SVC(gamma='auto')),
])
pipe.fit(x_train, y_train)
pipe.score(x_test, y_test)

0.951048951048951

# 6.3在网格搜索中使用管道

In [4]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter>";
# here both keys target the step named 'svm'.
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [5]:
# Grid-search over the whole pipeline using the unscaled training data; the
# scaler is part of the estimator being cross-validated.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(x_train, y_train)

# Report the best cross-validation score, the test-set score and the best parameters.
print(grid.best_score_)
print(grid.score(x_test, y_test))
print(grid.best_params_)

0.9812206572769953
0.972027972027972
{'svm__C': 1, 'svm__gamma': 1}


这一次，在交叉验证的每次划分中，仅使用训练部分对 MinMaxScaler 进行拟合，测试部分的信息没有泄露到建模过程（即参数搜索）中。

# 信息泄露举例

In [10]:
import numpy as np
from sklearn.feature_selection import SelectPercentile, f_regression

# Synthetic data: 100 samples with 10,000 features, and a target that is
# drawn separately from the same random generator.
rnd = np.random.RandomState(seed=0)
x = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))

# Keep the top 5% of features, fitting the selector on ALL samples, i.e.
# outside of any cross-validation.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(x, y)
x_selected = select.transform(x)
x_selected.shape

(100, 500)

In [11]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# Cross-validate a ridge regression on the features that were selected
# beforehand using every sample.
cross_val_score(estimator=Ridge(), X=x_selected, y=y, cv=5)

array([0.84834054, 0.94084243, 0.88541709, 0.94012139, 0.91425508])

x 和 y 是相互独立生成的随机数，本质上没有任何关系。但由于我们在交叉验证之外对特征选择进行了拟合，测试部分泄露出去的信息量非常大，导致得到的结果好得不切实际。

In [12]:
# Put the feature selection inside the pipeline so that cross_val_score
# cross-validates the selection step together with the ridge regression.
pipe = Pipeline([
    ("selectpercentile", SelectPercentile(score_func=f_regression, percentile=5)),
    ('ridge', Ridge()),
])
cross_val_score(pipe, x, y, cv=5)

array([-0.97502994, -0.03166358, -0.03989415,  0.03018385, -0.2163673 ])

R²（r-squared）为负，表明模型很差——这才符合 x 与 y 之间没有关系这一事实。

# 6.4通用的管道接口

Pipeline类不但可以用于预处理和分类，实际上还可以将任意数量的估计器连接在一起，对于估计器的唯一要求是，除了最后一步外的所有步骤都需要具有transform方法，可以生成新的数据，以供下一个步骤使用

# 6.4.1用make_pipeline方便地创造管道

In [7]:
# make_pipeline is a convenience function for building a pipeline.
from sklearn.pipeline import make_pipeline

# Standard syntax: every step is given an explicit name.
pipe_long = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svm', SVC(C=100)),
])

# Abbreviated syntax: only the estimators are passed.
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

# pipe_long and pipe_short do exactly the same thing.

In [15]:
# Show the (name, estimator) pairs of the pipeline built by make_pipeline.
pipe_short.steps

[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
 ('svc', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
      decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
      kernel='rbf', max_iter=-1, probability=False, random_state=None,
      shrinking=True, tol=0.001, verbose=False))]

# 6.4.2访问步骤属性

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Build a pipeline in which the same estimator class (StandardScaler) appears
# twice, then list its steps to see how they were named.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)
pipe.steps

[('standardscaler-1',
  StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('pca',
  PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)),
 ('standardscaler-2',
  StandardScaler(copy=True, with_mean=True, with_std=True))]

In [14]:
# The usual way to inspect an attribute of one pipeline step is the
# named_steps attribute. Fit the pipeline on the cancer data (fit returns the
# pipeline itself), then read the components of its 'pca' step.
components = pipe.fit(cancer.data).named_steps['pca'].components_
components.shape

(2, 30)

# 6.4.3访问网格搜索管道中的属性

In [21]:
from sklearn.linear_model import LogisticRegression

# Standardize the features, then fit a logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)
pipe.steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('logisticregression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False))]

In [24]:
import warnings

# Silence all warnings. simplefilter installs a process-wide filter, so it
# stays in effect for every cell executed after this one.
warnings.simplefilter('ignore')

# Tune the regularization parameter C of the 'logisticregression' step.
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10, 100]}
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=4
)
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)
grid.best_estimator_

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=0.1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [25]:
# The best model found by GridSearchCV (trained on all of the training data)
# is stored in grid.best_estimator_.
# best_estimator_ is a pipeline with two steps: standardscaler and logisticregression.
# The pipeline's named_steps attribute gives access to the logisticregression step.
grid.best_estimator_.named_steps['logisticregression']

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Coefficients learned by the logistic-regression step of the best pipeline.
grid.best_estimator_.named_steps['logisticregression'].coef_

array([[-0.38856355, -0.37529972, -0.37624793, -0.39649439, -0.11519359,
         0.01709608, -0.3550729 , -0.38995414, -0.05780518,  0.20879795,
        -0.49487753, -0.0036321 , -0.37122718, -0.38337777, -0.04488715,
         0.19752816,  0.00424822, -0.04857196,  0.21023226,  0.22444999,
        -0.54669761, -0.52542026, -0.49881157, -0.51451071, -0.39256847,
        -0.12293451, -0.38827425, -0.4169485 , -0.32533663, -0.13926972]])

# 6.5网格搜索预处理步骤与模型参数

In [32]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases; confirm the target environment.
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures

boston = load_boston()
x_train, x_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0
)

# Scale the features, expand them into polynomial features, then fit a ridge regression.
pipe = make_pipeline(StandardScaler(), PolynomialFeatures(), Ridge())

# Search a preprocessing parameter (polynomial degree) together with a model
# parameter (ridge alpha).
param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
grid.fit(x_train, y_train)
grid.best_params_

{'polynomialfeatures__degree': 2, 'ridge__alpha': 10}

In [33]:
# Score the grid search's best model on the held-out test split.
grid.score(x_test,y_test)

0.7683045464100141

同时搜索预处理参数与模型参数是一个非常强大的策略，但是向网格中添加更多参数，需要构建的模型数量将呈指数增长

# 6.6网格搜索选择使用哪个模型

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Define the pipeline first, naming the steps explicitly. Two steps are
# needed: one for preprocessing and one for the classifier. The estimators
# given here are only starting values; the grid below replaces both steps.
pipe = Pipeline([
    ('preprocessing', StandardScaler()),
    ('classifier', SVC()),
])

# A list of grids: each dict is searched separately, so SVC-only and
# random-forest-only parameters never get combined.
param_grid = [
    # SVC, with and without standardization, over C and gamma.
    {
        'classifier': [SVC()],
        'preprocessing': [StandardScaler(), None],
        'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
        'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    },
    # Random forest without preprocessing, over max_features.
    # random_state is fixed so that the forest's cross-validation scores, and
    # therefore the outcome of the search, are reproducible across runs.
    {
        'classifier': [RandomForestClassifier(n_estimators=100, random_state=0)],
        'preprocessing': [None],
        'classifier__max_features': [1, 2, 3],
    },
]

x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)

# Report the best parameters, the best cross-validation score and the test-set score.
print(grid.best_params_)
print(grid.best_score_)
print(grid.score(x_test, y_test))

{'classifier': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
0.9859154929577465
0.9790209790209791
