## Import libraries 导入库

In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Load datasets 加载数据集

In [2]:
# Using "red wine dataset"
# Labels: quality of wine, range from 1-10
wine_x, wine_y = datasets.load_wine(return_X_y=True)

In [3]:
wine_df_x = pd.DataFrame(wine_x, columns=['Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 
                                          'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
                                          'Color intensity', 'Hue', 'OD280/OD315', 'Proline'])
wine_df_y = pd.DataFrame(wine_y, columns=['Target'])

In [4]:
wine_df_x.head()

Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


## Split the dataset into train and test set. 将数据集分为测试和训练数据集。

In [6]:
x_train, x_test, y_train, y_test = train_test_split(wine_df_x, wine_df_y, 
                                                    test_size=0.2, random_state=40, stratify=wine_df_y)

In [7]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 26 to 48
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Alcohol               142 non-null    float64
 1   Malic acid            142 non-null    float64
 2   Ash                   142 non-null    float64
 3   Alcalinity of ash     142 non-null    float64
 4   Magnesium             142 non-null    float64
 5   Total phenols         142 non-null    float64
 6   Flavanoids            142 non-null    float64
 7   Nonflavanoid phenols  142 non-null    float64
 8   Proanthocyanins       142 non-null    float64
 9   Color intensity       142 non-null    float64
 10  Hue                   142 non-null    float64
 11  OD280/OD315           142 non-null    float64
 12  Proline               142 non-null    float64
dtypes: float64(13)
memory usage: 15.5 KB


## Create pipeline. 创建Pipeline。

In [8]:
# define steps that need to be in the pipeline
steps = [('scaler', StandardScaler()), ('PCA', PCA()), ('DCT', DecisionTreeClassifier())]

In [9]:
# create pipeline object
pipeline = Pipeline(steps)

In [10]:
pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('PCA',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('DCT',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=None,
                                        splitter='best')

In [11]:
pipeline.score(x_test, y_test)

0.9444444444444444

## Use Pipeline with GridSearchCV. 结合使用Pipeline和GridSearchCV。

In [12]:
# define steps that need to be in the pipeline
steps_cv = [('scaler', StandardScaler()), ('PCA', PCA()), ('DCT', DecisionTreeClassifier())]

In [13]:
pipeline_cv = Pipeline(steps_cv)

In [14]:
# create a dictionary with parameters
parameters = {'DCT__criterion':['gini', 'entropy'], 'DCT__random_state':[10,20], 'DCT__max_depth':[5,10,15,20,25,30]}

In [15]:
# create a gridsearchCV object
grid_cv = GridSearchCV(pipeline, param_grid=parameters, cv=10)

In [16]:
# fit the training dataset on grid_cv
grid_cv.fit(x_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('PCA',
                                        PCA(copy=True, iterated_power='auto',
                                            n_components=None,
                                            random_state=None,
                                            svd_solver='auto', tol=0.0,
                                            whiten=False)),
                                       ('DCT',
                                        DecisionTreeClassifier(ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criter

In [17]:
grid_cv.cv_results_

{'mean_fit_time': array([0.01266522, 0.00648103, 0.00684102, 0.00667949, 0.00688508,
        0.00669038, 0.00637984, 0.00667729, 0.00718038, 0.0108655 ,
        0.01206846, 0.00953116, 0.0133673 , 0.03121681, 0.01805124,
        0.01117084, 0.00957658, 0.00907316, 0.00937455, 0.00847352,
        0.01005404, 0.01924884, 0.01371317, 0.01047213]),
 'mean_score_time': array([0.00229445, 0.00139437, 0.00178385, 0.00169773, 0.00129595,
        0.00178933, 0.00139613, 0.00149567, 0.00199516, 0.00269299,
        0.00279198, 0.00289037, 0.00379093, 0.00787911, 0.00469103,
        0.00478668, 0.00359097, 0.00209999, 0.00219228, 0.00165012,
        0.002491  , 0.00488696, 0.00289228, 0.00209455]),
 'mean_test_score': array([0.93666667, 0.93666667, 0.93666667, 0.93666667, 0.93666667,
        0.93666667, 0.93666667, 0.93666667, 0.93666667, 0.93666667,
        0.93666667, 0.93666667, 0.94380952, 0.95095238, 0.94380952,
        0.95095238, 0.94380952, 0.95095238, 0.94380952, 0.95095238,
        0.943

In [18]:
# create a dataframe to have better understanding of the cross validation results
grid_cv_df = pd.DataFrame(grid_cv.cv_results_)

In [19]:
grid_cv_df.head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_DCT__criterion,param_DCT__max_depth,param_DCT__random_state,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.012665,0.007491,0.002294,0.001098,gini,5,10,"{'DCT__criterion': 'gini', 'DCT__max_depth': 5...",0.933333,0.933333,...,1.0,0.857143,1.0,1.0,0.928571,0.928571,0.928571,0.936667,0.0499,13
1,0.006481,0.000497,0.001394,0.000486,gini,5,20,"{'DCT__criterion': 'gini', 'DCT__max_depth': 5...",0.933333,0.933333,...,1.0,0.857143,1.0,1.0,0.928571,0.857143,0.928571,0.936667,0.0499,13
2,0.006841,0.000708,0.001784,0.001171,gini,10,10,"{'DCT__criterion': 'gini', 'DCT__max_depth': 1...",0.933333,0.933333,...,1.0,0.857143,1.0,1.0,0.928571,0.928571,0.928571,0.936667,0.0499,13
3,0.006679,0.000633,0.001698,0.000644,gini,10,20,"{'DCT__criterion': 'gini', 'DCT__max_depth': 1...",0.933333,0.933333,...,1.0,0.857143,1.0,1.0,0.928571,0.857143,0.928571,0.936667,0.0499,13
4,0.006885,0.000527,0.001296,0.000457,gini,15,10,"{'DCT__criterion': 'gini', 'DCT__max_depth': 1...",0.933333,0.933333,...,1.0,0.857143,1.0,1.0,0.928571,0.928571,0.928571,0.936667,0.0499,13


In [20]:
grid_cv.score(x_test, y_test)

0.8611111111111112

In [21]:
print(grid_cv.best_params_)

{'DCT__criterion': 'entropy', 'DCT__max_depth': 5, 'DCT__random_state': 20}
