# 网格搜索

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load scikit-learn's bundled digits dataset and unpack it into the
# feature matrix X and the label vector y.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test split; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
)

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: a kNN classifier with hand-picked hyperparameters, fitted on the
# training split. fit() returns the estimator itself, so the calls are chained.
sk_knn_clf = KNeighborsClassifier(n_neighbors=4, weights="uniform").fit(X_train, y_train)
# Score on the held-out test split (displayed as the cell output).
sk_knn_clf.score(X_test, y_test)

0.9916666666666667

### Grid Search

在网格搜索中，用来评价分类器准确度的方式更加复杂，也就是 GridSearchCV 中 CV 所代表的意思：Cross Validation，即交叉验证。用交叉验证的方式得到的准确度，比我们用 train_test_split 这样的方式得到的准确度相对更加可靠。

In [13]:
# Search space for the kNN grid search: a list of dicts, where each dict is an
# independent sub-grid whose value lists are combined exhaustively.
param_grid = [
    {
        # Uniform voting: only the number of neighbours is varied ('p' is not
        # part of this sub-grid, so it stays at the estimator's own setting).
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),  # k = 1..10
    },
    {
        # Distance-weighted voting: vary k together with the Minkowski exponent p.
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),  # k = 1..10
        'p': list(range(1, 6)),             # p = 1..5
    },
]

In [14]:
# kNN classifier created with no explicit hyperparameters; the grid search
# below supplies the candidate values to try.
knn_clf = KNeighborsClassifier()

In [15]:
from sklearn.model_selection import GridSearchCV

# GridSearchCV takes the classifier to tune and the parameter grid to search over.
grid_search = GridSearchCV(estimator=knn_clf, param_grid=param_grid)

In [16]:
%%time
# Using only the training split (X_train, y_train), try every parameter
# combination defined in param_grid to look for the best model.
grid_search.fit(X_train, y_train)

Wall time: 3min 8s


GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
grid_search.best_estimator_ # the best classifier found by the search (an estimator object carrying the winning parameters, not just the parameters)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=3,
           weights='distance')

In [18]:
grid_search.best_score_ # cross-validation score achieved by the best parameter combination

0.9853862212943633

看似这个准确度低于前面用 train_test_split 得到的准确度，我们这个分类器的效果不是那么好，其实不然。这是因为两者的评判标准是不一样的：best_score_ 是在训练数据上做交叉验证得到的得分，而前面的准确度是在测试数据集上计算的。在机器学习领域，怎么评判算法的好坏是一个非常重要的课题，后续会有详细介绍。

In [19]:
grid_search.best_params_ # the winning parameter combination from the searched grid, as a dict

{'n_neighbors': 3, 'p': 3, 'weights': 'distance'}

这三个属性的名字后面都带有一个下划线 _，这是一个简单的编码约定：如果一个属性不是由用户传入的参数，而是这个类根据用户传入的参数自己计算出来的结果，那么在命名时就在名字后面加上一个下划线 _。

In [20]:
# Rebind knn_clf to the tuned classifier found by the grid search
# (this replaces the default-parameter instance created earlier).
knn_clf = grid_search.best_estimator_

In [21]:
# Predicted labels for every sample in the test split.
knn_clf.predict(X_test)

array([8, 1, 3, 4, 4, 0, 7, 0, 8, 0, 4, 6, 1, 1, 2, 0, 1, 6, 7, 3, 3, 6,
       5, 2, 9, 4, 0, 2, 0, 3, 0, 8, 7, 2, 3, 5, 1, 3, 1, 5, 8, 6, 2, 6,
       3, 1, 3, 0, 0, 4, 9, 9, 2, 8, 7, 0, 5, 4, 0, 9, 5, 5, 8, 7, 4, 2,
       8, 8, 7, 5, 4, 3, 0, 2, 7, 2, 1, 2, 4, 0, 9, 0, 6, 6, 2, 0, 0, 5,
       4, 4, 3, 1, 3, 8, 6, 4, 4, 7, 5, 6, 8, 4, 8, 4, 6, 9, 7, 7, 0, 8,
       8, 3, 9, 7, 1, 8, 4, 2, 7, 0, 0, 4, 9, 6, 7, 3, 4, 6, 4, 8, 4, 7,
       2, 6, 9, 5, 8, 7, 2, 5, 5, 9, 7, 9, 3, 1, 9, 4, 4, 1, 5, 1, 6, 4,
       4, 8, 1, 6, 2, 5, 2, 1, 4, 4, 3, 9, 4, 0, 6, 0, 8, 3, 8, 7, 3, 0,
       3, 0, 5, 9, 2, 7, 1, 8, 1, 4, 3, 3, 7, 8, 2, 7, 2, 2, 8, 0, 5, 7,
       6, 7, 3, 4, 7, 1, 7, 0, 9, 2, 8, 9, 3, 8, 9, 1, 1, 1, 9, 8, 8, 0,
       3, 7, 3, 3, 4, 8, 2, 1, 8, 6, 0, 1, 7, 7, 5, 8, 3, 8, 7, 6, 8, 4,
       2, 6, 2, 3, 7, 4, 9, 3, 5, 0, 6, 3, 8, 3, 3, 1, 4, 5, 3, 2, 5, 6,
       9, 6, 9, 5, 5, 3, 6, 5, 9, 3, 7, 7, 0, 2, 4, 9, 9, 9, 2, 5, 6, 1,
       9, 6, 9, 7, 7, 4, 5, 0, 0, 5, 3, 8, 4, 4, 3,

In [22]:
# Score of the tuned classifier on the held-out test split.
knn_clf.score(X_test, y_test)

0.9833333333333333

In [25]:
%%time
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2) 
# n_jobs: how many CPU cores are used in parallel; -1 uses all cores of the machine
#         (the GridSearchCV repr printed earlier in this notebook shows n_jobs=1 when it is not set).
# verbose: integer controlling progress output during the search; larger values print more detail.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   58.3s


Wall time: 1min 11s


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.2min finished


In [26]:
%%time
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=3) 
# n_jobs: how many CPU cores are used in parallel; -1 uses all cores of the machine
#         (the GridSearchCV repr printed earlier in this notebook shows n_jobs=1 when it is not set).
# verbose: integer controlling progress output during the search; larger values print more detail.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   45.0s


Wall time: 1min 10s


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.2min finished


In [29]:
%%time
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=10) 
# n_jobs: how many CPU cores are used in parallel; -1 uses all cores of the machine
#         (the GridSearchCV repr printed earlier in this notebook shows n_jobs=1 when it is not set).
# verbose: integer controlling progress output during the search; larger values print more detail.
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   34.4s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   45.1s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  1.1min


Wall time: 1min 9s


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.2min finished


### 更多的距离定义

向量空间余弦相似度  Cosine Similarity <br>
调整余弦相似度  Adjusted Cosine Similarity <br>
皮尔森相关系数  Pearson Correlation Coefficient <br>
Jaccard相似系数  Jaccard Similarity Coefficient