# 交叉验证

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load scikit-learn's bundled digits dataset: feature matrix and integer labels.
digits = datasets.load_digits()
X = digits.data
y = digits.target
# Hold out 40% of the samples as the final test split. The shuffle seed is fixed so
# that a fresh "Restart & Run All" reproduces the same split and the same scores;
# outputs recorded from earlier unseeded runs will not match exactly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=666
)

### 一直以来的做法：Train Test Split 调参

In [3]:
# Baseline approach: rank every (k, p) candidate by its score on the test split.
best_score, best_k, best_p = 0, 0, 0
# k is the number of neighbours (2..10); p is the Minkowski distance exponent (1..5).
search_space = [(k, p) for k in range(2, 11) for p in range(1, 6)]
for k, p in search_space:
    knn = KNeighborsClassifier(n_neighbors=k, p=p, weights="distance")
    score = knn.fit(X_train, y_train).score(X_test, y_test)
    # Strict comparison: on ties the earliest candidate is kept.
    if score > best_score:
        best_score, best_k, best_p = score, k, p
print(f"Best score: {best_score}\nBest k: {best_k}\nBest p: {best_p}")

Best score: 0.9888734353268428
Best k: 3
Best p: 1


### 使用交叉验证调参

In [4]:
from sklearn.model_selection import cross_val_score

# Default kNN classifier, cross-validated on the training split only
# (the test split is not touched here). The per-fold scores are the cell output.
knn = KNeighborsClassifier()
cross_val_score(estimator=knn, X=X_train, y=y_train)

array([0.96759259, 0.96296296, 1.        , 0.97674419, 0.98139535])

In [5]:
# Tune k (2..10) and p (1..5) using the mean cross-validation score on the
# training split, so the test split plays no part in model selection.
best_score, best_k, best_p = 0, 0, 0
search_space = [(k, p) for k in range(2, 11) for p in range(1, 6)]
for k, p in search_space:
    knn = KNeighborsClassifier(n_neighbors=k, p=p, weights="distance")
    scores = cross_val_score(knn, X_train, y_train)
    score = scores.mean()  # average over the folds
    # Strict comparison: on ties the earliest candidate is kept.
    if score > best_score:
        best_score, best_k, best_p = score, k, p
print(f"Best score: {best_score}\nBest k: {best_k}\nBest p: {best_p}")

Best score: 0.9870155038759691
Best k: 2
Best p: 3


* 这里得到的 best_k、best_p 与前面直接用测试数据集调参得到的结果不一样；这种情况下，可以更加相信交叉验证的结果。
* 交叉验证得到的分数可能更低一些，但是没关系：前面的做法是针对测试数据集调参，可能过拟合了这组特定的测试数据，而交叉验证的分数没有这个问题。
* 交叉验证得到的分数并不是最终的成绩，还需要用测试数据集去完成最终评分。
* 在搜索的参数范围内，这里给出的 k 和 p 对应的 kNN 模型就是最优的。

In [6]:
# Build a fresh, unfitted classifier from the hyperparameters chosen by cross-validation.
best_knn_clf = KNeighborsClassifier(
    n_neighbors=best_k,
    p=best_p,
    weights="distance",
)

In [7]:
# Fit on the whole training split, then score once on the held-out test data set.
best_knn_clf.fit(X_train, y_train).score(X_test, y_test)

0.980528511821975

*sklearn 交叉验证的分组数可以通过 `cv` 参数调整；在撰写本文的时候，默认是分成 5 组（scikit-learn 0.22 起默认值由 3 组改为 5 组）。*
* 这个分数是在k个分组下（这里就是5）的交叉验证中找出的最好的模型的最终成绩
* 这个准确率，是值得相信的

### 回顾网格搜索

In [8]:
from sklearn.model_selection import GridSearchCV

# A single grid: distance-weighted voting, k in 2..10, Minkowski exponent p in 1..5.
param_grid = [
    dict(
        weights=["distance"],
        n_neighbors=list(range(2, 11)),
        p=list(range(1, 6)),
    )
]
knn = KNeighborsClassifier()
# The "CV" in GridSearchCV stands for Cross Validation.
grid_serch = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=1)
grid_serch.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


这里显示的Fitting 5 folds，就是说交叉验证时分成了多少组。
* 有45个参数组合
* 针对45个参数组合，分别生成5个模型进行训练，一共是225个模型（45 × 5 = 225）

In [9]:
# Best mean cross-validation score found by the grid search (not a test-set score).
grid_serch.best_score_

0.9870155038759691

In [10]:
# Hyperparameter combination that achieved the best cross-validation score.
grid_serch.best_params_

{'n_neighbors': 2, 'p': 3, 'weights': 'distance'}

In [11]:
bst_knn = grid_serch.best_estimator_ # the best model, i.e. the one built from the best parameters found

In [12]:
# Final evaluation: score the selected model on the held-out test split.
bst_knn.score(X_test, y_test)

0.980528511821975

* cross_val_score()中，cv参数用于控制分组个数
* GridSearchCV()也是。
