## 为什么要划分训练集和测试集？

如果将训练得到的模型直接在真实环境中使用，那么

1. 如果模型性能很差，有可能会造成真实的经济损失或安全风险
2. 在真实环境中难以拿到真实的label，因此无法评价预测是否准确

所以我们需要对已有的带标签数据集进行划分，划分为训练集和测试集：用训练集训练模型，用测试集来判断模型的好坏，从而在模型进入真实使用场景前对模型进行改进。

## 2.1 手动实现数据集划分

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [2]:
# Load the iris dataset bundled with scikit-learn
iris = datasets.load_iris()
iris.keys()  # list the fields available on the returned object

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
X = iris.data  # sample data (features)
y = iris.target  # labels
X.shape  # not echoed: the notebook only displays the cell's last expression
y.shape

(150,)

In [5]:
# train_test_split: divide the dataset into a training set and a test set.
# We cannot simply slice off a fixed number of samples as the test set,
# because y is sorted by class by default; the samples must be picked at random.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [6]:
# Shuffle first so that the split is a random sample.
# X and y are stored separately, but each label corresponds one-to-one to a
# sample, so there are two ways to shuffle:
# 1. Merge X and y, shuffle, then split the dataset apart again
# 2. Shuffle the indexes (the approach used here)
shuffled_indexes = np.random.permutation(len(X))  # random ordering of 0..len(X)-1
shuffled_indexes

array([116,  56,  74, 106,   3,  78, 133, 131,  46,  33,  20,  47,  77,
        52,   6, 109,  26,  39, 118,  51, 129, 128, 123, 111, 122,  93,
        90, 132,   8, 100,  69,  68, 102,  79,  76, 134,  95,  24,  88,
        71,  40, 120, 115,  98,  34,  41, 113,  28,  50, 141, 107,  84,
        63,  11,  94,  64,  16, 121, 108, 138,  87,  60,  15,  43,  81,
         0, 145,  23,   5,  67, 104, 110,  30,  57, 149, 126, 105,   2,
        91,  21,  83,  31,  65, 143,   1,  19,  42,  54,  96, 114,  97,
        37,  49, 148,  35, 125,  55, 139,  66,  12,  32,   4,  75,  89,
        29,  18,  62,  36,  99,  72,  53, 101,  92,  58,  10,  48,  13,
       136, 124,  70, 146,  86, 119,  61, 137,  38, 140,  59,  45, 103,
       117, 142, 127,  27,  17,  14,  25,  85,   7,  44,  80,  22, 112,
         9, 135, 147, 144, 130,  73,  82])

In [8]:
# Hold out 20% of the samples as the test set
test_ratio = 0.2
test_size = int(test_ratio * len(X))
# The first test_size shuffled indexes form the test set; the rest form the training set
test_indexes, train_indexes = shuffled_indexes[:test_size], shuffled_indexes[test_size:]

In [10]:
# Use fancy indexing to pick out the training and test samples with their labels
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

# Echo the shapes to confirm the sizes of the two subsets
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((120, 4), (120,), (30, 4), (30,))

## 2.2 使用scikit-learn中的train_test_split

In [12]:
from sklearn.model_selection import train_test_split

# random_state is the random seed; fixing it keeps the result identical across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

## 2.3 使用自封装的train_test_split方法

In [14]:
from playML.model_selection import train_test_split

# NOTE: train_test_split now refers to the playML version imported in this cell,
# shadowing the sklearn function of the same name; it is called with its defaults.
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((120, 4), (120,), (30, 4), (30,))

## 对我们的KNN算法进行测试

In [16]:
from playML.KNN_simulation import KNNClassifier

# Evaluate the self-implemented KNN classifier on the held-out test set
my_knn_clf = KNNClassifier(k = 3)
my_knn_clf.fit(X_train, y_train)  # fit the model on the training set
y_predict = my_knn_clf.predict(X_test)  # predict labels for the test samples

In [17]:
np.mean(y_predict == y_test)  # accuracy: fraction of test samples predicted correctly

0.9333333333333333