### 为什么要划分训练集和测试集？

如果将训练得到的模型直接在真实环境中使用，那么：
- 如果模型很差怎么办？这可能会造成真实的经济损失或安全风险
- 真实环境中难以拿到真实的 label，因此无法评价预测是否准确

所以我们需要对原始数据集进行划分，分割为<b>训练集和测试集</b>：用训练集训练模型，<b>利用测试集判断模型好坏</b>，在模型进入真实环境前改进模型。

### 测试我们的算法

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [3]:
# Load the iris dataset bundled with scikit-learn and list the fields it exposes.
iris = datasets.load_iris()
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
# Pull the feature matrix (X) and the label vector (y) out of the loaded dataset.
X, y = iris.data, iris.target

In [5]:
# Dimensions of the feature matrix: (number of samples, number of features).
X.shape

(150, 4)

In [6]:
# Dimensions of the label vector: one label per sample.
y.shape

(150,)

### train_test_split

将数据集分割为训练集和测试集

In [7]:
# The labels come sorted by class, so a contiguous slice cannot be used as
# the test set: it would contain only some of the classes.
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [11]:
# Shuffle first so that the test set is a random sample.
# X and y are separate arrays whose rows correspond one-to-one, so there are
# two ways to shuffle them consistently:
#   1. Concatenate X and y, shuffle the result, then split it apart again.
#   2. Shuffle the indices and use them to index both arrays (done here).
# No seed is set in the cells shown, so the permutation differs between runs.
shuffled_indexes = np.random.permutation(len(X))
shuffled_indexes

array([138,  11,   4,  96,  44,  28, 112,  35, 116,  45, 137,  82,  62,
        75,  46,  53,  27, 101, 145,  52,  77,  41,  85, 120, 129, 133,
        60, 114,  13,  24,   8, 103,   2,  69, 118,  39, 110,  21, 144,
        87,  76,  10,  15,   1,  34, 149,  99, 141, 143,  81,  54, 107,
        84,  19,   0, 123,  93,  51,  86,  37,  56, 147, 115,  26,  92,
        29,  88, 113,  63,   6,  73, 124, 146, 128, 111, 106, 122, 139,
        40, 119,  59,  72, 148, 142,  33,  94,  22,  17,  78,  48,  68,
       132, 100, 130,   5, 102,  31,  66,  36, 126,  57,  12, 135,  20,
       105,  49,   9,  98, 117,  14, 134,  67, 104,  47, 109,  90,  30,
        25,  38,   7,  79,  91, 131,  89,  18,  71, 136, 121,  55,  43,
       127, 140,  23,  42,  95,  65,   3,  83,  61,  32,  74,  58,  80,
        97, 125,  64, 108,  50,  16,  70])

In [12]:
# Hold out 20% of the samples for testing; int() truncates any fractional part.
test_ratio = 0.2
test_size = int(test_ratio * X.shape[0])

In [13]:
# The first `test_size` shuffled indices select the test set; the remaining
# indices select the training set.
test_indexes, train_indexes = (
    shuffled_indexes[:test_size],
    shuffled_indexes[test_size:],
)

In [14]:
# Index both arrays with the same integer index arrays (fancy indexing) so
# that every sample stays paired with its label in each subset.
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [15]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [16]:
# Show the shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 使用scikit-learn中的train_test_split

In [17]:
from sklearn.model_selection import train_test_split
# Evaluating the bare name displays the function and its signature.
train_test_split

<function sklearn.model_selection._split.train_test_split(*arrays, **options)>

In [18]:
# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)
# random_state seeds the random number generator so that repeated runs give the same split.

In [19]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [20]:
# Show the shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 封装自己的train_test_split方法

In [21]:
# NOTE: this rebinds the name `train_test_split`, replacing the scikit-learn
# function imported earlier with the project's own implementation.
from playML.model_selection import train_test_split

# Called with the implementation's default arguments; the shapes printed
# afterwards suggest a 20% test split — confirm against playML.model_selection.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [23]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [24]:
# Show the shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


### 测试我们的KNN算法

In [25]:
from playML.KNN_simulation import KNNClassifier

# Fit the project's own kNN classifier (k=3) on the training split, then
# predict labels for the held-out test samples.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(X_train, y_train)
y_predict = my_knn_clf.predict(X_test)

In [26]:
y_predict  # predicted labels

array([1, 1, 0, 2, 1, 1, 2, 0, 2, 0, 2, 1, 2, 0, 1, 0, 2, 1, 2, 2, 1, 2,
       0, 1, 1, 0, 2, 0, 0, 2])

In [27]:
y_test   # true labels

array([1, 1, 0, 2, 1, 1, 2, 0, 2, 0, 1, 1, 2, 0, 1, 0, 2, 1, 2, 2, 1, 2,
       0, 1, 1, 0, 2, 0, 0, 2])

In [30]:
# Accuracy: the fraction of test samples whose predicted label equals the
# true label. Taking the mean of the boolean comparison keeps the reduction
# inside NumPy rather than iterating over the array elements with Python's
# built-in sum().
np.mean(y_predict == y_test)

0.9666666666666667