# 测试我们的算法

In [38]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

In [4]:
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

In [5]:
# Feature matrix and label vector taken from the loaded dataset.
X, y = iris.data, iris.target

In [6]:
# Inspect the dimensions of the feature matrix.
X.shape

(150, 4)

In [7]:
# Inspect the dimensions of the label vector.
y.shape

(150,)

## train-test-split

In [8]:
y  # y is already sorted by class label, so a split cannot simply take rows in order

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [9]:
# Random permutation of the row indices 0 .. len(X)-1 (150 shuffled indices here).
# A dedicated, seeded generator keeps the manual split reproducible on
# "Restart & Run All" without touching NumPy's global random state.
shuffle_indexes = np.random.default_rng(666).permutation(len(X))

In [10]:
# Display the shuffled index array.
shuffle_indexes

array([  3,  16,  77,  65,  20,  50,  44, 148, 146,  19,  89, 118,  71,
       135, 136,  86, 127,  90,  98,  56, 145,  96,   0, 130,  59,  52,
        57,  29,  85,   1, 141,  25,  67,  28,  69,  38, 129,  53,  95,
        23,  92, 117, 137,  15,  61,  62,  26,  83,  47, 122, 132, 112,
         9, 125,  66, 121,  64,  11,  94,  48,  84, 120, 119,   4, 126,
       104,  37,  78,  45,  68, 109,  17,  80, 100,  46,  10, 142,  39,
        24, 128,  43, 115,  76, 124,  41,  18,  21, 140, 133,  32,   6,
       131, 114, 134,  82, 113, 103,  79,  31,  40,  34, 111,  60,  49,
       108,  81,  87,  36,  97,  73, 106,  54,   5,  12,   7,  51, 149,
        75,  70, 110, 102, 138, 116,   8,   2, 105,  55,  88, 139,  14,
       107, 143,  35,  42,  72,  30,  91, 144,  74, 101,  93,  33, 123,
        22, 147,  13,  58,  99,  27,  63])

In [11]:
# Fraction of the data held out as the test set (20%).
test_ratio = 0.2
# Number of test samples; int() truncates any fractional part.
test_size = int(test_ratio * len(X))

In [12]:
# Display the computed test-set size.
test_size

30

In [13]:
# The first test_size shuffled indices form the test set; the remainder form the training set.
test_indexes, train_indexes = shuffle_indexes[:test_size], shuffle_indexes[test_size:]

In [14]:
# Select the training rows and the test rows with the shuffled index arrays
# (features and labels are indexed with the same indices so they stay aligned).
X_train, y_train = X[train_indexes], y[train_indexes]
X_test, y_test = X[test_indexes], y[test_indexes]

In [15]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [16]:
# Show the shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)


## 使用我们的算法

In [17]:
from KNN.model_selection import train_test_split  # adding an __init__.py to the directory lets that folder be imported as a package

In [18]:
# Split the data with the project's own train_test_split, using its default arguments.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [20]:
# Show the shape of the test feature matrix.
print(X_test.shape)

(30, 4)


In [21]:
# Show the shape of the test label vector.
print(y_test.shape)

(30,)


In [22]:
from KNN.KNN import KNNClassifier

In [23]:
# Instantiate the project's KNNClassifier with k=3.
my_knn_clf = KNNClassifier(k=3)

In [24]:
# Fit the classifier on the training split; the cell output is the value returned by fit().
my_knn_clf.fit(X_train, y_train)

KNN(k=3)

In [25]:
# Predict labels for the held-out test samples.
y_predict = my_knn_clf.predict(X_test)

In [26]:
y_predict  # predicted labels

array([0, 2, 1, 2, 0, 0, 0, 2, 0, 2, 1, 1, 2, 1, 1, 2, 0, 1, 0, 2, 0, 0,
       2, 1, 1, 1, 0, 2, 1, 2])

In [27]:
y_test  # true labels

array([0, 2, 1, 2, 0, 0, 0, 2, 0, 2, 1, 1, 2, 1, 1, 2, 0, 1, 0, 1, 0, 0,
       2, 1, 1, 1, 0, 2, 1, 2])

In [28]:
# Count the test samples whose predicted label equals the true label.
# np.sum reduces the boolean mask in vectorized NumPy code instead of
# iterating over it element by element with Python's built-in sum().
np.sum(y_predict == y_test)

29

In [29]:
# Accuracy: number of correct predictions divided by the number of test samples.
# np.sum counts the matches in vectorized NumPy code rather than via the
# element-by-element built-in sum().
np.sum(y_predict == y_test) / len(y_test)

0.9666666666666667

## sklearn中的train_test_split

In [30]:
# scikit-learn's train_test_split. NOTE: this rebinds the name train_test_split,
# replacing the KNN.model_selection version imported earlier in this notebook.
from sklearn.model_selection import train_test_split

In [37]:
# Call pattern: test_size is the fraction of samples held out for the test set,
# random_state is the random seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

In [34]:
# Show the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

(120, 4)
(120,)


In [36]:
# Show the shapes of the test features and labels, one per line.
print(X_test.shape, y_test.shape, sep="\n")

(30, 4)
(30,)
