作为一种没有显式训练和学习过程的分类和回归算法，k 近邻在众多有监督机器学习算法中算是一种比较独特的方法。说它独特，是因为 k 近邻不像其他模型有损失函数、有优化算法、有训练过程。对于给定的实例数据和实例数据对应所属类别，当要对新的实例进行分类时，根据这个实例最近的 k 个实例所属的类别来决定其属于哪一类。所以相对于其它机器学习模型和算法，k 近邻总体上而言是一种非常简单的方法。
![微信图片_20200515135711.jpg](https://img04.sogoucdn.com/app/a/100520146/28e7636e61ad9c31d011b193d83c4fa2)
![微信图片_20200515135911.jpg](https://img04.sogoucdn.com/app/a/100520146/ce4e4ab8237e8a96b6dae336c0cdbca7)
[参考文章](https://mp.weixin.qq.com/s/XmUAYs-l5PqiTpXVVG2_WQ)

# iris数据集

In [60]:
import numpy as np 
from sklearn.utils import shuffle 
from sklearn.datasets import load_iris 
from collections import Counter
iris=load_iris() 
x,y =shuffle(iris.data,iris.target,random_state=12)
x.astype(np.float32)
y.reshape((-1,1))
rate=.7 
offset=int(x.shape[0]*rate )
x_train,y_train=x[:offset],y[:offset]
x_test,y_test=x[offset:],y[offset:]
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((354, 13),
 (354,),
 (152, 13),
 (152,),
 array([25. , 19.8, 13.3, 23.3, 23. , 13. , 24. , 46. ,  6.3, 32.2,  9.7,
        23. , 20.3, 17.5, 13.4, 23.1, 50. , 17. , 19.3, 19.3, 20. , 13.3,
        20. , 18.9, 18.3, 22.2, 50. , 33.3, 23.9, 24.5, 34.7, 18.4, 22. ,
        14.1, 18.9, 24.6, 19.6, 19.4, 38.7, 23.3, 18.6, 26.4, 31.6, 25. ,
        19.5, 17.8, 25. , 18.8, 44. ,  9.5, 12.8, 16.7, 22. , 18.5, 24.2,
        21.2, 16.1, 20.3, 32.4,  7.2, 15.2, 29. , 37.2, 30.3, 19.5, 16.3,
        13.6, 31.5, 22.4, 32.5, 37.9, 13.3, 20.6, 23.9, 19.6, 23.3, 13.1,
        16.8, 21.4, 23. , 34.6, 17.4, 18.7, 29.4, 20.4, 20.3, 11.8, 36.2,
        25. , 50. , 18.5, 48.5, 14.2, 24.8, 23.5, 33.8, 19.4, 50. , 24.3,
        13.9, 21.1, 12.7, 32. , 14.8, 21.7, 27. , 22. ,  7. , 23.6, 23.4,
        19.4, 17.3, 15.6, 24.4, 11.7, 15.6, 18.6, 12.6, 19.3, 24.3, 10.4,
         8.4, 27.5, 14.9, 37.3, 28.7, 24.8, 20.7, 50. , 20.1, 13.8,  8.8,
        24.1, 20.4, 20. ,  8.8, 29.8, 14.4, 11.5,  9.6, 19.4, 29.1, 16

# 计算距离

In [26]:
def compute_distance(x_test,x_train=x_train):
    dist=np.dot(x_test,x_train.T)
    te,tr=np.square(x_test).sum(axis=1),np.square(x_train).sum(axis=1)
    dist=np.sqrt(-2*dist+tr+np.matrix(te).T)
    return dist 
dist=compute_distance(x_test,x_train)

# KNN预测分类

In [35]:
def predict_labels(dist,y_train,k=1):
    num_test=dist.shape[0]
    y_pred=np.zeros(num_test)
    for i in range(num_test):
        y=y_train[np.argsort(dist[i])].flatten()[0:k]
        y=Counter(y)
        y_pred[i]=y.most_common(1)[0][0]
    return y_pred.reshape((-1,1))
y_pred=predict_labels(dist,y_train) 

# 计算精确度

In [34]:
def accuracy(y_test,y_pred):
    print(y_test.shape,y_pred.shape)
    return (y_test==y_pred).sum()/len(y_test)
accuracy(y_test,y_pred)

(152,) (152, 1)


0.9210526315789473

# 计算最好的k点

In [44]:
def get_best_k(x_train,y_train):
    num_folds=5 
    x_train_fold,y_train_fold=np.array_split(x_train,num_folds),np.array_split(y_train,num_folds)
    ks=[1,2,4,8,10,12]
    acc={}
    for k in ks:
        for fold in range(num_folds):
            x_test,y_test=x_train_fold[fold],y_train_fold[fold]
            x_train,y_train=np.concatenate(x_train_fold[:fold]+x_train_fold[fold+1:]),np.concatenate(y_train_fold[:fold]+y_train_fold[fold+1:])
            dist=compute_distance(x_test,x_train)
            y_pred=predict_labels(dist,y_train,k)
            acc[k]=acc.get(k,[])+[accuracy(y_test,y_pred)]
    for k in acc:
        for a in acc[k]:
            print(f"k={k},accuracy: {a}")
    acc_mean=np.array([np.mean(v) for v in acc.values()])
    best_k=ks[np.argmax(acc_mean)]
    return best_k
best_k=get_best_k(x_train,y_train)
best_k

(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
k=1,accuracy: 0.3380281690140845
k=1,accuracy: 0.23943661971830985
k=1,accuracy: 0.36619718309859156
k=1,accuracy: 0.3380281690140845
k=1,accuracy: 0.4714285714285714
k=2,accuracy: 0.3380281690140845
k=2,accuracy: 0.23943661971830985
k=2,accuracy: 0.36619718309859156
k=2,accuracy: 0.3380281690140845
k=2,accuracy: 0.4714285714285714
k=4,accuracy: 0.49295774647887325
k=4,accuracy: 0.23943661971830985
k=4,accuracy: 0.4084507042253521
k=4,accuracy: 0.352112676056338
k=4,accuracy: 0.4142857142857143
k=8,accuracy: 0.7746478873239436
k=8,accuracy: 0.28169014084507044
k=8,accuracy:

12

# kNN 算法的函数化封装

In [62]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.datasets import load_boston,load_iris
from collections import Counter


def accuracy(y_test, y_pred):
    print(y_test.shape, y_pred.shape)
    return (y_test == y_pred).sum()/len(y_test)


class KNearestNeighbor():
    def __init__(self): ...

#     加载boston数据集
    def create_datasets(self, type="iris"):
        assert type == "iris", "仅支持iris"
        iris = load_iris()
        x, y = shuffle(iris.data, iris.target, random_state=12)
        x.astype(np.float32)
        y=y.reshape((-1, 1))
        rate = .7
        offset = int(x.shape[0]*rate)
        x_train, y_train = x[:offset], y[:offset]
        x_test, y_test = x[offset:], y[offset:]
        return x_train, y_train.reshape((-1,1)), x_test, y_test.reshape((-1,1))
#      5折交叉验证 寻找最佳k
    def cross_validation(self, x_train, y_train):
        num_folds = 5
        x_train_fold, y_train_fold = np.array_split(
            x_train, num_folds), np.array_split(y_train, num_folds)
        ks = [1, 2, 4, 8, 10, 12]
        acc = {}
        for k in ks:
            for fold in range(num_folds):
                x_test, y_test = x_train_fold[fold], y_train_fold[fold]
                x_train, y_train = np.concatenate(
                    x_train_fold[:fold]+x_train_fold[fold+1:]), np.concatenate(y_train_fold[:fold]+y_train_fold[fold+1:])
                dist = compute_distance(x_test, x_train)
                y_pred = self.predict_labels(dist, y_train, k)
                acc[k] = acc.get(k, [])+[accuracy(y_test, y_pred)]
        for k in acc:
            for a in acc[k]:
                print(f"k={k},accuracy: {a}")
        acc_mean = np.array([np.mean(v) for v in acc.values()])
        best_k = ks[np.argmax(acc_mean)]
        print(f"最佳k值为 {best_k}")
        return best_k
#     计算测试集与训练集的距离
    def compute_distance(self, x_test, x_train):
        dist = np.dot(x_test, x_train.T)
        te, tr = np.square(x_test).sum(axis=1), np.square(x_train).sum(axis=1)
        dist = np.sqrt(-2*dist+tr+np.matrix(te).T)
        return dist
#     选择周围K个点中最常见的标签
    def predict_labels(self, dist, y_train, k=1):
        num_test = dist.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y = y_train[np.argsort(dist[i])].flatten()[0:k]
            y = Counter(y)
            y_pred[i] = y.most_common(1)[0][0]
        return y_pred.reshape((-1, 1))


knn_classifier = KNearestNeighbor()
x_train, y_train, x_test, y_test = knn_classifier.create_datasets(
    type="boston")
best_k = knn_classifier.cross_validation(x_train, y_train)
dist = knn_classifier.compute_distance(x_test, x_train)
y_pred = knn_classifier.predict_labels(dist, y_train, best_k)
print(y_pred)
accuracy(y_test, y_pred)

(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
(21, 1) (21, 1)
k=1,accuracy: 1.0
k=1,accuracy: 1.0
k=1,accuracy: 1.0
k=1,accuracy: 0.9047619047619048
k=1,accuracy: 0.9523809523809523
k=2,accuracy: 1.0
k=2,accuracy: 1.0
k=2,accuracy: 1.0
k=2,accuracy: 0.9047619047619048
k=2,accuracy: 0.9523809523809523
k=4,accuracy: 1.0
k=4,accuracy: 1.0
k=4,accuracy: 1.0
k=4,accuracy: 0.9047619047619048
k=4,accuracy: 0.9523809523809523
k=8,accuracy: 1.0
k=8,accuracy: 1.0
k=8,accuracy: 1.0
k=8,accuracy: 0.9047619047619048
k=8,accuracy: 1.0
k=10,accuracy: 1.0
k=10,accuracy: 0.9523809523809523
k=

0.9555555555555556