作为一种没有显式训练和学习过程的分类和回归算法，k 近邻在众多有监督机器学习算法中算是一种比较独特的方法。说它独特，是因为 k 近邻不像其他模型有损失函数、有优化算法、有训练过程。对于给定的实例数据和实例数据对应所属类别，当要对新的实例进行分类时，根据这个实例最近的 k 个实例所属的类别来决定其属于哪一类。所以相对于其它机器学习模型和算法，k 近邻总体上而言是一种非常简单的方法。
![微信图片_20200515135711.jpg](https://img04.sogoucdn.com/app/a/100520146/28e7636e61ad9c31d011b193d83c4fa2)
![微信图片_20200515135911.jpg](https://img04.sogoucdn.com/app/a/100520146/ce4e4ab8237e8a96b6dae336c0cdbca7)

In [24]:
import numpy as np 
from sklearn.utils import shuffle 
from sklearn.datasets import load_boston 
from collections import Counter
boston=load_boston() 
x,y =shuffle(boston.data,boston.target,random_state=12)
x.astype(np.float32)
y.reshape((-1,1))
rate=.7 
offset=int(x.shape[0]*rate )
x_train,y_train=x[:offset],y[:offset]
x_test,y_test=x[offset:],y[offset:]
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((354, 13), (354,), (152, 13), (152,))

In [26]:
def compute_distance(x_test,x_train=x_train):
    dist=np.dot(x_test,x_train.T)
    te,tr=np.square(x_test).sum(axis=1),np.square(x_train).sum(axis=1)
    dist=np.sqrt(-2*dist+tr+np.matrix(te).T)
    return dist 
dist=compute_distance(x_test,x_train)

In [35]:
def predict_labels(dist,y_train,k=1):
    num_test=dist.shape[0]
    y_pred=np.zeros(num_test)
    for i in range(num_test):
        y=y_train[np.argsort(dist[i])].flatten()[0:k]
        y=Counter(y)
        y_pred[i]=y.most_common(1)[0][0]
    return y_pred.reshape((-1,1))
y_pred=predict_labels(dist,y_train) 

In [34]:
def accuracy(y_test,y_pred):
    print(y_test.shape,y_pred.shape)
    return (y_test==y_pred).sum()/len(y_test)
accuracy(y_test,y_pred)

(152,) (152, 1)


0.9210526315789473

In [44]:
def get_best_k(x_train,y_train):
    num_folds=5 
    x_train_fold,y_train_fold=np.array_split(x_train,num_folds),np.array_split(y_train,num_folds)
    ks=[1,2,4,8,10,12]
    acc={}
    for k in ks:
        for fold in range(num_folds):
            x_test,y_test=x_train_fold[fold],y_train_fold[fold]
            x_train,y_train=np.concatenate(x_train_fold[:fold]+x_train_fold[fold+1:]),np.concatenate(y_train_fold[:fold]+y_train_fold[fold+1:])
            dist=compute_distance(x_test,x_train)
            y_pred=predict_labels(dist,y_train,k)
            acc[k]=acc.get(k,[])+[accuracy(y_test,y_pred)]
    for k in acc:
        for a in acc[k]:
            print(f"k={k},accuracy: {a}")
    acc_mean=np.array([np.mean(v) for v in acc.values()])
    best_k=ks[np.argmax(acc_mean)]
    return best_k
best_k=get_best_k(x_train,y_train)
best_k

(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(71,) (71, 1)
(70,) (70, 1)
k=1,accuracy: 0.3380281690140845
k=1,accuracy: 0.23943661971830985
k=1,accuracy: 0.36619718309859156
k=1,accuracy: 0.3380281690140845
k=1,accuracy: 0.4714285714285714
k=2,accuracy: 0.3380281690140845
k=2,accuracy: 0.23943661971830985
k=2,accuracy: 0.36619718309859156
k=2,accuracy: 0.3380281690140845
k=2,accuracy: 0.4714285714285714
k=4,accuracy: 0.49295774647887325
k=4,accuracy: 0.23943661971830985
k=4,accuracy: 0.4084507042253521
k=4,accuracy: 0.352112676056338
k=4,accuracy: 0.4142857142857143
k=8,accuracy: 0.7746478873239436
k=8,accuracy: 0.28169014084507044
k=8,accuracy:

12

In [57]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.datasets import load_boston
from collections import Counter


def accuracy(y_test, y_pred):
    print(y_test.shape, y_pred.shape)
    return (y_test == y_pred).sum()/len(y_test)


class KNearestNeighbor():
    def __init__(self): ...

#     加载boston数据集
    def create_datasets(self, type="boston"):
        assert type == "boston", "仅支持boston"
        boston = load_boston()
        x, y = shuffle(boston.data, boston.target, random_state=12)
        x.astype(np.float32)
        y=y.reshape((-1, 1))
        rate = .7
        offset = int(x.shape[0]*rate)
        x_train, y_train = x[:offset], y[:offset]
        x_test, y_test = x[offset:], y[offset:]
        return x_train, y_train.reshape((-1,1)), x_test, y_test.reshape((-1,1))
#      5折交叉验证 寻找最佳k
    def cross_validation(self, x_train, y_train):
        num_folds = 5
        x_train_fold, y_train_fold = np.array_split(
            x_train, num_folds), np.array_split(y_train, num_folds)
        ks = [1, 2, 4, 8, 10, 12]
        acc = {}
        for k in ks:
            for fold in range(num_folds):
                x_test, y_test = x_train_fold[fold], y_train_fold[fold]
                x_train, y_train = np.concatenate(
                    x_train_fold[:fold]+x_train_fold[fold+1:]), np.concatenate(y_train_fold[:fold]+y_train_fold[fold+1:])
                dist = compute_distance(x_test, x_train)
                y_pred = self.predict_labels(dist, y_train, k)
                acc[k] = acc.get(k, [])+[accuracy(y_test, y_pred)]
        for k in acc:
            for a in acc[k]:
                print(f"k={k},accuracy: {a}")
        acc_mean = np.array([np.mean(v) for v in acc.values()])
        best_k = ks[np.argmax(acc_mean)]
        print(f"最佳k值为 {best_k}")
        return best_k
#     计算测试集与训练集的距离
    def compute_distance(self, x_test, x_train):
        dist = np.dot(x_test, x_train.T)
        te, tr = np.square(x_test).sum(axis=1), np.square(x_train).sum(axis=1)
        dist = np.sqrt(-2*dist+tr+np.matrix(te).T)
        return dist

    def predict_labels(self, dist, y_train, k=1):
        num_test = dist.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            y = y_train[np.argsort(dist[i])].flatten()[0:k]
            y = Counter(y)
            y_pred[i] = y.most_common(1)[0][0]
        return y_pred.reshape((-1, 1))


knn_classifier = KNearestNeighbor()
x_train, y_train, x_test, y_test = knn_classifier.create_datasets(
    type="boston")
best_k = knn_classifier.cross_validation(x_train, y_train)
dist = knn_classifier.compute_distance(x_test, x_train)
y_pred = knn_classifier.predict_labels(dist, y_train, best_k)
accuracy(y_test, y_pred)

(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(71, 1) (71, 1)
(70, 1) (70, 1)
k=1,accuracy: 0.0
k=1,accuracy: 0.014084507042253521
k=1,accuracy: 0.0
k=1,accuracy: 0.04225352112676056
k=1,accuracy: 0.02857142857142857
k=2,accuracy: 0.0
k=2,accuracy: 0.014084507042253521
k=2,accuracy: 0.0
k=2,accuracy: 0.04225352112676056
k=2,accuracy: 0.02857142857142857
k=4,accuracy: 0.014084507042253521
k=4,accuracy: 0.014084507042253521
k=4,accuracy: 0.0
k=4,accuracy: 0.04225352112676056
k=4,accuracy: 0.014285714285714285
k=8,accuracy: 0.014084507042253521
k=8,accuracy: 0.014084507042253521
k=8,accuracy: 0

0.03289473684210526