Python 实现 KNN：遍历所有训练数据点，找出与待预测样本距离最近的 $n$ 个点，统计它们的分类情况，按少数服从多数的原则（多数表决）确定预测类别

In [2]:
import numpy as np
import random
import math

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split
from collections import Counter

In [3]:
# Experiment parameter: choose the dataset.
# Exactly one of the lines below should be active; the others stay commented out.
#dataset = datasets.load_iris()
dataset = datasets.load_wine()
#dataset = datasets.load_breast_cancer()

In [18]:
# Experiment parameters: choose the dataset size and the random seed.
sample_rate = 0.5  # fraction of the dataset to keep, must lie in (0, 1]
random_seed = 42   # fixes both the subsample and the train/test split; None = fresh draw each run
assert 0 < sample_rate <= 1

n_total = dataset.data.shape[0]
nSample = math.floor(n_total * sample_rate)
# With fewer than two samples the 80/20 split below cannot produce both a training and a test set.
assert nSample >= 2, 'sample_rate is too small: at least 2 samples are required'
# A dedicated Random instance keeps the draw reproducible without touching the global RNG state.
idx = random.Random(random_seed).sample(range(n_total), nSample)

# Fancy indexing already returns a new array, so no extra slicing is needed.
X = dataset.data[idx]
y = dataset.target[idx]
y_stat = Counter(y)
print('======================= dataset information =======================')
print('Total sample number: %d, Feature dimension: %d, Category number: %d' % (X.shape[0], X.shape[1], len(y_stat)))
for category in y_stat:
    print('category %d has %d samples' % (category, y_stat[category]))

# Hold out 20% of the subsample for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)
print('Training sample number: %d, Test sample number: %d' % (X_train.shape[0], X_test.shape[0]))

Total sample number: 89, Feature dimension: 13, Category number: 3
category 2 has 24 samples
category 1 has 30 samples
category 0 has 35 samples
Training sample number: 71, Test sample number: 18


In [7]:
# KNN classifier definition
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Prediction compares the query point against every stored training
    sample, keeps the ``n_neighbors`` closest ones and returns the label
    that occurs most often among them.
    """

    def __init__(self, X_train, y_train, n_neighbors=3, p=2):  # change k via n_neighbors
        """Store the training data and the hyper-parameters.

        parameter: X_train     indexable collection of training samples
        parameter: y_train     labels, y_train[i] belongs to X_train[i]
        parameter: n_neighbors number of neighbours that vote (k)
        parameter: p           passed as ``ord`` to ``np.linalg.norm`` (2 = Euclidean)

        raises: ValueError if X_train and y_train differ in length, or if
                n_neighbors is not within 1..len(X_train)
        """
        n_samples = len(X_train)
        if len(y_train) != n_samples:
            raise ValueError(
                'X_train and y_train must have the same length, got %d and %d'
                % (n_samples, len(y_train)))
        # Fail early with a clear message instead of an IndexError deep inside predict().
        if not 1 <= n_neighbors <= n_samples:
            raise ValueError(
                'n_neighbors must be between 1 and the number of training samples (%d), got %r'
                % (n_samples, n_neighbors))
        self.n = n_neighbors
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the majority label among the ``self.n`` training samples nearest to X.

        Equal distances are ordered by training-set position (stable sort).
        If several labels receive the same number of votes, the label whose
        closest member is nearest to X wins.
        """
        distances = [
            np.linalg.norm(X - self.X_train[i], ord=self.p)
            for i in range(len(self.X_train))
        ]
        # One stable sort replaces re-scanning the candidate list for every training sample.
        nearest = np.argsort(distances, kind='stable')[:self.n]

        # Votes are inserted nearest-first; max() returns the first maximal key,
        # so a tie in vote count is resolved in favour of the closer neighbour.
        votes = Counter(self.y_train[int(i)] for i in nearest)
        return max(votes, key=votes.get)

    def score(self, X_test, y_test):
        """Return the fraction of samples in X_test whose predicted label equals y_test.

        raises: ValueError if X_test is empty (the ratio would be undefined)
        """
        n_test = len(X_test)
        if n_test == 0:
            raise ValueError('X_test is empty; the score is undefined')
        right_count = sum(1 for X, y in zip(X_test, y_test) if self.predict(X) == y)
        return right_count / n_test

In [8]:
# Experiment parameter: choose the value of K (number of voting neighbours).
K = 3
# K may not exceed the number of available training samples.
assert X_train.shape[0] >= K
clf = KNN(X_train, y_train, n_neighbors=K)
# score() yields the fraction of test samples that were classified correctly.
print('precision rate: {:.2%}'.format(clf.score(X_test, y_test)))

precision rate: 66.67%


In [9]:
# Experiment parameter: choose which test sample to classify.
test_idx = 0
# Reject negative values too, so the index always refers to the intended row.
assert 0 <= test_idx < X_test.shape[0]
# Use the selected index (previously hard-coded to 0, which ignored test_idx).
test_point = X_test[test_idx]
print('Test Point category: {}'.format(clf.predict(test_point)))

Test Point category: 2
