In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
class KNN_C:
    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.X = X
        self.y = y
    def predict(self, x):
        neighbor_k_indexs = np.argsort([((x - x_) ** 2).sum()**0.5 for x_ in self.X])[:self.k]
        class_counts = [np.count_nonzero(self.y[neighbor_k_indexs] == j) for j in range(self.y.max()+1)]
        return np.argmax(class_counts)

In [3]:
iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,shuffle=True)

In [4]:
knn_c = KNN_C(3)
knn_c.fit(X_train,y_train)
y_pred = np.array([knn_c.predict(X_test[i]) for i in range(len(X_test))])
(y_pred == y_test).sum()/len(y_pred)

0.98

In [5]:
class KNN_R:
    def __init__(self, k):
        self.k = k
    def fit(self, X, y):
        self.X = X
        self.y = y
    def predict(self, x):
        neighbor_k_indexs = np.argsort([((x - x_) ** 2).sum()**0.5 for x_ in self.X])[:self.k]
        return np.mean(self.y[neighbor_k_indexs])

In [6]:
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,shuffle=True)

In [7]:
knn_c = KNN_R(31)
knn_c.fit(X_train,y_train)
y_pred = np.array([knn_c.predict(X_test[i]) for i in range(len(X_test))])
mse = ((y_pred - y_test) ** 2).sum() ** 0.5
print(mse)

669.0875595397849


In [8]:
class KNN_OD:
    def __init__(self, k):
        self.k = k
    def fit(self, X):
        self.X = X
    def predict(self, x):
        k_dist = np.sort([((x - x_) ** 2).sum()**0.5 for x_ in self.X])[self.k - 1]
        return k_dist

In [9]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=2, cluster_std=1.0, random_state=42)
# 添加离群点
import numpy as np
X_outliers = np.random.uniform(low=-10, high=10, size=(20, 2))
X = np.vstack([X, X_outliers])
y = np.hstack([y, np.full(20, -1)])  # 用-1标记离群点

In [11]:
knn_od = KNN_OD(7)
knn_od.fit(X)
y_pred = np.array([knn_od.predict(X[i]) for i in range(len(X))])
print(y_pred, y)

[0.1965282  0.74877438 0.30542163 0.24510081 0.65769227 0.72744746
 0.29521538 0.4902486  0.28814528 0.77459591 0.23922116 0.26660493
 0.57582366 0.29894751 0.46527895 0.36273999 0.62998784 1.00805313
 0.34157533 0.63038887 0.32505283 0.41668979 0.1674055  0.18358681
 0.31111421 0.2389448  1.01750705 0.35716643 0.36883474 0.38232714
 0.93400383 0.4168594  0.78608997 0.26463096 0.23148228 0.56805952
 0.25359655 0.34923756 0.26684456 0.63081404 0.53516303 0.74146492
 0.40255197 0.40199998 0.36620599 0.2978238  0.30915486 0.23490706
 0.33612759 0.30695487 0.50954074 0.55546445 0.62998784 0.31592461
 0.23591596 0.31956284 0.45077317 0.53603017 0.25428199 0.51157261
 0.52113168 0.72762349 0.31417129 1.1107398  0.37496    0.23997945
 0.28195184 0.30110951 0.44531285 0.50757154 0.58358795 0.19501465
 0.31307388 0.35110104 0.23134768 0.29635163 0.4639958  0.40062831
 0.28373359 1.16189316 0.57193901 0.57892601 0.17730292 0.23922116
 0.23313547 0.38975537 0.2632035  0.28614736 0.3663722  0.5186