In [29]:
import numpy as np
from collections import Counter
from sklearn.datasets import load_iris, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points. Anything ``np.asarray`` accepts
        (NumPy arrays, lists, tuples); the shapes must be broadcastable.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x1 - x2) ** 2))``.
    """
    # Work in floating point: subtracting/squaring small unsigned integer
    # arrays (e.g. uint8 pixel data) silently wraps around otherwise, and
    # plain Python lists do not support element-wise subtraction at all.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

def test_accuracy_score(model, X, y):
    """Fit ``model`` on a train split of ``(X, y)`` and print its validation accuracy.

    The data is split with ``test_size=0.2`` and ``random_state=0``, so the
    same rows end up in the validation part on every call. ``model`` only
    needs ``fit(X, y)`` and ``predict(X)`` methods. Nothing is returned; the
    score is printed.
    """
    splits = train_test_split(X, y, random_state=0, test_size=0.2)
    train_features, val_features, train_labels, val_labels = splits

    model.fit(train_features, train_labels)
    val_predictions = model.predict(val_features)

    print("Accuracy score, ", accuracy_score(val_labels, val_predictions))

    
class kNN:
    '''A naive k-nearest-neighbours classifier.

    Features are standardised with a ``StandardScaler`` fitted on the training
    data, and a sample is assigned the most common label among its
    ``n_neighbors`` closest training samples (Euclidean distance).
    '''
    def __init__(self, n_neighbors=3):
        '''Create an unfitted classifier.

        Scaling is necessary because otherwise features with a large scale
        dominate the distance.
        E.g., 'age 0 ~ 120' will be shadowed by 'income 3000 ~ 10_000'.

        Parameters
        ----------
        n_neighbors : int
            Number of nearest training samples that vote on a prediction.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is smaller than 1 (there would be no voters).
        '''
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1, got %r" % (n_neighbors,))
        self.n_neighbors = n_neighbors
        self.scaler = StandardScaler()

    def fit(self, X, y):
        '''Memorise the standardised training samples and their labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples

        Returns
        -------
        kNN
            The fitted classifier itself, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        '''
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same number of samples, got %d and %d"
                % (len(X), len(y))
            )
        self.scaler.fit(X)
        self.X = self.scaler.transform(X)
        # Store labels as an array so that they are always indexed by
        # position (a pandas Series would be indexed by label instead).
        self.y = np.asarray(y)
        return self

    def predict(self, X):
        '''Predict a label for every row of ``X``.

        ``X`` is scaled with the statistics learned in ``fit`` before the
        neighbours are looked up. Returns a NumPy array with one label per row.
        '''
        X_s = self.scaler.transform(X)
        return np.array([self._predict(x) for x in X_s])

    def _predict(self, x):
        '''Return the majority label among the nearest neighbours of ``x``.

        ``x`` is a single, already scaled sample. When several labels are
        equally frequent, the label that appears first when walking the
        neighbours from nearest to farthest wins.
        '''
        # Distances to all training samples in one vectorised expression.
        distances = np.sqrt(np.sum((self.X - x) ** 2, axis=1))
        # A stable sort keeps equidistant samples in training order, which
        # makes tie-breaking deterministic.
        k_indexes = np.argsort(distances, kind="stable")[:self.n_neighbors]
        votes = Counter(self.y[i] for i in k_indexes)
        return votes.most_common()[0][0]
    
        
# Three reference points for a quick manual check of the distance helper.
x1, x2, x3 = np.array([1, 0]), np.array([0, 1]), np.array([2, 1])

# Evaluated but not shown: a cell only echoes its last expression.
euclidean_distance(x1, x2)

# Distance from the query point (2, 0) to each reference point.
dists = list(map(
    lambda sample: euclidean_distance(np.array([2, 0]), sample),
    (x1, x2, x3),
))

# Positions of the two closest reference points.
np.argsort(dists)[:2]


array([0, 2])

In [31]:
d = load_iris()
X, y = d.data, d.target

# Neighbourhood size: integer part of the square root of the sample count.
K = int(np.sqrt(X.shape[0]))
print(K)

# Custom implementation defined above.
model = kNN(n_neighbors=K)
test_accuracy_score(model, X, y)

# Reference implementation, evaluated with the same K so that the two
# scores are comparable (it was previously hard-coded to 6 neighbours).
# Note that it is fitted on the raw features: no scaling step is applied here.
model = KNeighborsClassifier(n_neighbors=K)
test_accuracy_score(model, X, y)

12
Accuracy score,  1.0
Accuracy score,  1.0


In [28]:
# NOTE(review): the diabetes data is loaded here (rebinding d, X and y) but
# nothing below in this cell uses it.
d = load_diabetes()
X, y = d.data, d.target

print("Test on diabetes")

# Toy matrix whose two columns live on very different scales.
train = np.array([[50, 0.2], [100, 0.1], [75, 0.3]])
# Column vector of shape (2, 1). The scaler below is fitted on two features,
# which is presumably why transforming `test` is left disabled - TODO confirm.
test = np.array([105, 150]).reshape(-1, 1)

ss = StandardScaler()
s_train = ss.fit(train).transform(train)
# s_test = ss.transform(test)
s_train
# , s_test


Test on diabetes


array([[-1.22474487e+00, -3.39934989e-16],
       [ 1.22474487e+00, -1.22474487e+00],
       [ 0.00000000e+00,  1.22474487e+00]])