In [1]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter

In [2]:
# Load scikit-learn's breast cancer dataset and hold out 20% of the samples for testing.
# random_state is fixed so the same split is produced on every run.
dataset = datasets.load_breast_cancer()
X_train, X_test, Y_train, Y_test = train_test_split(dataset.data, dataset.target, test_size = 0.2, random_state = 0)

In [3]:
# Reference model: scikit-learn's k-NN classifier, fitted on the training split.
k_neighbours = 8
clf = KNeighborsClassifier(n_neighbors=k_neighbours)
# Left as the cell's last expression so the fitted estimator is displayed.
clf.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [4]:
# Accuracy of the scikit-learn classifier on the held-out split, obtained two ways:
# the estimator's own score() and accuracy_score() on explicit predictions.
Y_pred = clf.predict(X_test)
print(
    'In-Built KNN score - k =', k_neighbours, '-', 'using clf.score:',
    clf.score(X_test, Y_test),
)
print(
    'In-Built KNN score - k =', k_neighbours, '-', 'using accuracy_score:',
    accuracy_score(Y_test, Y_pred),
)

In-Built KNN score - k = 8 - using clf.score: 0.9473684210526315
In-Built KNN score - k = 8 - using accuracy_score: 0.9473684210526315


### K-NN Classifier Implementation 

In [5]:
def train(x, y):
    """Placeholder for a training step.

    K-NN has no explicit training phase, so both arguments are ignored
    and nothing is computed.

    Returns:
        None
    """
    return None

In [6]:
def predict_one(x_train, y_train, x_test_point, k):
    """Classify a single sample by majority vote among its k nearest neighbours.

    Neighbours are ranked by squared Euclidean distance (Minkowski distance
    with p = 2, without the square root, which does not change the ranking).
    Training points at exactly the same distance are ranked by their index,
    lowest first.

    Args:
        x_train: Training features, one row per sample. Must support
            element-wise arithmetic and ``.sum(axis=1)``
            (assumed to be a 2-D NumPy array -- confirm against callers).
        y_train: Training labels, indexable by row position of ``x_train``.
        x_test_point: Feature values of the sample to classify; must
            broadcast against the rows of ``x_train``.
        k: Number of nearest neighbours taking part in the vote.

    Returns:
        The most common label among the k nearest neighbours. When several
        labels share the highest count, the one belonging to the closest
        neighbour wins, because Counter.most_common keeps equal counts in
        first-encountered order and labels are counted nearest-first.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    n_samples = len(x_train)
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "({}), got {}".format(n_samples, k)
        )

    # One vectorised pass: squared distance from every training row to the sample.
    squared_distances = ((x_train - x_test_point) ** 2).sum(axis=1)

    # A stable sort keeps equally distant points in index order.
    nearest_indices = squared_distances.argsort(kind="stable")[:k]

    # Labels of the k nearest neighbours, closest first.
    targets = [y_train[index] for index in nearest_indices]

    return Counter(targets).most_common(1)[0][0]

In [7]:
def predict(x_train, y_train, x_test_data, k):
    """Predict a class label for every sample in ``x_test_data``.

    Each sample is classified independently by ``predict_one`` using the
    given training data and number of neighbours ``k``.

    Returns:
        A list of predicted labels, in the same order as ``x_test_data``.
    """
    return [
        predict_one(x_train, y_train, sample, k)
        for sample in x_test_data
    ]

In [8]:
# Evaluate the hand-written k-NN on the same split, using the same k as the scikit-learn model above.
k_neighbours = 8
y_pred = predict(X_train, Y_train, X_test, k_neighbours)
print('Implemented KNN score - k =', k_neighbours, '--', accuracy_score(Y_test, y_pred))

Implemented KNN score - k = 8 -- 0.9473684210526315


In [9]:
# Sample use of Counter
a = [1,2,3,2,2,2,3,4,3,4,2,3,4]
print(Counter(a).most_common())
Counter(a).most_common(1)[0][0]

[(2, 5), (3, 4), (4, 3), (1, 1)]


2