In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

breast_cancer = pd.read_csv("wisconsin_breast_cancer.csv")
breast_cancer = breast_cancer.dropna(axis=1, how="all")

train, test = train_test_split(breast_cancer, test_size=0.33, stratify=breast_cancer['diagnosis'])

In [8]:
print(f"# training samples {train.shape[0]}", f"# testing samples {test.shape[0]}", sep="\n")

# training samples 381
# testing samples 188


In [9]:
knn_model = NearestNeighbors(n_neighbors=7)

In [10]:
knn_model.fit(train.values[:, 2:])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                 radius=1.0)

In [11]:
distances, nearest_neighbours = knn_model.kneighbors(test.values[:, 2:])

In [12]:
train.values[nearest_neighbours[0], 1]

array(['B', 'B', 'B', 'B', 'B', 'B', 'B'], dtype=object)

In [13]:
distances[0]

array([ 9.1948234 , 11.58465417, 14.26999777, 15.50380134, 17.13673636,
       18.29751403, 20.14390398])

In [14]:
test.values[0, 1]

'B'

In [15]:
predictions = np.apply_along_axis(axis=1,
                    arr=train.values[nearest_neighbours, 1] == "B",
                    func1d = lambda arr: 'B' if np.sum(arr) >= 3 else 'M')

In [16]:
predictions

array(['B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M', 'B', 'B',
       'B', 'B', 'B', 'M', 'M', 'B', 'M', 'M', 'M', 'B', 'M', 'B', 'M',
       'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B',
       'M', 'B', 'B', 'B', 'B', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B',
       'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B',
       'B', 'B', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B',
       'B', 'B', 'B', 'M', 'B', 'B', 'M', 'M', 'B', 'B', 'B', 'M', 'B',
       'M', 'B', 'M', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'M', 'B', 'M',
       'B', 'M', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'M', 'B', 'B', 'M',
       'B', 'B', 'B', 'B', 'M', 'B', 'M', 'M', 'B', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'B', 'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'B',
       'M', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'B', 'M', 'B', 'B',
       'B', 'B', 'B', 'B', 'M', 'B', 'B', 'B', 'M', 'B', 'B', 'M

In [17]:
accuracy_score(test.values[:, 1], predictions)

0.9308510638297872