In [1]:
# Chapter 15. K-Nearest Neighbors

In [8]:
# 15.1 Finding an Observation's Nearest Neighbors
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the iris measurements and their class labels
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Rescale the features with StandardScaler before measuring distances
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Build a neighbor lookup that returns the two closest training rows
nearest_neighbors = NearestNeighbors(n_neighbors=2)
nearest_neighbors.fit(features_standardized)

# Query point; it is wrapped in a list below so it is passed as a one-row batch
new_observation = [1, 1, 1, 1]
distances, indices = nearest_neighbors.kneighbors([new_observation])

# Show the matched (standardized) rows, a separator, then their distances
print(
    features_standardized[indices],
    "------------------------------",
    distances,
    sep="\n",
)

[[[1.03800476 0.55861082 1.10378283 1.18556721]
  [0.79566902 0.32841405 0.76275827 1.05393502]]]
------------------------------
[[0.49140089 0.74294782]]


In [11]:
# Repeat the two-neighbor lookup with the Euclidean metric named explicitly.
# Relies on `features_standardized` and `new_observation` from the 15.1 cell.
nearest_neighbors_euclidean = NearestNeighbors(n_neighbors=2, metric="euclidean")
nearest_neighbors_euclidean.fit(features_standardized)

distances, indices = nearest_neighbors_euclidean.kneighbors([new_observation])

# Distances from the query to its two nearest neighbors
distances

array([[0.49140089, 0.74294782]])

In [15]:
# Find each observation's three nearest neighbors based on
# Euclidean distance (each observation is queried against the data
# it was fitted on, so it can appear as its own neighbor)
nearest_neighbors_euclidean = NearestNeighbors(n_neighbors=3, metric="euclidean")
nearest_neighbors_euclidean.fit(features_standardized)

# Dense matrix with one row per observation; a 1 in column j marks
# observation j as one of that row's nearest neighbors
neighbor_graph = nearest_neighbors_euclidean.kneighbors_graph(features_standardized)
nearest_neighbors_with_self = neighbor_graph.toarray()

# Clear the diagonal so no observation is marked as its own neighbor
for row in range(len(nearest_neighbors_with_self)):
    nearest_neighbors_with_self[row, row] = 0

# View the first observation's remaining nearest neighbors
nearest_neighbors_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [21]:
# 15.2 Creating a K-Nearest Neighbors Classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load the iris feature matrix and class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Standardize the features before fitting the distance-based model
standardizer = StandardScaler()
X_std = standardizer.fit_transform(X)

# Fit a 5-neighbor KNN classifier (n_jobs=-1 is passed through to scikit-learn)
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_std, y)

# Two query rows to classify
new_observation = [[0.75, 0.75, 0.75, 0.75], [1, 1, 1, 1]]

# Predicted class label for each of the two rows
knn.predict(new_observation)

array([1, 2])

In [22]:
# Probability of each class for every query row, from the KNN model
# fitted in the 15.2 cell (one row per observation, one column per class)
class_probabilities = knn.predict_proba(new_observation)
class_probabilities

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

In [25]:
# 15.3 Identifying the Best Neighborhood Size

from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

# Load the raw (unstandardized) iris features and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardization is a pipeline step, so it is re-fitted on the training
# portion of every cross-validation fold
standardizer = StandardScaler()

# KNN classifier; n_neighbors is overridden by the grid search below
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Chain scaling and classification into a single estimator
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

# Candidate neighborhood sizes k = 1..10
search_space = [{"knn__n_neighbors": list(range(1, 11))}]

# Fit on the RAW features: the pipeline does the scaling itself. Passing
# pre-standardized data would scale twice, leak full-dataset statistics into
# each fold, and depend on a variable defined in a different cell.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features, target)

# Best neighborhood size (k)
classifier.best_estimator_.get_params()["knn__n_neighbors"]

6

In [None]:
# 15.4 Creating a Radius-Based Nearest Neighbors Classifier

from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load the iris feature matrix and class labels
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardize the features before fitting the distance-based model
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Fit a classifier that uses the neighbors within a radius of 0.5
rnn = RadiusNeighborsClassifier(radius=0.5, n_jobs=-1)
rnn.fit(features_standardized, target)

# Single query row
new_observation = [[1, 1, 1, 1]]

# Predicted class label for the query row
rnn.predict(new_observation)

array([2])

In [27]:
# Probability of each class for the query row, from the radius-based
# classifier fitted in the 15.4 cell
radius_class_probabilities = rnn.predict_proba(new_observation)
radius_class_probabilities

array([[0., 0., 1.]])

In [33]:
# 15.5 Finding Approximate Nearest Neighbors
import faiss
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Load the iris feature matrix and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardize the features before indexing them
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# faiss operates on contiguous float32 vectors; keep a dedicated copy for the
# index so the float64 array is still available for display
features_float32 = np.ascontiguousarray(features_standardized, dtype="float32")

# faiss parameters
n_features = features_standardized.shape[1]  # vector dimensionality
nlist = 3  # number of inverted lists (coarse clusters)
k = 2  # number of neighbors to retrieve

# Create an IVF index. IndexIVFFlat defaults to the L2 metric, so the coarse
# quantizer must also use L2; an inner-product quantizer would assign vectors
# to cells with a different metric than the one used to search inside them.
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist)

# Train the index and add feature vectors
index.train(features_float32)
index.add(features_float32)

# Create an observation (one query row)
new_observation = np.array([[1, 1, 1, 1]])

# Search the index for the k nearest neighbors (query cast to float32 for faiss)
distances, indices = index.search(new_observation.astype("float32"), k)

# faiss pads the result with -1 when the probed cells hold fewer than k
# vectors; drop those so they are not read as "last row" by numpy indexing
found = indices[0][indices[0] >= 0]

# Standardized feature rows of the retrieved neighbors
features_standardized[found]

array([[1.03800476, 0.55861082, 1.10378283, 1.18556721],
       [0.79566902, 0.32841405, 0.76275827, 1.05393502]])

In [2]:
# 15.6 Evaluating Approximate Nearest Neighbors
import faiss
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Number of nearest neighbors to compare
k = 10

# Load the iris feature matrix and class labels
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardize the features before indexing them
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# faiss operates on contiguous float32 vectors
features_float32 = np.ascontiguousarray(features_standardized, dtype="float32")

# Exact KNN with k neighbors, used as the ground truth
nearest_neighbors = NearestNeighbors(n_neighbors=k).fit(features_standardized)

# faiss parameters
n_features = features_standardized.shape[1]  # vector dimensionality
nlist = 3  # number of inverted lists (coarse clusters)

# Create an IVF index. IndexIVFFlat defaults to the L2 metric, so the coarse
# quantizer uses L2 as well; this keeps cell assignment consistent with the
# in-cell search and with the exact neighbors computed above by scikit-learn.
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist)

# Train the index and add feature vectors
index.train(features_float32)
index.add(features_float32)

# Probe only the single closest cell at query time
index.nprobe = 1

# Create an observation (one query row)
new_observation = np.array([[1, 1, 1, 1]])

# Find distances and indices of the observation's exact nearest neighbors
knn_distances, knn_indices = nearest_neighbors.kneighbors(new_observation)

# Approximate neighbors from the IVF index (query cast to float32 for faiss)
ivf_distances, ivf_indices = index.search(new_observation.astype("float32"), k)

# Overlap between exact and approximate neighbor ids; any -1 padding from
# faiss cannot match because the exact ids are non-negative
recalled_items = set(knn_indices[0]) & set(ivf_indices[0])

recall_percent = len(recalled_items) / k * 100
print(f"recall @k = {k}: {recall_percent}%")

recall @k = 10: 100.0%
