# Intro to KNN

## Collect Data

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset that ships with scikit-learn and list the
# names of the fields in the returned dict-like container.
data = load_breast_cancer()
list(data)

In [None]:
# Show the dataset's own description text.
print(data["DESCR"])

## Separate Features and Targets into X and y

In [3]:
# Feature matrix and target vector, pulled out of the dataset container.
X, y = data["data"], data["target"]

In [None]:
# Bare expression: shows the feature matrix as the cell's output.
X

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the dimensions of the training feature matrix.
X_train.shape

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
k = 4

# Build the classifier and fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

In [7]:
# Predict labels for the held-out test samples with the fitted classifier.
pred_y = knn.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Compare the test-set predictions with the true labels and report the score
# together with the K that produced it.
test_accuracy = accuracy_score(y_test, pred_y)
print(f"Acc at K={k} is {test_accuracy}")

In [None]:
# K values to compare. The last entry is the size of the training set, so in
# that run every training sample votes on every prediction. Reading it from
# X_train (instead of hard-coding 455) keeps the sweep valid if the split
# parameters change: a K larger than the number of fitted samples is rejected
# by the classifier at prediction time.
ks = [1, 2, 4, 8, 16, 32, 64, len(X_train)]

for k in ks:
    # Refit from scratch for each K and score on the same held-out test set.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_y = knn.predict(X_test)
    print(f"Acc at K={k} is {accuracy_score(y_test, pred_y)}")

## Plot the error, plot the metric (accuracy)

In [10]:
import numpy as np

# Test-set error rate for every K from 1 to 39; position j holds K = j + 1.
error_rate = []

for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    pred_i = knn.predict(X_test)

    # Fraction of test samples whose predicted label differs from the truth.
    misclassified = pred_i != y_test
    error_rate.append(np.mean(misclassified))

In [11]:
import matplotlib.pyplot as plt


In [None]:
# error_rate[j] is the error for K = j + 1 (the sweep starts at K=1), so the
# K axis is derived from the list length rather than hard-coded.
k_values = range(1, len(error_rate) + 1)

plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color="blue",
         linestyle="dashed",
         marker="o", markerfacecolor="red", markersize=10)

plt.title("Error Rate vs K-Value")
plt.xlabel("K")
plt.ylabel("Error Rate")
plt.show()

# list.index() returns a 0-based position; map it back to the actual K so the
# reported value is not off by one. Ties resolve to the smallest K.
min_error = min(error_rate)
best_k = k_values[error_rate.index(min_error)]
print(f"Min Error: {min_error}, at K={best_k}")

### Accuracy

In [13]:
# Test-set accuracy for every K from 1 to 19; position j holds K = j + 1.
acc = []

for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    acc.append(accuracy_score(y_test, pred_i))

In [None]:
# Dashed blue line with large red-filled circular markers at each K.
line_style = dict(
    color="blue",
    linestyle="dashed",
    marker="o",
    markerfacecolor="red",
    markersize=10,
)

plt.figure(figsize=(10, 6))
plt.plot(range(1, 20), acc, **line_style)

plt.title("Accuracy vs K-Value")
plt.xlabel("K")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# acc[j] holds the accuracy for K = j + 1 (the sweep starts at K=1), so the
# 0-based list position is shifted by one to report the actual K. Ties resolve
# to the smallest K.
best_acc = max(acc)
best_k = acc.index(best_acc) + 1
print(f"Max Acc: {best_acc}, at K={best_k}")