# Solution: Fixing KNN Overfitting

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 1. Data
# Load the breast-cancer dataset and hold out 30% of the rows for testing.
# The fixed random_state keeps the split identical from run to run.
data = load_breast_cancer()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 2. Replicate the Problem (k=1)
# Fit a 1-nearest-neighbour classifier and compare its accuracy on the data it
# was trained on against its accuracy on the held-out split.
knn1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

k1_train_acc = knn1.score(X_train, y_train)
k1_test_acc = knn1.score(X_test, y_test)

print(f"K=1 Train Acc: {k1_train_acc:.4f}")
print(f"K=1 Test Acc:  {k1_test_acc:.4f}")
print("Diagnosis: Extreme Overfitting! Perfect Train, Lower Test.\n")

# 3. The Fix (Search for best K)
# Model selection must not look at the test set: choosing the k that maximises
# test accuracy leaks test information into the model choice and makes the
# reported score optimistically biased. Candidates are therefore ranked by
# 5-fold cross-validation on the training split only; the test split is used
# once, at the end, to report the accuracy of the chosen k.
train_scores = []
cv_scores = []
test_scores = []
k_values = range(1, 21)

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score fits clones of `knn` on the training folds, so it never
    # sees X_test and does not interfere with the fit below.
    cv_scores.append(cross_val_score(knn, X_train, y_train, cv=5).mean())
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    # Test accuracy is recorded for plotting/reporting only, not for selection.
    test_scores.append(knn.score(X_test, y_test))

plt.figure(figsize=(10, 5))
plt.plot(k_values, train_scores, label='Train Acc')
plt.plot(k_values, cv_scores, label='CV Acc (5-fold, train only)')
plt.plot(k_values, test_scores, label='Test Acc')
plt.xlabel('n_neighbors (k)')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Finding the Sweet Spot (Bias-Variance Tradeoff)')
plt.grid(True)
plt.xticks(k_values)
plt.show()

# Pick best K
# Select on cross-validated accuracy; list.index returns the first maximum, so
# ties resolve to the smallest k.
best_index = cv_scores.index(max(cv_scores))
best_k = k_values[best_index]
print(f"Best CV Accuracy found at K={best_k}: {cv_scores[best_index]:.4f}")
print(f"Test Accuracy at K={best_k}: {test_scores[best_index]:.4f}")