# K Nearest Neighbors

**K Nearest Neighbors** is a simple supervised learning algorithm used to assign a new data point to a given class.

In [None]:
# import things
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
from sklearn.datasets import load_iris, load_breast_cancer, load_boston, fetch_california_housing
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Display the installed scikit-learn version.
sklearn.__version__

In [None]:
# Load the iris dataset and list the fields available on the returned object.
iris = load_iris()
iris.keys()

In [None]:
# Dimensions of the iris feature matrix.
iris.data.shape

In [None]:
# Names of the feature columns.
iris.feature_names

In [None]:
# Names of the classes encoded in the target vector.
iris.target_names

In [None]:
# Classifier that looks at a single nearest neighbour; echo it to show its settings.
knn = KNeighborsClassifier(n_neighbors=1)
knn

In [None]:
# Feature matrix and class labels for the iris samples.
X, y = iris.data, iris.target

In [None]:
# Fit the classifier on the full dataset (no held-out split at this point).
knn.fit(X, y)

In [None]:
# Predict labels for the same samples the model was just fitted on.
y_pred = knn.predict(X)

In [None]:
# Scatter the first two feature columns, one colour per true class label in `y`.
for label, colour in zip((0, 1, 2), ('r', 'b', 'y')):
    members = X[y == label]
    plt.scatter(members[:, 0], members[:, 1], c=colour)
plt.show()

In [None]:
# Hold out 30% of the samples so the model is evaluated on data it was not
# fitted on. The split is created here so this cell also works on a
# top-to-bottom run; it uses the same parameters as the accuracy sweep later
# in the notebook, so both cells see the same partition.
X_train, X_test, \
    y_train, y_test = train_test_split(X, y, test_size=.3, random_state=10)

knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# y_pred has one entry per row of X_test, so the boolean masks must index
# X_test. Indexing X_train with them fails because the lengths differ.
plt.scatter(X_test[y_pred==0, 0], X_test[y_pred==0, 1], c='r')
plt.scatter(X_test[y_pred==1, 0], X_test[y_pred==1, 1], c='b')
plt.scatter(X_test[y_pred==2, 0], X_test[y_pred==2, 1], c='y')
plt.show()

In [None]:
# Sweep k = 1..25, fitting and scoring on the same full dataset, so these are
# training accuracies. A plain loop is kept so `knn` and `pred` stay bound to
# the last iteration's objects.
scores = []
for k in range(1, 26):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    pred = knn.predict(X)
    scores.append(accuracy_score(y, pred))
print(scores)

In [None]:
# Hold out 30% of the samples (fixed seed), then sweep k = 1..25 and record
# the accuracy on the held-out part. `knn` is left bound to the k=25 model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=10
)
scores = []
for k in range(1, 26):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = knn.predict(X_test)
    scores.append(accuracy_score(y_test, pred))
print(scores)

In [None]:
# Predict every sample with the current `knn` model and scatter the first two
# feature columns, one colour per predicted class.
pred = knn.predict(X)
for label, colour in zip((0, 1, 2), ('r', 'b', 'y')):
    members = X[pred == label]
    plt.scatter(members[:, 0], members[:, 1], c=colour)
plt.show()

In [None]:
# Load the breast cancer dataset and list the fields available on the returned object.
cancer = load_breast_cancer()
cancer.keys()

In [None]:
# Names of the feature columns.
cancer.feature_names

In [None]:
# Dimensions of the breast cancer feature matrix.
cancer.data.shape

In [None]:
# First sample's feature values.
cancer.data[0]

In [None]:
# Names of the classes encoded in the target vector.
cancer.target_names

In [None]:
# Rebind X / y to the breast cancer data and hold out 20% for testing (fixed seed).
X, y = cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=5
)

In [None]:
# Sweep k = 1..30 on the current train/test split, printing and collecting
# the test accuracy for each k.
scores = []
for k in range(1, 31):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred = knn.predict(X_test)
    acc_score = accuracy_score(y_test, pred)
    scores.append(acc_score)
    print(f"accuracy scores for k={k}: {acc_score:.5f}")

In [None]:
# Load the Boston housing dataset and list the fields available on the returned object.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2; on newer versions this call (and the import of the name) fails. Confirm
# the pinned scikit-learn version or switch to another regression dataset.
boston = load_boston()
boston.keys()

In [None]:
# Dimensions of the Boston feature matrix.
boston.data.shape

In [None]:
# First sample's feature values.
boston.data[0]

In [None]:
# Names of the feature columns.
boston.feature_names

In [None]:
# predict house prices: rebind X / y to the Boston data and hold out 20% (fixed seed)
X, y = boston.data, boston.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=111
)

In [None]:
# Sweep k = 1..30 with a KNN regressor, printing and collecting the test-set
# MSE and R^2 for each k. `knn_regressor` is left bound to the k=30 model.
scores = []
for k in range(1, 31):
    knn_regressor = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    pred = knn_regressor.predict(X_test)
    mse_scores, r2_scores = mean_squared_error(y_test, pred), r2_score(y_test, pred)
    print(f"metrics for k={k} - mse: {mse_scores:.5f}, r2: {r2_scores:.5f}")
    scores.append((mse_scores, r2_scores))

In [None]:
# Dimensions of the current training split.
X_train.shape

In [None]:
# Compare predictions for the first five test rows with their true targets.
# When cells run top to bottom, knn_regressor is the last model from the sweep (k=30).
knn_regressor.predict(X_test[:5]), y_test[:5]

In [None]:
# Fetch the California housing dataset.
# NOTE(review): the fetch_* loaders download on first use unless already cached — confirm network access.
california = fetch_california_housing()

In [None]:
# List the fields available on the returned object.
california.keys()

In [None]:
# Dimensions of the California housing feature matrix.
california.data.shape

In [None]:
# Names of the feature columns.
california.feature_names

In [None]:
# First sample's feature values alongside the full target vector.
california.data[0], california.target

In [None]:
# Dimensions of the target vector.
california.target.shape

In [None]:
# Build one table with the feature columns followed by the target as a final
# "avg_value" column.
california_table = np.column_stack((california["data"], california["target"]))
california_columns = [*california["feature_names"], "avg_value"]
df_california = pd.DataFrame(california_table, columns=california_columns)

In [None]:
# Write the combined table to a CSV file in the working directory, without the row index.
df_california.to_csv("california_house.csv", index=False)

In [None]:
# predict california house: rebind X / y to the California data and hold out 20% (fixed seed)
X, y = california.data, california.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=1111
)

In [None]:
# Sweep k = 1..30 with a KNN regressor on the current train/test split,
# collecting the test-set MSE and R^2 per k and reporting them as they are
# computed, in the same way as the Boston sweep earlier in the notebook.
# `mse_scores` / `r2_scores` hold one value per k, in sweep order.
mse_scores = []
r2_scores = []
for k in range(1, 31):
    knn_regressor = KNeighborsRegressor(n_neighbors=k)
    knn_regressor.fit(X_train, y_train)
    pred = knn_regressor.predict(X_test)
    mse = mean_squared_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    r2_scores.append(r2)
    mse_scores.append(mse)
    # Format the per-iteration scalars, not the accumulating lists. RMSE is
    # derived from MSE so it is in the same units as the target.
    print(f"metrics for k={k} - mse: {mse:.5f}, r2: {r2:.5f}, rmse: {np.sqrt(mse):.3f}")