## K-Nearest Neighbors (KNN) Algorithm



 Compute the distance between the test instance and all training instances. The Euclidean distance is commonly used:


   
   $$d(x, x_i) = \sqrt{\sum_{j=1}^{n} (x_j - x_{ij})^2}$$
   
   where $x$ is the test instance, $x_i$ is a training instance, $x_{ij}$ is the $j$-th feature of $x_i$, and $n$ is the number of features.

   

 Identify the $k$ training instances that are closest to the test instance based on the calculated distances.

 For classification, predict the class label by taking a majority vote among the $k$ nearest neighbors. For regression, predict the value by taking the mean of the target values of the $k$ nearest neighbors.



# Load Data

In [1]:
import pandas as pd
import numpy as np



In [2]:
# Dataset 1: read the train/test splits from CSV files addressed by relative
# path, so they must be present in the kernel's working directory.
data1_train = pd.read_csv('data1_train.csv')
data1_test = pd.read_csv('data1_test.csv')

# Preview the first rows of each split to confirm the column layout
# (feature columns followed by a final `Target` column).
print(data1_train.head())
print(data1_test.head())

    Feature_1  Feature_2   Feature_3  Target
0   94.870985  88.239326  101.497093       0
1   97.684482  84.837474   90.892151       0
2   94.648343  77.467282   87.646104       0
3   94.635471  85.327735   99.851568       0
4  104.397011  84.097116   98.211326       0
    Feature_1  Feature_2  Feature_3  Target
0   97.533783  88.329103  98.191966       0
1  102.838058  78.783627  90.867559       0
2   95.155724  86.679155  94.861119       0
3  102.838797  87.504581  99.622361       0
4   99.238078  86.704614  91.685225       0


In [3]:
# Dataset 2: read the train/test splits from CSV files addressed by relative
# path, so they must be present in the kernel's working directory.
data2_train = pd.read_csv('data2_train.csv')
data2_test = pd.read_csv('data2_test.csv')

# Preview the first rows of each split to confirm the column layout
# (feature columns followed by a final `Target` column).
print(data2_train.head())
print(data2_test.head())

   Feature_1   Feature_2  Target
0   8.160646   88.799326       0
1  31.149536  102.335826       0
2  13.103383   92.902908       0
3  15.950445   77.412565       0
4  35.856965   94.441550       0
   Feature_1  Feature_2  Target
0  48.489576  81.609641       0
1  26.069706  89.783100       0
2  31.967447  88.005024       0
3  44.957613  91.219129       0
4  27.681870  87.381969       0


In [4]:
from sklearn.preprocessing import StandardScaler

def normalize_data(train_data, test_data):
    """Split features from labels and standardize the features.

    The last column of each frame is taken as the label; every other column
    is treated as a feature. The scaler is fitted on the training features
    only and then reused to transform the test features, so the test split
    does not influence the scaling statistics.

    Args:
        train_data: DataFrame holding the training split.
        test_data: DataFrame holding the test split, with the same column
            layout as ``train_data``.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays, where
        the ``X_*`` arrays are the standardized features.
    """
    # Pull raw arrays out of the frames: all-but-last column vs. last column.
    train_features = train_data.iloc[:, :-1].values
    test_features = test_data.iloc[:, :-1].values
    train_labels = train_data.iloc[:, -1].values
    test_labels = test_data.iloc[:, -1].values

    # Fit on train only, then apply the same transformation to test.
    feature_scaler = StandardScaler()
    scaled_train = feature_scaler.fit_transform(train_features)
    scaled_test = feature_scaler.transform(test_features)

    return scaled_train, train_labels, scaled_test, test_labels

# Standardize each dataset independently; every call builds its own scaler
# inside normalize_data, so the two datasets do not share scaling statistics.
X_train1, y_train1, X_test1, y_test1 = normalize_data(data1_train, data1_test)

X_train2, y_train2, X_test2, y_test2 = normalize_data(data2_train, data2_test)


## Calculate Distance


In [13]:
import numpy as np

def calculate_distances(X_train, X_test_instance):
    """Euclidean distance from one query point to every training row.

    Args:
        X_train: 2-D array with one training sample per row.
        X_test_instance: 1-D array holding a single query sample; it is
            broadcast against every row of ``X_train``.

    Returns:
        1-D array whose i-th entry is the distance between
        ``X_test_instance`` and ``X_train[i]``.
    """
    # Per-feature offsets between each training row and the query point.
    offsets = X_train - X_test_instance
    # Sum of squared offsets across the feature axis, one value per row.
    squared_norms = np.sum(np.square(offsets), axis=1)
    return np.sqrt(squared_norms)


In [14]:
def find_k_nearest_neighbors(X_train, y_train, X_test_instance, k):
    """Return the labels of the k training samples closest to a query point.

    Args:
        X_train: 2-D array with one training sample per row.
        y_train: 1-D array of labels aligned with the rows of ``X_train``.
        X_test_instance: 1-D array holding the query sample.
        k: Number of neighbours to return; must be at least 1. If ``k``
            exceeds the number of training rows, all labels are returned.

    Returns:
        Array of up to ``k`` labels, ordered from nearest to farthest.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # A negative k would otherwise slice "all but the last |k|" neighbours,
    # and k == 0 would yield an empty vote; reject both explicitly.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    distances = calculate_distances(X_train, X_test_instance)

    # argsort is ascending, so the first k indices are the nearest samples.
    k_indices = np.argsort(distances)[:k]

    k_nearest_labels = y_train[k_indices]

    return k_nearest_labels


In [19]:
from collections import Counter

def predict(X_train, y_train, X_test, k):
    """Classify every row of ``X_test`` by majority vote of its k neighbours.

    Args:
        X_train: 2-D array of training samples, one per row.
        y_train: Labels aligned with the rows of ``X_train``.
        X_test: Iterable of query samples (e.g. a 2-D array, iterated by row).
        k: Number of neighbours that take part in each vote.

    Returns:
        List with one predicted label per query sample, in input order.
        When several labels share the top count, the one chosen is whatever
        ``Counter.most_common`` lists first.
    """
    return [
        Counter(
            find_k_nearest_neighbors(X_train, y_train, sample, k)
        ).most_common(1)[0][0]
        for sample in X_test
    ]


In [24]:
def evaluate_model(y_true, y_pred):
    """Return classification accuracy as a percentage (0-100).

    Both inputs are converted to flat NumPy arrays before comparison, so
    lists, 1-D arrays and column vectors of shape ``(n, 1)`` are all scored
    element by element instead of being broadcast against each other.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, one per ground-truth label.

    Returns:
        Percentage of positions where ``y_true`` and ``y_pred`` agree.

    Raises:
        ValueError: If the two inputs do not contain the same number of
            labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Without this check a length mismatch surfaces either as a misleading
    # 0% score or as a generic broadcasting error, depending on NumPy version.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of labels, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    accuracy = np.mean(y_true == y_pred) * 100

    return accuracy


# Baseline run with a fixed neighbourhood size before any tuning.
k = 3


# Predict on the training split itself as well as on the test split.
# NOTE(review): when scoring the training split every query point is also
# present in X_train, so it is always its own nearest neighbour (distance 0).
y_train1_pred = predict(X_train1, y_train1, X_train1, k)

y_test1_pred = predict(X_train1, y_train1, X_test1, k)

y_train2_pred = predict(X_train2, y_train2, X_train2, k)

y_test2_pred = predict(X_train2, y_train2, X_test2, k)

# Accuracy (in percent) for each dataset / split combination.
train_accuracy1 = evaluate_model(y_train1, y_train1_pred)

test_accuracy1 = evaluate_model(y_test1, y_test1_pred)

train_accuracy2 = evaluate_model(y_train2, y_train2_pred)

test_accuracy2 = evaluate_model(y_test2, y_test2_pred)

# Report with two decimals.
print(f'Training Accuracy 1: {train_accuracy1:.2f}%')

print(f'Test Accuracy 1: {test_accuracy1:.2f}%')

print(f'Training Accuracy 2: {train_accuracy2:.2f}%')

print(f'Test Accuracy 2: {test_accuracy2:.2f}%')


Training Accuracy 1: 97.75%
Test Accuracy 1: 98.00%
Training Accuracy 2: 99.12%
Test Accuracy 2: 99.00%


In [27]:
def grid_search_knn(X_train, y_train, X_test, y_test, k_values):
    """Pick the k with the highest accuracy on the supplied evaluation data.

    Every candidate in ``k_values`` is scored by predicting ``X_test`` from
    the training data and comparing against ``y_test``. A candidate replaces
    the current best only if its accuracy is strictly higher, so on ties the
    earliest candidate wins.

    Note that the returned accuracy is measured on the same data that was
    used to choose k.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        X_test: Samples used to score each candidate.
        y_test: Labels used to score each candidate.
        k_values: Non-empty sequence of candidate neighbour counts.

    Returns:
        Tuple ``(best_k, best_accuracy)``; if no candidate scores above 0,
        this is ``(k_values[0], 0)``.
    """
    # Running (k, accuracy) winner, seeded with the first candidate at 0.
    leader = (k_values[0], 0)

    for candidate in k_values:
        score = evaluate_model(
            y_test, predict(X_train, y_train, X_test, candidate)
        )
        if score > leader[1]:
            leader = (candidate, score)

    return leader

# Candidate neighbourhood sizes: odd values from 1 to 15.
k_values = [1, 3, 5, 7, 9, 11, 13, 15]

# NOTE(review): k is selected by accuracy on the test split, so the
# "Best Test Accuracy" printed below is an optimistic estimate; consider
# selecting k on a separate validation split or via cross-validation.
best_k1, best_accuracy1 = grid_search_knn(X_train1, y_train1, X_test1, y_test1, k_values)

best_k2, best_accuracy2 = grid_search_knn(X_train2, y_train2, X_test2, y_test2, k_values)


# First pair of lines reports dataset 1, second pair reports dataset 2.
print(f'Best k: {best_k1}')

print(f'Best Test Accuracy: {best_accuracy1:.2f}%')

print(f'Best k: {best_k2}')

print(f'Best Test Accuracy: {best_accuracy2:.2f}%')


Best k: 3
Best Test Accuracy: 98.00%
Best k: 9
Best Test Accuracy: 99.50%


In [30]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

def compare_with_sklearn(X_train, y_train, X_test, y_test, best_k):
    """Score scikit-learn's KNN classifier on the train and test splits.

    A ``KNeighborsClassifier`` with ``n_neighbors=best_k`` (all other
    settings left at their defaults) is fitted on the training data and then
    evaluated on both splits.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        X_test: Test samples.
        y_test: Test labels.
        best_k: Number of neighbours for the classifier.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``, both as percentages.
    """
    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_train, y_train)

    # Predict both splits first, then score them, train before test.
    fitted_train = knn.predict(X_train)
    fitted_test = knn.predict(X_test)

    train_pct = 100 * accuracy_score(y_train, fitted_train)
    test_pct = 100 * accuracy_score(y_test, fitted_test)

    return train_pct, test_pct

# Run the scikit-learn reference model with the k chosen per dataset by the
# grid search (best_k1 / best_k2), not with the fixed k = 3 baseline.
train_accuracy_sklearn_1, test_accuracy_sklearn_1 = compare_with_sklearn(X_train1, y_train1, X_test1, y_test1, best_k1)

train_accuracy_sklearn_2, test_accuracy_sklearn_2 = compare_with_sklearn(X_train2, y_train2, X_test2, y_test2, best_k2)

# Dataset 1 results.
print(f'Scikit-Learn Training Accuracy 1: {train_accuracy_sklearn_1:.2f}%')

print(f'Scikit-Learn Test Accuracy 1: {test_accuracy_sklearn_1:.2f}%')


# Dataset 2 results.
print(f'Scikit-Learn Training Accuracy 2: {train_accuracy_sklearn_2:.2f}%')

print(f'Scikit-Learn Test Accuracy 2: {test_accuracy_sklearn_2:.2f}%')


Scikit-Learn Training Accuracy 1: 97.62%
Scikit-Learn Test Accuracy 1: 98.00%
Scikit-Learn Training Accuracy 2: 98.75%
Scikit-Learn Test Accuracy 2: 99.50%
