# Lab 6: K-Nearest Neighbors (K-NN)

---



## Importing the libraries

In [1]:
from time import perf_counter, time

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## Importing the dataset

In [2]:
# Read the training and testing spreadsheets (paths are relative to the notebook's working directory)
train_data, test_data = (
    pd.read_excel(workbook) for workbook in ("TrainingSet.xlsx", "TestingSet.xlsx")
)

## Data preprocessing

In [3]:
# Training set: 'plant' is the class label, every other column is a feature
y_train = train_data['plant']
X_train = train_data.drop('plant', axis=1)

In [4]:
# Testing set: 'plant' is the class label, every other column is a feature
y_test = test_data['plant']
X_test = test_data.drop('plant', axis=1)

## Applying feature scaling

In [5]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the testing features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Custom KNN

In [6]:
class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
        Must be at least 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k):
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels; no model is built.

        Both inputs are converted to NumPy arrays so that the lookups done in
        ``predict`` are positional. Indexing a pandas Series with the integer
        positions returned by ``argsort`` is resolved against the Series'
        index labels instead, which raises ``KeyError`` or selects the wrong
        rows whenever the index is not exactly 0..n-1.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different lengths.
        """
        self.X_train = np.asarray(X_train)
        self.y_train = np.asarray(y_train)
        if len(self.X_train) != len(self.y_train):
            raise ValueError(
                "X_train and y_train must have the same number of samples, "
                f"got {len(self.X_train)} and {len(self.y_train)}"
            )

    def predict(self, X_test):
        """Predict a label for every row of ``X_test`` by majority vote.

        Parameters
        ----------
        X_test : array-like
            One sample per row, with the same columns as the training data.
            Converted with ``np.asarray`` so a DataFrame is iterated row by
            row rather than by column name.

        Returns
        -------
        numpy.ndarray
            One predicted label per row of ``X_test``. When several labels
            tie for the most votes, the one that sorts first wins.
        """
        y_pred = []
        for x in np.asarray(X_test):
            # Euclidean distance from this test sample to every training sample
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            # Positions of the k closest training samples
            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = self.y_train[nearest_indices]
            # np.unique returns the labels sorted, and argmax returns the first
            # maximum, which is what makes tie-breaking deterministic
            unique_labels, counts = np.unique(nearest_labels, return_counts=True)
            y_pred.append(unique_labels[np.argmax(counts)])
        return np.array(y_pred)

In [7]:
# Neighbour counts to evaluate; the result lists below hold one entry per k, in this order
k_values = [3, 5, 7]
y_pred_custom_ = []
training_time_custom_knn = []
prediction_time_custom_knn = []

for k in k_values:
    knn = KNNClassifier(k)

    # Training
    # perf_counter() is a monotonic, high-resolution timer; time() is a wall
    # clock whose resolution can be too coarse for intervals this short.
    start = perf_counter()
    knn.fit(X_train_scaled, y_train)
    end = perf_counter()
    training_time_custom_knn.append(end - start)

    # Prediction
    start = perf_counter()
    y_pred_custom = knn.predict(X_test_scaled)
    end = perf_counter()
    prediction_time_custom_knn.append(end - start)

    y_pred_custom_.append(y_pred_custom)

## Standard KNN

In [8]:
# One entry per value in k_values, in the same order
y_pred_sklearn_ = []
training_time_knn = []
prediction_time_knn = []
for k in k_values:
    knn_sklearn = KNeighborsClassifier(n_neighbors=k)

    # Training
    # perf_counter() is a monotonic, high-resolution timer; time() is a wall
    # clock whose resolution can be too coarse for intervals this short.
    start = perf_counter()
    knn_sklearn.fit(X_train_scaled, y_train)
    end = perf_counter()
    training_time_knn.append(end - start)

    # Prediction
    start = perf_counter()
    y_pred_sklearn = knn_sklearn.predict(X_test_scaled)
    end = perf_counter()
    prediction_time_knn.append(end - start)

    y_pred_sklearn_.append(y_pred_sklearn)

## Comparing Training and Prediction time of Custom and Standard KNN

In [9]:
def _print_faster(custom_seconds, sklearn_seconds, verb, tie_message):
    """Print which implementation took less time for one phase.

    ``verb`` is inserted into the "... KNN <verb> faster." sentence and
    ``tie_message`` is printed when both durations are exactly equal.
    """
    if custom_seconds < sklearn_seconds:
        print(f"\tCustom KNN {verb} faster.")
    elif custom_seconds > sklearn_seconds:
        print(f"\tScikit-learn KNN {verb} faster.")
    else:
        print(f"\t{tie_message}")


# k_values is deliberately not redefined here: the timing lists were filled in
# k_values order, so reusing that list keeps each label matched to its measurement.
for i, k in enumerate(k_values):
    print(f"For k={k}:")

    # Custom KNN
    training_time_custom = training_time_custom_knn[i]
    prediction_time_custom = prediction_time_custom_knn[i]
    print("Custom KNN:")
    print(f"\tTraining time: {training_time_custom:.4f} seconds")
    print(f"\tPrediction time: {prediction_time_custom:.4f} seconds")

    # Scikit-learn KNN
    training_time_sklearn = training_time_knn[i]
    prediction_time_sklearn = prediction_time_knn[i]
    print("Scikit-learn KNN:")
    print(f"\tTraining time: {training_time_sklearn:.4f} seconds")
    print(f"\tPrediction time: {prediction_time_sklearn:.4f} seconds")

    # Comparison
    print("Comparison:")
    _print_faster(training_time_custom, training_time_sklearn,
                  "trains", "Training times are equal.")
    _print_faster(prediction_time_custom, prediction_time_sklearn,
                  "predicts", "Prediction times are equal.")

    print()

For k=3:
Custom KNN:
	Training time: 0.0000 seconds
	Prediction time: 0.0246 seconds
Scikit-learn KNN:
	Training time: 0.0025 seconds
	Prediction time: 0.0065 seconds
Comparison:
	Custom KNN trains faster.
	Scikit-learn KNN predicts faster.

For k=5:
Custom KNN:
	Training time: 0.0000 seconds
	Prediction time: 0.0154 seconds
Scikit-learn KNN:
	Training time: 0.0011 seconds
	Prediction time: 0.0030 seconds
Comparison:
	Custom KNN trains faster.
	Scikit-learn KNN predicts faster.

For k=7:
Custom KNN:
	Training time: 0.0000 seconds
	Prediction time: 0.0168 seconds
Scikit-learn KNN:
	Training time: 0.0010 seconds
	Prediction time: 0.0045 seconds
Comparison:
	Custom KNN trains faster.
	Scikit-learn KNN predicts faster.



## Comparing Predictions of Custom and Standard KNN

In [10]:
# Display the custom-KNN predictions: a list with one label array per value in k_values
y_pred_custom_

[array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harlequin', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Arctica', 'Harlequin', 'Harlequin', 'Carolinian',
        'Harlequin', 'Carolinian', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Harlequin', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Carolinian', 'Carolinian', 'Carolinian'],
       dtype='<U10'),
 array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harlequin', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Arctica', 'Harlequin', 'Harlequin', 'Carolinian',
        'Harlequin', 'Carolinian', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Harlequin', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Carolinian', 'Carolinian', 'Carolinian'],
       dtype='<U10'),
 array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harl

In [11]:
# Display the scikit-learn predictions: a list with one label array per value in k_values
y_pred_sklearn_

[array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harlequin', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Arctica', 'Harlequin', 'Harlequin', 'Carolinian',
        'Harlequin', 'Carolinian', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Harlequin', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Carolinian', 'Carolinian', 'Carolinian'],
       dtype=object),
 array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harlequin', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Arctica', 'Harlequin', 'Harlequin', 'Carolinian',
        'Harlequin', 'Carolinian', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Harlequin', 'Carolinian', 'Harlequin', 'Harlequin',
        'Carolinian', 'Carolinian', 'Carolinian', 'Carolinian'],
       dtype=object),
 array(['Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica', 'Arctica',
        'Harlequin', 'Harl

In [12]:
# Report, for each prediction list, how many arrays it holds and the length of the first one
for list_name, prediction_list in (
    ("y_pred_custom_", y_pred_custom_),
    ("y_pred_sklearn_", y_pred_sklearn_),
):
    print(f"Shape of {list_name}:", len(prediction_list), len(prediction_list[0]))

Shape of y_pred_custom_: 3 30
Shape of y_pred_sklearn_: 3 30


In [13]:
# Stack the per-k prediction arrays into 2-D arrays with one row per k
y_pred_custom_array = np.array(y_pred_custom_)
y_pred_sklearn_array = np.array(y_pred_sklearn_)

# Transpose so each row is a test sample and each column is a value of k
column_labels = [f"k={k}" for k in k_values]
df_custom = pd.DataFrame(y_pred_custom_array.T, columns=column_labels)
df_sklearn = pd.DataFrame(y_pred_sklearn_array.T, columns=column_labels)

# keys= adds an outer column level naming the implementation; without it the
# combined frame has two indistinguishable "k=3", "k=5" and "k=7" columns.
print("Comparison of Predicted Labels (Custom vs Scikit-learn)")
print(pd.concat([df_custom, df_sklearn], axis=1, keys=["Custom", "Scikit-learn"]))

# Quantify the comparison: accuracy of each implementation against the true
# test labels, and whether the two implementations agree on every sample.
accuracy_summary = pd.DataFrame(
    {
        "Custom accuracy": [accuracy_score(y_test, preds) for preds in y_pred_custom_],
        "Scikit-learn accuracy": [accuracy_score(y_test, preds) for preds in y_pred_sklearn_],
        "Identical predictions": (df_custom == df_sklearn).all().to_numpy(),
    },
    index=column_labels,
)
print()
print("Accuracy on the test set (Custom vs Scikit-learn)")
print(accuracy_summary)

Comparison of Predicted Labels (Custom vs Scikit-learn)
           k=3         k=5         k=7         k=3         k=5         k=7
0      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
1      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
2      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
3      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
4      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
5      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
6    Harlequin   Harlequin   Harlequin   Harlequin   Harlequin   Harlequin
7    Harlequin   Harlequin   Harlequin   Harlequin   Harlequin   Harlequin
8      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
9      Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
10     Arctica     Arctica     Arctica     Arctica     Arctica     Arctica
11   Harlequin   Harlequin   Harlequin   Har

## Conclusion

From the comparison:

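1.   **Training Time:** The custom KNN is quicker at training than scikit-learn's KNN for all k values (3, 5, and 7). This does not mean the custom implementation is better optimized: its `fit` method only stores the training data, whereas scikit-learn's `fit` also validates the input and prepares its neighbor-search structure. The custom training times all round to 0.0000 seconds, so they are too small to compare meaningfully.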

2.   **Prediction Time:** On the other hand, scikit-learn's KNN predicts faster for all k values. The custom implementation loops over the test samples one at a time in Python and fully sorts the distances for each sample, while scikit-learn relies on optimized neighbor-search routines.

3.   **Overall Performance:** Despite the time differences, the predicted labels displayed above are the same for both implementations, so they perform equivalently in terms of accuracy on this test set.

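4.   **Choice of Implementation:** Choose based on needs. If fast training matters more and slightly slower prediction is acceptable, the custom implementation is an option, but its training advantage here is only a few milliseconds, and K-NN does almost all of its work at prediction time. If quick prediction is crucial, especially in real-time settings, opt for scikit-learn's implementation.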

In short, there's a trade-off between training and prediction speed. It's essential to consider specific needs when choosing between custom and scikit-learn implementations.