In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
def euclidean_distance(x1, x2):
    """
    Calculate the Euclidean distance between two points.

    Both inputs are converted to floating-point arrays before they are
    subtracted. This lets plain lists/tuples be passed in, and it prevents
    integer arrays (e.g. uint8 pixel values) from wrapping around when the
    difference is negative or its square exceeds the integer range.

    Parameters:
    - x1 (array-like): First point.
    - x2 (array-like): Second point; must be broadcastable against x1.

    Returns:
    - float: Euclidean distance between x1 and x2.
    """
    # Do the arithmetic in float so the subtraction and squaring cannot overflow
    difference = np.asarray(x1, dtype=float) - np.asarray(x2, dtype=float)
    return np.sqrt(np.sum(difference ** 2))


In [3]:
def knn_predict(X_train, y_train, x_test, k):
    """
    Predict the label for a test point using K-Nearest Neighbors.

    Euclidean distances from the test point to every training sample are
    computed in a single vectorized step, the k smallest are selected, and
    the most frequent label among them is returned.

    Parameters:
    - X_train (array-like): Training features, one entry (row) per sample.
    - y_train (array-like): Training labels, one per training sample.
    - x_test (array-like): Test point.
    - k (int): Number of neighbors to consider; must be at least 1. If k is
      larger than the number of training samples, every sample votes.

    Returns:
    - The predicted label, taken from y_train (same type as its elements).
      When several labels are equally frequent, the one whose first
      occurrence is nearest to the test point is returned.

    Raises:
    - ValueError: If k is smaller than 1, the training set is empty, or
      X_train and y_train have different lengths.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    features = np.asarray(X_train, dtype=float)
    labels = np.asarray(y_train)
    if len(features) == 0:
        raise ValueError("X_train is empty; at least one training sample is required")
    if len(labels) != len(features):
        raise ValueError(
            f"X_train and y_train must have the same length, "
            f"got {len(features)} and {len(labels)}")

    # Calculate distances between the test point and all training points.
    # Flattening each sample to one row lets the same code handle scalar
    # (1-D X_train) and vector samples.
    differences = (features - np.asarray(x_test, dtype=float)).reshape(len(features), -1)
    distances = np.sqrt(np.sum(differences ** 2, axis=1))

    # Get indices of the k-nearest neighbors; a stable sort keeps equidistant
    # samples in training order so the prediction is deterministic
    k_neighbors_indices = np.argsort(distances, kind="stable")[:k]

    # Get labels of the k-nearest neighbors, nearest first
    k_nearest_labels = [labels[i] for i in k_neighbors_indices]

    # Return the most common label among the k-nearest neighbors
    return Counter(k_nearest_labels).most_common(1)[0][0]


In [4]:
def knn_accuracy(X_train, y_train, X_test, y_test, k):
    """
    Calculate the accuracy of the K-Nearest Neighbors model.

    Every test point is classified with knn_predict and the prediction is
    compared with the corresponding entry of y_test.

    Parameters:
    - X_train (numpy array): Training features.
    - y_train (numpy array): Training labels.
    - X_test (numpy array): Test features, one entry per test sample.
    - y_test (numpy array): Test labels, one per test sample.
    - k (int): Number of neighbors to consider.

    Returns:
    - float: Accuracy of the model as a percentage in the range 0-100.

    Raises:
    - ValueError: If the test set is empty or X_test and y_test have
      different lengths.
    """
    total = len(y_test)
    if total == 0:
        raise ValueError("y_test is empty; accuracy is undefined")
    if len(X_test) != total:
        raise ValueError(
            f"X_test and y_test must have the same length, "
            f"got {len(X_test)} and {total}")

    # Make predictions for all test points
    predictions = [knn_predict(X_train, y_train, x_test, k)
                   for x_test in X_test]

    # Count the number of correct predictions; pairing by position (rather
    # than indexing y_test) also works when y_test is not a plain array
    correct_predictions = sum(
        predicted == actual for predicted, actual in zip(predictions, y_test))

    # Convert the fraction of correct predictions into a percentage
    return (correct_predictions / total) * 100


In [5]:
# Load the dataset
iris_data = pd.read_csv('datasets/iris.csv')
print(iris_data.head())
# Split the data into features (X) and labels (y): every column except the
# last is a feature, the last column is the label.
# to_numpy() always returns a NumPy array (".values" can return a pandas
# extension array for some dtypes), and dtype=float makes a non-numeric
# feature column fail here instead of inside the distance computation.
X = iris_data.iloc[:, :-1].to_numpy(dtype=float)
y = iris_data.iloc[:, -1].to_numpy()


   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa


In [6]:
# Accumulators for the per-class pieces of the split (stacked into arrays below)
X_train, y_train, X_test, y_test = [], [], [], []

# Split every class separately so that train and test keep the class balance
for category in np.unique(y):
    # Positions of the samples that carry the current label
    category_indices = np.flatnonzero(y == category)
    category_data = X[category_indices]

    # 90% of this class goes to training, 10% to testing; the fixed seed
    # makes the split reproducible
    category_X_train, category_X_test = train_test_split(
        category_data,
        test_size=0.1,
        random_state=42,
    )

    # Add this class's rows, plus one matching label per row, to the overall sets
    X_train += list(category_X_train)
    y_train += [category] * len(category_X_train)
    X_test += list(category_X_test)
    y_test += [category] * len(category_X_test)

# Turn the accumulated lists into numpy arrays
X_train, y_train, X_test, y_test = (
    np.array(X_train),
    np.array(y_train),
    np.array(X_test),
    np.array(y_test),
)


In [7]:
# Count the occurrences of each class label in the training and testing samples
print(Counter(y_train), Counter(y_test), sep='\n')

Counter({'Setosa': 45, 'Versicolor': 45, 'Virginica': 45})
Counter({'Setosa': 5, 'Versicolor': 5, 'Virginica': 5})


In [8]:
# Evaluate the classifier on the held-out samples for K=3 and K=5
accuracy_3, accuracy_5 = (
    knn_accuracy(X_train, y_train, X_test, y_test, k=neighbors)
    for neighbors in (3, 5)
)

# Report both accuracies (in percent)
print(f'Accuracy for K=3: {accuracy_3}%')
print(f'Accuracy for K=5: {accuracy_5}%')

Accuracy for K=3: 100.0%
Accuracy for K=5: 100.0%
