### Necessary Imports
- NumPy
- Pandas
- Math
- Copy
- Train Test Split

In [7]:
# Importing libraries
import numpy as np
import pandas as pd
import math
import copy
from sklearn.model_selection import train_test_split


### Load the Dataset

In [8]:
# Read 'seeds_dataset.csv' (path is relative to the notebook's working directory) into a pandas DataFrame
df = pd.read_csv('seeds_dataset.csv')
# A bare expression on the last line of a cell makes Jupyter render the DataFrame (no print() call involved)
df

Unnamed: 0,Area,Perimeter.,Compactness,Length of kernel,Width of kernel,Asymmetry coefficient,Length of kernel groove,class
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


### Perform Some Preprocessing

In [9]:
# Class labels: the 'class' column as a NumPy array named 'y'
y = np.array(df['class'])

# Feature matrix: every column except 'class', as a NumPy array named 'X'
X = np.array(df.drop(columns='class'))

In [10]:
# Hold out part of the data for testing: train_size=0.75 keeps 75% of the rows for training,
# and random_state=0 makes the shuffle reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=0,
)

In [11]:
# Show the shapes of the four splits, one per line
print(
    f"X train: {X_train.shape}",
    f"X test: {X_test.shape}",
    f"y train: {y_train.shape}",
    f"y test: {y_test.shape}",
    sep="\n",
)

X train: (157, 7)
X test: (53, 7)
y train: (157,)
y test: (53,)


### Define the Following Functions

In [12]:
def euclidean_distance(predicted, target):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    predicted, target : array-like
        Coordinates of the two points. Both are converted to NumPy arrays and
        subtracted element-wise, so their shapes must be compatible.

    Returns
    -------
    The norm of the element-wise difference, as computed by ``np.linalg.norm``.
    """
    difference = np.asarray(predicted) - np.asarray(target)
    return np.linalg.norm(difference)

In [13]:
def get_k_nearest(distances, k):
    """Return the ``k`` entries of ``distances`` with the smallest first element.

    Parameters
    ----------
    distances : iterable of indexable items
        Each item is ordered by its element at index 0 (the distance).
    k : int
        How many entries to keep from the front of the sorted list.

    Returns
    -------
    list
        The first ``k`` items after an ascending, stable sort (items with equal
        distance keep their original relative order).
    """
    # Work on a copy so the caller's sequence is left untouched
    ranked = list(distances)
    ranked.sort(key=lambda pair: pair[0])
    return ranked[:k]

In [14]:
def get_max_class(classes):
    """Return the most frequent value in ``classes``.

    ``np.unique`` returns the distinct values in sorted order together with
    their counts, and ``argmax`` picks the first maximum, so when several
    values share the highest count the smallest of them is returned.

    Parameters
    ----------
    classes : array-like
        The labels to count.

    Returns
    -------
    The label with the highest count (as a NumPy scalar).
    """
    labels, counts = np.unique(classes, return_counts=True)
    return labels[np.argmax(counts)]

In [15]:
def get_classes_only(k_nearest):
    """Return the element at index 1 of every item in ``k_nearest``.

    Parameters
    ----------
    k_nearest : iterable of indexable items
        Each item carries its class label at index 1,
        e.g. ``[(0.1, 'A'), (0.2, 'B'), (0.3, 'A')]``.

    Returns
    -------
    list
        The labels in the same order as the input,
        e.g. ``['A', 'B', 'A']`` for the example above.
    """
    return [neighbour[1] for neighbour in k_nearest]

In [16]:
def calculate_accuracy(prediction, target):
    """Print the fraction of positions where ``prediction`` equals ``target``.

    Parameters
    ----------
    prediction : indexable sequence
        Predicted labels; its length determines how many positions are compared.
    target : indexable sequence
        True labels, compared position by position with ``prediction``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    # Count the positions where the predicted label matches the true label
    count = sum(1 for i in range(len(prediction)) if prediction[i] == target[i])
    # Print the accuracy, don't return it
    print(f"Accuracy is : {count/len(prediction)}")

### Define KNN Running Script Function

In [17]:
# Using the K-Nearest Neighbours (KNN) supervised learning algorithm as the model
# It classifies each wheat seed sample into one of the classes given by the dataset's 'class' column

# Defining the KNN function
def KNN(X_train,X_test,y_train,k=3):
    """Predict a label for every test instance with k-nearest-neighbours voting.

    For each instance in ``X_test`` the Euclidean distance to every instance
    in ``X_train`` is computed, the ``k`` closest training instances are
    selected, and the most frequent label among them becomes the prediction.

    Parameters
    ----------
    X_train : iterable of feature vectors
        The training instances.
    X_test : iterable of feature vectors
        The instances to classify.
    y_train : indexable sequence
        ``y_train[i]`` is the label of the i-th instance of ``X_train``.
    k : int, default 3
        Number of nearest neighbours that vote. The default keeps the value
        that used to be hard-coded, so existing three-argument calls behave
        as before.

    Returns
    -------
    list
        One predicted label per test instance, in the order of ``X_test``.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """
    # Reject a meaningless k up front: a value below 1 would otherwise slice
    # the sorted neighbour list from the wrong end or leave nothing to vote on
    if not isinstance(k, (int, np.integer)) or k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Predicted labels for the test set, in order
    predicted_label = []
    # Iterate over each instance in the test set
    for test_instance in X_test:
        # Pair the distance to every training instance with that instance's label
        distances = [
            (euclidean_distance(test_instance, train_instance), y_train[indx])
            for indx, train_instance in enumerate(X_train)
        ]
        # Keep the k training instances closest to the current test instance
        k_nearest = get_k_nearest(distances, k)
        # Extract the labels of the k nearest training instances
        classes = get_classes_only(k_nearest)
        # The most frequent label among the neighbours is the prediction
        predicted_label.append(get_max_class(classes))
    # Return the list of predicted labels for the test set
    return predicted_label

### Perform Prediction and Show Accuracy

In [18]:
# Predict a label for every test instance (KNN runs with k = 3 neighbours here)
preds = KNN(X_train=X_train, X_test=X_test, y_train=y_train)
# Compare the predictions with the true test labels; the accuracy is printed, not returned
calculate_accuracy(prediction=preds, target=y_test)

Accuracy is : 0.8867924528301887


<hr>

### Test Cases (If Required in Logic Building)

In [13]:
# # Testing euclidean_distance function
# point1 = [1, 2, 3]
# point2 = [4, 5, 6]
# distance = euclidean_distance(point1, point2)
# print(distance)

# Testing get_classes_only function
# k_nearest = [(np.array([1, 2]), 'A'),(np.array([3, 4]), 'B'),(np.array([5, 6]), 'A'),(np.array([7, 8]), 'A'),(np.array([9, 10]), 'B')]
# classes = get_classes_only(k_nearest)
# print(classes)

# # Testing get_max_class function
# classes = ['apple', 'banana', 'banana', 'orange', 'apple', 'banana', 'banana']
# max_class = get_max_class(classes)
# print(max_class)

# # Testing get_k_nearest function
# distances = [(0.3, 'class1'), (0.5, 'class2'), (0.2, 'class1'), (0.4, 'class2'), (0.1, 'class1')]
# k_nearest = get_k_nearest(distances, 3)
# print(k_nearest)

# # Testing calculate_accuracy function
# prediction = [1, 1, 1, 0, 0, 1, 1, 0]
# target = [1, 1, 1, 0, 0, 1, 1, 0]
# calculate_accuracy(prediction, target)


['A', 'B', 'A', 'A', 'B']


<hr>