# PreProcess Data

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

hrLink = 'HayesRoth/hayes-roth.data'

hrDF = pd.read_csv(hrLink, header=None)
hrDF = hrDF.apply(LabelEncoder().fit_transform)
hrFeatures = hrDF.iloc[:, :-1].values.tolist()
hrLabels = hrDF.iloc[:, -1].tolist()


hrTestLink = 'HayesRoth/hayes-roth.test'

hrDFTest = pd.read_csv(hrLink, header=None)
hrDFTest = hrDFTest.apply(LabelEncoder().fit_transform)
hrTestFeatures = hrDFTest.iloc[:, :-1].values.tolist()
hrTestLabels = hrDF.iloc[:, -1].tolist()

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

carLink = 'CarEvaluation/car.data'

carDF = pd.read_csv(carLink, header=None)
carDF = carDF.apply(LabelEncoder().fit_transform)
carFeatures = carDF.iloc[:, :-1].values.tolist()
carLabels = carDF.iloc[:, -1].tolist()

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

cancerLink = 'BreastCancer/breast-cancer.data'

cancerDF = pd.read_csv(cancerLink, header=None)
cancerDF = cancerDF.apply(LabelEncoder().fit_transform)
cancerFeatures = cancerDF.iloc[:, :-1].values.tolist()
cancerLabels = cancerDF.iloc[:, -1].tolist()

# KNN without SKLearn

In [5]:
import random
from heapq import nsmallest
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class KNNClassifier:
    
    def __init__(self, k=7, dis_metric='euclidean'):
        self.k = k
        self.dis_metric = dis_metric
        self.train_data = []
        self.train_labels = []
    

    def train(self, train_data, train_labels):
        self.train_data = train_data
        self.train_labels = train_labels
        
    def euclidean_distance(self, v1, v2):
        if len(v1) != len(v2):
            raise ValueError("Undefined for sequences of unequal length.")
        return sum((a - b) ** 2 for a, b in zip(v1, v2)) ** 0.5


    def get_neighbors(self, test_row):
        dis_metrics = {
            'euclidean': self.euclidean_distance,
        }
        if self.dis_metric not in dis_metrics:
            raise ValueError("Invalid distance metric")
        calc_distance = dis_metrics[self.dis_metric]
        distances = [
            (train_row, calc_distance(test_row, train_row), label)
            for train_row, label in zip(self.train_data, self.train_labels)
        ]
        k_nearest = nsmallest(self.k, distances, key=lambda x: x[1])
        return k_nearest


    def predict(self, x_test):
        predictions = []
        for test_case in x_test:
            neighbors = self.get_neighbors(test_case)
            output = [row[-1] for row in neighbors]
            prediction = Counter(output).most_common(1)[0][0]
            predictions.append(prediction)
        return predictions


# KFold cross validation

In [203]:
class kFoldCV:
    def __init__(self, classifier):
        self.classifier = classifier  
    
    def cross_val_split(self, dataset, num_folds):
        data_split = []
        data_copy = list(dataset)
        fold_size = len(dataset) // num_folds
        for _ in range(num_folds):
            if fold_size <= len(data_copy):
                fold = random.sample(data_copy, fold_size)
            else:
                fold = data_copy  # If the fold size is larger than the remaining data, use all the data
            data_split.append(fold)
            data_copy = [row for row in data_copy if row not in fold]
        return data_split

    def k_foldEvaluate(self, dataset, num_folds):
        folds = self.cross_val_split(dataset, num_folds)
        scores = []
        

        for fold_number, fold in enumerate(folds, 1):
            train_set = [row for row in dataset if row not in fold]
            test_set = [row for row in fold]
            train_labels, train_set = zip(*[(row[-1], row[:-1]) for row in train_set])
            test_labels, test_set = zip(*[(row[-1], row[:-1]) for row in test_set])
            self.classifier.train(train_set, train_labels)
            predicted = self.classifier.predict(test_set)
            accuracy = self.calculate_accuracy(test_labels, predicted)
            finalAccuracy = round(accuracy, 2)
            scores.append(finalAccuracy)
        
            print(f"Fold {fold_number}: Accuracy = {finalAccuracy}%")
            
        return scores

       
    def calculate_accuracy(self, actual, predicted):
        assert len(actual) == len(predicted)
        correct = sum(1 for a, p in zip(actual, predicted) if a == p)
        return (correct / len(actual)) * 100.0

In [7]:
def print_metrics(actual, predictions):
    if actual is None or predictions is None:
        print("Error: Both 'actual' and 'predictions' must be valid lists.")
        return
    
    assert len(actual) == len(predictions)
    correct = sum(1 for a, p in zip(actual, predictions) if a == p)
    accuracy = (correct / len(actual)) * 100.0
    print("Accuracy of kNN model: {:.2f}%".format(accuracy))
    return accuracy

# Packages used for this KNN

In [188]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


import numpy as np
from scipy import stats


# Hayes Roth Dataset Accuracy

In [221]:
X_train, X_test, y_train, y_test = train_test_split(hrFeatures, hrLabels, test_size=0.2)
knn = KNNClassifier()
knn.train(X_train, y_train)
hr_euc_pred = knn.predict(X_test)
nosklearnAccuracyHR = print_metrics(y_test, hr_euc_pred)
kfcv = kFoldCV(knn)
KFoldNoSkl = kfcv.k_foldEvaluate(hrTestFeatures, 10)
 
# SKlearn
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)
sk_accuracy_HR = accuracy_score(y_test, y_pred)
print(f"Accuracy of kNN model using SKlearn: {sk_accuracy_HR * 100:.2f}%")

X = np.vstack((X_train, X_test))
y = np.hstack((y_train, y_test))

# Define 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn_classifier, X, y, cv=kfold, scoring='accuracy')

hrarray = []
for i, score in enumerate(cv_scores):
    print(f"Fold {i + 1}: Accuracy = {score * 100:.2f}%")
    hrarray.append(round(score*100,2))



Accuracy of kNN model: 22.22%
Fold 1: Accuracy = 38.46%
Fold 2: Accuracy = 38.46%
Fold 3: Accuracy = 61.54%
Fold 4: Accuracy = 53.85%
Fold 5: Accuracy = 15.38%
Fold 6: Accuracy = 38.46%
Fold 7: Accuracy = 30.77%
Fold 8: Accuracy = 46.15%
Fold 9: Accuracy = 46.15%
Fold 10: Accuracy = 38.46%
Accuracy of kNN model using SKlearn: 48.15%
Fold 1: Accuracy = 14.29%
Fold 2: Accuracy = 50.00%
Fold 3: Accuracy = 61.54%
Fold 4: Accuracy = 53.85%
Fold 5: Accuracy = 30.77%
Fold 6: Accuracy = 46.15%
Fold 7: Accuracy = 76.92%
Fold 8: Accuracy = 53.85%
Fold 9: Accuracy = 46.15%
Fold 10: Accuracy = 69.23%


In [222]:
import numpy as np
from scipy import stats


# Sample data for two groups
normalKNN = KFoldNoSkl
sklearnKNN = hrarray
print(normalKNN)
print(sklearnKNN)
# Perform the independent samples t-test
t_stat, p_value = stats.ttest_ind(normalKNN, sklearnKNN)

print(t_stat)
print(p_value)

# Check the p-value to determine significance
if p_value < 0.05:  # 0.05 is a common significance level
    print("A notable significant difference exists between KNN and SKLearn-KNN Accuracies.")
else:
    print("No significant difference between KNN and SKLearn-KNN Accuracies.")
    



[38.46, 38.46, 61.54, 53.85, 15.38, 38.46, 30.77, 46.15, 46.15, 38.46]
[14.29, 50.0, 61.54, 53.85, 30.77, 46.15, 76.92, 53.85, 46.15, 69.23]
-1.3662927000951348
0.18867435817517453
No significant difference between KNN and SKLearn-KNN Accuracies.


# Car Evaluation Dataset Accuracy

In [253]:
X_train, X_test, y_train, y_test = train_test_split(carFeatures, carLabels, test_size=0.2)
knn = KNNClassifier()
knn.train(X_train, y_train)
car_euc_pred = knn.predict(X_test)
nosklearnAccuracyCar = print_metrics(y_test, car_euc_pred)
kfcv = kFoldCV(knn)
ceNoSKLArray = kfcv.k_foldEvaluate(carFeatures, 10)


# SKlearn
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)
sk_accuracy_Car = accuracy_score(y_test, y_pred)
print(f"Accuracy of kNN model using SKlearn: {sk_accuracy_Car * 100:.2f}%")


X = np.vstack((X_train, X_test))
y = np.hstack((y_train, y_test))

# Define 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn_classifier, X, y, cv=kfold, scoring='accuracy')
cearray = []
for i, score in enumerate(cv_scores):
    print(f"Fold {i + 1}: Accuracy = {score * 100:.2f}%")
    cearray.append(round(score*100,2))



Accuracy of kNN model: 92.20%
Fold 1: Accuracy = 3.49%
Fold 2: Accuracy = 1.74%
Fold 3: Accuracy = 2.91%
Fold 4: Accuracy = 2.33%
Fold 5: Accuracy = 1.16%
Fold 6: Accuracy = 2.33%
Fold 7: Accuracy = 2.33%
Fold 8: Accuracy = 1.74%
Fold 9: Accuracy = 1.74%
Fold 10: Accuracy = 3.49%
Accuracy of kNN model using SKlearn: 93.06%
Fold 1: Accuracy = 93.64%
Fold 2: Accuracy = 93.06%
Fold 3: Accuracy = 95.38%
Fold 4: Accuracy = 92.49%
Fold 5: Accuracy = 95.95%
Fold 6: Accuracy = 96.53%
Fold 7: Accuracy = 89.60%
Fold 8: Accuracy = 89.60%
Fold 9: Accuracy = 91.28%
Fold 10: Accuracy = 93.60%


In [255]:
import numpy as np
from scipy import stats


# Sample data for two groups
normalKNN = ceNoSKLArray
sklearnKNN = cearray
print(normalKNN)
print(sklearnKNN)

# Perform the independent samples t-test
t_stat, p_value = stats.ttest_ind(normalKNN, sklearnKNN)

print(t_stat)
print(p_value)

# Check the p-value to determine significance
if p_value < 0.05:  # 0.05 is a common significance level
    print("A notable significant difference exists between KNN and SKLearn-KNN Accuracies.")
else:
    print("No significant difference between KNN and SKLearn-KNN Accuracies.")
    

[3.49, 1.74, 2.91, 2.33, 1.16, 2.33, 2.33, 1.74, 1.74, 3.49]
[93.64, 93.06, 95.38, 92.49, 95.95, 96.53, 89.6, 89.6, 91.28, 93.6]
-111.89012623558473
4.809953730856732e-27
A notable significant difference exists between KNN and SKLearn-KNN Accuracies.


# Breast Cancer Dataset Accuracy

In [235]:
X_train, X_test, y_train, y_test = train_test_split(cancerFeatures, cancerLabels, test_size=0.2)
knn = KNNClassifier()
knn.train(X_train, y_train)
cancer_euc_pred = knn.predict(X_test)
nosklearnAccuracyCancer = print_metrics(y_test, cancer_euc_pred)
kfcv = kFoldCV(knn)
bcnoSKLarray = kfcv.k_foldEvaluate(cancerFeatures, 10)

# SKlearn
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)
sk_accuracy_Cancer = accuracy_score(y_test, y_pred)
print(f"Accuracy of kNN model using SKlearn: {sk_accuracy_Cancer * 100:.2f}%")


X = np.vstack((X_train, X_test))
y = np.hstack((y_train, y_test))

# Define 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(knn_classifier, X, y, cv=kfold, scoring='accuracy')
bcarray = []
for i, score in enumerate(cv_scores):
    print(f"Fold {i + 1}: Accuracy = {score * 100:.2f}%")
    bcarray.append(round(score*100,2))

Accuracy of kNN model: 79.31%
Fold 1: Accuracy = 17.86%
Fold 2: Accuracy = 32.14%
Fold 3: Accuracy = 17.86%
Fold 4: Accuracy = 32.14%
Fold 5: Accuracy = 32.14%
Fold 6: Accuracy = 39.29%
Fold 7: Accuracy = 32.14%
Fold 8: Accuracy = 25.0%
Fold 9: Accuracy = 25.0%
Fold 10: Accuracy = 25.0%
Accuracy of kNN model using SKlearn: 77.59%
Fold 1: Accuracy = 75.86%
Fold 2: Accuracy = 72.41%
Fold 3: Accuracy = 86.21%
Fold 4: Accuracy = 68.97%
Fold 5: Accuracy = 75.86%
Fold 6: Accuracy = 72.41%
Fold 7: Accuracy = 71.43%
Fold 8: Accuracy = 75.00%
Fold 9: Accuracy = 71.43%
Fold 10: Accuracy = 71.43%


In [236]:
import numpy as np
from scipy import stats


# Sample data for two groups
normalKNN = bcnoSKLarray
sklearnKNN = bcarray

print(normalKNN)
print(sklearnKNN)

# Perform the independent samples t-test
t_stat, p_value = stats.ttest_ind(normalKNN, sklearnKNN)

print(t_stat)
print(p_value)

# Check the p-value to determine significance
if p_value < 0.05:  # 0.05 is a common significance level
    print("A notable significant difference exists between KNN and SKLearn-KNN Accuracies.")
else:
    print("No significant difference  between KNN and SKLearn-KNN Accuracies.")



[17.86, 32.14, 17.86, 32.14, 32.14, 39.29, 32.14, 25.0, 25.0, 25.0]
[75.86, 72.41, 86.21, 68.97, 75.86, 72.41, 71.43, 75.0, 71.43, 71.43]
-17.40237474899569
1.0479826208634378e-12
A notable significant difference exists between KNN and SKLearn-KNN Accuracies.
