# Import Libraries

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix
from collections import Counter

In [21]:
import math

# Distance Measurement methods

In [14]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

In [22]:
def modified_euc_dis(x1, x2):
    total_sum = 0
    for i, j in zip(x1, x2):
        total_sum += (i-j)**2
    return math.sqrt(total_sum)

In [31]:
x3 = np.array([3, 6, 7.5, 2.5]*1000)
x17 = np.array([6, 2.9, 6.8, 3.5]*1000)

In [32]:
%%time
print(euclidean_distance(x3, x17))

141.77446878757826
CPU times: user 144 µs, sys: 63 µs, total: 207 µs
Wall time: 188 µs


In [33]:
%%time
print(modified_euc_dis(x3, x17))

141.77446878757976
CPU times: user 4.2 ms, sys: 391 µs, total: 4.59 ms
Wall time: 4.73 ms


In [107]:
def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))

In [106]:
def minkowski_distance(x1, x2, p=3):
    return np.power(np.sum(np.power(np.abs(x1 - x2), p)), 1/p)

In [105]:
def cosine_distance(x1, x2):
    return 1 - cosine_similarity([x1], [x2])[0][0]

In [10]:
def knn(x_train, y_train, x_val, k, distance_measure='euclidean'):
    if distance_measure == 'euclidean':
        distance_fn = euclidean_distance
    elif distance_measure == 'manhattan':
        distance_fn = manhattan_distance
    elif distance_measure == 'minkowski':
        distance_fn = lambda x1, x2: minkowski_distance(x1, x2, p=3)
    elif distance_measure == 'cosine':
        distance_fn = cosine_distance
    else:
        raise ValueError("Invalid distance measure")
        
    distances = []
    for i in range(len(x_train)):
        distance = distance_fn(x_train[i], x_val)
        distances.append((distance, y_train[i]))
    distances = sorted(distances)
    targets = [y for _, y in distances[:k]]
    return Counter(targets).most_common(1)[0][0]

In [116]:
Counter(targets).most_common(1)[0][0]

2

In [62]:
# Load the iris dataset
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)

In [63]:
data.columns = ["sepal length", "sepal width", "petal length", "petal width", "flower_class"]
data.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,flower_class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [66]:
# statistical summary
data.describe()

Unnamed: 0,sepal length,sepal width,petal length,petal width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [50]:
data["flower_class"].describe()

count             150
unique              3
top       Iris-setosa
freq               50
Name: flower_class, dtype: object

In [58]:
data["flower_class"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [19]:
data.sample(10)

Unnamed: 0,0,1,2,3,4
88,5.6,3.0,4.1,1.3,Iris-versicolor
109,7.2,3.6,6.1,2.5,Iris-virginica
77,6.7,3.0,5.0,1.7,Iris-versicolor
147,6.5,3.0,5.2,2.0,Iris-virginica
68,6.2,2.2,4.5,1.5,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor
133,6.3,2.8,5.1,1.5,Iris-virginica
55,5.7,2.8,4.5,1.3,Iris-versicolor
64,5.6,2.9,3.6,1.3,Iris-versicolor
48,5.3,3.7,1.5,0.2,Iris-setosa


In [61]:
data.groupby('flower_class').count()

Unnamed: 0_level_0,sepal length,sepal width,petal length,petal width
flower_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,50,50,50,50
Iris-versicolor,50,50,50,50
Iris-virginica,50,50,50,50


In [67]:
# Assign the features and labels to X and y respectively
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [72]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [109]:
# Set the number of neighbors to consider (k)
k_lst = [1, 3, 5, 7, 9, 11, 13]

# Set the number of folds for k-fold cross-validation
k_folds_lst = [3, 5, 10]

# Set the distance measure to use
distance_measure_lst = ['euclidean', 'manhattan', 'minkowski']

# Initialize the cross-validation folds
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Initialize the accuracy scores
val_accuracies = []

for k in k_lst:
    for k_folds in k_folds_lst:
        for distance_measure in distance_measure_lst:
            print(f"K-neightbours: {k}\tK-fold: {k_folds}\tDistance Algo: {distance_measure}")
            # Initialize the accuracy scores
            val_accuracies = []
            # Perform k-fold cross-validation
            for train_index, val_index in kfold.split(X_train):
                X_cv_train, X_cv_val = X_train[train_index], X_train[val_index]
                y_cv_train, y_cv_val = y_train[train_index], y_train[val_index]

                val_predictions = []
                for x in X_cv_val:
                    val_predictions.append(knn(X_cv_train, y_cv_train, x, k, distance_measure))

                val_accuracy = np.mean(val_predictions == y_cv_val)
                val_accuracies.append(val_accuracy)
            
            # Calculate the average validation accuracy score
            avg_val_accuracy = np.mean(val_accuracies)
            print("Average Validation Accuracy: {:.2f}%".format(avg_val_accuracy * 100))
            
            # Calculate the test accuracy
            test_predictions = []
            for x in X_test:
                test_predictions.append(knn(X_train, y_train, x, k, distance_measure))
            test_accuracy = np.mean(test_predictions == y_test)
            print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))
            print("============================")

K-neightbours: 1	K-fold: 3	Distance Algo: euclidean
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 3	Distance Algo: manhattan
Average Validation Accuracy: 93.33%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 3	Distance Algo: minkowski
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 5	Distance Algo: euclidean
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 5	Distance Algo: manhattan
Average Validation Accuracy: 93.33%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 5	Distance Algo: minkowski
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 10	Distance Algo: euclidean
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 10	Distance Algo: manhattan
Average Validation Accuracy: 93.33%
Test Accuracy: 100.00%
K-neightbours: 1	K-fold: 10	Distance Algo: minkowski
Average Validation Accuracy: 94.17%
Test Accuracy: 100.00

In [85]:
# understanding kfold splitng and its output
for t_i, v_i in kfold.split(X_train):
    print("Train Data: ", t_i)
    print("Train Data Length: ", len(t_i), '\n')
    print("Validation Data: ", v_i)
    print("Validation Data Length: ", len(v_i))
    print('---------------------------')

Train Data:  [  1   2   3   5   6   7   8   9  12  13  14  15  16  17  19  20  21  22
  23  24  25  27  28  29  30  32  33  34  35  37  38  39  41  42  43  46
  48  49  50  51  52  53  54  56  57  58  59  60  61  63  66  67  68  69
  71  72  74  75  76  77  78  79  80  81  82  83  84  85  86  87  90  92
  93  94  95  96  97  98  99 100 101 102 103 105 106 108 110 111 112 113
 114 115 116 117 118 119]
Train Data Length:  96 

Validation Data:  [  0   4  10  11  18  26  31  36  40  44  45  47  55  62  64  65  70  73
  88  89  91 104 107 109]
Validation Data Length:  24
---------------------------
Train Data:  [  0   1   2   3   4   6   7   8  10  11  13  14  16  17  18  19  20  21
  23  26  27  29  31  32  34  35  36  37  38  39  40  41  43  44  45  46
  47  48  49  50  51  52  54  55  57  58  59  60  61  62  63  64  65  66
  67  68  70  71  72  73  74  75  77  79  80  81  82  83  84  86  87  88
  89  91  92  93  94  95  99 100 101 102 103 104 105 106 107 108 109 111
 112 113 115 116 117

In [100]:
val_accuracies

[0.9166666666666666,
 0.9583333333333334,
 0.9166666666666666,
 0.9166666666666666,
 1.0]

In [101]:
# Calculate the average validation accuracy score
avg_val_accuracy = np.mean(val_accuracies)
print("Average Validation Accuracy: {:.2f}%".format(avg_val_accuracy * 100))

Average Validation Accuracy: 94.17%


In [102]:
# Calculate the test accuracy
test_predictions = []
for x in X_test:
    test_predictions.append(knn(X_train, y_train, x, k, distance_measure))
test_accuracy = np.mean(test_predictions == y_test)
print("Test Accuracy: {:.2f}%".format(test_accuracy * 100))

Test Accuracy: 100.00%


In [103]:
# Calculate the confusion matrix on the test data
cm = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
