In [150]:
import numpy as np
import pandas as pd
import math

In [153]:
#METRICS
def distance_euclidean(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like supporting element-wise subtraction
        The two points to compare.

    Returns
    -------
    Square root of the sum of squared element-wise differences.
    """
    squared_diffs = np.square(vector1 - vector2)
    return np.sqrt(np.sum(squared_diffs))

def distance_manhattan(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like supporting element-wise subtraction
        The two points to compare.

    Returns
    -------
    Sum of the absolute element-wise differences.
    """
    return np.sum(np.abs(vector1 - vector2))

def distance_max(vector1, vector2):
    """Return the maximum (Chebyshev / L-infinity) distance between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like supporting element-wise subtraction
        The two points to compare.

    Returns
    -------
    The largest absolute element-wise difference.

    Notes
    -----
    ``np.max`` is used instead of the Python builtin ``max``: it reduces in
    native code, works for inputs of any dimensionality, and propagates NaN
    regardless of where it sits in the array (the builtin's result depends on
    the NaN's position). This matches how the sum-based metrics in this
    notebook behave when a NaN is present.
    """
    return np.max(np.abs(vector1 - vector2))

def distance_cos(vector1, vector2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like supporting element-wise multiplication
        The two vectors to compare.

    Returns
    -------
    ``1 - (v1 . v2) / (|v1| * |v2|)``. There is no guard for a zero-length
    vector; the denominator is then zero.
    """
    dot_product = np.sum(vector1 * vector2)
    norm1 = np.sqrt(np.sum(vector1 * vector1))
    norm2 = np.sqrt(np.sum(vector2 * vector2))
    return 1 - dot_product / (norm1 * norm2)

In [154]:
#KNN
class KNN:
    """Brute-force k-nearest-neighbours classifier for binary labels (0 / 1).

    Parameters
    ----------
    k : int
        Number of nearest neighbours that take part in the vote.
    metrics : callable
        ``metrics(vector_a, vector_b)`` returning the distance between two
        samples (e.g. one of the ``distance_*`` functions).
    """

    def __init__(self, k, metrics):
        self.k = k
        self.metrics = metrics
        # Training data; stays None until train() has been called.
        self.x_train = None
        self.y_train = None

    def train(self, x_train, y_train):
        """Store training samples; repeated calls append to the stored data.

        Parameters
        ----------
        x_train : array-like, one row per sample
        y_train : array-like of 0/1 labels, one per row of ``x_train``
        """
        # Coerce so that lists / DataFrames can be indexed row-wise in predict().
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)
        if self.x_train is not None and self.y_train is not None:
            self.x_train = np.concatenate((self.x_train, x_train), axis=0)
            self.y_train = np.concatenate((self.y_train, y_train), axis=0)
        else:
            self.x_train = x_train
            self.y_train = y_train

    def predict(self, x_test):
        """Predict a 0/1 label for every row of ``x_test``.

        A sample is labelled 0 when strictly more than half of its nearest
        neighbours are labelled 0; otherwise (including ties) it is labelled 1.

        Parameters
        ----------
        x_test : array-like, one row per sample

        Returns
        -------
        numpy.ndarray of float, one predicted label per row of ``x_test``.

        Raises
        ------
        ValueError
            If called before any training data has been supplied.
        """
        if self.x_train is None or self.y_train is None:
            raise ValueError("KNN.predict() called before KNN.train()")

        x_test = np.asarray(x_test)
        n_train = len(self.x_train)
        # With fewer than k training samples only n_train neighbours can vote,
        # so the majority threshold must be based on that smaller number.
        effective_k = min(self.k, n_train)

        y_test = np.empty(len(x_test))
        for counter in range(len(x_test)):
            # Column 0: distance to the training sample, column 1: its label.
            neighbours = np.empty((n_train, 2))
            for i in range(n_train):
                distance = self.metrics(x_test[counter], self.x_train[i,])
                neighbours[i] = [distance, self.y_train[i]]
            neighbours_sorted = neighbours[neighbours[:, 0].argsort()]
            nearest_labels = neighbours_sorted[:effective_k, 1]
            zero_votes = np.count_nonzero(nearest_labels == 0)
            y_test[counter] = 0 if zero_votes > effective_k / 2 else 1
        return y_test

In [155]:
#DATA
df = pd.read_csv('car_data.csv')
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()
# Encode Gender numerically with an explicit mapping. Unlike
# replace([...], [...]), map() does not depend on pandas implicitly
# downcasting the resulting object column to integers.
# Any value other than 'Female' / 'Male' becomes NaN.
df['Gender'] = df['Gender'].map({'Female': 1, 'Male': 0})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User ID       1000 non-null   int64 
 1   Gender        1000 non-null   object
 2   Age           1000 non-null   int64 
 3   AnnualSalary  1000 non-null   int64 
 4   Purchased     1000 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 39.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   User ID       1000 non-null   int64
 1   Gender        1000 non-null   int64
 2   Age           1000 non-null   int64
 3   AnnualSalary  1000 non-null   int64
 4   Purchased     1000 non-null   int64
dtypes: int64(5)
memory usage: 39.2 KB
None


In [156]:
# Plain-text preview of the first five rows after encoding Gender.
print(df.head(n=5))

   User ID  Gender  Age  AnnualSalary  Purchased
0      385       0   35         20000          0
1      681       0   40         43500          0
2      353       0   49         74000          0
3      895       0   40        107500          1
4      661       0   25         79000          0


In [157]:
# Target vector: the "Purchased" column.
result = df["Purchased"].to_numpy()
# Feature matrix: every remaining column.
# NOTE(review): only "Purchased" is dropped, so "User ID" stays in the feature
# matrix and takes part in the distance computations - confirm this is intended.
data = df.drop(columns="Purchased").to_numpy()

In [158]:
#DATA SPLIT INTO TRAIN AND TEST DATASETS
# Index of the first test row: the first two thirds of the rows (in file
# order, no shuffling) form the training set, the remainder the test set.
division = round(2/3*len(df))
train_x, test_x = data[:division], data[division:]
train_y, test_y = result[:division], result[division:]

In [161]:
# QUALITY OF PREDICTION
def labels(true, predicted):
    """Return the confusion-matrix counts for binary (0 / 1) labels.

    Parameters
    ----------
    true : array-like of 0/1
        Ground-truth labels.
    predicted : array-like of 0/1
        Predicted labels, same length as ``true``.

    Returns
    -------
    tuple ``(tp, tn, fp, fn)``
        Counts of true positives, true negatives, false positives and
        false negatives.
    """
    true = np.asarray(true)
    predicted = np.asarray(predicted)

    # Boolean masks instead of arithmetic on the labels: subtraction wraps
    # around for unsigned dtypes and is not defined for boolean arrays.
    actual_pos, actual_neg = true == 1, true == 0
    pred_pos, pred_neg = predicted == 1, predicted == 0

    # np.sum (not the Python builtin sum) reduces in native code and returns
    # NumPy scalars, so a later 0/0 in a metric yields nan instead of raising.
    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)
    return tp, tn, fp, fn

def accuracy(true, predicted):
    """Return the fraction of predictions that match the true labels.

    Computed from the ``labels`` counts as ``(tp + tn) / (tp + tn + fp + fn)``.
    """
    tp, tn, fp, fn = labels(true, predicted)
    correct = tp + tn
    total = correct + fp + fn
    return correct / total

def precision(true, predicted):
    """Return the precision: ``tp / (tp + fp)``.

    The counts come from ``labels``; there is no guard for ``tp + fp == 0``.
    """
    tp, _tn, fp, _fn = labels(true, predicted)
    predicted_positive = tp + fp
    return tp / predicted_positive

def recall(true, predicted):
    """Return the recall: ``tp / (tp + fn)``.

    The counts come from ``labels``; there is no guard for ``tp + fn == 0``.
    """
    tp, _tn, _fp, fn = labels(true, predicted)
    actual_positive = tp + fn
    return tp / actual_positive

def f1(true, predicted):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as ``2 * precision * recall / (precision + recall)``. There is no
    guard for ``precision + recall == 0``.
    """
    # Evaluate each metric once; each call recomputes the confusion counts.
    prec = precision(true, predicted)
    rec = recall(true, predicted)
    return (2 * prec * rec) / (prec + rec)

def prediction_quality(true, predicted):
    """Return ``(accuracy, precision, recall, f1)`` for the given labels.

    Parameters
    ----------
    true : ground-truth labels
    predicted : predicted labels
    """
    return (
        accuracy(true, predicted),
        precision(true, predicted),
        recall(true, predicted),
        f1(true, predicted),
    )

In [164]:
#MAIN
# Number of neighbours: half the square root of the training-set size, rounded.
k = round(math.sqrt(len(train_y)) / 2)
print("Accuracy, Precision, Recall, F1")
# Fit and evaluate one classifier per distance metric on the same split.
for m in (distance_euclidean, distance_manhattan, distance_max, distance_cos):
    print(m.__name__)
    knn = KNN(k=k, metrics=m)
    knn.train(train_x, train_y)
    prediction = knn.predict(test_x)
    print(prediction_quality(test_y, prediction))

Accuracy, Precision, Recall, F1
distance_euclidean
(0.7897897897897898, 0.7699115044247787, 0.6641221374045801, 0.7131147540983606)
distance_manhattan
(0.7957957957957958, 0.7692307692307693, 0.6870229007633588, 0.7258064516129032)
distance_max
(0.7747747747747747, 0.7545454545454545, 0.6335877862595419, 0.6887966804979253)
distance_cos
(0.6516516516516516, 0.5773195876288659, 0.42748091603053434, 0.49122807017543857)
