In [None]:

import numpy as np
import pandas as pd
from scipy.stats import mode 
from typing import Tuple

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Relative path of the dataset CSV that is passed to load_csv below.
csv_path = "diabetes.csv"

Load CSV

In [None]:
def load_csv(csv_path:str):
    """Read a CSV file, shuffle its rows and split it into features and labels.

    The rows are shuffled with a fixed seed (random_state=42) so every run
    yields the same order, and the index is renumbered from 0 afterwards.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A ``(features, labels)`` pair: every column except the last one as a
        DataFrame, and the last column as a Series.
    """
    shuffled = pd.read_csv(csv_path).sample(frac=1, random_state=42)
    shuffled = shuffled.reset_index(drop=True)

    # The last column holds the labels; everything before it is a feature.
    features = shuffled.iloc[:, :-1]
    labels = shuffled.iloc[:, -1]
    return features, labels

In [None]:
# Load the shuffled dataset and display the features and the labels.
x,y = load_csv(csv_path)
x,y

In [None]:
# Column-wise (axis=0) mean and variance of the features.
np.mean(x, axis=0), np.var(x, axis=0)

In [None]:
# NaN-ignoring variants of the same column-wise statistics.
np.nanmean(x, axis=0), np.nanvar(x, axis=0)

In [None]:
# Dimensions of the feature table as (rows, columns).
x.shape

In [None]:
# Count, per column, how many feature values are negative.
(x < 0.0).sum()

Train/test split

In [None]:
def train_test_split(features:pd.DataFrame, labels:pd.DataFrame, test_split_ratio:float):
    """Split features and labels into a leading train part and a trailing test part.

    No shuffling happens here: the first rows become the training set and
    the last ``int(len(features) * test_split_ratio)`` rows the test set.

    Args:
        features: Feature table, one row per sample.
        labels: Labels, positionally matching the rows of ``features``.
        test_split_ratio: Fraction of the rows to place in the test set.

    Returns:
        ``(x_train, y_train, x_test, y_test)``, each with its index
        renumbered from 0.
    """
    total = len(features)
    n_test = int(total * test_split_ratio)
    n_train = total - n_test
    assert total == n_test + n_train, "Size mismatch!"

    parts = (
        features.iloc[:n_train, :],
        labels.iloc[:n_train],
        features.iloc[n_train:, :],
        labels.iloc[n_train:],
    )
    # Renumber every piece so each one starts again at index 0.
    return tuple(part.reset_index(drop=True) for part in parts)

In [None]:
# Keep roughly the first 80% of the shuffled rows for training and hold out the rest for testing.
x_train, y_train, x_test, y_test = train_test_split(x, y, 0.2)

In [None]:
def euclidean(points:pd.DataFrame, element_of_x:pd.Series) -> pd.Series:
    """Euclidean distance from every row of ``points`` to one sample.

    Args:
        points: Table with one point per row.
        element_of_x: A single sample; it is matched to the columns of
            ``points`` by label.

    Returns:
        One distance per row of ``points``, carrying the same index.
    """
    offsets = points - element_of_x
    squared_lengths = offsets.pow(2).sum(axis=1)
    return squared_lengths.pow(1/2)

In [None]:
# Distance from the first test sample to every training sample.
euclidean(x_train, x_test.iloc[0])

In [None]:
def predict(x_train:pd.DataFrame, y_train:pd.DataFrame, x_test:pd.DataFrame, k:int) -> pd.Series:
    """Classify every test sample by majority vote among its k nearest neighbours.

    Args:
        x_train: Training features, one row per sample.
        y_train: Training labels; must share the index of ``x_train`` because
            distances and labels are paired up by index.
        x_test: Test features with the same columns as ``x_train``.
        k: Number of nearest neighbours that vote; must be at least 1.

    Returns:
        A Series with one predicted label per row of ``x_test``, in row order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    labels_pred = []
    for _, x_test_element in x_test.iterrows():
        # Determine the distance from this test sample to every training sample.
        neighbours = pd.DataFrame({
            'distances': euclidean(x_train, x_test_element),
            'labels': y_train,
        }).sort_values(by='distances')

        # Pick the most frequent label among the k closest neighbours.
        # Series.mode() returns the modes in sorted order, so a tie resolves
        # to the smallest label. scipy.stats.mode(...).mode[0] is avoided
        # because `.mode` is a scalar for 1-D input since SciPy 1.11.
        label_pred = neighbours['labels'].iloc[:k].mode().iloc[0]

        labels_pred.append(label_pred)
    return pd.Series(labels_pred)


In [None]:
# Classify every test sample using its 2 nearest neighbours.
# predict() needs the test features (x_test), not the test labels.
y_preds = predict(x_train, y_train, x_test, 2)

In [None]:
def accuracy(y_test:pd.Series, y_preds:pd.Series) -> float:
    """Percentage of predictions that equal the true labels.

    Args:
        y_test: True labels.
        y_preds: Predicted labels, compared element-wise with ``y_test``.

    Returns:
        The share of matching entries, scaled to the range 0-100.
    """
    matches = y_test == y_preds
    n_correct = matches.sum()
    return n_correct / len(y_test) * 100

In [None]:
# Share of test labels predicted correctly, as a percentage.
accuracy(y_test, y_preds)

In [None]:
def plot_confusion_matrix(y_test:pd.DataFrame, y_preds:pd.DataFrame) -> np.ndarray:
    """Draw the confusion matrix as an annotated heatmap and return it.

    Args:
        y_test: True labels.
        y_preds: Predicted labels.

    Returns:
        The matrix computed by ``sklearn.metrics.confusion_matrix``.
    """
    conf_matrix = confusion_matrix(y_test, y_preds)
    # fmt='d' prints the counts as plain integers; the default format
    # switches to scientific notation once a cell reaches three digits.
    sns.heatmap(conf_matrix, annot=True, fmt='d')
    # Return the computed matrix, not the sklearn function of similar name.
    return conf_matrix

In [None]:
# Draw the confusion matrix of the predictions as an annotated heatmap.
plot_confusion_matrix(y_test, y_preds)

In [None]:
def best_k() -> Tuple[int, float]:
    """Find the k in 1..20 that gives the highest accuracy on the test split.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` variables. When several k values tie, the smallest one wins
    because only a strictly better accuracy replaces the current best.

    Returns:
        ``(k, accuracy)`` with the accuracy in percent, rounded to two
        decimals. ``k`` is -1 if no k scored above 0.
    """
    # Local names must differ from the accuracy() function: assigning to a
    # local called `accuracy` would shadow it and make the call below fail.
    best_accuracy = 0
    best_idx = -1
    for k in range(1, 21):
        y_preds = predict(x_train, y_train, x_test, k)
        new_accuracy = accuracy(y_test, y_preds)
        if new_accuracy > best_accuracy:
            best_accuracy = new_accuracy
            best_idx = k
    return (best_idx, round(best_accuracy, 2))

In [None]:
# a receives the selected k, b its accuracy rounded to two decimals.
a,b = best_k()