In [None]:
import os
import pandas as pd
from scipy.stats import mode
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
import sys

In [None]:
class KNearestNeighbour:
    """Brute-force k-nearest-neighbour classifier using Euclidean distance.

    Each prediction compares the query sample with every stored training
    sample and returns the most frequent label among the ``k`` closest ones.
    """

    def __init__(self, k):
        """Store the number of neighbours that vote on each prediction.

        Parameters
        ----------
        k : int
            Number of nearest training samples consulted per query (>= 1).

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def fit(self, X_train, y_train):
        """Memorise the training data; no model parameters are learned.

        Parameters
        ----------
        X_train : array-like
            Training samples, one per entry along the first axis.
        y_train : array-like
            Labels aligned with ``X_train`` along the first axis.

        Raises
        ------
        ValueError
            If ``X_train`` is empty or its length differs from ``y_train``.
        """
        # Convert up front: iterating a DataFrame yields column labels and
        # indexing a Series is label-based, both of which would silently
        # break the positional logic in predict().
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train differ in length: "
                f"{len(X_train)} != {len(y_train)}"
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Predict a label for every sample in ``X_test``.

        Parameters
        ----------
        X_test : array-like
            Query samples, one per entry along the first axis.

        Returns
        -------
        numpy.ndarray
            One prediction per query sample. When the stored labels are a
            column vector each prediction keeps that trailing axis.
        """
        X_test = np.asarray(X_test)
        # One flat row per training sample so a single row-wise norm gives
        # the distance from the query to every training sample at once.
        train = self.X_train.reshape(len(self.X_train), -1)
        labels_pred = []
        for x_test in X_test:
            distances = np.linalg.norm(train - np.ravel(x_test), axis=1)
            # Stable sort: equally distant neighbours keep training order,
            # which makes the selection deterministic.
            k_nearest = np.argsort(distances, kind="stable")[:self.k]
            labels = self.y_train[k_nearest]
            # Majority vote among the selected neighbours.
            most_common = mode(labels, keepdims=True)
            labels_pred.append(most_common.mode[0])
        return np.array(labels_pred)

In [None]:
# Location of the cleaned data set. The notebook's original absolute path is
# kept as the default; set the CLEAN_CSV_PATH environment variable to run the
# notebook elsewhere without editing this cell.
CLEAN_CSV_PATH = os.environ.get(
    'CLEAN_CSV_PATH',
    '/home/jovyan/work/3-semester/3-semester-cml1/data/clean.csv',
)
df = pd.read_csv(CLEAN_CSV_PATH)
# Keep the target column "type" as its own single-column frame. The explicit
# copy detaches it from `df`, so assigning to its column later does not
# trigger pandas' SettingWithCopyWarning.
df_y = df[['type']].copy()

In [None]:
# Keep only the float64/int64 columns, then drop the columns
# 'Unnamed: 0', 'floor' and 'zip_code'.
df = (
    df
    .select_dtypes(include=['float64', 'int64'])
    .drop(columns=['Unnamed: 0', 'floor', 'zip_code'])
)

In [None]:
# Number of missing values per column before imputation.
df.isnull().sum()

In [None]:
# Fill missing values with k-nearest-neighbour imputation (5 neighbours).
# Note: despite its name, `median_imputer` holds a KNNImputer.
median_imputer = KNNImputer(n_neighbors=5)
df_imp = df.copy(deep=True)
imputed_values = median_imputer.fit_transform(df_imp)
# Write the imputed matrix back into the copy so its index and column labels are kept.
df_imp.iloc[:, :] = imputed_values

In [None]:
# Number of missing values per column after imputation.
df_imp.isnull().sum()

In [None]:
# Preview the first five imputed rows.
df_imp.head(n=5)

In [None]:
# show correlation matrix
corr = df_imp.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()

In [None]:
# Integer-encode the values of "type" in order of first appearance:
#   types_name maps value -> code, types_nrs maps code -> value.
types_name = {name: nr for nr, name in enumerate(df_y["type"].unique())}
types_nrs = {nr: name for name, nr in types_name.items()}
df_y["type"] = df_y["type"].map(types_name)

In [None]:
# Feature matrix as a plain NumPy array.
X = df_imp.values
# Standardize every column to zero mean and unit variance. A constant column
# has a standard deviation of 0 and would become NaN/inf when divided, so its
# divisor is replaced by 1, which leaves the centred column at 0.
feature_std = X.std(axis=0)
feature_std = np.where(feature_std == 0, 1.0, feature_std)
X = (X - X.mean(axis=0)) / feature_std
# Encoded targets; df_y has a single column, so this is a column vector
# of shape (n_samples, 1).
y = df_y.values

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 21 independent custom 7-nearest-neighbour models, one per list position 0..20.
models = [KNearestNeighbour(k=7) for _model_nr in range(21)]

In [None]:
# Placeholder (0) for the predictions of each of the 21 custom models, keyed 0..20.
models_pred = {model_nr: 0 for model_nr in range(21)}

In [None]:
# One-vs-rest: model i is trained to tell code i (label 1) from every other
# code (label 0); its test-set predictions are stored under key i.
for i in range(21):
    is_class_i = np.where(y_train == i, 1, 0)
    models[i].fit(X_train, is_class_i)
    models_pred[i] = models[i].predict(X_test)

In [None]:
# Print the test accuracy of each custom one-vs-rest model.
for i in range(21):
    y_pred = models_pred[i]
    y_true_binary = np.where(y_test == i, 1, 0)
    print(accuracy_score(y_true_binary, y_pred))

In [None]:
# 21 scikit-learn 7-nearest-neighbour classifiers, one per list position 0..20.
models_sklearn = [KNeighborsClassifier(n_neighbors=7) for _model_nr in range(21)]

In [None]:
# Placeholder (0) for the predictions of each of the 21 scikit-learn models, keyed 0..20.
models_pred_sklearn = {model_nr: 0 for model_nr in range(21)}

In [None]:
# One-vs-rest with the scikit-learn classifiers: model i is trained to tell
# code i (label 1) from every other code (label 0).
for i in range(0, 21):
    # ravel(): hand scikit-learn a 1-D target even when y_train is a column vector.
    models_sklearn[i].fit(X_train, np.where(y_train == i, 1, 0).ravel())
    # Predict with the scikit-learn model itself (previously the custom
    # `models[i]` was used here, so both result sets were identical).
    models_pred_sklearn[i] = models_sklearn[i].predict(X_test)

In [None]:
# Print the test accuracy of each scikit-learn one-vs-rest model.
for i in range(0, 21):
    # Score the scikit-learn predictions (previously `models_pred`, the
    # custom model's predictions, was read here by mistake).
    y_pred = models_pred_sklearn[i]
    print(accuracy_score(np.where(y_test == i, 1, 0), y_pred))