In [1]:
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class KNN:
    """Minimal k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that take part in the vote. If ``k`` exceeds the
        number of training samples, every training sample votes.

    Attributes
    ----------
    X_train : numpy.ndarray or None
        Training features as a float array; ``None`` until ``fit`` is called.
    y_train : numpy.ndarray or None
        Training labels, positionally aligned with ``X_train``.
    """

    def __init__(self, k=3):
        # Reject unusable k up front instead of failing later with an opaque
        # IndexError when the vote runs over zero neighbours.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training set.

        Parameters
        ----------
        X_train : array-like
            One row per training sample. Lists, arrays and DataFrames are
            accepted; the values are converted to float.
        y_train : array-like
            One label per training sample, in the same order as ``X_train``.

        Raises
        ------
        ValueError
            If the training set is empty or the two inputs differ in length.
        """
        # Float conversion keeps the distance arithmetic from wrapping around
        # on unsigned / narrow integer dtypes.
        X_train = np.asarray(X_train, dtype=float)
        # Plain ndarray so that neighbour lookup is positional; a pandas
        # Series with a shuffled index would otherwise be indexed by label.
        y_train = np.asarray(y_train)

        if len(X_train) == 0:
            raise ValueError("Cannot fit KNN on an empty training set")
        if len(X_train) != len(y_train):
            raise ValueError(
                f"X_train and y_train differ in length: {len(X_train)} != {len(y_train)}"
            )

        self.X_train = X_train
        self.y_train = y_train

    def _check_fitted(self):
        """Raise ``RuntimeError`` if ``fit`` has not stored a training set yet."""
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNN must be fitted with fit() before predicting")

    def _euclidean_distance(self, a, b):
        """Return the Euclidean distance between two points.

        Kept as a standalone helper; ``_predict_single`` computes the same
        quantity for all training rows at once.
        """
        diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
        return np.sqrt(np.sum(diff ** 2))

    def _predict_single(self, x_test):
        """Predict the label of a single sample.

        The ``k`` closest training samples vote. Equal distances are resolved
        in favour of the lower training index, and a tied vote goes to the
        label that appears first when neighbours are visited nearest-first.
        """
        self._check_fitted()
        x_test = np.asarray(x_test, dtype=float)

        # Distance from x_test to every training row in one vectorised step.
        # The reshape lets 1-D training data (one scalar per sample) share the
        # same code path as the usual 2-D layout.
        diffs = self.X_train - x_test
        distances = np.sqrt(np.sum(diffs.reshape(len(diffs), -1) ** 2, axis=1))

        # A stable sort makes tie-breaking deterministic across numpy builds.
        knn_indices = distances.argsort(kind="stable")[:self.k]

        # Labels of the k nearest neighbours, nearest first.
        knn_labels = self.y_train[knn_indices]

        # Majority vote.
        most_common = Counter(knn_labels).most_common(1)
        return most_common[0][0]

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like
            One row per sample. Converted to a float array first, so a
            DataFrame is iterated row by row rather than by column name.

        Returns
        -------
        numpy.ndarray
            Predicted labels, one per row of ``X_test``.
        """
        self._check_fitted()
        X_test = np.asarray(X_test, dtype=float)
        predictions = [self._predict_single(x) for x in X_test]
        return np.array(predictions)

# Load the salary table and set the target column aside.
df = pd.read_csv('Salaries.csv')
output_data = df['salary_more_than_100k']

# One encoder per categorical column so each mapping can be reused later.
le_company, le_job, le_degree = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Build the feature frame: drop the target, append the integer-encoded
# columns, then discard the original text columns.
# NOTE(review): label encoding assigns arbitrary integer codes that the
# Euclidean metric in KNN treats as ordered magnitudes; confirm this is intended.
input_data = (
    df.drop(columns='salary_more_than_100k')
    .assign(
        company_n=le_company.fit_transform(df['company']),
        job_n=le_job.fit_transform(df['job']),
        degree_n=le_degree.fit_transform(df['degree']),
    )
    .drop(columns=['company', 'job', 'degree'])
)

print("Data types after encoding:")
print(input_data.dtypes)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=42,
)

# Fit the hand-written classifier on raw numpy arrays and score the held-out rows.
k = 3
knn = KNN(k=k)
knn.fit(x_train.to_numpy(), y_train.to_numpy())
predictions = knn.predict(x_test.to_numpy())
print("Predictions:", predictions)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = (predictions == y_test.to_numpy()).mean()
print("Accuracy:", accuracy)


Data types after encoding:
company_n    int32
job_n        int32
degree_n     int32
dtype: object
Predictions: [1 1 1 0]
Accuracy: 0.25
