In [19]:
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Fungsi Euclidean Distance

In [20]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Coordinates are paired by position: for every index of ``point1`` the
    matching entry of ``point2`` is subtracted, the squared differences are
    accumulated, and the square root of that total is returned.

    Args:
        point1: Indexable sequence of numbers; its length drives the loop.
        point2: Indexable sequence of numbers, read at the same indices.

    Returns:
        The distance as a float.
    """
    dimensions = len(point1)
    squared_total = 0
    for axis in range(dimensions):
        delta = point1[axis] - point2[axis]
        squared_total += delta ** 2
    return math.sqrt(squared_total)

# Fungsi Manhattan Distance

In [21]:
def manhattan_distance(point1, point2):
    """Return the Manhattan (L1) distance between two points.

    Both inputs are converted to NumPy arrays first, so plain lists and
    tuples are accepted in addition to arrays (subtracting two Python lists
    directly raises ``TypeError``).

    Args:
        point1: Array-like of numbers.
        point2: Array-like of numbers, broadcast-compatible with ``point1``.

    Returns:
        The sum of the absolute coordinate differences as a NumPy scalar.
    """
    first = np.asarray(point1)
    second = np.asarray(point2)
    return np.sum(np.abs(first - second))

# Fungsi utama KNN

In [22]:
def knn_classify(x_train, y_train, x_test, k, distance_fn=None):
    """Classify every row of ``x_test`` by majority vote of its k nearest neighbours.

    Args:
        x_train: Training features; must expose ``.values`` yielding one row
            per sample (e.g. a pandas DataFrame).
        y_train: Training labels; must expose ``.values`` with one label per
            row of ``x_train`` (e.g. a pandas Series).
        x_test: Rows to classify; must expose ``.values``.
        k: Number of neighbours that take part in the vote. Must be >= 1.
            If it exceeds the number of training rows, all rows vote.
        distance_fn: Optional callable ``(train_point, test_point) -> number``
            used as the metric. Defaults to ``euclidean_distance``.

    Returns:
        A list with one predicted label per row of ``x_test``. When several
        labels tie for the highest count, the one whose first occurrence is
        closest to the test point wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, the training set is empty, or
            the number of training rows and labels differ.
    """
    # A non-positive k would make ``distances[:k]`` empty (k == 0) or drop
    # only the farthest points (k < 0), i.e. vote with the wrong neighbours.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if distance_fn is None:
        distance_fn = euclidean_distance

    # Materialise the arrays once instead of on every loop iteration.
    train_points = x_train.values
    train_labels = y_train.values
    if len(train_points) != len(train_labels):
        raise ValueError(
            f"x_train has {len(train_points)} rows but y_train has "
            f"{len(train_labels)} labels"
        )
    if len(train_points) == 0:
        raise ValueError("the training set is empty")

    predictions = []  # One predicted label per test row

    for test_point in x_test.values:
        # Distance from this test point to every training point, with its label
        distances = [
            (distance_fn(train_point, test_point), label)
            for train_point, label in zip(train_points, train_labels)
        ]

        # Stable sort by distance only, then keep the k closest neighbours
        distances.sort(key=lambda pair: pair[0])
        k_nearest_neighbors = distances[:k]

        # Voting: count how often each label occurs among the neighbours
        label_counts = {}
        for _, label in k_nearest_neighbors:
            label_counts[label] = label_counts.get(label, 0) + 1

        # max() returns the first key with the highest count (insertion order)
        predictions.append(max(label_counts, key=label_counts.get))

    return predictions

# Memuat dataset dan membagi data latih/uji (train/test split)

In [23]:
# Read the CSV; every column except the last is used as a feature and the
# last column is used as the label.
dataset = pd.read_csv("./dataset.csv")
x, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Mencari K dengan akurasi tertinggi

In [24]:
def calculate_accuracy(y_true, y_pred):
    """Return the fraction of predictions that match the true labels.

    For two classes this equals (TP + TN) / (TP + TN + FP + FN). Comparing the
    labels directly, rather than indexing a 2x2 confusion matrix, also gives
    the right answer when only one class is present or when there are more
    than two classes.

    Args:
        y_true: Array-like of true labels (pandas Series, list, array, ...).
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        The accuracy as a NumPy float between 0 and 1.

    Raises:
        ValueError: If the inputs are empty or differ in shape.
    """
    # np.asarray drops any pandas index, so labels are compared by position.
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("cannot compute accuracy of an empty label set")

    return np.mean(actual == predicted)

In [25]:
# Sweep the odd values k = 1, 3, ..., 21 and print the accuracy each one
# reaches on the held-out test split.
for k in range(1, 22, 2):
    predicted_label = knn_classify(x_train, y_train, x_test, k)
    accuracy = calculate_accuracy(y_test, predicted_label)
    print(f"Akurasi model KNN dengan k={k}: {accuracy:.2f}")

Akurasi model KNN dengan k=1: 0.68
Akurasi model KNN dengan k=3: 0.65
Akurasi model KNN dengan k=5: 0.66
Akurasi model KNN dengan k=7: 0.69
Akurasi model KNN dengan k=9: 0.72
Akurasi model KNN dengan k=11: 0.73
Akurasi model KNN dengan k=13: 0.77
Akurasi model KNN dengan k=15: 0.76
Akurasi model KNN dengan k=17: 0.77
Akurasi model KNN dengan k=19: 0.75
Akurasi model KNN dengan k=21: 0.74


# Akurasi menggunakan 5-fold cross validation

In [26]:
def k_fold_cross_validation(X, y, k, num_folds):
    """Estimate KNN accuracy with contiguous, unshuffled k-fold cross-validation.

    The rows are cut into ``num_folds`` consecutive folds of
    ``len(X) // num_folds`` rows each. Every fold is used once as the test
    set while all remaining rows form the training set. When ``len(X)`` is
    not divisible by ``num_folds``, the trailing remainder rows are only ever
    used for training.

    Args:
        X: Feature table supporting ``len`` and ``.iloc`` (e.g. a DataFrame).
        y: Labels supporting ``len`` and ``.iloc``, one per row of ``X``.
        k: Number of neighbours passed on to ``knn_classify``.
        num_folds: Number of folds; must be between 2 and ``len(X)``.

    Returns:
        The mean accuracy over all folds. The value is also printed.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``num_folds`` is
            outside ``2 .. len(X)`` (which would leave the training or test
            set empty).
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(f"X has {n_samples} rows but y has {len(y)} labels")
    if num_folds < 2 or num_folds > n_samples:
        raise ValueError(
            f"num_folds must be between 2 and the number of samples "
            f"({n_samples}), got {num_folds}"
        )

    fold_size = n_samples // num_folds
    accuracies = []

    for fold in range(num_folds):
        # Split the data: the fold is a contiguous block, so the training
        # indices are everything before and after it. Building them from two
        # ranges avoids a per-row membership scan of the test indices.
        start, stop = fold * fold_size, (fold + 1) * fold_size
        test_indices = np.arange(start, stop)
        train_indices = np.concatenate([np.arange(start), np.arange(stop, n_samples)])

        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

        # Predict with KNN
        predicted_labels = knn_classify(X_train, y_train, X_test, k)

        # Score this fold
        accuracies.append(calculate_accuracy(y_test, predicted_labels))

    # Average the per-fold accuracies
    average_accuracy = np.mean(accuracies)
    print(f"Akurasi Rata-Rata K-Fold Cross Validation dengan k={k} : {average_accuracy:.4f}")
    return average_accuracy

In [27]:
# Sweep the odd values k = 1, 3, ..., 21, scoring each one with 5-fold
# cross-validation on the full dataset (the function prints its own result).
for k in range(1, 22, 2):
    k_fold_cross_validation(x,y,k,5)

Akurasi Rata-Rata K-Fold Cross Validation dengan k=1 : 0.6549
Akurasi Rata-Rata K-Fold Cross Validation dengan k=3 : 0.7046
Akurasi Rata-Rata K-Fold Cross Validation dengan k=5 : 0.7203
Akurasi Rata-Rata K-Fold Cross Validation dengan k=7 : 0.7281
Akurasi Rata-Rata K-Fold Cross Validation dengan k=9 : 0.7399
Akurasi Rata-Rata K-Fold Cross Validation dengan k=11 : 0.7464
Akurasi Rata-Rata K-Fold Cross Validation dengan k=13 : 0.7477
Akurasi Rata-Rata K-Fold Cross Validation dengan k=15 : 0.7359
Akurasi Rata-Rata K-Fold Cross Validation dengan k=17 : 0.7438
Akurasi Rata-Rata K-Fold Cross Validation dengan k=19 : 0.7438
Akurasi Rata-Rata K-Fold Cross Validation dengan k=21 : 0.7425
