In [None]:
import numpy as np
import pandas as pd
import json
from collections import defaultdict
from IPython.display import display  # ‡πÉ‡∏ä‡πâ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÅ‡∏™‡∏î‡∏á DataFrame ‡πÉ‡∏ô Jupyter Notebook

class FuzzyKNN:
    """
    Fuzzy k-nearest-neighbours classifier working on a precomputed distance matrix.

    For every test sample the k closest training samples are selected. Each
    neighbour contributes a weight of 1 / d ** (2 / (m - 1)), the weights are
    normalised to sum to 1 and accumulated per label; the label with the
    largest accumulated membership is the prediction.
    """

    def __init__(self, k=5, m=2):
        """
        Fuzzy K-Nearest Neighbors classifier.

        Parameters:
        - k: number of nearest neighbours to consult (positive integer; an odd
          value is recommended)
        - m: fuzziness parameter; must be greater than 1 because the neighbour
          weights use the exponent 2 / (m - 1)

        Raises:
        - ValueError: if k is not a positive integer or m is not greater than 1
        """
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        # "not m > 1" (rather than "m <= 1") also rejects NaN.
        if not m > 1:
            raise ValueError(f"m must be greater than 1, got {m!r}")
        self.k = k
        self.m = m

    @staticmethod
    def _class_memberships(labels, distances, exponent):
        """Return a {label: membership} mapping for one test sample's neighbours."""
        # An exact match (distance 0) would divide by zero below, so it gets
        # full membership in the matching neighbour's class instead.
        if np.any(distances == 0):
            return {labels[np.argmin(distances)]: 1.0}

        weights = 1 / (distances ** exponent)
        total_weight = np.sum(weights)

        membership = defaultdict(float)
        for label, weight in zip(labels, weights):
            membership[label] += weight / total_weight
        return membership

    def predict(self, train_labels, test_labels, distance_matrix):
        """
        Predict the label of every test sample from precomputed distances.

        Parameters:
        - train_labels: sequence of training labels, one per row of
          distance_matrix
        - test_labels: sequence of true test labels, one per column of
          distance_matrix (only copied into the result for comparison)
        - distance_matrix: 2-D array-like of shape (n_train, n_test);
          entry [i, j] is the distance between training sample i and test
          sample j

        Returns:
        - DataFrame with the columns "Test Label" and "Predicted Label", one
          row per test sample

        Raises:
        - ValueError: if distance_matrix is not 2-D or the label sequences do
          not match its shape
        """
        # Accept lists / DataFrames as well as ndarrays; the fancy indexing
        # below needs real arrays.
        train_labels = np.asarray(train_labels)
        distance_matrix = np.asarray(distance_matrix)
        if distance_matrix.ndim != 2:
            raise ValueError(
                f"distance_matrix must be 2-D, got {distance_matrix.ndim} dimension(s)"
            )

        n_train, n_test = distance_matrix.shape
        if len(train_labels) != n_train:
            raise ValueError(f"train_labels ต้องมีขนาด {n_train}")
        if len(test_labels) != n_test:
            raise ValueError(f"test_labels ต้องมีขนาด {n_test}")

        # Loop-invariant exponent of the inverse-distance weighting.
        exponent = 2 / (self.m - 1)

        predicted_labels = []
        for j in range(n_test):
            # Distances from test sample j to every training sample.
            distances = distance_matrix[:, j]
            # Indices of the k closest training samples (all of them if k > n_train).
            nearest = np.argsort(distances)[:self.k]
            membership = self._class_memberships(
                train_labels[nearest], distances[nearest], exponent
            )
            # The predicted label is the class with the highest membership.
            predicted_labels.append(max(membership, key=membership.get))

        return pd.DataFrame({
            "Test Label": test_labels,
            "Predicted Label": predicted_labels,
        })

# Load the input files and search for the best-performing k and m
if __name__ == "__main__":
    # Grid-search driver: load the labels and the precomputed distance matrix,
    # evaluate FuzzyKNN for every (k, m) combination and report the best one.

    # Input file locations.
    train_label_path = "C:/Users/BMEI CMU/Documents/GitHub/WORK/Windows/CODE_BME/PROJECT_CYBER_SECURITY/RESULT/05.DATA_VALIDATION/fold_1/MALWARE_100_BENIGN_100/validation_train.json"
    test_label_path = "C:/Users/BMEI CMU/Documents/GitHub/WORK/Windows/CODE_BME/PROJECT_CYBER_SECURITY/RESULT/05.DATA_VALIDATION/fold_1/MALWARE_100_BENIGN_100/validation_test.json"
    distance_matrix_path = "C:/Users/BMEI CMU/Documents/GitHub/WORK/Windows/CODE_BME/PROJECT_CYBER_SECURITY/RESULT/06.EDIT_DISTANCE_VALIDATION/fold_1/MATRIX_EDIT_DISTANCE_MALWARE_100_BENIGN_100.csv"

    # Load the label files (JSON).
    with open(train_label_path, "r", encoding="utf-8") as file:
        train_labels_data = json.load(file)

    with open(test_label_path, "r", encoding="utf-8") as file:
        test_labels_data = json.load(file)

    # Convert the JSON payloads to DataFrames.
    train_labels_df = pd.DataFrame(train_labels_data)
    test_labels_df = pd.DataFrame(test_labels_data)

    # The label is read from the second column (index 1).
    # NOTE(review): assumes column 1 of both files holds the class label - confirm
    # against the JSON schema.
    train_labels = train_labels_df.iloc[:, 1].to_numpy()
    test_labels = test_labels_df.iloc[:, 1].to_numpy()

    # Load the distance matrix (rows = train samples, columns = test samples);
    # the CSV has no header row.
    distance_matrix_df = pd.read_csv(distance_matrix_path, header=None)
    distance_matrix = distance_matrix_df.to_numpy()

    # Parameter grid.
    k_min, k_max = 1, 21  # k takes odd values only: 1, 3, 5, ..., 21
    m_min, m_max, m_step = 2, 5, 0.1  # m runs from 2 to 5 in steps of 0.1

    # np.arange with a fractional step has a rounding-dependent endpoint and
    # yields values such as 2.3000000000000003; build the grid from an exact
    # point count instead and round to the step's precision (one decimal).
    n_m_values = int(round((m_max - m_min) / m_step)) + 1
    m_values = np.round(np.linspace(m_min, m_max, n_m_values), 1).tolist()

    best_accuracy = 0
    best_k = None
    best_m = None

    for k in range(k_min, k_max + 1, 2):
        for m in m_values:
            # Build the classifier for this parameter pair and predict.
            fuzzy_knn = FuzzyKNN(k=k, m=m)
            df_results = fuzzy_knn.predict(train_labels, test_labels, distance_matrix)

            # Accuracy in percent.
            correct_predictions = (df_results["Test Label"] == df_results["Predicted Label"]).sum()
            total_predictions = len(df_results)
            accuracy = (correct_predictions / total_predictions) * 100

            print(f"🔹 k={k}, m={m} → Accuracy: {accuracy:.2f}% ({correct_predictions}/{total_predictions} ถูกต้อง)")

            # Keep the best combination seen so far; the first result is always
            # recorded so best_k / best_m are set even if every accuracy is 0.
            if best_k is None or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_k = k
                best_m = m

    # Report the best combination.
    print(f"\n✅ ค่า k และ m ที่ให้ Accuracy สูงสุด:")
    print(f"🔹 Best k = {best_k}, Best m = {best_m}, Best Accuracy = {best_accuracy:.2f}%")


TypeError: 'float' object cannot be interpreted as an integer