In [1]:
import numpy as np
import json
from Levenshtein import distance as levenshtein_distance
import pandas as pd
import os

In [2]:
# Function to read a JSON Lines file (one JSON document per line)
def read_multiline_json(file_path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as one JSON document. Blank
    or whitespace-only lines are skipped so that a stray empty line (for
    example at the end of the file) does not invalidate the whole file.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        list: One parsed object per non-blank line, in file order. An empty
            list is returned (and the error printed) when the file does not
            exist, cannot be decoded as UTF-8, or contains a line that is
            not valid JSON.
    """
    try:
        # Decode explicitly instead of relying on the platform locale;
        # "utf-8-sig" also accepts files that start with a UTF-8 BOM.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            return [json.loads(line) for line in file if line.strip()]
    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f"Error reading JSON file at {file_path}: {e}")
        return []

# Function to validate the clustering inputs
def validate_inputs(strings, distance_matrix, num_clusters):
    """Check that the clustering inputs are mutually consistent.

    Args:
        strings (list of str): Strings to cluster.
        distance_matrix (array-like): Expected to be an NxN matrix, where N
            is ``len(strings)``.
        num_clusters (int): Requested number of clusters.

    Raises:
        ValueError: If the distance matrix is not two-dimensional with shape
            (N, N), or if ``num_clusters`` is not in the range 1..N.
    """
    num_strings = len(strings)
    shape = np.shape(distance_matrix)
    # Check the rank first so a 1-D input is reported as ValueError, not IndexError.
    if len(shape) != 2 or shape[0] != num_strings or shape[1] != num_strings:
        raise ValueError("Distance matrix must be NxN and match the number of strings.")
    if num_clusters <= 0 or num_clusters > num_strings:
        raise ValueError("Number of clusters must be between 1 and the number of strings.")


def _update_memberships(distance_matrix, prototype_indices, fuzzifier):
    """Return the NxC fuzzy membership matrix for the given prototypes."""
    # Column i holds the distance from every string to the prototype of cluster i.
    distances = distance_matrix[:, prototype_indices]
    memberships = np.zeros(distances.shape, dtype=float)

    at_prototype = distances <= 0
    coincident = at_prototype.any(axis=1)

    # A string at distance zero from one or more prototypes belongs entirely
    # to those clusters (shared equally if several prototypes coincide).
    if coincident.any():
        hits = at_prototype[coincident]
        memberships[coincident] = hits / hits.sum(axis=1, keepdims=True)

    regular = ~coincident
    if regular.any():
        # NOTE(review): the exponent 2 / (m - 1) is kept from the original
        # code. The prototype step minimises the unsquared distance, for
        # which the matching exponent would be 1 / (m - 1) - confirm against
        # the reference formulation before changing it.
        exponent = 2.0 / (fuzzifier - 1.0)
        d = distances[regular]
        # u_i = d_i^(-p) / sum_k d_k^(-p). Scaling by the row minimum keeps
        # every ratio in (0, 1], avoiding overflow for large exponents.
        closeness = (d.min(axis=1, keepdims=True) / d) ** exponent
        memberships[regular] = closeness / closeness.sum(axis=1, keepdims=True)

    return memberships


def _select_prototypes(distance_matrix, membership_matrix, fuzzifier):
    """Return, per cluster, the index of the string with minimum weighted distance."""
    weights = membership_matrix ** fuzzifier
    # costs[c, i] = sum_k weights[k, i] * distance_matrix[k, c]
    costs = distance_matrix.T @ weights
    # argmin returns the first minimum, i.e. the lowest candidate index on ties.
    return costs.argmin(axis=0)


def sgfcmed(strings, distance_matrix, num_clusters, labels, fuzzifier=2.0, max_iter=100, tol=1e-4):
    """
    String Grammar Fuzzy C-Medians (sgFCMed) Algorithm Implementation.

    Starting from ``num_clusters`` distinct randomly chosen strings as
    prototypes, the algorithm alternates between (1) computing fuzzy
    memberships from the distance of every string to every current
    prototype and (2) replacing each prototype by the string that minimises
    the membership-weighted sum of distances. It stops when no prototype
    changes or after ``max_iter`` iterations. Progress is printed each
    iteration.

    Several clusters may end up sharing the same prototype; this is not
    prevented.

    Args:
        strings (list of str): List of strings to cluster.
        distance_matrix (np.ndarray): NxN matrix of precomputed distances between strings.
        num_clusters (int): Number of clusters (C).
        labels (list): List of labels corresponding to strings.
        fuzzifier (float): Fuzzifier parameter (m > 1).
        max_iter (int): Maximum number of iterations.
        tol (float): Currently unused; convergence is decided by prototype
            stability. Kept so existing callers remain valid.

    Returns:
        prototypes (list of dict): Selected prototypes with labels for each cluster.
        membership_matrix (np.ndarray): NxC membership matrix (rows sum to 1),
            computed with respect to the returned prototypes.
        prototype_indices (list of int): Indices of the selected prototypes.

    Raises:
        ValueError: If the inputs are inconsistent (see ``validate_inputs``)
            or ``fuzzifier`` is not greater than 1.
    """
    validate_inputs(strings, distance_matrix, num_clusters)
    if fuzzifier <= 1:
        raise ValueError("Fuzzifier must be greater than 1.")

    distance_matrix = np.asarray(distance_matrix, dtype=float)
    num_strings = len(strings)

    # Track prototypes by index: looking strings up with list.index() would
    # return the first occurrence and pick the wrong row for duplicate strings.
    prototype_indices = np.random.choice(num_strings, num_clusters, replace=False)

    for iteration in range(max_iter):
        print(f"Iteration {iteration + 1}")

        membership_matrix = _update_memberships(distance_matrix, prototype_indices, fuzzifier)
        new_indices = _select_prototypes(distance_matrix, membership_matrix, fuzzifier)

        changes = int(np.count_nonzero(new_indices != prototype_indices))
        prototype_indices = new_indices
        print(f"Number of prototype changes in iteration {iteration + 1}: {changes}")

        if changes == 0:
            print(f"Converged in {iteration + 1} iterations.")
            break
    else:
        # No convergence (or max_iter == 0): refresh the memberships so they
        # correspond to the prototypes that are actually returned.
        membership_matrix = _update_memberships(distance_matrix, prototype_indices, fuzzifier)

    # Plain Python ints keep the result JSON-serialisable.
    index_list = [int(i) for i in prototype_indices]
    prototypes_with_labels = [{"prototype": strings[i], "label": labels[i]} for i in index_list]

    return prototypes_with_labels, membership_matrix, index_list

In [None]:
# Folder paths
# Base folders: per-fold inputs for each dataset, and the root for all results.
malware_base_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\03.CROSS_VALIDATION_4_FOLD\MALWARE_FOLDS'
benign_base_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\03.CROSS_VALIDATION_4_FOLD\BENIGN_FOLDS'
output_base_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\04.sgFCMed'

folds = [f"fold_{i}" for i in range(1, 5)]

# Both datasets go through exactly the same pipeline; only the base folder
# and the file-name prefix differ. Malware is processed first, then benign.
datasets = [("malware", malware_base_path), ("benign", benign_base_path)]

for dataset_name, base_path in datasets:
    for fold in folds:
        print(f"Processing {fold} for {dataset_name}...")
        fold_path = os.path.join(base_path, fold)
        validation_train_path = os.path.join(fold_path, f"{dataset_name}_validation_train.json")
        validation_train_matrix_path = os.path.join(fold_path, f"{dataset_name}_validation_train_matrix.json")

        # Skip the fold when either input file is missing.
        if not os.path.exists(validation_train_path):
            print(f"File not found: {validation_train_path}")
            continue

        if not os.path.exists(validation_train_matrix_path):
            print(f"File not found: {validation_train_matrix_path}")
            continue

        # Read the training records (JSON Lines).
        validation_train_data = read_multiline_json(validation_train_path)
        if not validation_train_data:
            print(f"Validation train data is empty or invalid for {fold}.")
            continue

        # First column is taken as the string, second column as its label.
        validation_train_data_df = pd.DataFrame(validation_train_data)
        strings = validation_train_data_df.iloc[:, 0].tolist()
        labels = validation_train_data_df.iloc[:, 1].tolist()

        # Read the precomputed distance matrix.
        try:
            validation_train_matrix = pd.read_json(validation_train_matrix_path)
            distance_matrix = validation_train_matrix.to_numpy()
        except ValueError as e:
            print(f"Error reading distance matrix for {fold}: {e}")
            continue

        # Run the clustering once per cluster count.
        for num_clusters in [100, 200, 300]:
            print(f"Processing {fold} with num_clusters={num_clusters}...")

            # Invalid inputs for one configuration (e.g. fewer strings than
            # clusters) must not abort the remaining folds and datasets.
            try:
                prototypes_with_labels, membership_matrix, prototype_indices = sgfcmed(
                    strings, distance_matrix, num_clusters, labels
                )
            except ValueError as e:
                print(f"Skipping num_clusters={num_clusters} for {fold} ({dataset_name}): {e}")
                continue

            # Path for saving results
            output_path = os.path.join(output_base_path, dataset_name, fold, f"clusters_{num_clusters}")
            os.makedirs(output_path, exist_ok=True)

            # Save prototypes
            with open(os.path.join(output_path, 'prototypes.json'), 'w') as f:
                json.dump(prototypes_with_labels, f)

            # Save membership matrix as .npy
            np.save(os.path.join(output_path, 'membership_matrix.npy'), membership_matrix)

            # Save prototype indices
            with open(os.path.join(output_path, 'prototype_indices.json'), 'w') as f:
                json.dump(prototype_indices, f)

        print(f"Finished processing {fold} for {dataset_name}.")