In [1]:
import json
import numpy as np

# Locations of the benign training set (one JSON document per line) and of the
# precomputed distance matrix (per the directory name, an edit-distance matrix).
# Update these paths for your machine.
benign_train_path = r"C:\Users\BMEI CMU\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\benign_train.json"
distance_matrix_path = r"C:\Users\BMEI CMU\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\02.EDIT_DISTANCE_MATRIX\benign_distance_matrix.npy"

# Pass the encoding explicitly: JSON text is UTF-8, and without it the read
# depends on the platform default (e.g. cp1252 on Windows).
with open(benign_train_path, 'r', encoding='utf-8') as file:
    # Skip blank lines (such as a trailing empty line) instead of letting
    # json.loads fail on them; json.loads tolerates surrounding whitespace.
    benign_train = [json.loads(line) for line in file if line.strip()]

distance_matrix = np.load(distance_matrix_path)

In [4]:
def fuzzy_median_with_matrix(cluster_indices, memberships, distance_matrix):
    """Pick the fuzzy median of a cluster from a precomputed distance matrix.

    Every index in ``cluster_indices`` is tried as a candidate. A candidate's
    cost is the membership-weighted sum of its distances to all members, where
    ``memberships[k]`` is the weight of ``cluster_indices[k]``.

    Args:
        cluster_indices: Sequence of row/column indices into ``distance_matrix``.
        memberships: Weights aligned positionally with ``cluster_indices``.
        distance_matrix: 2-D structure indexed as ``distance_matrix[a, b]``.

    Returns:
        The candidate index with the smallest weighted cost (the first one on
        ties), or ``None`` if no candidate has a cost below infinity (e.g. when
        ``cluster_indices`` is empty).
    """
    winner = None
    lowest_cost = float('inf')

    for candidate in cluster_indices:
        # Accumulate the weighted distance from this candidate to every member.
        cost = 0
        for position, member in enumerate(cluster_indices):
            cost += memberships[position] * distance_matrix[candidate, member]

        # Only a strictly smaller cost replaces the current winner.
        if not cost < lowest_cost:
            continue
        winner, lowest_cost = candidate, cost

    return winner

def _memberships_from_prototypes(distances, prototypes, fuzzifier):
    """Compute fuzzy memberships of every item to the given prototype indices.

    Implements the fuzzy c-means rule
    ``u[i, j] = 1 / sum_k (d[i, j] / d[i, k]) ** (2 / (fuzzifier - 1))``,
    so a smaller distance gives a larger membership and every row sums to 1.
    """
    dist = distances[:, prototypes]  # (n, num_clusters)
    exponent = 2.0 / (fuzzifier - 1.0)

    # Items at distance 0 from a prototype would divide by zero; mask them and
    # assign their membership explicitly below.
    at_prototype = dist == 0
    safe = np.where(at_prototype, 1.0, dist)

    # Scaling by the nearest distance keeps every ratio in (0, 1], which avoids
    # overflow/underflow of the whole row when the exponent is large. The
    # scale factor cancels in the normalisation.
    nearest = safe.min(axis=1, keepdims=True)
    weights = (nearest / safe) ** exponent

    # An item that coincides with one or more prototypes belongs fully to them
    # (shared equally if several prototypes are at distance 0).
    exact = at_prototype.any(axis=1)
    weights[exact] = at_prototype[exact]

    return weights / weights.sum(axis=1, keepdims=True)


def sgFCMed_with_matrix(distance_matrix, num_clusters, fuzzifier, max_iter=100, tol=1e-4, seed=None):
    """String Grammar Fuzzy C-Medians (sgFCMed) using a precomputed distance matrix.

    Prototypes are restricted to items of the data set and are represented by
    their indices into ``distance_matrix``. The algorithm alternates between:

    1. choosing, for each cluster, the item that minimises the
       ``membership ** fuzzifier``-weighted sum of distances to all items, and
    2. recomputing memberships from the distances to the current prototypes.

    Args:
        distance_matrix: Square (n, n) array of pairwise distances.
        num_clusters: Number of clusters, ``1 <= num_clusters <= n``.
        fuzzifier: Fuzziness parameter; must be greater than 1.
        max_iter: Maximum number of prototype/membership update rounds.
        tol: The run stops when every new prototype is within ``tol`` of the
            prototype it would replace.
        seed: Optional seed for the initial prototype draw. When ``None`` the
            global ``np.random`` state is used, so ``np.random.seed`` still
            controls the run.

    Returns:
        ``(prototypes, memberships)``: a list of ``num_clusters`` integer item
        indices, and an (n, num_clusters) array whose rows sum to 1.

    Raises:
        ValueError: If the matrix is not square, ``fuzzifier <= 1``, or
            ``num_clusters`` is outside ``[1, n]``.
    """
    distances = np.asarray(distance_matrix, dtype=float)
    if distances.ndim != 2 or distances.shape[0] != distances.shape[1]:
        raise ValueError("distance_matrix must be a square 2-D array")
    if fuzzifier <= 1:
        raise ValueError("fuzzifier must be greater than 1")
    n = distances.shape[0]
    if not 1 <= num_clusters <= n:
        raise ValueError("num_clusters must be between 1 and the number of items")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Start from randomly chosen distinct items and derive the memberships from
    # them, so the initial prototypes actually drive the first update.
    prototypes = rng.choice(n, size=num_clusters, replace=False)
    memberships = _memberships_from_prototypes(distances, prototypes, fuzzifier)

    for _ in range(max_iter):
        # costs[c, j] = sum_i memberships[i, j] ** fuzzifier * distances[c, i];
        # argmin over candidates c returns the first minimiser on ties.
        costs = distances @ (memberships ** fuzzifier)
        new_prototypes = np.argmin(costs, axis=0)

        # Converged: keep the current prototypes, which the memberships were
        # computed from, so the returned pair stays consistent.
        if np.all(distances[prototypes, new_prototypes] <= tol):
            break

        prototypes = new_prototypes
        memberships = _memberships_from_prototypes(distances, prototypes, fuzzifier)

    return [int(p) for p in prototypes], memberships


In [None]:
# Parameters
num_clusters = 100  # number of clusters
fuzzifier = 2.0    # fuzziness parameter
max_iter = 100     # maximum number of iterations
tol = 1e-4         # convergence tolerance
random_seed = 42   # fixed seed so a Restart & Run All reproduces the same clustering

# The clustering starts from a random initialisation drawn from NumPy's global
# random state; seed it here so the results below are reproducible.
np.random.seed(random_seed)

# Run sgFCMed using the precomputed distance matrix
prototypes, memberships = sgFCMed_with_matrix(
    distance_matrix=distance_matrix,
    num_clusters=num_clusters,
    fuzzifier=fuzzifier,
    max_iter=max_iter,
    tol=tol
)

# Map prototype indices back to the corresponding training records
prototype_strings = [benign_train[prototype] for prototype in prototypes]

# Display the results
print("Prototypes (Cluster Centers):")
for i, proto in enumerate(prototype_strings):
    print(f"Cluster {i + 1}: {proto}")

print("\nMembership Matrix (first 5 rows):")
print(memberships[:5])  # show only the first 5 rows
