In [1]:
import json
import numpy as np
import os
import pandas as pd

# Input locations (absolute, machine-specific paths).
# NOTE(review): these only resolve on the original author's machine; consider
# deriving them from a configurable base directory.
malware_train_1000_path = r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\malware_train_1000.json"
distance_matrix_path = r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\02.EDIT_DISTANCE_MATRIX\malware_distance_matrix_1000.npy"

# The training set is parsed one JSON document per line.
# Blank lines are skipped so that stray empty lines do not make json.loads fail.
# NOTE(review): no explicit encoding is passed, so the platform default is used
# when decoding the file -- confirm this matches how the file was written.
malware_records = []
with open(malware_train_1000_path, 'r') as file:
    for line in file:
        line = line.strip()
        if line:
            malware_records.append(json.loads(line))

# One DataFrame row per parsed record.
malware_train_1000 = pd.DataFrame(malware_records)

# Show the first rows to verify the structure (this is the malware set).
print("Malware Train DataFrame:")
print(malware_train_1000.head())

# Load the precomputed distance matrix.
distance_matrix = np.load(distance_matrix_path)

# Display the matrix shape to verify loading.
print("\nDistance Matrix Shape:", distance_matrix.shape)

# Prototype indices found on the distance matrix are later used as row positions
# in the DataFrame, so both must describe the same number of samples.
assert distance_matrix.shape == (len(malware_train_1000), len(malware_train_1000)), (
    "distance matrix shape does not match the number of training records"
)

Benign Train DataFrame:
                                                data  label
0  ÐÏà¡±á                >  þÿ\t            ...      1
1  µ0     ÿþ p                            ...      1
2  ÐÏà¡±á                >  þÿ\t            ...      1
3  µ0     ÿþ p                            ...      1
4  ÐÏà¡±á                >  þÿ\t            ...      1

Distance Matrix Shape: (1000, 1000)


In [2]:
def save_results(output_dir, num_clusters, prototypes, memberships, max_iter, malware_train_1000):
    """
    Write the outcome of one clustering run into ``output_dir``.

    Three files are produced, each suffixed with ``num_clusters``:

    - ``prototypes_<k>.json``: the rows of ``malware_train_1000`` selected by
      position with ``prototypes``, plus a ``Prototype_Index`` column holding
      those positions (``orient="records"``).
    - ``memberships_<k>.json``: the membership matrix as nested lists.
    - ``max_iter_<k>.txt``: ``str(max_iter)``.

    Parameters:
    - output_dir: Target directory; created (with parents) if missing.
    - num_clusters: Value used in the output file names.
    - prototypes: Iterable of integer row positions (list or NumPy array).
    - memberships: Membership matrix (NumPy array or nested sequence).
    - max_iter: Value written verbatim to the text file.
    - malware_train_1000: DataFrame the prototype positions refer to.

    Raises:
    - IndexError: if a prototype position is out of bounds for the DataFrame.
    """
    # exist_ok=True removes the check-then-create race and is a no-op when the
    # directory already exists.
    os.makedirs(output_dir, exist_ok=True)

    # Plain Python ints work for positional selection and for the new column,
    # whether the caller passed a list or an array of NumPy integers.
    prototype_indices = [int(index) for index in prototypes]

    # Keep only the rows that were chosen as prototypes.
    prototype_df = malware_train_1000.iloc[prototype_indices].copy()
    # Record which row position each prototype came from.
    prototype_df["Prototype_Index"] = prototype_indices

    # Save the prototype rows as JSON.
    prototypes_path = os.path.join(output_dir, f"prototypes_{num_clusters}.json")
    prototype_df.to_json(prototypes_path, orient="records", force_ascii=False, indent=4)

    # Nested lists are JSON-serialisable; np.asarray also accepts list input.
    memberships_data = np.asarray(memberships).tolist()

    # Save memberships as JSON.
    memberships_path = os.path.join(output_dir, f"memberships_{num_clusters}.json")
    with open(memberships_path, "w", encoding="utf-8") as f:
        json.dump(memberships_data, f, ensure_ascii=False, indent=4)

    # Save max_iter as a text file.
    max_iter_path = os.path.join(output_dir, f"max_iter_{num_clusters}.txt")
    with open(max_iter_path, "w", encoding="utf-8") as f:
        f.write(str(max_iter))

In [3]:
def fuzzy_median_with_matrix(distance_matrix, memberships, fuzzifier, cluster_idx):
    """
    Compute the fuzzy median of one cluster from a distance matrix and a
    membership matrix.

    Each sample ``i`` is weighted by ``memberships[i, cluster_idx] ** fuzzifier``.
    For every column ``j`` of ``distance_matrix`` the weighted total
    ``sum_i weight[i] * distance_matrix[i, j]`` is formed, and the column index
    with the smallest total is returned (the first one in case of ties).
    """
    cluster_weights = memberships[:, cluster_idx] ** fuzzifier
    # Scale row i of the distance matrix by sample i's weight, then add up each column.
    scaled_rows = cluster_weights[:, np.newaxis] * distance_matrix
    column_totals = scaled_rows.sum(axis=0)
    return column_totals.argmin()

In [4]:
def _sgfcmed_memberships(distance_matrix, prototypes, fuzzifier):
    """Return the (n_samples, n_clusters) fuzzy membership matrix for the given prototype indices."""
    # Column k holds every sample's distance to prototype k.
    proto_dist = distance_matrix[:, prototypes]
    exponent = 2.0 / (fuzzifier - 1.0)

    # Samples at (non-positive) distance from a prototype are handled separately.
    at_prototype = proto_dist <= 0
    # The 1.0 placeholder keeps the arithmetic finite; those rows are overwritten below.
    safe_dist = np.where(at_prototype, 1.0, proto_dist)

    # u_ik = d_ik^-p / sum_j d_ij^-p, written relative to the row minimum so the
    # largest term is exactly 1 and the normaliser can never underflow to 0.
    row_min = safe_dist.min(axis=1, keepdims=True)
    relative = (row_min / safe_dist) ** exponent
    memberships = relative / relative.sum(axis=1, keepdims=True)

    # A sample that coincides with one or more prototypes belongs only to those
    # clusters, split evenly, so that every row still sums to 1.
    coincident_rows = at_prototype.any(axis=1)
    if coincident_rows.any():
        crisp = at_prototype[coincident_rows].astype(float)
        memberships[coincident_rows] = crisp / crisp.sum(axis=1, keepdims=True)
    return memberships


def sgFCMed_debug_calculation(distance_matrix, num_clusters, fuzzifier, max_iter, tol, random_state=None):
    """
    String Grammar Fuzzy C-Medians (sgFCMed) Debug Calculation.

    Runs on a precomputed distance matrix: prototypes are row/column indices of
    that matrix. Progress is printed for every iteration.

    Parameters:
    - distance_matrix: 2D array of pairwise distances (square; indexed [sample, sample]).
    - num_clusters: Number of clusters (must not exceed the number of samples).
    - fuzzifier: Fuzzifier for membership computation; must be greater than 1.
    - max_iter: Maximum number of iterations.
    - tol: Convergence tolerance; iteration stops when, for every cluster, the
      distance between the old and the new prototype is <= tol.
    - random_state: Optional seed for the initial prototype draw. When None the
      global NumPy random state is used, as before.

    Returns:
    - prototypes: List of prototype indices (Python ints), one per cluster.
    - memberships: Membership matrix of shape (n_samples, num_clusters),
      computed for exactly the returned prototypes; every row sums to 1.

    Raises:
    - ValueError: if fuzzifier <= 1, or (from NumPy) if num_clusters exceeds the
      number of samples.
    """
    if fuzzifier <= 1:
        # The membership exponent 2 / (fuzzifier - 1) is undefined at 1 and
        # inverts the weighting below 1.
        raise ValueError("fuzzifier must be greater than 1.")

    distance_matrix = np.asarray(distance_matrix)
    num_samples = distance_matrix.shape[0]

    # Initialize prototypes randomly (distinct sample indices).
    if random_state is None:
        initial = np.random.choice(num_samples, size=num_clusters, replace=False)
    else:
        initial = np.random.default_rng(random_state).choice(num_samples, size=num_clusters, replace=False)
    prototypes = [int(index) for index in initial]

    print("Initial Prototypes and Memberships:")
    for i, prototype in enumerate(prototypes):
        print(f"Cluster {i+1}: Prototype Index {prototype}")

    for iteration in range(1, max_iter + 1):
        print(f"\n--- Iteration {iteration} ---")

        # Memberships of every sample with respect to the current prototypes.
        memberships = _sgfcmed_memberships(distance_matrix, prototypes, fuzzifier)

        # Update each prototype using fuzzy_median_with_matrix.
        new_prototypes = []
        for cluster_idx in range(num_clusters):
            best_index = int(fuzzy_median_with_matrix(distance_matrix, memberships, fuzzifier, cluster_idx))
            print(f"Cluster {cluster_idx+1}: Prototype Update -> Old: {prototypes[cluster_idx]}, New: {best_index}")
            new_prototypes.append(best_index)

        # Check for convergence: no prototype moved by more than tol.
        if all(distance_matrix[old, new] <= tol for old, new in zip(prototypes, new_prototypes)):
            print("\nConvergence reached.")
            break

        prototypes = new_prototypes
    else:
        # No convergence within max_iter (or max_iter == 0): the prototypes were
        # replaced after the last membership update, so recompute memberships to
        # keep both return values consistent.
        memberships = _sgfcmed_memberships(distance_matrix, prototypes, fuzzifier)

    return prototypes, memberships

In [5]:
def dir_search_sgFCMed(distance_matrix, cluster_range, fuzzifier, max_iter=100, tol=1e-4, malware_train_1000=None, output_dir="results"):
    """
    Run sgFCMed once for every cluster count in ``cluster_range`` and save each run.

    For each ``num_clusters`` the clustering is computed with
    ``sgFCMed_debug_calculation`` and handed to ``save_results`` together with
    ``output_dir``, ``max_iter`` and ``malware_train_1000``.

    Returns a dict mapping each cluster count to its
    ``(prototypes, memberships)`` tuple.

    Raises ValueError when ``malware_train_1000`` is not supplied.
    """
    # The DataFrame is mandatory even though its position in the signature
    # forces a default value.
    if malware_train_1000 is None:
        raise ValueError("malware_train_1000 must be provided to map prototypes to strings.")

    runs_by_k = {}
    for num_clusters in cluster_range:
        print(f"\nRunning sgFCMed with num_clusters = {num_clusters}")
        prototypes, memberships = sgFCMed_debug_calculation(
            distance_matrix, num_clusters, fuzzifier, max_iter, tol
        )

        # Persist this run before moving on to the next cluster count.
        save_results(
            output_dir,
            num_clusters,
            prototypes,
            memberships,
            max_iter,
            malware_train_1000,
        )
        runs_by_k[num_clusters] = (prototypes, memberships)

    print("\nAll results saved to:", output_dir)
    return runs_by_k

In [6]:
# Sweep the cluster count over 10, 20, ..., 100 with fuzzifier 2.0, allowing up
# to 1000 iterations per run; each run's files are written below output_dir.
results = dir_search_sgFCMed(
    distance_matrix,
    list(range(10, 101, 10)),
    2.0,
    output_dir=r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\03.sgFCMed\malware",
    malware_train_1000=malware_train_1000,
    tol=1e-4,
    max_iter=1000,
)


Running sgFCMed with num_clusters = 10
Initial Prototypes and Memberships:
Cluster 1: Prototype Index 155
Cluster 2: Prototype Index 568
Cluster 3: Prototype Index 679
Cluster 4: Prototype Index 511
Cluster 5: Prototype Index 631
Cluster 6: Prototype Index 601
Cluster 7: Prototype Index 957
Cluster 8: Prototype Index 687
Cluster 9: Prototype Index 189
Cluster 10: Prototype Index 973

--- Iteration 1 ---
Cluster 1: Prototype Update -> Old: 155, New: 155
Cluster 2: Prototype Update -> Old: 568, New: 568
Cluster 3: Prototype Update -> Old: 679, New: 123
Cluster 4: Prototype Update -> Old: 511, New: 199
Cluster 5: Prototype Update -> Old: 631, New: 766
Cluster 6: Prototype Update -> Old: 601, New: 601
Cluster 7: Prototype Update -> Old: 957, New: 957
Cluster 8: Prototype Update -> Old: 687, New: 766
Cluster 9: Prototype Update -> Old: 189, New: 189
Cluster 10: Prototype Update -> Old: 973, New: 471

--- Iteration 2 ---
Cluster 1: Prototype Update -> Old: 155, New: 155
Cluster 2: Prototype