In [1]:
import os
import pandas as pd
from tqdm import tqdm
import json

In [2]:
# Folder holding the combined training-data JSON files (each .json file is loaded separately below).
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
combined_train_data_folder = r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\04.COMBINED_TRAIN_DATA"

# Load all JSON files in the folder into a dictionary
def load_json_files_as_dict(folder_path):
    """Load every ``.json`` file directly inside a folder into a dict of DataFrames.

    Files are visited in sorted name order so that the returned dictionary, and
    every loop that iterates over it, is ordered identically on each run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict: Maps each file name (including its ``.json`` extension) to the
        object returned by ``pd.read_json`` for that file.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        Any error raised by ``pd.read_json`` for an unreadable file propagates
        unchanged.
    """
    data_dict = {}
    for file_name in sorted(os.listdir(folder_path)):
        # Only files with a .json extension are loaded; everything else is skipped.
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        data_dict[file_name] = pd.read_json(file_path)
    return data_dict

# Load every .json file in the training folder: keys are file names, values are the loaded DataFrames
combined_train_data_dict = load_json_files_as_dict(combined_train_data_folder)

In [None]:
# Sanity check: list every loaded file together with the shape of its DataFrame
print("Loaded files and their DataFrame sizes:")
for file_name, df in combined_train_data_dict.items():
    print(f"{file_name}: {df.shape}")

In [4]:
def edit_distance_optimized(s1, s2):
    """Return the Levenshtein edit distance between two sequences.

    Only two rows of the dynamic-programming table are alive at any time, so
    the extra memory is proportional to ``len(s2)`` rather than
    ``len(s1) * len(s2)``.

    Args:
        s1: First sequence; must support ``len()`` and integer indexing.
        s2: Second sequence; same requirements as ``s1``.

    Returns:
        int: Minimum number of single-item insertions, deletions and
        substitutions needed to turn ``s1`` into ``s2``.
    """
    rows, cols = len(s1), len(s2)

    # Row 0 of the table: distance from the empty prefix of s1 to each prefix of s2.
    above = list(range(cols + 1))

    for i in range(rows):
        # Column 0: deleting the first i + 1 items of s1 yields the empty prefix of s2.
        row = [i + 1]
        for j in range(cols):
            if s1[i] == s2[j]:
                substitution = above[j]      # items match, no extra cost
            else:
                substitution = above[j] + 1  # replace s1[i] with s2[j]
            insertion = row[j] + 1           # insert s2[j]
            deletion = above[j + 1] + 1      # delete s1[i]
            row.append(min(insertion, deletion, substitution))
        # The row just completed becomes the "previous" row for the next item of s1.
        above = row

    # Last cell of the last completed row is the distance between the full sequences.
    return above[-1]


In [5]:
def calculate_edit_distances_with_progress(df1, df2):
    """Compute all pairwise edit distances between the first columns of two DataFrames.

    A tqdm progress bar tracks the ``len(df1) * len(df2)`` computations. The bar
    is used as a context manager so it is closed even when a distance
    computation raises (for example on a value that has no ``len()``).

    Args:
        df1: DataFrame whose first column supplies the row sequences.
        df2: DataFrame whose first column supplies the column sequences.

    Returns:
        list[list[int]]: ``results[i][j]`` is the edit distance between the
        i-th value of ``df1``'s first column and the j-th value of ``df2``'s
        first column.
    """
    results = []
    total = len(df1) * len(df2)  # total number of pairwise computations

    with tqdm(total=total, desc="Calculating Edit Distance", unit="calculation") as progress:
        for s1 in df1.iloc[:, 0]:  # first column of df1
            row_results = []
            for s2 in df2.iloc[:, 0]:  # first column of df2
                row_results.append(edit_distance_optimized(s1, s2))
                progress.update(1)  # one step per computed pair
            results.append(row_results)

    return results

In [6]:
def calculate_edit_distances_and_save(data_dict, test_df, output_folder):
    """Compute train-vs-test edit distances for every training set and save them as JSON.

    For each ``(file_name, train_df)`` in ``data_dict`` the first column of
    ``train_df`` is compared against the first column of ``test_df``. The
    results are written to ``<output_folder>/<file stem>_distances.json`` as a
    list with one entry per training row; each entry is a list of
    ``{"train_index", "test_index", "edit_distance"}`` dicts, one per test row.

    The output file name depends only on the training file name, so calling
    this function with different test sets and the same ``output_folder``
    overwrites earlier results.

    Args:
        data_dict: Mapping of training file name to its DataFrame.
        test_df: DataFrame whose first column holds the test sequences.
        output_folder: Directory for the result files; created if missing.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_folder, exist_ok=True)  # create the folder if it does not exist yet

    for file_name, train_df in data_dict.items():
        print(f"Calculating Edit Distance for {file_name}...")

        results = []
        total = len(train_df) * len(test_df)

        # Context manager guarantees the bar is closed even if a computation raises.
        with tqdm(total=total, desc=f"Processing {file_name}", unit="calculation") as progress:
            for i, s1 in enumerate(train_df.iloc[:, 0]):  # first column of the training set
                row_results = []
                for j, s2 in enumerate(test_df.iloc[:, 0]):  # first column of the test set
                    distance = edit_distance_optimized(s1, s2)
                    row_results.append({"train_index": i, "test_index": j, "edit_distance": distance})
                    progress.update(1)
                results.append(row_results)

        # Persist this training set's results as a .json file named after the training file.
        output_file = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}_distances.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

        print(f"Saved results to {output_file}")

In [None]:
# Example usage: compare every combined training set against the benign and malware test sets.
benign_test_df = pd.read_json(r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\benign_test.json", lines=True)
malware_test_df = pd.read_json(r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\malware_test.json", lines=True)

# Base output folder.
# NOTE(review): "EDIT)DISTANCE" looks like a typo for "EDIT_DISTANCE"; left unchanged in case
# other notebooks already read from this exact path — confirm before renaming.
output_folder = r"C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\05.EDIT)DISTANCE_TRAIN_TEST"

# calculate_edit_distances_and_save names each output file after the training file only,
# so writing both test sets into the same folder would make the malware run overwrite
# every benign result. Each test set therefore gets its own subfolder.
benign_output_folder = os.path.join(output_folder, "benign_test")
malware_output_folder = os.path.join(output_folder, "malware_test")

# Run the comparison for each test set.
calculate_edit_distances_and_save(combined_train_data_dict, benign_test_df, benign_output_folder)
calculate_edit_distances_and_save(combined_train_data_dict, malware_test_df, malware_output_folder)