In [None]:
pip install --upgrade -r requirements.txt

In [None]:
import os
import json
import math
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from rapidfuzz.distance import Levenshtein

# === Utility Functions ===

def read_multiline_json(file_path):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Blank and whitespace-only lines are skipped, so an empty separator line
    or extra newlines at the end of the file do not invalidate the rest of
    the data.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        If the file cannot be opened, decoded or parsed, the error is
        printed and an empty list is returned instead.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            # json.loads("") raises, so blank lines must be filtered out
            # before decoding rather than aborting the whole read.
            return [json.loads(line) for line in file if line.strip()]
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"❌ Error reading {file_path}: {e}")
        return []

def save_progress(csv_file_path, data_rows):
    """Append rows to a CSV file, one comma-joined line per row.

    Args:
        csv_file_path: File to append to; it is created if it does not exist.
        data_rows: Iterable of rows, each an iterable of values. Every value
            is converted with ``str`` and joined with commas; no quoting or
            escaping is applied.
    """
    with open(csv_file_path, 'a', encoding='utf-8') as out:
        out.writelines(
            ",".join(str(cell) for cell in row) + "\n"
            for row in data_rows
        )

# === Threaded compute per row (for each train sample)
def compute_row_distance(train_str, test_strs):
    """Return the Levenshtein distance from one string to each test string.

    Args:
        train_str: The string compared against every entry of ``test_strs``.
        test_strs: Iterable of strings to measure against.

    Returns:
        A list of distances, in the same order as ``test_strs``.
    """
    distances = []
    for candidate in test_strs:
        distances.append(Levenshtein.distance(train_str, candidate))
    return distances

# === Called by Process: handle a block of train data
def compute_block_distance(args):
    """Compute the distance rows for one block of training samples.

    Each sample is converted with ``str`` and submitted to a thread pool as
    its own ``compute_row_distance`` task.

    Args:
        args: A 4-tuple ``(train_block, test_strs, start_idx, csv_file_name)``.
            ``train_block`` is the sequence of training samples to process,
            ``test_strs`` the strings each sample is compared against, and
            ``start_idx`` / ``csv_file_name`` are used only in the progress
            message.

    Returns:
        A list with one row of distances per sample. Row ``k`` always belongs
        to ``train_block[k]``, regardless of which task finished first.
    """
    train_block, test_strs, start_idx, csv_file_name = args
    # Pre-sized so every row can be stored at its sample's own position.
    results = [None] * len(train_block)
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = {
            executor.submit(compute_row_distance, str(sample), test_strs): i
            for i, sample in enumerate(train_block)
        }
        for future in as_completed(futures):
            # as_completed yields in completion order, not submission order,
            # so place the row by its recorded index instead of appending.
            results[futures[future]] = future.result()
    print(f"[{csv_file_name}] ✅ Block {start_idx} → {start_idx + len(train_block)} done")
    return results

# === Main block
# === Main block
# Builds the train × test edit-distance matrix: the training samples are split
# into contiguous blocks, each block is handed to a worker process, and the
# resulting rows are appended to a CSV file one block at a time.
if __name__ == '__main__':
    # === Paths ===
    train_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\05.DATA_VALIDATION\fold_3\MALWARE_100_BENIGN_100\validation_train.json'
    malware_test_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\malware_test.json'
    benign_test_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\01.TRAIN_TEST_SET\benign_test.json'

    output_path = r'C:\Users\KUNG_LOBSTER69\Documents\GitHub\WORK\Windows\CODE_BME\PROJECT_CYBER_SECURITY\RESULT\08.EDIT_DISTANCE_TEST_DATA'
    os.makedirs(output_path, exist_ok=True)
    csv_file_path = os.path.join(output_path, 'EDIT_DISTANCE_MATRIX_TRAIN_vs_TEST_HYBRID_SAFE.csv')

    # === Load data ===
    # The train file is a single JSON document; the test files hold one JSON
    # document per line.
    with open(train_path, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    malware_test = read_multiline_json(malware_test_path)
    benign_test = read_multiline_json(benign_test_path)
    test_data = malware_test + benign_test
    test_strs = [str(item) for item in test_data]

    if not train_data or not test_data:
        print("🚫 No train or test data loaded.")
        # Exit with a failure status; exit() is a site helper that is not
        # guaranteed to exist and would report success.
        raise SystemExit(1)

    # Rows are only ever appended below, so start from an empty file;
    # otherwise a re-run would stack a second matrix under the previous one.
    with open(csv_file_path, 'w', encoding='utf-8'):
        pass

    # === Prepare blocks
    # os.cpu_count() may return None when the count cannot be determined.
    cpu_total = os.cpu_count() or 1
    num_processes = min(cpu_total, len(train_data))
    if os.name == 'nt':
        # ProcessPoolExecutor rejects more than 61 workers on Windows.
        num_processes = min(num_processes, 61)
    block_size = math.ceil(len(train_data) / num_processes)
    blocks = [
        (train_data[i:i + block_size], test_strs, i, os.path.basename(csv_file_path))
        for i in range(0, len(train_data), block_size)
    ]

    print(f"🚀 Starting SAFE Hybrid Edit Distance: {len(train_data)} train × {len(test_strs)} test")
    print(f"🔧 Using {num_processes} processes × {os.cpu_count()} threads")

    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        futures = {
            executor.submit(compute_block_distance, block_args): i
            for i, block_args in enumerate(blocks)
        }

        # Blocks finish in arbitrary order. Hold each finished block until all
        # earlier ones are on disk, so block k's rows always precede block
        # k+1's in the CSV while results are still saved incrementally.
        finished_blocks = {}
        next_block = 0
        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing blocks"):
            finished_blocks[futures[future]] = future.result()
            while next_block in finished_blocks:
                save_progress(csv_file_path, finished_blocks.pop(next_block))
                next_block += 1

    print(f"✅ Done! Matrix saved to {csv_file_path}")