In [None]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from decimal import Decimal
from sklearn.model_selection import train_test_split

# ─────────────────────────────────────────────────────
# Configuration
# ─────────────────────────────────────────────────────
# Root of the dataset on the local machine; the raw CSVs are read from the
# 'CSV' sub-directory and results are written under 'Preprocessed'.
base_dir = r'C:\Users\idle9\Desktop\Naver MYBOX\AISLab\Dataset\AWID3_Dataset_CSV'
dataset_folder, base_output_folder = (
    os.path.join(base_dir, subdir) for subdir in ('CSV', 'Preprocessed')
)
# Make sure the output root exists before any folder is processed.
os.makedirs(base_output_folder, exist_ok=True)

# Attack sub-folders to process (in this order), each mapped to the file
# extension of the files to load from it. Every folder currently uses '.csv'.
folders = dict.fromkeys(
    (
        '1.Deauth',
        '2.Disas',
        '3.(Re)Assoc',
        '4.Rogue_AP',
        '5.Krack',
        '6.Kr00k',
        '7.SSH',
        '8.Botnet',
        '9.Malware',
        '10.SQL_Injection',
        '11.SSDP',
        '12.Evil_Twin',
        '13.Website_spoofing',
    ),
    '.csv',
)

# Maps the textual class name to its integer class id. Each tuple lists every
# spelling that shares one id; the id is the tuple's position in the list
# (e.g. 'Kr00k'/'kr00k' -> 6, 'SSDP'/'SDDP' -> 11).
label_mapping = {
    spelling: class_id
    for class_id, spellings in enumerate([
        ('Normal',),
        ('Deauth',),
        ('Disas',),
        ('(Re)Assoc',),
        ('RogueAP',),
        ('Krack',),
        ('Kr00k', 'kr00k'),
        ('SSH',),
        ('Botnet',),
        ('Malware',),
        ('SQL_Injection',),
        ('SSDP', 'SDDP'),
        ('Evil_Twin',),
        ('Website_spoofing',),
    ])
    for spelling in spellings
}

# ─────────────────────────────────────────────────────
# Parsing functions
# ─────────────────────────────────────────────────────
def parse_frame_time(time_str):
    """Convert a ``frame.time`` string into a POSIX timestamp (float seconds).

    The input is cleaned by removing every '-' and the literal suffix
    ' GTB Standard Time', then parsed as ``'%b %d %Y %H:%M:%S[.%f]'``.
    Fractional seconds are truncated to 6 digits because ``%f`` accepts at
    most microsecond precision.

    Args:
        time_str: The raw cell value. Anything that is not a ``str``
            (NaN, None, numbers, ...) is treated as missing.

    Returns:
        The timestamp as a float, or ``np.nan`` when the value is missing or
        does not match either accepted format.

    Note:
        The parsed datetime is naive, so ``datetime.timestamp()`` interprets
        it in the local timezone of the machine running this code; absolute
        values therefore differ between machines in different timezones.
    """
    # Missing values and non-text cells cannot be parsed.
    if not isinstance(time_str, str):
        return np.nan

    cleaned = time_str.replace('-', '').replace(' GTB Standard Time', '')

    if '.' in cleaned:
        tokens = cleaned.split(' ')
        for i, token in enumerate(tokens):
            if '.' in token:
                # partition() never fails on tokens holding several dots;
                # such malformed values are rejected by strptime below
                # instead of raising an unpacking error here.
                whole, _, frac = token.partition('.')
                tokens[i] = whole + '.' + frac[:6]
        cleaned = ' '.join(tokens)

    # Try the fractional-seconds layout first, then whole seconds.
    for fmt in ('%b %d %Y %H:%M:%S.%f', '%b %d %Y %H:%M:%S'):
        try:
            dt = datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
        return dt.timestamp()
    return np.nan

def evaluate_string_formula(s):
    """Evaluate a cell that holds a simple arithmetic expression.

    Every '-' in the string is surrounded with spaces and the result is
    evaluated as a Python expression, so e.g. ``"1-2-3"`` becomes ``-4`` and
    ``"0x10"`` becomes ``16``.

    Args:
        s: The raw cell value.

    Returns:
        The evaluated value, or ``s`` unchanged when it is not a string or
        cannot be evaluated.
    """
    if not isinstance(s, str):
        return s
    try:
        # SECURITY: eval() on file contents is unsafe for untrusted data.
        # The empty namespaces stop plain words such as "max" or "os" from
        # resolving to builtins/module globals (they now stay strings), but
        # this is NOT a sandbox. Replace with a real expression parser if the
        # CSVs can come from an untrusted source.
        return eval(s.replace('-', ' - '), {'__builtins__': {}}, {})
    except Exception:
        # Not an expression (names, syntax errors, ...): keep the raw text.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return s

def hex_to_decimal(x):
    """Interpret ``x`` as a base-16 string and return it as a ``Decimal``.

    Args:
        x: The raw cell value.

    Returns:
        ``Decimal(int(x, 16))`` when ``x`` is a valid hexadecimal string,
        otherwise ``x`` unchanged (non-strings and non-hex text pass through).
    """
    try:
        return Decimal(int(x, 16))
    except (TypeError, ValueError):
        # TypeError: x is not a string (e.g. already numeric or NaN).
        # ValueError: x is text that is not valid hexadecimal.
        return x

# ─────────────────────────────────────────────────────
# Per-folder preprocessing
# ─────────────────────────────────────────────────────
def _read_folder_csvs(folder_path, ext):
    """Load every file ending in ``ext`` directly inside ``folder_path``.

    Returns one concatenated DataFrame, or an empty DataFrame when the folder
    does not exist or holds no matching file.
    """
    if not os.path.isdir(folder_path):
        return pd.DataFrame()
    # Sorted so the row order, and therefore the seeded train/test split,
    # does not depend on the order the filesystem lists the files in.
    frames = [
        pd.read_csv(os.path.join(folder_path, filename))
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith(ext)
    ]
    if not frames:
        return pd.DataFrame()
    # A single concat avoids re-copying the accumulated rows for every file.
    return pd.concat(frames, ignore_index=True)


def process_folder(folder_path, ext, base_output_folder, attack_name):
    """Preprocess all CSVs of one attack folder and write a train/test split.

    Steps:
      1. Load and concatenate every ``ext`` file in ``folder_path``.
      2. Convert 'frame.time' to a timestamp via ``parse_frame_time``.
      3. Drop columns that hold exactly one distinct value.
      4. Run ``evaluate_string_formula`` then ``hex_to_decimal`` over every
         object column except the MAC columns and 'Label'.
      5. Keep only non-object columns plus the MAC columns
         ('wlan.sa', 'wlan.da').
      6. Add a 'label' column by mapping 'Label' through ``label_mapping``.
      7. Split 70/30 (``random_state=42``), stratified on 'label' when
         possible, and write ``train_preprocessed.csv`` and
         ``test_preprocessed.csv`` to ``<base_output_folder>/<attack_name>``.

    Args:
        folder_path: Directory holding the raw CSV files.
        ext: File-name suffix selecting the files to load (e.g. '.csv').
        base_output_folder: Root directory for the preprocessed output.
        attack_name: Name of the output sub-directory, also used in log lines.

    Returns:
        None. When no data is found a message is printed and nothing is
        written.
    """
    df = _read_folder_csvs(folder_path, ext)
    if df.empty:
        print(f"[{folder_path}] 폴더에 CSV 파일이 없습니다. 스킵합니다.")
        return

    # Map the labels up front. A textual 'Label' column has object dtype and
    # is removed by the "keep only MAC object columns" filter below (and may
    # be removed earlier as a constant column), so mapping it afterwards
    # would never find it.
    labels = df['Label'].map(label_mapping) if 'Label' in df.columns else None

    if 'frame.time' in df.columns:
        df['frame.time'] = df['frame.time'].apply(parse_frame_time)

    # Columns with a single distinct value carry no information.
    drop_cols = [col for col in df.columns if df[col].nunique() == 1]
    df.drop(columns=drop_cols, inplace=True)

    mac_cols = ['wlan.sa', 'wlan.da']
    # Formula and hexadecimal conversion; MAC addresses and class names are
    # identifiers, not numbers, so they are left as they are.
    untouched = set(mac_cols) | {'Label'}
    for col in df.select_dtypes(include=['object']).columns:
        if col not in untouched:
            df[col] = df[col].apply(evaluate_string_formula)
            df[col] = df[col].apply(hex_to_decimal)

    # Of the columns that are still object dtype after conversion, keep only
    # the MAC address columns.
    # NOTE(review): hex_to_decimal returns Decimal objects, which pandas
    # presumably stores as object dtype, so columns converted that way look
    # like they are dropped here as well — confirm whether that is intended.
    object_cols = df.select_dtypes(include=['object']).columns
    keep_objects = [col for col in object_cols if col in mac_cols]
    numeric_cols = df.select_dtypes(exclude=['object']).columns
    df = pd.concat([df[numeric_cols], df[keep_objects]], axis=1)

    if labels is not None:
        df['label'] = labels

    stratify_val = df['label'] if 'label' in df.columns else None
    try:
        train_df, test_df = train_test_split(
            df, test_size=0.3, random_state=42, stratify=stratify_val
        )
    except ValueError as err:
        if stratify_val is None:
            raise
        # Stratification can fail (e.g. a class with a single row or
        # unmapped labels); a plain random split is better than no output.
        print(f"[{attack_name}] stratified split failed ({err}); "
              f"falling back to a non-stratified split.")
        train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

    attack_output_folder = os.path.join(base_output_folder, attack_name)
    os.makedirs(attack_output_folder, exist_ok=True)

    train_df.to_csv(os.path.join(attack_output_folder, "train_preprocessed.csv"), index=False)
    test_df.to_csv(os.path.join(attack_output_folder, "test_preprocessed.csv"), index=False)
    print(f"[{attack_name}] 전처리 완료 및 저장: {attack_output_folder}")

# ─────────────────────────────────────────────────────
# Run everything
# ─────────────────────────────────────────────────────
# Process each configured attack folder in the order listed in `folders`.
for attack_name in folders:
    ext = folders[attack_name]
    folder_path = os.path.join(dataset_folder, attack_name)
    process_folder(folder_path, ext, base_output_folder, attack_name)

print("=== 모든 공격별 전처리 및 train/test 분리 완료 ===")


  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_f

[1.Deauth] 전처리 완료 및 저장: C:\Users\idle9\Desktop\Naver MYBOX\AISLab\Dataset\AWID3_Dataset_CSV\Preprocessed\1.Deauth


  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_f

[2.Disas] 전처리 완료 및 저장: C:\Users\idle9\Desktop\Naver MYBOX\AISLab\Dataset\AWID3_Dataset_CSV\Preprocessed\2.Disas


  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_folder, attack_name)
  process_folder(folder_path, ext, base_output_f