# feature_engineering_v0.5.py

# 1. 개요
`feature_engineering_v0.5.py` 파일은 Raw Data(Pickle)를 입력으로 받아  
피처를 추출하고, 피처 세트별 성능을 검증한 뒤 최종 학습용 CSV 데이터셋을 생성하는 전처리 파일입니다.

---

# 2. 주요 기능 - Feature Engineering: Extraction & Validation
**설명:**  
- Raw Data로부터 **Non-leaky Feature** 및 **Fingerprint Feature**를 추출  
- 피처 세트 **Set A(Only Non-leaky)**, **Set B(Only Fingerprint)**, **Set A+B(Non-leaky + Fingerprint)** 의 모델 성능을 비교 검증  
- 검증 결과를 기반으로 최종 학습용 CSV 파일 생성



In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight
import os

# non leaky Feature 추출 (Set A: 18개)
def extract_non_leaky_features(trace, N_PACKETS=100, T_SECONDS=5.0, N_PACKETS_30=30):
    """Extract the 18 Set A ("non-leaky") features from one trace.

    Each entry of ``trace`` is treated as a signed timestamp: the absolute
    value is the packet time and the sign is the direction (+1 is counted as
    "out", -1 as "in"). A leading zero is given direction +1; a zero anywhere
    else keeps direction 0 and is counted as neither in nor out.

    Args:
        trace: Sequence of signed timestamps, or None.
        N_PACKETS: Size of the leading packet window used for the
            ``*_N_pkts`` features.
        T_SECONDS: Inclusive upper bound on the absolute timestamp used to
            select packets for the ``*_T_sec`` features.
        N_PACKETS_30: Size of the shorter leading window used for the
            ``*_N_pkts_30`` ratios.

    Returns:
        dict mapping feature name to value, in a fixed insertion order.
        An empty dict is returned when ``trace`` is None or empty.
    """
    if trace is None or len(trace) == 0:
        return {}

    signed = np.asarray(trace)
    timestamps = np.abs(signed)
    directions = np.sign(signed)  # new array, so the caller's data is not mutated below
    if directions[0] == 0:
        directions[0] = 1
    trace_len = len(timestamps)
    features = {}

    # --- First N packets: duration, direction ratios, inter-packet times ---
    n = min(trace_len, N_PACKETS)
    ts_n = timestamps[:n]
    dir_n = directions[:n]
    features['time_for_N_pkts'] = ts_n[-1]
    features['out_ratio_N_pkts'] = np.sum(dir_n == 1) / n
    features['in_ratio_N_pkts'] = np.sum(dir_n == -1) / n
    if n > 1:
        ipt_n = np.diff(ts_n)
        features['avg_ipt_N_pkts'] = np.mean(ipt_n)
        features['std_ipt_N_pkts'] = np.std(ipt_n)
    else:
        features['avg_ipt_N_pkts'] = 0
        features['std_ipt_N_pkts'] = 0

    # --- Packets whose absolute timestamp is <= T_SECONDS ---
    mask_t = timestamps <= T_SECONDS
    ts_t = timestamps[mask_t]
    dir_t = directions[mask_t]
    n_t = len(ts_t)
    features['packets_in_T_sec'] = n_t
    if n_t > 0:
        features['out_ratio_T_sec'] = np.sum(dir_t == 1) / n_t
        features['in_ratio_T_sec'] = np.sum(dir_t == -1) / n_t
        if n_t > 1:
            ipt_t = np.diff(ts_t)
            features['avg_ipt_T_sec'] = np.mean(ipt_t)
            features['std_ipt_T_sec'] = np.std(ipt_t)
        else:
            features['avg_ipt_T_sec'] = 0
            features['std_ipt_T_sec'] = 0
    else:
        features['out_ratio_T_sec'] = 0
        features['in_ratio_T_sec'] = 0
        features['avg_ipt_T_sec'] = 0
        features['std_ipt_T_sec'] = 0

    # --- First "in" packet over the whole trace ---
    in_positions = np.where(directions == -1)[0]
    if len(in_positions) > 0:
        first_in_idx = in_positions[0]
        features['time_to_first_in'] = timestamps[first_in_idx]
        features['pkts_before_first_in'] = first_in_idx
    else:
        # Sentinels: no "in" packet was seen anywhere in the trace.
        features['time_to_first_in'] = -1
        features['pkts_before_first_in'] = trace_len

    # --- Direction changes within the first N packets ---
    if n > 1:
        direction_changes = np.diff(dir_n) != 0
        # Note: this is the number of direction changes, not the number of runs.
        features['burst_count_N_pkts'] = np.sum(direction_changes)
        if features['burst_count_N_pkts'] > 0:
            # Padding with True on both ends turns the change positions into
            # run boundaries; consecutive differences are the run lengths.
            boundaries = np.where(np.concatenate(([True], direction_changes, [True])))[0]
            features['avg_burst_len_N_pkts'] = np.mean(np.diff(boundaries))
        else:
            features['avg_burst_len_N_pkts'] = n
    else:
        features['burst_count_N_pkts'] = 0
        features['avg_burst_len_N_pkts'] = n

    # --- Per-direction inter-packet times within the first N packets ---
    ts_out_n = ts_n[dir_n == 1]
    ts_in_n = ts_n[dir_n == -1]
    features['avg_ipt_out_N_pkts'] = np.mean(np.diff(ts_out_n)) if len(ts_out_n) > 1 else 0
    features['avg_ipt_in_N_pkts'] = np.mean(np.diff(ts_in_n)) if len(ts_in_n) > 1 else 0

    # --- Direction ratios over the shorter leading window ---
    n_30 = min(trace_len, N_PACKETS_30)
    dir_n_30 = directions[:n_30]
    features['in_ratio_N_pkts_30'] = np.sum(dir_n_30 == -1) / n_30 if n_30 > 0 else 0
    features['out_ratio_N_pkts_30'] = np.sum(dir_n_30 == 1) / n_30 if n_30 > 0 else 0
    return features

# fingerprint feature 추출 (Set B: 8개)
def extract_fingerprint_features(trace):
    """Extract the 8 Set B ("fingerprint") whole-trace features.

    Each entry of ``trace`` is treated as a signed timestamp: the absolute
    value is the packet time and the sign is the direction (+1 counted as
    "out", -1 as "in"). A leading zero is given direction +1.

    Args:
        trace: Sequence of signed timestamps, or None.

    Returns:
        dict with keys ``total_packets``, ``t_max``, ``t_mean``, ``t_std``,
        ``num_in``, ``num_out``, ``in_ratio_total``, ``out_ratio_total`` (in
        that order), or an empty dict when ``trace`` is None or empty.
    """
    if trace is None or len(trace) == 0:
        return {}

    signed = np.array(trace)
    abs_times = np.abs(signed)
    signs = np.sign(signed)
    if signs[0] == 0:
        signs[0] = 1

    packet_total = len(abs_times)
    inbound = np.sum(signs == -1)
    outbound = np.sum(signs == 1)
    has_packets = packet_total > 0

    # Key order is part of the contract: it fixes the CSV column order.
    return {
        'total_packets': packet_total,
        't_max': abs_times[-1] if has_packets else 0,
        't_mean': np.mean(abs_times),
        't_std': np.std(abs_times),
        'num_in': inbound,
        'num_out': outbound,
        'in_ratio_total': inbound / packet_total if has_packets else 0,
        'out_ratio_total': outbound / packet_total if has_packets else 0,
    }

# All feature 추출 (Set A+B: 26개)
def extract_all_features(trace):
    """Return the fingerprint and non-leaky features of ``trace`` merged into one dict.

    Fingerprint keys come first, followed by the non-leaky keys; on a key
    collision the non-leaky value wins. Returns an empty dict when ``trace``
    is None or empty.
    """
    if trace is None or len(trace) == 0:
        return {}
    # Evaluated left to right: fingerprint first, then non-leaky on top.
    return {
        **extract_fingerprint_features(trace),
        **extract_non_leaky_features(trace),
    }

def load_pickle(filepath):
    """Load and return the object pickled at ``filepath``.

    Prints a progress line before loading. Any exception raised while opening
    or unpickling is reported on stdout and swallowed; callers detect failure
    by checking for a ``None`` return value.

    NOTE(review): unpickling executes code embedded in the file, so this must
    only be pointed at trusted data files.
    """
    print(f"Loading {filepath}...")
    loaded = None
    try:
        with open(filepath, 'rb') as stream:
            loaded = pickle.load(stream, encoding='latin1')
    except Exception as err:
        print(f"!!! Error loading {filepath}: {err}")
    return loaded

def _append_feature_rows(trace, label, rows_a, rows_b, rows_ab):
    """Extract Set A and Set B features for ``trace`` once and append a labelled row to each list."""
    feats_a = extract_non_leaky_features(trace)
    feats_b = extract_fingerprint_features(trace)
    rows_a.append({**feats_a, 'label': label})
    rows_b.append({**feats_b, 'label': label})
    # Fingerprint keys first, then non-leaky keys: the same column order and
    # precedence that extract_all_features produces, without re-running both
    # extractors a second time.
    rows_ab.append({**feats_b, **feats_a, 'label': label})


def _rows_to_clean_frame(rows):
    """Build a DataFrame from row dicts, replacing NaN and +/-inf with 0."""
    return pd.DataFrame(rows).fillna(0).replace([np.inf, -np.inf], 0)


def process_data_for_comparison(mon_data, unmon_data):
    """Build the Set A, Set B and Set A+B feature tables from raw traces.

    Args:
        mon_data: Mapping of ``site_id`` to an iterable of traces. Each trace
            is labelled ``site_id // 10``.
        unmon_data: Iterable of traces; every one is labelled 95.

    Returns:
        Tuple ``(df_A, df_B, df_AB)`` of DataFrames, each with one row per
        trace and a ``label`` column. Missing values (for example from empty
        traces, which yield no features) and infinities are replaced with 0.
    """
    data_A, data_B, data_AB = [], [], []

    print("Processing Monitored data...")
    for site_id, traces in tqdm(mon_data.items()):
        # Labels 0-94 per the original note; assumes site_id spans 0-949 — TODO confirm.
        label = site_id // 10
        for trace in traces:
            _append_feature_rows(trace, label, data_A, data_B, data_AB)

    print("Processing Unmonitored data...")
    for trace in tqdm(unmon_data):
        _append_feature_rows(trace, 95, data_A, data_B, data_AB)  # Unmonitored = 95

    return (
        _rows_to_clean_frame(data_A),
        _rows_to_clean_frame(data_B),
        _rows_to_clean_frame(data_AB),
    )

def train_evaluate_model(train_csv_path, test_csv_path, model_name):
    """Train one multi-class XGBoost model from CSV files and report test metrics.

    Both CSVs must contain a ``label`` column; every other column is used as
    a feature. Training rows are weighted with ``class_weight='balanced'``.

    Args:
        train_csv_path: Path of the training CSV.
        test_csv_path: Path of the test CSV.
        model_name: Display name used in the printed banner and in the
            returned metrics.

    Returns:
        Tuple ``(metrics, y_test, y_pred)`` where ``metrics`` is a dict with
        the model name, feature count, accuracy, macro F1, macro one-vs-rest
        ROC-AUC and the recall of class 95. If either CSV file is missing,
        an error is printed and ``(None, None, None)`` is returned.
    """
    print("\n" + "="*80)
    print(f"      TESTING: {model_name}")
    print("="*80)

    try:
        train_df = pd.read_csv(train_csv_path)
        test_df = pd.read_csv(test_csv_path)
    except FileNotFoundError as e:
        print(f"!!! Error loading CSV: {e}")
        return None, None, None

    X_train = train_df.drop('label', axis=1)
    y_train = train_df['label'].astype(int)
    X_test = test_df.drop('label', axis=1)
    y_test = test_df['label'].astype(int)

    num_classes = 96  # labels 0-95
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

    print(f"Features: {X_train.shape[1]} | Train samples: {X_train.shape[0]} | Test samples: {X_test.shape[0]}")
    print("Training Single Multi-Class XGBoost...")

    # `use_label_encoder` is intentionally not passed: the XGBoost version
    # this was run with reports it as an unused parameter on every fit.
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=num_classes,
        n_estimators=500,
        max_depth=8,
        learning_rate=0.1,
        random_state=42,
        eval_metric='mlogloss',
    )
    model.fit(X_train, y_train, sample_weight=sample_weights)

    print("Evaluating...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='macro')

    # Restrict the report to class 95 to read its recall; with
    # output_dict=True the per-class entry is keyed by the label as a string.
    report = classification_report(y_test, y_pred, labels=[95], output_dict=True, zero_division=0)
    recall_95 = report['95']['recall']

    metrics = {
        "Model": model_name,
        "Features": X_train.shape[1],
        "Accuracy": acc,
        "Macro F1": f1,
        "Macro ROC-AUC": auc,
        "Unmon (95) Recall": recall_95
    }

    return metrics, y_test, y_pred

if __name__ == "__main__":

    MON_FILE = 'mon_standard.pkl'
    UNMON_FILE = 'unmon_standard10.pkl'

    # Stage 1: pkl -> feature extraction -> train/test split -> save 6 CSVs
    print("--- STAGE 1: Extracting, Splitting, and Saving CSVs ---")

    mon_data_main = load_pickle(MON_FILE)
    unmon_data_main = load_pickle(UNMON_FILE)

    if mon_data_main is None or unmon_data_main is None:
        print("--- Data loading failed. Aborting script. ---")
        # SystemExit is always available (the bare `exit()` helper is injected
        # by the `site` module) and a non-zero status signals the failure.
        raise SystemExit(1)

    df_A, df_B, df_AB = process_data_for_comparison(mon_data_main, unmon_data_main)

    paths = {
        'A': ('train_set_A.csv', 'test_set_A.csv'),
        'B': ('train_set_B.csv', 'test_set_B.csv'),
        'AB': ('train_set_AB.csv', 'test_set_AB.csv')
    }

    for key, df in [('A', df_A), ('B', df_B), ('AB', df_AB)]:
        train_path, test_path = paths[key]

        print(f"\nProcessing and splitting Dataset {key}...")
        X = df.drop('label', axis=1)
        y = df['label']

        # Same seed and stratification for every set, so the three datasets
        # are split with identical settings.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y
        )

        # Re-attach the label as the last column before writing.
        train_df = pd.concat([X_train.reset_index(drop=True),
                              y_train.reset_index(drop=True)], axis=1)
        test_df = pd.concat([X_test.reset_index(drop=True),
                             y_test.reset_index(drop=True)], axis=1)

        train_df.to_csv(train_path, index=False)
        test_df.to_csv(test_path, index=False)
        print(f"✓ Saved {train_path} (Shape: {train_df.shape})")
        print(f"✓ Saved {test_path} (Shape: {test_df.shape})")

    print("\n--- STAGE 2: Loading from CSV and Training Models ---")

    results_list = []

    # Model 1 (Unleaky Only)
    results_A, _, _ = train_evaluate_model(paths['A'][0], paths['A'][1], "Set A (Unleaky Only)")
    if results_A:
        results_list.append(results_A)

    # Model 2 (Fingerprint Only)
    results_B, _, _ = train_evaluate_model(paths['B'][0], paths['B'][1], "Set B (Fingerprint Only)")
    if results_B:
        results_list.append(results_B)

    # Model 3 (All Features)
    results_AB, y_test_AB, y_pred_AB = train_evaluate_model(paths['AB'][0], paths['AB'][1], "Set A+B (All Features)")
    if results_AB:
        results_list.append(results_AB)

    # Final report
    print("\n" + "="*80)
    print("           FINAL FEATURE SET COMPARISON (Single Multi-Class Model)")
    print("="*80)

    results_df = pd.DataFrame(results_list)
    print(results_df.to_string(index=False, float_format="%.4f"))

    # Binary (monitored vs. unmonitored) view of the Set A+B model.
    # train_evaluate_model returns None for both arrays when it fails, so test
    # the values themselves; the names are always bound at this point.
    if y_test_AB is not None and y_pred_AB is not None:
        y_test_binary = np.where(y_test_AB == 95, 0, 1)
        y_pred_binary = np.where(y_pred_AB == 95, 0, 1)

        print("\n" + "="*80)
        print("      'Set A+B' 모델의 binary classification 성능 (Mon vs Unmon)")
        print("="*80)

        print(f"Total Binary Accuracy: {accuracy_score(y_test_binary, y_pred_binary):.4f}")
        print("\n Binary Classification Report:")
        print(classification_report(y_test_binary, y_pred_binary, target_names=['Unmonitored (0)', 'Monitored (1)']))
    else:
        print("\n--- Skipping Binary Analysis (Model A+B failed) ---")

--- STAGE 1: Extracting, Splitting, and Saving CSVs ---
Loading mon_standard.pkl...
Loading unmon_standard10.pkl...
Processing Monitored data...


100%|██████████| 950/950 [00:48<00:00, 19.61it/s]


Processing Unmonitored data...


100%|██████████| 10000/10000 [00:29<00:00, 334.90it/s]



Processing and splitting Dataset A...
✓ Saved train_set_A.csv (Shape: (20300, 19))
✓ Saved test_set_A.csv (Shape: (8700, 19))

Processing and splitting Dataset B...
✓ Saved train_set_B.csv (Shape: (20300, 9))
✓ Saved test_set_B.csv (Shape: (8700, 9))

Processing and splitting Dataset AB...
✓ Saved train_set_AB.csv (Shape: (20300, 27))
✓ Saved test_set_AB.csv (Shape: (8700, 27))

--- STAGE 2: Loading from CSV and Training Models ---

      TESTING: Set A (Unleaky Only)
Features: 18 | Train samples: 20300 | Test samples: 8700
Training Single Multi-Class XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating...

      TESTING: Set B (Fingerprint Only)
Features: 8 | Train samples: 20300 | Test samples: 8700
Training Single Multi-Class XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating...

      TESTING: Set A+B (All Features)
Features: 26 | Train samples: 20300 | Test samples: 8700
Training Single Multi-Class XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating...

           FINAL FEATURE SET COMPARISON (Single Multi-Class Model)
                   Model  Features  Accuracy  Macro F1  Macro ROC-AUC  Unmon (95) Recall
    Set A (Unleaky Only)        18    0.5201    0.3954         0.9512             0.7323
Set B (Fingerprint Only)         8    0.5643    0.5574         0.9658             0.4807
  Set A+B (All Features)        26    0.7937    0.7575         0.9913             0.8323

      'Set A+B' 모델의 binary classification 성능 (Mon vs Unmon)
Total Binary Accuracy: 0.9109

 Binary Classification Report:
                 precision    recall  f1-score   support

Unmonitored (0)       0.90      0.83      0.87      3000
  Monitored (1)       0.92      0.95      0.93      5700

       accuracy                           0.91      8700
      macro avg       0.91      0.89      0.90      8700
   weighted avg       0.91      0.91      0.91      8700

