In [None]:
import os
import zipfile

# Archive location and the directory it is unpacked into
zip_path = "/content/mon_25.zip"
extract_dir = "/content"

# Unpack every member of the archive into extract_dir
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_dir)

print("압축 해제 완료.")

# Verify the expected folder exists, then summarise the .cell files in it
folder_path = "/content/mon_25"
folder_found = os.path.exists(folder_path)
print("폴더 존재 여부:", folder_found)

if not folder_found:
    # Nothing to list; ask the user to check the path
    print("경로를 확인")
else:
    files = []
    for entry in os.listdir(folder_path):
        if entry.endswith(".cell"):
            files.append(entry)
    print("총 .cell 파일 수:", len(files))
    print("샘플 5개:", files[:5])

압축 해제 완료.
폴더 존재 여부: True
총 .cell 파일 수: 30000
샘플 5개: ['21-4_join.cell', '3-45_join.cell', '7-18_split_4.cell', '3-177_split_4.cell', '2-194_join.cell']


IAT 주파수 특징 추출 함수

In [None]:
import numpy as np
import pandas as pd

MAX_IAT_LEN = 2048

# A window whose standard deviation is below this fraction of its largest
# absolute value is treated as constant (the spread is float rounding noise).
_CONSTANT_RTOL = 1e-12


def _zero_frequency_features():
    """Return the all-zero feature dict used when the window has no non-DC content."""
    return {
        "freq_total_power": 0.0,
        "freq_band_low": 0.0,
        "freq_band_mid": 0.0,
        "freq_band_high": 0.0,
        "freq_dom_idx_norm": 0.0,
        "freq_dom_ratio": 0.0,
        "freq_spectral_centroid": 0.0,
    }


def compute_iat_frequency_features(times, max_len=MAX_IAT_LEN):
    """Summarise the magnitude spectrum of a trace's inter-arrival times (IAT).

    The timestamps are differenced into IATs, truncated or right-padded with
    zeros to exactly ``max_len`` samples, z-score normalised, and passed
    through a real FFT. Seven scalar features are derived from the magnitude
    spectrum with the DC bin excluded.

    Parameters
    ----------
    times : array-like of float
        Packet timestamps. Non-finite entries (NaN/inf) are dropped before
        differencing.
    max_len : int, optional
        Number of IAT samples fed to the FFT. Must be >= 1.

    Returns
    -------
    dict or None
        ``None`` when fewer than two finite timestamps are available.
        Otherwise a dict with the keys:

        - ``freq_total_power``: sum of the non-DC magnitudes (a sum of
          magnitudes, not of squared magnitudes).
        - ``freq_band_low`` / ``freq_band_mid`` / ``freq_band_high``: share of
          that sum in the first quarter, second quarter and upper half of the
          non-DC bins.
        - ``freq_dom_idx_norm``: index of the largest non-DC bin divided by
          the number of rFFT bins.
        - ``freq_dom_ratio``: magnitude of that bin divided by the sum of all
          magnitudes including DC.
        - ``freq_spectral_centroid``: magnitude-weighted mean bin index
          divided by the number of rFFT bins.

        All values are 0.0 when the window is (numerically) constant.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    times = np.asarray(times, dtype=np.float64)
    # A single NaN/inf timestamp would otherwise turn every feature into NaN.
    times = times[np.isfinite(times)]
    if len(times) < 2:
        return None

    iat = np.diff(times)

    # Fix the window length: truncate long traces, zero-pad short ones.
    if len(iat) >= max_len:
        iat_window = iat[:max_len]
    else:
        pad_len = max_len - len(iat)
        iat_window = np.pad(iat, (0, pad_len), mode="constant", constant_values=0.0)

    # Z-score normalisation. Comparing std against exactly 0 is not enough:
    # evenly spaced timestamps give IATs that differ only by rounding error,
    # and dividing by that tiny std would blow the noise up into the features.
    sigma = np.std(iat_window)
    peak = np.max(np.abs(iat_window))
    if sigma <= _CONSTANT_RTOL * peak:
        return _zero_frequency_features()
    iat_window = (iat_window - np.mean(iat_window)) / sigma

    # Magnitude spectrum of the real FFT
    mag = np.abs(np.fft.rfft(iat_window))

    # Drop the DC bin
    mag_no_dc = mag[1:]
    total_power = np.sum(mag_no_dc)
    if len(mag_no_dc) == 0 or total_power == 0:
        return _zero_frequency_features()

    # Three bands over the non-DC bins: [0, n/4), [n/4, n/2), [n/2, n)
    n = len(mag_no_dc)
    b1_end = n // 4
    b2_end = n // 2

    band1_power = np.sum(mag_no_dc[:b1_end]) / total_power
    band2_power = np.sum(mag_no_dc[b1_end:b2_end]) / total_power
    band3_power = np.sum(mag_no_dc[b2_end:]) / total_power

    # Dominant bin; +1 converts the non-DC position back to an rFFT bin index
    dom_idx = np.argmax(mag_no_dc) + 1
    dom_power = mag[dom_idx]
    dom_ratio = dom_power / (total_power + mag[0])

    # Spectral centroid over the non-DC bins (bin indices 1..len(mag)-1)
    idxs = np.arange(1, len(mag))
    spectral_centroid = np.sum(idxs * mag_no_dc) / total_power

    return {
        "freq_total_power": float(total_power),
        "freq_band_low": float(band1_power),
        "freq_band_mid": float(band2_power),
        "freq_band_high": float(band3_power),
        "freq_dom_idx_norm": float(dom_idx / len(mag)),
        "freq_dom_ratio": float(dom_ratio),
        "freq_spectral_centroid": float(spectral_centroid / len(mag)),
    }


# CSV

In [None]:
# Build one row of IAT frequency features per non-"join" .cell trace and
# save the resulting table as CSV.
folder_path = "/content/mon_25"

features_list = []
labels_list = []
# Why each skipped file was dropped, so silent data loss shows up in the output
skipped = {"bad_label": 0, "read_error": 0, "too_short": 0, "no_features": 0}

# Sorted because os.listdir returns entries in arbitrary order; a stable row
# order keeps the saved CSV and any seeded split downstream reproducible.
files = sorted(
    f for f in os.listdir(folder_path) if f.endswith(".cell") and "join" not in f
)

print("총 파일 수:", len(files))

for idx, filename in enumerate(files):
    if idx % 5000 == 0:
        print(f"Processing {idx}/{len(files)}...")

    file_path = os.path.join(folder_path, filename)

    # The class label is the integer before the first "-" in the file name
    try:
        label = int(filename.split("-")[0])
    except ValueError:
        skipped["bad_label"] += 1
        continue

    # Best-effort read: an unreadable or malformed file is counted and skipped.
    # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
    try:
        df = pd.read_csv(file_path, sep=r"\s+", header=None, names=["time", "direction", "size"])
    except Exception:
        skipped["read_error"] += 1
        continue

    # At least two rows are needed to form one inter-arrival time
    if len(df) < 2:
        skipped["too_short"] += 1
        continue

    times = df["time"].values
    freq_feats = compute_iat_frequency_features(times)

    if freq_feats is None:
        skipped["no_features"] += 1
        continue

    freq_feats["label"] = label
    features_list.append(freq_feats)
    labels_list.append(label)

print("건너뛴 파일 수:", skipped)

df_freq = pd.DataFrame(features_list)
print("생성된 feature shape:", df_freq.shape)
print(df_freq.head())

save_path = "/content/iat_freq_features_mon25.csv"
df_freq.to_csv(save_path, index=False)
print("저장 완료:", save_path)


총 파일 수: 25000
Processing 0/25000...
Processing 5000/25000...
Processing 10000/25000...
Processing 15000/25000...
Processing 20000/25000...
생성된 feature shape: (22876, 8)
   freq_total_power  freq_band_low  freq_band_mid  freq_band_high  \
0      46344.420808       0.251711       0.252175        0.496114   
1      46301.786488       0.252531       0.249437        0.498032   
2      40535.274238       0.371010       0.127592        0.501398   
3      42617.099604       0.339590       0.160046        0.500365   
4      46286.860579       0.251716       0.250609        0.497675   

   freq_dom_idx_norm  freq_dom_ratio  freq_spectral_centroid  label  
0           0.000976        0.001029                0.498057      7  
1           0.006829        0.001118                0.498327      3  
2           0.000976        0.002555                0.475689     22  
3           0.002927        0.001785                0.500176     19  
4           0.000976        0.001205                0.498422      

# XGBoost

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the feature table into inputs (every column except "label") and target
X = df_freq.drop("label", axis=1)
y = df_freq["label"]

print("Feature shape:", X.shape)
print("Label 개수:", len(y), " / 고유 클래스:", y.nunique())

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise features; the scaler is fitted on the training split only so
# that no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost classifier. subsample/colsample_bytree below 1.0 make training
# stochastic, so the seed is fixed to keep the reported accuracy reproducible.
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    n_jobs=-1,
    eval_metric="mlogloss",
    random_state=42,
)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Overall accuracy plus per-class precision/recall/F1 on the held-out split
acc = accuracy_score(y_test, y_pred)
print("IAT 주파수 feature XGBoost Accuracy:", acc)
print(classification_report(y_test, y_pred))


Feature shape: (22876, 7)
Label 개수: 22876  / 고유 클래스: 25
IAT 주파수 feature XGBoost Accuracy: 0.1138548951048951
              precision    recall  f1-score   support

           0       0.11      0.08      0.10       191
           1       0.11      0.09      0.10       183
           2       0.12      0.13      0.13       191
           3       0.11      0.07      0.09       182
           4       0.10      0.13      0.11       180
           5       0.07      0.05      0.06       194
           6       0.17      0.15      0.16       193
           7       0.11      0.10      0.11       169
           8       0.12      0.09      0.10       184
           9       0.09      0.12      0.10       191
          10       0.16      0.17      0.16       189
          11       0.18      0.23      0.21       196
          12       0.13      0.20      0.16       168
          13       0.05      0.03      0.03       190
          14       0.16      0.11      0.13       171
          15       0.11   