In [None]:
import zipfile

# Unpack the dataset archive into the target directory.
zip_path = "/content/mon_25.zip"
extract_dir = "/content"

# ZipFile opens in read mode by default; the context manager closes the
# archive handle even if extraction fails part-way.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_dir)

print("압축 해제 완료!")

압축 해제 완료!


In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats

Burst 함수

In [None]:
def get_bursts(directions):
    """Return the lengths of consecutive runs of equal values.

    Args:
        directions: Indexable sequence supporting ``len()`` (e.g. a list or
            1-D array).

    Returns:
        A list with one integer per run, in order of appearance; an empty
        list when ``directions`` is empty.
    """
    total = len(directions)
    if total == 0:
        return []

    # Start with the first element as a run of length one, then either
    # extend the most recent run or open a new one.
    run_lengths = [1]
    for pos in range(1, total):
        if directions[pos] == directions[pos - 1]:
            run_lengths[-1] += 1
        else:
            run_lengths.append(1)
    return run_lengths

기본 feature + chunk feature extract 함수

In [None]:
def _stat_or_zero(stat_fn, values):
    """Return ``stat_fn(values)``, or 0 when ``values`` is empty.

    ``np.mean``/``np.std`` yield NaN (with a RuntimeWarning) on empty input
    and ``np.max`` raises, so empty groups are mapped to 0 instead.
    """
    return stat_fn(values) if len(values) > 0 else 0


def extract_features_with_chunk(file_path, N=50, num_chunks=5):
    """Build a flat feature dict from one whitespace-separated packet trace.

    The file is read as three header-less columns named ``time``,
    ``direction`` and ``size``. Rows with ``direction == -1`` are counted as
    incoming and rows with ``direction == 1`` as outgoing; sizes are taken
    as absolute values.

    Args:
        file_path: Path of the trace file to read.
        N: Number of leading packets summarised by the ``first{N}_*``
            features (fewer are used when the trace is shorter).
        num_chunks: Number of consecutive segments the trace is split into
            for the ``chunk{i}_*`` features. The last segment absorbs the
            remainder of the integer division.

    Returns:
        A dict mapping feature names to numeric values, or ``None`` when the
        file cannot be read/parsed or holds fewer than two rows. Statistics
        over an empty group (e.g. no incoming packets, or an empty chunk)
        are reported as 0 rather than NaN.
    """
    try:
        df = pd.read_csv(file_path, sep=r"\s+", header=None, names=['time','direction','size'])
    except Exception:
        # Best-effort: an unreadable trace is reported as None so the caller
        # can skip it. Unlike a bare ``except``, this lets KeyboardInterrupt
        # and SystemExit propagate.
        return None

    n_packets = len(df)
    if n_packets < 2:
        # At least two packets are needed for an inter-packet time.
        return None

    features = {}

    # Raw columns as arrays
    directions = df['direction'].values
    sizes = np.abs(df['size'].values)
    times = df['time'].values

    is_in = directions == -1
    is_out = directions == 1

    # Basic counts and rates
    features['total_packets'] = n_packets
    features['num_in'] = np.sum(is_in)
    features['num_out'] = np.sum(is_out)
    features['incoming_ratio'] = features['num_in'] / n_packets

    features['duration'] = times[-1] - times[0]
    # Small epsilon avoids division by zero when all timestamps are equal.
    features['pkts_per_sec'] = n_packets / (features['duration'] + 1e-9)

    # Inter-packet times; non-empty because n_packets >= 2.
    ipt = np.diff(times)
    features['ipt_mean'] = np.mean(ipt)
    features['ipt_std'] = np.std(ipt)
    features['ipt_max'] = np.max(ipt)
    features['ipt_q75'] = np.percentile(ipt, 75)

    # Bursts: runs of consecutive packets with the same direction value.
    bursts = get_bursts(directions)
    features['burst_count'] = len(bursts)
    features['burst_mean'] = np.mean(bursts)
    features['burst_std'] = np.std(bursts)
    features['burst_max'] = np.max(bursts)

    # Packet sizes per direction
    in_sizes = sizes[is_in]
    out_sizes = sizes[is_out]

    features['in_size_mean'] = _stat_or_zero(np.mean, in_sizes)
    features['in_size_std'] = _stat_or_zero(np.std, in_sizes)
    features['in_size_max'] = _stat_or_zero(np.max, in_sizes)

    features['out_size_mean'] = _stat_or_zero(np.mean, out_sizes)
    features['out_size_std'] = _stat_or_zero(np.std, out_sizes)
    features['out_size_max'] = _stat_or_zero(np.max, out_sizes)

    # First N packets (or the whole trace when it is shorter than N)
    N_actual = min(n_packets, N)
    f_d = directions[:N_actual]
    f_s = sizes[:N_actual]
    f_t = times[:N_actual]

    features[f'first{N}_in_count'] = np.sum(f_d == -1)
    features[f'first{N}_out_count'] = np.sum(f_d == 1)

    f_in = f_s[f_d == -1]
    f_out = f_s[f_d == 1]

    features[f'first{N}_in_size_mean'] = _stat_or_zero(np.mean, f_in)
    features[f'first{N}_in_size_std'] = _stat_or_zero(np.std, f_in)

    features[f'first{N}_out_size_mean'] = _stat_or_zero(np.mean, f_out)
    features[f'first{N}_out_size_std'] = _stat_or_zero(np.std, f_out)

    # np.diff is empty when fewer than two leading packets are selected,
    # in which case both statistics fall back to 0.
    f_ipt = np.diff(f_t)
    features[f'first{N}_ipt_mean'] = _stat_or_zero(np.mean, f_ipt)
    features[f'first{N}_ipt_std'] = _stat_or_zero(np.std, f_ipt)

    # Chunk-based features.
    # When the trace has fewer packets than num_chunks, chunk_size is 0 and
    # every chunk except the last one is empty.
    chunk_size = n_packets // num_chunks

    for c in range(num_chunks):
        start = c * chunk_size
        end = (c + 1) * chunk_size if c < num_chunks - 1 else n_packets

        chunk_d = directions[start:end]
        chunk_s = sizes[start:end]
        # ipt has one element fewer than the packet arrays: this slice
        # includes the gap to the first packet after the chunk when one
        # exists, and can be empty for the final chunk.
        chunk_ipt = ipt[start:end]

        prefix = f"chunk{c+1}"

        features[f"{prefix}_ipt_mean"] = _stat_or_zero(np.mean, chunk_ipt)
        features[f"{prefix}_ipt_std"] = _stat_or_zero(np.std, chunk_ipt)

        inbound = np.sum(chunk_d == -1)
        outbound = np.sum(chunk_d == 1)
        total = len(chunk_d)

        features[f"{prefix}_in_ratio"] = inbound/total if total>0 else 0
        features[f"{prefix}_out_ratio"] = outbound/total if total>0 else 0

        # Number of direction runs inside the chunk (0 for an empty chunk).
        features[f"{prefix}_burst"] = len(get_bursts(chunk_d))

        features[f"{prefix}_size_mean"] = _stat_or_zero(np.mean, chunk_s)

    return features


feature CSV

In [None]:
# Build one feature row per trace file and persist the table as CSV.
folder_path = "mon_25"

data_list = []
labels = []  # not populated here; each label is stored inside its feature dict

# os.listdir returns entries in arbitrary order; sorting makes the row order
# (and therefore any seeded split performed on the table) reproducible.
files = sorted(
    f for f in os.listdir(folder_path) if f.endswith(".cell") and "join" not in f
)

skipped = 0  # files dropped because the label or the trace could not be parsed

for idx, filename in enumerate(files):
    if idx % 5000 == 0:
        print(f"Processing {idx}/{len(files)}...")

    file_path = os.path.join(folder_path, filename)

    # The label is the integer before the first "-" in the file name.
    try:
        label = int(filename.split("-")[0])
    except ValueError:
        skipped += 1
        continue

    feats = extract_features_with_chunk(file_path)
    if feats is None:
        # The extractor returns None for unreadable or too-short traces.
        skipped += 1
        continue

    feats["label"] = label
    data_list.append(feats)

df = pd.DataFrame(data_list)
df.to_csv("chunk_features_mon25.csv", index=False)

print("CSV 저장완료")
print(f"Skipped {skipped} of {len(files)} files")
print(df.shape)
df.head()

Processing 0/25000...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

Processing 5000/25000...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

Processing 10000/25000...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

Processing 15000/25000...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

Processing 20000/25000...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=

CSV 저장완료
(22876, 59)


Unnamed: 0,total_packets,num_in,num_out,incoming_ratio,duration,pkts_per_sec,ipt_mean,ipt_std,ipt_max,ipt_q75,...,chunk4_out_ratio,chunk4_burst,chunk4_size_mean,chunk5_ipt_mean,chunk5_ipt_std,chunk5_in_ratio,chunk5_out_ratio,chunk5_burst,chunk5_size_mean,label
0,226,85,141,0.376106,39.566052,5.711967,0.175849,1.636584,23.788,0.0225,...,0.755556,6,512.0,0.686034,3.571496,0.23913,0.76087,14,512.0,20
1,78,78,0,1.0,10.8995,7.156292,0.141552,0.615273,4.369,0.011,...,0.0,1,512.0,0.584,1.20552,1.0,0.0,1,512.0,24
2,546,546,0,1.0,12.891,42.355131,0.023653,0.324585,6.8955,0.002,...,0.0,1,512.0,0.014394,0.098181,1.0,0.0,1,512.0,9
3,508,508,0,1.0,2.758,184.191443,0.00544,0.024157,0.191,0.001,...,0.0,1,512.0,0.003034,0.010729,1.0,0.0,1,512.0,1
4,757,757,0,1.0,2.6385,286.905439,0.00349,0.007143,0.1095,0.0045,...,0.0,1,512.0,0.005046,0.007928,1.0,0.0,1,512.0,4


XGBoost 학습 평가

In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separate the class label from the feature columns.
y = df["label"]
X = df.drop(columns=["label"])

# Hold out 20% of the rows for evaluation; stratify keeps the per-class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the gradient-boosted tree classifier.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 8,
    "n_jobs": -1,
    "eval_metric": 'mlogloss',
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.2318618881118881
              precision    recall  f1-score   support

           0       0.12      0.10      0.11       191
           1       0.17      0.17      0.17       183
           2       0.20      0.21      0.21       191
           3       0.29      0.24      0.26       182
           4       0.13      0.14      0.13       180
           5       0.22      0.18      0.20       194
           6       0.26      0.28      0.27       193
           7       0.15      0.21      0.18       169
           8       0.22      0.16      0.18       184
           9       0.24      0.21      0.23       191
          10       0.30      0.25      0.27       189
          11       0.47      0.43      0.45       196
          12       0.21      0.36      0.27       168
          13       0.09      0.06      0.07       190
          14       0.20      0.12      0.15       171
          15       0.19      0.22      0.20       179
          16       0.35      0.38      0.37       17