In [None]:
import pandas as pd

# Load the full feature table from CSV into a DataFrame.
all_df = pd.read_csv(filepath_or_buffer='/content/all_features.csv')

# Per-column summary statistics, rendered as this cell's output.
all_df.describe()

Unnamed: 0,all_total_packets,all_num_in,all_num_out,all_incoming_ratio,all_duration,all_pkts_per_sec,all_ipt_mean,all_ipt_std,all_ipt_max,all_ipt_q75,...,firstT_packets,firstT_in_ratio,firstT_out_ratio,firstT_ipt_mean,firstT_ipt_std,first30_in_ratio,first30_out_ratio,firstin_time,firstin_pkts_before,label
count,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,...,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0,172085.0
mean,981.683465,904.591882,77.091583,0.836945,13.747864,148.091007,0.04366,0.261023,4.281876,0.016502,...,416.821263,0.778056,0.221944,0.084556,0.167037,0.756304,0.243696,1.020962,14.8187,70.772078
std,1136.425769,1110.021706,120.572424,0.273002,13.517647,309.82241,0.150884,0.557011,6.606039,0.131627,...,622.884015,0.360676,0.360676,0.195728,0.220571,0.399859,0.399859,2.836852,40.2883,30.875847
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,200.0,148.0,0.0,0.807377,4.376,27.644431,0.006332,0.049659,0.957,0.001,...,50.0,0.717391,0.0,0.005553,0.037173,0.633333,0.0,0.0,0.0,47.0
50%,566.0,489.0,23.0,0.970732,9.054303,67.227543,0.01487,0.113602,1.9465,0.002,...,177.0,1.0,0.0,0.014721,0.086495,1.0,0.0,0.0,0.0,93.0
75%,1334.0,1233.0,112.0,1.0,18.017591,157.631111,0.036188,0.263711,4.521112,0.00475,...,521.0,1.0,0.282609,0.06474,0.208494,1.0,0.366667,0.0,2.0,95.0
max,9122.0,9047.0,2898.0,1.0,85.8685,19200.000429,11.08325,18.817187,80.194999,19.7835,...,8252.0,1.0,1.0,4.974,2.4495,1.0,1.0,56.626537,2231.0,95.0


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def _split_features_labels(df):
  """Return ``(X, y)``: every column except 'label', and the 'label' column."""
  return df.drop(columns=['label']), df['label']


def _print_report(tag, y_true, y_pred):
  """Print accuracy and the per-class report for one experiment."""
  print(f"[{tag}] Accuracy:", accuracy_score(y_true, y_pred))
  # zero_division=0 keeps the reported value (0.0) for classes that are never
  # predicted, but without emitting an UndefinedMetricWarning for each of them.
  print(classification_report(y_true, y_pred, zero_division=0))


def train_and_test_with_XGBoost(train_df, test_df, unmonitored_label=95, device='cuda'):
  """Train and evaluate three XGBoost classifiers on one train/test split.

  The experiments run in this order, each printing its accuracy and a
  classification report:

  1. Closed-world multiclass: rows whose label equals ``unmonitored_label``
     are removed from both splits before training and testing.
  2. Open-world binary: the target is 1 when the label differs from
     ``unmonitored_label`` and 0 otherwise; predictions threshold the
     positive-class probability at 0.5.
  3. Open-world multiclass: all labels, including ``unmonitored_label``.

  Args:
    train_df: DataFrame holding the feature columns plus a 'label' column.
    test_df: DataFrame with the same columns as ``train_df``.
    unmonitored_label: Label value that marks the unmonitored class.
    device: Value passed to ``XGBClassifier(device=...)``.

  Returns:
    None. All results are printed.
  """
  # Hyperparameters common to all three models; only the objective and the
  # evaluation metric differ between the binary and multiclass setups.
  shared_params = dict(
      tree_method='hist',
      learning_rate=0.05,
      n_estimators=200,
      max_depth=6,
      subsample=0.8,
      colsample_bytree=0.8,
      random_state=42,
      device=device,
  )
  multiclass_params = dict(
      shared_params,
      objective='multi:softprob',
      eval_metric='mlogloss',
  )

  # ===================================================
  # Closed world (unmonitored label removed)
  # ===================================================
  train_cw = train_df[train_df['label'] != unmonitored_label]
  test_cw = test_df[test_df['label'] != unmonitored_label]

  X_train_cw, y_train_cw = _split_features_labels(train_cw)
  X_test_cw, y_test_cw = _split_features_labels(test_cw)

  print("[INFO] Train shape:", X_train_cw.shape)
  print("[INFO] Test shape:", X_test_cw.shape)

  print("\n================ Closed-World (95-class) ================\n")

  num_classes = y_train_cw.nunique()
  print("[INFO] Classes:", num_classes)

  model_closed = XGBClassifier(**multiclass_params)
  model_closed.fit(X_train_cw, y_train_cw)
  pred_closed = model_closed.predict(X_test_cw)

  _print_report("Closed-World", y_test_cw, pred_closed)

  # ===================================================
  # Open world
  # ===================================================
  X_train_ow, y_train_ow = _split_features_labels(train_df)
  X_test_ow, y_test_ow = _split_features_labels(test_df)

  print("[INFO] Train shape:", X_train_ow.shape)
  print("[INFO] Test shape:", X_test_ow.shape)

  # Binary labels: monitored = 1, unmonitored = 0
  y_train_ow_bin = (y_train_ow != unmonitored_label).astype(int)
  y_test_ow_bin = (y_test_ow != unmonitored_label).astype(int)

  print("[INFO] Binary class ratio:", np.bincount(y_train_ow_bin))

  # Open world binary
  model_binary = XGBClassifier(
      **shared_params,
      objective='binary:logistic',
      eval_metric='logloss',
  )
  model_binary.fit(X_train_ow, y_train_ow_bin)

  # Column 1 of predict_proba is the probability of the positive (monitored) class.
  pred_binary = (model_binary.predict_proba(X_test_ow)[:, 1] > 0.5).astype(int)

  _print_report("Binary", y_test_ow_bin, pred_binary)

  # Open world multi
  print("\n================ Open-World Multiclass (0~95) ================\n")

  num_classes = y_train_ow.nunique()
  print("[INFO] Classes:", num_classes)

  model_multi = XGBClassifier(**multiclass_params, num_class=num_classes)
  model_multi.fit(X_train_ow, y_train_ow)
  pred_multi = model_multi.predict(X_test_ow)

  _print_report("Open-Multi", y_test_ow, pred_multi)


## 기본

In [None]:
# train/test split

from sklearn.model_selection import train_test_split

# Stratify on the label so both splits keep the same class proportions, and fix
# the seed so the split (and every metric computed from it) is reproducible
# after a kernel restart.
train_df, test_df = train_test_split(
    all_df,
    test_size=0.25,
    stratify=all_df['label'],
    random_state=42,
)

# Only the last expression of a cell is rendered automatically, so the train
# summary has to be displayed explicitly; the test summary is the cell output.
display(train_df.describe())
test_df.describe()

Unnamed: 0,all_total_packets,all_num_in,all_num_out,all_incoming_ratio,all_duration,all_pkts_per_sec,all_ipt_mean,all_ipt_std,all_ipt_max,all_ipt_q75,...,firstT_packets,firstT_in_ratio,firstT_out_ratio,firstT_ipt_mean,firstT_ipt_std,first30_in_ratio,first30_out_ratio,firstin_time,firstin_pkts_before,label
count,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,...,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0,43022.0
mean,982.707429,906.28074,76.426689,0.836689,13.664851,149.164484,0.043867,0.259641,4.248674,0.017037,...,419.447562,0.778474,0.221526,0.08544,0.167148,0.757104,0.242896,1.021297,14.750151,70.775231
std,1143.423459,1117.471875,120.091083,0.274499,13.458357,293.47045,0.145212,0.547574,6.538871,0.128302,...,631.895565,0.360985,0.360985,0.200724,0.222801,0.399936,0.399936,2.862344,41.493014,30.873915
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,199.0,148.0,0.0,0.808896,4.3417,27.672925,0.006291,0.049273,0.947,0.001,...,50.0,0.720383,0.0,0.005511,0.037018,0.633333,0.0,0.0,0.0,47.0
50%,564.0,487.0,22.0,0.97115,8.990854,67.656118,0.014782,0.113404,1.926649,0.002,...,178.0,1.0,0.0,0.014661,0.085824,1.0,0.0,0.0,0.0,93.0
75%,1332.0,1231.0,112.0,1.0,17.902986,158.666593,0.036141,0.260508,4.48425,0.00475,...,524.0,1.0,0.279617,0.064103,0.20699,1.0,0.366667,0.0,1.0,95.0
max,8705.0,8693.0,2231.0,1.0,84.300003,15333.333993,6.0375,15.452378,61.061724,11.450812,...,7791.0,1.0,1.0,4.5905,2.438,1.0,1.0,54.044556,2231.0,95.0


In [None]:
# Closed world: drop every row labelled 95 (the unmonitored class) from both splits.
train_cw = train_df.loc[train_df['label'].ne(95)].copy()
test_cw = test_df.loc[test_df['label'].ne(95)].copy()

# Separate the feature matrix from the target column for each split.
X_train_cw, y_train_cw = train_cw.drop(columns=['label']), train_cw['label']
X_test_cw, y_test_cw = test_cw.drop(columns=['label']), test_cw['label']

print(f"[INFO] Train shape: {X_train_cw.shape}")
print(f"[INFO] Test shape: {X_test_cw.shape}")

[INFO] Train shape: (65278, 49)
[INFO] Test shape: (21760, 49)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Hyperparameters shared by the multiclass runs (closed-world and open-world
# multiclass); the objective and metric are the multiclass softmax variants.
xgb_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'learning_rate': 0.05,
    'n_estimators': 200,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}

In [None]:
print("\n================ Closed-World (95-class) ================\n")

num_classes = y_train_cw.nunique()
print("[INFO] Classes:", num_classes)

# Build the model from the shared `xgb_params` dict instead of repeating every
# hyperparameter inline, so the multiclass runs cannot drift apart.
model_closed = XGBClassifier(**xgb_params, device='cuda')

model_closed.fit(X_train_cw, y_train_cw)

pred_closed = model_closed.predict(X_test_cw)

print("[Closed-World] Accuracy:", accuracy_score(y_test_cw, pred_closed))
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test_cw, pred_closed, zero_division=0))



[INFO] Classes: 95


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[Closed-World] Accuracy: 0.11801470588235294
              precision    recall  f1-score   support

           0       0.09      0.03      0.04       239
           1       0.09      0.07      0.08       229
           2       0.06      0.07      0.06       239
           3       0.07      0.04      0.05       228
           4       0.05      0.04      0.05       225
           5       0.12      0.05      0.07       242
           6       0.17      0.17      0.17       242
           7       0.07      0.13      0.09       211
           8       0.12      0.08      0.10       229
           9       0.05      0.03      0.03       240
          10       0.12      0.08      0.10       237
          11       0.22      0.26      0.24       246
          12       0.08      0.20      0.11       210
          13       0.00      0.00      0.00       237
          14       0.12      0.05      0.07       214
          15       0.07      0.08      0.07       224
          16       0.14      0.19   

In [None]:
# Open world: every row is kept, including those labelled 95.
X_train_ow, y_train_ow = train_df.drop(columns=['label']), train_df['label']
X_test_ow, y_test_ow = test_df.drop(columns=['label']), test_df['label']

print(f"[INFO] Train shape: {X_train_ow.shape}")
print(f"[INFO] Test shape: {X_test_ow.shape}")

# Binary targets: 1 = monitored (label != 95), 0 = unmonitored (label == 95).
y_train_ow_bin = y_train_ow.ne(95).astype(int)
y_test_ow_bin = y_test_ow.ne(95).astype(int)

# bincount lists the number of 0s followed by the number of 1s.
print("[INFO] Binary class ratio:", np.bincount(y_train_ow_bin))

[INFO] Train shape: (129063, 49)
[INFO] Test shape: (43022, 49)
[INFO] Binary class ratio: [63785 65278]


In [None]:
# Open-world binary detection: monitored (1) versus unmonitored (0).

model_binary = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    device='cuda',
)

model_binary.fit(X_train_ow, y_train_ow_bin)

# Column 1 of predict_proba holds the positive-class probability; a sample is
# called monitored only when that probability is strictly above 0.5.
pred_binary = (model_binary.predict_proba(X_test_ow)[:, 1] > 0.5).astype(int)

print("[Binary] Accuracy:", accuracy_score(y_test_ow_bin, pred_binary))
print(classification_report(y_test_ow_bin, pred_binary))

[Binary] Accuracy: 0.6203802705592487
              precision    recall  f1-score   support

           0       0.63      0.58      0.60     21262
           1       0.62      0.66      0.64     21760

    accuracy                           0.62     43022
   macro avg       0.62      0.62      0.62     43022
weighted avg       0.62      0.62      0.62     43022



In [None]:
# Open-world multiclass: all labels, with 95 (unmonitored) as one more class.

print("\n================ Open-World Multiclass (0~95) ================\n")

num_classes = y_train_ow.nunique()
print("[INFO] Classes:", num_classes)

# Build the model from the shared `xgb_params` dict instead of repeating every
# hyperparameter inline, so the multiclass runs cannot drift apart.
model_multi = XGBClassifier(**xgb_params, num_class=num_classes, device='cuda')

model_multi.fit(X_train_ow, y_train_ow)

pred_multi = model_multi.predict(X_test_ow)

print("[Open-Multi] Accuracy:", accuracy_score(y_test_ow, pred_multi))
# Some classes are never predicted here, which makes their precision undefined.
# zero_division=0 reports 0.0 for them without an UndefinedMetricWarning per class.
print(classification_report(y_test_ow, pred_multi, zero_division=0))




[INFO] Classes: 96
[Open-Multi] Accuracy: 0.5020687090325879
              precision    recall  f1-score   support

           0       0.50      0.01      0.02       239
           1       0.14      0.00      0.01       229
           2       0.00      0.00      0.00       239
           3       0.00      0.00      0.00       228
           4       0.33      0.00      0.01       225
           5       0.00      0.00      0.00       242
           6       0.61      0.05      0.08       242
           7       0.11      0.00      0.01       211
           8       0.67      0.02      0.03       229
           9       0.00      0.00      0.00       240
          10       0.18      0.01      0.02       237
          11       0.48      0.05      0.09       246
          12       0.40      0.02      0.04       210
          13       0.00      0.00      0.00       237
          14       0.50      0.00      0.01       214
          15       0.20      0.00      0.01       224
          16      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## PCA 수행

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import numpy as np


def apply_pca(train_df, test_df, n_components=20):
    """Standardize the features, project them with PCA, and re-attach the labels.

    The scaler and the PCA are both fitted on the training features only and
    then applied unchanged to the test features.

    Args:
        train_df: DataFrame with feature columns plus a 'label' column.
        test_df: DataFrame with the same columns as ``train_df``.
        n_components: Forwarded to ``PCA(n_components=...)``.

    Returns:
        ``(train_pca, test_pca)``: DataFrames that keep the row index of the
        corresponding input, with 'label' as the first column followed by one
        column per PCA component.
    """

    def _features(frame):
        # Everything except the target column.
        return frame.drop(columns=['label'])

    def _with_label(frame, components):
        # Reuse the source index so the join lines each projected row up with its label.
        projected = pd.DataFrame(components, index=frame.index)
        return frame[['label']].copy().join(projected)

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(_features(train_df))
    test_scaled = scaler.transform(_features(test_df))

    pca = PCA(n_components=n_components, random_state=42)
    train_components = pca.fit_transform(train_scaled)
    test_components = pca.transform(test_scaled)

    return _with_label(train_df, train_components), _with_label(test_df, test_components)


# Reduce both splits to 20 principal components, then rerun the same three
# XGBoost experiments on the projected features.
train_pca, test_pca = apply_pca(train_df=train_df, test_df=test_df, n_components=20)
train_and_test_with_XGBoost(train_df=train_pca, test_df=test_pca)


[INFO] Train shape: (65278, 20)
[INFO] Test shape: (21760, 20)


[INFO] Classes: 95
[Closed-World] Accuracy: 0.07619485294117648
              precision    recall  f1-score   support

           0       0.02      0.01      0.01       239
           1       0.03      0.03      0.03       229
           2       0.03      0.03      0.03       239
           3       0.03      0.01      0.01       228
           4       0.05      0.03      0.03       225
           5       0.03      0.01      0.02       242
           6       0.12      0.11      0.11       242
           7       0.05      0.09      0.06       211
           8       0.04      0.02      0.03       229
           9       0.06      0.03      0.03       240
          10       0.05      0.03      0.04       237
          11       0.12      0.17      0.14       246
          12       0.06      0.18      0.09       210
          13       0.03      0.01      0.01       237
          14       0.07      0.03      0.04       214
      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
