In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
import matplotlib.pyplot as plt

# 데이터 로드
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/monunmon.csv'
data = pd.read_csv(file_path)

# 데이터 확인
print("데이터 샘플:")
print(data.head())

# 특징(X)과 레이블(y) 분리
X = data.iloc[:, :-1]  # 특징 (feature)들
y = data.iloc[:, -1]   # 레이블 (0~94)

Mounted at /content/drive
데이터 샘플:
   Total Packets  Incoming Packets  Outgoing Packets  Incoming Ratio  \
0         1421.0             121.0            1300.0        0.085151   
1          518.0              80.0             438.0        0.154440   
2         1358.0             118.0            1240.0        0.086892   
3         1446.0             122.0            1324.0        0.084371   
4         1406.0             115.0            1291.0        0.081792   

   Outgoing Ratio  Outgoing Std  Outgoing Mean  Packets per Second  \
0        0.914849    515.483953     773.322314          140.138067   
1        0.845560    139.231951     226.162500           50.984252   
2        0.913108    472.735508     786.110169          122.232223   
3        0.915629    513.916038     820.139344          108.233533   
4        0.918208    503.993490     789.608696          132.142857   

   First 30 Incoming  First 30 Outgoing  Inter-arrival Mean  \
0                9.0               21.0          

In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 레이블을 1과 0로 변환(monitored label:1/ unmonitored label:0)
data['Label'] = data['Label'].apply(lambda x: 1 if x != -1 else 0)

# feature와 label 분리
X = data.drop('Label', axis=1)  # 'Label'을 제외한 모든 열이 feature
y = data['Label']  # 'Label' 열이 target

# 학습용 데이터와 테스트 데이터로 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost 데이터 형식으로 변환
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


**튜닝을 하지 않은 기본 XG Boost model**



In [3]:
# XGBoost 모델 파라미터 설정
params = {
    'objective': 'binary:logistic', # 이진 분류 (0과 1)
    'max_depth': 6,
    'eta': 0.1,
    'eval_metric': 'logloss',
    'random_state': 42
}

# 모델 학습
num_boost_round = 2000
evals = [(dtest, 'eval'), (dtrain, 'train')]
model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=evals,
                  early_stopping_rounds=50, verbose_eval=200)

# 예측
y_pred_prob = model.predict(dtest)
y_pred = [1 if prob >= 0.5 else 0 for prob in y_pred_prob]

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

[0]	eval-logloss:0.38453	train-logloss:0.38871
[200]	eval-logloss:0.23702	train-logloss:0.15540
[400]	eval-logloss:0.23100	train-logloss:0.10452
[600]	eval-logloss:0.23311	train-logloss:0.07329
[800]	eval-logloss:0.23696	train-logloss:0.05375
[1000]	eval-logloss:0.24262	train-logloss:0.04107
[1200]	eval-logloss:0.25057	train-logloss:0.03134
[1400]	eval-logloss:0.25684	train-logloss:0.02504
[1600]	eval-logloss:0.26360	train-logloss:0.02029
[1800]	eval-logloss:0.26989	train-logloss:0.01700
[1999]	eval-logloss:0.27534	train-logloss:0.01453
Accuracy: 0.9155


**튜닝을 진행한 XG Boost model**

---



In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

# XGBoost 모델 초기화
xgb_model = xgb.XGBClassifier(random_state=42)

# 튜닝할 하이퍼파라미터 (범위를 좁힘)
param_grid = {
    'n_estimators': [100, 200],  # 부스팅 단계 수를 적게 설정
    'learning_rate': [0.01, 0.1],  # 너무 큰 learning rate를 피함
    'max_depth': [3, 5],  # 너무 깊은 트리는 피하기
    'subsample': [0.8],  # 데이터 샘플링 비율을 0.8로 고정
    'colsample_bytree': [0.8],  # 트리별 특성 샘플링 비율을 0.8로 고정
    'gamma': [0, 0.1]  # 과도한 정규화를 피하기 위해 낮은 값
}

# GridSearchCV를 사용하여 하이퍼파라미터 튜닝 (KFold 사용)
kf = KFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=kf, verbose=2, n_jobs=-1, scoring='accuracy', error_score='raise')

# 모델 학습
grid_search.fit(X_train, y_train)

# 최적 하이퍼파라미터 출력
print(f"Best Parameters: {grid_search.best_params_}")

# 최적 모델로 예측
y_pred = grid_search.best_estimator_.predict(X_test)

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy (Tuned Model): {accuracy:.4f}')

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Accuracy (Tuned Model): 0.9086
