In [28]:

# Install LightGBM into the environment backing this kernel (%pip, unlike !pip, targets the kernel's environment).
# NOTE(review): the version is unpinned — consider pinning it so the notebook stays reproducible.
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import VotingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
import warnings

# NOTE(review): this silences every warning category for the whole session, including
# diagnostics emitted by the libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")  # suppress unneeded warning messages


In [4]:
# Input location and split settings, gathered in one place so they are easy to change.
# NOTE(review): DATA_PATH is an absolute, machine-specific path — point it at your
# local copy of the CSV before running.
DATA_PATH = 'D:/MLB_TEAM/1128/mon.csv'
LABEL_COLUMN = 'Label'
SPLIT_SEED = 42  # fixed seed so both splits below are reproducible

data = pd.read_csv(DATA_PATH)

# Separate the features from the target
X = data.drop(columns=[LABEL_COLUMN])  # feature matrix
y = data[LABEL_COLUMN]  # target labels

# Sanity check: a target with a single distinct value makes every downstream
# metric trivially perfect, so surface it right away instead of training on it.
label_class_count = y.nunique()
if label_class_count < 2:
    print(
        f"WARNING: column '{LABEL_COLUMN}' contains {label_class_count} distinct class(es); "
        "classification metrics computed on it will be meaningless."
    )

# Split into train / validation / test (70% / 15% / 15%), stratified on the label:
# first hold out 30%, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED, stratify=y_temp
)


In [5]:
# Summary statistics of the training features. Left as the cell's last expression so
# the notebook renders the full table instead of a wrapped, truncated text dump.
X_train.describe()

       Total Packets  Incoming Packets  Outgoing Packets  Incoming Ratio  \
count   13300.000000      13300.000000      13300.000000    13300.000000   
mean     4192.482556        348.749774       3843.732782        0.097923   
std      3206.916819        290.424075       2992.480388        0.046475   
min        50.000000          8.000000         25.000000        0.029521   
25%      1525.000000        138.000000       1372.000000        0.065099   
50%      3316.000000        283.000000       3020.000000        0.089881   
75%      6363.000000        485.250000       5752.250000        0.119666   
max      9993.000000       4241.000000       9698.000000        0.500000   

       Outgoing Ratio  Outgoing Std  Outgoing Mean  Packets per Second  \
count    13300.000000  13300.000000   13300.000000        13300.000000   
mean         0.902077   1292.260644    2059.643557          224.558112   
std          0.046475    969.241187    1620.257046          276.381072   
min          0.5000

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that same
# fitted scaler to every split so validation/test data never influence the fit.
# NOTE(review): none of the cells visible in this notebook consume the *_scaled
# arrays (the grid search is fit on the unscaled X_train) — confirm they are needed.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Hyper-parameter grid: 2 * 2 * 3 * 3 * 2 = 72 candidates, each evaluated with 5-fold CV.
param_grid = {
    'num_leaves': [70, 100],
    'max_depth': [20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [300, 500, 700],
    'min_child_samples': [1, 3],
    'min_split_gain': [0.0],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}

# NOTE(review): num_class is hard-coded to 95 — confirm it matches the number of
# distinct values in y_train.
lgbm = LGBMClassifier(
    objective='multiclass',
    num_class=95,
    random_state=42,
    force_col_wise=True,
    # LightGBM only performs row subsampling when the bagging frequency is non-zero;
    # with the default of 0 the `subsample` value in the grid is silently ignored.
    subsample_freq=1,
)

# Macro-averaged F1: every class contributes equally regardless of its support.
f1_scorer = make_scorer(f1_score, average='macro')

grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring=f1_scorer,  # same metric as the 'f1_macro' string, using the scorer defined above
    cv=5,
    n_jobs=-1,
    verbose=2
)

# Run the grid search on the (unscaled) training split
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:
# Report the winning hyper-parameter combination and its cross-validated score
# (the grid search above is scored with macro-F1).
for _caption, _result in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best F1 Score:", grid_search.best_score_),
):
    print(_caption, _result)

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 20, 'min_child_samples': 1, 'min_split_gain': 0.0, 'n_estimators': 300, 'num_leaves': 70, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'subsample': 0.8}
Best F1 Score: 1.0


In [None]:
# Only the last expression of a cell is rendered automatically, so the class
# distribution must be displayed explicitly or it is silently discarded.
display(y_train.value_counts())

# Summary statistics of the training features (rendered as the cell output).
X_train.describe()

Unnamed: 0,Total Packets,Incoming Packets,Outgoing Packets,Incoming Ratio,Outgoing Ratio,Outgoing Std,Outgoing Mean,Packets per Second,First 30 Incoming,First 30 Outgoing,Inter-arrival Mean,Inter-arrival Std,Concentration Mean,Concentration Std,Alternative Sum
count,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0,13300.0
mean,4193.291955,346.851579,3846.440376,0.097676,0.902324,1293.285735,2057.270757,223.787988,8.230301,21.769699,0.021294,0.248382,1.928044,2.526183,-1791789.0
std,3204.587299,286.055676,2991.996357,0.047077,0.047077,970.222041,1620.005928,273.610543,1.377136,1.377136,0.064663,0.490915,0.886392,0.795757,1429788.0
min,50.0,8.0,25.0,0.029521,0.492754,11.790077,17.25,0.804649,4.0,17.0,0.00047,0.005326,0.59,0.0,-4814336.0
25%,1534.0,138.0,1380.75,0.064559,0.880951,484.342246,670.225044,68.224497,7.0,21.0,0.004131,0.042525,1.287475,2.005607,-2672128.0
50%,3308.5,281.0,3012.0,0.089325,0.910675,1063.346356,1664.016184,125.372203,8.0,22.0,0.007979,0.116207,1.778226,2.496556,-1393408.0
75%,6379.25,484.25,5781.75,0.119049,0.935441,1938.770109,3284.042841,242.197188,9.0,23.0,0.014666,0.242424,2.363636,2.969239,-635904.0
max,9993.0,3674.0,9698.0,0.507246,0.970479,3747.259674,7664.499381,2127.136752,13.0,26.0,1.266226,7.641922,9.428571,8.553464,512.0


In [None]:
def _score_split(model, features, labels):
    """Predict on one data split and return (predictions, accuracy, macro-F1).

    Macro averaging is used so the F1 here matches the metric the grid search
    optimised; f1_score's default average='binary' raises on multiclass targets.
    """
    predictions = model.predict(features)
    accuracy = accuracy_score(labels, predictions)
    macro_f1 = f1_score(labels, predictions, average='macro')
    return predictions, accuracy, macro_f1


# Evaluate the best grid-search model on the validation split
best_model = grid_search.best_estimator_
y_val_pred, val_accuracy, val_f1 = _score_split(best_model, X_val, y_val)

print(f"\nValidation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# Evaluate on the held-out test split
y_test_pred, test_accuracy, test_f1 = _score_split(best_model, X_test, y_test)

print(f"\nTest Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# Per-class precision / recall / F1 on the test split
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 1.00
Validation F1 Score: 1.00

Test Accuracy: 1.00
Test F1 Score: 1.00

Classification Report on Test Data:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2850

    accuracy                           1.00      2850
   macro avg       1.00      1.00      1.00      2850
weighted avg       1.00      1.00      1.00      2850

