### 데이터 불러오기

In [None]:
import pandas as pd

# Single source of truth for the dataset location and the target column, so the
# train/test paths and the label name cannot drift apart when one is edited.
DATA_DIR = "/content/drive/MyDrive/Coursework/25-2 Machine Learning/Project/DataPreprocessing_final"
LABEL_COLUMN = "label"

train = pd.read_csv(f"{DATA_DIR}/closedworld_train.csv")
test = pd.read_csv(f"{DATA_DIR}/closedworld_test.csv")

# Separate the target column from the feature columns for each split.
y_train = train[LABEL_COLUMN]
X_train = train.drop(columns=[LABEL_COLUMN])

y_test = test[LABEL_COLUMN]
X_test = test.drop(columns=[LABEL_COLUMN])

# Fail early if the two CSVs do not expose the same features in the same order;
# otherwise the mismatch would only surface at prediction time.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (13300, 26)
y_train shape: (13300,)
X_test shape: (5700, 26)
y_test shape: (5700,)


In [None]:
# Hold out a stratified 20% validation set from the training data.
from sklearn.model_selection import train_test_split

# Split from the untouched `train` frame instead of from X_train/y_train:
# the cell reassigns X_train/y_train, so splitting them in place would carve
# another 20% off an already-reduced training set every time the cell is re-run.
# Reading from `train` yields the same split on the first run and on every re-run.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=['label']),
    train['label'],
    test_size=0.2,
    random_state=42,
    stratify=train['label'],
)

### xgboost 및 optuna 임포트

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
pip install optuna.integration

Collecting optuna.integration
  Downloading optuna_integration-4.6.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.6.0-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.1/99.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna.integration
Successfully installed optuna.integration-4.6.0


In [None]:
import os, numpy as np
import optuna
from sklearn.metrics import f1_score, classification_report, confusion_matrix, top_k_accuracy_score
from xgboost import XGBClassifier
from optuna.integration import XGBoostPruningCallback

### 하이퍼파라미터 튜닝

In [None]:
# Settings shared by the tuning and final-training cells.
RANDOM_STATE = 42
# NOTE(review): VALID_SIZE is not referenced in the visible cells; the
# validation split uses a literal 0.2 — confirm whether it is still needed.
VALID_SIZE = 0.2
# Number of distinct labels present in the training split.
NUM_CLASSES = np.unique(y_train).size

def objective(trial: optuna.Trial) -> float:
    """Train one XGBoost candidate and return its best validation mlogloss.

    Hyperparameters are sampled from ``trial``. The model is fit on the
    module-level ``X_train``/``y_train`` and monitored on ``X_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            per-round validation scores to the study's pruner.

    Returns:
        The best mlogloss reached on the validation split (lower is better).

    Raises:
        optuna.TrialPruned: Raised from the pruning callback when the study's
            pruner decides this trial should stop early.
    """
    # eval_set below is [(train), (val)], which XGBoost names validation_0 and
    # validation_1. Pruning must watch the held-out split, not the training loss.
    val_metric_key = "validation_1-mlogloss"

    params = {
        "objective": "multi:softprob",
        "num_class": NUM_CLASSES,
        "eval_metric": "mlogloss",
        # NOTE(review): a 100-round cap may stop low-learning-rate candidates
        # before they converge and bias the search toward large learning
        # rates — consider raising it.
        "n_estimators": 100,
        "learning_rate": trial.suggest_float("learning_rate", 0.02, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "random_state": RANDOM_STATE,
        "n_jobs": os.cpu_count(),
        "early_stopping_rounds": 50,
        # The callback was previously created but never handed to the model,
        # so no trial was ever pruned. It is passed through the constructor.
        "callbacks": [XGBoostPruningCallback(trial, val_metric_key)],
    }

    model = XGBClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        # Per-round logging across 200 trials floods the notebook output.
        verbose=False,
    )
    # Early stopping tracks the last eval_set entry, i.e. the validation split.
    return model.best_score

# TPE is Optuna's default sampler; it is constructed explicitly only to fix its
# seed so the sequence of sampled hyperparameters is repeatable across runs.
study = optuna.create_study(
    direction="minimize",
    study_name="xgb_multiclass_mlogloss",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=200, show_progress_bar=True)

[I 2025-11-18 13:38:45,104] A new study created in memory with name: xgb_multiclass_mlogloss


  0%|          | 0/200 [00:00<?, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[50]	validation_0-mlogloss:0.41608	validation_1-mlogloss:1.09083
[51]	validation_0-mlogloss:0.40768	validation_1-mlogloss:1.08591
[52]	validation_0-mlogloss:0.39969	validation_1-mlogloss:1.08193
[53]	validation_0-mlogloss:0.39259	validation_1-mlogloss:1.07736
[54]	validation_0-mlogloss:0.38538	validation_1-mlogloss:1.07451
[55]	validation_0-mlogloss:0.37835	validation_1-mlogloss:1.07082
[56]	validation_0-mlogloss:0.37165	validation_1-mlogloss:1.06765
[57]	validation_0-mlogloss:0.36515	validation_1-mlogloss:1.06367
[58]	validation_0-mlogloss:0.35918	validation_1-mlogloss:1.06063
[59]	validation_0-mlogloss:0.35288	validation_1-mlogloss:1.05730
[60]	validation_0-mlogloss:0.34705	validation_1-mlogloss:1.05410
[61]	validation_0-mlogloss:0.34123	validation_1-mlogloss:1.05021
[62]	validation_0-mlogloss:0.33565	validation_1-mlogloss:1.04754
[63]	validation_0-mlogloss:0.33069	validation_1-mlogloss:1.04478
[64]	validation_0-mlogloss:0.32551	valid

In [None]:
# Summarize the winning trial: its index, its objective value, and its sampled hyperparameters.
for _caption, _value in (
    ("Best trial:", study.best_trial.number),
    ("Best mlogloss:", study.best_value),
    ("Best params:", study.best_params),
):
    print(_caption, _value)

Best trial: 134
Best mlogloss: 0.9760807929088317
Best params: {'learning_rate': 0.19721925924769965, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.7577072390567685, 'colsample_bytree': 0.6494394276320198, 'gamma': 0.06948732401595321, 'reg_lambda': 3.2137571796403774, 'reg_alpha': 0.446034681615163}


### 최종 학습

In [None]:
# Patience for the final fit. The previous value of 5000 forced thousands of
# extra boosting rounds after the validation loss had already bottomed out.
FINAL_EARLY_STOPPING_ROUNDS = 200

# Fixed settings first; the tuned hyperparameters from the study are unpacked
# last so they take precedence on any overlapping key.
best_params = {
    "objective": "multi:softprob",
    "num_class": NUM_CLASSES,
    "eval_metric": "mlogloss",
    "n_estimators": 10000,  # upper bound only; early stopping decides the actual count
    "random_state": RANDOM_STATE,
    "n_jobs": os.cpu_count(),
    "early_stopping_rounds": FINAL_EARLY_STOPPING_ROUNDS,
    **study.best_params
}

final_model = XGBClassifier(**best_params)
final_model.fit(
    X_train, y_train,
    # Validation split is listed last so early stopping monitors it.
    eval_set=[(X_train, y_train), (X_val, y_val)],
    # Log every 100th round instead of every round to keep the output readable.
    verbose=100,
)

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[283]	validation_0-mlogloss:0.10814	validation_1-mlogloss:0.94857
[284]	validation_0-mlogloss:0.10803	validation_1-mlogloss:0.94851
[285]	validation_0-mlogloss:0.10792	validation_1-mlogloss:0.94856
[286]	validation_0-mlogloss:0.10785	validation_1-mlogloss:0.94861
[287]	validation_0-mlogloss:0.10777	validation_1-mlogloss:0.94898
[288]	validation_0-mlogloss:0.10768	validation_1-mlogloss:0.94889
[289]	validation_0-mlogloss:0.10758	validation_1-mlogloss:0.94898
[290]	validation_0-mlogloss:0.10747	validation_1-mlogloss:0.94905
[291]	validation_0-mlogloss:0.10738	validation_1-mlogloss:0.94927
[292]	validation_0-mlogloss:0.10729	validation_1-mlogloss:0.94921
[293]	validation_0-mlogloss:0.10719	validation_1-mlogloss:0.94935
[294]	validation_0-mlogloss:0.10711	validation_1-mlogloss:0.94929
[295]	validation_0-mlogloss:0.10698	validation_1-mlogloss:0.94944
[296]	validation_0-mlogloss:0.10692	validation_1-mlogloss:0.94935
[297]	validation_0-mloglos

### 예측

In [None]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report
)

In [None]:
# Per-class probabilities and hard class predictions for the test set.
y_prob = final_model.predict_proba(X_test)
y_pred = final_model.predict(X_test)

# Label-based metrics computed from the hard predictions.
accuracy = accuracy_score(y_test, y_pred)
f1_macro, f1_micro, f1_weighted = (
    f1_score(y_test, y_pred, average=averaging)
    for averaging in ('macro', 'micro', 'weighted')
)

# Probability-based metrics, macro-averaged over classes.
roc_auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
pr_auc = average_precision_score(y_test, y_prob, average='macro')

# Per-class precision / recall / F1 table.
report = classification_report(y_test, y_pred)

for _caption, _score in (
    ("Accuracy:", accuracy),
    ("F1-macro:", f1_macro),
    ("F1-micro:", f1_micro),
    ("F1-weighted:", f1_weighted),
    ("ROC-AUC:", roc_auc),
    ("PR-AUC:", pr_auc),
):
    print(_caption, _score)
print(report)

Accuracy: 0.7501754385964913
F1-macro: 0.7482437402341408
F1-micro: 0.7501754385964913
F1-weighted: 0.7482437402341408
ROC-AUC: 0.9889474306333209
PR-AUC: 0.8207142307147461
              precision    recall  f1-score   support

           0       0.72      0.65      0.68        60
           1       0.86      0.80      0.83        60
           2       0.89      0.82      0.85        60
           3       0.82      0.85      0.84        60
           4       0.73      0.68      0.71        60
           5       0.72      0.63      0.67        60
           6       0.76      0.88      0.82        60
           7       0.79      0.80      0.79        60
           8       0.80      0.60      0.69        60
           9       0.64      0.63      0.64        60
          10       0.81      0.63      0.71        60
          11       0.83      0.72      0.77        60
          12       0.90      0.88      0.89        60
          13       0.46      0.43      0.45        60
          14   