In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix,
    roc_auc_score, average_precision_score
)

### 데이터 불러오기

In [None]:
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/Coursework/25-2 Machine Learning/Project/DataPreprocessing_final/openworld_train.csv")
test = pd.read_csv("/content/drive/MyDrive/Coursework/25-2 Machine Learning/Project/DataPreprocessing_final/openworld_test.csv")

y_train = train['label']
X_train = train.drop(columns=['label'])

y_test = test['label']
X_test = test.drop(columns=['label'])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (20300, 26)
y_train shape: (20300,)
X_test shape: (8700, 26)
y_test shape: (8700,)


### xgboost 임포트

In [None]:
pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import os, numpy as np, optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, callback
from sklearn.preprocessing import label_binarize

### 하이퍼파라미터 튜닝 (개별)

In [None]:
NEG_LABEL = 95
RANDOM_STATE = 42
TEST_SIZE = 0.2  # validation 비율
EARLY_STOP = 100

X_tr, X_val, y_tr_full, y_val_full = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, stratify=y_train, random_state=RANDOM_STATE
)

# 1단계 이진 라벨 (양성=1: NEG_LABEL이 아닌 모든 클래스)
y_tr_bin = (y_tr_full != NEG_LABEL).astype(int)
y_val_bin = (y_val_full != NEG_LABEL).astype(int)

### 이진 분류 모델 튜닝

In [None]:
def objective(trial):
            params = {
                      "objective": "binary:logistic",
                      "tree_method": "hist",
                      "n_estimators": 100,
                      "learning_rate": trial.suggest_float("bin_lr", 0.01, 0.2, log=True),
                      "max_depth": trial.suggest_int("bin_max_depth", 3, 10),
                      "min_child_weight": trial.suggest_int("bin_min_child_weight", 1, 10),
                      "subsample": trial.suggest_float("bin_subsample", 0.6, 1.0),
                      "colsample_bytree": trial.suggest_float("bin_colsample_bytree", 0.6, 1.0),
                      "gamma": trial.suggest_float("bin_gamma", 0.0, 5.0),
                      "reg_lambda": trial.suggest_float("bin_reg_lambda", 0.0, 10.0),
                      "reg_alpha": trial.suggest_float("bin_reg_alpha", 0.0, 5.0),
                      "eval_metric": "logloss",
                      "random_state": RANDOM_STATE,
                      "n_jobs": os.cpu_count(),
                      "early_stopping_rounds": 50
            }

            model = XGBClassifier(**params)


            model.fit(X_tr, y_tr_bin, eval_set=[(X_tr, y_tr_bin), (X_val, y_val_bin)],verbose=1)
            best_logloss = model.best_score

            return best_logloss


study = optuna.create_study(direction='minimize', study_name='binary_classification')
study.optimize(objective, n_trials=200, show_progress_bar=True)

[I 2025-11-21 05:39:38,493] A new study created in memory with name: binary_classification


  0%|          | 0/200 [00:00<?, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[51]	validation_0-logloss:0.14931	validation_1-logloss:0.26017
[52]	validation_0-logloss:0.14793	validation_1-logloss:0.25946
[53]	validation_0-logloss:0.14505	validation_1-logloss:0.25830
[54]	validation_0-logloss:0.14251	validation_1-logloss:0.25852
[55]	validation_0-logloss:0.14053	validation_1-logloss:0.25751
[56]	validation_0-logloss:0.13958	validation_1-logloss:0.25718
[57]	validation_0-logloss:0.13712	validation_1-logloss:0.25642
[58]	validation_0-logloss:0.13619	validation_1-logloss:0.25601
[59]	validation_0-logloss:0.13475	validation_1-logloss:0.25543
[60]	validation_0-logloss:0.13298	validation_1-logloss:0.25491
[61]	validation_0-logloss:0.13063	validation_1-logloss:0.25369
[62]	validation_0-logloss:0.12832	validation_1-logloss:0.25396
[63]	validation_0-logloss:0.12586	validation_1-logloss:0.25356
[64]	validation_0-logloss:0.12415	validation_1-logloss:0.25317
[65]	validation_0-logloss:0.12263	validation_1-logloss:0.25276
[66]	

In [None]:
bin_best_value = study.best_value
bin_best_params = study.best_params

print("Best score:",bin_best_value)
print("Best params:", bin_best_params)

Best score: 0.2390114263312713
Best params: {'bin_lr': 0.1890408241426905, 'bin_max_depth': 10, 'bin_min_child_weight': 1, 'bin_subsample': 0.8887320375513359, 'bin_colsample_bytree': 0.9912853863634461, 'bin_gamma': 0.014498503088113818, 'bin_reg_lambda': 2.7498279931612504, 'bin_reg_alpha': 0.0985171472007404}


### 다중 분류 모델 튜닝

In [None]:
def objective(trial):
            params = {
                      "objective": "multi:softprob",
                      "tree_method": "hist",
                      "n_estimators": 100,
                      "learning_rate": trial.suggest_float("mul_lr", 0.01, 0.2, log=True),
                      "max_depth": trial.suggest_int("mul_max_depth", 3, 10),
                      "min_child_weight": trial.suggest_int("mul_min_child_weight", 1, 10),
                      "subsample": trial.suggest_float("mul_subsample", 0.6, 1.0),
                      "colsample_bytree": trial.suggest_float("mul_colsample_bytree", 0.6, 1.0),
                      "gamma": trial.suggest_float("mul_gamma", 0.0, 5.0),
                      "reg_lambda": trial.suggest_float("mul_reg_lambda", 0.0, 10.0),
                      "reg_alpha": trial.suggest_float("mul_reg_alpha", 0.0, 5.0),
                      "eval_metric": "mlogloss",
                      "random_state": RANDOM_STATE,
                      "n_jobs": os.cpu_count(),
                      "early_stopping_rounds": 50
            }

            tr_pos_mask = (y_tr_full != NEG_LABEL)
            pos_le = LabelEncoder().fit(y_tr_full[tr_pos_mask])
            y_tr_pos_enc = pos_le.transform(y_tr_full[tr_pos_mask])
            num_pos_classes = len(pos_le.classes_)

            model = XGBClassifier(num_class=num_pos_classes, **params)

            model.fit(
                X_tr[tr_pos_mask], y_tr_pos_enc,
                eval_set=[(X_tr[tr_pos_mask], y_tr_pos_enc)],
                verbose = 1
                )
            best_logloss = model.best_score

            return best_logloss

study = optuna.create_study(direction='minimize', study_name='multiclass_classification')
study.optimize(objective, n_trials=200, show_progress_bar=True)

[I 2025-11-21 05:48:42,186] A new study created in memory with name: multiclass_classification


  0%|          | 0/200 [00:00<?, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[50]	validation_0-mlogloss:0.42458
[51]	validation_0-mlogloss:0.41594
[52]	validation_0-mlogloss:0.40763
[53]	validation_0-mlogloss:0.39982
[54]	validation_0-mlogloss:0.39235
[55]	validation_0-mlogloss:0.38518
[56]	validation_0-mlogloss:0.37814
[57]	validation_0-mlogloss:0.37137
[58]	validation_0-mlogloss:0.36476
[59]	validation_0-mlogloss:0.35849
[60]	validation_0-mlogloss:0.35235
[61]	validation_0-mlogloss:0.34652
[62]	validation_0-mlogloss:0.34099
[63]	validation_0-mlogloss:0.33543
[64]	validation_0-mlogloss:0.33020
[65]	validation_0-mlogloss:0.32515
[66]	validation_0-mlogloss:0.32011
[67]	validation_0-mlogloss:0.31514
[68]	validation_0-mlogloss:0.31039
[69]	validation_0-mlogloss:0.30586
[70]	validation_0-mlogloss:0.30159
[71]	validation_0-mlogloss:0.29738
[72]	validation_0-mlogloss:0.29320
[73]	validation_0-mlogloss:0.28910
[74]	validation_0-mlogloss:0.28507
[75]	validation_0-mlogloss:0.28121
[76]	validation_0-mlogloss:0.27756
[77]	

In [None]:
mul_best_value = study.best_value
mul_best_params = study.best_params

print("Best score:",mul_best_value)
print("Best params:", mul_best_params)

Best score: 0.10242243201118707
Best params: {'mul_lr': 0.16862524911487684, 'mul_max_depth': 9, 'mul_min_child_weight': 4, 'mul_subsample': 0.8754925874208443, 'mul_colsample_bytree': 0.9905531435470902, 'mul_gamma': 0.004617282882739576, 'mul_reg_lambda': 1.3105248980670992, 'mul_reg_alpha': 0.0030352556227954}


### 최종 재학습

In [None]:
bin_best = {'bin_lr': 0.1890408241426905, 'bin_max_depth': 10, 'bin_min_child_weight': 1, 'bin_subsample': 0.8887320375513359, 'bin_colsample_bytree': 0.9912853863634461, 'bin_gamma': 0.014498503088113818, 'bin_reg_lambda': 2.7498279931612504, 'bin_reg_alpha': 0.0985171472007404}
mul_best = {'mul_lr': 0.16862524911487684, 'mul_max_depth': 9, 'mul_min_child_weight': 4, 'mul_subsample': 0.8754925874208443, 'mul_colsample_bytree': 0.9905531435470902, 'mul_gamma': 0.004617282882739576, 'mul_reg_lambda': 1.3105248980670992, 'mul_reg_alpha': 0.0030352556227954}

bin_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 1000,
    "tree_method": "hist",
    "random_state": RANDOM_STATE,
    "n_jobs": os.cpu_count(),
    "early_stopping_rounds": 500,
    **bin_best
}

mul_params = {
    "objective": "multi:softprob",
    "num_class": 95,
    "eval_metric": "mlogloss",
    "n_estimators": 1000,
    "random_state": RANDOM_STATE,
    "n_jobs": os.cpu_count(),
    "tree_method": "hist",
    "early_stopping_rounds": 500,
    **mul_best
}


# 이진 분류 모델 최종 재학습
final_model_bin = XGBClassifier(**bin_params)
final_model_bin.fit(
    X_tr, y_tr_bin,
    eval_set=[(X_tr, y_tr_bin), (X_val, y_val_bin)],
    verbose=1
)

[0]	validation_0-logloss:0.54317	validation_1-logloss:0.54307
[1]	validation_0-logloss:0.48503	validation_1-logloss:0.48523
[2]	validation_0-logloss:0.43779	validation_1-logloss:0.44265
[3]	validation_0-logloss:0.40984	validation_1-logloss:0.41812
[4]	validation_0-logloss:0.38853	validation_1-logloss:0.39914
[5]	validation_0-logloss:0.37340	validation_1-logloss:0.38560
[6]	validation_0-logloss:0.36062	validation_1-logloss:0.37594
[7]	validation_0-logloss:0.35196	validation_1-logloss:0.36828
[8]	validation_0-logloss:0.34033	validation_1-logloss:0.35944
[9]	validation_0-logloss:0.32906	validation_1-logloss:0.35054
[10]	validation_0-logloss:0.32408	validation_1-logloss:0.34676
[11]	validation_0-logloss:0.31515	validation_1-logloss:0.33904
[12]	validation_0-logloss:0.30938	validation_1-logloss:0.33470
[13]	validation_0-logloss:0.30428	validation_1-logloss:0.33064
[14]	validation_0-logloss:0.29590	validation_1-logloss:0.32673
[15]	validation_0-logloss:0.29289	validation_1-logloss:0.32354
[1

Parameters: { "bin_colsample_bytree", "bin_gamma", "bin_lr", "bin_max_depth", "bin_min_child_weight", "bin_reg_alpha", "bin_reg_lambda", "bin_subsample" } are not used.

  self.starting_round = model.num_boosted_rounds()


[22]	validation_0-logloss:0.25769	validation_1-logloss:0.30313
[23]	validation_0-logloss:0.25637	validation_1-logloss:0.30202
[24]	validation_0-logloss:0.25038	validation_1-logloss:0.29844
[25]	validation_0-logloss:0.24926	validation_1-logloss:0.29757
[26]	validation_0-logloss:0.24677	validation_1-logloss:0.29672
[27]	validation_0-logloss:0.24500	validation_1-logloss:0.29561
[28]	validation_0-logloss:0.24268	validation_1-logloss:0.29473
[29]	validation_0-logloss:0.24071	validation_1-logloss:0.29338
[30]	validation_0-logloss:0.23817	validation_1-logloss:0.29180
[31]	validation_0-logloss:0.23535	validation_1-logloss:0.29076
[32]	validation_0-logloss:0.23091	validation_1-logloss:0.28793
[33]	validation_0-logloss:0.22654	validation_1-logloss:0.28567
[34]	validation_0-logloss:0.22287	validation_1-logloss:0.28366
[35]	validation_0-logloss:0.22101	validation_1-logloss:0.28235
[36]	validation_0-logloss:0.21943	validation_1-logloss:0.28218
[37]	validation_0-logloss:0.21817	validation_1-logloss:

In [None]:
tr_pos_mask = (y_tr_full != NEG_LABEL)

# 폴드(여기선 트레인) 양성 라벨을 0..C-1로 재인코딩
pos_le = LabelEncoder().fit(y_tr_full[tr_pos_mask])
y_tr_pos_enc = pos_le.transform(y_tr_full[tr_pos_mask])
num_pos_classes = len(pos_le.classes_)

final_model_mul = XGBClassifier(**mul_params)
final_model_mul.fit(
        X_tr[tr_pos_mask], y_tr_pos_enc,
        eval_set=[(X_tr[tr_pos_mask], y_tr_pos_enc)],
        verbose = 1
    )

Parameters: { "mul_colsample_bytree", "mul_gamma", "mul_lr", "mul_max_depth", "mul_min_child_weight", "mul_reg_alpha", "mul_reg_lambda", "mul_subsample" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:2.33674
[1]	validation_0-mlogloss:1.81328
[2]	validation_0-mlogloss:1.50024
[3]	validation_0-mlogloss:1.27862
[4]	validation_0-mlogloss:1.10136
[5]	validation_0-mlogloss:0.96620
[6]	validation_0-mlogloss:0.85312
[7]	validation_0-mlogloss:0.75600
[8]	validation_0-mlogloss:0.67370
[9]	validation_0-mlogloss:0.60151
[10]	validation_0-mlogloss:0.54131
[11]	validation_0-mlogloss:0.48787
[12]	validation_0-mlogloss:0.44007
[13]	validation_0-mlogloss:0.39961
[14]	validation_0-mlogloss:0.36375
[15]	validation_0-mlogloss:0.33140
[16]	validation_0-mlogloss:0.30018
[17]	validation_0-mlogloss:0.27343
[18]	validation_0-mlogloss:0.24915
[19]	validation_0-mlogloss:0.22846
[20]	validation_0-mlogloss:0.20905
[21]	validation_0-mlogloss:0.19139
[22]	validation_0-mlogloss:0.17691
[23]	validation_0-mlogloss:0.16257
[24]	validation_0-mlogloss:0.15006
[25]	validation_0-mlogloss:0.13837
[26]	validation_0-mlogloss:0.12840
[27]	validation_0-mlogloss:0.11881
[28]	validation_0-mlogloss:0.1

### 예측

In [None]:
def evaluate_cascade_top1(
    bst_bin, bst_mul, pos_le, tau, X_test, y_test_full, neg_label=95
):
    # ----- 공통 -----
    # dtest_all = xgb.DMatrix(X_test) # Remove this line as XGBClassifier.predict can directly take X_test (DataFrame)
    y_true_bin = (y_test_full != neg_label).astype(int)
    n_samples = len(y_test_full)

    # =========================
    # Stage 1 (Binary) 평가
    # =========================
    p_pos = bst_bin.predict(X_test, iteration_range=(0, bst_bin.best_iteration + 1))  # P(양성)
    pass_mask = (p_pos >= tau)  # 라우팅 기준 = 평가 기준(일관성)

    y_pred_bin_tau = pass_mask.astype(int)  # tau 기준 0/1 예측
    # Top-1 accuracy = 일반 accuracy
    stage1_acc_top1 = accuracy_score(y_true_bin, y_pred_bin_tau)
    stage1_f1_micro = f1_score(y_true_bin, y_pred_bin_tau, average="micro")
    stage1_f1_macro = f1_score(y_true_bin, y_pred_bin_tau, average="macro")
    stage1_f1_weighted = f1_score(y_true_bin, y_pred_bin_tau, average="weighted")

    # 참고용 확률기반 지표(원하면 남기고, 필요 없으면 제거 가능)
    stage1_auc  = roc_auc_score(y_true_bin, p_pos) if len(np.unique(y_true_bin)) == 2 else np.nan
    stage1_aupr = average_precision_score(y_true_bin, p_pos) if y_true_bin.sum() > 0 else np.nan

    coverage = pass_mask.mean()
    true_pos_mask = (y_true_bin == 1)
    miss_stage1 = float((~pass_mask & true_pos_mask).sum()) / true_pos_mask.sum() if true_pos_mask.sum() > 0 else 0.0
    rec_stage1 = 1.0 - miss_stage1

    # =========================
    # Stage 2 (Multiclass, 조건부)
    # =========================
    # 최종 엔드투엔드 예측(보고용)도 함께 계산
    y_pred_full = np.full_like(y_test_full, fill_value=neg_label)

    if pass_mask.any():
        # dtest_pass = xgb.DMatrix(X_test[pass_mask]) # Remove this line
        prob_pos = bst_mul.predict_proba(X_test[pass_mask], iteration_range=(0, bst_mul.best_iteration + 1))  # (n_pass, C)
        pred_pos_enc = prob_pos.argmax(axis=1)  # Top-1
        pred_pos_labels = pos_le.inverse_transform(pred_pos_enc)  # 원래 양성 라벨
        y_pred_full[pass_mask] = pred_pos_labels

        # Stage 2 조건부 Top-1 accuracy (양성으로 라우팅된 샘플만)
        y_true_pass = y_test_full[pass_mask]                  # 원래 라벨(95 제외)
        stage2_acc_top1_cond = accuracy_score(y_true_pass, pred_pos_labels)
    else:
        stage2_acc_top1_cond = np.nan

    # =========================
    # End-to-end (최종)
    # =========================
    end2end_acc_top1 = accuracy_score(y_test_full, y_pred_full)
    end2end_f1_micro = f1_score(y_test_full, y_pred_full, average="micro")
    end2end_f1_macro = f1_score(y_test_full, y_pred_full, average="macro")
    end2end_f1_weighted = f1_score(y_test_full, y_pred_full, average="weighted")

    # =========================
    # End-to-end (Prob-based: multi-class ROC/PR-AUC)
    # =========================
    # 전체 X_test에 대해 2단계 확률 계산 (조건부 양성 클래스 분포)
    prob_pos_all = bst_mul.predict_proba(
        X_test,
        iteration_range=(0, bst_mul.best_iteration + 1)
    )  # (n_samples, C_pos)

    # 전체 클래스 세트 (0..max or np.unique 사용)
    all_classes = np.unique(y_test_full)
    num_classes = len(all_classes)

    # label -> column index 매핑
    # neg_label도 포함된 전체 클래스에서 인덱스 찾기
    class_to_idx = {c: i for i, c in enumerate(all_classes)}

    proba_full = np.zeros((n_samples, num_classes), dtype=float)

    # 음수 클래스 확률: 1 - p_pos
    neg_col = class_to_idx[neg_label]
    proba_full[:, neg_col] = 1.0 - p_pos

    # 양성 클래스들: p_pos * P(class | positive, x)
    pos_labels = pos_le.classes_  # 원래 양성 라벨들 (neg_label 제외)
    for j, lbl in enumerate(pos_labels):
        col = class_to_idx[lbl]
        proba_full[:, col] = p_pos * prob_pos_all[:, j]

    # --- multi-class ROC-AUC (OvR) ---
    try:
        end2end_roc_auc_ovr_macro = roc_auc_score(
            y_test_full,
            proba_full,
            multi_class="ovr",
            average="macro",
        )
    except ValueError:
        # 클래스가 하나만 있거나 하면 에러날 수 있으니 방어적으로 처리
        end2end_roc_auc_ovr_macro = np.nan
        end2end_roc_auc_ovr_weighted = np.nan

    # --- multi-class PR-AUC (Average Precision) ---
    # average_precision_score는 multi-label indicator를 입력으로 받으므로 binarize 필요
    y_test_bin = label_binarize(y_test_full, classes=all_classes)
    try:
        end2end_pr_auc_macro = average_precision_score(
            y_test_bin,
            proba_full,
            average="macro",
        )
    except ValueError:
        end2end_pr_auc_macro = np.nan
        end2end_pr_auc_weighted = np.nan

    return {
        # ----- Stage 1 (tau 기준) -----
        "Binary_acc_top1": stage1_acc_top1,           # 요청: top-1
        "Binary_f1_micro": stage1_f1_micro,           # 요청: f1-micro
        "Binary_f1_macro": stage1_f1_macro,
        "Binary_f1_weighted": stage1_f1_weighted,
        "Binary_auc": stage1_auc,                     # 참고
        "Binary_aupr": stage1_aupr,                   # 참고
        # "coverage_to_stage2": coverage,
        # "stage1_miss_rate": miss_stage1,
        # "stage1_recall": rec_stage1,

        # ----- Stage 2 (조건부, 양성만) -----
        "Multiclass_acc_top1_conditional": stage2_acc_top1_cond,  # 요청: top-1

        # ----- End-to-end (최종) -----
        "end2end_acc_top1": end2end_acc_top1,
        "end2end_f1_micro": end2end_f1_micro,
        "end2end_f1_macro": end2end_f1_macro,
        "end2end_f1_weighted": end2end_f1_weighted,

        # ----- End-to-end (prob-based) -----
        "end2end_roc_auc_ovr_macro": end2end_roc_auc_ovr_macro,
        "end2end_pr_auc_macro": end2end_pr_auc_macro,

    }

In [None]:
evaluate_cascade_top1(final_model_bin, final_model_mul,pos_le, 0.5, X_test, y_test, neg_label=95)

{'Binary_acc_top1': 0.8967816091954023,
 'Binary_f1_micro': 0.8967816091954023,
 'Binary_f1_macro': 0.8832899244633676,
 'Binary_f1_weighted': 0.8956048496751641,
 'Binary_auc': np.float64(0.8755964912280703),
 'Binary_aupr': np.float64(0.889075149356648),
 'Multiclass_acc_top1_conditional': 0.6886539107082914,
 'end2end_acc_top1': 0.75,
 'end2end_f1_micro': 0.75,
 'end2end_f1_macro': 0.7040741741190225,
 'end2end_f1_weighted': 0.7512271527978682,
 'end2end_roc_auc_ovr_macro': np.float64(0.9440392156365064),
 'end2end_pr_auc_macro': np.float64(0.7528892637406344)}