# Optuna 설치

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


# 라이브러리 import

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report,
    roc_auc_score, average_precision_score
)
import optuna, os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# 데이터 로드

In [None]:
# Read the pre-split open-world train / test tables.
train = pd.read_csv("openworld_train.csv")
test = pd.read_csv("openworld_test.csv")

# Open-world binary target: rows labelled NEG_LABEL (the "unknown" class)
# become 0, every other label becomes 1.
NEG_LABEL = 95
y_train = train['label'].ne(NEG_LABEL).astype(int)
y_test = test['label'].ne(NEG_LABEL).astype(int)

# Everything except the label column is used as a feature.
X_train = train.drop(columns='label')
X_test = test.drop(columns='label')

for _tag, _frame in (("X_train:", X_train), ("X_test :", X_test)):
    print(_tag, _frame.shape)


X_train: (20300, 26)
X_test : (8700, 26)


# Validation split

In [None]:
# Seed reused by the split below and by the SVC models later in the notebook.
RANDOM_STATE = 42

# Stratified 80/20 hold-out: the validation part keeps the same 0/1 ratio
# as the full training labels.
_split_options = dict(test_size=0.2, random_state=RANDOM_STATE, stratify=y_train)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, **_split_options)

print("Train:", X_tr.shape, "Val:", X_val.shape)

Train: (16240, 26) Val: (4060, 26)


# Optuna objective 정의

In [None]:
def objective_binary(trial):
    """Optuna objective: fit an RBF-kernel SVC and score it on the validation split.

    Args:
        trial: Optuna trial used to sample ``C`` in [1e-3, 1e3] and ``gamma``
            in [1e-4, 1e1], both on a log scale.

    Returns:
        ``1 - macro-F1`` of the predictions on ``(X_val, y_val)``, i.e. a loss
        value (lower is better).
    """
    # Sampling order (C, then gamma) matches the parameter names Optuna records.
    c_value = trial.suggest_float("C", 1e-3, 1e3, log=True)
    gamma_value = trial.suggest_float("gamma", 1e-4, 1e1, log=True)

    # Probability estimates are switched off here; only hard predictions are scored.
    classifier = SVC(
        kernel="rbf",
        C=c_value,
        gamma=gamma_value,
        probability=False,
        random_state=RANDOM_STATE,
    )
    classifier.fit(X_tr, y_tr)

    macro_f1 = f1_score(y_val, classifier.predict(X_val), average='macro')
    return 1 - macro_f1

# optuna 실행

In [None]:
# Seed the sampler explicitly: without a seed the search proposes a different
# (C, gamma) sequence on every run, so the best parameters and every number
# derived from them would not be reproducible under Restart & Run All.
study_bin = optuna.create_study(
    direction="minimize",  # objective_binary returns 1 - macro-F1
    study_name="svm_binary",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_bin.optimize(objective_binary, n_trials=30, show_progress_bar=True)

print("Best Value:", study_bin.best_value)
print("Best Params:", study_bin.best_params)

[I 2025-11-30 16:40:56,441] A new study created in memory with name: svm_binary


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-30 16:41:13,381] Trial 0 finished with value: 0.264574068355355 and parameters: {'C': 285.78630313442625, 'gamma': 0.00014824700583609328}. Best is trial 0 with value: 0.264574068355355.
[I 2025-11-30 16:41:44,876] Trial 1 finished with value: 0.6041666666666667 and parameters: {'C': 0.0033621386290543297, 'gamma': 5.590889270742507}. Best is trial 0 with value: 0.264574068355355.
[I 2025-11-30 16:42:01,656] Trial 2 finished with value: 0.2752265570776584 and parameters: {'C': 2.0243089218148795, 'gamma': 0.0003819043957207253}. Best is trial 0 with value: 0.264574068355355.
[I 2025-11-30 16:42:17,065] Trial 3 finished with value: 0.2754868288255269 and parameters: {'C': 0.035600541988401804, 'gamma': 0.05643487795630867}. Best is trial 0 with value: 0.264574068355355.
[I 2025-11-30 16:42:30,575] Trial 4 finished with value: 0.200031601022724 and parameters: {'C': 1.746022055523842, 'gamma': 0.02316519251822359}. Best is trial 4 with value: 0.200031601022724.
[I 2025-11-30 1

# 재학습 및 평가

In [None]:
# Best (C, gamma) found by the Optuna search.
best = study_bin.best_params

# Same RBF SVC as in the objective, but with probability estimates enabled so
# that predict_proba is available for the AUC metrics and threshold sweep.
_final_svc_options = dict(
    kernel="rbf",
    C=best["C"],
    gamma=best["gamma"],
    probability=True,
    random_state=RANDOM_STATE,
)
final_bin_model = SVC(**_final_svc_options)

# Fit on the training part of the split only (X_val is not included).
final_bin_model.fit(X_tr, y_tr)

In [None]:
# Hard predictions and column-1 (class 1) probabilities on the test set.
y_pred = final_bin_model.predict(X_test)
y_prob = final_bin_model.predict_proba(X_test)[:, 1]

# Label-based metrics use the hard predictions.
accuracy = accuracy_score(y_test, y_pred)
f1_macro, f1_micro, f1_weighted = (
    f1_score(y_test, y_pred, average=avg)
    for avg in ("macro", "micro", "weighted")
)

# Ranking-based metrics use the probability scores.
roc_auc = roc_auc_score(y_test, y_prob)
pr_auc = average_precision_score(y_test, y_prob)

print("===== [BINARY TEST RESULTS] =====")
for _caption, _value in (
    ("Accuracy      :", accuracy),
    ("F1-macro      :", f1_macro),
    ("F1-micro      :", f1_micro),
    ("F1-weighted   :", f1_weighted),
    ("ROC-AUC       :", roc_auc),
    ("PR-AUC        :", pr_auc),
):
    print(_caption, _value)
print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))

===== [BINARY TEST RESULTS] =====
Accuracy      : 0.858735632183908
F1-macro      : 0.8392794437637461
F1-micro      : 0.858735632183908
F1-weighted   : 0.8566338239204813
ROC-AUC       : 0.9182860818713451
PR-AUC        : 0.9472136902681019

Classification Report:
               precision    recall  f1-score   support

           0     0.8313    0.7407    0.7834      3000
           1     0.8709    0.9209    0.8952      5700

    accuracy                         0.8587      8700
   macro avg     0.8511    0.8308    0.8393      8700
weighted avg     0.8572    0.8587    0.8566      8700



# Threshold tuning

In [None]:
def evaluate_threshold(model, X, y_true, tau):
    """Score the hard labels obtained by thresholding ``predict_proba`` column 1.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y_true: Ground-truth binary labels for ``X``.
        tau: Decision threshold; a sample is predicted 1 when its column-1
            probability is >= ``tau`` and 0 otherwise.

    Returns:
        dict with keys ``"acc"``, ``"f1_macro"``, ``"f1_micro"`` and
        ``"f1_weighted"``.
    """
    positive_prob = model.predict_proba(X)[:, 1]
    # Samples below the threshold fall to class 0 (the "unknown" side).
    hard_labels = (positive_prob >= tau).astype(int)

    metrics = {"acc": accuracy_score(y_true, hard_labels)}
    for key, averaging in (
        ("f1_macro", "macro"),
        ("f1_micro", "micro"),
        ("f1_weighted", "weighted"),
    ):
        metrics[key] = f1_score(y_true, hard_labels, average=averaging)
    return metrics

In [None]:
# Candidate decision thresholds: 17 evenly spaced values from 0.10 to 0.90.
taus = np.linspace(0.1, 0.9, 17)

# Select the threshold on the validation split, NOT on the test set: picking τ
# by maximizing a test metric leaks test labels into model selection and makes
# the reported test score optimistic. final_bin_model was fit on X_tr only, so
# X_val was not seen by that fit (it did guide the C/gamma search, however).
results = [
    (t, evaluate_threshold(final_bin_model, X_val, y_val, t)["f1_macro"])
    for t in taus
]

# max() returns the first maximum, so ties resolve to the smallest τ.
best_tau = max(results, key=lambda x: x[1])[0]
print("Best τ:", best_tau)

Best τ: 0.65


In [None]:
# Test-set metrics when predictions are cut at the chosen threshold best_tau.
evaluate_threshold(
    model=final_bin_model,
    X=X_test,
    y_true=y_test,
    tau=best_tau,
)

{'acc': 0.8512643678160919,
 'f1_macro': 0.8374529214407478,
 'f1_micro': 0.8512643678160919,
 'f1_weighted': 0.8521575370611852}