### 데이터 불러오기

In [1]:
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/Coursework/25-2 Machine Learning/Project/DataPreprocessing_final/openworld_train.csv")
test = pd.read_csv("/content/drive/MyDrive/Coursework/25-2 Machine Learning/Project/DataPreprocessing_final/openworld_test.csv")

y_train = train['label']
X_train = train.drop(columns=['label'])

y_test = test['label']
X_test = test.drop(columns=['label'])

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (20300, 26)
y_train shape: (20300,)
X_test shape: (8700, 26)
y_test shape: (8700,)


In [5]:
# validation set 분리
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

#이진 분류 라벨 생성
y_tr_bin = (y_train != 95).astype(int)
y_val_bin = (y_val != 95).astype(int)
y_test_bin = (y_test != 95).astype(int)

### xgboost 임포트

In [6]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier, callback
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, average_precision_score, precision_recall_curve, accuracy_score, f1_score
)

### 클래스 불균형 대응

In [7]:
# 양성=1, 음성=0 가정
pos = (y_tr_bin == 1).sum()
neg = (y_val_bin == 0).sum()
scale_pos_weight = neg / max(pos, 1)   # pos=0 보호

print(f"pos={pos}, neg={neg}, scale_pos_weight={scale_pos_weight:.2f}")

pos=8512, neg=1120, scale_pos_weight=0.13


### 베이스라인 학습

In [9]:
xgb = XGBClassifier(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=1.0,
    reg_lambda=1.0,
    reg_alpha=0.0,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    eval_metric=["aucpr","auc", "logloss"],
    early_stopping_rounds = 100
)

xgb.fit(
    X_train, y_tr_bin,
    eval_set=[(X_train, y_tr_bin), (X_val, y_val_bin)],
    verbose=50
)

[0]	validation_0-aucpr:0.91880	validation_0-auc:0.85696	validation_0-logloss:1.02824	validation_1-aucpr:0.91900	validation_1-auc:0.85792	validation_1-logloss:1.02792
[50]	validation_0-aucpr:0.95568	validation_0-auc:0.91859	validation_0-logloss:0.66163	validation_1-aucpr:0.94673	validation_1-auc:0.90348	validation_1-logloss:0.66850
[100]	validation_0-aucpr:0.96407	validation_0-auc:0.93385	validation_0-logloss:0.56972	validation_1-aucpr:0.95304	validation_1-auc:0.91555	validation_1-logloss:0.58313
[150]	validation_0-aucpr:0.97066	validation_0-auc:0.94537	validation_0-logloss:0.51512	validation_1-aucpr:0.95752	validation_1-auc:0.92376	validation_1-logloss:0.53554
[200]	validation_0-aucpr:0.97606	validation_0-auc:0.95482	validation_0-logloss:0.47148	validation_1-aucpr:0.96111	validation_1-auc:0.93044	validation_1-logloss:0.49948
[250]	validation_0-aucpr:0.97990	validation_0-auc:0.96163	validation_0-logloss:0.43808	validation_1-aucpr:0.96305	validation_1-auc:0.93393	validation_1-logloss:0.4

### 예측

In [12]:
y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:,1]

# ==== Accuracy ====
accuracy = accuracy_score(y_test_bin, y_pred)

# ==== F1 ====
f1_macro = f1_score(y_test_bin, y_pred, average='macro')
f1_micro = f1_score(y_test_bin, y_pred, average='micro')
f1_weighted = f1_score(y_test_bin, y_pred, average='weighted')

# ==== ROC-AUC ====
roc_auc = roc_auc_score(y_test_bin, y_prob)

# ==== PR-AUC ====
pr_auc = average_precision_score(y_test_bin, y_prob, average='macro')

# ==== Classification report ====
report = classification_report(y_test_bin, y_pred)

print("Accuracy:", accuracy)
print("F1-macro:", f1_macro)
print("F1-micro:", f1_micro)
print("F1-weighted:", f1_weighted)
print("ROC-AUC:", roc_auc)
print("PR-AUC:", pr_auc)
print(report)

Accuracy: 0.8801149425287357
F1-macro: 0.8714478528238355
F1-micro: 0.8801149425287357
F1-weighted: 0.881806924184274
ROC-AUC: 0.9568333333333333
PR-AUC: 0.9760184133116043
              precision    recall  f1-score   support

           0       0.78      0.90      0.84      3000
           1       0.94      0.87      0.90      5700

    accuracy                           0.88      8700
   macro avg       0.86      0.88      0.87      8700
weighted avg       0.89      0.88      0.88      8700

