<a href="https://colab.research.google.com/github/cindyshin2211/Website_Fingerprinting_MLB/blob/%EC%8B%A0%EC%84%B1%ED%98%84/openworld(multi)_XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 필요한 라이브러리 불러오기
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier

import numpy as np

# Load the dataset from disk.
data = pd.read_csv("D:/MLB_TEAM/1128/mon.csv")  # path to the dataset CSV

# Shift every label up by one; per the original author's note this is meant to
# yield consecutive integers starting at 0
# (presumably the smallest raw label is -1 -- TODO confirm against the CSV).
data['Label'] += 1

# Separate the target column from the feature columns.
y = data['Label']  # labels
X = data.drop(columns=['Label'])  # features

# Stratified split: 70% train, then the held-out 30% is halved into
# validation and test (15% each of the full data).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Standardize features. The scaler is fitted on the training split only and
# then applied unchanged to all three splits, so no validation/test
# statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

### Method 0: baseline ###
print("=== Default LGBM ===")
# Baseline LightGBM classifier with library defaults (fixed seed, all cores).
# The XGBoost-only arguments `use_label_encoder` and `eval_metric='mlogloss'`
# were removed: LGBMClassifier does not define them, so they were only being
# forwarded to the booster as unrecognized parameters.
model_default = LGBMClassifier(random_state=42, n_jobs=-1)
model_default.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_test_pred_default = model_default.predict(X_test)
print("\nClassification Report on Test Data (Default LGBM):")
print(classification_report(y_test, y_test_pred_default))

test_accuracy_default = accuracy_score(y_test, y_test_pred_default)
# Weighted F1 averages per-class F1 scores weighted by class support.
test_f1_default = f1_score(y_test, y_test_pred_default, average='weighted')

print(f"Test Accuracy: {test_accuracy_default}")
print(f"Test F1 Score: {test_f1_default}")


### Method 1: class weighting ###
print("\n=== LGBM with Class Weights ===")
# Balanced class weights: n_samples / (n_classes * n_samples_in_class).
# A single value_counts() pass replaces the per-class Python-level sum over a
# boolean Series, producing the same weights without the O(n * n_classes) loop.
class_counts = y_train.value_counts()
class_weights = (len(y_train) / (len(class_counts) * class_counts)).to_dict()
# One weight per training row, looked up by that row's label.
sample_weights = y_train.map(class_weights)

# The XGBoost-only arguments `use_label_encoder` and `eval_metric='mlogloss'`
# were removed: LGBMClassifier does not define them.
model_weighted = LGBMClassifier(random_state=42, n_jobs=-1)
model_weighted.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate on the held-out test split.
y_test_pred = model_weighted.predict(X_test)
print("\nClassification Report on Test Data (Class Weights):")
print(classification_report(y_test, y_test_pred))

test_accuracy = accuracy_score(y_test, y_test_pred)
# Weighted F1 averages per-class F1 scores weighted by class support.
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")


### Method 2: oversampling with SMOTE ###
print("\n=== LGBM with SMOTE ===")
# Oversample the (already scaled) training split only; the test split is left
# untouched so the evaluation reflects the original class distribution.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Resampled Training Set Size (SMOTE):", X_train_smote.shape)

# The XGBoost-only arguments `use_label_encoder` and `eval_metric='mlogloss'`
# were removed: LGBMClassifier does not define them.
model_smote = LGBMClassifier(random_state=42, n_jobs=-1)
model_smote.fit(X_train_smote, y_train_smote)

# Evaluate on the held-out test split.
y_test_pred_smote = model_smote.predict(X_test)
print("\nClassification Report on Test Data (SMOTE):")
print(classification_report(y_test, y_test_pred_smote))

test_accuracy = accuracy_score(y_test, y_test_pred_smote)
# Weighted F1 averages per-class F1 scores weighted by class support.
test_f1 = f1_score(y_test, y_test_pred_smote, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")


=== Default XGBoost ===
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002602 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3337
[LightGBM] [Info] Number of data points in the train set: 13300, number of used features: 15
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from s