In [None]:
# 필요한 라이브러리 불러오기
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import numpy as np

# Load the dataset
data = pd.read_csv("../../../../monunmon.csv")  # path to the dataset CSV

# Shift every label up by one so class ids start at 0 and are contiguous
# (assumes the smallest raw label is -1 — TODO confirm against the CSV)
data['Label'] = data['Label'] + 1

# Separate the target column from the feature columns
y = data['Label']  # labels
X = data.drop(columns=['Label'])  # features

# Hold out 30% of the rows, then halve that hold-out into validation and test.
# Both splits are stratified on the label so class proportions are preserved.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Standardize features: the scaler is fitted on the training split only and
# the same transformation is then applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

### Method 0: baseline XGBoost ###
# Plain XGBoost fit on the scaled training split: no sample weights and no
# resampling are applied here.
print("=== Default XGBoost ===")
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model_default = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
model_default.fit(X_train, y_train)

# Evaluate on the held-out test split
y_test_pred_default = model_default.predict(X_test)
print("\nClassification Report on Test Data (Default XGBoost):")
print(classification_report(y_test, y_test_pred_default))

test_accuracy_default = accuracy_score(y_test, y_test_pred_default)
# 'weighted' averages the per-class F1 scores by class support
test_f1_default = f1_score(y_test, y_test_pred_default, average='weighted')

print(f"Test Accuracy: {test_accuracy_default}")
print(f"Test F1 Score: {test_f1_default}")


### Method 1: class weights ###
print("\n=== XGBoost with Class Weights ===")
# "Balanced" weighting: weight(c) = n_samples / (n_classes * count(c)), so
# every class carries the same total weight in the training loss.
# A single value_counts() pass replaces a per-class Python-level sum() over a
# boolean Series; sort_index() keeps the classes in ascending label order.
class_counts = y_train.value_counts().sort_index()
class_weights = (len(y_train) / (len(class_counts) * class_counts)).to_dict()
# Per-row weight, aligned with y_train
sample_weights = y_train.map(class_weights)

# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model_weighted = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
model_weighted.fit(X_train, y_train, sample_weight=sample_weights)

# Evaluate on the held-out test split
y_test_pred = model_weighted.predict(X_test)
print("\nClassification Report on Test Data (Class Weights):")
print(classification_report(y_test, y_test_pred))

test_accuracy = accuracy_score(y_test, y_test_pred)
# 'weighted' averages the per-class F1 scores by class support
test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")


### Method 2: oversampling with SMOTE ###
print("\n=== XGBoost with SMOTE ===")
# Resample the training split only; the test split is left untouched so the
# evaluation below is on original data.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("Resampled Training Set Size (SMOTE):", X_train_smote.shape)

# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model_smote = XGBClassifier(random_state=42, n_jobs=-1, eval_metric='mlogloss')
model_smote.fit(X_train_smote, y_train_smote)

# Evaluate on the held-out test split
y_test_pred_smote = model_smote.predict(X_test)
print("\nClassification Report on Test Data (SMOTE):")
print(classification_report(y_test, y_test_pred_smote))

# NOTE: these reuse the names `test_accuracy` / `test_f1` from the
# class-weight section, so the earlier values are overwritten here.
test_accuracy = accuracy_score(y_test, y_test_pred_smote)
# 'weighted' averages the per-class F1 scores by class support
test_f1 = f1_score(y_test, y_test_pred_smote, average='weighted')

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1}")


=== Default XGBoost ===


Parameters: { "use_label_encoder" } are not used.




Classification Report on Test Data (Default XGBoost):
              precision    recall  f1-score   support

           0       0.52      0.79      0.63       450
           1       0.72      0.60      0.65        30
           2       0.88      0.70      0.78        30
           3       0.91      0.70      0.79        30
           4       0.81      0.83      0.82        30
           5       0.85      0.77      0.81        30
           6       0.73      0.63      0.68        30
           7       0.89      0.83      0.86        30
           8       0.88      0.93      0.90        30
           9       0.84      0.70      0.76        30
          10       0.77      0.77      0.77        30
          11       0.84      0.70      0.76        30
          12       0.80      0.53      0.64        30
          13       0.93      0.87      0.90        30
          14       0.63      0.57      0.60        30
          15       0.80      0.67      0.73        30
          16       0.92   

Parameters: { "use_label_encoder" } are not used.




Classification Report on Test Data (Class Weights):
              precision    recall  f1-score   support

           0       0.71      0.55      0.62       450
           1       0.72      0.60      0.65        30
           2       0.72      0.77      0.74        30
           3       0.92      0.80      0.86        30
           4       0.74      0.87      0.80        30
           5       0.86      0.83      0.85        30
           6       0.69      0.67      0.68        30
           7       0.81      0.83      0.82        30
           8       0.83      0.97      0.89        30
           9       0.77      0.67      0.71        30
          10       0.68      0.77      0.72        30
          11       0.80      0.80      0.80        30
          12       0.79      0.63      0.70        30
          13       0.70      0.77      0.73        30
          14       0.54      0.63      0.58        30
          15       0.75      0.70      0.72        30
          16       0.82     

Parameters: { "use_label_encoder" } are not used.




Classification Report on Test Data (SMOTE):
              precision    recall  f1-score   support

           0       0.64      0.56      0.60       450
           1       0.61      0.47      0.53        30
           2       0.75      0.70      0.72        30
           3       0.89      0.80      0.84        30
           4       0.79      0.87      0.83        30
           5       0.71      0.83      0.77        30
           6       0.73      0.63      0.68        30
           7       0.92      0.80      0.86        30
           8       0.89      0.80      0.84        30
           9       0.79      0.63      0.70        30
          10       0.56      0.63      0.59        30
          11       0.72      0.70      0.71        30
          12       0.61      0.67      0.63        30
          13       0.87      0.87      0.87        30
          14       0.53      0.60      0.56        30
          15       0.68      0.70      0.69        30
          16       0.79      0.77   