In [1]:
!pip install lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [13]:
data = pd.read_csv("/content/monunmon.csv")  # CSV 파일 경로

# 레이블 이진 분류를 위해 변환 (0~94 -> 1, -1 -> 0)
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)

# 변환 결과 확인
print("Label Distribution:")
print(data['Label'].value_counts())

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64


#오리지널 데이터 사용

##Gradient Boosting Classifier

In [20]:
# Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred_gbc = model.predict(X_test)

# 검증 데이터 평가
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"\nTest Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.89
Validation F1 Score: 0.94

Test Accuracy: 0.90
Test F1 Score: 0.94

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.86      0.28      0.42       450
           1       0.90      0.99      0.94      2850

    accuracy                           0.90      3300
   macro avg       0.88      0.64      0.68      3300
weighted avg       0.89      0.90      0.87      3300



In [21]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.28
Test Class 1 Accuracy: 0.99


##XGBoost

In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# XGBoost Classifier 학습
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.91
Validation F1 Score: 0.95
Test Accuracy: 0.91
Test F1 Score: 0.95

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.81      0.44      0.57       450
           1       0.92      0.98      0.95      2850

    accuracy                           0.91      3300
   macro avg       0.87      0.71      0.76      3300
weighted avg       0.90      0.91      0.90      3300



In [27]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.44
Test Class 1 Accuracy: 0.98


##LightGBM

In [28]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = lgbm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

[LightGBM] [Info] Number of positive: 13300, number of negative: 2100
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3345
[LightGBM] [Info] Number of data points in the train set: 15400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.863636 -> initscore=1.845827
[LightGBM] [Info] Start training from score 1.845827
Validation Accuracy: 0.91
Validation F1 Score: 0.95
Test Accuracy: 0.91
Test F1 Score: 0.95

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.84      0.38      0.52       450
           1       0.91      0.99      0.95      2850

    accuracy                           0.91      3300
   macro avg       0.88      0.68      0.73      3300
weighted avg       0.90      0.91      0.89 

In [29]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.38
Test Class 1 Accuracy: 0.99


##catBoost

In [30]:
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)  # verbose=0으로 출력 제한
catboost.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = catboost.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = catboost.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.91
Validation F1 Score: 0.95
Test Accuracy: 0.90
Test F1 Score: 0.95

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.82      0.38      0.52       450
           1       0.91      0.99      0.95      2850

    accuracy                           0.90      3300
   macro avg       0.87      0.68      0.73      3300
weighted avg       0.90      0.90      0.89      3300



In [31]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.38
Test Class 1 Accuracy: 0.99


##adaBoost

In [32]:
# AdaBoost Classifier 사용
abc = AdaBoostClassifier(random_state=42)
abc.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = abc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = abc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.89
Validation F1 Score: 0.94
Test Accuracy: 0.89
Test F1 Score: 0.94

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.76      0.25      0.37       450
           1       0.89      0.99      0.94      2850

    accuracy                           0.89      3300
   macro avg       0.82      0.62      0.65      3300
weighted avg       0.87      0.89      0.86      3300



In [33]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.25
Test Class 1 Accuracy: 0.99


#0 oversampling 한 데이터

##데이터 생성

In [34]:
!pip install imbalanced-learn



In [35]:
from imblearn.over_sampling import SMOTE

In [40]:
data = pd.read_csv("/content/monunmon.csv")  # CSV 파일 경로

# 레이블 이진 분류를 위해 변환 (0~94 -> 1, -1 -> 0)
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)

# 변환 결과 확인
print("Label Distribution:")
print(data['Label'].value_counts())

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터


Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64


In [46]:
# SMOTE 적용
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("Original dataset size:", X.shape[0])
print("Resampled dataset size:", X_resampled.shape[0])

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42, stratify=y_resampled)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

Original dataset size: 22000
Resampled dataset size: 38000


##Gradient Boosting Classifier

In [47]:
# Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred_gbc = model.predict(X_test)

# 검증 데이터 평가
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"\nTest Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.89
Validation F1 Score: 0.90

Test Accuracy: 0.90
Test F1 Score: 0.90

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.94      0.84      0.89      2850
           1       0.86      0.95      0.90      2850

    accuracy                           0.90      5700
   macro avg       0.90      0.90      0.90      5700
weighted avg       0.90      0.90      0.90      5700



In [48]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.84
Test Class 1 Accuracy: 0.95


##XGBoost

In [49]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# XGBoost Classifier 학습
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.93
Validation F1 Score: 0.93
Test Accuracy: 0.93
Test F1 Score: 0.93

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.96      0.90      0.93      2850
           1       0.91      0.96      0.93      2850

    accuracy                           0.93      5700
   macro avg       0.93      0.93      0.93      5700
weighted avg       0.93      0.93      0.93      5700



In [50]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.90
Test Class 1 Accuracy: 0.96


##LightGBM

In [51]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = lgbm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

[LightGBM] [Info] Number of positive: 13300, number of negative: 13300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 26600, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Validation Accuracy: 0.92
Validation F1 Score: 0.92
Test Accuracy: 0.92
Test F1 Score: 0.92

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.96      0.88      0.92      2850
           1       0.89      0.96      0.92      2850

    accuracy                           0.92      5700
   macro avg       0.92      0.92      0.92      5700
weighted avg       0.92      0.92      0.92      5700



In [52]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.88
Test Class 1 Accuracy: 0.96


##catBoost

In [53]:
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, verbose=0)  # verbose=0으로 출력 제한
catboost.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = catboost.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = catboost.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.93
Validation F1 Score: 0.93
Test Accuracy: 0.93
Test F1 Score: 0.93

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.96      0.89      0.93      2850
           1       0.90      0.97      0.93      2850

    accuracy                           0.93      5700
   macro avg       0.93      0.93      0.93      5700
weighted avg       0.93      0.93      0.93      5700



In [54]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.89
Test Class 1 Accuracy: 0.97


##adaBoost

In [55]:
# AdaBoost Classifier 사용
abc = AdaBoostClassifier(random_state=42)
abc.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = abc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = abc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.86
Validation F1 Score: 0.86
Test Accuracy: 0.86
Test F1 Score: 0.87

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.89      0.82      0.86      2850
           1       0.83      0.90      0.87      2850

    accuracy                           0.86      5700
   macro avg       0.86      0.86      0.86      5700
weighted avg       0.86      0.86      0.86      5700



In [56]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.82
Test Class 1 Accuracy: 0.90


**GBC와 AdaBoost:**

SMOTE로 생성된 합성 데이터가 원본 데이터와 다소 다른 분포를 가질 경우, 모델 성능이 저하될 가능성이 높음.
특히 AdaBoost는 잘못된 데이터에 민감함.


**CatBoost, LightGBM, XGBoost:**

이러한 알고리즘들은 데이터 노이즈와 불균형에 더 잘 대처하는 구조를 가짐.

#0에 weighted 가중치 부여한 데이터


In [61]:
data = pd.read_csv("/content/monunmon.csv")  # CSV 파일 경로

# 레이블 이진 분류를 위해 변환 (0~94 -> 1, -1 -> 0)
data['Label'] = data['Label'].apply(lambda x: 1 if x >= 0 else 0)

# 변환 결과 확인
print("Label Distribution:")
print(data['Label'].value_counts())

# 특성과 레이블 분리
X = data.drop(columns=['Label'])  # 피처 데이터
y = data['Label']  # 레이블 데이터

# Train, Validation, Test 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)


Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64


##Gradient Boosting Classifier

In [62]:
from sklearn.utils.class_weight import compute_sample_weight
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [64]:
# Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train, sample_weight=sample_weights)
y_pred_gbc = model.predict(X_test)

# 검증 데이터 평가
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"\nTest Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.79
Validation F1 Score: 0.87

Test Accuracy: 0.79
Test F1 Score: 0.86

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.36      0.72      0.48       450
           1       0.95      0.80      0.86      2850

    accuracy                           0.79      3300
   macro avg       0.65      0.76      0.67      3300
weighted avg       0.87      0.79      0.81      3300



In [65]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.72
Test Class 1 Accuracy: 0.80


##XGBoost

scale_pos_weight = Negative Sample 수 / Positive Sample 수

In [66]:
# 가중치 설정
num_0 = sum(y_train == 0)
num_1 = sum(y_train == 1)
scale_pos_weight = num_0 / num_1

In [67]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# XGBoost Classifier 학습
xgb = XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = xgb.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.87
Validation F1 Score: 0.92
Test Accuracy: 0.86
Test F1 Score: 0.92

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.49      0.69      0.57       450
           1       0.95      0.88      0.92      2850

    accuracy                           0.86      3300
   macro avg       0.72      0.79      0.74      3300
weighted avg       0.88      0.86      0.87      3300



In [68]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.69
Test Class 1 Accuracy: 0.88


##LightGBM

In [69]:
# 가중치 설정
num_0 = sum(y_train == 0)
num_1 = sum(y_train == 1)
scale_pos_weight = num_0 / num_1

In [70]:
# LightGBM Classifier
lgbm = LGBMClassifier(random_state=42,scale_pos_weight=scale_pos_weight)
lgbm.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = lgbm.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = lgbm.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

[LightGBM] [Info] Number of positive: 13300, number of negative: 2100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3345
[LightGBM] [Info] Number of data points in the train set: 15400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.863636 -> initscore=1.845827
[LightGBM] [Info] Start training from score 1.845827
Validation Accuracy: 0.85
Validation F1 Score: 0.91
Test Accuracy: 0.84
Test F1 Score: 0.90

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.44      0.71      0.54       450
           1       0.95      0.86      0.90      2850

    accuracy                           0.84      3300
   macro avg       0.69      0.78      0.72      3300
weighted avg       0.88      0.84      0.85      3300



In [71]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.71
Test Class 1 Accuracy: 0.86


##catBoost

In [72]:
# 가중치 설정
num_0 = sum(y_train == 0)
num_1 = sum(y_train == 1)
class_weights = [num_1 / num_0, 1.0]  # [weight for class 0, weight for class 1]

In [73]:
# CatBoost Classifier
catboost = CatBoostClassifier(random_state=42, class_weights=class_weights, verbose=0)  # verbose=0으로 출력 제한
catboost.fit(X_train, y_train)

# 검증 데이터 평가
y_val_pred = catboost.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = catboost.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.86
Validation F1 Score: 0.92
Test Accuracy: 0.85
Test F1 Score: 0.91

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.48      0.74      0.58       450
           1       0.95      0.87      0.91      2850

    accuracy                           0.85      3300
   macro avg       0.71      0.80      0.74      3300
weighted avg       0.89      0.85      0.87      3300



In [74]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.74
Test Class 1 Accuracy: 0.87


##adaBoost

In [76]:
from sklearn.utils.class_weight import compute_sample_weight

# 가중치 계산
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

In [77]:
# AdaBoost Classifier 사용
abc = AdaBoostClassifier(random_state=42)
abc.fit(X_train, y_train, sample_weight=sample_weights)

# 검증 데이터 평가
y_val_pred = abc.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print(f"Validation F1 Score: {val_f1:.2f}")

# 테스트 데이터 평가
y_test_pred = abc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Test F1 Score: {test_f1:.2f}")

# 분류 보고서 출력
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_test_pred))



Validation Accuracy: 0.76
Validation F1 Score: 0.84
Test Accuracy: 0.76
Test F1 Score: 0.84

Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.32      0.69      0.44       450
           1       0.94      0.76      0.84      2850

    accuracy                           0.76      3300
   macro avg       0.63      0.73      0.64      3300
weighted avg       0.86      0.76      0.79      3300



In [78]:
#클래스별 Accuracy 계산
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_0_accuracy = conf_matrix[0, 0] / conf_matrix[0].sum()  # TP / (TP + FN) for class 0
class_1_accuracy = conf_matrix[1, 1] / conf_matrix[1].sum()  # TP / (TP + FN) for class 1

# 클래스별 Accuracy 출력
print(f"\nTest Class 0 Accuracy: {class_0_accuracy:.2f}")
print(f"Test Class 1 Accuracy: {class_1_accuracy:.2f}")


Test Class 0 Accuracy: 0.69
Test Class 1 Accuracy: 0.76
