In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the dataset CSV from the mounted Drive folder into a DataFrame.
data = pd.read_csv(
    '/content/drive/MyDrive/monunmon.csv',
)

**LightGBM**




- 기본 모델 학습

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import numpy as np

# Baseline LightGBM multiclass model: split the data, train with early
# stopping on the validation split, then report validation and test metrics.

# Separate the feature matrix (X) from the target column (y).
X = data.drop(columns=['Label'])
y = data['Label']

# Shift every label by +1 so that the smallest label (-1) becomes 0:
# -1 -> 0, 0 -> 1, ..., 94 -> 95.
y_transformed = y + 1

# np.argmax over the predicted probability columns (below) is only a valid
# class id when the shifted labels are exactly 0..num_classes-1, so verify
# that up front instead of silently mis-mapping predictions.
num_classes = len(np.unique(y_transformed))
assert np.array_equal(np.unique(y_transformed), np.arange(num_classes)), (
    "Shifted labels must be the contiguous integers 0..num_classes-1"
)

# Stratified split into 60% train, 20% validation, 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y_transformed, test_size=0.4, random_state=42, stratify=y_transformed)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Build the LightGBM datasets; validation/test reuse the training set's binning.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Upper bound on boosting rounds (100 is LightGBM's default) and the number of
# rounds without validation improvement after which training stops.
NUM_BOOST_ROUND = 100
EARLY_STOPPING_ROUNDS = 10

# Model parameters.
# NOTE: 'is_unbalance' was removed. LightGBM documents it as used only by the
# binary and multiclassova objectives, so it had no effect with 'multiclass'.
params = {
    'objective': 'multiclass',
    'num_class': num_classes,  # number of classes
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'random_state': 42,
    'verbose': -1  # keep logging to a minimum
}

# Train the model. The early-stopping callback watches the validation
# multi_logloss (the training set in valid_sets is ignored by the callback)
# and records the best round in model.best_iteration.
model = lgb.train(
    params,
    train_data,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[train_data, val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)]
)

# Validation accuracy, using only the trees up to the best iteration.
y_val_pred_prob = model.predict(X_val, num_iteration=model.best_iteration)  # class probabilities
y_val_pred = np.argmax(y_val_pred_prob, axis=1)  # pick the most probable class
valid_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {valid_accuracy}")

# Predict on the test set.
y_test_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)  # class probabilities
y_test_pred = np.argmax(y_test_pred_prob, axis=1)  # pick the most probable class

# Undo the label shift: 0 -> -1, 1 -> 0, ..., 95 -> 94.
y_test_original = y_test - 1
y_test_pred_original = y_test_pred - 1

# Evaluate on the test set. zero_division=0 yields the same numbers as the
# default but without a warning for classes that are never predicted.
test_accuracy = accuracy_score(y_test_original, y_test_pred_original)
classification_rep = classification_report(y_test_original, y_test_pred_original, zero_division=0)
conf_matrix = confusion_matrix(y_test_original, y_test_pred_original)
test_f1_score = f1_score(y_test_original, y_test_pred_original, average='weighted', zero_division=0)  # support-weighted F1

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score (weighted): {test_f1_score}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

Validation Accuracy: 0.7095454545454546
Test Accuracy: 0.7227272727272728
Test F1 Score (weighted): 0.7286673447790059

Classification Report:
               precision    recall  f1-score   support

          -1       0.46      0.81      0.59       600
           0       0.83      0.62      0.71        40
           1       0.71      0.55      0.62        40
           2       0.87      0.82      0.85        40
           3       0.78      0.72      0.75        40
           4       0.83      0.62      0.71        40
           5       0.82      0.70      0.76        40
           6       0.77      0.85      0.81        40
           7       0.90      0.68      0.77        40
           8       0.80      0.60      0.69        40
           9       0.66      0.57      0.61        40
          10       0.78      0.62      0.69        40
          11       0.77      0.68      0.72        40
          12       0.88      0.88      0.88        40
          13       0.71      0.50      0.59  

**Weighted (per-class sample weights)**

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import numpy as np

# Class-weighted LightGBM multiclass model: same pipeline as the baseline, but
# every training row carries a weight inversely proportional to its class size.

# Separate the feature matrix (X) from the target column (y).
X = data.drop(columns=['Label'])
y = data['Label']

# Shift every label by +1 so that the smallest label (-1) becomes 0:
# -1 -> 0, 0 -> 1, ..., 94 -> 95.
y_transformed = y + 1

# np.argmax over the predicted probability columns (below) is only a valid
# class id when the shifted labels are exactly 0..num_classes-1, so verify
# that up front instead of silently mis-mapping predictions.
num_classes = len(np.unique(y_transformed))
assert np.array_equal(np.unique(y_transformed), np.arange(num_classes)), (
    "Shifted labels must be the contiguous integers 0..num_classes-1"
)

# Stratified split into 60% train, 20% validation, 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y_transformed, test_size=0.4, random_state=42, stratify=y_transformed)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Per-class weight = n_samples / (n_classes * class_count), computed with one
# vectorised value_counts() pass instead of a Python-level sum per class.
class_counts = y_train.value_counts()
class_weights = (len(y_train) / (len(class_counts) * class_counts)).to_dict()

# Expand the per-class weights to one weight per training row.
weights = y_train.map(class_weights).to_numpy()

# Build the LightGBM datasets (training set carries the sample weights).
train_data = lgb.Dataset(X_train, label=y_train, weight=weights)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Upper bound on boosting rounds (100 is LightGBM's default) and the number of
# rounds without validation improvement after which training stops.
NUM_BOOST_ROUND = 100
EARLY_STOPPING_ROUNDS = 10

# Model parameters.
# NOTE: 'is_unbalance' was removed. LightGBM documents it as used only by the
# binary and multiclassova objectives, so it had no effect with 'multiclass';
# the imbalance is handled by the explicit sample weights above.
params = {
    'objective': 'multiclass',
    'num_class': num_classes,  # number of classes
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'random_state': 42,
    'verbose': -1  # keep logging to a minimum
}

# Train the model. The early-stopping callback watches the validation
# multi_logloss (the training set in valid_sets is ignored by the callback)
# and records the best round in model.best_iteration.
model = lgb.train(
    params,
    train_data,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[train_data, val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)]
)

# Validation accuracy, using only the trees up to the best iteration.
y_val_pred_prob = model.predict(X_val, num_iteration=model.best_iteration)  # class probabilities
y_val_pred = np.argmax(y_val_pred_prob, axis=1)  # pick the most probable class
valid_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {valid_accuracy}")

# Predict on the test set.
y_test_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)  # class probabilities
y_test_pred = np.argmax(y_test_pred_prob, axis=1)  # pick the most probable class

# Undo the label shift: 0 -> -1, 1 -> 0, ..., 95 -> 94.
y_test_original = y_test - 1
y_test_pred_original = y_test_pred - 1

# Evaluate on the test set. zero_division=0 yields the same numbers as the
# default but without a warning for classes that are never predicted.
test_accuracy = accuracy_score(y_test_original, y_test_pred_original)
classification_rep = classification_report(y_test_original, y_test_pred_original, zero_division=0)
conf_matrix = confusion_matrix(y_test_original, y_test_pred_original)
test_f1_score = f1_score(y_test_original, y_test_pred_original, average='weighted', zero_division=0)  # support-weighted F1

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score (weighted): {test_f1_score}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Validation Accuracy: 0.7188636363636364
Test Accuracy: 0.7293181818181819
Test F1 Score (weighted): 0.7320001442166222

Classification Report:
               precision    recall  f1-score   support

          -1       0.53      0.72      0.61       600
           0       0.83      0.60      0.70        40
           1       0.65      0.55      0.59        40
           2       0.87      0.85      0.86        40
           3       0.73      0.82      0.78        40
           4       0.76      0.65      0.70        40
           5       0.81      0.75      0.78        40
           6       0.86      0.80      0.83        40
           7       0.88      0.72      0.79        40
           8       0.85      0.72      0.78        40
           9       0.62      0.65      0.63        40
          10       0.81      0.65      0.72        40
          11       0.81      0.65      0.72        40
          12       0.78      0.80      0.79        40
          13       0.68      0.47      0.56  

**SMOTE**

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import numpy as np

# SMOTE + LightGBM multiclass model: oversample the training split only, train
# with early stopping on the untouched validation split, then report metrics.

# Separate the feature matrix (X) from the target column (y).
X = data.drop(columns=['Label'])
y = data['Label']

# Shift every label by +1 so that the smallest label (-1) becomes 0:
# -1 -> 0, 0 -> 1, ..., 94 -> 95.
y_transformed = y + 1

# np.argmax over the predicted probability columns (below) is only a valid
# class id when the shifted labels are exactly 0..num_classes-1, so verify
# that up front instead of silently mis-mapping predictions.
num_classes = len(np.unique(y_transformed))
assert np.array_equal(np.unique(y_transformed), np.arange(num_classes)), (
    "Shifted labels must be the contiguous integers 0..num_classes-1"
)

# Stratified split into 60% train, 20% validation, 20% test.
X_train, X_temp, y_train, y_temp = train_test_split(X, y_transformed, test_size=0.4, random_state=42, stratify=y_transformed)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Apply SMOTE oversampling to the training split only, so the validation and
# test splits contain no synthetic rows.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Build the LightGBM datasets; validation/test reuse the training set's binning.
train_data_smote = lgb.Dataset(X_train_smote, label=y_train_smote)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data_smote)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data_smote)

# Upper bound on boosting rounds (100 is LightGBM's default) and the number of
# rounds without validation improvement after which training stops.
NUM_BOOST_ROUND = 100
EARLY_STOPPING_ROUNDS = 10

# Model parameters.
# NOTE: 'is_unbalance' was removed. LightGBM documents it as used only by the
# binary and multiclassova objectives, so it had no effect with 'multiclass'
# (and the SMOTE-resampled training set is what addresses the imbalance here).
params = {
    'objective': 'multiclass',
    'num_class': num_classes,  # number of classes
    'boosting_type': 'gbdt',
    'metric': 'multi_logloss',
    'random_state': 42,
    'verbose': -1  # keep logging to a minimum
}

# Train the model. The early-stopping callback watches the validation
# multi_logloss (the training set in valid_sets is ignored by the callback)
# and records the best round in model_smote.best_iteration.
# NOTE(review): the previously recorded ~13% accuracy for this cell suggests the
# run may have degraded on real data without validation-based stopping --
# confirm by inspecting the validation loss curve.
model_smote = lgb.train(
    params,
    train_data_smote,
    num_boost_round=NUM_BOOST_ROUND,
    valid_sets=[train_data_smote, val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)]
)

# Validation accuracy, using only the trees up to the best iteration.
y_val_pred_prob = model_smote.predict(X_val, num_iteration=model_smote.best_iteration)  # class probabilities
y_val_pred = np.argmax(y_val_pred_prob, axis=1)  # pick the most probable class
valid_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {valid_accuracy}")

# Predict on the test set.
y_test_pred_prob = model_smote.predict(X_test, num_iteration=model_smote.best_iteration)  # class probabilities
y_test_pred = np.argmax(y_test_pred_prob, axis=1)  # pick the most probable class

# Undo the label shift: 0 -> -1, 1 -> 0, ..., 95 -> 94.
y_test_original = y_test - 1
y_test_pred_original = y_test_pred - 1

# Evaluate on the test set. zero_division=0 yields the same numbers as the
# default but without the undefined-metric warnings emitted for classes that
# are never predicted.
test_accuracy = accuracy_score(y_test_original, y_test_pred_original)
classification_rep = classification_report(y_test_original, y_test_pred_original, zero_division=0)
conf_matrix = confusion_matrix(y_test_original, y_test_pred_original)
test_f1_score = f1_score(y_test_original, y_test_pred_original, average='weighted', zero_division=0)  # support-weighted F1

print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score (weighted): {test_f1_score}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Validation Accuracy: 0.1315909090909091
Test Accuracy: 0.1390909090909091
Test F1 Score (weighted): 0.12861952418113048

Classification Report:
               precision    recall  f1-score   support

          -1       0.44      0.04      0.07       600
           0       0.08      0.12      0.10        40
           1       0.00      0.00      0.00        40
           2       0.00      0.00      0.00        40
           3       0.10      0.30      0.15        40
           4       0.00      0.00      0.00        40
           5       0.00      0.00      0.00        40
           6       0.15      0.38      0.22        40
           7       0.10      0.10      0.10        40
           8       0.11      0.23      0.15        40
           9       0.08      0.15      0.10        40
          10       0.03      0.05      0.03        40
          11       0.00      0.00      0.00        40
          12       0.00      0.00      0.00        40
          13       0.03      0.03      0.03 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
