In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, make_scorer
from lightgbm import LGBMClassifier
import warnings

# Silence UserWarning-category warnings for the rest of the run.
warnings.filterwarnings("ignore", category=UserWarning)

# Load the dataset from CSV.
data = pd.read_csv('../../../mon.csv')

# Quick sanity check: overall shape and number of distinct labels.
print(f"Data Shape: {data.shape}")
print(f"Unique Labels: {data['Label'].nunique()}")

# Separate the target column from the feature columns.
y = data['Label']
X = data.drop(columns=['Label'])

# Split into train / validation / test (60% / 20% / 20%).
# Step 1: hold out 40% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)
# Step 2: cut the held-out 40% in half to get validation and test sets,
# again stratified so each split keeps the same label proportions.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Hyperparameter search space for the LightGBM classifier.
# Each key is an LGBMClassifier constructor argument; RandomizedSearchCV
# samples one value per key for every candidate.
param_dist = {
    'num_leaves': [50, 70, 100, 150],
    'max_depth': [10, 20, 30, -1],  # per LightGBM docs, <= 0 means no depth limit
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [200, 300, 500],
    'min_child_samples': [1, 5, 10],
    'min_split_gain': [0.0, 0.01, 0.1],
    'subsample': [0.6, 0.8, 1.0],  # row bagging; needs subsample_freq > 0 (set below)
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.0, 0.1, 0.5],
    'reg_lambda': [0.0, 0.1, 0.5],
}

# Base estimator shared by every search candidate.
# - num_class is derived from the labels instead of being hard-coded, so it
#   cannot drift out of sync with the dataset.
# - subsample_freq=1 turns bagging on at every iteration. LightGBM ignores
#   `subsample` (bagging_fraction) while `subsample_freq` (bagging_freq) is 0,
#   which is the default, so without this the `subsample` values above would
#   all train identical models.
lgbm = LGBMClassifier(
    objective='multiclass',
    num_class=y.nunique(),
    subsample_freq=1,
    random_state=42,
)

# Score candidates by macro-averaged F1 (every class weighted equally).
f1_scorer = make_scorer(f1_score, average='macro')

# Randomized search: 50 sampled candidates, 5-fold cross-validation each.
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,  # number of random parameter samples
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the randomized hyperparameter search on the training split.
print("Starting RandomizedSearchCV...")
random_search.fit(X_train, y_train)

# Report the best parameter combination and the score it achieved in the search.
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score (Macro):", random_search.best_score_)

# Evaluate the best estimator found by the search on the validation split.
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred, average='macro')
print(
    f"\nValidation Accuracy: {val_accuracy:.2f}",
    f"Validation F1 Score (Macro): {val_f1:.2f}",
    sep="\n",
)

# Evaluate the same estimator on the held-out test split.
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred, average='macro')
print(
    f"\nTest Accuracy: {test_accuracy:.2f}",
    f"Test F1 Score (Macro): {test_f1:.2f}",
    sep="\n",
)

# Per-class precision / recall / F1 breakdown on the test split.
print("\nClassification Report on Test Data:")
test_report = classification_report(y_test, y_test_pred)
print(test_report)


Data Shape: (19000, 16)
Unique Labels: 95
Starting RandomizedSearchCV...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3337
[LightGBM] [Info] Number of data points in the train set: 13300, number of used features: 15
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training from score -4.553877
[LightGBM] [Info] Start training fro