In [None]:
#1. 필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score
import lightgbm as lgb

#2. Load the dataset
data = pd.read_csv("../../../../monunmon.csv")  # path to the CSV file

#3. Collapse the labels into a binary target (0~94 -> 1, -1 -> 0)
# Vectorised comparison: True where Label >= 0, then cast to integer 1/0.
data['Label'] = data['Label'].ge(0).astype('int64')

#4. Show the resulting class balance
print("Label Distribution:")
print(data['Label'].value_counts())

#5. Separate the features from the target column
X = data.drop(columns=['Label'])  # feature matrix
y = data['Label']  # binary target

#6. Train / validation / test split (70% / 15% / 15%)
# First hold out 30% of the rows, then cut that hold-out in half.
# Both splits are stratified so each part keeps the class ratio.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

#7. Standardise the features (not required by LightGBM, applied depending on the data distribution)
# The scaler is fitted on the training split only and then reused unchanged
# for every split, so validation/test statistics never influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

#8. Hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# 1. Search space: every combination of the values below is evaluated
#    (3 * 3 * 3 * 3 * 3 * 2 * 2 = 972 candidates).
param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 5, 10],
    'num_leaves': [31, 50, 100],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# 2. GridSearchCV configuration
grid_search = GridSearchCV(
    # LightGBM only performs row subsampling (bagging) when subsample_freq > 0;
    # with the default of 0 the 'subsample' values in the grid are ignored and
    # half of the candidates would be exact duplicates of the other half.
    # subsample_freq=1 re-samples the rows at every boosting iteration.
    estimator=lgb.LGBMClassifier(random_state=42, subsample_freq=1),
    param_grid=param_grid,
    scoring='f1',  # rank candidates by F1 score (of the positive class, label 1)
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,  # use all CPU cores
)

# 3. Run the search on the training split; the best candidate is refit on
#    the whole training split afterwards (GridSearchCV's default refit=True).
grid_search.fit(X_train, y_train)

# 4. Report the best hyperparameters and their mean cross-validated F1 score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

# 5. Evaluate the tuned model
# best_estimator_ is the model GridSearchCV refit with the best parameters.
best_model = grid_search.best_estimator_

# Predict both held-out splits up front, then score them.
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)

val_accuracy, val_f1 = (
    accuracy_score(y_val, y_val_pred),
    f1_score(y_val, y_val_pred),
)
test_accuracy, test_f1 = (
    accuracy_score(y_test, y_test_pred),
    f1_score(y_test, y_test_pred),
)

# Validation metrics
print(f"Validation Accuracy with Tuned Model: {val_accuracy:.2f}")
print(f"Validation F1 Score with Tuned Model: {val_f1:.2f}")

# Test metrics
print(f"Test Accuracy with Tuned Model: {test_accuracy:.2f}")
print(f"Test F1 Score with Tuned Model: {test_f1:.2f}")

# Per-class precision / recall / F1 on the test split
print("\nClassification Report on Test Data with Tuned Model:")
print(classification_report(y_test, y_test_pred))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Label Distribution:
Label
1    19000
0     3000
Name: count, dtype: int64
Fitting 3 folds for each of 972 candidates, totalling 2916 fits


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 13300, number of negative: 2100
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3345
[LightGBM] [Info] Number of data points in the train set: 15400, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.863636 -> initscore=1.845827
[LightGBM] [Info] Start training from score 1.845827
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 50, 'subsample': 0.8}
Best F1 Score: 0.9486857861835238
Validation Accuracy with Tuned Model: 0.92
Validation F1 Score with Tuned Model: 0.95
Test Accuracy with Tuned Model: 0.91
Test F1 Score with Tuned Model: 0.95

Classification Report on Test Data with Tuned Model:
              precision    recall  f1-score   support

           0       0.81      0.43      0.5