In [2]:
!pip install lightbgm

ERROR: Could not find a version that satisfies the requirement lightbgm (from versions: none)
ERROR: No matching distribution found for lightbgm
You should consider upgrading via the 'C:\Users\tjddl\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [4]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_predict, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# 1. Load the data. The test IDs are set aside (not discarded) so the
#    submission can reuse them without reading test.csv a second time.
train = pd.read_csv('train.csv').drop(columns=['ID'])
test_raw = pd.read_csv('test.csv')
test_ids = test_raw['ID']
test = test_raw.drop(columns=['ID'])

# 2. Separate the features (X) from the target (y).
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# 3. Impute missing values in, and one-hot encode, the object-dtype columns.
# NOTE(review): the pipeline is fit on every training row before the hold-out
# split in step 4, so validation rows contribute to the imputation modes and
# the encoder categories. Non-object columns are passed through unchanged, so
# any NaN in them reaches the models as-is -- confirm both are intended.
categorical_columns = X.select_dtypes(include='object').columns
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

X_encoded = pipeline.fit_transform(X[categorical_columns])
test_encoded = pipeline.transform(test[categorical_columns])

# Put the remaining (non-categorical) columns in front of the encoded ones.
X_final = np.hstack([X.drop(columns=categorical_columns).values, X_encoded])
test_final = np.hstack([test.drop(columns=categorical_columns).values, test_encoded])

# 4. Hold out 20% for validation. Stratifying on the target keeps the class
#    ratio the same in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Stacking ensemble: LightGBM + random forest, with a LightGBM meta-learner.
base_models = [
    ('lgb', lgb.LGBMClassifier(n_estimators=1000, learning_rate=0.05, random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42))
]

stacking_model = StackingClassifier(estimators=base_models, final_estimator=lgb.LGBMClassifier(random_state=42))

# 6. Hyperparameter tuning of the meta-learner (the final LightGBM model).
param_grid = {
    'final_estimator__n_estimators': [500, 1000, 1500],
    'final_estimator__learning_rate': [0.01, 0.05, 0.1],
    'final_estimator__max_depth': [5, 10, 15]
}

# Only final_estimator parameters are searched, so the base models are the
# same for every candidate. Grid-searching the whole StackingClassifier would
# nevertheless re-train both base models (including their internal
# cross-validation) in each of the 27 x 3 fits. Instead, build the
# meta-learner's training matrix once from out-of-fold base-model
# probabilities -- 5 folds and the positive-class column, matching the
# StackingClassifier defaults for a binary target -- and search on that.
meta_features = np.column_stack([
    cross_val_predict(
        model, X_train, y_train, cv=5, method='predict_proba', n_jobs=-1
    )[:, 1]
    for _, model in base_models
])

# Strip the 'final_estimator__' prefix so the grid applies to the bare meta-learner.
prefix = 'final_estimator__'
meta_param_grid = {name[len(prefix):]: values for name, values in param_grid.items()}

grid_search = GridSearchCV(
    lgb.LGBMClassifier(random_state=42), meta_param_grid,
    cv=3, scoring='roc_auc', n_jobs=-1, verbose=1
)
grid_search.fit(meta_features, y_train)

# Report the best parameters under their StackingClassifier names.
best_params = {prefix + name: value for name, value in grid_search.best_params_.items()}
print("\n최적의 하이퍼파라미터:", best_params)

# 7. Fit the stacked model once with the tuned meta-learner, then predict on
#    the validation data. clone() leaves stacking_model itself unfitted.
best_model = clone(stacking_model).set_params(**best_params)
best_model.fit(X_train, y_train)
y_pred = best_model.predict_proba(X_val)[:, 1]

# Validation ROC AUC.
roc_auc = roc_auc_score(y_val, y_pred)
print("\nROC AUC Score (Validation):", roc_auc)

# Classification report at a fixed probability threshold.
optimal_threshold = 0.5  # can be adjusted from the ROC curve if needed
final_predictions = (y_pred >= optimal_threshold).astype(int)
print("\n검증 데이터에 대한 분류 보고서:")
print(classification_report(y_val, final_predictions))

# 8. Predict on the test data and save the submission.
test_prob_predictions = best_model.predict_proba(test_final)[:, 1]
submission = pd.DataFrame({
    'ID': test_ids,
    '임신 성공 확률': np.round(test_prob_predictions, 3)
})
submission.to_csv('submission_optimized.csv', index=False)
print("\n최적화된 결과가 'submission_optimized.csv'에 저장되었습니다.")


Fitting 3 folds for each of 27 candidates, totalling 81 fits


KeyboardInterrupt: 