# SCORE: 0.7360688602

### Best Random Forest Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 300}
### Best Random Forest ROC-AUC Score: 0.7337449998431756
### Best AdaBoost Parameters: {'learning_rate': 1, 'n_estimators': 200}
### Best AdaBoost ROC-AUC Score: 0.7323571286246053
### CSV 파일 생성 완료: optimized_with_hyperparams_submit.csv

# In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score

# Load the data. The ID column is dropped so it is not used as a feature.
train = pd.read_csv('train.csv').drop(columns=['ID'])
test = pd.read_csv('test.csv').drop(columns=['ID'])

# Split the training table into the feature matrix and the target column.
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# Encode categorical (object-dtype) columns as ordinal codes.
# Categories that appear only in the test set are encoded as -1.
categorical_columns = X.select_dtypes(include='object').columns
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Give missing categorical values their own explicit category BEFORE encoding.
# OrdinalEncoder leaves NaN as NaN by default, so the fillna(0) further down
# would otherwise assign missing values code 0 -- the same code as the first
# real category of every column -- making the two indistinguishable.
MISSING_CATEGORY = '__missing__'
X[categorical_columns] = X[categorical_columns].fillna(MISSING_CATEGORY)
test[categorical_columns] = test[categorical_columns].fillna(MISSING_CATEGORY)

# Fit the encoder on the training features only, then reuse it for the test set.
X[categorical_columns] = ordinal_encoder.fit_transform(X[categorical_columns])
test[categorical_columns] = ordinal_encoder.transform(test[categorical_columns])

# Fill the remaining missing values (non-categorical columns) with 0.
X.fillna(0, inplace=True)
test.fillna(0, inplace=True)

### 1. Random forest hyperparameter tuning
# Candidate values; the grid search evaluates every combination of them.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search scored by ROC-AUC; n_jobs=-1 runs fits in parallel.
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("Best Random Forest Parameters:", rf_grid_search.best_params_)
print("Best Random Forest ROC-AUC Score:", rf_grid_search.best_score_)

# Keep the best model for the ensemble step.
best_rf = rf_grid_search.best_estimator_

### 2. AdaBoost hyperparameter tuning
# Candidate values; the grid search evaluates every combination of them.
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
}

# 5-fold cross-validated search scored by ROC-AUC; n_jobs=-1 runs fits in parallel.
ada_grid_search = GridSearchCV(
    estimator=AdaBoostClassifier(random_state=42),
    param_grid=ada_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
ada_grid_search.fit(X, y)

# Report the winning configuration and its mean cross-validated score.
print("Best AdaBoost Parameters:", ada_grid_search.best_params_)
print("Best AdaBoost ROC-AUC Score:", ada_grid_search.best_score_)

# Keep the best model for the ensemble step.
best_ada = ada_grid_search.best_estimator_

### 3. Ensemble the tuned models (soft voting)
# Soft voting combines the members' predicted class probabilities.
voting_model = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('ada', best_ada),
    ],
    voting='soft',
)

# Fit the ensemble on the full training set.
voting_model.fit(X, y)

# Predict: keep column 1 of predict_proba (the second class in classes_,
# presumably the positive label -- confirm against the target encoding).
pred_proba = voting_model.predict_proba(test)[:, 1]

# Build the submission file. Only the ID column is needed here, so read just
# that column instead of parsing the whole test file a second time.
test_ids = pd.read_csv('test.csv', usecols=['ID'])['ID']
submission = pd.DataFrame({
    'ID': test_ids,
    'probability': pred_proba,
})

submission.to_csv('optimized_with_hyperparams_submit.csv', index=False)

print("CSV 파일 생성 완료: optimized_with_hyperparams_submit.csv")


# Notebook output: KeyboardInterrupt