In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # IterativeImputer 활성화
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import lightgbm as lgb
# Load the train/test CSVs; an 'ID' column is discarded when present.
train_path = "./train.csv"  # adjust to your environment (e.g. Google Colab)
test_path = "./test.csv"
df_train, df_test = (
    pd.read_csv(csv_path).drop(columns=['ID'], errors='ignore')
    for csv_path in (train_path, test_path)
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
# Missing-value handling: keep only the rows whose target label is present.
target_col = '임신 성공 여부'
has_label = df_train[target_col].notna()
df_train = df_train.loc[has_label]

In [3]:
# Split the training frame into features (X) and target (y).
target_col = '임신 성공 여부'
X_train = df_train.drop(columns=[target_col], errors='ignore')
# Rows with a missing target were already dropped from df_train in an earlier
# cell, so the label is taken as-is. Filling it with the mode would be dead
# code at best and would fabricate labels at worst.
y_train = df_train[target_col].copy()
assert y_train.notna().all(), "target column still contains NaN; drop those rows first"
X_test = df_test.copy()

In [4]:
# Keep only the top 20 most important features.
important_features = [
    "시술 시기 코드", "총 생성 배아 수", "수집된 신선 난자 수", "시술 당시 나이", "배아 이식 경과일",
    "이식된 배아 수", "총 시술 횟수", "해동된 배아 수", "총 임신 횟수", "배아 생성 주요 이유",
    "미세주입된 난자 수", "IVF 임신 횟수", "난자 채취 경과일", "배란 유도 유형", "정자 출처",
    "총 출산 횟수", "DI 임신 횟수", "IVF 시술 횟수", "남성 주 불임 원인", "불임 원인 - 배란 장애"
]
# .copy() detaches the column selections from their parent frames, so the
# column assignments done in later cells do not raise SettingWithCopyWarning.
X_train = X_train[important_features].copy()
X_test = X_test[important_features].copy()

In [5]:
# Ordinal-encode the categorical (non-numeric) feature columns.
# Selecting "not numeric" rather than `dtype == 'object'` also catches pandas
# string/category dtypes, which would otherwise reach the imputer un-encoded.
categorical_columns = [
    col for col in important_features
    if not pd.api.types.is_numeric_dtype(X_train[col])
]
# Categories that appear only at transform time are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
if categorical_columns:  # skip the encoder entirely when there is nothing to encode
    X_train[categorical_columns] = ordinal_encoder.fit_transform(X_train[categorical_columns])
    X_test[categorical_columns] = ordinal_encoder.transform(X_test[categorical_columns])

In [6]:
# Missing-value handling: fit IterativeImputer on the training features only,
# then apply the fitted imputer to the test features.
iterative_imputer = IterativeImputer(random_state=42)
# The imputer hands back a plain array by default. Re-attach the original
# index so the rows stay label-aligned with y_train, whose index may have gaps
# where rows with a missing target were dropped, instead of being renumbered
# 0..n-1.
X_train = pd.DataFrame(
    iterative_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    iterative_imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)



# 모델 학습
lightGBM


In [9]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import numpy as np

In [11]:
# Example base learners: a 100-tree random forest and a single decision tree,
# both without a depth limit and with a fixed seed.
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
dt = DecisionTreeClassifier(max_depth=None, random_state=42)

# Soft-voting and hard-voting ensembles over the same two base learners.
voting_clf_soft = VotingClassifier(estimators=[('rf', rf), ('dt', dt)], voting='soft')
voting_clf_hard = VotingClassifier(estimators=[('rf', rf), ('dt', dt)], voting='hard')

# Fit every model on the full training set: stand-alone learners first, then
# the ensembles. The final fit stays a bare statement so the cell still
# displays the fitted hard-voting estimator.
for estimator in (rf, dt, voting_clf_soft):
    estimator.fit(X_train, y_train)
voting_clf_hard.fit(X_train, y_train)


In [15]:
# Test-set predictions: take column index 1 of predict_proba for each model
# (presumably the positive class; confirm against the fitted classes_).
rf_y_pred_proba, dt_y_pred_proba, voting_clf_soft_y_pred_proba = (
    clf.predict_proba(X_test)[:, 1]
    for clf in (rf, dt, voting_clf_soft)
)
# voting_clf_hard is left out on purpose: its predict_proba call was
# commented out in the original cell.

In [16]:
# Write one submission file per model. Each file is a fresh copy of the sample
# submission with its 'probability' column replaced by that model's predictions.
submission_template = pd.read_csv('./sample_submission.csv')
submission_outputs = {
    './baseline_submit6.csv': rf_y_pred_proba,
    './baseline_submit7.csv': dt_y_pred_proba,
    # Fix: this file used to re-write the random-forest probabilities, which
    # made it a duplicate of baseline_submit6.csv and left the soft-voting
    # predictions unused.
    './baseline_submit8.csv': voting_clf_soft_y_pred_proba,
}
for output_path, probabilities in submission_outputs.items():
    sample_submission = submission_template.copy()
    sample_submission['probability'] = probabilities
    sample_submission.to_csv(output_path, index=False)

