In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install numpy==1.26.4 pandas==2.2.2 scikit-learn==1.3.2 catboost==1.2.7 xgboost==2.1.4 lightgbm imbalanced-learn



In [None]:
#  baseline > SCORE: 0.7409664035 from hong
"""
version
!pip install numpy==1.26.4
!pip install pandas==2.2.2
!pip install scikit-learn==1.3.2
!pip install catboost==1.2.7
!pip install xgboost==2.1.4

Ver Check
import numpy as np
import pandas as pd
import sklearn
import catboost
import xgboost as xgb

print("NumPy version:", np.__version__)
print("Pandas version:", pd.__version__)
print("scikit-learn version:", sklearn.__version__)
print("CatBoost version:", catboost.__version__)
print("XGBoost version:", xgb.__version__)

+
pip install imbalanced-learn  > version 0.13.0

"""
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler

#  0. Helper that converts a count label (string) into a number
def convert_count_str(val):
    """Convert a count label such as ``"3회"`` into a float.

    Returns ``0.0`` for missing values and for text that contains no
    digits, ``6.0`` for any text containing ``"회 이상"`` ("... times or
    more"), and otherwise the first run of digits in the text as a float.
    """
    if pd.isna(val):
        return 0.0
    text = str(val).strip()
    # The open-ended top bucket is checked before the generic digit search.
    if "회 이상" in text:
        return 6.0
    digits = re.search(r'(\d+)회?', text)
    return float(digits.group(1)) if digits else 0.0

# Numeric code assigned to each donor-age label.
# '만41-45세' and '알 수 없음' (unknown) both map to 0.
donor_age_mapping = {
    '만20세 이하': 3,
    '만21-25세': 5,
    '만26-30세': 4,
    '만31-35세': 2,
    '만36-40세': 1,
    '만41-45세': 0,
    '알 수 없음': 0,
}

def convert_donor_age(val):
    """Return the code for a donor-age label.

    Surrounding whitespace is ignored. Missing values and labels that are
    not keys of ``donor_age_mapping`` yield ``np.nan``.
    """
    if pd.isna(val):
        return np.nan
    label = str(val).strip()
    if label in donor_age_mapping:
        return donor_age_mapping[label]
    return np.nan

# Replace NaN in the categorical columns with the literal string 'NaN'
def convert_nan_to_string(df, category_columns):
    """Return a copy of ``df`` whose listed columns have NaN filled with 'NaN'.

    Only the columns named in ``category_columns`` are touched; the input
    frame is left unmodified.
    """
    filled = df.copy()
    for name in category_columns:
        column = filled[name]
        filled[name] = column.fillna('NaN')
    return filled

#  1. Load the data and preprocess it
train = pd.read_csv('/content/drive/MyDrive/aimers/data/train.csv').drop(columns=['ID'])
test = pd.read_csv('/content/drive/MyDrive/aimers/data/test.csv').drop(columns=['ID'])

# Load the feature-weight table and turn it into a nested dict:
# {weight column -> {value of "데이터 항목" -> weight}}
weight_data = pd.read_csv('/content/drive/MyDrive/aimers/data/og_weighted_hong.csv')
weight_dict = weight_data.set_index("데이터 항목").to_dict()

# Code for each age-at-treatment label; '알 수 없음' (unknown) becomes NaN,
# and any label not listed here falls back to 0 in the lookup below.
age_mapping = {
    '만18-34세': 5, '만35-37세': 4, '만38-39세': 3, '만40-42세': 2, '만43-44세': 1, '만45-50세': 0, '알 수 없음': np.nan
}

# Columns holding count labels that convert_count_str turns into floats
count_columns = ["총 임신 횟수", "총 출산 횟수", "총 시술 횟수", "IVF 시술 횟수", "DI 시술 횟수", "클리닉 내 총 시술 횟수"]

# The same in-place transformations are applied to train and then to test.
for frame in (train, test):
    age_labels = frame['시술 당시 나이'].apply(lambda x: str(x).strip())

    # Flag rows whose age is recorded as unknown, before the label is recoded
    frame['시술 당시 나이_missing'] = age_labels.apply(lambda label: 1.0 if label == '알 수 없음' else 0.0)

    # Recode the age label
    frame['시술 당시 나이'] = age_labels.apply(lambda label: float(age_mapping.get(label, 0)))

    # Convert the count columns
    for col in count_columns:
        frame[col] = frame[col].astype(str).apply(convert_count_str)

    # Convert the egg / sperm donor age labels
    for donor_col in ('난자 기증자 나이', '정자 기증자 나이'):
        frame[donor_col] = frame[donor_col].astype(str).apply(convert_donor_age)

#  2. Feature-weighting helper
def apply_feature_weights(X, weight_dict):
    """Return a copy of ``X`` with its weighted columns scaled.

    Every column of ``X`` that is a key of ``weight_dict["IVF"]`` is
    multiplied by the matching value. Other columns are copied unchanged,
    and ``X`` itself is not modified.
    """
    scaled = X.copy()
    weighted_names = [name for name in X.columns if name in weight_dict["IVF"]]
    for name in weighted_names:
        scaled[name] *= weight_dict["IVF"][name]  # apply the IVF weight
    return scaled

# Find the positional indices of the categorical columns
def get_categorical_feature_indices(df):
    """Return the positions of the columns of ``df`` whose dtype is 'category'."""
    return [
        position
        for position, dtype in enumerate(df.dtypes)
        if dtype == 'category'
    ]

# Split features from the target and apply the feature weights
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

X_weighted = apply_feature_weights(X, weight_dict)
X_test_weighted = apply_feature_weights(test, weight_dict)

#  3. Class-imbalance handling (based on the target '임신 성공 여부')
# Random under-sampling with a target ratio of 0.5 and a fixed seed.
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_weighted, y)

#  4. Dtype conversion
category_columns = [
    "시술 시기 코드", "시술 유형", "특정 시술 유형", "배란 유도 유형",
    "배아 생성 주요 이유", "IVF 임신 횟수", "DI 임신 횟수",
    "IVF 출산 횟수", "DI 출산 횟수", "난자 출처", "정자 출처"
]

# Replace NaN with the string 'NaN' so that missing values form a category
X_resampled = convert_nan_to_string(X_resampled, category_columns)
X_test_weighted = convert_nan_to_string(X_test_weighted, category_columns)

# Train and test must share ONE categorical dtype per column. Casting each
# frame to "category" on its own derives the categories from that frame's
# values only, so the same label can receive a different integer code in
# train and test whenever their vocabularies differ. A model that consumes
# the raw codes would then silently misread the test features.
# The shared dtype is built from the union of both frames' values; only
# feature labels are used here, never the target.
for col in category_columns:
    combined = pd.concat([X_resampled[col], X_test_weighted[col]], ignore_index=True)
    shared_dtype = pd.CategoricalDtype(categories=combined.astype("category").cat.categories)
    X_resampled[col] = X_resampled[col].astype(shared_dtype)
    X_test_weighted[col] = X_test_weighted[col].astype(shared_dtype)

#  5. Model training
# Stratified 80/20 split of the under-sampled data into train / validation
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Stacked ensemble: three gradient-boosting base models feeding a scaled
# logistic-regression meta-learner.
stack_clf = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(
            # XGBoost >= 2.0 deprecates tree_method='gpu_hist' and emits a
            # warning on every fit/predict. GPU training is requested with
            # the 'hist' method plus device='cuda'.
            tree_method='hist',
            device='cuda',
            enable_categorical=True,
            n_estimators=100,
            max_depth=6
        )),

        ('lgbm', LGBMClassifier(
            n_jobs=-2,
            verbose=0,
            n_estimators=100
        )),

        ('cat', CatBoostClassifier(
            task_type='GPU',
            verbose=0,
            iterations=100,
            # Positional indices of the 'category'-dtype columns
            cat_features=get_categorical_feature_indices(X_resampled)
        ))
        # ExtraTreesClassifier removed to reduce model complexity
    ],
    final_estimator=Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(max_iter=1000))
    ]),
    cv=3,
    n_jobs=-2
)

# Grid search over the meta-learner's regularisation strength only;
# candidates are scored by ROC AUC with 3-fold cross-validation.
grid_search = GridSearchCV(
    stack_clf,
    {'final_estimator__lr__C': [0.1, 1.0, 10.0]},
    scoring='roc_auc',
    cv=3,
    n_jobs=-2
)


# Fit the grid search, then keep the refitted best stack
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Score the hold-out split using the positive-class probability (column 1)
y_val_pred = best_model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f"Validation ROC AUC: {roc_auc:.5f}")


#  6. Final prediction and submission file
pred_proba = best_model.predict_proba(X_test_weighted)[:, 1]
# The ID column was dropped when the CSV was loaded, so IDs are rebuilt
# from the row position as TEST_00000, TEST_00001, ...
test_ids = [f"TEST_{i:05d}" for i in range(len(test))]
submission = pd.DataFrame({'ID': test_ids, 'probability': pred_proba})
submission.to_csv('/content/drive/MyDrive/aimers/submit/baseline_v2_submit.csv', index=False)
print("제출 파일 생성 완료")

# Validation ROC AUC: 0.74261

"""  baseline 수정목록

train, test , submit 파일 경로 수정



('xgb', XGBClassifier(
            tree_method='hist',
            device='cuda',
            enable_categorical=True,
            n_estimators=100,  # 트리 개수 감소
            max_depth=6       # 트리 깊이 제한
        )),

↓       ↓       ↓

('xgb', XGBClassifier(
            tree_method='gpu_hist',
            enable_categorical=True,
            n_estimators=100,
            max_depth=6
        )),

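xgb학습시 gpu 사용 (gpu_hist),  device = 'cuda' 는 필요 x
(NOTE: XGBoost >= 2.0 deprecates tree_method='gpu_hist'; the warning repeated in this cell's output says so. The supported form is tree_method='hist' together with device='cuda'.)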






n_jobs = 1

↓

n_jobs = -2

병렬처리 제한 약 80% 사용



age_mapping = {
    '만18-34세': 0, '만35-37세': 1, '만38-39세': 2, '만40-42세': 3, '만43-44세': 4, '만45-50세': 5, '알 수 없음': np.nan
}


↓

age_mapping = {
    '만18-34세': 5, '만35-37세': 4, '만38-39세': 3, '만40-42세': 2, '만43-44세': 1, '만45-50세': 0, '알 수 없음': np.nan
}

가중치 적용 수정





donor_age_mapping = {
    '만20세 이하': 0, '만21-25세': 1, '만26-30세': 2, '만31-35세': 3,
    '만36-40세': 4, '만41-45세': 5, '알 수 없음': 0
}

↓

donor_age_mapping = {
    '만20세 이하': 3, '만21-25세': 5, '만26-30세': 4, '만31-35세': 2,
    '만36-40세': 1, '만41-45세': 0, '알 수 없음': 0
}
가중치 적용 수정
"""


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"






    E.g. tree_method = "hist", device = "cuda"



Validation ROC AUC: 0.74261
제출 파일 생성 완료


"  baseline 수정목록\n\ntrain, test , submit 파일 경로 수정\n\n\n\n('xgb', XGBClassifier(\n            tree_method='hist',\n            device='cuda',\n            enable_categorical=True,\n            n_estimators=100,  # 트리 개수 감소\n            max_depth=6       # 트리 깊이 제한\n        )),\n\n↓       ↓       ↓\n\n('xgb', XGBClassifier(\n            tree_method='gpu_hist',\n            enable_categorical=True,\n            n_estimators=100,\n            max_depth=6\n        )),\n\nxgb학습시 gpu 사용 (gpu_hist),  device = 'cuda' 는 필요 x\n\n\n\n\n\n\nn_jobs = 1\n\n↓\n\nn_jobs = -2\n\n병렬처리 제한 약 80% 사용\n\n\n\nage_mapping = {\n    '만18-34세': 0, '만35-37세': 1, '만38-39세': 2, '만40-42세': 3, '만43-44세': 4, '만45-50세': 5, '알 수 없음': np.nan\n}\n\n\n↓\n\nage_mapping = {\n    '만18-34세': 5, '만35-37세': 4, '만38-39세': 3, '만40-42세': 2, '만43-44세': 1, '만45-50세': 0, '알 수 없음': np.nan\n}\n\n가중치 적용 수정\n\n\n\n\n\ndonor_age_mapping = {\n    '만20세 이하': 0, '만21-25세': 1, '만26-30세': 2, '만31-35세': 3,\n    '만36-40세': 4, '만41-45세': 5, '알 수 없음'