In [1]:
# Log in to your W&B account
import wandb
import random
import math

In [5]:
import wandb
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier

# 1차

In [33]:

# 1. wandb 로그인 (API key를 이용)
wandb.login(key="8bab4affd59dd33bd76096b069d74f6dc0abb056")

# 2. 데이터 불러오기 및 전처리
train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')  # 필요시 불러오기

# ID 컬럼 제거 (예시로 'ID', 'id', 'RecordID', 'patient_id' 등)
for col in ['id', 'ID', 'RecordID', 'patient_id']:
    if col in train_df.columns:
        train_df = train_df.drop(col, axis=1)
    # if col in test_df.columns:
    #     test_df = test_df.drop(col, axis=1)

# 타깃과 피처 분리 (타깃 컬럼명이 '임신 성공 여부'라고 가정)
target = '임신 성공 여부'
X = train_df.drop(target, axis=1)
y = train_df[target]

# 범주형, 수치형 변수 지정
# (아래 목록은 예시이므로 실제 데이터에 맞게 수정하세요)
categorical_columns = [
    '시술 시기 코드', '시술 당시 나이', '시술 유형', '특정 시술 유형',  # "특정 시술 유형"은 문자열이므로 반드시 범주형에 포함!
    '배란 자극 여부', '배란 유도 유형', '단일 배아 이식 여부',
    '착상 전 유전 검사 사용 여부', '착상 전 유전 진단 사용 여부',
    '남성 주 불임 원인', '남성 부 불임 원인', '여성 주 불임 원인',
    '여성 부 불임 원인', '부부 주 불임 원인', '부부 부 불임 원인',
    '불명확 불임 원인', '불임 원인 - 난관 질환', '불임 원인 - 남성 요인',
    '불임 원인 - 배란 장애', '불임 원인 - 여성 요인', '불임 원인 - 자궁경부 문제',
    '불임 원인 - 자궁내막증', '불임 원인 - 정자 농도', '불임 원인 - 정자 면역학적 요인',
    '불임 원인 - 정자 운동성', '불임 원인 - 정자 형태', '배아 생성 주요 이유',
    '총 시술 횟수', '클리닉 내 총 시술 횟수', 'IVF 시술 횟수', 'DI 시술 횟수',
    '총 임신 횟수', 'IVF 임신 횟수', 'DI 임신 횟수', '총 출산 횟수',
    'IVF 출산 횟수', 'DI 출산 횟수', '난자 출처', '정자 출처',
    '난자 기증자 나이', '정자 기증자 나이', '동결 배아 사용 여부',
    '신선 배아 사용 여부', '기증 배아 사용 여부', '대리모 여부',
    'PGD 시술 여부', 'PGS 시술 여부'
]

numerical_columns = [col for col in X.columns if col not in categorical_columns]

# 전처리 파이프라인
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

# 3. wandb 스윕(run)을 위한 함수 정의
def run():
    # 각 run마다 wandb 초기화 (sweep 에이전트가 호출)
    wandb.init(project="LGaimers", entity="espada105-hanseouniversity")
    config = wandb.config

    # 모델 생성: wandb.config에서 하이퍼파라미터 값을 가져옴
    model = LGBMClassifier(
        n_estimators=int(config.n_estimators),
        learning_rate=config.learning_rate,
        max_depth=int(config.max_depth),
        random_state=42
    )

    # 파이프라인 구성 (전처리 + 분류기)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # 교차 검증 (5-fold)로 성능 평가
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring='roc_auc')
    mean_score = np.mean(scores)
    
    # 결과 로깅
    wandb.log({'roc_auc': mean_score})
    print(f"Mean ROC AUC: {mean_score:.4f}")
    wandb.finish()

# 4. 스윕 설정 구성
sweep_config = {
    'method': 'grid',  # 'grid', 'random', 'bayes'
    'metric': {
        'name': 'roc_auc',
        'goal': 'maximize'
    },
    'parameters': {
        'n_estimators': {
            'values': [100, 200]
        },
        'learning_rate': {
            'values': [0.01, 0.05, 0.1]
        },
        'max_depth': {
            'values': [5, 7, 10]
        }
    }
}

# 스윕 생성 (sweep_id 반환)
sweep_id = wandb.sweep(sweep_config, project="LGaimers", entity="espada105-hanseouniversity")
print("Sweep ID:", sweep_id)

# 5. 에이전트를 사용해 스윕 실행 (예: 10번의 실험)
wandb.agent(sweep_id, function=run, count=20)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\tjddl\_netrc


Create sweep with ID: d6kdsk99
Sweep URL: https://wandb.ai/espada105-hanseouniversity/LGaimers/sweeps/d6kdsk99
Sweep ID: d6kdsk99


[34m[1mwandb[0m: Agent Starting Run: rpir7gr0 with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7293




0,1
roc_auc,▁

0,1
roc_auc,0.72932


[34m[1mwandb[0m: Agent Starting Run: 93yvj83k with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7328




0,1
roc_auc,▁

0,1
roc_auc,0.7328


[34m[1mwandb[0m: Agent Starting Run: 5l0i07kq with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7331




0,1
roc_auc,▁

0,1
roc_auc,0.73309


[34m[1mwandb[0m: Agent Starting Run: ab7iq93p with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7357




0,1
roc_auc,▁

0,1
roc_auc,0.73567


[34m[1mwandb[0m: Agent Starting Run: 6tg7gk1s with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7331




0,1
roc_auc,▁

0,1
roc_auc,0.7331


[34m[1mwandb[0m: Agent Starting Run: bb85zo02 with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7358




0,1
roc_auc,▁

0,1
roc_auc,0.7358


[34m[1mwandb[0m: Agent Starting Run: mvb8aip9 with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7370




0,1
roc_auc,▁

0,1
roc_auc,0.73703


[34m[1mwandb[0m: Agent Starting Run: r7ybkim1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7389




0,1
roc_auc,▁

0,1
roc_auc,0.73888


[34m[1mwandb[0m: Agent Starting Run: fdolufym with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7389




0,1
roc_auc,▁

0,1
roc_auc,0.73889


[34m[1mwandb[0m: Agent Starting Run: kswllc0v with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7395




0,1
roc_auc,▁

0,1
roc_auc,0.7395


[34m[1mwandb[0m: Agent Starting Run: x5x2rkon with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7391




0,1
roc_auc,▁

0,1
roc_auc,0.73912


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: yxbgakch with config:
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7396




0,1
roc_auc,▁

0,1
roc_auc,0.73957


[34m[1mwandb[0m: Agent Starting Run: wndxr4uf with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7388




0,1
roc_auc,▁

0,1
roc_auc,0.73879


[34m[1mwandb[0m: Agent Starting Run: 99jclt1r with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7389




0,1
roc_auc,▁

0,1
roc_auc,0.73887


[34m[1mwandb[0m: Agent Starting Run: vg76vuex with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7392




0,1
roc_auc,▁

0,1
roc_auc,0.73921


[34m[1mwandb[0m: Agent Starting Run: h15mbceg with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7388




0,1
roc_auc,▁

0,1
roc_auc,0.73878


[34m[1mwandb[0m: Agent Starting Run: u20ullj1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 100




Mean ROC AUC: 0.7394




0,1
roc_auc,▁

0,1
roc_auc,0.73938


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8ea6inl9 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 200




Mean ROC AUC: 0.7388




0,1
roc_auc,▁

0,1
roc_auc,0.73876


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# 2차

In [3]:
import pandas as pd
import numpy as np
import wandb

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from category_encoders import TargetEncoder
from lightgbm import early_stopping, log_evaluation

# -----------------------------
# 1. wandb 로그인 및 스윕 설정
# -----------------------------
wandb.login(key="8bab4affd59dd33bd76096b069d74f6dc0abb056")

# 스윕 설정 (여기서는 그리드 서치를 예시로 함)
sweep_config = {
    'method': 'grid',  # 'grid', 'random', 'bayes' 중 선택
    'metric': {
        'name': 'final_validation_accuracy',
        'goal': 'maximize'
    },
    'parameters': {
        'n_estimators': {'values': [1000, 1500]},
        'learning_rate': {'values': [0.01, 0.005]},
        'max_depth': {'values': [10, 12]},
        # 나머지 고정 파라미터는 'value'로 지정합니다.
        'stopping_rounds': {'value': 100},
        'test_size': {'value': 0.2},
        'random_state': {'value': 42}
    }
}

sweep_id = wandb.sweep(sweep_config, project="LGaimers", entity="espada105-hanseouniversity")
print("Sweep ID:", sweep_id)

# -----------------------------
# 2. run() 함수 정의: 하이퍼파라미터에 따른 모델 학습 및 평가
# -----------------------------
def run():
    # 각 run마다 wandb 초기화 및 config 로드
    wandb.init(project="LGaimers", entity="espada105-hanseouniversity")
    config = wandb.config

    # --- 데이터 불러오기 및 전처리 ---
    # 학습 데이터 불러오기 (ID 컬럼 제거)
    train_df = pd.read_csv('train.csv')
    train_df = train_df.drop(columns=['ID'])
    
    # 타깃과 피처 분리 (타깃 컬럼명이 '임신 성공 여부'라고 가정)
    X = train_df.drop('임신 성공 여부', axis=1)
    y = train_df['임신 성공 여부']
    
    # 범주형 변수 목록 (실제 데이터에 맞게 수정)
    categorical_columns = [
        "시술 시기 코드", "시술 당시 나이", "시술 유형", "특정 시술 유형", "배란 자극 여부",
        "배란 유도 유형", "단일 배아 이식 여부", "착상 전 유전 검사 사용 여부", "착상 전 유전 진단 사용 여부",
        "남성 주 불임 원인", "남성 부 불임 원인", "여성 주 불임 원인", "여성 부 불임 원인", "부부 주 불임 원인",
        "부부 부 불임 원인", "불명확 불임 원인", "불임 원인 - 난관 질환", "불임 원인 - 남성 요인",
        "불임 원인 - 배란 장애", "불임 원인 - 여성 요인", "불임 원인 - 자궁경부 문제",
        "불임 원인 - 자궁내막증", "불임 원인 - 정자 농도", "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성", "불임 원인 - 정자 형태", "배아 생성 주요 이유", "총 시술 횟수",
        "클리닉 내 총 시술 횟수", "IVF 시술 횟수", "DI 시술 횟수", "총 임신 횟수", "IVF 임신 횟수",
        "DI 임신 횟수", "총 출산 횟수", "IVF 출산 횟수", "DI 출산 횟수", "난자 출처", "정자 출처",
        "난자 기증자 나이", "정자 기증자 나이", "동결 배아 사용 여부", "신선 배아 사용 여부",
        "기증 배아 사용 여부", "대리모 여부", "PGD 시술 여부", "PGS 시술 여부"
    ]
    numerical_columns = [col for col in X.columns if col not in categorical_columns]
    
    # TargetEncoder를 이용하여 범주형 변수 인코딩
    encoder = TargetEncoder(cols=categorical_columns)
    X_encoded = encoder.fit_transform(X, y)
    
    # train/validation 세트 분할 (config에 설정된 test_size와 random_state 사용)
    X_train, X_val, y_train, y_val = train_test_split(
        X_encoded, y, test_size=config.test_size, random_state=config.random_state
    )
    
    # --- 초기 모델 학습 및 피처 선택 ---
    base_model = lgb.LGBMClassifier(
        n_estimators=config.n_estimators,
        learning_rate=config.learning_rate,
        max_depth=config.max_depth,
        random_state=config.random_state
    )
    callbacks = [early_stopping(stopping_rounds=config.stopping_rounds, verbose=True)]
    
    base_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=callbacks)
    
    # SelectFromModel을 이용해 피처 선택 (중간값(threshold='median') 기준)
    selector = SelectFromModel(base_model, prefit=True, threshold='median')
    X_train_selected = selector.transform(X_train)
    X_val_selected = selector.transform(X_val)
    
    # --- 최종 모델 학습 ---
    final_model = lgb.LGBMClassifier(
        n_estimators=config.n_estimators,
        learning_rate=config.learning_rate,
        max_depth=config.max_depth,
        random_state=config.random_state
    )
    callbacks_final = [
        early_stopping(stopping_rounds=config.stopping_rounds, verbose=True),
        log_evaluation(10)
    ]
    final_model.fit(X_train_selected, y_train, eval_set=[(X_val_selected, y_val)], callbacks=callbacks_final)
    
    # 검증 데이터에 대한 예측 및 평가
    from sklearn.metrics import roc_auc_score

    # 최종 모델 예측 확률 (양성 클래스에 대한 확률)
    y_pred_proba_final = final_model.predict_proba(X_val_selected)[:, 1]
    roc_auc = roc_auc_score(y_val, y_pred_proba_final)
    print(f"최종 모델 ROC AUC: {roc_auc:.6f}")
    wandb.log({"final_validation_roc_auc": roc_auc})

    wandb.finish()

# -----------------------------
# 3. wandb 에이전트를 이용해 스윕 실행
# -----------------------------
wandb.agent(sweep_id, function=run, count=10)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\tjddl\_netrc


Create sweep with ID: dem4d8yd
Sweep URL: https://wandb.ai/espada105-hanseouniversity/LGaimers/sweeps/dem4d8yd
Sweep ID: dem4d8yd


[34m[1mwandb[0m: Agent Starting Run: wqdcqzr7 with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 1000
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[949]	valid_0's binary_logloss: 0.485906




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.557659
[20]	valid_0's binary_logloss: 0.548339
[30]	valid_0's binary_logloss: 0.540488
[40]	valid_0's binary_logloss: 0.533832
[50]	valid_0's binary_logloss: 0.528117
[60]	valid_0's binary_logloss: 0.523202
[70]	valid_0's binary_logloss: 0.518968
[80]	valid_0's binary_logloss: 0.515291
[90]	valid_0's binary_logloss: 0.512077
[100]	valid_0's binary_logloss: 0.509274
[110]	valid_0's binary_logloss: 0.506829
[120]	valid_0's binary_logloss: 0.504677
[130]	valid_0's binary_logloss: 0.502781
[140]	valid_0's binary_logloss: 0.501087
[150]	valid_0's binary_logloss: 0.499581
[160]	valid_0's binary_logloss: 0.498259
[170]	valid_0's binary_logloss: 0.49708
[180]	valid_0's binary_logloss: 0.496036
[190]	valid_0's binary_logloss: 0.495096
[200]	valid_0's binary_logloss: 0.494264
[210]	valid_0's binary_logloss: 0.493512
[220]	valid_0's binary_logloss: 0.492837
[230]	valid_0's binary_logloss: 0.492232
[240]



최종 모델 ROC AUC: 0.739263


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73926


[34m[1mwandb[0m: Agent Starting Run: m0zvgkdl with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 1500
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1031]	valid_0's binary_logloss: 0.485904




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.557659
[20]	valid_0's binary_logloss: 0.548339
[30]	valid_0's binary_logloss: 0.540488
[40]	valid_0's binary_logloss: 0.533832
[50]	valid_0's binary_logloss: 0.528117
[60]	valid_0's binary_logloss: 0.523202
[70]	valid_0's binary_logloss: 0.518968
[80]	valid_0's binary_logloss: 0.515291
[90]	valid_0's binary_logloss: 0.512077
[100]	valid_0's binary_logloss: 0.509274
[110]	valid_0's binary_logloss: 0.506829
[120]	valid_0's binary_logloss: 0.504677
[130]	valid_0's binary_logloss: 0.502781
[140]	valid_0's binary_logloss: 0.501087
[150]	valid_0's binary_logloss: 0.499581
[160]	valid_0's binary_logloss: 0.498259
[170]	valid_0's binary_logloss: 0.49708
[180]	valid_0's binary_logloss: 0.496036
[190]	valid_0's binary_logloss: 0.495096
[200]	valid_0's binary_logloss: 0.494264
[210]	valid_0's binary_logloss: 0.493512
[220]	valid_0's binary_logloss: 0.492837
[230]	valid_0's binary_logloss: 0.492232
[240]



최종 모델 ROC AUC: 0.739302


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.7393


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: pxib7dfe with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	n_estimators: 1000
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[933]	valid_0's binary_logloss: 0.485964




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.557659
[20]	valid_0's binary_logloss: 0.548339
[30]	valid_0's binary_logloss: 0.540488
[40]	valid_0's binary_logloss: 0.533832
[50]	valid_0's binary_logloss: 0.528117
[60]	valid_0's binary_logloss: 0.523202
[70]	valid_0's binary_logloss: 0.518968
[80]	valid_0's binary_logloss: 0.515291
[90]	valid_0's binary_logloss: 0.512077
[100]	valid_0's binary_logloss: 0.509274
[110]	valid_0's binary_logloss: 0.506829
[120]	valid_0's binary_logloss: 0.504677
[130]	valid_0's binary_logloss: 0.502778
[140]	valid_0's binary_logloss: 0.50108
[150]	valid_0's binary_logloss: 0.499587
[160]	valid_0's binary_logloss: 0.498268
[170]	valid_0's binary_logloss: 0.497097
[180]	valid_0's binary_logloss: 0.496044
[190]	valid_0's binary_logloss: 0.495109
[200]	valid_0's binary_logloss: 0.494266
[210]	valid_0's binary_logloss: 0.493519
[220]	valid_0's binary_logloss: 0.492849
[230]	valid_0's binary_logloss: 0.492245
[240]



최종 모델 ROC AUC: 0.739165


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73916


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fk2v853o with config:
[34m[1mwandb[0m: 	learning_rate: 0.01
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	n_estimators: 1500
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[933]	valid_0's binary_logloss: 0.485964




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.557659
[20]	valid_0's binary_logloss: 0.548339
[30]	valid_0's binary_logloss: 0.540488
[40]	valid_0's binary_logloss: 0.533832
[50]	valid_0's binary_logloss: 0.528117
[60]	valid_0's binary_logloss: 0.523202
[70]	valid_0's binary_logloss: 0.518968
[80]	valid_0's binary_logloss: 0.515291
[90]	valid_0's binary_logloss: 0.512077
[100]	valid_0's binary_logloss: 0.509274
[110]	valid_0's binary_logloss: 0.506829
[120]	valid_0's binary_logloss: 0.504677
[130]	valid_0's binary_logloss: 0.502778
[140]	valid_0's binary_logloss: 0.50108
[150]	valid_0's binary_logloss: 0.499587
[160]	valid_0's binary_logloss: 0.498268
[170]	valid_0's binary_logloss: 0.497097
[180]	valid_0's binary_logloss: 0.496044
[190]	valid_0's binary_logloss: 0.495109
[200]	valid_0's binary_logloss: 0.494266
[210]	valid_0's binary_logloss: 0.493519
[220]	valid_0's binary_logloss: 0.492849
[230]	valid_0's binary_logloss: 0.492245
[240]



최종 모델 ROC AUC: 0.739166


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73917


[34m[1mwandb[0m: Agent Starting Run: z4udrb5u with config:
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 1000
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.486587




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.563006
[20]	valid_0's binary_logloss: 0.557681
[30]	valid_0's binary_logloss: 0.552819
[40]	valid_0's binary_logloss: 0.548374
[50]	valid_0's binary_logloss: 0.544288
[60]	valid_0's binary_logloss: 0.540528
[70]	valid_0's binary_logloss: 0.537068
[80]	valid_0's binary_logloss: 0.533871
[90]	valid_0's binary_logloss: 0.530909
[100]	valid_0's binary_logloss: 0.528158
[110]	valid_0's binary_logloss: 0.52561
[120]	valid_0's binary_logloss: 0.523247
[130]	valid_0's binary_logloss: 0.521057
[140]	valid_0's binary_logloss: 0.51901
[150]	valid_0's binary_logloss: 0.517108
[160]	valid_0's binary_logloss: 0.515333
[170]	valid_0's binary_logloss: 0.513672
[180]	valid_0's binary_logloss: 0.512112
[190]	valid_0's binary_logloss: 0.510662
[200]	valid_0's binary_logloss: 0.509307
[210]	valid_0's binary_logloss: 0.508043
[220]	valid_0's binary_logloss: 0.506855
[230]	valid_0's binary_logloss: 0.505748
[240]	



최종 모델 ROC AUC: 0.738311


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73831


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: guvne2fe with config:
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 1500
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1500]	valid_0's binary_logloss: 0.486005




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.563007
[20]	valid_0's binary_logloss: 0.55768
[30]	valid_0's binary_logloss: 0.552818
[40]	valid_0's binary_logloss: 0.548375
[50]	valid_0's binary_logloss: 0.54429
[60]	valid_0's binary_logloss: 0.540532
[70]	valid_0's binary_logloss: 0.537071
[80]	valid_0's binary_logloss: 0.533873
[90]	valid_0's binary_logloss: 0.530912
[100]	valid_0's binary_logloss: 0.528159
[110]	valid_0's binary_logloss: 0.525613
[120]	valid_0's binary_logloss: 0.523246
[130]	valid_0's binary_logloss: 0.521058
[140]	valid_0's binary_logloss: 0.519009
[150]	valid_0's binary_logloss: 0.517108
[160]	valid_0's binary_logloss: 0.515336
[170]	valid_0's binary_logloss: 0.513676
[180]	valid_0's binary_logloss: 0.512116
[190]	valid_0's binary_logloss: 0.510663
[200]	valid_0's binary_logloss: 0.509309
[210]	valid_0's binary_logloss: 0.508044
[220]	valid_0's binary_logloss: 0.506855
[230]	valid_0's binary_logloss: 0.505747
[240]	



최종 모델 ROC AUC: 0.738932


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73893


[34m[1mwandb[0m: Agent Starting Run: 6b3xh08i with config:
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	n_estimators: 1000
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.486565




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.563006
[20]	valid_0's binary_logloss: 0.557681
[30]	valid_0's binary_logloss: 0.552819
[40]	valid_0's binary_logloss: 0.548374
[50]	valid_0's binary_logloss: 0.544288
[60]	valid_0's binary_logloss: 0.540528
[70]	valid_0's binary_logloss: 0.537068
[80]	valid_0's binary_logloss: 0.533871
[90]	valid_0's binary_logloss: 0.530909
[100]	valid_0's binary_logloss: 0.528158
[110]	valid_0's binary_logloss: 0.52561
[120]	valid_0's binary_logloss: 0.523247
[130]	valid_0's binary_logloss: 0.521057
[140]	valid_0's binary_logloss: 0.51901
[150]	valid_0's binary_logloss: 0.517108
[160]	valid_0's binary_logloss: 0.515333
[170]	valid_0's binary_logloss: 0.513672
[180]	valid_0's binary_logloss: 0.512112
[190]	valid_0's binary_logloss: 0.510662
[200]	valid_0's binary_logloss: 0.509307
[210]	valid_0's binary_logloss: 0.508043
[220]	valid_0's binary_logloss: 0.506855
[230]	valid_0's binary_logloss: 0.505747
[240]	



최종 모델 ROC AUC: 0.738330


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73833


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: za53ctwq with config:
[34m[1mwandb[0m: 	learning_rate: 0.005
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	n_estimators: 1500
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	stopping_rounds: 100
[34m[1mwandb[0m: 	test_size: 0.2


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1499]	valid_0's binary_logloss: 0.486014




Training until validation scores don't improve for 100 rounds
[10]	valid_0's binary_logloss: 0.563006
[20]	valid_0's binary_logloss: 0.557681
[30]	valid_0's binary_logloss: 0.55282
[40]	valid_0's binary_logloss: 0.548377
[50]	valid_0's binary_logloss: 0.544292
[60]	valid_0's binary_logloss: 0.540533
[70]	valid_0's binary_logloss: 0.537072
[80]	valid_0's binary_logloss: 0.533876
[90]	valid_0's binary_logloss: 0.530913
[100]	valid_0's binary_logloss: 0.528161
[110]	valid_0's binary_logloss: 0.525616
[120]	valid_0's binary_logloss: 0.523249
[130]	valid_0's binary_logloss: 0.521059
[140]	valid_0's binary_logloss: 0.519011
[150]	valid_0's binary_logloss: 0.51711
[160]	valid_0's binary_logloss: 0.515336
[170]	valid_0's binary_logloss: 0.513679
[180]	valid_0's binary_logloss: 0.512121
[190]	valid_0's binary_logloss: 0.510669
[200]	valid_0's binary_logloss: 0.509312
[210]	valid_0's binary_logloss: 0.508048
[220]	valid_0's binary_logloss: 0.50686
[230]	valid_0's binary_logloss: 0.505751
[240]	v



최종 모델 ROC AUC: 0.738965


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73897


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# 3

In [5]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


You should consider upgrading via the 'C:\Users\tjddl\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import numpy as np
import wandb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder

# -----------------------------
# 1. wandb 로그인 및 스윕 설정
# -----------------------------
wandb.login(key="8bab4affd59dd33bd76096b069d74f6dc0abb056")

sweep_config = {
    'method': 'grid',  # grid search 방식 (또는 'random', 'bayes' 등)
    'metric': {
        'name': 'final_validation_roc_auc',
        'goal': 'maximize'
    },
    'parameters': {
        # XGBoost 관련 하이퍼파라미터
        'xgb_n_estimators': {'values': [100, 200]},
        'xgb_max_depth': {'values': [3, 5, 7]},
        'xgb_learning_rate': {'values': [0.01, 0.1, 0.2]},
        # 고정 파라미터
        'random_state': {'value': 42},
        'test_size': {'value': 0.2}
    }
}

# 스윕 생성: 반환된 sweep_id를 사용하여 에이전트가 작업을 가져가도록 합니다.
sweep_id = wandb.sweep(sweep_config, project="LGaimers", entity="espada105-hanseouniversity")
print("Sweep ID:", sweep_id)

# -----------------------------
# 2. run() 함수 정의: 데이터 전처리, 모델 학습 및 평가
# -----------------------------
def run():
    # 각 run마다 wandb 초기화 및 config 로드
    wandb.init(project="LGaimers", entity="espada105-hanseouniversity", reinit=True)
    config = wandb.config

    # --- 데이터 불러오기 및 전처리 ---
    train_df = pd.read_csv('train.csv')
    train_df = train_df.drop(columns=['ID'])
    
    # 타깃 및 피처 분리:
    # '임신 성공 여부'가 1 이상이면 성공(1), 아니면 실패(0)로 이진 분류 처리
    X = train_df.drop('임신 성공 여부', axis=1)
    y = train_df['임신 성공 여부'].apply(lambda x: 1 if x >= 1 else 0)
    
    # 범주형 변수 목록 (실제 데이터에 맞게 수정)
    categorical_columns = [
        "시술 시기 코드", "시술 당시 나이", "시술 유형", "특정 시술 유형", "배란 자극 여부",
        "배란 유도 유형", "단일 배아 이식 여부", "착상 전 유전 검사 사용 여부", "착상 전 유전 진단 사용 여부",
        "남성 주 불임 원인", "남성 부 불임 원인", "여성 주 불임 원인", "여성 부 불임 원인", "부부 주 불임 원인",
        "부부 부 불임 원인", "불명확 불임 원인", "불임 원인 - 난관 질환", "불임 원인 - 남성 요인",
        "불임 원인 - 배란 장애", "불임 원인 - 여성 요인", "불임 원인 - 자궁경부 문제",
        "불임 원인 - 자궁내막증", "불임 원인 - 정자 농도", "불임 원인 - 정자 면역학적 요인",
        "불임 원인 - 정자 운동성", "불임 원인 - 정자 형태", "배아 생성 주요 이유",
        "총 시술 횟수", "클리닉 내 총 시술 횟수", "IVF 시술 횟수", "DI 시술 횟수",
        "총 임신 횟수", "IVF 임신 횟수", "DI 임신 횟수", "총 출산 횟수", "IVF 출산 횟수",
        "DI 출산 횟수", "난자 출처", "정자 출처",
        "난자 기증자 나이", "정자 기증자 나이", "동결 배아 사용 여부", "신선 배아 사용 여부",
        "기증 배아 사용 여부", "대리모 여부", "PGD 시술 여부", "PGS 시술 여부"
    ]
    numerical_columns = [col for col in X.columns if col not in categorical_columns]
    
    # TargetEncoder로 범주형 변수 인코딩
    encoder = TargetEncoder(cols=categorical_columns)
    X_encoded = encoder.fit_transform(X, y)
    
    # 결측치 처리: SimpleImputer를 사용해 중앙값으로 대체
    imputer = SimpleImputer(strategy='median')
    X_encoded = pd.DataFrame(imputer.fit_transform(X_encoded), columns=X_encoded.columns)
    
    # train / validation 분할 (클래스 불균형을 고려하여 stratify 사용)
    X_train, X_val, y_train, y_val = train_test_split(
        X_encoded, y,
        test_size=config.test_size,
        random_state=config.random_state,
        stratify=y
    )
    
    # --- 모델 정의 ---
    # XGBoost 모델: wandb 스윕으로 전달받은 하이퍼파라미터 사용
    xgb_model = XGBClassifier(
        n_estimators=config.xgb_n_estimators,
        max_depth=config.xgb_max_depth,
        learning_rate=config.xgb_learning_rate,
        random_state=config.random_state,
        use_label_encoder=False,  # 최신 버전에서는 불필요한 경고 제거
        eval_metric='logloss'
    )
    
    # Logistic Regression: 기본 설정
    lr_model = LogisticRegression(
        max_iter=1000,
        random_state=config.random_state
    )
    
    # 소프트 보팅 앙상블: 두 모델의 확률 예측을 평균하여 최종 예측
    ensemble = VotingClassifier(
        estimators=[('xgb', xgb_model), ('lr', lr_model)],
        voting='soft'
    )
    
    # --- 모델 학습 및 평가 ---
    ensemble.fit(X_train, y_train)
    y_pred_proba = ensemble.predict_proba(X_val)[:, 1]
    roc_auc = roc_auc_score(y_val, y_pred_proba)
    print(f"Validation ROC AUC: {roc_auc:.4f}")
    
    # ROC 커브 생성 및 플롯
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.4f})', color='blue')
    plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    
    # wandb에 ROC 커브 이미지 로깅
    wandb.log({
        "final_validation_roc_auc": roc_auc,
        "roc_curve": wandb.Image(plt)
    })
    
    plt.close()
    wandb.finish()

# -----------------------------
# 3. wandb 에이전트를 이용한 스윕 실행 (예: 총 25회 run)
# -----------------------------
wandb.agent(sweep_id, function=run, count=25)


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\tjddl\_netrc


Create sweep with ID: 5e9b3czx
Sweep URL: https://wandb.ai/espada105-hanseouniversity/LGaimers/sweeps/5e9b3czx
Sweep ID: 5e9b3czx


[34m[1mwandb[0m: Agent Starting Run: 902irat0 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 100


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7211


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72109


[34m[1mwandb[0m: Agent Starting Run: gjy29w0h with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7232


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72324


[34m[1mwandb[0m: Agent Starting Run: 35cdt5vq with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7239


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72387


[34m[1mwandb[0m: Agent Starting Run: 9lzz7ht2 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7265


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72651


[34m[1mwandb[0m: Agent Starting Run: se3ot6fk with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 100


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7255


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.7255


[34m[1mwandb[0m: Agent Starting Run: spqc2f5x with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.01
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 200


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7283


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72834


[34m[1mwandb[0m: Agent Starting Run: e1zgoab0 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7287


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72865


[34m[1mwandb[0m: Agent Starting Run: 7vbkli05 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7298


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72978


[34m[1mwandb[0m: Agent Starting Run: gz1yw6zt with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7304


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73039


[34m[1mwandb[0m: Agent Starting Run: maymnqx6 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 200


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7306


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.7306


[34m[1mwandb[0m: Agent Starting Run: hnkj01jk with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7310


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73097


[34m[1mwandb[0m: Agent Starting Run: vm3lsjpl with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.1
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7310


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73097


[34m[1mwandb[0m: Agent Starting Run: 937thwjc with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7297


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.72972


[34m[1mwandb[0m: Agent Starting Run: 1zkn5sor with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 3
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7303


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.7303


[34m[1mwandb[0m: Agent Starting Run: bnl1a10m with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 100


Validation ROC AUC: 0.7308


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73085


[34m[1mwandb[0m: Agent Starting Run: qby2jrme with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 5
[34m[1mwandb[0m: 	xgb_n_estimators: 200


Validation ROC AUC: 0.7308


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73084


[34m[1mwandb[0m: Agent Starting Run: 6kndthyt with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 100


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7310


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73098


[34m[1mwandb[0m: Agent Starting Run: heueedm9 with config:
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	test_size: 0.2
[34m[1mwandb[0m: 	xgb_learning_rate: 0.2
[34m[1mwandb[0m: 	xgb_max_depth: 7
[34m[1mwandb[0m: 	xgb_n_estimators: 200


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation ROC AUC: 0.7304


0,1
final_validation_roc_auc,▁

0,1
final_validation_roc_auc,0.73035


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


# 4

In [11]:
!pip install pandas --upgrade
!pip install numpy --upgrade

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# --- 1. 데이터 로드 ---
# train.csv와 test.csv 파일에서 'ID' 컬럼은 제거합니다.
train = pd.read_csv('./train.csv').drop(columns=['ID'])
test = pd.read_csv('./test.csv').drop(columns=['ID'])

# --- 2. "시술 당시 나이" 전처리 및 missing indicator 생성 ---
# missing indicator: "알 수 없음"이면 1, 아니면 0
train['시술 당시 나이_missing'] = train['시술 당시 나이'].apply(lambda x: 1 if x == '알 수 없음' else 0)
test['시술 당시 나이_missing'] = test['시술 당시 나이'].apply(lambda x: 1 if x == '알 수 없음' else 0)

# 나이 매핑: 높은 값이 나이가 많음을 의미 → 임신 성공 확률은 낮아짐.
age_mapping = {
    '만18-34세': 0,
    '만35-37세': 1,
    '만38-39세': 2,
    '만40-42세': 3,
    '만43-44세': 4,
    '만45-50세': 5,
    '알 수 없음': np.nan
}
train['시술 당시 나이'] = train['시술 당시 나이'].map(age_mapping)
test['시술 당시 나이'] = test['시술 당시 나이'].map(age_mapping)

# --- 3. 타겟 및 Feature 분리 ---
X = train.drop('임신 성공 여부', axis=1)
y = train['임신 성공 여부']

# --- 4. 컬럼 목록 구성 ---
# "시술 당시 나이"와 "시술 당시 나이_missing"은 수치형으로 처리됩니다.
categorical_columns = [
    "시술 시기 코드", "시술 유형", "특정 시술 유형", "배란 자극 여부", "배란 유도 유형",
    "단일 배아 이식 여부", "착상 전 유전 검사 사용 여부", "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인", "남성 부 불임 원인", "여성 주 불임 원인", "여성 부 불임 원인",
    "부부 주 불임 원인", "부부 부 불임 원인", "불명확 불임 원인", "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인", "불임 원인 - 배란 장애", "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제", "불임 원인 - 자궁내막증", "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인", "불임 원인 - 정자 운동성", "불임 원인 - 정자 형태",
    "배아 생성 주요 이유", "총 시술 횟수", "클리닉 내 총 시술 횟수", "IVF 시술 횟수",
    "DI 시술 횟수", "총 임신 횟수", "IVF 임신 횟수", "DI 임신 횟수",
    "총 출산 횟수", "IVF 출산 횟수", "DI 출산 횟수", "난자 출처", "정자 출처",
    "난자 기증자 나이", "정자 기증자 나이", "동결 배아 사용 여부", "신선 배아 사용 여부",
    "기증 배아 사용 여부", "대리모 여부", "PGD 시술 여부", "PGS 시술 여부"
]

numeric_columns = [
    "시술 당시 나이", "시술 당시 나이_missing",
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수", "미세주입된 난자 수", "미세주입에서 생성된 배아 수",
    "이식된 배아 수", "미세주입 배아 이식 수", "저장된 배아 수", "미세주입 후 저장된 배아 수",
    "해동된 배아 수", "해동 난자 수", "수집된 신선 난자 수", "저장된 신선 난자 수",
    "혼합된 난자 수", "파트너 정자와 혼합된 난자 수", "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일", "난자 해동 경과일", "난자 혼합 경과일",
    "배아 이식 경과일", "배아 해동 경과일"
]

# --- 5. 결측치 처리 및 인코딩 ---
# 수치형 변수: 결측치는 0으로 채웁니다.
X[numeric_columns] = X[numeric_columns].fillna(0)
test[numeric_columns] = test[numeric_columns].fillna(0)

# 범주형 변수: pd.get_dummies를 사용 (dummy_na=True)
X_encoded = pd.get_dummies(X, columns=categorical_columns, dummy_na=True)
X_test_encoded = pd.get_dummies(test, columns=categorical_columns, dummy_na=True)

# 학습 데이터와 테스트 데이터의 컬럼을 align (누락된 dummy 컬럼은 0으로 채움)
X_encoded, X_test_encoded = X_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

# 중복 컬럼 제거 (만약 발생했다면)
X_encoded = X_encoded.loc[:, ~X_encoded.columns.duplicated()]
X_test_encoded = X_test_encoded.loc[:, ~X_test_encoded.columns.duplicated()]

# **중요**: 모든 feature 이름을 안전하게 (f0, f1, f2, ...) 재설정하여 특수문자 문제를 완전히 회피합니다.
X_encoded.columns = ["f" + str(i) for i in range(X_encoded.shape[1])]
X_test_encoded.columns = ["f" + str(i) for i in range(X_test_encoded.shape[1])]

# --- 6. 학습/검증 데이터 분할 (교차 검증용) ---
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# --- 7. Stacking 앙상블 모델 구성 ---
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# GPU 사용: XGBoost와 LGBM에 GPU 옵션 설정 (ExtraTreesClassifier는 CPU 사용)
estimators = [
    ('xgb', XGBClassifier(
                objective='binary:logistic',
                eval_metric='auc',
                random_state=42,
                use_label_encoder=False,
                tree_method='gpu_hist',   # GPU 사용
                predictor='gpu_predictor',
                gpu_id=0,
                n_jobs=1)),
    ('lgbm', LGBMClassifier(
                random_state=42,
                # GPU 파라미터를 제거하여 CPU 모드로 동작 (OpenCL 문제가 발생하므로)
                n_jobs=1)),
    ('etc', ExtraTreesClassifier(
                random_state=42,
                n_jobs=1))
]
stack_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
    n_jobs=1  # n_jobs=1로 설정하여 병렬 처리 관련 피클링 문제 회피
)

# --- 8. GridSearchCV를 통한 최종 메타 모델 튜닝 (LogisticRegression의 C값 튜닝) ---
param_grid = {
    'final_estimator__C': [0.1, 1.0, 10.0]
}
grid_search = GridSearchCV(stack_clf, param_grid, scoring='roc_auc', cv=5, n_jobs=1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best parameters: ", grid_search.best_params_)
print("Best CV ROC AUC Score: ", grid_search.best_score_)

# --- 9. 검증 세트 평가 ---
val_pred_proba = grid_search.predict_proba(X_val)[:, 1]
val_roc_auc = roc_auc_score(y_val, val_pred_proba)
print("Validation ROC AUC Score: {:.4f}".format(val_roc_auc))

# --- 10. 최종 모델 재학습 (전체 학습 데이터 이용) ---
best_params = grid_search.best_params_
final_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000, C=best_params['final_estimator__C']),
    cv=5,
    n_jobs=1
)
final_model.fit(X_encoded, y)

# --- 11. 테스트 데이터 예측 및 제출 파일 생성 ---
pred_proba = final_model.predict_proba(X_test_encoded)[:, 1]
submission = pd.DataFrame({
    'ID': ["TEST_" + str(i).zfill(5) for i in range(len(test))]
})
submission['probability'] = pred_proba
submission.to_csv('./improved_submit.csv', index=False)
print("Submission 파일 'improved_submit.csv'가 생성되었습니다.")

# --- 12. Test 데이터에는 실제 타겟(임신 성공 여부)이 없으므로 ROC AUC 점수는 계산하지 않습니다.
print("Test 데이터에는 실제 '임신 성공 여부'가 없으므로, ROC AUC 점수를 계산할 수 없습니다.")





You should consider upgrading via the 'C:\Users\tjddl\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting numpy
  Using cached numpy-2.2.2-cp310-cp310-win_amd64.whl (12.9 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4


ERROR: Could not install packages due to an OSError: [WinError 5] 액세스가 거부되었습니다: 'C:\\Users\\tjddl\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~-mpy.libs\\libopenblas64__v0.3.23-293-gc2f4bdbb-gcc_10_3_0-2bde3a66a51006b2b53eb373ff767a3f.dll'
Consider using the `--user` option or check the permissions.

You should consider upgrading via the 'C:\Users\tjddl\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Fitting 5 folds for each of 3 candidates, totalling 15 fits



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor", "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 42385, number of negative: 121679
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100224 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 850
[LightGBM] [Info] Number of data points in the train set: 164064, number of used features: 181
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258344 -> initscore=-1.054592
[LightGBM] [Info] Start training from score -1.054592


KeyboardInterrupt: 