In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.preprocessing import  OrdinalEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from imblearn.over_sampling import ADASYN
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Columns handled as numeric features. Later in this cell their missing values
# are filled with 0 and they are the only inputs to the PCA + KMeans clustering.
numeric_columns = [
    "임신 시도 또는 마지막 임신 경과 연수",
    "총 생성 배아 수",
    "미세주입된 난자 수",
    "미세주입에서 생성된 배아 수",
    "이식된 배아 수",
    "미세주입 배아 이식 수",
    "저장된 배아 수",
    "미세주입 후 저장된 배아 수",
    "해동된 배아 수",
    "해동 난자 수",
    "수집된 신선 난자 수",
    "저장된 신선 난자 수",
    "혼합된 난자 수",
    "파트너 정자와 혼합된 난자 수",
    "기증자 정자와 혼합된 난자 수",
    "난자 채취 경과일",
    "난자 해동 경과일",
    "난자 혼합 경과일",
    "배아 이식 경과일",
    "배아 해동 경과일"
]

# Columns handled as categorical features. Later in this cell they are cast to
# str and then ordinal-encoded.
# NOTE(review): several entries look like counts or ages (e.g. "총 시술 횟수",
# "시술 당시 나이"); presumably they are stored as binned labels rather than
# numbers -- confirm against the raw data.
categorical_columns = [
    "시술 시기 코드",
    "시술 당시 나이",
    "시술 유형",
    "특정 시술 유형",
    "배란 자극 여부",
    "배란 유도 유형",
    "단일 배아 이식 여부",
    "착상 전 유전 검사 사용 여부",
    "착상 전 유전 진단 사용 여부",
    "남성 주 불임 원인",
    "남성 부 불임 원인",
    "여성 주 불임 원인",
    "여성 부 불임 원인",
    "부부 주 불임 원인",
    "부부 부 불임 원인",
    "불명확 불임 원인",
    "불임 원인 - 난관 질환",
    "불임 원인 - 남성 요인",
    "불임 원인 - 배란 장애",
    "불임 원인 - 여성 요인",
    "불임 원인 - 자궁경부 문제",
    "불임 원인 - 자궁내막증",
    "불임 원인 - 정자 농도",
    "불임 원인 - 정자 면역학적 요인",
    "불임 원인 - 정자 운동성",
    "불임 원인 - 정자 형태",
    "배아 생성 주요 이유",
    "총 시술 횟수",
    "클리닉 내 총 시술 횟수",
    "IVF 시술 횟수",
    "DI 시술 횟수",
    "총 임신 횟수",
    "IVF 임신 횟수",
    "DI 임신 횟수",
    "총 출산 횟수",
    "IVF 출산 횟수",
    "DI 출산 횟수",
    "난자 출처",
    "정자 출처",
    "난자 기증자 나이",
    "정자 기증자 나이",
    "동결 배아 사용 여부",
    "신선 배아 사용 여부",
    "기증 배아 사용 여부",
    "대리모 여부",
    "PGD 시술 여부",
    "PGS 시술 여부"
]

# Load both splits, drop the ID column, and cast every categorical column to str
# in the same pass so the encoder sees a single, uniform dtype per column.
categorical_as_str = dict.fromkeys(categorical_columns, str)
train = pd.read_csv('./data/train.csv').drop(columns=['ID']).astype(categorical_as_str)
test = pd.read_csv('./data/test.csv').drop(columns=['ID']).astype(categorical_as_str)

# The category -> integer mapping is learned from the training split only;
# categories unseen during fitting are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(train[categorical_columns])

# Work on copies so the raw (string-typed) frames stay available.
X_train_encoded, X_test_encoded = train.copy(), test.copy()
for frame, raw in ((X_train_encoded, train), (X_test_encoded, test)):
    frame[categorical_columns] = ordinal_encoder.transform(raw[categorical_columns])
    # Missing numeric values are replaced with 0.
    frame[numeric_columns] = frame[numeric_columns].fillna(0)

# Reduce the numeric features with PCA, then cluster the reduced representation
# (the original note says cluster counts between 3 and 6 were tried).
# NOTE(review): the numeric columns are passed to PCA without scaling -- confirm
# that this is intended.
pca = PCA(n_components=7, random_state=42)
X_pca = pca.fit_transform(X_train_encoded[numeric_columns])

kmeans = KMeans(n_clusters=5, random_state=42).fit(X_pca)
kmeans_clusters = kmeans.labels_

# Test rows reuse the PCA and KMeans models fitted on the training split.
X_test_pca = pca.transform(X_test_encoded[numeric_columns])

# Expose the cluster id as an additional feature on both splits.
X_train_encoded['cluster_group'] = kmeans_clusters
X_test_encoded['cluster_group'] = kmeans.predict(X_test_pca)

# Derived ratio features, built the same way for the train and test frames.
# Each denominator is shifted by 1 so a value of 0 does not divide by zero.
for frame in (X_train_encoded, X_test_encoded):
    years_elapsed = frame["임신 시도 또는 마지막 임신 경과 연수"]
    transfer_day = frame["배아 이식 경과일"]
    frame["시도_대비_이식_비율"] = years_elapsed / (transfer_day + 1)

    fresh_eggs = frame["수집된 신선 난자 수"]
    embryos_created = frame["총 생성 배아 수"]
    frame["난자_활용도"] = fresh_eggs / (embryos_created + 1)

# Separate the target from the training features.
label_column = '임신 성공 여부'
y_train = X_train_encoded[label_column]
X_train = X_train_encoded.drop(columns=[label_column])

def calculate_vif(df, add_constant=False):
    """Compute the variance inflation factor (VIF) for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix. Every column must be convertible to float.
    add_constant : bool, default False
        If True, a column of ones is appended to the design matrix before the
        VIFs are computed, so each auxiliary regression includes an intercept.
        The default (False) passes ``df`` as-is, which is the behaviour this
        notebook has relied on so far.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"Feature"`` and ``"VIF"``, with one row per column of
        ``df`` in the original column order. The intercept column, when added,
        is not reported.
    """
    design = df
    if add_constant:
        # Appended last so the positional indices of the real features are unchanged.
        intercept = pd.Series(1.0, index=df.index, name="const")
        design = pd.concat([df, intercept], axis=1)

    # Build the dense matrix once instead of once per feature.
    exog = design.to_numpy(dtype=float)
    vif_values = [variance_inflation_factor(exog, i) for i in range(df.shape[1])]
    return pd.DataFrame({"Feature": list(df.columns), "VIF": vif_values})

# Features whose VIF is at or above this cut-off are removed.
# NOTE(review): 23 is the value actually applied; an earlier comment on this
# step said 10 -- confirm which threshold is intended.
VIF_THRESHOLD = 23

# Calculate VIF for every feature in X_train
vif_df = calculate_vif(X_train)

# A NaN VIF never satisfies `>=`, so features with an undefined VIF are kept.
high_vif_features = vif_df.loc[vif_df["VIF"] >= VIF_THRESHOLD, "Feature"].tolist()
X_reduced = X_train.drop(columns=high_vif_features)

# Recalculate VIF for the reduced feature set
reduced_vif_df = calculate_vif(X_reduced)

# Take explicit copies of the column subsets so the label assignment below
# writes to an independent frame rather than to a slice of the previous one.
selected_features = reduced_vif_df['Feature'].tolist()
X_train_encoded = X_train_encoded[selected_features].copy()
X_test_encoded = X_test_encoded[selected_features].copy()

# Re-attach the target column to the reduced training frame.
X_train_encoded['임신 성공 여부'] = y_train

  return 1 - self.ssr/self.uncentered_tss
  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.uncentered_tss


In [2]:
# Wrap the reduced frames for AutoGluon. This rebinds `train` / `test`, which
# previously held the raw DataFrames; later cells predict on this `test`.
train = TabularDataset(X_train_encoded)
test = TabularDataset(X_test_encoded)

fit_options = dict(
    time_limit=3600,
    num_cpus=8,
    num_gpus=3,
    presets='best_quality',
)
predictor = TabularPredictor(label='임신 성공 여부', eval_metric='roc_auc')
predictor.fit(train, **fit_options)

# The leaderboard is scored on the training data itself, so its `score_test`
# column is a training-set score; `score_val` is the validation score.
ld_board = predictor.leaderboard(train, silent=True)
ld_board

No path specified. Models will be saved in: "AutogluonModels/ag-20250220_060351"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.15
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #136~20.04.1-Ubuntu SMP Thu Nov 14 16:38:05 UTC 2024
CPU Count:          48
Memory Avail:       231.06 GB / 251.54 GB (91.9%)
Disk Space Avail:   1358.99 GB / 1875.30 GB (72.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout valid

[36m(_ray_fit pid=269268)[0m [1000]	valid_set's binary_logloss: 0.488006
[36m(_ray_fit pid=269269)[0m [1000]	valid_set's binary_logloss: 0.489515
[36m(_ray_fit pid=269730)[0m [1000]	valid_set's binary_logloss: 0.488779
[36m(_ray_fit pid=269268)[0m [2000]	valid_set's binary_logloss: 0.487584
[36m(_ray_fit pid=269730)[0m [2000]	valid_set's binary_logloss: 0.488019
[36m(_ray_fit pid=269268)[0m [3000]	valid_set's binary_logloss: 0.487216
[36m(_ray_fit pid=269730)[0m [3000]	valid_set's binary_logloss: 0.487564
[36m(_ray_fit pid=269268)[0m [4000]	valid_set's binary_logloss: 0.487107
[36m(_ray_fit pid=269730)[0m [4000]	valid_set's binary_logloss: 0.487503
[36m(_ray_fit pid=269268)[0m [5000]	valid_set's binary_logloss: 0.486819
[36m(_ray_fit pid=269730)[0m [5000]	valid_set's binary_logloss: 0.487334
[36m(_ray_fit pid=269268)[0m [6000]	valid_set's binary_logloss: 0.486821
[36m(_ray_fit pid=269730)[0m [6000]	valid_set's binary_logloss: 0.48738
[36m(_ray_fit pid=269268)

[36m(_dystack pid=258813)[0m 	0.7378	 = Validation score   (roc_auc)
[36m(_dystack pid=258813)[0m 	136.43s	 = Training   runtime
[36m(_dystack pid=258813)[0m 	24.73s	 = Validation runtime
[36m(_dystack pid=258813)[0m Fitting model: LightGBM_BAG_L2 ... Training model for up to 132.26s of the 132.19s of remaining time.
[36m(_dystack pid=258813)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.15%)
[36m(_dystack pid=258813)[0m 	0.7354	 = Validation score   (roc_auc)
[36m(_dystack pid=258813)[0m 	13.92s	 = Training   runtime
[36m(_dystack pid=258813)[0m 	1.37s	 = Validation runtime
[36m(_dystack pid=258813)[0m Fitting model: RandomForestGini_BAG_L2 ... Training model for up to 116.36s of the 116.30s of remaining time.
[36m(_dystack pid=258813)[0m 	0.7436	 = Validation score   (roc_auc)
[36m(_dystack pid=258813)[0m 	11.02s	 = Training   runtime
[36m(_dystack pid=258813)[0m 	12.22s	 = Va

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1,0.966161,0.631998,roc_auc,19.150973,18.17437,0.2658,19.150973,18.17437,0.2658,1,True,2
1,RandomForestEntr_BAG_L1,0.908146,0.727089,roc_auc,1.526156,13.278316,5.885077,1.526156,13.278316,5.885077,1,True,6
2,RandomForestGini_BAG_L1,0.907762,0.726528,roc_auc,1.58479,12.013081,5.663512,1.58479,12.013081,5.663512,1,True,5
3,ExtraTreesGini_BAG_L1,0.889342,0.725403,roc_auc,1.498913,12.568044,5.146178,1.498913,12.568044,5.146178,1,True,8
4,ExtraTreesEntr_BAG_L1,0.888769,0.726096,roc_auc,1.544819,11.743327,5.361473,1.544819,11.743327,5.361473,1,True,9
5,KNeighborsUnif_BAG_L1,0.826922,0.653995,roc_auc,18.471337,18.223511,0.240502,18.471337,18.223511,0.240502,1,True,1
6,LightGBMLarge_BAG_L1,0.760407,0.734689,roc_auc,2.028799,3.271528,18.813151,2.028799,3.271528,18.813151,1,True,13
7,WeightedEnsemble_L2,0.755443,0.73624,roc_auc,44.824954,32.787617,2176.300529,0.015257,0.037671,19.057418,2,True,20
8,LightGBM_r131_BAG_L1,0.747871,0.735677,roc_auc,6.305973,13.385923,49.063252,6.305973,13.385923,49.063252,1,True,16
9,CatBoost_BAG_L1,0.745702,0.735639,roc_auc,0.426372,0.19233,88.098321,0.426372,0.19233,88.098321,1,True,7


In [3]:
# Print AutoGluon's summary of the fit and keep the returned object in
# `results`; the bare expression on the last line displays it as cell output.
results = predictor.fit_summary()
results

*** Summary of fit() ***
Estimated performance of each model:
                          model  score_val eval_metric  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L2   0.736240     roc_auc      32.787617  2176.300529                0.037671          19.057418            2       True         20
1          LightGBM_r131_BAG_L1   0.735677     roc_auc      13.385923    49.063252               13.385923          49.063252            1       True         16
2               CatBoost_BAG_L1   0.735639     roc_auc       0.192330    88.098321                0.192330          88.098321            1       True          7
3          CatBoost_r177_BAG_L1   0.735463     roc_auc       0.196942    63.641751                0.196942          63.641751            1       True         14
4                XGBoost_BAG_L1   0.735389     roc_auc       1.852394    77.894484                1.852394          77.894484        



{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestGini_BAG_L1': 'StackerEnsembleModel_RF',
  'RandomForestEntr_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesGini_BAG_L1': 'StackerEnsembleModel_XT',
  'ExtraTreesEntr_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'XGBoost_BAG_L1': 'StackerEnsembleModel_XGBoost',
  'NeuralNetTorch_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBMLarge_BAG_L1': 'StackerEnsembleModel_LGB',
  'CatBoost_r177_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'NeuralNetTorch_r79_BAG_L1': 'StackerEnsembleModel_TabularNeuralNetTorch',
  'LightGBM_r131_BAG_L1': 'StackerEnsembleModel_LGB',
  'NeuralNetFastAI_r191_BAG_L1': 'StackerEnsembleModel_NNFastAiT

In [None]:
# Blend weights keyed by AutoGluon model name, so each weight is tied
# unambiguously to the model it applies to.
ENSEMBLE_WEIGHTS = {
    'WeightedEnsemble_L2': 0.45,
    'LightGBM_r131_BAG_L1': 0.45,
    'CatBoost_BAG_L1': 0.05,
    'CatBoost_r177_BAG_L1': 0.05,
}

# The blend is meant to be a convex combination of class probabilities.
if abs(sum(ENSEMBLE_WEIGHTS.values()) - 1.0) > 1e-9:
    raise ValueError(f"Ensemble weights must sum to 1, got {sum(ENSEMBLE_WEIGHTS.values())}")

# Weighted sum of the per-model class-probability frames for the test set,
# accumulated in the order the models are listed above.
# (The name `final_train_pred_proba` is kept because the next cell reads it,
# although these are predictions on `test`.)
final_train_pred_proba = None
for model_name, weight in ENSEMBLE_WEIGHTS.items():
    weighted_proba = weight * predictor.predict_proba(test, model=model_name)
    if final_train_pred_proba is None:
        final_train_pred_proba = weighted_proba
    else:
        final_train_pred_proba = final_train_pred_proba + weighted_proba


In [6]:
# Only the positive-class probability (column 1) is submitted; the class-0
# column is not used.
positive_proba = final_train_pred_proba[1]
print(final_train_pred_proba.head(5))

# Build the submission file
sample_submission = pd.read_csv('./data/sample_submission.csv')
if len(sample_submission) != len(positive_proba):
    raise ValueError(
        f"Row count mismatch: {len(sample_submission)} submission rows "
        f"vs {len(positive_proba)} predictions"
    )
# Assign by position so that differing index labels cannot introduce NaN.
sample_submission['probability'] = positive_proba.to_numpy()
sample_submission.to_csv('./submit/final_submit.csv', index=False)
print(sample_submission.head(5))

          0         1
0  0.998587  0.001413
1  0.997330  0.002670
2  0.846437  0.153563
3  0.876118  0.123882
4  0.469360  0.530640
           ID  probability
0  TEST_00000     0.001413
1  TEST_00001     0.002670
2  TEST_00002     0.153563
3  TEST_00003     0.123882
4  TEST_00004     0.530640
