In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, ClassifierMixin
import scipy.stats as stats

# üîπ train_test_splitÏùÑ import Ï∂îÍ∞Ä
from sklearn.model_selection import train_test_split

# Convert a count string to an integer (used for the "number of times" columns)
def convert_count_str(val):
    if pd.isna(val):
        return 0
    val = str(val).strip()
    if "Ìöå Ïù¥ÏÉÅ" in val:
        return 6
    m = re.search(r'(\d+)Ìöå?', val)
    return int(m.group(1)) if m else 0

# Sperm / egg donor age-band mapping
# Score assigned to each donor age-band label. The scores are not monotone in
# age, and the last label shares the score 0 with the '41-45' band.
donor_age_mapping = {
    'Îßå20ÏÑ∏ Ïù¥Ìïò': 3,
    'Îßå21-25ÏÑ∏': 5,
    'Îßå26-30ÏÑ∏': 4,
    'Îßå31-35ÏÑ∏': 2,
    'Îßå36-40ÏÑ∏': 1,
    'Îßå41-45ÏÑ∏': 0,
    'Ïïå Ïàò ÏóÜÏùå': 0,
}


def convert_donor_age(val):
    """Translate a donor age-band label into its score.

    Args:
        val: Raw label (whitespace around it is ignored) or a missing value.

    Returns:
        The score from ``donor_age_mapping``; ``np.nan`` when ``val`` is
        missing or is not one of the mapped labels.
    """
    return np.nan if pd.isna(val) else donor_age_mapping.get(str(val).strip(), np.nan)

# Replace NaN with the string 'NaN' (categorical-variable handling)
def convert_nan_to_string(df, category_columns):
    """Return a copy of ``df`` with the given columns made categorical.

    Missing entries in each listed column are first replaced by the literal
    string ``'NaN'`` so that "missing" becomes a category of its own, then the
    column is cast to pandas ``category`` dtype.

    Args:
        df: Input frame; it is not modified.
        category_columns: Names of the columns to convert.

    Returns:
        A new DataFrame; columns not listed are carried over unchanged.
    """
    converted = df.copy()
    for name in category_columns:
        filled = converted[name].fillna('NaN')
        converted[name] = filled.astype("category")
    return converted

# Apply feature weights (every treatment type in the weight table is applied)
def apply_feature_weights(X, weight_dict):
    """Scale feature columns by the weights of every category in ``weight_dict``.

    The weights of all categories are applied one after another, so a column
    that has a weight in several categories ends up multiplied by the product
    of those weights.

    Args:
        X: Feature DataFrame; it is not modified.
        weight_dict: ``{category: {column_name: weight}}``, e.g. the result of
            ``DataFrame.to_dict()`` on a weight table indexed by feature name.

    Returns:
        A copy of ``X`` with the weighted columns scaled. An entry is skipped
        (the column is left as it is) when:

        * the column is not present in ``X``;
        * the weight is missing (NaN/None) - multiplying by it would silently
          turn the whole feature into NaN;
        * the column is not numeric - a label column cannot be scaled and the
          multiplication would raise ``TypeError``.
    """
    X_weighted = X.copy()
    for weights in weight_dict.values():
        for column, weight in weights.items():
            if column not in X_weighted.columns:
                continue
            if pd.isna(weight):
                continue
            if not pd.api.types.is_numeric_dtype(X_weighted[column]):
                continue
            X_weighted[column] = X_weighted[column] * weight
    return X_weighted

# 1. Load data
# Read the raw train/test tables and discard the 'ID' column from each.
train = pd.read_csv('train.csv')
train = train.drop(columns=['ID'])
test = pd.read_csv('test.csv')
test = test.drop(columns=['ID'])

# 2. Load the feature-weight table (covers every treatment type).
# Indexing by the feature-name column and calling to_dict() yields
# {weight_table_column: {feature_name: weight}}.
weight_data = pd.read_csv('og_weighted_hong.csv', encoding='utf-8')
weight_index_column = "Îç∞Ïù¥ÌÑ∞ Ìï≠Î™©"
weight_dict = weight_data.set_index(weight_index_column).to_dict()

# 3. Encode the age-at-treatment column
# Ordinal score per patient age band; the last label maps to NaN (missing).
age_mapping = {
    'Îßå18-34ÏÑ∏': 5,
    'Îßå35-37ÏÑ∏': 4,
    'Îßå38-39ÏÑ∏': 3,
    'Îßå40-42ÏÑ∏': 2,
    'Îßå43-44ÏÑ∏': 1,
    'Îßå45-50ÏÑ∏': 0,
    'Ïïå Ïàò ÏóÜÏùå': np.nan,
}


def _encode_treatment_age(raw):
    """Return the float score for one age-band label (NaN if unmapped)."""
    return float(age_mapping.get(str(raw).strip(), np.nan))


# Same encoding for train and test, written back in place.
for _frame in (train, test):
    _frame['ÏãúÏà† ÎãπÏãú ÎÇòÏù¥'] = _frame['ÏãúÏà† ÎãπÏãú ÎÇòÏù¥'].map(_encode_treatment_age)

# 4. Convert the count columns
# Columns holding textual counts that are parsed with convert_count_str.
count_columns = [
    "Ï¥ù ÏãúÏà† ÌöüÏàò",
    "ÌÅ¥Î¶¨Îãâ ÎÇ¥ Ï¥ù ÏãúÏà† ÌöüÏàò",
    "IVF ÏãúÏà† ÌöüÏàò",
    "DI ÏãúÏà† ÌöüÏàò",
    "Ï¥ù ÏûÑÏã† ÌöüÏàò",
    "IVF ÏûÑÏã† ÌöüÏàò",
    "DI ÏûÑÏã† ÌöüÏàò",
    "Ï¥ù Ï∂úÏÇ∞ ÌöüÏàò",
    "IVF Ï∂úÏÇ∞ ÌöüÏàò",
    "DI Ï∂úÏÇ∞ ÌöüÏàò",
]

# Values are stringified first, so a missing cell arrives as 'nan' and is
# parsed as 0 (no digits found).
for _frame in (train, test):
    for col in count_columns:
        _frame[col] = _frame[col].astype(str).apply(convert_count_str).astype(int)

# 5. Convert the egg / sperm donor age columns
# Map both donor-age columns through convert_donor_age, train then test.
# Values are stringified first, so a missing cell arrives as 'nan', is not a
# key of the mapping, and therefore becomes NaN.
for _donor_col in ('ÎÇúÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥', 'Ï†ïÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥'):
    for _frame in (train, test):
        _frame[_donor_col] = _frame[_donor_col].astype(str).apply(convert_donor_age)

# 6. Apply feature weights
# Separate the target column from the features.
label_column = 'ÏûÑÏã† ÏÑ±Í≥µ Ïó¨Î∂Ä'
y = train[label_column]
X = train.drop(columns=[label_column])

# The same weight table is applied to the training features and the test set.
X_weighted = apply_feature_weights(X, weight_dict)
X_test_weighted = apply_feature_weights(test, weight_dict)

# 7. Handle class imbalance on the target by randomly undersampling.
# NOTE(review): per the imbalanced-learn docs a float sampling_strategy is the
# minority/majority ratio after resampling (0.5 -> 1:2) - confirm intent.
undersample = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_resampled, y_resampled = undersample.fit_resample(X_weighted, y)

# 8. Convert categorical variables
# Columns treated as categorical by the models.
category_columns = ["ÏãúÏà† ÏãúÍ∏∞ ÏΩîÎìú", "ÏãúÏà† Ïú†Ìòï", "ÌäπÏ†ï ÏãúÏà† Ïú†Ìòï", "Î∞∞ÎûÄ Ïú†ÎèÑ Ïú†Ìòï",
                    "Î∞∞ÏïÑ ÏÉùÏÑ± Ï£ºÏöî Ïù¥Ïú†", "ÎÇúÏûê Ï∂úÏ≤ò", "Ï†ïÏûê Ï∂úÏ≤ò", "ÎÇúÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥", "Ï†ïÏûê Í∏∞Ï¶ùÏûê ÎÇòÏù¥"]

X_resampled = convert_nan_to_string(X_resampled, category_columns)
X_test_weighted = convert_nan_to_string(X_test_weighted, category_columns)

# Each frame was cast to `category` on its own, so the same label can end up
# with a different integer code in train and test when the observed label sets
# differ. Libraries that consume the pandas category codes directly would then
# read test rows with the wrong category. Put both frames on one shared
# category list: training levels first (their codes stay unchanged), followed
# by any level that only occurs in the test set.
for _col in category_columns:
    _train_levels = list(X_resampled[_col].cat.categories)
    _known_levels = set(_train_levels)
    _shared_levels = _train_levels + [
        _level for _level in X_test_weighted[_col].cat.categories
        if _level not in _known_levels
    ]
    X_resampled[_col] = X_resampled[_col].cat.set_categories(_shared_levels)
    X_test_weighted[_col] = X_test_weighted[_col].cat.set_categories(_shared_levels)

# 9. Split the resampled data into train/validation (80/20, stratified on y).
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled,
                                                  test_size=0.2, random_state=42, stratify=y_resampled)



In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import BaseEstimator, ClassifierMixin
import scipy.stats as stats

# üîπ train_test_splitÏùÑ import Ï∂îÍ∞Ä
from sklearn.model_selection import train_test_split

In [None]:


# 1. Data preparation
# This cell must not re-read train.csv/test.csv or re-split the raw frame.
# Doing so overwrites the preprocessed X_train/X_val/y_train/y_val produced by
# the preprocessing cell with un-encoded data, while the model below still
# relies on X_resampled, category_columns and X_test_weighted from that cell
# (running this cell on its own fails with
# "NameError: name 'X_resampled' is not defined").
# Reuse the preprocessed split and fail early, with a clear message, when the
# preprocessing cell has not been run in this session.
_required_names = (
    "test", "category_columns", "X_resampled", "X_test_weighted",
    "X_train", "X_val", "y_train", "y_val",
)
_missing_names = [_name for _name in _required_names if _name not in globals()]
if _missing_names:
    raise NameError(
        "Run the preprocessing cell first; undefined: " + ", ".join(_missing_names)
    )


# Model configuration
# CatBoost receives its categorical features as column positions in X_resampled.
cat_feature_indices = [X_resampled.columns.get_loc(c) for c in category_columns]

# Level-0 learners: three gradient-boosting models.
# NOTE(review): tree_method='gpu_hist' is reported as deprecated in
# XGBoost >= 2.0 in favour of device='cuda' - confirm the installed version.
base_learners = [
    ('xgb', XGBClassifier(tree_method='gpu_hist', enable_categorical=True, random_state=42)),
    ('lgbm', LGBMClassifier(n_jobs=1, random_state=42, verbose=-1)),
    ('cat', CatBoostClassifier(task_type='GPU', verbose=0, cat_features=cat_feature_indices)),
]

# Level-1 learner: robust-scaled, class-balanced logistic regression.
meta_learner = Pipeline([
    ('scaler', RobustScaler()),
    ('lr', LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear')),
])

stack_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=3,
    n_jobs=1,
)

# Train the model
stack_clf.fit(X_train, y_train)

# Evaluate on the validation split using the positive-class probability.
y_val_pred = stack_clf.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred)
print(f"Validation ROC AUC: {roc_auc:.5f}")

# Predict positive-class probabilities for the test set.
pred_proba = stack_clf.predict_proba(X_test_weighted)[:, 1]

# Save the submission file. IDs are synthesised from the row position
# (TEST_00000, TEST_00001, ...), not read from test.csv.
submission_ids = [f"TEST_{i:05d}" for i in range(len(test))]
submission = pd.DataFrame({'ID': submission_ids, 'probability': pred_proba})
submission_path = '/content/drive/MyDrive/aimers/submit/final_model_submit.csv'
submission.to_csv(submission_path, index=False)
print("Ï†úÏ∂ú ÌååÏùº ÏÉùÏÑ± ÏôÑÎ£å")


NameError: name 'X_resampled' is not defined