In [None]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, r2_score, mean_absolute_error, mean_squared_error, log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy.stats import uniform, randint

# ---------- 데이터 로드 ----------
train = pd.read_csv('./data/1.titanic_train.csv')
test = pd.read_csv('./data/2.titanic_test.csv')
submission = pd.read_csv('./data/3.titanic_submission.csv')
test_passenger_id = test['PassengerId']

# ---------- [Feature Engineering] ----------
for df in [train, test]:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['FamilyType'] = pd.cut(df['FamilySize'], bins=[0,1,4,20], labels=['Alone','Small','Large'])
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

def extract_title(name):
    match = re.search(r',\s*([^.]*)\.', name)
    if match:
        return match.group(1).strip()
    return 'Unknown'

for df in [train, test]:
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare'
    )
    df['Title'] = df['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

for df in [train, test]:
    df['Age*Pclass'] = df['Age'] * df['Pclass']
    df['Sex*Pclass'] = df.apply(lambda r: 0 if r['Sex']=='male' else 1, axis=1) * df['Pclass']

for df in [train, test]:
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    df['Fare_log'] = np.log1p(df['Fare'])

for df in [train, test]:
    df['Deck'] = df['Cabin'].fillna('U').map(lambda x: x[0])

for df in [train, test]:
    for title in df['Title'].unique():
        median_age = df.loc[df['Title']==title, 'Age'].median()
        df.loc[(df['Title']==title) & (df['Age'].isnull()), 'Age'] = median_age

for df in [train, test]:
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

for df in [test]:
    for pclass in df['Pclass'].unique():
        median_fare = df.loc[df['Pclass']==pclass, 'Fare'].median()
        df.loc[(df['Pclass']==pclass) & (df['Fare'].isnull()), 'Fare'] = median_fare

drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

numerical_features = train.select_dtypes(include=np.number).columns.tolist()
if 'Survived' in numerical_features: numerical_features.remove('Survived')
categorical_features = train.select_dtypes(exclude=np.number).columns.tolist()

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

X = train.drop('Survived', axis=1)
y = train['Survived']
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test)

X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# ----------- XGBoost 하이퍼파라미터 탐색 -----------
xgb_base = XGBClassifier(random_state=42, eval_metric='logloss', tree_method='hist')
param_dist_xgb = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 2),
    'reg_alpha': uniform(0, 2),
    'reg_lambda': uniform(0, 2)
}
rand_search_xgb = RandomizedSearchCV(
    xgb_base, param_distributions=param_dist_xgb, n_iter=10, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_xgb.fit(X_train, y_train)
best_xgb_params = rand_search_xgb.best_params_

# ----------- LightGBM 하이퍼파라미터 탐색 -----------
lgbm_base = LGBMClassifier(random_state=42, verbosity=-1)
param_dist_lgbm = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'reg_alpha': uniform(0, 2),
    'reg_lambda': uniform(0, 2)
}
rand_search_lgbm = RandomizedSearchCV(
    lgbm_base, param_distributions=param_dist_lgbm, n_iter=10, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_lgbm.fit(X_train, y_train)
best_lgbm_params = rand_search_lgbm.best_params_
best_lgbm_params['verbosity'] = -1

# ----------- RandomForest 하이퍼파라미터 탐색 -----------
rfc_base = RandomForestClassifier(random_state=42)
param_grid_rfc = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, 12, None],
    'min_samples_split': [2, 4, 8]
}
grid_search_rfc = GridSearchCV(
    rfc_base, param_grid_rfc, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1
)
grid_search_rfc.fit(X_train, y_train)
best_rfc_params = grid_search_rfc.best_params_

# ----------- CatBoost 하이퍼파라미터 탐색 -----------
catboost_base = CatBoostClassifier(random_state=42, verbose=0)
param_dist_cat = {
    'iterations': randint(60, 200),
    'depth': randint(3, 9),
    'learning_rate': uniform(0.03, 0.1),
    'l2_leaf_reg': uniform(0.5, 5)
}
rand_search_cat = RandomizedSearchCV(
    catboost_base, param_distributions=param_dist_cat, n_iter=8, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_cat.fit(X_train, y_train)
best_cat_params = rand_search_cat.best_params_
best_cat_params['verbose'] = 0

# ----------- 모델 객체 생성 (최적 하이퍼파라미터 적용) -----------
xgb_final = XGBClassifier(**best_xgb_params, random_state=42, eval_metric='logloss', tree_method='hist')
lgbm_final = LGBMClassifier(**best_lgbm_params, random_state=42)
rfc_final = RandomForestClassifier(**best_rfc_params, random_state=42)
cat_final = CatBoostClassifier(**best_cat_params, random_state=42)

# ----------- 최적 점수 및 파라미터 출력 -----------
print("=== 하이퍼파라미터 최적화 결과 ===")
model_results = {
    "XGBoost": (rand_search_xgb.best_score_, rand_search_xgb.best_params_),
    "LightGBM": (rand_search_lgbm.best_score_, rand_search_lgbm.best_params_),
    "RandomForest": (grid_search_rfc.best_score_, grid_search_rfc.best_params_),
    "CatBoost": (rand_search_cat.best_score_, rand_search_cat.best_params_)
}
for name, (score, params) in model_results.items():
    print(f"{name:12} | score={score:.4f} | params={params}")

best_model = max(model_results.items(), key=lambda x: x[1][0])
print(f"\n>>> 가장 성능이 좋았던 모델: {best_model[0]} (score={best_model[1][0]:.4f}) ")
print(f"    최적 파라미터: {best_model[1][1]}\n")

# ----------- 스태킹 앙상블 구성 및 학습 -----------
stacking = StackingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgbm', lgbm_final),
        ('rf', rfc_final),
        ('cat', cat_final)
    ],
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=kfold,
    n_jobs=-1,
    passthrough=False
)
stacking.fit(X_train, y_train)

# ----------- 학습 데이터 평가 -----------
y_pred_train = stacking.predict(X_train)
y_pred_train_proba = stacking.predict_proba(X_train)
print("\n=== [Stacking 앙상블: 학습 데이터] ===")
print(classification_report(y_train, y_pred_train))
print("R2  :", r2_score(y_train, y_pred_train))
print("MAE :", mean_absolute_error(y_train, y_pred_train))
print("MSE :", mean_squared_error(y_train, y_pred_train))
print("RMSE:", np.sqrt(mean_squared_error(y_train, y_pred_train)))
print("Log Loss:", log_loss(y_train, y_pred_train_proba))

# ----------- 검증 데이터 평가 (threshold 기본값=0.5) -----------
y_pred_val = stacking.predict(X_val)
y_pred_val_proba = stacking.predict_proba(X_val)
print("\n=== [Stacking 앙상블: 검증 데이터 (threshold=0.5)] ===")
print(classification_report(y_val, y_pred_val))
print("R2  :", r2_score(y_val, y_pred_val))
print("MAE :", mean_absolute_error(y_val, y_pred_val))
print("MSE :", mean_squared_error(y_val, y_pred_val))
print("RMSE:", np.sqrt(mean_squared_error(y_val, y_pred_val)))
print("Log Loss:", log_loss(y_val, y_pred_val_proba))

# ----------- threshold 튜닝 (검증 데이터에서) -----------
y_val_proba = y_pred_val_proba[:, 1]  # 생존 확률 (클래스 1)
# thresholds = np.arange(0.3, 0.8, 0.01)
# best_acc = 0
# best_thresh = 0.5

# print("\nThreshold 튜닝 결과:")
# for t in thresholds:
#     y_val_pred_thresh = (y_val_proba >= t).astype(int)
#     acc = accuracy_score(y_val, y_val_pred_thresh)
#     print(f"Threshold={t:.2f} -> Accuracy={acc:.4f}")
#     if acc > best_acc:
#         best_acc = acc
#         best_thresh = t
#
# print(f"\n>>> 최적 Threshold: {best_thresh:.2f} (Validation Accuracy={best_acc:.4f})")

# ----------- 최적 threshold로 검증 데이터 재평가 -----------
best_thresh = 0.47
y_val_pred_best = (y_val_proba >= best_thresh).astype(int)
print("\n=== [Stacking 앙상블: 검증 데이터 (threshold 튜닝 후)] ===")
print(classification_report(y_val, y_val_pred_best))
print("R2  :", r2_score(y_val, y_val_pred_best))
print("MAE :", mean_absolute_error(y_val, y_val_pred_best))
print("MSE :", mean_squared_error(y_val, y_val_pred_best))
print("RMSE:", np.sqrt(mean_squared_error(y_val, y_val_pred_best)))
print("Log Loss:", log_loss(y_val, y_pred_val_proba))

# ----------- 전체 데이터로 재학습 -----------
stacking.fit(X_processed, y)

# ----------- 최종 제출 예측 (최적 threshold 반영) -----------
y_test_proba = stacking.predict_proba(X_test_processed)[:, 1]
y_test_pred = (y_test_proba >= best_thresh).astype(int)
submission['Survived'] = y_test_pred
submission['PassengerId'] = test_passenger_id
submission.to_csv('titanic_stacking_submission.csv', index=False)
print("\n제출 파일 저장 완료!(titanic_stacking_submission.csv)")

In [1]:
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'

import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, r2_score, mean_absolute_error, mean_squared_error, log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from scipy.stats import uniform, randint

# ---------- 데이터 로드 ----------
train = pd.read_csv('./data/1.titanic_train.csv')
test = pd.read_csv('./data/2.titanic_test.csv')
submission = pd.read_csv('./data/3.titanic_submission.csv')
test_passenger_id = test['PassengerId']

# ---------- Feature Engineering ----------
for df in [train, test]:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['FamilyType'] = pd.cut(df['FamilySize'], bins=[0,1,4,20], labels=['Alone','Small','Large'])
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

def extract_title(name):
    match = re.search(r',\s*([^.]*)\.', name)
    if match:
        return match.group(1).strip()
    return 'Unknown'

for df in [train, test]:
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df['Title'].replace(
        ['Lady', 'Countess','Capt','Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare'
    )
    df['Title'] = df['Title'].replace({'Mlle':'Miss', 'Ms':'Miss', 'Mme':'Mrs'})

for df in [train, test]:
    df['Age*Pclass'] = df['Age'] * df['Pclass']
    df['Sex*Pclass'] = df.apply(lambda r: 0 if r['Sex']=='male' else 1, axis=1) * df['Pclass']

for df in [train, test]:
    df['FarePerPerson'] = df['Fare'] / df['FamilySize']
    df['Fare_log'] = np.log1p(df['Fare'])

for df in [train, test]:
    df['Deck'] = df['Cabin'].fillna('U').map(lambda x: x[0])

for df in [train, test]:
    for title in df['Title'].unique():
        median_age = df.loc[df['Title']==title, 'Age'].median()
        df.loc[(df['Title']==title) & (df['Age'].isnull()), 'Age'] = median_age

for df in [train, test]:
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

for df in [test]:
    for pclass in df['Pclass'].unique():
        median_fare = df.loc[df['Pclass']==pclass, 'Fare'].median()
        df.loc[(df['Pclass']==pclass) & (df['Fare'].isnull()), 'Fare'] = median_fare

drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

numerical_features = train.select_dtypes(include=np.number).columns.tolist()
if 'Survived' in numerical_features: numerical_features.remove('Survived')
categorical_features = train.select_dtypes(exclude=np.number).columns.tolist()

numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

X = train.drop('Survived', axis=1)
y = train['Survived']
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(test)

X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)

# ----------- XGBoost 하이퍼파라미터 탐색 -----------
xgb_base = XGBClassifier(random_state=42, eval_metric='logloss', tree_method='hist')
param_dist_xgb = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0, 2),
    'reg_alpha': uniform(0, 2),
    'reg_lambda': uniform(0, 2)
}
rand_search_xgb = RandomizedSearchCV(
    xgb_base, param_distributions=param_dist_xgb, n_iter=10, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_xgb.fit(X_train, y_train)
best_xgb_params = rand_search_xgb.best_params_

# ----------- LightGBM 하이퍼파라미터 탐색 -----------
lgbm_base = LGBMClassifier(random_state=42, verbosity=-1)
param_dist_lgbm = {
    'n_estimators': randint(50, 500),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'reg_alpha': uniform(0, 2),
    'reg_lambda': uniform(0, 2)
}
rand_search_lgbm = RandomizedSearchCV(
    lgbm_base, param_distributions=param_dist_lgbm, n_iter=10, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_lgbm.fit(X_train, y_train)
best_lgbm_params = rand_search_lgbm.best_params_
best_lgbm_params['verbosity'] = -1

# ----------- RandomForest 하이퍼파라미터 탐색 -----------
rfc_base = RandomForestClassifier(random_state=42)
param_grid_rfc = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 7, 12, None],
    'min_samples_split': [2, 4, 8]
}
grid_search_rfc = GridSearchCV(
    rfc_base, param_grid_rfc, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1
)
grid_search_rfc.fit(X_train, y_train)
best_rfc_params = grid_search_rfc.best_params_

# ----------- CatBoost 하이퍼파라미터 탐색 -----------
catboost_base = CatBoostClassifier(random_state=42, verbose=0)
param_dist_cat = {
    'iterations': randint(60, 200),
    'depth': randint(3, 9),
    'learning_rate': uniform(0.03, 0.1),
    'l2_leaf_reg': uniform(0.5, 5)
}
rand_search_cat = RandomizedSearchCV(
    catboost_base, param_distributions=param_dist_cat, n_iter=8, cv=kfold, scoring='accuracy',
    verbose=0, n_jobs=-1, random_state=42
)
rand_search_cat.fit(X_train, y_train)
best_cat_params = rand_search_cat.best_params_
best_cat_params['verbose'] = 100

# ----------- 모델 객체 생성 (최적 하이퍼파라미터 적용) -----------

xgb_final = XGBClassifier(**best_xgb_params, random_state=42, eval_metric='logloss', tree_method='hist')
lgbm_final = LGBMClassifier(**best_lgbm_params, random_state=42)
rfc_final = RandomForestClassifier(**best_rfc_params, random_state=42)

# ** CatBoost 개별 학습 시 verbose와 eval_set으로 epoch별 로그 출력 및 그래프 **
cat_final = CatBoostClassifier(**best_cat_params, random_state=42)

# CatBoost 개별 학습(스태킹 전에 별도 학습 및 그래프)
cat_final.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    verbose=100,
    plot=True
)

# ----------- 스태킹 앙상블 구성 및 학습 -----------
stacking = StackingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgbm', lgbm_final),
        ('rf', rfc_final),
        ('cat', cat_final)  # CatBoost는 위에서 이미 학습했지만 객체 재사용
    ],
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=kfold,
    n_jobs=-1,
    passthrough=False
)
stacking.fit(X_train, y_train)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6777740	test: 0.6749331	best: 0.6749331 (0)	total: 284us	remaining: 33.3ms
100:	learn: 0.3847360	test: 0.3528984	best: 0.3528941 (99)	total: 24.1ms	remaining: 4.06ms
117:	learn: 0.3769383	test: 0.3513498	best: 0.3513498 (117)	total: 28.5ms	remaining: 0us

bestTest = 0.3513497927
bestIteration = 117

0:	learn: 0.6777740	total: 352us	remaining: 41.2ms
100:	learn: 0.3847360	total: 31ms	remaining: 5.22ms
117:	learn: 0.3769383	total: 35.9ms	remaining: 0us
0:	learn: 0.6783880	total: 5.97ms	remaining: 699ms
0:	learn: 0.6783102	total: 12ms	remaining: 1.4s
0:	learn: 0.6778333	total: 10.6ms	remaining: 1.24s
0:	learn: 0.6771635	total: 3.84ms	remaining: 449ms
0:	learn: 0.6774765	total: 2.2ms	remaining: 258ms
0:	learn: 0.6763377	total: 8.74ms	remaining: 1.02s
0:	learn: 0.6767096	total: 711us	remaining: 83.3ms
100:	learn: 0.3780496	total: 45.5ms	remaining: 7.66ms
100:	learn: 0.3800979	total: 37ms	remaining: 6.22ms
100:	learn: 0.3777190	total: 39.8ms	remaining: 6.71ms
100:	learn: 0.390186