# 타이타닉 모델로 파이프라인 및 Stacking 모델 구현
- 정답은 kaggle_titanic 폴더 참고

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, StackingClassifier
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
import xgboost as xgb
import lightgbm as lgb
import warnings
import joblib
import os
import json
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.linear_model import LogisticRegression

# Silence UserWarning messages that originate from the lightgbm module.
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# 1. Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Predictor columns picked during EDA: passenger class, sex, age,
# number of companions (SibSp, Parch), fare, port of embarkation.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'  # survival label (0: died, 1: survived)

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability).
X = train[features]
y = train[target]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X)

# Split the predictors into numeric and categorical column lists by dtype.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)


# 2. Preprocessing pipelines
def _ordinal_pipeline(ordered_categories):
    """Return a pipeline that mode-imputes one column, then ordinal-encodes it.

    `ordered_categories` lists the category labels in the order they should be
    mapped to 0, 1, 2, ...
    """
    return Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[ordered_categories])),
    ])


# Numeric columns: fill missing values with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# One pipeline per categorical column, each with an explicit category order.
sex_transformer = _ordinal_pipeline(['male', 'female'])
embarked_transformer = _ordinal_pipeline(['C', 'Q', 'S'])

# Route each categorical column to its own pipeline.
categorical_transformer = ColumnTransformer(transformers=[
    ('sex', sex_transformer, ['Sex']),
    ('embarked', embarked_transformer, ['Embarked']),
])

# Top-level preprocessor: numeric block plus the nested categorical block.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Base models to train
models = {
    # Random forest classifier
    'RandomForest': RandomForestClassifier(random_state=42),
    # Support vector classifier; probability=True requests probability estimates
    'SVM': SVC(random_state=42, probability=True),
    # Logistic regression
    'LogisticRegression': LogisticRegression(random_state=42),
}

     Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         3    male  22.0      1      0   7.2500        S
1         1  female  38.0      1      0  71.2833        C
2         3  female  26.0      0      0   7.9250        S
3         1  female  35.0      1      0  53.1000        S
4         3    male  35.0      0      0   8.0500        S
..      ...     ...   ...    ...    ...      ...      ...
886       2    male  27.0      0      0  13.0000        S
887       1  female  19.0      0      0  30.0000        S
888       3  female   NaN      1      2  23.4500        S
889       1    male  26.0      0      0  30.0000        C
890       3    male  32.0      0      0   7.7500        Q

[891 rows x 7 columns]


In [34]:
# Train each base model in its own pipeline and evaluate it on the hold-out split.
from sklearn.base import clone

individual_results = {}  # model name -> validation RMSE
trained_models = {}      # model name -> fitted pipeline

for name, model in models.items():
    # Give every pipeline its own copy of the preprocessor. Pipeline does not
    # clone its steps, so passing `preprocessor` directly would make all
    # pipelines share (and refit in place) one transformer object.
    pipe = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('reg', model)
    ])

    # Fit on the training split
    pipe.fit(X_train, y_train)

    # Score on the validation split.
    # NOTE: labels and predictions are both 0/1 here, so the MSE equals the
    # misclassification rate and accuracy can be recovered as 1 - RMSE**2.
    y_val_pred = pipe.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"{name} 모델 RMSE : {val_rmse}")
    individual_results[name] = val_rmse
    trained_models[name] = pipe

RandomForest 모델 RMSE : 0.42936877145344576
SVM 모델 RMSE : 0.42936877145344576
LogisticRegression 모델 RMSE : 0.43582580703557733


In [35]:
# Reshape the trained pipelines into the list of (name, estimator) tuples that
# StackingClassifier expects for its `estimators` parameter.
stacking_names = [*trained_models]
stacking_models = [*trained_models.values()]

# dict.items() already yields (name, estimator) pairs in insertion order,
# which is the same pairing zip(names, models) would produce.
estimators = list(trained_models.items())

# Stack the base pipelines under a logistic-regression final estimator,
# using cv=5 and n_jobs=-1.
stacking_classifier = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1,
)

stacking_classifier.fit(X_train, y_train)

In [36]:
# Validate the stacked ensemble on the hold-out split.
y_val_pred = stacking_classifier.predict(X_val)
ensemble_rmse = np.sqrt(
    mean_squared_error(y_true=y_val, y_pred=y_val_pred)
)
ensemble_rmse

np.float64(0.4421885641408914)

In [37]:
# Predict on the test set using the same feature columns as training.
X_test = test.loc[:, features]
test_predictions = stacking_classifier.predict(X_test)

# Show the predictions as a single-column frame.
pd.DataFrame(data={'Survived': test_predictions})

Unnamed: 0,Survived
0,0
1,0
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0
