# 코드 리뷰

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
import xgboost as xgb
import lightgbm as lgb
import warnings
import joblib
import os
import json
from datetime import datetime

# LightGBM 경고 메시지 무시
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# 1. 데이터 준비
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# EDA에서 추출된 주요 변수
main_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'BsmtFinSF1', 'LotFrontage',
    'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'HalfBath', 'LotArea',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'BsmtQual', 'GarageType',
    'SaleCondition'
]

# 학습/검증 데이터 분리
X = train[main_features]
y = train['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 수치형/범주형 분리
numeric_feats = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_feats = X.select_dtypes(include=['object']).columns.tolist()

# 2. 전처리 파이프라인
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, categorical_feats)
])

# 3. 개별 모델 정의
base_models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        random_state=42
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    )
}

print('각 모델 트레이닝 실시')

각 모델 트레이닝 실시


# 개별 모델 학습 및 평가

In [9]:
individual_results = {}
trained_models = {}

for name, model in base_models.items():
    # print(name, model)
    print(f"\nTraining {name}...")
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('reg', model)
    ])

    # 학습
    pipe.fit(X_train, y_train)
    print(f"\nTraining {name} Completed...!!")

    # 검증
    y_val_pred = pipe.predict(X_val)  # 이 코드와 28일 코드 비교하면서 파이프라인이 얼마나 유용한 지 확인
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"{name} 모델 RMSE : {val_rmse}")
    individual_results[name] = val_rmse
    trained_models[name] = pipe


Training Ridge...

Training Ridge Completed...!!
Ridge 모델 RMSE : 32886.611682055016

Training Lasso...

Training Lasso Completed...!!
Lasso 모델 RMSE : 32993.20265185749

Training ElasticNet...

Training ElasticNet Completed...!!
ElasticNet 모델 RMSE : 37908.45726466044

Training XGBoost...


  model = cd_fast.enet_coordinate_descent(



Training XGBoost Completed...!!
XGBoost 모델 RMSE : 27164.23648844193

Training LightGBM...

Training LightGBM Completed...!!
LightGBM 모델 RMSE : 31217.708559019466




In [10]:
individual_results

{'Ridge': np.float64(32886.611682055016),
 'Lasso': np.float64(32993.20265185749),
 'ElasticNet': np.float64(37908.45726466044),
 'XGBoost': np.float64(27164.23648844193),
 'LightGBM': np.float64(31217.708559019466)}

In [29]:
print(trained_models)

{'Ridge': Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['OverallQual', 'GrLivArea',
                                                   'GarageCars', 'GarageArea',
                                                   'TotalBsmtSF', '1stFlrSF',
                                                   'FullBath', 'TotRmsAbvGrd',
                                                   'YearBuilt', 'YearRemodAdd',
                                                   'GarageYrBlt', 'MasVnrArea',
                                                   'Fireplaces', 'BsmtFinSF1',
                  

# 스태킹(Stacking) 앙상블 모델 생성
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.StackingRegressor.html

- 현재 trained_models 상태 : 딕셔너리 형태
- 하지만 StackingRegressor 클래스가 요구하는 입력 방식은 리스트 형태 + 리스트 내부는 튜플 형태로 구성
- 튜플 형태 => (모델의 이름, 모델의 객체)

In [30]:
stacking_models = list(trained_models.values())
stacking_names = list(trained_models.keys())

# StackingRegressor 매개변수(estimators) 생성
estimators = list(zip(stacking_names, stacking_models)) # zip함수 출력 == 튜플 형태
# print(estimators)

stacking_regressor = StackingRegressor(
    estimators = estimators,
    final_estimator = Ridge(alpha=1.0),
    cv=5,
    n_jobs = -1   # 가장 일반적인 병렬 처리 설정
)

stacking_regressor.fit(X_train, y_train)

In [25]:
# 앙상블 모델 검증
y_val_pred = stacking_regressor.predict(X_val)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
ensemble_rmse



np.float64(28165.657335583746)

# 모델 테스트

In [28]:
X_test = test[main_features]
test_predictions = stacking_regressor.predict(X_test)

pd.DataFrame({
    'Id' : test['Id'],
    'SalePrice' : test_predictions
})



Unnamed: 0,Id,SalePrice
0,1461,129808.364039
1,1462,164551.512067
2,1463,186657.585677
3,1464,190667.914572
4,1465,206828.676687
...,...,...
1454,2915,68641.914990
1455,2916,72949.239007
1456,2917,160327.211971
1457,2918,112886.382235


# 미션
- 타이타닉 데이터 활용해서 분류 모델 만들어보기
- Stacking 방식으로 -> 분류니까 StackingClassifier 활용