In [2]:
# ## 1. 初始设置：加载库、数据和函数定义

# --- 1a. 导入所有需要的库 ---
import numpy as np
import pandas as pd
from scipy.stats import skew

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

import warnings
# NOTE: 'ignore' with no category/module filter silences *every* warning for the
# rest of the kernel session, including deprecation and convergence warnings
# raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Confirmation message printed once all imports have succeeded.
print("✅ 所有库已成功导入。")


# --- 1b. 加载数据并进行初始清洗 ---
# Read the Kaggle "House Prices" train/test CSVs. A missing dataset is reported
# with a message instead of a traceback; the success message is only printed
# when both reads completed.
try:
    train_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
    test_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
except FileNotFoundError:
    print("错误：请确保数据集已添加到Notebook环境。")
else:
    print("数据加载成功。")

# Remove the well-known outliers: very large living area (> 4000) sold for a
# low price (< 300000). Skipped when the training frame was never loaded.
if 'train_df' in locals():
    outlier_indices = train_df.index[
        (train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 300000)
    ]
    train_df = train_df.drop(index=outlier_indices)
    print(f"移除了 {len(outlier_indices)} 个GrLivArea异常值。")

✅ 所有库已成功导入。
数据加载成功。
移除了 2 个GrLivArea异常值。


In [3]:
# --- 1c. 定义我们最强的特征工程函数 ---
def feature_engineer_ultimate_final(df, skew_list_to_apply=None):
    """
    Build the engineered feature frame used by the models in this notebook.

    Steps, in order:
      1. Cast ``MSSubClass`` to ``str`` so it is handled as a category.
      2. Map the Po/Fa/TA/Gd/Ex quality columns to the integers 1..5.
      3. Impute missing values: ``'None'`` for a fixed list of categorical
         columns, ``0`` for a fixed list of numeric/quality columns, the
         per-``Neighborhood`` median for ``LotFrontage``, the column mode for
         a few categoricals and the column median for every remaining numeric
         column that still has gaps.
      4. Add ``TotalSF``, ``HouseAge``, ``RemodAge``, ``TotalBath``,
         ``Qual_x_TotalSF`` and ``Qual_x_HouseAge`` (ages are clipped at 0).
      5. Apply ``np.log1p`` to the skewed features.

    All imputation statistics (medians, modes) are computed from ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw feature frame; it is copied, never modified. The columns used by
        steps 1 and 4 and by the ``LotFrontage`` imputation (``MSSubClass``,
        ``Neighborhood``, ``LotFrontage``, ``TotalBsmtSF``, ``1stFlrSF``,
        ``2ndFlrSF``, ``YrSold``, ``YearBuilt``, ``YearRemodAdd``,
        ``BsmtFullBath``, ``BsmtHalfBath``, ``FullBath``, ``HalfBath``,
        ``OverallQual``) must be present. Columns that are only mapped or
        imputed are optional and skipped when absent.
    skew_list_to_apply : iterable of str, optional
        Names of the features to ``log1p``-transform. When ``None`` the list
        is computed here as every non-object, non-category column whose
        skewness exceeds 0.5. Pass the list returned for the training data
        when transforming validation/test data so both get the same columns
        transformed. Names not present in the frame are ignored.

    Returns
    -------
    tuple
        ``(engineered_frame, skew_list)`` where ``skew_list`` is either the
        argument that was passed in or the newly computed ``pandas.Index``.
    """
    df_fe = df.copy()
    df_fe['MSSubClass'] = df_fe['MSSubClass'].astype(str)

    quality_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    ordered_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual',
                    'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
    none_fill_cols = ['Alley', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Fence',
                      'GarageType', 'GarageFinish', 'MasVnrType', 'MiscFeature']
    zero_fill_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
                      'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea'] + ordered_cols
    mode_fill_cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Electrical',
                      'Functional', 'SaleType']

    # Every imputation loop uses the same "skip if absent" guard, so a frame
    # that lacks an optional column no longer raises KeyError.
    for col in ordered_cols:
        if col in df_fe.columns:
            df_fe[col] = df_fe[col].map(quality_map)
    for col in none_fill_cols:
        if col in df_fe.columns:
            df_fe[col] = df_fe[col].fillna('None')
    # Quality columns are included here: after mapping, a missing grade is NaN
    # and becomes 0.
    for col in zero_fill_cols:
        if col in df_fe.columns:
            df_fe[col] = df_fe[col].fillna(0)

    df_fe['LotFrontage'] = df_fe.groupby('Neighborhood')['LotFrontage'].transform(
        lambda x: x.fillna(x.median()))

    for col in mode_fill_cols:
        if col in df_fe.columns:
            modes = df_fe[col].mode()
            # mode() is empty when the column is entirely missing; leave such
            # a column untouched instead of failing on modes[0].
            if not modes.empty:
                df_fe[col] = df_fe[col].fillna(modes.iloc[0])

    # Catch-all: median-fill whatever numeric gaps remain (e.g. a neighbourhood
    # whose LotFrontage is entirely missing has a NaN group median).
    numeric_part = df_fe.select_dtypes(include=np.number)
    numerical_cols_with_na = numeric_part.columns[numeric_part.isnull().any()]
    for col in numerical_cols_with_na:
        df_fe[col] = df_fe[col].fillna(df_fe[col].median())

    # Derived features.
    df_fe['TotalSF'] = df_fe['TotalBsmtSF'] + df_fe['1stFlrSF'] + df_fe['2ndFlrSF']
    df_fe['HouseAge'] = df_fe['YrSold'] - df_fe['YearBuilt']
    df_fe['RemodAge'] = df_fe['YrSold'] - df_fe['YearRemodAdd']
    df_fe['TotalBath'] = (df_fe['BsmtFullBath'] + (0.5 * df_fe['BsmtHalfBath'])
                          + df_fe['FullBath'] + (0.5 * df_fe['HalfBath']))
    # A sale year earlier than the build/remodel year would give a negative age.
    df_fe['HouseAge'] = df_fe['HouseAge'].clip(0)
    df_fe['RemodAge'] = df_fe['RemodAge'].clip(0)
    df_fe['Qual_x_TotalSF'] = df_fe['OverallQual'] * df_fe['TotalSF']
    df_fe['Qual_x_HouseAge'] = df_fe['OverallQual'] * df_fe['HouseAge']

    if skew_list_to_apply is None:
        numerical_feats = df_fe.select_dtypes(exclude=["object", "category"]).columns
        skewed_feats = df_fe[numerical_feats].apply(lambda x: skew(x.dropna()))
        skew_list_to_apply = skewed_feats[skewed_feats > 0.5].index

    # NOTE: log1p is only defined for values > -1; this is not checked here.
    for feat in skew_list_to_apply:
        if feat in df_fe.columns:
            df_fe[feat] = np.log1p(df_fe[feat])
    return df_fe, skew_list_to_apply

# Progress markers: confirm the feature-engineering function is defined and
# that part 1 of the notebook is finished.
print("✅ 特征工程函数已定义。")
print("\n--- Notebook V2 第一部分完成 ---")

✅ 特征工程函数已定义。

--- Notebook V2 第一部分完成 ---


In [4]:
# ## 2. 准备工作：定义完整数据集和模型参数

# --- 2a. 准备完整的、干净的 X_full 和 y_log ---
# 从第一部分加载并清洗过的 train_df 创建
# Build the full feature frame and the log1p-transformed target from the
# cleaned training frame; report an error if that frame was never created.
if 'train_df' not in locals():
    print("‼️ 错误：train_df 未定义，请先运行第一部分的代码。")
else:
    # Indices are reset so they run 0..n-1 again after the outlier rows were dropped.
    X_full = train_df.drop(columns=['Id', 'SalePrice']).reset_index(drop=True)
    y_full = train_df['SalePrice'].reset_index(drop=True)
    y_log = np.log1p(y_full)

    print("✅ 完整的 X_full 和 y_log 已准备就绪。")
    print(f"X_full 的形状: {X_full.shape}")
    print(f"y_log 的形状: {y_log.shape}")


# --- 2b. Best hyperparameters found so far ---
# The "champion" settings obtained earlier with GridSearchCV, one dict per model.
lgbm_best_params = {
    'learning_rate': 0.05,
    'n_estimators': 200,
    'num_leaves': 20,
    'random_state': 42,
    'verbosity': -1,
}
xgb_best_params = {
    'learning_rate': 0.05,
    'max_depth': 3,
    'n_estimators': 500,
    'random_state': 42,
}
cat_best_params = {
    'depth': 6,
    'iterations': 500,
    'learning_rate': 0.1,
    'random_state': 42,
}

# Progress markers: confirm the parameter dictionaries for the three models
# are defined and that part 2 of the notebook is finished.
print("\n✅ 三巨头的最佳参数已定义。")
print("\n--- Notebook V2 第二部分完成 ---")

✅ 完整的 X_full 和 y_log 已准备就绪。
X_full 的形状: (1458, 79)
y_log 的形状: (1458,)

✅ 三巨头的最佳参数已定义。

--- Notebook V2 第二部分完成 ---


In [5]:
# ## 3. 核心引擎：K-折交叉验证函数

def run_kfold_cv(model_class, params, X, y, n_splits=5, model_type='lgbm'):
    """
    Evaluate one model configuration with shuffled K-fold cross-validation.

    For every fold the training and validation parts are passed separately
    through ``feature_engineer_ultimate_final``; the skew list computed on the
    training part is reused for the validation part. A fresh model is then
    fitted on the training part and scored on the validation part.

    Parameters
    ----------
    model_class : type
        Estimator class to instantiate for each fold (e.g. ``lgb.LGBMRegressor``).
    params : dict
        Keyword arguments passed to ``model_class``. On the CatBoost path it
        must not contain ``cat_features`` or ``verbose``, which are supplied here.
    X : pandas.DataFrame
        Full raw feature frame; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Target on the ``log1p`` scale (predictions and targets are mapped back
        with ``expm1`` before scoring).
    n_splits : int, default 5
        Number of folds.
    model_type : str, default 'lgbm'
        ``'catboost'`` feeds the engineered frame directly to the model and
        declares the object columns as ``cat_features``. Any other value
        (``'lgbm'``, ``'xgb'``, ...) takes the one-hot-encoding path.

    Returns
    -------
    tuple of float
        ``(mean_score, std_score)``: mean and (population) standard deviation
        of the per-fold RMSLE values.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = []

    print(f"--- 开始对 {model_type.upper()} 模型进行 {n_splits}-折交叉验证 ---")

    for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
        print(f"  --- 第 {fold+1}/{n_splits} 折 ---")

        # 1. Split the data for this fold.
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # 2. Feature engineering; the validation part reuses the training skew list.
        X_train_fold_pro, skew_list = feature_engineer_ultimate_final(X_train_fold)
        X_val_fold_pro, _ = feature_engineer_ultimate_final(X_val_fold, skew_list_to_apply=skew_list)

        # Both paths need the categorical (object-dtype) columns of the training part.
        categorical_cols = [c for c in X_train_fold_pro.columns if X_train_fold_pro[c].dtype == 'object']

        # 3. Model-specific preprocessing and training.
        if model_type == 'catboost':
            # CatBoost consumes the engineered frame as-is.
            model = model_class(**params, cat_features=categorical_cols, verbose=0)
            model.fit(X_train_fold_pro, y_train_fold)
            preds_log = model.predict(X_val_fold_pro)
        else:
            # Other models get median-imputed numerics plus one-hot encoded
            # categoricals. Numeric columns are selected by kind rather than
            # by an exact 'int64'/'float64' match: a column listed in neither
            # transformer is dropped by ColumnTransformer (remainder='drop'),
            # so e.g. an int32 or float32 feature would otherwise vanish silently.
            numerical_cols = X_train_fold_pro.select_dtypes(include=np.number).columns.tolist()

            preprocessor = ColumnTransformer(transformers=[
                ('num', SimpleImputer(strategy='median'), numerical_cols),
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)])

            X_train_fold_processed = preprocessor.fit_transform(X_train_fold_pro)
            X_val_fold_processed = preprocessor.transform(X_val_fold_pro)

            model = model_class(**params)
            model.fit(X_train_fold_processed, y_train_fold)
            preds_log = model.predict(X_val_fold_processed)

        # 4. Score on the original scale. Negative predictions are clamped to 0
        #    because mean_squared_log_error rejects negative values.
        y_val_orig = np.expm1(y_val_fold)
        preds_orig = np.maximum(np.expm1(preds_log), 0)

        score = np.sqrt(mean_squared_log_error(y_val_orig, preds_orig))
        cv_scores.append(score)
        print(f"    第 {fold+1} 折的分数: {score:.5f}")

    # 5. Summary over all folds.
    mean_score = np.mean(cv_scores)
    std_score = np.std(cv_scores)
    print("="*50)
    print(f"{model_type.upper()} 模型交叉验证完成。")
    print(f"平均分 (Mean RMSLE): {mean_score:.5f}")
    print(f"标准差 (Std Dev): {std_score:.5f}")
    print("="*50)

    return mean_score, std_score

# Progress markers: confirm the cross-validation function is defined and that
# part 3 of the notebook is finished.
print("✅ 交叉验证核心引擎函数已定义。")
print("\n--- Notebook V2 第三部分完成 ---")

✅ 交叉验证核心引擎函数已定义。

--- Notebook V2 第三部分完成 ---


In [6]:
# ## 4. 实验与分析

# --- 4a. Evaluate the tuned CatBoost model ---
# CatBoost was the best model in the earlier single-split validation, so its
# cross-validated score is checked first.
cat_cv_mean, cat_cv_std = run_kfold_cv(
    cb.CatBoostRegressor, cat_best_params, X_full, y_log, model_type='catboost'
)

# --- 4b. Evaluate the tuned XGBoost model ---
xgb_cv_mean, xgb_cv_std = run_kfold_cv(
    xgb.XGBRegressor, xgb_best_params, X_full, y_log, model_type='xgb'
)

# --- 4c. Evaluate the tuned LightGBM model ---
lgbm_cv_mean, lgbm_cv_std = run_kfold_cv(
    lgb.LGBMRegressor, lgbm_best_params, X_full, y_log, model_type='lgbm'
)

# --- 4d. Collect the results into a leaderboard, best (lowest RMSLE) first ---
print("\n" + 50 * "#")
print("### 最终模型排行榜 (基于5折交叉验证) ###")
print(50 * "#")

results_df = pd.DataFrame(
    [
        ('CatBoost', cat_cv_mean, cat_cv_std),
        ('XGBoost', xgb_cv_mean, xgb_cv_std),
        ('LightGBM', lgbm_cv_mean, lgbm_cv_std),
    ],
    columns=['Model', 'Mean RMSLE', 'Std Dev'],
)
results_df = results_df.sort_values(by='Mean RMSLE')

print(results_df)

print("\n--- Notebook V2 第四部分完成 ---")

--- 开始对 CATBOOST 模型进行 5-折交叉验证 ---
  --- 第 1/5 折 ---
    第 1 折的分数: 0.12103
  --- 第 2/5 折 ---
    第 2 折的分数: 0.11207
  --- 第 3/5 折 ---
    第 3 折的分数: 0.11970
  --- 第 4/5 折 ---
    第 4 折的分数: 0.12947
  --- 第 5/5 折 ---
    第 5 折的分数: 0.10584
CATBOOST 模型交叉验证完成。
平均分 (Mean RMSLE): 0.11762
标准差 (Std Dev): 0.00807
--- 开始对 XGB 模型进行 5-折交叉验证 ---
  --- 第 1/5 折 ---
    第 1 折的分数: 0.12242
  --- 第 2/5 折 ---
    第 2 折的分数: 0.11852
  --- 第 3/5 折 ---
    第 3 折的分数: 0.12762
  --- 第 4/5 折 ---
    第 4 折的分数: 0.12656
  --- 第 5/5 折 ---
    第 5 折的分数: 0.10992
XGB 模型交叉验证完成。
平均分 (Mean RMSLE): 0.12101
标准差 (Std Dev): 0.00641
--- 开始对 LGBM 模型进行 5-折交叉验证 ---
  --- 第 1/5 折 ---
    第 1 折的分数: 0.12969
  --- 第 2/5 折 ---
    第 2 折的分数: 0.11622
  --- 第 3/5 折 ---
    第 3 折的分数: 0.12943
  --- 第 4/5 折 ---
    第 4 折的分数: 0.13061
  --- 第 5/5 折 ---
    第 5 折的分数: 0.11254
LGBM 模型交叉验证完成。
平均分 (Mean RMSLE): 0.12370
标准差 (Std Dev): 0.00770

##################################################
### 最终模型排行榜 (基于5折交叉验证) ###
#################################