# In [1]:
# ## Final submission module ("full firepower" edition)

import numpy as np
import pandas as pd
from scipy.stats import skew
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

# --- 1. 定义我们最终的特征工程函数 ---
def feature_engineer_ultimate_final(df, skew_list_to_apply=None):
    """Impute, encode and derive features on a copy of ``df``.

    The input frame is never modified. Imputation and encoding steps are
    applied only to the columns that are actually present, so a frame that
    lacks some of the optional columns (e.g. ``Alley`` or ``PoolQC``) is
    processed instead of raising ``KeyError``. The derived features still
    require their source columns (``TotalBsmtSF``, ``1stFlrSF``, ``2ndFlrSF``,
    ``YrSold``, ``YearBuilt``, ``YearRemodAdd``, the four bath counts and
    ``OverallQual``).

    Args:
        df: Raw feature frame.
        skew_list_to_apply: Column names to transform with ``log1p``. When
            ``None``, the list is computed from this frame: every numeric
            column whose skewness exceeds 0.5. Pass the list returned for the
            training frame when processing the test frame so both receive the
            same transform. Names that are not columns are ignored.

    Returns:
        A ``(frame, skew_list)`` tuple: the engineered copy and the list of
        columns that was used for the ``log1p`` step (the argument itself
        when one was supplied).
    """
    df_fe = df.copy()

    def _present(cols):
        # Keep only the names that exist in the frame, preserving order.
        return [c for c in cols if c in df_fe.columns]

    # MSSubClass is a code, not a quantity: treat it as a categorical string.
    if 'MSSubClass' in df_fe.columns:
        df_fe['MSSubClass'] = df_fe['MSSubClass'].astype(str)

    # Ordinal quality grades -> integers; anything outside the map becomes NaN
    # and is zero-filled below.
    quality_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    ordered_cols = [
        'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC',
        'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
    ]
    for col in _present(ordered_cols):
        df_fe[col] = df_fe[col].map(quality_map)

    # Categorical columns where a missing value is replaced by the label 'None'.
    none_fill_cols = [
        'Alley', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Fence',
        'GarageType', 'GarageFinish', 'MasVnrType', 'MiscFeature',
    ]
    for col in _present(none_fill_cols):
        df_fe[col] = df_fe[col].fillna('None')

    # Numeric columns (plus the mapped quality grades) where missing -> 0.
    zero_fill_cols = [
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
        'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
    ]
    for col in _present(zero_fill_cols + ordered_cols):
        df_fe[col] = df_fe[col].fillna(0)

    # LotFrontage: fill with the median of the same neighbourhood. Groups with
    # no observed value stay NaN here and fall through to the global median
    # fill further down.
    if 'LotFrontage' in df_fe.columns and 'Neighborhood' in df_fe.columns:
        df_fe['LotFrontage'] = df_fe.groupby('Neighborhood')['LotFrontage'].transform(
            lambda s: s.fillna(s.median())
        )

    # Remaining categoricals: fill with the most frequent value. A column with
    # no observed values has no mode and is left untouched.
    mode_fill_cols = [
        'MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Electrical',
        'Functional', 'SaleType',
    ]
    for col in _present(mode_fill_cols):
        modes = df_fe[col].mode()
        if not modes.empty:
            df_fe[col] = df_fe[col].fillna(modes.iloc[0])

    # Any numeric column that still has gaps gets its own median.
    numeric = df_fe.select_dtypes(include=np.number)
    for col in numeric.columns[numeric.isnull().any()]:
        df_fe[col] = df_fe[col].fillna(df_fe[col].median())

    # Derived features.
    df_fe['TotalSF'] = df_fe['TotalBsmtSF'] + df_fe['1stFlrSF'] + df_fe['2ndFlrSF']
    df_fe['HouseAge'] = df_fe['YrSold'] - df_fe['YearBuilt']
    df_fe['RemodAge'] = df_fe['YrSold'] - df_fe['YearRemodAdd']
    df_fe['TotalBath'] = (
        df_fe['BsmtFullBath'] + (0.5 * df_fe['BsmtHalfBath'])
        + df_fe['FullBath'] + (0.5 * df_fe['HalfBath'])
    )
    # A sale recorded before the build/remodel year would give a negative age.
    df_fe['HouseAge'] = df_fe['HouseAge'].clip(0)
    df_fe['RemodAge'] = df_fe['RemodAge'].clip(0)
    df_fe['Qual_x_TotalSF'] = df_fe['OverallQual'] * df_fe['TotalSF']
    df_fe['Qual_x_HouseAge'] = df_fe['OverallQual'] * df_fe['HouseAge']

    # Decide which columns to log-transform only when the caller did not
    # supply a list; restricting to numeric dtypes keeps skew() away from
    # string-like columns.
    if skew_list_to_apply is None:
        numeric = df_fe.select_dtypes(include=np.number)
        skewness = numeric.apply(lambda s: skew(s.dropna()))
        skew_list_to_apply = skewness[skewness > 0.5].index

    for feat in skew_list_to_apply:
        if feat in df_fe.columns:
            df_fe[feat] = np.log1p(df_fe[feat])
    return df_fe, skew_list_to_apply

# --- 2. Load all data and prepare it ---
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Remove outliers: very large living area sold at a low price.
outlier_indices = train_df[(train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 300000)].index
train_df = train_df.drop(outlier_indices)
test_ids = test_df['Id']

# Full training set X_full with log-transformed target y_log, and test set X_test.
X_full = train_df.drop(['Id', 'SalePrice'], axis=1)
y_log = np.log1p(train_df['SalePrice'])
X_test = test_df.drop('Id', axis=1)

# --- 3. Apply feature engineering ---
# The skew list is derived from the training frame and reused for the test
# frame so both receive the same log1p transform.
X_full_pro, skew_list = feature_engineer_ultimate_final(X_full)
X_test_pro, _ = feature_engineer_ultimate_final(X_test, skew_list_to_apply=skew_list)

# --- 4. Train the three final models (fixed parameters, 100% of the training data) ---
print("正在用全部数据重新训练最终模型...")

# Partition the columns once. Every column is either numeric or categorical,
# so none can fall between two dtype-name checks and be silently dropped by
# the ColumnTransformer below (its default is remainder='drop').
numerical_cols = X_full_pro.select_dtypes(include=np.number).columns.tolist()
_numeric_names = set(numerical_cols)
categorical_cols = [cname for cname in X_full_pro.columns if cname not in _numeric_names]
categorical_cols_names = list(categorical_cols)

# a) CatBoost: trained on the engineered frame, with the categorical columns
#    passed via cat_features.
cat_best_params = {'depth': 6, 'iterations': 500, 'learning_rate': 0.1, 'random_state': 42}
final_cat = cb.CatBoostRegressor(**cat_best_params, cat_features=categorical_cols_names, verbose=0)
final_cat.fit(X_full_pro, y_log)

# b) LightGBM & XGBoost: trained on the preprocessed matrix (median imputation
#    for numeric columns, one-hot encoding for categorical columns).
preprocessor = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])
X_full_processed = preprocessor.fit_transform(X_full_pro)
X_test_processed = preprocessor.transform(X_test_pro)

lgbm_best_params = {'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 20, 'random_state': 42, 'verbosity': -1}
final_lgbm = lgb.LGBMRegressor(**lgbm_best_params)
final_lgbm.fit(X_full_processed, y_log)

xgb_best_params = {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500, 'random_state': 42}
final_xgb = xgb.XGBRegressor(**xgb_best_params)
final_xgb.fit(X_full_processed, y_log)
print("✅ 所有最终模型训练完成！")

# --- 5. Predict and blend ---
preds_log_cat = final_cat.predict(X_test_pro)
preds_log_lgbm = final_lgbm.predict(X_test_processed)
preds_log_xgb = final_xgb.predict(X_test_processed)

# Models were fitted on log1p(SalePrice); invert before blending.
preds_cat = np.expm1(preds_log_cat)
preds_lgbm = np.expm1(preds_log_lgbm)
preds_xgb = np.expm1(preds_log_xgb)

# Weighted average in price space.
final_predictions = 0.6 * preds_cat + 0.2 * preds_xgb + 0.2 * preds_lgbm

# --- 6. Write the submission file ---
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': final_predictions})
submission.to_csv('submission_ultimate_v2.csv', index=False)
print("\n🎉🎉🎉 “火力全开”版的最终提交文件 'submission_ultimate_v2.csv' 已成功生成！🎉🎉🎉")

# Output:
# 正在用全部数据重新训练最终模型...
# ✅ 所有最终模型训练完成！
#
# 🎉🎉🎉 “火力全开”版的最终提交文件 'submission_ultimate_v2.csv' 已成功生成！🎉🎉🎉
