# GBDT模型建立与应用

本notebook加载经过特征工程与t-SNE降维处理的数据，依次进行GBDT建模与随机超参数搜索、RFE特征选择、XGBoost对比和Stacking集成，并保存最佳GBDT模型及相关建模对象。

In [5]:
import joblib
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import uniform, randint

# Load the preprocessed data bundle and pull out the training matrix,
# the target vector and the feature names.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
processed_data = joblib.load('../data/processed_data.pkl')
X_train, y_train, feature_names = (
    processed_data[key] for key in ('X_train', 'y_train', 'feature_names')
)

# Quick sanity check on the loaded array shapes.
print(f"训练特征形状: {X_train.shape}")
print(f"目标变量形状: {y_train.shape}")

训练特征形状: (1460, 287)
目标变量形状: (1460,)


In [6]:
# Hyperparameter search space for the GBDT regressor.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(100, 1000),    # 100 .. 999 boosting stages
    'max_depth': randint(3, 10),           # tree depth 3 .. 9
    'learning_rate': uniform(0.01, 0.2),   # [0.01, 0.21]
    'subsample': uniform(0.6, 0.4),        # [0.6, 1.0]
}

# Randomized search: 100 sampled configurations, 5-fold CV, scored by
# negated MSE (higher is better), using all available cores.
random_search = RandomizedSearchCV(
    # Seed the estimator as well: with subsample < 1.0 each boosting stage
    # draws a random row subset, so an unseeded model makes the search
    # results irreproducible even though the parameter sampler is seeded.
    estimator=GradientBoostingRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42  # seeds the hyperparameter sampling
)
random_search.fit(X_train, y_train)
print("最佳参数组合：", random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
最佳参数组合： {'learning_rate': 0.04499098541918724, 'max_depth': 3, 'n_estimators': 671, 'subsample': 0.7043316699321636}


In [7]:
# Evaluate the tuned GBDT model and persist it to disk.
import os

# Best configuration found by the search (refit by RandomizedSearchCV on the
# data passed to fit, since refit is left at its default).
best_gbdt_model = random_search.best_estimator_

# In-sample metrics: predictions are made on the same X_train the model was
# fitted on, so these numbers measure fit to the training data, not
# generalization.
y_train_pred = best_gbdt_model.predict(X_train)
gbdt_mse = mean_squared_error(y_train, y_train_pred)
gbdt_mae = mean_absolute_error(y_train, y_train_pred)
gbdt_r2 = r2_score(y_train, y_train_pred)
print(f'优化后的GBDT模型的均方误差: {gbdt_mse:.2f}')
print(f'优化后的GBDT模型的平均绝对误差: {gbdt_mae:.2f}')
print(f'优化后的GBDT模型的R-squared: {gbdt_r2:.2f}')

# Cross-validated MSE of the selected configuration. The search is scored
# with 'neg_mean_squared_error', so best_score_ is the negated mean MSE over
# the CV folds; this is the honest estimate of out-of-sample error.
gbdt_cv_mse = -random_search.best_score_
print(f'优化后的GBDT模型的交叉验证均方误差: {gbdt_cv_mse:.4f}')

# Save the model; create the output directory first so the dump does not
# fail on a fresh checkout.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_gbdt_model, '../models/best_gbdt_model.pkl', compress=3)
print("最优GBDT模型已保存到 ../models/best_gbdt_model.pkl")

优化后的GBDT模型的均方误差: 0.00
优化后的GBDT模型的平均绝对误差: 0.04
优化后的GBDT模型的R-squared: 0.98
最优GBDT模型已保存到 ../models/best_gbdt_model.pkl


In [8]:
# Recursive feature elimination (RFE) with GBDT, then refit on the kept features.
from sklearn.feature_selection import RFE

# Reuse the tuned hyperparameters, but pin the seed: subsample is searched
# below 1.0, which makes GBDT fitting stochastic, so without a seed the
# selected features (and the reported error) change from run to run.
rfe_gbdt_params = dict(random_search.best_params_, random_state=42)

# Drop 10 features per elimination round until 30 remain.
rfe = RFE(
    estimator=GradientBoostingRegressor(**rfe_gbdt_params),
    n_features_to_select=30,
    step=10,
)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)

# Fit a fresh GBDT on the reduced feature matrix.
rfe_gbdt = GradientBoostingRegressor(**rfe_gbdt_params)
rfe_gbdt.fit(X_train_rfe, y_train)

# In-sample MSE (computed on the training data itself).
y_train_pred_rfe = rfe_gbdt.predict(X_train_rfe)
rfe_mse = mean_squared_error(y_train, y_train_pred_rfe)
print(f'RFE特征选择后GBDT模型的均方误差: {rfe_mse:.2f}')

RFE特征选择后GBDT模型的均方误差: 0.00


In [9]:
# XGBoost baseline with fixed hyperparameters.
from xgboost import XGBRegressor

xgb_params = {
    'n_estimators': 300,
    'learning_rate': 0.05,
    'max_depth': 5,
    'subsample': 0.8,
    'random_state': 42,
}
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# In-sample MSE: predictions are made on the data the model was fitted on.
y_train_pred_xgb = xgb_model.predict(X_train)
xgb_mse = mean_squared_error(y_train, y_train_pred_xgb)
print(f'XGBoost模型的均方误差: {xgb_mse:.2f}')

XGBoost模型的均方误差: 0.00


In [10]:
# Stacking ensemble: GBDT and XGBoost base learners combined by a linear
# regression meta-learner.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor

stacking_model = StackingRegressor(
    estimators=[
        # Tuned GBDT hyperparameters; seeded so the ensemble is reproducible,
        # matching the already-seeded XGBoost base learner below.
        ('gbdt', GradientBoostingRegressor(**random_search.best_params_, random_state=42)),
        ('xgb', XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, subsample=0.8, random_state=42))
    ],
    final_estimator=LinearRegression()
)
stacking_model.fit(X_train, y_train)

# In-sample MSE: predictions are made on the data the ensemble was fitted on.
y_train_pred_stack = stacking_model.predict(X_train)
stack_mse = mean_squared_error(y_train, y_train_pred_stack)
print(f'Stacking集成模型的均方误差: {stack_mse:.2f}')

Stacking集成模型的均方误差: 0.00


In [11]:
# Persist the modelling objects and their MSE values for later
# visualization and analysis.
import joblib

# Names that may not be defined in the current namespace; any that are
# missing are stored as None instead of raising NameError.
scope = locals()
optional_names = (
    'rfe_gbdt',
    'xgb_model',
    'stacking_model',
    'gbdt_mse',
    'rfe_mse',
    'xgb_mse',
    'stack_mse',
)

# random_search is required; the rest are looked up by name.
model_objects = {'random_search': random_search}
for name in optional_names:
    model_objects[name] = scope.get(name)

joblib.dump(model_objects, '../models/model_objects.pkl', compress=3)
print("建模相关对象已保存到 ../models/model_objects.pkl")

建模相关对象已保存到 ../models/model_objects.pkl
