In [9]:
# 导入必要的库
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from skopt import BayesSearchCV  # 导入贝叶斯优化
from skopt.space import Real, Integer

In [4]:
# 1. Load data
# The first two columns of the feature sheets are skipped (iloc[:, 2:]);
# the label sheets are used in full.
X_train = pd.read_excel("Xtrain.xlsx").iloc[:, 2:].to_numpy()  # training features
y_train = pd.read_excel("ytrain.xlsx").to_numpy()  # training labels
X_test = pd.read_excel("xtest.xlsx").iloc[:, 2:].to_numpy()  # test features
y_test = pd.read_excel("ytest.xlsx").to_numpy()  # test labels

# Features and labels come from separate files, so make sure they line up
# before a shared row mask is applied to both.
assert len(X_train) == len(y_train), "Xtrain.xlsx and ytrain.xlsx have different row counts"
assert len(X_test) == len(y_test), "xtest.xlsx and ytest.xlsx have different row counts"

# 2. Handle missing values (drop every row with a NaN in the features OR the label)
# pd.isna is used instead of np.isnan because it also works on object-dtype
# arrays, which read_excel can produce for mixed-type columns.
train_mask = ~(pd.isna(X_train).any(axis=1) | pd.isna(y_train).any(axis=1))
test_mask = ~(pd.isna(X_test).any(axis=1) | pd.isna(y_test).any(axis=1))

X_train = X_train[train_mask]
y_train = y_train[train_mask]
X_test = X_test[test_mask]
y_test = y_test[test_mask]

# 3. Feature scaling
# No scaling is actually applied: the features are passed through unchanged.
# The *_scaled names are kept only because the following cells refer to them.
X_train_scaled = X_train
X_test_scaled = X_test

In [None]:
# 4. Base XGBoost regressor (squared-error objective, RMSE as eval metric)
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'super' object has no attribute '__sklearn_tags__'".
# That looks like a version mismatch between the installed scikit-learn and
# xgboost / scikit-optimize rather than a problem in this code - confirm the
# package versions in the environment.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
)

# 5. Search space explored by the Bayesian optimiser
param_space = dict(
    n_estimators=Integer(100, 1000),    # number of trees
    max_depth=Integer(3, 10),           # maximum tree depth
    learning_rate=Real(0.01, 0.3),      # learning rate
    subsample=Real(0.5, 1.0),           # row subsampling ratio
    colsample_bytree=Real(0.5, 1.0),    # per-tree feature subsampling ratio
)

opt = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

# 6. Run the search on the training split
opt.fit(X_train_scaled, y_train)

# Report the best hyper-parameters found
print("Best Hyperparameters:", opt.best_params_)




AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [15]:
# 7. Take the best estimator found by the hyper-parameter search
best_model = opt.best_estimator_

# 8. Predict on the training and test splits
y_train_pred = best_model.predict(X_train_scaled)
y_test_pred = best_model.predict(X_test_scaled)

# 9. Inverse-transform predictions
# (this step has no code: predictions are compared with the targets as they are)

# 10. Model performance
# RMSE, computed as the square root of the mean squared error
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

# Coefficient of determination (R²)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report the metrics
print(f"Train R²: {r2_train:.4f}")
print(f"Test R²: {r2_test:.4f}")
print(f"Train RMSE: {rmse_train:.4f}")
print(f"Test RMSE: {rmse_test:.4f}")


Train R²: 0.9971
Test R²: 0.8094
Train RMSE: 3.2808
Test RMSE: 25.1872


In [None]:
# 11. Visualise results (observed vs. predicted values on the test split)
# Flatten both arrays so that a column-vector target (n, 1) and a 1-D
# prediction (n,) can be compared and fitted element-wise.
observed = np.asarray(y_test, dtype=float).ravel()
predicted = np.asarray(y_test_pred, dtype=float).ravel()

plt.figure(figsize=(8, 6))
plt.scatter(predicted, observed, color='black', label='Test Data', s=50)

# 1:1 reference line spanning the observed range
obs_min, obs_max = observed.min(), observed.max()
plt.plot([obs_min, obs_max], [obs_min, obs_max], 'k--', label='y = x', linewidth=2)

# Least-squares line of observed on predicted. Plotting the predictions
# against themselves would only redraw y = x, not a fit to the data.
slope, intercept = np.polyfit(predicted, observed, 1)
fit_x = np.array([predicted.min(), predicted.max()])
plt.plot(fit_x, slope * fit_x + intercept, color='red', label='Fit Line', linewidth=3)

plt.xlim([-20, 600])
plt.ylim([-20, 600])
plt.xlabel('Predicted Tur (FNU)', fontsize=14)
plt.ylabel('Observed Tur (FNU)', fontsize=14)
# 'southeast' is MATLAB syntax; matplotlib's name for that corner is 'lower right'
plt.legend(loc='lower right', fontsize=12)
plt.grid(True)
plt.show()

In [12]:
# Display the best estimator selected by the search (rendered as the cell output)
opt.best_estimator_

In [16]:
# Build a SHAP explainer for the tuned model and compute SHAP values on the
# training features.
# NOTE(review): the `_edu` suffix (and the earlier "Education Facilities"
# wording) looks carried over from a different notebook - the plots in this
# file label the target as turbidity. Consider renaming once all later uses
# of these variables are known.
# NOTE(review): the recorded run failed with "NameError: name 'shap' is not
# defined", i.e. the import cell had not been executed in that kernel session;
# run the notebook top to bottom.
explainer_edu = shap.Explainer(best_model)
shap_values_edu = explainer_edu.shap_values(X_train_scaled)

NameError: name 'shap' is not defined