In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import optuna
import warnings
import joblib

# Silence every warning for cleaner notebook output.
# NOTE(review): this also hides deprecation/compatibility warnings emitted by
# the imported libraries; consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

# Load the dataset and drop the columns that are not used for modelling.
data = pd.read_csv('/workspaces/SFP/Training_data/data_13_17_1_std.csv')  # replace with the actual data path
data = data.drop(['Unnamed: 0', 'Pe', 'R_total', 'R_calm', 'postime', 'num'], axis=1)

# Data cleaning: flag outliers with a two-cluster KMeans on standardized columns.
# A dedicated scaler is used here so that `scaler` (below) only ever holds
# statistics computed from the training features.
# NOTE(review): the clustering sees every remaining column, including the
# target 'R_added' - confirm that this is intended.
cluster_scaler = StandardScaler()
data_scaled = cluster_scaler.fit_transform(data)
kmeans = KMeans(n_clusters=2, random_state=42)
data['cluster'] = kmeans.fit_predict(data_scaled)

# Assume the smaller cluster holds the anomalous rows and discard it.
cluster_sizes = data['cluster'].value_counts()
anomaly_cluster = cluster_sizes.idxmin()
data = data[data['cluster'] != anomaly_cluster]
data = data.drop(columns=['cluster'])

# Separate the raw (unscaled) feature matrix from the regression target.
raw_features = data.drop(columns=['R_added']).values
targets = data['R_added'].values

# Split BEFORE scaling: fitting the scaler on the full matrix would let the
# mean/variance of the held-out rows leak into the training features.
# The same random_state keeps the train/test row partition unchanged.
X_train, X_test, y_train, y_test = train_test_split(raw_features, targets, test_size=0.2, random_state=42)

# Fit the feature scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep `features` available as the standardized full matrix for any later use,
# now expressed with the training-set statistics.
features = scaler.transform(raw_features)

# Reuse previously tuned hyperparameters when both parameter files exist;
# otherwise run an Optuna search for each model and save the winners.
# NOTE(review): parameter files written by an earlier version of this script
# were selected against the test split; delete them to trigger a fresh search.
try:
    xgb_best_params = joblib.load("xgb_best_params.pkl")
    catboost_best_params = joblib.load("catboost_best_params.pkl")
    print("Loaded saved parameters for XGBoost and CatBoost.")
except FileNotFoundError:
    # Hold out part of the TRAINING data for model selection. Scoring trials on
    # X_test/y_test would tune the hyperparameters to the test set and make the
    # final test metrics optimistically biased.
    X_tune, X_val, y_tune, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    def _validation_rmse(model):
        """Return the RMSE of an already fitted model on the validation split."""
        val_pred = model.predict(X_val)
        return float(np.sqrt(mean_squared_error(y_val, val_pred)))

    def objective_xgb(trial):
        """Optuna objective for XGBoost: validation RMSE of one sampled configuration."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 1, 10.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
        }
        model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **params)
        # No eval_set: without early stopping it does not alter the fitted model.
        model.fit(X_tune, y_tune, verbose=False)
        return _validation_rmse(model)

    def objective_catboost(trial):
        """Optuna objective for CatBoost: validation RMSE of one sampled configuration."""
        params = {
            'iterations': trial.suggest_int('iterations', 100, 1000),
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
            'random_strength': trial.suggest_float('random_strength', 0, 1)
        }
        model = CatBoostRegressor(verbose=0, random_seed=42, **params)
        # No eval_set, so each trial scores the same configuration that is
        # retrained later without one.
        # NOTE(review): CatBoost is documented to enable best-iteration selection
        # (`use_best_model`) when an eval set is supplied, which would make the
        # scored model differ from the retrained one - confirm against the docs.
        model.fit(X_tune, y_tune)
        return _validation_rmse(model)

    # Search XGBoost hyperparameters; the seeded sampler makes the search repeatable.
    study_xgb = optuna.create_study(
        direction='minimize',
        study_name='XGBoost Optimization',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=False)

    # Search CatBoost hyperparameters with the same seeding policy.
    study_catboost = optuna.create_study(
        direction='minimize',
        study_name='CatBoost Optimization',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study_catboost.optimize(objective_catboost, n_trials=50, show_progress_bar=False)

    # Save the best parameters so later runs can skip the search.
    xgb_best_params = study_xgb.best_params
    catboost_best_params = study_catboost.best_params
    joblib.dump(xgb_best_params, "xgb_best_params.pkl")
    joblib.dump(catboost_best_params, "catboost_best_params.pkl")
    print("Saved best parameters for XGBoost and CatBoost.")

# Build the two base regressors from the tuned hyperparameters.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **xgb_best_params)
catboost_model = CatBoostRegressor(verbose=0, random_seed=42, **catboost_best_params)

# Ensemble: VotingRegressor over the two base models.
ensemble_model = VotingRegressor([('xgb', xgb_model), ('catboost', catboost_model)])

# Train the ensemble on the training split.
# A recorded run of this script failed with
#   AttributeError: 'super' object has no attribute '__sklearn_tags__'
# which indicates that a wrapped estimator does not implement the estimator-tag
# API expected by the installed scikit-learn. Re-raise that case with guidance
# instead of the opaque message; any other AttributeError propagates untouched.
# NOTE(review): upstream reports associate this with scikit-learn >= 1.6 combined
# with xgboost < 2.1.4 - confirm against the installed versions.
try:
    ensemble_model.fit(X_train, y_train)
except AttributeError as err:
    if "__sklearn_tags__" not in str(err):
        raise
    raise AttributeError(
        "VotingRegressor could not read scikit-learn estimator tags from a wrapped "
        "model ('__sklearn_tags__' is missing). The installed xgboost/catboost "
        "release is likely incompatible with the installed scikit-learn: upgrade "
        "xgboost and catboost, or pin scikit-learn to an older release."
    ) from err

# Predict on the held-out test split.
y_pred = ensemble_model.predict(X_test)

# Compute the evaluation metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("Root Mean Squared Error: ", rmse)
print("R2 Score: ", r2)

# Save the fitted ensemble.
# NOTE(review): the feature scaler is not saved alongside the model; code that
# loads this file must apply the same scaling before calling predict.
joblib.dump(ensemble_model, "ensemble_model.pkl")



Loaded saved parameters for XGBoost and CatBoost.


AttributeError: 'super' object has no attribute '__sklearn_tags__'