In [1]:
import shap
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from itertools import combinations
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import RFE
from bayes_opt import BayesianOptimization

In [2]:
warnings.filterwarnings('ignore')

def weighted_bootstrap(data, weights, n_samples):
    probabilities = weights / weights.sum()
    bootstrap_indices = np.random.choice(len(data), size=n_samples, p=probabilities)
    return data.iloc[bootstrap_indices]

def run_bayesian_optimization(X_train_all, y_train_all):
    def black_box_function(learning_rate, n_estimators, min_samples_split, max_features, max_depth, max_leaf_nodes):
        params = {
            'learning_rate': max(learning_rate, 1e-3),
            'n_estimators': int(n_estimators),
            'min_samples_split': int(min_samples_split),
            'max_features': min(max_features, 0.999),
            'max_depth': int(max_depth),
            'max_leaf_nodes': int(max_leaf_nodes),
            'random_state': 2
        }
        
        model = GradientBoostingRegressor(**params)
        loo = LeaveOneOut()
        preds, truths = [], []
        
        for train_idx, val_idx in loo.split(X_train_all):
            X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
            y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
            model.fit(X_train, y_train.ravel())
            preds.append(model.predict(X_val)[0])
            truths.append(y_val[0])
            
        return r2_score(truths, preds)

    pbounds = {
        'learning_rate': (0.001, 0.2),
        'n_estimators': (10, 500),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 1.0), 
        'max_depth': (1, 5),
        'max_leaf_nodes': (2, 15)
    }

    optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds=pbounds,
        random_state=1
    )
    optimizer.maximize(init_points=15, n_iter=20)
    return optimizer.max['params']

data = pd.read_excel(r"C:\Users\HP\Desktop\Data.xlsx", 
                    sheet_name='16+3',
                    index_col=0,
                    header=0)
features = data[['lg(O3)', 'lg(H2O2)', 'pH', 'TOC']]
data1 = features.iloc[0:18]

r2_results = []
rmse_results = []

all_train_true = []
all_train_pred = []
all_test_true = []
all_test_pred = []
for iter in range(10):
    print(f"\n========== Iteration times {iter+1}/10 ==========")
    
    test_indices = np.random.choice(data1.index, size=4, replace=False)
    test_set = data1.loc[test_indices]
    train_set = data1.drop(test_indices)
    
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(train_set[['lg(O3)', 'lg(H2O2)', 'pH']])
    y_train = train_set['TOC'].values.reshape(-1, 1)
    
    train_df = pd.DataFrame(X_train, 
                           columns=['lg(O3)', 'lg(H2O2)', 'pH'],
                           index=train_set.index)
    train_df['TOC'] = y_train
    
    np.random.seed(iter) 
    augmented = weighted_bootstrap(train_df, 
                                 np.ones(len(train_df)), 
                                 n_samples=36)
    
    X_augmented = np.vstack([X_train, augmented[['lg(O3)', 'lg(H2O2)', 'pH']].values])
    y_augmented = np.vstack([y_train, augmented['TOC'].values.reshape(-1, 1)])
    
    best_params = run_bayesian_optimization(X_augmented, y_augmented)
    
    final_model = GradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        n_estimators=int(best_params['n_estimators']),
        min_samples_split=int(best_params['min_samples_split']),
        max_features=best_params['max_features'],
        max_depth=int(best_params['max_depth']),
        max_leaf_nodes=int(best_params['max_leaf_nodes']),
        random_state=2
    )
    final_model.fit(X_augmented, y_augmented.ravel())
    
    train_pred = final_model.predict(X_augmented)
    all_train_true.extend(y_augmented.ravel().tolist())
    all_train_pred.extend(train_pred.tolist())
    
    X_test = scaler.transform(test_set[['lg(O3)', 'lg(H2O2)', 'pH']])
    y_test = test_set['TOC'].values
    test_pred = final_model.predict(X_test)
    all_test_true.extend(y_test.tolist())
    all_test_pred.extend(test_pred.tolist())
    
    r2 = r2_score(y_test, test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    
    r2_results.append(r2)
    rmse_results.append(rmse)
    print(f"Results of this round: R2={r2:.4f}, RMSE={rmse:.4f}")

print("\n================ Final result ================")
print(f"Average R2 score: {np.mean(r2_results):.4f} ± {np.std(r2_results):.4f}")
print(f"Average RMSE score: {np.mean(rmse_results):.4f} ± {np.std(rmse_results):.4f}")
print("\n Detailed R2 results:", [round(x, 4) for x in r2_results])
print("Detailed RMSE results:", [round(x, 4) for x in rmse_results])


|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9973   [39m | [39m0.08399  [39m | [39m3.881    [39m | [39m0.1001   [39m | [39m5.93     [39m | [39m5.375    [39m | [39m55.25    [39m |
| [35m2        [39m | [35m0.9985   [39m | [35m0.03807  [39m | [35m2.382    [39m | [35m0.4571   [39m | [35m9.005    [39m | [35m11.64    [39m | [35m345.8    [39m |
| [35m3        [39m | [35m0.9994   [39m | [35m0.04169  [39m | [35m4.512    [39m | [35m0.1246   [39m | [35m10.72    [39m | [35m11.6     [39m | [35m283.8    [39m |
| [39m4        [39m | [39m0.9405   [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m0.9759   [39m | [39m0.1754   [39m | [39m4.578    [39m | 

| [39m9        [39m | [39m0.908    [39m | [39m0.05827  [39m | [39m1.52     [39m | [39m0.1174   [39m | [39m10.82    [39m | [39m6.867    [39m | [39m140.1    [39m |
| [39m10       [39m | [39m0.9693   [39m | [39m0.09882  [39m | [39m1.213    [39m | [39m0.6167   [39m | [39m3.907    [39m | [39m15.55    [39m | [39m352.9    [39m |
| [39m11       [39m | [39m0.9796   [39m | [39m0.02136  [39m | [39m2.656    [39m | [39m0.725    [39m | [39m7.384    [39m | [39m3.149    [39m | [39m272.6    [39m |
| [39m12       [39m | [39m0.9952   [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [39m13       [39m | [39m0.9788   [39m | [39m0.02872  [39m | [39m4.23     [39m | [39m0.4579   [39m | [39m4.15     [39m | [39m23.33    [39m | [39m180.4    [39m |
| [39m14       [39m | [39m0.9944   [39m | [39m0.1504   [39m | [39m3.904    [39m | [39m0.895    [39m 

| [39m19       [39m | [39m0.9892   [39m | [39m0.2      [39m | [39m5.0      [39m | [39m1.0      [39m | [39m11.26    [39m | [39m25.0     [39m | [39m373.3    [39m |
| [39m20       [39m | [39m0.9954   [39m | [39m0.05202  [39m | [39m4.335    [39m | [39m0.6051   [39m | [39m13.17    [39m | [39m21.19    [39m | [39m314.9    [39m |
| [39m21       [39m | [39m0.9962   [39m | [39m0.09381  [39m | [39m4.301    [39m | [39m0.379    [39m | [39m12.0     [39m | [39m18.84    [39m | [39m311.6    [39m |
| [39m22       [39m | [39m0.9745   [39m | [39m0.01319  [39m | [39m4.324    [39m | [39m0.4943   [39m | [39m10.22    [39m | [39m17.95    [39m | [39m315.5    [39m |
| [39m23       [39m | [39m0.1729   [39m | [39m0.001    [39m | [39m1.0      [39m | [39m0.1      [39m | [39m14.24    [39m | [39m19.28    [39m | [39m313.2    [39m |
| [39m24       [39m | [39m0.9871   [39m | [39m0.2      [39m | [39m5.0      [39m | [39m1.0      [39m 

| [39m29       [39m | [39m1.0      [39m | [39m0.1292   [39m | [39m3.51     [39m | [39m0.4632   [39m | [39m7.513    [39m | [39m8.564    [39m | [39m278.6    [39m |
| [39m30       [39m | [39m0.3667   [39m | [39m0.001    [39m | [39m4.173    [39m | [39m0.1907   [39m | [39m13.72    [39m | [39m6.353    [39m | [39m280.2    [39m |
| [39m31       [39m | [39m1.0      [39m | [39m0.1508   [39m | [39m3.862    [39m | [39m0.3777   [39m | [39m5.451    [39m | [39m12.88    [39m | [39m281.7    [39m |
| [39m32       [39m | [39m0.9846   [39m | [39m0.1765   [39m | [39m2.876    [39m | [39m0.7129   [39m | [39m2.616    [39m | [39m9.547    [39m | [39m276.4    [39m |
| [39m33       [39m | [39m0.9999   [39m | [39m0.06633  [39m | [39m4.845    [39m | [39m0.1      [39m | [39m8.67     [39m | [39m15.85    [39m | [39m286.8    [39m |
| [39m34       [39m | [39m0.9895   [39m | [39m0.1903   [39m | [39m1.369    [39m | [39m0.7266   [39m 

| [39m2        [39m | [39m0.9803   [39m | [39m0.03807  [39m | [39m2.382    [39m | [39m0.4571   [39m | [39m9.005    [39m | [39m11.64    [39m | [39m345.8    [39m |
| [35m3        [39m | [35m0.9926   [39m | [35m0.04169  [39m | [35m4.512    [39m | [35m0.1246   [39m | [35m10.72    [39m | [35m11.6     [39m | [35m283.8    [39m |
| [39m4        [39m | [39m0.9652   [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m0.9685   [39m | [39m0.1754   [39m | [39m4.578    [39m | [39m0.1765   [39m | [39m2.508    [39m | [39m5.906    [39m | [39m440.3    [39m |
| [39m6        [39m | [39m0.9647   [39m | [39m0.02057  [39m | [39m2.684    [39m | [39m0.9621   [39m | [39m8.931    [39m | [39m17.91    [39m | [39m164.6    [39m |
| [39m7        [39m | [39m0.99     [39m | [39m0.1376   [39m | [39m4.339    [39m | [39m0.1165   [39m 

| [39m12       [39m | [39m0.9986   [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [39m13       [39m | [39m0.9858   [39m | [39m0.02872  [39m | [39m4.23     [39m | [39m0.4579   [39m | [39m4.15     [39m | [39m23.33    [39m | [39m180.4    [39m |
| [35m14       [39m | [35m0.9999   [39m | [35m0.1504   [39m | [35m3.904    [39m | [35m0.895    [39m | [35m10.11    [39m | [35m19.27    [39m | [35m181.0    [39m |
| [39m15       [39m | [39m0.9998   [39m | [39m0.05472  [39m | [39m4.584    [39m | [39m0.4853   [39m | [39m14.54    [39m | [39m17.26    [39m | [39m314.6    [39m |
| [35m16       [39m | [35m0.9999   [39m | [35m0.2      [39m | [35m3.996    [39m | [35m0.9866   [39m | [35m10.2     [39m | [35m19.36    [39m | [35m181.1    [39m |
| [39m17       [39m | [39m0.9978   [39m | [39m0.0476   [39m | [39m4.509    [39m | [39m0.5961   [39m 

| [39m22       [39m | [39m0.9992   [39m | [39m0.03598  [39m | [39m3.7      [39m | [39m0.3841   [39m | [39m9.252    [39m | [39m7.906    [39m | [39m278.9    [39m |
| [39m23       [39m | [39m0.9983   [39m | [39m0.03757  [39m | [39m3.209    [39m | [39m0.5931   [39m | [39m14.76    [39m | [39m10.76    [39m | [39m279.3    [39m |
| [39m24       [39m | [39m0.943    [39m | [39m0.04266  [39m | [39m1.74     [39m | [39m0.9849   [39m | [39m12.94    [39m | [39m6.467    [39m | [39m283.3    [39m |
| [39m25       [39m | [39m1.0      [39m | [39m0.09212  [39m | [39m5.0      [39m | [39m0.2552   [39m | [39m9.83     [39m | [39m13.93    [39m | [39m278.1    [39m |
| [39m26       [39m | [39m1.0      [39m | [39m0.06321  [39m | [39m3.918    [39m | [39m0.7288   [39m | [39m11.86    [39m | [39m9.931    [39m | [39m273.5    [39m |
| [39m27       [39m | [39m0.9501   [39m | [39m0.07794  [39m | [39m1.0      [39m | [39m0.1      [39m 

| [39m32       [39m | [39m0.9717   [39m | [39m0.1213   [39m | [39m3.543    [39m | [39m0.6815   [39m | [39m9.804    [39m | [39m22.51    [39m | [39m77.38    [39m |
| [39m33       [39m | [39m0.9498   [39m | [39m0.1624   [39m | [39m4.524    [39m | [39m0.416    [39m | [39m14.96    [39m | [39m5.073    [39m | [39m309.8    [39m |
| [39m34       [39m | [39m0.9418   [39m | [39m0.1531   [39m | [39m3.898    [39m | [39m0.1544   [39m | [39m8.583    [39m | [39m19.94    [39m | [39m49.89    [39m |
| [39m35       [39m | [39m0.9463   [39m | [39m0.1953   [39m | [39m3.854    [39m | [39m0.1665   [39m | [39m3.423    [39m | [39m23.28    [39m | [39m180.5    [39m |
本轮结果: R²=0.7209, RMSE=0.0635

|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9724   [39m | [39m0.08399  [