In [1]:
import shap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from matplotlib.colors import Normalize
from itertools import combinations
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import RFE
from bayes_opt import BayesianOptimization
from sklearn.inspection import PartialDependenceDisplay

In [2]:
def run_bayesian_optimization(X_train_all, y_train_all):
    def black_box_function(learning_rate, n_estimators, min_samples_split, max_features, max_depth, max_leaf_nodes):
        params = {
            'learning_rate': max(learning_rate, 1e-3),
            'n_estimators': int(n_estimators),
            'min_samples_split': int(min_samples_split),
            'max_features': min(max_features, 0.999),
            'max_depth': int(max_depth),
            'max_leaf_nodes': int(max_leaf_nodes),
            'random_state': 2
        }
        
        model = GradientBoostingRegressor(**params)
        loo = LeaveOneOut()
        preds, truths = [], []
        
        for train_idx, val_idx in loo.split(X_train_all):
            X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
            y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
            model.fit(X_train, y_train.ravel())
            preds.append(model.predict(X_val)[0])
            truths.append(y_val[0])
            
        return r2_score(truths, preds)

    pbounds = {
        'learning_rate': (0.001, 0.2),
        'n_estimators': (10, 500),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 1.0),
        'max_depth': (1, 5),
        'max_leaf_nodes': (2, 15)
    }

    optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds=pbounds,
        random_state=1
    )
    optimizer.maximize(init_points=15, n_iter=20)
    return optimizer.max['params']

In [4]:
data = pd.read_excel(r"C:\Users\HP\Desktop\Data.xlsx",
                    sheet_name='16+3',
                    index_col=0,
                    header=0)
features = data[['lg(O3)', 'lg(H2O2)', 'pH', 'TOC', 'FMax2']]
data1 = features.iloc[0:18]

df_GAN_origin_all = pd.read_excel(
    r"C:\Users\HP\jupyternotebook\MLofCC\GAN\GAN800.xlsx",
    header=0)
df_GAN_origin = df_GAN_origin_all[['lg(O3)', 'lg(H2O2)', 'pH', 'TOC']]

scaler1 = MinMaxScaler()
feature_columns = ['lg(O3)', 'lg(H2O2)', 'pH', 'TOC','FMax2']
scaled_features = scaler1.fit_transform(data1[feature_columns])
data3 = pd.DataFrame(scaled_features, columns=feature_columns)


X = data3[['lg(O3)', 'lg(H2O2)', 'pH', 'FMax2']].values
y = data3['TOC'].values

def black_box_function(learning_rate, n_estimators, min_samples_split, max_features, max_depth, max_leaf_nodes):
    model = GradientBoostingRegressor(
            learning_rate=learning_rate,
            n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=min(max_features, 0.999),
            max_depth=int(max_depth),
            max_leaf_nodes=int(max_leaf_nodes),
            random_state=2
        )
    loo = LeaveOneOut()
    preds, truths = [], []
    for train_idx, test_idx in loo.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train)
        preds.append(model.predict(X_test)[0])
        truths.append(y_test[0])
    return r2_score(truths, preds)

optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds={
            'learning_rate': (0.001, 0.2),
            'n_estimators': (10, 500),
            'min_samples_split': (2, 25),
            'max_features': (1, 4),
            'max_depth': (1, 5),
            'max_leaf_nodes': (2, 15)
        },
        random_state=42
    )
optimizer.maximize(init_points=15, n_iter=25)
best_params = optimizer.max['params']
    
model = GradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        n_estimators=int(best_params['n_estimators']),
        max_leaf_nodes=int(best_params['max_leaf_nodes']),
        max_features=min(best_params['max_features'], 0.999),
        min_samples_split=int(best_params['min_samples_split']),
        max_depth=int(best_params['max_depth']),
        random_state=2
    )
model.fit(X, y)

model_FMax2 = LinearRegression()
model_FMax2.fit(data3[['lg(O3)']], data3['FMax2'])
    
scaler2 = MinMaxScaler()
train_features = data1[['lg(O3)', 'lg(H2O2)', 'pH', 'TOC']].values  
scaler2.fit(train_features)
gan_features = scaler2.transform(df_GAN_origin[['lg(O3)', 'lg(H2O2)', 'pH', 'TOC']])
df_GAN = pd.DataFrame(gan_features, columns=['lg(O3)', 'lg(H2O2)', 'pH', 'TOC'])
    
df_GAN['FMax2'] = model_FMax2.predict(df_GAN[['lg(O3)']])

X_new = df_GAN[['lg(O3)', 'lg(H2O2)', 'pH', 'FMax2']]
y_pred_new = model.predict(X_new)

toc_min = scaler2.data_min_[-1]
toc_max = scaler2.data_max_[-1]
y_pred_orig = y_pred_new * (toc_max - toc_min) + toc_min

|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.7892   [39m | [39m0.07553  [39m | [39m4.803    [39m | [39m3.196    [39m | [39m9.783    [39m | [39m5.588    [39m | [39m86.44    [39m |
| [39m2        [39m | [39m0.7396   [39m | [39m0.01256  [39m | [39m4.465    [39m | [39m2.803    [39m | [39m11.2     [39m | [39m2.473    [39m | [39m485.3    [39m |
| [39m3        [39m | [39m0.6762   [39m | [39m0.1667   [39m | [39m1.849    [39m | [39m1.545    [39m | [39m4.384    [39m | [39m8.998    [39m | [39m267.1    [39m |
| [35m4        [39m | [35m0.8168   [39m | [35m0.08696  [39m | [35m2.165    [39m | [35m2.836    [39m | [35m3.813    [39m | [35m8.719    [39m | [35m189.5    [39m |
| [39m5        [39m | [39m0.6939   [39m | [39m0.09176  [39m | [39m4.141    [39m | [



In [16]:
# ================== 0.05，36 ==================
all_train_true = []
all_train_pred = []
all_test_true = []
all_test_pred = []

r2_results = []
rmse_results = []

scaler3 = MinMaxScaler()
feature_columns3 = ['lg(O3)', 'lg(H2O2)', 'pH']
scaler3.fit(data1[feature_columns3])
# ================== Training cycle ==================
for iter in range(10):
    print(f"\n========== Iteration times {iter+1}/10 ==========")
    
    valid_indices = []
    for i in range(len(y_pred_orig)):
        if abs(df_GAN_origin.iloc[i]['TOC'] - y_pred_orig[i]) < 0.05:
            valid_indices.append(i)
    np.random.seed(iter)  

    if len(valid_indices) == 0:
        raise ValueError("No valid samples were found, please adjust the screening threshold.")

    try:
        selected_indices = np.random.choice(valid_indices, size=36, replace=False)
    except ValueError:
        selected_indices = np.random.choice(valid_indices, size=36, replace=True)

    selected_data = df_GAN_origin.iloc[selected_indices]


    test_indices = np.random.choice(data1.index, size=4, replace=False)
    test_set = data1.loc[test_indices]
    train_set = data1.drop(test_indices)
    

    combined_data = pd.concat([train_set, selected_data])
    X_final = scaler3.transform(combined_data[['lg(O3)', 'lg(H2O2)', 'pH']])
    y_final = combined_data['TOC'].values.reshape(-1, 1)

    
    best_params = run_bayesian_optimization(X_final, y_final)
    final_model = GradientBoostingRegressor(learning_rate=best_params['learning_rate'],
        n_estimators=int(best_params['n_estimators']),
        min_samples_split=int(best_params['min_samples_split']),
        max_features=best_params['max_features'],
        max_depth=int(best_params['max_depth']),
        max_leaf_nodes=int(best_params['max_leaf_nodes']),
        random_state=2)
    final_model.fit(X_final, y_final.ravel())

    train_pred = final_model.predict(X_final)
    all_train_true.extend(y_final.ravel().tolist())
    all_train_pred.extend(train_pred.tolist())

    X_test = scaler3.transform(test_set[['lg(O3)', 'lg(H2O2)', 'pH']])
    y_test = test_set['TOC'].values
    test_pred = final_model.predict(X_test)
    all_test_true.extend(y_test.tolist())
    all_test_pred.extend(test_pred.tolist())
    
    r2 = r2_score(y_test, test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    
    r2_results.append(r2)
    rmse_results.append(rmse)
    print(f"Results of this round: R2={r2:.4f}, RMSE={rmse:.4f}")

print("\n================ Final result ================")
print(f"Average R2 score: {np.mean(r2_results):.4f} ± {np.std(r2_results):.4f}")
print(f"Average RMSE score: {np.mean(rmse_results):.4f} ± {np.std(rmse_results):.4f}")
print("\n Detailed R2 results:", [round(x, 4) for x in r2_results])
print("Detailed RMSE results:", [round(x, 4) for x in rmse_results])


|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8465   [39m | [39m0.08399  [39m | [39m3.881    [39m | [39m0.1001   [39m | [39m5.93     [39m | [39m5.375    [39m | [39m55.25    [39m |
| [35m2        [39m | [35m0.8621   [39m | [35m0.03807  [39m | [35m2.382    [39m | [35m0.4571   [39m | [35m9.005    [39m | [35m11.64    [39m | [35m345.8    [39m |
| [39m3        [39m | [39m0.8487   [39m | [39m0.04169  [39m | [39m4.512    [39m | [39m0.1246   [39m | [39m10.72    [39m | [39m11.6     [39m | [39m283.8    [39m |
| [39m4        [39m | [39m0.7943   [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m0.8145   [39m | [39m0.1754   [39m | [39m4.578    [39m | 

| [39m9        [39m | [39m0.7768   [39m | [39m0.05827  [39m | [39m1.52     [39m | [39m0.1174   [39m | [39m10.82    [39m | [39m6.867    [39m | [39m140.1    [39m |
| [39m10       [39m | [39m0.7314   [39m | [39m0.09882  [39m | [39m1.213    [39m | [39m0.6167   [39m | [39m3.907    [39m | [39m15.55    [39m | [39m352.9    [39m |
| [35m11       [39m | [35m0.8631   [39m | [35m0.02136  [39m | [35m2.656    [39m | [35m0.725    [39m | [35m7.384    [39m | [35m3.149    [39m | [35m272.6    [39m |
| [39m12       [39m | [39m0.8614   [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [39m13       [39m | [39m0.8301   [39m | [39m0.02872  [39m | [39m4.23     [39m | [39m0.4579   [39m | [39m4.15     [39m | [39m23.33    [39m | [39m180.4    [39m |
| [39m14       [39m | [39m0.8535   [39m | [39m0.1504   [39m | [39m3.904    [39m | [39m0.895    [39m 

| [39m19       [39m | [39m0.7895   [39m | [39m0.1115   [39m | [39m2.655    [39m | [39m0.7233   [39m | [39m7.353    [39m | [39m3.148    [39m | [39m272.6    [39m |
| [39m20       [39m | [39m0.8037   [39m | [39m0.2      [39m | [39m2.962    [39m | [39m1.0      [39m | [39m7.056    [39m | [39m3.457    [39m | [39m272.5    [39m |
| [39m21       [39m | [39m0.3158   [39m | [39m0.002686 [39m | [39m2.355    [39m | [39m0.6387   [39m | [39m2.712    [39m | [39m16.91    [39m | [39m229.1    [39m |
| [39m22       [39m | [39m0.7471   [39m | [39m0.05254  [39m | [39m4.857    [39m | [39m0.2515   [39m | [39m12.21    [39m | [39m12.49    [39m | [39m115.3    [39m |
| [39m23       [39m | [39m0.7449   [39m | [39m0.06679  [39m | [39m1.646    [39m | [39m0.3375   [39m | [39m9.999    [39m | [39m10.33    [39m | [39m488.5    [39m |
| [39m24       [39m | [39m0.7552   [39m | [39m0.02503  [39m | [39m2.029    [39m | [39m0.2521   [39m 

| [39m29       [39m | [39m0.6554   [39m | [39m0.0739   [39m | [39m1.727    [39m | [39m0.8856   [39m | [39m9.309    [39m | [39m16.39    [39m | [39m458.7    [39m |
| [39m30       [39m | [39m0.7659   [39m | [39m0.1083   [39m | [39m2.952    [39m | [39m0.9125   [39m | [39m9.517    [39m | [39m22.67    [39m | [39m77.26    [39m |
| [39m31       [39m | [39m0.03275  [39m | [39m0.001    [39m | [39m3.039    [39m | [39m0.7681   [39m | [39m9.604    [39m | [39m22.76    [39m | [39m77.32    [39m |
| [39m32       [39m | [39m0.661    [39m | [39m0.1639   [39m | [39m1.364    [39m | [39m0.2557   [39m | [39m4.134    [39m | [39m24.37    [39m | [39m234.2    [39m |
| [39m33       [39m | [39m0.6629   [39m | [39m0.1624   [39m | [39m4.524    [39m | [39m0.416    [39m | [39m14.96    [39m | [39m5.073    [39m | [39m309.8    [39m |
| [39m34       [39m | [39m0.7544   [39m | [39m0.08626  [39m | [39m2.077    [39m | [39m0.5619   [39m 

| [35m2        [39m | [35m0.8668   [39m | [35m0.03807  [39m | [35m2.382    [39m | [35m0.4571   [39m | [35m9.005    [39m | [35m11.64    [39m | [35m345.8    [39m |
| [39m3        [39m | [39m0.8233   [39m | [39m0.04169  [39m | [39m4.512    [39m | [39m0.1246   [39m | [39m10.72    [39m | [39m11.6     [39m | [39m283.8    [39m |
| [39m4        [39m | [39m0.7789   [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m0.7634   [39m | [39m0.1754   [39m | [39m4.578    [39m | [39m0.1765   [39m | [39m2.508    [39m | [39m5.906    [39m | [39m440.3    [39m |
| [39m6        [39m | [39m0.822    [39m | [39m0.02057  [39m | [39m2.684    [39m | [39m0.9621   [39m | [39m8.931    [39m | [39m17.91    [39m | [39m164.6    [39m |
| [39m7        [39m | [39m0.8397   [39m | [39m0.1376   [39m | [39m4.339    [39m | [39m0.1165   [39m 

| [39m12       [39m | [39m0.8068   [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [39m13       [39m | [39m0.7875   [39m | [39m0.02872  [39m | [39m4.23     [39m | [39m0.4579   [39m | [39m4.15     [39m | [39m23.33    [39m | [39m180.4    [39m |
| [39m14       [39m | [39m0.8266   [39m | [39m0.1504   [39m | [39m3.904    [39m | [39m0.895    [39m | [39m10.11    [39m | [39m19.27    [39m | [39m181.0    [39m |
| [39m15       [39m | [39m0.7865   [39m | [39m0.05472  [39m | [39m4.584    [39m | [39m0.4853   [39m | [39m14.54    [39m | [39m17.26    [39m | [39m314.6    [39m |
| [39m16       [39m | [39m0.8058   [39m | [39m0.08677  [39m | [39m2.722    [39m | [39m0.7904   [39m | [39m7.45     [39m | [39m3.214    [39m | [39m272.7    [39m |
| [39m17       [39m | [39m0.8089   [39m | [39m0.1777   [39m | [39m2.789    [39m | [39m0.6977   [39m 

| [39m22       [39m | [39m0.8177   [39m | [39m0.05254  [39m | [39m4.857    [39m | [39m0.2515   [39m | [39m12.21    [39m | [39m12.49    [39m | [39m115.3    [39m |
| [39m23       [39m | [39m0.7698   [39m | [39m0.06679  [39m | [39m1.646    [39m | [39m0.3375   [39m | [39m9.999    [39m | [39m10.33    [39m | [39m488.5    [39m |
| [39m24       [39m | [39m0.8271   [39m | [39m0.02503  [39m | [39m2.029    [39m | [39m0.2521   [39m | [39m14.07    [39m | [39m14.75    [39m | [39m163.4    [39m |
| [39m25       [39m | [39m0.8218   [39m | [39m0.07754  [39m | [39m3.498    [39m | [39m0.5247   [39m | [39m8.575    [39m | [39m20.35    [39m | [39m485.0    [39m |
| [39m26       [39m | [39m0.7795   [39m | [39m0.0751   [39m | [39m4.215    [39m | [39m0.9389   [39m | [39m2.239    [39m | [39m18.28    [39m | [39m100.3    [39m |
| [39m27       [39m | [39m0.8528   [39m | [39m0.1218   [39m | [39m4.151    [39m | [39m0.1004   [39m 

| [39m32       [39m | [39m0.7895   [39m | [39m0.1639   [39m | [39m1.364    [39m | [39m0.2557   [39m | [39m4.134    [39m | [39m24.37    [39m | [39m234.2    [39m |
| [39m33       [39m | [39m0.8119   [39m | [39m0.1624   [39m | [39m4.524    [39m | [39m0.416    [39m | [39m14.96    [39m | [39m5.073    [39m | [39m309.8    [39m |
| [39m34       [39m | [39m0.8686   [39m | [39m0.1733   [39m | [39m2.877    [39m | [39m0.9129   [39m | [39m7.12     [39m | [39m3.332    [39m | [39m272.7    [39m |
| [39m35       [39m | [39m0.8674   [39m | [39m0.2      [39m | [39m2.521    [39m | [39m0.8402   [39m | [39m7.469    [39m | [39m3.919    [39m | [39m272.7    [39m |
本轮结果: R²=0.9084, RMSE=0.0556

|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.7811   [39m | [39m0.08399  [