In [1]:
import shap
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from itertools import combinations
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.feature_selection import RFE
from bayes_opt import BayesianOptimization

In [2]:
# Install a global "ignore" filter so no warnings are printed for the rest of
# the session (keeps the optimizer's progress table readable).
warnings.filterwarnings(action='ignore')

def run_bayesian_optimization(X_train_all, y_train_all, init_points=15, n_iter=20, random_state=1):
    """Tune GradientBoostingRegressor hyper-parameters with Bayesian optimization.

    The objective maximised by the optimizer is the R2 score computed over the
    pooled leave-one-out predictions on ``(X_train_all, y_train_all)``.

    Parameters
    ----------
    X_train_all : array-like of shape (n_samples, n_features)
        Training features. Converted with ``np.asarray`` so DataFrames and
        lists are accepted as well as ndarrays.
    y_train_all : array-like of shape (n_samples,) or (n_samples, 1)
        Training targets; flattened to 1-D internally.
    init_points : int, default=15
        Number of random exploration points evaluated before the guided search.
    n_iter : int, default=20
        Number of guided optimization steps.
    random_state : int, default=1
        Seed passed to ``BayesianOptimization``.

    Returns
    -------
    dict
        The raw best parameters as reported by ``optimizer.max['params']``.
        Values are the optimizer's floats: the caller must apply the same
        casts/clamps used by the objective (``int(...)`` for the integer
        parameters, ``max(learning_rate, 1e-3)``, ``min(max_features, 0.999)``)
        to rebuild the model that was actually scored.

    Raises
    ------
    ValueError
        If ``X_train_all`` and ``y_train_all`` have different sample counts.
    """
    # Coerce once so the positional fancy-indexing below works for any array-like.
    X_all = np.asarray(X_train_all)
    y_all = np.asarray(y_train_all).ravel()
    if X_all.shape[0] != y_all.shape[0]:
        raise ValueError(
            f"X_train_all has {X_all.shape[0]} samples but y_train_all has {y_all.shape[0]}"
        )

    # The leave-one-out partition depends only on the sample count, so build it
    # once instead of on every objective evaluation.
    loo_splits = list(LeaveOneOut().split(X_all))

    def black_box_function(learning_rate, n_estimators, min_samples_split, max_features, max_depth, max_leaf_nodes):
        """Return the pooled leave-one-out R2 for one hyper-parameter candidate."""
        params = {
            # The optimizer proposes floats; clamp/cast them into valid estimator arguments.
            'learning_rate': max(learning_rate, 1e-3),
            'n_estimators': int(n_estimators),
            'min_samples_split': int(min_samples_split),
            'max_features': min(max_features, 0.999),
            'max_depth': int(max_depth),
            'max_leaf_nodes': int(max_leaf_nodes),
            'random_state': 2
        }

        model = GradientBoostingRegressor(**params)
        preds = np.empty(y_all.shape[0], dtype=float)

        for train_idx, val_idx in loo_splits:
            model.fit(X_all[train_idx], y_all[train_idx])
            # Each validation fold holds exactly one sample.
            preds[val_idx] = model.predict(X_all[val_idx])

        # R2 of a single held-out sample is undefined, so score all held-out
        # predictions together.
        return r2_score(y_all, preds)

    pbounds = {
        'learning_rate': (0.001, 0.2),
        'n_estimators': (10, 500),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 1.0),
        'max_depth': (1, 5),
        'max_leaf_nodes': (2, 15)
    }

    optimizer = BayesianOptimization(
        f=black_box_function,
        pbounds=pbounds,
        random_state=random_state
    )
    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    return optimizer.max['params']

# ---- Configuration -----------------------------------------------------------
# NOTE(review): these are machine-specific absolute paths; point them at your
# own copies of the workbooks before running.
DATA_PATH = r"C:\Users\HP\Desktop\Data.xlsx"
GAN_PATH = r"C:\Users\HP\jupyternotebook\FSL-Github\2_Data Augmentation\GANModel\BS36.xlsx"
FEATURE_COLS = ['lg(O3)', 'lg(H2O2)', 'pH']
TARGET_COL = 'TOC'
N_REPEATS = 10   # number of random train/test repetitions
TEST_SIZE = 4    # real samples held out per repetition
SPLIT_SEED = 0   # seeds the hold-out draws so the whole experiment is reproducible

# ---- Load real and GAN-generated data ----------------------------------------
data = pd.read_excel(DATA_PATH,
                     sheet_name='16+3',
                     index_col=0,
                     header=0)
features = data[FEATURE_COLS + [TARGET_COL]]
data1 = features.iloc[0:18]  # only the first 18 rows are used for the repeated hold-out

GAN0 = pd.read_excel(GAN_PATH, header=0)
GAN = GAN0[FEATURE_COLS + [TARGET_COL]]
# The synthetic targets do not change between rounds, so reshape them once.
gan_target = GAN[TARGET_COL].values.reshape(-1, 1)

r2_results = []
rmse_results = []

# A dedicated generator makes the hold-out splits repeatable; previously the
# unseeded global RNG produced different splits on every run.
split_rng = np.random.default_rng(SPLIT_SEED)

# `round_idx` instead of `iter` so the builtin iter() is not shadowed for the
# rest of the notebook.
for round_idx in range(N_REPEATS):
    print(f"\n========== Iteration times {round_idx+1}/{N_REPEATS} ==========")

    test_indices = split_rng.choice(data1.index, size=TEST_SIZE, replace=False)
    test_set = data1.loc[test_indices]
    train_set = data1.drop(test_indices)

    # Fit the scaler on the real training rows only, then reuse it for the
    # synthetic rows and the held-out rows.
    scaler = MinMaxScaler()

    X_train = scaler.fit_transform(train_set[FEATURE_COLS])
    y_train = train_set[TARGET_COL].values.reshape(-1, 1)

    GAN_features = scaler.transform(GAN[FEATURE_COLS])
    # Not used below; kept because cells outside this view may read it.
    GAN_processed = np.hstack([GAN_features, gan_target])

    X_augmented = np.vstack([X_train, GAN_features])
    y_augmented = np.vstack([y_train, gan_target])

    best_params = run_bayesian_optimization(X_augmented, y_augmented)

    # Rebuild the model with exactly the casts/clamps the search objective
    # applied, so the fitted model is the one whose score was optimised.
    # (Unclamped, max_features=1.0 would consider all 3 features per split,
    # whereas the search evaluated 0.999 -> 2 features.)
    final_model = GradientBoostingRegressor(
        learning_rate=max(best_params['learning_rate'], 1e-3),
        n_estimators=int(best_params['n_estimators']),
        min_samples_split=int(best_params['min_samples_split']),
        max_features=min(best_params['max_features'], 0.999),
        max_depth=int(best_params['max_depth']),
        max_leaf_nodes=int(best_params['max_leaf_nodes']),
        random_state=2
    )
    final_model.fit(X_augmented, y_augmented.ravel())

    X_test = scaler.transform(test_set[FEATURE_COLS])
    y_test = test_set[TARGET_COL].values
    y_pred = final_model.predict(X_test)

    # With only TEST_SIZE held-out points, per-round R2 is very noisy and can
    # be strongly negative; read it together with the RMSE.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    r2_results.append(r2)
    rmse_results.append(rmse)
    print(f"Results of this round: R2={r2:.4f}, RMSE={rmse:.4f}")

print("\n================ Final result ================")
print(f"Average R2 score: {np.mean(r2_results):.4f} ± {np.std(r2_results):.4f}")
print(f"Average RMSE score: {np.mean(rmse_results):.4f} ± {np.std(rmse_results):.4f}")
print("\n Detailed R2 results:", [round(x, 4) for x in r2_results])
print("Detailed RMSE results:", [round(x, 4) for x in rmse_results])


|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.162    [39m | [39m0.08399  [39m | [39m3.881    [39m | [39m0.1001   [39m | [39m5.93     [39m | [39m5.375    [39m | [39m55.25    [39m |
| [39m2        [39m | [39m0.1195   [39m | [39m0.03807  [39m | [39m2.382    [39m | [39m0.4571   [39m | [39m9.005    [39m | [39m11.64    [39m | [39m345.8    [39m |
| [39m3        [39m | [39m0.06732  [39m | [39m0.04169  [39m | [39m4.512    [39m | [39m0.1246   [39m | [39m10.72    [39m | [39m11.6     [39m | [39m283.8    [39m |
| [39m4        [39m | [39m0.02687  [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m-0.1156  [39m | [39m0.1754   [39m | [39m4.578    [39m | 

| [39m9        [39m | [39m0.07002  [39m | [39m0.05827  [39m | [39m1.52     [39m | [39m0.1174   [39m | [39m10.82    [39m | [39m6.867    [39m | [39m140.1    [39m |
| [39m10       [39m | [39m-0.01166 [39m | [39m0.09882  [39m | [39m1.213    [39m | [39m0.6167   [39m | [39m3.907    [39m | [39m15.55    [39m | [39m352.9    [39m |
| [39m11       [39m | [39m0.06587  [39m | [39m0.02136  [39m | [39m2.656    [39m | [39m0.725    [39m | [39m7.384    [39m | [39m3.149    [39m | [39m272.6    [39m |
| [39m12       [39m | [39m-0.01995 [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [35m13       [39m | [35m0.115    [39m | [35m0.02872  [39m | [35m4.23     [39m | [35m0.4579   [39m | [35m4.15     [39m | [35m23.33    [39m | [35m180.4    [39m |
| [39m14       [39m | [39m-0.09289 [39m | [39m0.1504   [39m | [39m3.904    [39m | [39m0.895    [39m 

| [39m19       [39m | [39m0.1188   [39m | [39m0.03792  [39m | [39m3.435    [39m | [39m0.5635   [39m | [39m5.672    [39m | [39m3.495    [39m | [39m46.19    [39m |
| [39m20       [39m | [39m0.0858   [39m | [39m0.07941  [39m | [39m3.735    [39m | [39m0.2562   [39m | [39m6.286    [39m | [39m9.344    [39m | [39m212.0    [39m |
| [39m21       [39m | [39m0.01675  [39m | [39m0.002686 [39m | [39m2.355    [39m | [39m0.6387   [39m | [39m2.712    [39m | [39m16.91    [39m | [39m229.1    [39m |
| [39m22       [39m | [39m0.05916  [39m | [39m0.05254  [39m | [39m4.857    [39m | [39m0.2515   [39m | [39m12.21    [39m | [39m12.49    [39m | [39m115.3    [39m |
| [39m23       [39m | [39m-0.06923 [39m | [39m0.06679  [39m | [39m1.646    [39m | [39m0.3375   [39m | [39m9.999    [39m | [39m10.33    [39m | [39m488.5    [39m |
| [39m24       [39m | [39m0.1138   [39m | [39m0.02503  [39m | [39m2.029    [39m | [39m0.2521   [39m 

| [39m29       [39m | [39m-0.106   [39m | [39m0.0739   [39m | [39m1.727    [39m | [39m0.8856   [39m | [39m9.309    [39m | [39m16.39    [39m | [39m458.7    [39m |
| [39m30       [39m | [39m-0.07249 [39m | [39m0.1186   [39m | [39m1.18     [39m | [39m0.6864   [39m | [39m10.74    [39m | [39m4.78     [39m | [39m217.8    [39m |
| [39m31       [39m | [39m0.0625   [39m | [39m0.004293 [39m | [39m3.756    [39m | [39m0.2024   [39m | [39m12.86    [39m | [39m3.325    [39m | [39m234.5    [39m |
| [39m32       [39m | [39m-0.09917 [39m | [39m0.1639   [39m | [39m1.364    [39m | [39m0.2557   [39m | [39m4.134    [39m | [39m24.37    [39m | [39m234.2    [39m |
| [35m33       [39m | [35m0.1109   [39m | [35m0.1624   [39m | [35m4.524    [39m | [35m0.416    [39m | [35m14.96    [39m | [35m5.073    [39m | [35m309.8    [39m |
| [39m34       [39m | [39m-0.06972 [39m | [39m0.197    [39m | [39m4.558    [39m | [39m0.4505   [39m 

| [39m2        [39m | [39m0.06879  [39m | [39m0.03807  [39m | [39m2.382    [39m | [39m0.4571   [39m | [39m9.005    [39m | [39m11.64    [39m | [39m345.8    [39m |
| [39m3        [39m | [39m0.02637  [39m | [39m0.04169  [39m | [39m4.512    [39m | [39m0.1246   [39m | [39m10.72    [39m | [39m11.6     [39m | [39m283.8    [39m |
| [39m4        [39m | [39m0.06303  [39m | [39m0.02894  [39m | [39m1.792    [39m | [39m0.8207   [39m | [39m14.59    [39m | [39m9.209    [39m | [39m349.2    [39m |
| [39m5        [39m | [39m-0.1212  [39m | [39m0.1754   [39m | [39m4.578    [39m | [39m0.1765   [39m | [39m2.508    [39m | [39m5.906    [39m | [39m440.3    [39m |
| [35m6        [39m | [35m0.1274   [39m | [35m0.02057  [39m | [35m2.684    [39m | [35m0.9621   [39m | [35m8.931    [39m | [35m17.91    [39m | [35m164.6    [39m |
| [39m7        [39m | [39m-0.04289 [39m | [39m0.1376   [39m | [39m4.339    [39m | [39m0.1165   [39m 

| [39m12       [39m | [39m-0.1016  [39m | [39m0.1331   [39m | [39m3.06     [39m | [39m0.9501   [39m | [39m9.625    [39m | [39m22.78    [39m | [39m77.36    [39m |
| [35m13       [39m | [35m0.09974  [39m | [35m0.02872  [39m | [35m4.23     [39m | [35m0.4579   [39m | [35m4.15     [39m | [35m23.33    [39m | [35m180.4    [39m |
| [39m14       [39m | [39m-0.08235 [39m | [39m0.1504   [39m | [39m3.904    [39m | [39m0.895    [39m | [39m10.11    [39m | [39m19.27    [39m | [39m181.0    [39m |
| [39m15       [39m | [39m-0.04843 [39m | [39m0.05472  [39m | [39m4.584    [39m | [39m0.4853   [39m | [39m14.54    [39m | [39m17.26    [39m | [39m314.6    [39m |
| [39m16       [39m | [39m0.02824  [39m | [39m0.1887   [39m | [39m2.696    [39m | [39m0.9353   [39m | [39m8.799    [39m | [39m11.87    [39m | [39m344.8    [39m |
| [39m17       [39m | [39m0.00797  [39m | [39m0.04763  [39m | [39m4.38     [39m | [39m0.3656   [39m 

| [39m22       [39m | [39m0.1567   [39m | [39m0.1101   [39m | [39m3.668    [39m | [39m0.1      [39m | [39m5.717    [39m | [39m5.162    [39m | [39m55.03    [39m |
| [39m23       [39m | [39m-0.04611 [39m | [39m0.06679  [39m | [39m1.646    [39m | [39m0.3375   [39m | [39m9.999    [39m | [39m10.33    [39m | [39m488.5    [39m |
| [39m24       [39m | [39m0.1423   [39m | [39m0.02503  [39m | [39m2.029    [39m | [39m0.2521   [39m | [39m14.07    [39m | [39m14.75    [39m | [39m163.4    [39m |
| [39m25       [39m | [39m-0.003175[39m | [39m0.07754  [39m | [39m3.498    [39m | [39m0.5247   [39m | [39m8.575    [39m | [39m20.35    [39m | [39m485.0    [39m |
| [39m26       [39m | [39m0.1319   [39m | [39m0.1297   [39m | [39m3.673    [39m | [39m0.15     [39m | [39m5.722    [39m | [39m5.168    [39m | [39m55.04    [39m |
| [39m27       [39m | [39m0.1365   [39m | [39m0.1411   [39m | [39m3.64     [39m | [39m0.1131   [39m 

| [39m32       [39m | [39m-0.02161 [39m | [39m0.001    [39m | [39m3.431    [39m | [39m0.1      [39m | [39m5.713    [39m | [39m5.352    [39m | [39m55.51    [39m |
| [39m33       [39m | [39m0.08744  [39m | [39m0.1624   [39m | [39m4.524    [39m | [39m0.416    [39m | [39m14.96    [39m | [39m5.073    [39m | [39m309.8    [39m |
| [39m34       [39m | [39m0.08718  [39m | [39m0.1552   [39m | [39m3.482    [39m | [39m0.1798   [39m | [39m11.12    [39m | [39m19.23    [39m | [39m60.62    [39m |
| [39m35       [39m | [39m-0.1444  [39m | [39m0.1779   [39m | [39m1.378    [39m | [39m0.1688   [39m | [39m13.33    [39m | [39m18.66    [39m | [39m445.5    [39m |
本轮结果: R2=-3.2458, RMSE=0.2056

|   iter    |  target   | learni... | max_depth | max_fe... | max_le... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.09763  [39m | [39m0.08399  