In [28]:
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel as C


In [29]:
EPS = 1e-9  # numerical stability

def _expected_improvement(mu, sigma, y_best, xi=0.0):
    sigma_safe = sigma + EPS
    improvement = mu - y_best - xi
    z = improvement / sigma_safe
    ei = improvement * norm.cdf(z) + sigma_safe * norm.pdf(z)
    ei[sigma == 0.0] = 0.0
    return ei


In [30]:
def propose_next_experiment(
    X_train,
    y_train,
    bounds,
    n_candidates=1000,
    xi=0.0,
    random_state=None,
    return_all=False
):
    rng = np.random.default_rng(random_state)
    cols = list(bounds)
    d = len(cols)
    if X_train.shape[1] != d:
        raise ValueError("X_train dimension does not match number of bounds")

    # Step 1: Monte‑Carlo sampling in parameter space
    cand_mat = np.column_stack([
        rng.uniform(lo, hi, n_candidates) for lo, hi in bounds.values()
    ])
    cand_df = pd.DataFrame(cand_mat, columns=cols)

    # Step 2: Fit Gaussian Process surrogate
    kernel = (
        C(1.0, (1e-3, 1e3))
        * RBF(length_scale=np.ones(d), length_scale_bounds=(1e-2, 1e2))
        + WhiteKernel(noise_level=1e-6, noise_level_bounds=(1e-10, 1e-2))
    )
    gp = GaussianProcessRegressor(
    kernel=kernel,
    n_restarts_optimizer=5,
    alpha=0.0,
    normalize_y=True,
    random_state=42,  
)
    gp.fit(X_train, y_train)
    
    mu, sigma = gp.predict(cand_mat, return_std=True)

    # Step 3: Calculate Expected Improvement (EI)
    y_best = y_train.max()
    ei = _expected_improvement(mu, sigma, y_best, xi=xi)
    best_idx = np.argmax(ei)

    best_params = cand_df.iloc[best_idx].to_dict()
    best_ei = ei[best_idx]

    if return_all:
        cand_df = cand_df.assign(mu=mu, sigma=sigma, EI=ei)
        return cand_df, best_params, best_ei
    return best_params, best_ei


In [None]:

df = pd.read_csv("data/aryltrifluoromethylation_cytosine.csv")


key_cols = [
    "Cytosine_Conc_(M)",
    "CF3SO2Na_loading_(equiv.)",
    "(NH4)2S2O8_loading_(equiv.)",
    "Residence_time_(min)",
    "Light_intensity_(W)",
]

# random select 10 rows as initial experiment data
rng = np.random.default_rng(42)
seed_idx = rng.choice(len(df), size=10, replace=False)

X_train = df.loc[seed_idx, key_cols].to_numpy()
y_train = df.loc[seed_idx, "Yield_(%)"].to_numpy() # when get the real y, replace y_train with the real y

# define the bounds of the parameter space
bounds = {c: (df[c].min(), df[c].max()) for c in key_cols}


In [36]:
# 推荐 x₁₁，并打印参数、mu、sigma、EI
cand_df, best_params, best_ei = propose_next_experiment(
    X_train, y_train, bounds, random_state=42, return_all=True
)

best_idx = cand_df['EI'].idxmax()
best_mu = cand_df.loc[best_idx, 'mu']
best_sigma = cand_df.loc[best_idx, 'sigma']

print("recommended next experiment (x₁₁):")
for k, v in best_params.items():
    print(f"{k:<35s}: {v:.4g}")
print(f"Predicted mean (mu): {best_mu:.4g}")
print(f"Predicted std dev (sigma): {best_sigma:.4g}")
print(f"Expected Improvement (EI): {best_ei:.4g}")


recommended next experiment (x₁₁):
Cytosine_Conc_(M)                  : 0.05248
CF3SO2Na_loading_(equiv.)          : 2.813
(NH4)2S2O8_loading_(equiv.)        : 1.291
Residence_time_(min)               : 4.796
Light_intensity_(W)                : 174.8
Predicted mean (mu): 72.83
Predicted std dev (sigma): 4.946
Expected Improvement (EI): 1.076




In [39]:
n_steps = 5  # 你想要循环几步，比如5步
X_loop = X_train.copy()
y_loop = y_train.copy()

for i in range(n_steps):
    cand_df, next_params, next_ei = propose_next_experiment(
        X_loop, y_loop, bounds, random_state=42, return_all=True
    )
    best_idx = cand_df['EI'].idxmax()
    best_mu = cand_df.loc[best_idx, 'mu']
    best_sigma = cand_df.loc[best_idx, 'sigma']

    print(f"Step {i+1} - recommended next experiment (x_{11+i}):")
    for k, v in next_params.items():
        print(f"{k:<35s}: {v:.4g}")
    print(f"Predicted mean (mu): {best_mu:.4g}")
    print(f"Predicted std dev (sigma): {best_sigma:.4g}")
    print(f"Expected Improvement (EI): {next_ei:.4g}\n")

    # if get the real y, replace the real_y with the real y
    real_y_list = [75.0, 80.5, 65.2, 70.1, 78.9]
    new_x = np.array([list(next_params.values())])
    new_y = np.array([real_y])
    X_loop = np.vstack([X_loop, new_x])
    y_loop = np.append(y_loop, new_y)




Step 1 - recommended next experiment (x_11):
Cytosine_Conc_(M)                  : 0.05248
CF3SO2Na_loading_(equiv.)          : 2.813
(NH4)2S2O8_loading_(equiv.)        : 1.291
Residence_time_(min)               : 4.796
Light_intensity_(W)                : 174.8
Predicted mean (mu): 72.83
Predicted std dev (sigma): 4.946
Expected Improvement (EI): 1.076

Step 2 - recommended next experiment (x_12):
Cytosine_Conc_(M)                  : 0.05325
CF3SO2Na_loading_(equiv.)          : 3.066
(NH4)2S2O8_loading_(equiv.)        : 1.098
Residence_time_(min)               : 8.139
Light_intensity_(W)                : 142.4
Predicted mean (mu): 44.96
Predicted std dev (sigma): 13.73
Expected Improvement (EI): 0.06958

Step 3 - recommended next experiment (x_13):
Cytosine_Conc_(M)                  : 0.06879
CF3SO2Na_loading_(equiv.)          : 2.769
(NH4)2S2O8_loading_(equiv.)        : 1.198
Residence_time_(min)               : 3.119
Light_intensity_(W)                : 142.9
Predicted mean (mu): 69.



In [40]:
mu_array = cand_df['mu'].values
sigma_array = cand_df['sigma'].values

# Min–Max normalization 到 [0,1]
mu_min, mu_max = mu_array.min(), mu_array.max()
sigma_min, sigma_max = sigma_array.min(), sigma_array.max()
mu_norm = (mu_array - mu_min) / (mu_max - mu_min + 1e-12)
sigma_norm = (sigma_array - sigma_min) / (sigma_max - sigma_min + 1e-12)

# 打分逻辑
score_exploit = mu_norm - sigma_norm     # [-1,1]，正为exploit，负为explore
score_explore = sigma_norm - mu_norm     # [-1,1]
score_combined = 2*((mu_norm + sigma_norm)/2) - 1  # [-1,1]

df_scores = pd.DataFrame({
    'mu': mu_array,
    'sigma': sigma_array,
    'mu_norm': mu_norm,
    'sigma_norm': sigma_norm,
    'score_exploit': score_exploit,
    'score_explore': score_explore,
    'score_combined': score_combined
})

