## **Imports**

In [40]:
from bayes_opt import BayesianOptimization
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
import math
from scipy.stats import norm
import random

In [41]:
import rf_utils

## **Parameters**

In [42]:
dataset = rf_utils.dataset("AgNP")
data_df = dataset["AgNP"].copy()

AgNP


In [43]:
n_ensemble = 50
n_initial = 2
n_top = int(math.ceil(len(data_df) * 0.05))
objective_metric = list(data_df.columns)[-1]
top_indices = list(data_df.sort_values(objective_metric).head(n_top).index)
n_est = 100
N = len(data_df)
seed_list = [4295, 8508, 326, 3135, 1549, 2528, 1274, 6545, 5971, 6269, 2422, 4287, 9320, 4932, 951,
            4304, 1745, 5956, 7620, 4545, 6003, 9885, 5548, 9477, 30, 8992, 7559, 5034, 9071, 6437,
            3389, 9816, 8617, 3712, 3626, 1660, 3309, 2427, 9872, 938, 5156, 7409, 7672, 3411, 3559,
            9966, 7331, 8273, 8484, 5127, 2260, 6054, 5205, 311, 6056, 9456, 928, 6424, 7438, 8701,
            8634, 4002, 6634, 8102, 8503, 1540, 9254, 7972, 7737, 3410, 4052, 8640, 9659, 8093, 7076,
            7268, 2046, 7492, 3103, 3034, 7874, 5438, 4297, 291, 5436, 9021, 3711, 7837, 9188, 2036,
            8013, 6188, 3734, 187, 1438, 1061, 674, 777, 7231, 7096, 3360, 4278, 5817, 5514, 3442, 6805,
            6750, 8548, 9751, 3526, 9969, 8979, 1526, 1551, 2058, 6325, 1237, 5917, 5821, 9946, 5049, 654,
            7750, 5149, 3545, 9165, 2837, 5621, 6501, 595, 3181, 1747, 4405, 4480, 4282, 9262, 6219, 3960,
            4999, 1495, 6007, 9642, 3902, 3133, 1085, 3278, 1104, 5939, 7153, 971, 8733, 3785, 9056, 2020,
            7249, 5021, 3384, 8740, 4593, 7869, 9941, 8813, 3688, 8139, 6436, 3742, 5503, 1587, 4766, 9846,
            9117, 7001, 4853, 9346, 4927, 8480, 5298, 4753, 1151, 9768, 5405, 6196, 5721, 3419, 8090, 8166,
            7834, 1480, 1150, 9002, 1134, 2237, 3995, 2029, 5336, 7050, 6857, 8794, 1754, 1184, 3558, 658,
            6804, 8750, 5088, 1136, 626, 8462, 5203, 3196, 979, 7419, 1162, 5451, 6492, 1562, 8145, 8937,
            8764, 4174, 7639, 8902, 7003, 765, 1554, 6135, 1689, 9530, 1398, 2273, 7925, 5948, 1036, 868,
            4617, 1203, 7680, 7, 93, 3128, 5694, 6979, 7136, 8084, 5770, 9301, 1599, 737, 7018, 3774, 9843,
            2296, 2287, 9875, 2349, 2469, 8941, 4973, 3798, 54, 2938, 4665, 3942, 3951, 9400, 3094, 2248,
            3376, 1926, 5180, 1773, 3681, 1808, 350, 6669, 826, 539, 5313, 6193, 5752, 9370, 2782, 8399,
            4881, 3166, 4906, 5829, 4827, 29, 6899, 9012, 6986, 4175, 1035, 8320, 7802, 3777, 6340, 7798, 7705]
seed_list = seed_list[:n_ensemble]

## **Code**

In [44]:
def RF_pred(X, RF_model):
    tree_predictions = []
    for j in np.arange(n_est):
        tree_predictions.append((RF_model.estimators_[j].predict(np.array([X]))).tolist())
    mean = np.mean(np.array(tree_predictions), axis=0)[0]

    
    std = np.std(np.array(tree_predictions), axis=0)[0]
    return mean, std


def EI(X, RF_model, y_best):

    mean, std = RF_pred(X, RF_model)
    
    z = (y_best - mean)/std
    return (y_best - mean) * norm.cdf(z) + std * norm.pdf(z)

def LCB(X, RF_model, ratio):
    
    mean, std = RF_pred(X, RF_model)

    return - mean + ratio * std

def PI(X, RF_model, y_best):
    
    mean, std = RF_pred(X, RF_model)
    
    z = (y_best - mean)/std
    return norm.cdf(z)

In [50]:
feature_names = list(data_df.columns)[:-1]
objective_metric = list(data_df.columns)[-1]

X_features = data_df[feature_names].values
y = np.array(data_df[objective_metric].values)

index_collection = []
X_collection = []
y_collection = []
TopCount_collection = []

for s in seed_list:
    ### Initial setup for each model run
    print(f"Using seed of {s}")
    random.seed(s)
    index_learn = list(np.arange(N)) #all possible indices
    index_ = random.sample(index_learn, n_initial) #list of initial index's
    X_ = [] # ???store all observed good candidates' input feature X???
    y_ = [] # ???store all observed good candidates' objective value y???
    c = 0 # num top candidates found so far
    TopCount_ = [] # ???cumulative number of top candidates found at each learning cycle???
    for idx in index_:
        X_.append(X_features[idx]) # add the features of the random index
        y_.append(y[idx])
        if idx in top_indices:
            c += 1
        TopCount_.append(c) # add initial-count 
        index_learn.remove(idx) # remove idx from index_ so not reused
        
    ### Run loop until all candidates (in pool) have been observed
    # !!!should make into while loop!!!
    for i in np.arange(len(index_learn)):
        y_best = np.min(y_) # select best data-point so far
        s_scaler = preprocessing.StandardScaler()
        X_train = s_scaler.fit_transform(X_)
        y_train = s_scaler.fit_transform([[i] for i in y_])
        
        RF_model = RandomForestRegressor(n_estimators= n_est, n_jobs= -1)
        RF_model.fit(X_train, y_train)
        
        ### Now we look at remaining candidates and use acquisition function to choose next best one
        next_index = None
        max_ac = -10**10
        for j in index_learn:
            X_j = X_features[j]
            y_j = y[j]
            
            ac_value = LCB(X_j, RF_model, 10)
            
            if max_ac <= ac_value:
                max_ac = ac_value
                next_index = j
        
        X_.append(X_features[next_index])
        y_.append(y[next_index])
        
        if next_index in top_indices:
            c += 1
            
        TopCount_.append(c)
        index_learn.remove(next_index)
        index_.append(next_index)
    
    index_collection.append(index_)
    X_collection.append(X_)
    y_collection.append(y_)
    TopCount_collection.append(TopCount_)
    break


Using seed of 4295


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **