## *Prototyping Bayesian Optimization code*

1. INPUT: Takes in boundary points of a target wing disc shape
2. OUTPUT: Spits out corresponding Surface Evolver modeling parameters resulting in the target shape
3. A test case for a single iteration. A complete implementation can be found in the `master_bayesian_optimization.py` file.

In [1]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import spatial_efd
import math 
import signac
import numpy as np
import os.path
import os
import torch
import gpytorch
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
import gc
# Importing helper libraries for bayesian optimization
from dependencies.data_preprocessing_class import dataPreprocessing
from dependencies.gaussian_process_regression_class import gaussianProcessRegression
from dependencies.acquisition_functions_class import acqisitionFunctions
from dependencies.geometry_writer import geometryWriter
from dependencies.feature_extractor_4 import FeatureExtractor

##### STEP 1
1. Loading Input and output data generated using parameter screening 
2. Data preprocessing

    a. Transforming input parameters into log scale<br>
    b. Normalizing transformed input data<br>
    c. Normalizing the EFD output data<br>
    d. Taking principal components (PCs) of the output EFD features for dimensionality reduction<br>
    e. Selecting principal component 1 as the output
    

In [2]:
# Data files generated by the parameter screening
parameter_input_file = 'master_parameter_input_n.npy'
feature_output_file = "master_feature_output.npy"

# Checking that BOTH data files exist (both are loaded below)
doesDataFileExist = os.path.isfile(feature_output_file) and os.path.isfile(parameter_input_file)

# Fetching and preparing the data from the signac workspace is not implemented
# in this prototype. Fail early with an explicit message instead of letting the
# print statements below raise a NameError on undefined arrays.
if not doesDataFileExist:
    raise FileNotFoundError(
        "Expected '" + parameter_input_file + "' and '" + feature_output_file
        + "' in the working directory; generate them from the signac workspace first."
    )

# Loading input parameters
master_parameter_input_n = np.load(parameter_input_file)
# Loading output EFD coefficients
master_feature_output = np.load(feature_output_file)

# Printing shape of the data
print(np.shape(master_parameter_input_n))
print(np.shape(master_feature_output))

(133, 35)
(133, 80)


In [3]:

# Column-wise statistics of the EFD features, reused later to normalise new shapes.
data_efd_mean = master_feature_output.mean(axis=0)
print(data_efd_mean)
# NOTE: despite the "_variance" suffix this array holds the standard deviation.
data_efd_variance = master_feature_output.std(axis=0)
print(data_efd_variance)
# The spread of the first coefficient is overwritten with a tiny constant.
# NOTE(review): presumably because that coefficient is constant across the data
# set (its printed mean is exactly 1) -- confirm the intent.
data_efd_variance[0] = 10**-33
print(data_efd_variance)

# Helper object wrapping the raw inputs and outputs.
# NOTE(review): 133 equals the number of rows printed for the loaded arrays, so
# it is presumably the sample count -- confirm against dataPreprocessing.
dataPreprocess = dataPreprocessing(master_parameter_input_n, master_feature_output, 133)
# Log-transform of the input parameters
master_parameter_input_log = dataPreprocess.inputLogTransform()

# Columns of the parameter table that were varied in the latin hypercube sampling
LHS_parameter_index = [1, 4, 7, 17, 18, 19, 33]
num_parameters_LHS = 7
# Keep only those columns of the log-transformed inputs
data_x = dataPreprocess.inputParameterSelection(
    num_parameters_LHS,
    LHS_parameter_index,
    master_parameter_input_log,
)

# PCA on the output features, requesting 8 components
(
    total_variance_explained,
    principalComponents,
    weights,
    weights_af,
) = dataPreprocess.pcaEfdFeatures(8)

# Statistics of the selected inputs, captured BEFORE scaling so that sampled
# points can later be mapped back to parameter space.
# NOTE: data_x_variance also holds a standard deviation, not a variance.
data_x_mean = np.mean(data_x, axis=0)
data_x_variance = np.std(data_x, axis=0)

# Standardising the inputs (zero mean, unit standard deviation per column)
data_x = StandardScaler().fit(data_x).transform(data_x)





[ 1.00000000e+00 -2.76754803e-19  5.80933467e-18  3.42554689e-01
 -3.36692091e-03  5.04448513e-02 -1.53920024e-01  3.75995929e-03
  6.68438137e-02  8.19851531e-04 -1.53214419e-03  1.12106414e-01
  3.31577552e-05  2.08928305e-02 -4.17752060e-03 -1.46670356e-03
  2.97181673e-02 -2.25045217e-04  5.09745966e-04  2.39267618e-02
 -5.95402946e-05  3.37261858e-03 -2.92267727e-03  6.05960381e-04
  2.21606330e-03  7.51224791e-05 -5.09978159e-04  2.42044959e-03
 -2.10668077e-04 -1.08677673e-04 -1.91295464e-03 -3.11016836e-04
 -1.76325268e-03  2.17188843e-05  1.85252196e-04 -7.38423269e-04
 -1.70643207e-04 -3.35810679e-04 -5.33373540e-04  3.01050739e-04
 -8.83231410e-04  3.98459979e-05 -2.67858200e-04 -3.01117971e-04
 -2.55933403e-04 -6.07551552e-04 -1.50421751e-04 -8.06691424e-05
 -3.77398624e-04 -9.64304780e-05  8.80573889e-05  1.40610440e-04
 -1.09575349e-04 -3.67602910e-04  3.15161949e-04  4.15247662e-05
 -3.66956272e-04 -2.60787812e-05  2.08523249e-05  3.34352101e-04
 -1.33508313e-04 -2.70658

  master_parameter_input = np.log(self.master_parameter_input_n)


In [4]:
# Sanity check: normalise the first sample's EFD features with the data-set
# statistics and project them with the PC weights; the result is printed next to
# the first row of principalComponents for comparison.
data_norm = (master_feature_output[0, :] - data_efd_mean) / data_efd_variance
# The first feature is forced to zero after normalisation.
data_norm[0] = 0
print(data_norm)

pca_recon = np.matmul(weights, data_norm)
print(principalComponents[0, :])
print(pca_recon)

[ 0.          0.75216795 -1.08024297  1.11075852  0.47547595 -1.19967053
  1.10045732 -1.08297095  0.49090498 -0.86845551  1.05495297  1.47788899
  0.04500868 -0.8842792  -0.8790828   1.17882731 -0.80360646  0.29877076
 -1.37880963 -1.17159546 -0.12745999  0.83428777  0.00422269 -1.14958305
 -0.72446233 -0.20300411  1.21206762 -0.29973492  0.06735282  0.53489167
  0.18792774  1.10571763 -0.19811786 -0.18825431 -1.09128313 -0.39455329
  0.10508231  0.48968872  0.21691441 -0.99015103 -0.1730472  -0.08450309
  1.04938089 -0.25919342  0.13594339  0.49877192  0.03190021  0.90148685
 -0.04677344  0.04941939 -0.6618466  -0.26590885  0.08428024  0.47235724
 -0.28978822 -1.00225826  0.07005857  0.07099854  0.58208516 -0.3815408
  0.07937393  0.33737349 -0.24199658  0.74028599  0.20424563  0.10775298
 -0.54144728 -0.09170787  0.09531838  0.18355539 -0.61304878 -0.28683552
  0.27695506  0.13016934  0.57142126 -0.16119131  0.10059289 -0.049814
 -0.15856291  0.2813207 ]
[-4.3307239  -1.3928108  -1.

##### STEP 2: Training the GPR model
1. Split training and test data
2. Input to GPR model: Log transformed and normalized SE parameters
3. Output to GPR model: PC1 of the normalized EFD features

In [5]:
"""STEP 1"""
"""
# Geometry file containing the boundary point for the wing disc shape for which the papameters have to be estimated
geometry_data = 'input_data/vertices_target.txt' 
if type(geometry_data) is str:
    # Checking if the file containing vertices coordinates are empty
    if os.stat(geometry_data).st_size != 0:
        # 
        a1 = []
        a2 = []
        
        with open(geometry_data) as f:
            # next(f)
            for line in f:
                data = line.split()
                a1.append(float(data[0]))
                a2.append(float(data[1]))
                

    else:
        a1 = 0
        a2 = 0
        
# vposx_exp and vpos_y_exp contains the x and y coordinates of the experimental data
vpos_x_exp = a1
vpos_y_exp = a2

# EFD ciefficients are extracted using the spatial EFD package and boundary points of the target shape
coeffs = spatial_efd.CalculateEFD(vpos_x_exp, vpos_y_exp, 20)
# Normalizing the coefficients against rotation and size
coeffs_exp, rotation = spatial_efd.normalize_efd(coeffs, size_invariant=True)
# Reverse EFD for plotting the normalized tissue shape
xt, yt = spatial_efd.inverse_transform(coeffs, harmonic=20)

# Plotting the experimental tissue contour
plt.plot(xt,yt,'black')
plt.axes().set_aspect('equal', 'datalim')
plt.xlabel("x [nondimensional]")
plt.ylabel("y [nondimensional]")
plt.show()

# Reshaping the EFD features (20x4) as a horizontal array for using it as (80) features
efd_coeff_exp_reshaped = np.reshape(coeffs_exp, (80,))
print(efd_coeff_exp_reshaped)
# normalizing efd coefficients with existing data mean and variance
efd_coeff_exp_normalized = (np.add(np.multiply(efd_coeff_exp_reshaped,data_efd_variance), data_efd_mean)) 
efd_coeff_exp_normalized = np.reshape(efd_coeff_exp_normalized, (80,1))
# Multiplying EFD coefficients by already obtained weight of pc
efd_coeff_exp_normalized_pc = np.matmul(weights,efd_coeff_exp_normalized)
# Reshaping array for appending to the original data array
y_exp = np.reshape(efd_coeff_exp_normalized_pc, (1,8))
"""
# Target shape: vertices written by a sample Surface Evolver run with known parameters
fe_exp = FeatureExtractor('input_data/vertices_target_SE.txt', 'log_edges.xlsx')
# EFD coefficients of the target shape (20 requested, flattened to 80 values below)
coeff_exp = fe_exp.tissue_efd_coeff(20)
# Inverse EFD gives the contour coordinates, e.g. for plotting the target shape
xt_exp, yt_exp = spatial_efd.inverse_transform(coeff_exp, harmonic=20)

# Flatten the coefficients into a single feature vector of length 80
efd_coeff_exp_reshaped = np.reshape(coeff_exp, (80,))

# Normalise with the training-data statistics: (x - mean) / std, as a column vector.
# NOTE(review): the sanity-check cell zeroes the first normalised feature
# (data_norm[0] = 0); that is not done here -- confirm whether it should be.
efd_coeff_exp_normalized = (
    (efd_coeff_exp_reshaped - data_efd_mean) / data_efd_variance
).reshape(80, 1)

# Project onto the principal components using the weights obtained from the PCA
efd_coeff_exp_normalized_pc = np.matmul(weights, efd_coeff_exp_normalized)
# Row-vector form, so it can be appended to the original data array
y_exp = efd_coeff_exp_normalized_pc.reshape(1, 8)


# To do: Incorporate it in the GPR class
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP model with a constant mean and a scaled Matern kernel.

    Args:
        train_x: training inputs, forwarded to ``gpytorch.models.ExactGP``.
        train_y: training targets, forwarded to ``gpytorch.models.ExactGP``.
        likelihood: likelihood object, forwarded to ``gpytorch.models.ExactGP``.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        # Matern kernel; nu is its smoothness parameter.
        # Alternative kept for reference, an RBF kernel:
        #   gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        matern_kernel = gpytorch.kernels.MaternKernel(nu=0.5)
        self.covar_module = gpytorch.kernels.ScaleKernel(matern_kernel)

    def forward(self, x):
        """Return the multivariate normal built from the mean and covariance at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )



In [None]:
# Prototype of the outer optimisation loop.
# NOTE(review): this cell uses min_data_x, max_data_x and LHS, none of which is
# defined or imported by an earlier cell (the only LHS import sits inside a
# disabled string further down). gpr, train_x, train_y, test_x and test_y are
# created only by a later cell, so this cell fails on a fresh top-to-bottom run.
maxIter = 1
num_pc_components = 3
error_target_sampled = []
iter_counter = []
param_sampled = np.zeros((maxIter,7))

for i in range(maxIter):
    # Step 5a: training the GP model
    # Step 5b: estimating the acquisition function
    #     - sample random points in space
    #     - use the GPR model to estimate expected improvement
    #     - find optimum x
    #     - transform to parameter space

    # Method I: latin hypercube sampling between the per-parameter data limits
    xlimits = np.array([[min_data_x[k], max_data_x[k]] for k in range(7)])
    sampling = LHS(xlimits = xlimits)
    # Number of candidate points
    num_samples = 1000000
    # Drawing the candidates
    x = sampling(num_samples)

    # Method II: random sampling
    #num_samples = 1000000
    #x = np.random.rand(num_samples, 7)

    # Weighted sum of the expected improvement over the principal components
    ei = np.zeros((num_samples,))

    for j in range(num_pc_components):
        # Trained model and likelihood for principal component j
        model, likelihood = gpr.GP_Regressor(train_x, train_y, test_x, test_y, 1000, i, ExactGPModel,j)
        y_target = efd_coeff_exp_normalized_pc[j]
        # Acquisition function helper for this component
        af = acqisitionFunctions(x, test_x, test_y[:,j])
        # Adding this component's weighted expected improvement
        ei = ei + weights_af[j]*af.expected_improvement_modified(model, likelihood, y_target)
        # Release the per-component objects before the next fit
        del model, likelihood, af
        gc.collect()

##### STEP 3: Acquisition function for sampling a new point
1. Extracts EFD coefficients of a target image. Takes the PCs of the EFD coefficients to get a target value of PC1.
2. A large number of points are sampled within the parameter space and normalized.
3. Calculates expected improvement over sampled points in the parameter space.
4. A new point is sampled using the defined acquisition function and it is then transformed to the parameter space.


In [None]:
""" STEP 2"""
"""
from smt.sampling_methods import LHS

xlimits = np.array([[-2, 2],[-2, 2],[-2, 2],[-2, 2],[-2, 2],[-2, 2],[-2, 2]])
sampling = LHS(xlimits = xlimits)
# Defining numvber of samples
num_samples = 10000000
# Implementing latin hypercube sampling
x = sampling(num_samples)
"""
""" Temporary bypassing due to anaconda smt installation errors"""
# Defining number of candidate samples
num_samples = 1000000
# Uniform random sampling of the candidate points. This stands in for the latin
# hypercube sampling (disabled above because of smt installation errors).
# The candidates live in the log-transformed, standardized input space, so they
# are drawn from [-2, 2) per dimension -- the same limits the disabled LHS code
# used. np.random.rand would only cover [0, 1), i.e. never propose a value
# below the data mean.
x = np.random.uniform(-2, 2, size=(num_samples, 7))
# Accumulator for the weighted expected improvement over all principal components
ei = np.zeros((num_samples,))
data_y = principalComponents

# Calling in the gpr class
gpr = gaussianProcessRegression(data_x, data_y)
# Splitting up the training and test data
# NOTE(review): 110 and 133 are presumably the split index and the number of
# samples -- confirm against gaussianProcessRegression.split_data.
train_x, train_y, test_x, test_y = gpr.split_data(110, 133)

# One GP model per principal component (8 components were requested from the PCA)
for i in range(8):
    # Getting the trained model and likelihood using the training data
    model, likelihood = gpr.GP_Regressor(train_x, train_y, test_x, test_y, 100, 1, ExactGPModel,i)

    # Target value of principal component i for the target shape
    y_target = efd_coeff_exp_normalized_pc[i]

    # STEP 3
    # Old acquisition function, kept for reference:
    #   af = acqisitionFunctions(x, test_x, test_y)
    #   ei = af.expected_improvement(model, likelihood, 0.9)

    # Calling in the acquisition function class
    # NOTE(review): the loop prototype in the earlier cell passes test_y[:,j]
    # (a single component) here, whereas this call passes all of test_y --
    # confirm which one acqisitionFunctions expects.
    af = acqisitionFunctions(x, test_x, test_y)
    # Adding the weighted expected improvement of this component
    ei = ei + weights_af[i]*af.expected_improvement_modified(model, likelihood, y_target)
    print(np.shape(ei))
    # Free the per-component objects before fitting the next model
    del model
    del likelihood
    del af
    gc.collect()

In [None]:




"""STEP 4"""
# Table of sampled parameter sets (10 rows, one column per sampled parameter)
param_sampling = np.zeros((10,7))

# Index of the candidate with the largest acquisition value
x_sampled_index = np.argmax(ei)
# The selected candidate, still in log-transformed and standardized coordinates
x_sampled_logscale_standardized = x[x_sampled_index, :]

# Back to parameter space: undo the standardization (times std, plus mean),
# then undo the log transform
x_sampled_log = x_sampled_logscale_standardized * data_x_variance + data_x_mean
x_sampled = np.exp(x_sampled_log)

# NOTE(review): the sample is stored in row 1, leaving row 0 at zero --
# confirm this is intended and not an off-by-one.
param_sampling[1, :] = x_sampled

print(param_sampling)
print(np.shape(x_sampled))



##### STEP 4: Obtaining y for the new point sampled
1. Run surface evolver on the new sampled parameter
2. Extract EFD coefficients from the output shape
3. Transform EFD to PC space and obtain the new ysampled

In [None]:
""" Step 1"""
# ---- Step 1: run Surface Evolver on the newly sampled parameters ----
# Initializing the Surface Evolver parameters with their default values
paraminputs = [0,0.0001,0,0,0,0,0,0.001,0,0,0, 0.1,0.1,10,0.1,0.1,0.1,0.1,10,0.0001,0.001,0.001, 1,1,0.6,0.6,0.6,0.6,0.2,0.1,3,0.6,1.8, 0.001,0.001]

# Replacing the sampled parameters with the newly sampled values
paraminputs[1] = x_sampled[0]
# tension cuboidal basal
paraminputs[4] = x_sampled[1]
# tension columnar basal
paraminputs[7] = x_sampled[2]
# k columnar apical
paraminputs[17] = x_sampled[3]
# k columnar basal
paraminputs[18] = x_sampled[4]
# k columnar lateral
paraminputs[19] = x_sampled[5]
# K_ECM
paraminputs[33] = x_sampled[6]

# Defining the set system pressure
param_pressure = 0.001

# Writing geometry file
geometryWriter(paraminputs, param_pressure, 'wingDisc')

# Running the Surface Evolver simulation. The executable location can be
# overridden with the EVOLVER_EXECUTABLE environment variable; the default is
# the original hard-coded path. (The former os.system("exit") call only started
# and closed an extra shell, so it has been dropped.)
evolver_executable = os.environ.get(
    "EVOLVER_EXECUTABLE",
    "/home/nkumar4/Desktop/evolver_installation/src/evolver",
)
evolver_status = os.system(evolver_executable + " wingDisc.fe")
if evolver_status != 0:
    # Best effort: warn instead of aborting, but make the failure visible,
    # because a stale 'vertices.txt' would otherwise be read silently below.
    print("Warning: Surface Evolver exited with status", evolver_status)

# ---- Step 2: extract EFD coefficients from the simulated shape ----
fe = FeatureExtractor('vertices.txt', 'log_edges.xlsx')
efd_coeff_sampled = fe.tissue_efd_coeff(20)
efd_coeff_sampled_reshaped = np.reshape(efd_coeff_sampled, (80,))

# ---- Step 3: transform the EFD coefficients to PC space ----
# Normalizing the EFD coefficients with the existing data mean and standard
# deviation: (x - mean) / std. This matches how the target shape is normalised
# before projection; the previous x * std + mean was the inverse operation and
# put the sampled shape in a different space than the training data.
efd_coeff_sampled_normalized = np.divide(np.subtract(efd_coeff_sampled_reshaped, data_efd_mean), data_efd_variance)
efd_coeff_sampled_normalized = np.reshape(efd_coeff_sampled_normalized, (80,1))
# Multiplying EFD coefficients by already obtained weight of pc
efd_coeff_sampled_normalized_pc = np.matmul(weights,efd_coeff_sampled_normalized)
# Reshaping array for appending to the original data array
y_sampled = np.reshape(efd_coeff_sampled_normalized_pc, (1,8))
print(np.shape(efd_coeff_sampled_normalized_pc))

In [None]:
"""STEP 5: UPDATING TRAINING DATA"""
# TODO: data_y has to grow together with data_x, otherwise inputs and outputs
# no longer line up. Once the Surface Evolver step has produced y_sampled (1, 8):
#data_y = np.vstack((data_y, y_sampled))

# data_x holds log-transformed, standardized inputs, so the new row must be the
# sampled point in that same space -- not x_sampled, which has already been
# mapped back to raw parameter values.
# NOTE: re-running this cell appends the same point again.
data_x = np.vstack((data_x, np.reshape(x_sampled_logscale_standardized,(1,7))))
print(np.shape(data_x))