In [1]:
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locate the dataset relative to the directory the kernel is running in.
PATH = os.getcwd()
CSV_FILE = os.path.join(PATH, "./Combined_set_prescreened (1).csv")

# Read the whole table into memory, then preview the first rows.
df = pd.read_csv(CSV_FILE)
df.head()

Unnamed: 0,MOF,LISD,LFSD,LISFS,Unit_cell_volume,Density,ASA_A2,ASA_m2_per_cm3,ASA_m2_per_g,NASA_A2,...,La,Cr,Ti,Ba,Rh,Ce,Cu,Al,Re,COP
0,XUKYEI_neutral,13.18217,10.2037,13.18217,6140.0,0.287208,1122.81,1828.68,6367.08,0.0,...,0,0,0,0,0,0,2,0,0,0.584018
1,ja300034j_si_002_clean,17.497,17.44104,17.497,2800.68,0.713223,390.541,1394.45,1955.14,0.0,...,0,0,0,0,0,0,0,0,0,0.542496
2,QIYDAF01_clean,22.00141,13.48659,22.00141,52812.6,0.303251,8995.15,1703.22,5616.54,0.0,...,0,0,0,0,0,0,24,0,0,0.522996
3,XAHPIH_clean,14.37026,13.2266,14.37026,12821.8,0.356183,2130.49,1661.62,4665.07,0.0,...,0,0,0,0,0,0,8,0,0,0.515473
4,VETMIS_clean,18.1343,11.96931,18.1343,33152.2,0.311959,5856.65,1766.6,5662.9,0.0,...,0,0,0,0,0,0,12,0,0,0.512787


In [3]:
# Clean the loaded table:
#   * rows with missing values are removed;
#   * "Number_of_pockets.1" is removed because it repeats another column;
#   * "Pu" is removed because it has only one unique value.
# `df` is rebound instead of mutated in place, and errors="ignore" lets the
# cell be re-run after the columns have already been removed (the previous
# in-place version raised KeyError on a second run).
df = (
    df
    .dropna()
    .drop(columns=["Number_of_pockets.1", "Pu"], errors="ignore")
)

In [4]:
# Complete Bayesian optimization function
from sklearn.preprocessing import StandardScaler
import torch
import numpy as np
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.fit import fit_gpytorch_mll
from botorch.acquisition.analytic import ExpectedImprovement
import matplotlib.pyplot as plt
from ipywidgets import Output, HBox
from IPython.display import display

# Two output areas shown side by side; the optimization loop redraws one
# plot inside each of them.
output_max_cop, output_cop_iteration = Output(), Output()
display(HBox(children=[output_max_cop, output_cop_iteration]))

def _prompt_for_cop(mof_name):
    """Ask the user for the COP of ``mof_name`` until a finite number is entered."""
    while True:
        raw_value = input(f"Enter the COP value for {mof_name}:")
        try:
            cop_value = float(raw_value)
        except ValueError:
            # A typo must not abort the run: results are only saved at the end.
            print(f"'{raw_value}' is not a number; please try again.")
            continue
        if np.isfinite(cop_value):
            return cop_value
        print("The COP must be a finite number; please try again.")


def _redraw_history(output_widget, values, title, **line_style):
    """Replace the content of ``output_widget`` with a line plot of ``values``."""
    with output_widget:
        output_widget.clear_output(wait=True)
        fig, ax = plt.subplots(figsize=(6, 3))
        # Plot the full history so the most recent observation is included.
        ax.plot(range(len(values)), values, **line_style)
        ax.set_title(title)
        ax.set_xlabel('Iteration')
        ax.set_ylabel('COP')
        plt.show()
        plt.close(fig)  # avoid accumulating open figures over many iterations


def bayesian_optimization_function(number_of_iterations, number_of_initial_samples, labelled_data, unlabelled_data):
    """
    Run an interactive Bayesian optimization loop over a pool of candidate MOFs.

    In every iteration a ``SingleTaskGP`` is fitted to all observations so far,
    Expected Improvement is evaluated on every candidate, and the user is asked
    (via ``input``) for the COP of the best-ranked candidate that has not been
    queried yet. Two progress plots are redrawn in the module-level widgets
    ``output_max_cop`` and ``output_cop_iteration``; x = 0 in both plots is the
    best COP of the initial labelled data.

    Parameters
    ----------
    number_of_iterations : int
        Number of candidates to query; must not exceed ``len(unlabelled_data)``.
    number_of_initial_samples : int
        Not used by the function; kept so existing calls keep working.
    labelled_data : pandas.DataFrame
        Initial observations with a "MOF" column, a "COP" column and the
        feature columns.
    unlabelled_data : pandas.DataFrame
        Candidate pool with a "MOF" column and the same feature columns as
        ``labelled_data`` (no "COP" column is required).

    Returns
    -------
    str
        Name of the CSV file ("updated_labelled_data.csv") holding the initial
        observations followed by the newly queried rows. Features are written
        unscaled and the "MOF" column is not included.

    Raises
    ------
    AssertionError
        If ``number_of_iterations`` is larger than the candidate pool.
    """
    assert number_of_iterations <= len(unlabelled_data), \
        "number_of_iterations cannot exceed the number of unlabelled candidates"

    name_MOFs = unlabelled_data["MOF"].values
    labelled_data = labelled_data.drop(columns="MOF")
    unlabelled_data = unlabelled_data.drop(columns="MOF")

    # Select features by name so both frames are aligned regardless of where
    # the "COP" column sits in ``labelled_data``.
    feature_columns = [column for column in labelled_data.columns if column != "COP"]
    candidate_features = unlabelled_data[feature_columns]

    # Standardize with statistics of the initial labelled set only.
    scaler = StandardScaler()
    X_scaled_labelled = scaler.fit_transform(labelled_data[feature_columns].values)
    X_scaled_candidates = scaler.transform(candidate_features.values)

    # Double precision: BoTorch recommends float64 for GP fitting.
    train_X = torch.tensor(X_scaled_labelled, dtype=torch.float64)
    train_y = torch.tensor(labelled_data["COP"].values, dtype=torch.float64).unsqueeze(1)
    candidate_X = torch.tensor(X_scaled_candidates, dtype=torch.float64)
    # One single-point batch per candidate, as expected by the acquisition function.
    candidate_X_batched = candidate_X.unsqueeze(1)

    initial_best_cop = train_y.max().item()
    max_cop_observed = [initial_best_cop]   # running best after each query
    cop_at_iteration = [initial_best_cop]   # value obtained at each query

    selected_positions = []     # row positions in the candidate pool, in query order
    already_selected = set()
    observed_cops = []

    for _ in range(number_of_iterations):
        model = SingleTaskGP(train_X, train_y)
        mll = ExactMarginalLogLikelihood(model.likelihood, model)
        fit_gpytorch_mll(mll)

        acquisition = ExpectedImprovement(model, best_f=max_cop_observed[-1])
        with torch.no_grad():
            ei_values = acquisition(candidate_X_batched)

        # Best-ranked candidate that has not been queried yet; the assertion
        # at the top guarantees that one exists.
        ranked_positions = ei_values.argsort(descending=True).tolist()
        next_position = next(
            position for position in ranked_positions if position not in already_selected
        )

        observed_cop = _prompt_for_cop(name_MOFs[next_position])

        train_y = torch.cat(
            [train_y, torch.tensor([[observed_cop]], dtype=torch.float64)], dim=0
        )
        train_X = torch.cat([train_X, candidate_X[next_position].unsqueeze(0)], dim=0)

        selected_positions.append(next_position)
        already_selected.add(next_position)
        observed_cops.append(observed_cop)

        max_cop_observed.append(max(max_cop_observed[-1], observed_cop))
        cop_at_iteration.append(observed_cop)

        _redraw_history(output_max_cop, max_cop_observed, 'Maximum COP Observed',
                        marker='o', linestyle='-', color='b')
        _redraw_history(output_cop_iteration, cop_at_iteration, 'COP at Each Iteration',
                        marker='x', linestyle='--', color='r')

    if selected_positions:
        # Append the raw (unscaled) features of the queried candidates so the
        # saved table uses the same units as the initial rows; build the frame
        # once instead of concatenating inside the loop.
        acquired_rows = candidate_features.iloc[selected_positions].copy()
        acquired_rows["COP"] = observed_cops
        labelled_data = pd.concat([labelled_data, acquired_rows], ignore_index=True)

    csv_file_name = "updated_labelled_data.csv"
    labelled_data.to_csv(csv_file_name, index=False)
    return csv_file_name

HBox(children=(Output(), Output()))

In [5]:
# Fixed seed so the same initial labelled rows are drawn on every run.
RANDOM_SEED = 42
number_of_initial_samples = 10
number_of_iterations = 70

# Rows drawn here act as the initial observations.
labelled_data = df.sample(number_of_initial_samples, random_state=RANDOM_SEED)
# All remaining rows form the candidate pool; their COP is removed before the
# pool is handed to the optimization loop.
unlabelled_data = df.drop(index=labelled_data.index).drop(columns="COP")

bayesian_optimization_function(number_of_iterations, number_of_initial_samples, labelled_data, unlabelled_data)

'updated_labelled_data.csv'