In [1]:
##Importing all needed libraries
try:
    # Summit-related imports
    import summit
    from summit.benchmarks import ExperimentalEmulator
    from summit.domain import *
    from summit.utils.dataset import DataSet
    from summit.strategies import SOBO, MultitoSingleObjective, LHS

    # External libraries
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    # File and path handling
    import pathlib
    import os

except ModuleNotFoundError as e:
    print(f"Error: {e}. Please install the required libraries before running the program.")


In [2]:
# Configurable parameters
# Project root folder. The default is the original machine-specific location;
# set the MIDAZOLAM_BO_BASE_PATH environment variable to run the notebook from
# another machine or folder without editing this cell.
base_path = pathlib.Path(
    os.environ.get("MIDAZOLAM_BO_BASE_PATH", "F:/Python Programs/NakulMidazolamBO")
)
BOUNDS_NAME = "Nakul_Midazolam_BoundariesV2.csv"  # boundaries CSV, read from the Data folder
DATA_NAME = "StartExp.csv"  # initial experiments CSV, read from the Data folder
PROJECT_NAME = "Midazolam_Optimization_Test"

# Folder names (created under base_path)
DATA_DIR = "Data"
MODEL_DIR = "Models"
IT_DIR = "IterData"

# Function to create directory if it doesn't exist
def create_directory(base_path, directory):
    """Create `directory` under `base_path` if it doesn't exist.

    Parameters:
    - base_path (pathlib.Path): Folder under which the directory is created.
    - directory (str): Name (or relative path) of the directory to create.

    Missing parent folders are created as well. Calling this for a directory
    that already exists is a no-op.

    Raises:
    - FileExistsError: If the target path exists but is not a directory.
    """
    dir_path = base_path / directory
    # exist_ok=True replaces the separate is_dir() check, so there is no window
    # between "check" and "create" in which another process can create the folder.
    dir_path.mkdir(parents=True, exist_ok=True)
        
# Resolve the working folders under base_path
data_path = base_path / DATA_DIR
model_path = base_path / MODEL_DIR
it_path = base_path / IT_DIR

# Make sure every working folder exists on disk
for directory in (DATA_DIR, MODEL_DIR, IT_DIR):
    create_directory(base_path, directory)

# Load initial boundaries data
# The friendly message is printed first, then the error is re-raised: without
# it this cell would finish "successfully" while init_bounds_df is undefined
# (or still holds a value from an earlier run), and later cells would fail in
# a confusing way or silently use stale data.
try:
    init_bounds_df = pd.read_csv(data_path / BOUNDS_NAME)
except FileNotFoundError:
    print(f"Error: File '{BOUNDS_NAME}' not found. Please check the file path.")
    raise
except pd.errors.EmptyDataError:
    print(f"Error: File '{BOUNDS_NAME}' is empty or in an invalid format.")
    raise


In [3]:
init_bounds_df  # Temporary: display the boundaries DataFrame loaded from BOUNDS_NAME

Unnamed: 0,Condition,Type,Categories,BoundaryMin,BoundaryMax,Description,Maximize
0,Temperature,Continuous,,40.0,80.0,Reaction temperature in degrees Celsius (ºC),
1,Catalyst_Amount,Continuous,,0.01,2.0,Catalyst amounts in molar equivalents (Equiv.),
2,Starting_Reagent,Continuous,,1.1,2.0,2-Methylimidozole amounts in molar equivalents...,
3,Solvent,Continuous,,0.1,0.35,Solvent amount in milliliters (mL),
4,Time,Continuous,,2.0,24.0,Duration of reaction in hours (hr),
5,Base,Continuous,,1.0,5.0,Base amount in molar equivalents (Equiv.),
6,Main_Product,Objective,,0.0,1.0,LCAP of Main Product,True
7,Main_Impurity,Objective,,0.0,1.0,LCAP of Main Impurity,False


In [4]:
def create_domain(init_bounds_df):
    """
    Create a Summit domain based on the provided boundaries DataFrame.

    Columns are read by position, not by name, so the DataFrame must be laid
    out as:
        0 = variable name
        1 = variable type ('Categorical', 'Continuous' or 'Objective')
        2 = comma-separated levels (read for 'Categorical' rows only)
        3 = lower bound
        4 = upper bound
        5 = description
        6 = maximize flag (read for 'Objective' rows only)

    Rows whose type is none of the three values above are skipped.

    Parameters:
    - init_bounds_df (pd.DataFrame): DataFrame containing information about variable boundaries.

    Returns:
    - domain (Domain): The created Summit domain.
    - obj_df (DataSet): DataSet with one (name, "DATA") column per objective variable.
    - in_count (int): Count of input variables.
    - out_count (int): Count of output variables.

    Raises:
    - ValueError: If init_bounds_df is not a DataFrame or is empty.
    """
    if not isinstance(init_bounds_df, pd.DataFrame) or init_bounds_df.empty:
        raise ValueError("Invalid input: init_bounds_df must be a non-empty DataFrame.")

    # Positional layout of the boundaries table (see docstring)
    NAME_COL, TYPE_COL, LEVELS_COL, MIN_COL, MAX_COL, DESC_COL, MAXIMIZE_COL = range(7)

    domain = Domain()
    obj_df = DataSet.from_df(pd.DataFrame())

    in_count = 0
    out_count = 0

    for _, row in init_bounds_df.iterrows():
        # Use .iloc for positional access: integer keys passed to row[...] on a
        # string-labelled Series are deprecated and will stop meaning "position".
        name = row.iloc[NAME_COL]
        description = row.iloc[DESC_COL]
        data_type = row.iloc[TYPE_COL]

        if data_type == 'Categorical':
            levels = row.iloc[LEVELS_COL].split(',')

            domain += CategoricalVariable(
                name = name,
                description = description,
                levels = levels
            )
            in_count += 1

        elif data_type == 'Continuous':
            bounds = [row.iloc[MIN_COL], row.iloc[MAX_COL]]

            domain += ContinuousVariable(
                name = name,
                description = description,
                bounds = bounds
            )
            in_count += 1

        elif data_type == 'Objective':
            bounds = [row.iloc[MIN_COL], row.iloc[MAX_COL]]
            maximize = row.iloc[MAXIMIZE_COL]

            domain += ContinuousVariable(
                name = name,
                description = description,
                bounds = bounds,
                is_objective = True,
                maximize = maximize
            )
            out_count += 1

            # Register an (initially empty) data column for this objective
            obj_df[(name, "DATA")] = ""

    return domain, obj_df, in_count, out_count


In [8]:
def preprocess_data(data_df, init_bounds_df, out_count):
    """
    Preprocess the input data.

    Adds an achievement-function column to data_df, computed as the
    second-to-last data column minus the last data column, and returns the
    rows sorted by it in descending order. Also derives a per-variable
    threshold from the boundary ranges.

    Note: data_df is modified in place (the ('Achievement_Function', 'DATA')
    column is added or overwritten). Calling the function again on the same
    frame recomputes the same values: an existing achievement column is
    ignored when locating the two source columns.

    Parameters:
    - data_df (pd.DataFrame): Input data DataFrame. Assumes the last two data
      columns are the objective to maximize and the objective to minimize,
      in that order -- TODO confirm against the data file.
    - init_bounds_df (pd.DataFrame): DataFrame containing information about variable boundaries.
      Column 3 is read as the lower bound and column 4 as the upper bound.
    - out_count (int): Count of output variables; that many trailing rows of
      init_bounds_df are excluded from in_bounds_thresh_df.

    Returns:
    - sorted_data_df (pd.DataFrame): Data sorted by the achievement function, best first.
    - in_bounds_thresh_df (pd.DataFrame): Input-variable boundaries with a 'Threshold' column.
    - INIT_BOUNDS_THRESH_FRAC (float): Fraction used for calculating bounds thresholds.
    - ach_func_bounds (list): Achievement function bounds.
    - ach_func_thresh (float): Threshold for the achievement function.
    """
    # Constants
    DATA_COL_NAME = ('Achievement_Function','DATA')
    BOUNDS_COL_NAME = 'Threshold'
    INIT_BOUNDS_THRESH_FRAC = 0.10
    ACH_FUNC_THRESH_FRAC = 0.025

    # Achievement function bounds
    ach_func_bounds = [-1,1] #Change this to automatic later on

    # Locate the source columns by position while skipping an achievement
    # column left over from a previous call; otherwise a second call would
    # subtract the old achievement values instead of the last objective.
    value_positions = [
        pos for pos, label in enumerate(data_df.columns) if label != DATA_COL_NAME
    ]

    # Calculate the achievement function
    data_df[DATA_COL_NAME] = (
        data_df.iloc[:, value_positions[-2]] - data_df.iloc[:, value_positions[-1]]
    )

    # sort_values returns a new frame, so no extra copy is needed
    sorted_data_df = data_df.sort_values(
        by = DATA_COL_NAME,
        ascending = False
    )

    # Create a copy of the initial bounds DataFrame
    bounds_thresh_df = init_bounds_df.copy()

    # Calculate the threshold for the initial boundaries
    bounds_thresh_df[BOUNDS_COL_NAME] = (
        init_bounds_df.iloc[:, 4] - init_bounds_df.iloc[:, 3]
    ) * INIT_BOUNDS_THRESH_FRAC

    # Drop the trailing output rows. A slice of [:-0] would be empty, so with
    # no outputs every row is kept instead.
    input_rows = bounds_thresh_df.iloc[:-out_count] if out_count else bounds_thresh_df
    in_bounds_thresh_df = input_rows.copy()

    # Calculate the achievement function threshold
    ach_func_thresh = (ach_func_bounds[1] - ach_func_bounds[0])*ACH_FUNC_THRESH_FRAC

    return sorted_data_df, in_bounds_thresh_df, INIT_BOUNDS_THRESH_FRAC, ach_func_bounds, ach_func_thresh

In [9]:
# Build the optimisation domain from the boundaries table
domain, obj_df, in_count, out_count = create_domain(init_bounds_df)

# Load the starting experiments
data_df = DataSet.read_csv(data_path / DATA_NAME)

# Rank the experiments and derive the thresholds
(
    sorted_data_df,
    in_bounds_thresh_df,
    INIT_BOUNDS_THRESH_FRAC,
    ach_func_bounds,
    ach_func_thresh,
) = preprocess_data(data_df, init_bounds_df, out_count)


In [14]:
def check_af_converg(sorted_data_df):
    """
    Measure how tightly the first three rows agree on the achievement function.

    The achievement function is read from the last column of the DataFrame,
    using the first three rows in their given order.

    Parameters:
    - sorted_data_df (pd.DataFrame): Sorted data DataFrame.

    Returns:
    - avg_diff_top_3 (float): Mean absolute deviation of the first three
      achievement-function values from their own mean.

    Raises:
    - ValueError: If the DataFrame has fewer than 3 rows.
    """
    if len(sorted_data_df) < 3:
        raise ValueError("Insufficient data for convergence check. Need at least 3 rows.")

    # Last column, first three rows
    leading_values = sorted_data_df.iloc[:, -1].iloc[:3]

    # Distance of each value from the group mean, averaged
    deviations = leading_values - leading_values.mean()
    return deviations.abs().mean()

In [15]:
# Mean absolute deviation of the top three achievement-function values from
# their mean; a smaller value means the best experiments agree more closely.
avg_diff_top_3 = check_af_converg(sorted_data_df)

In [16]:
avg_diff_top_3  # Display the convergence metric returned by check_af_converg

0.084