In [None]:
from armored.models import *
from armored.preprocessing import *

In [None]:
import matplotlib.pyplot as plt
import time
from scipy.stats import linregress

import seaborn as sns

# Global matplotlib styling applied to every figure produced by this notebook.
params = {
    'legend.fontsize': 18,
    'figure.figsize': (8, 6),
    'axes.labelsize': 24,
    'axes.titlesize': 24,
    'axes.linewidth': 5,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
}
plt.rcParams.update(params)

# The bundled seaborn style sheets were renamed to 'seaborn-v0_8-*' in
# matplotlib 3.6 and the old names were removed in later releases, so pick
# whichever name this installation provides. Falling back to the legacy name
# keeps the original behaviour (including its error) on old installations.
colorblind_style = (
    'seaborn-v0_8-colorblind'
    if 'seaborn-v0_8-colorblind' in plt.style.available
    else 'seaborn-colorblind'
)
plt.style.use(colorblind_style)

# Font type 42 embeds TrueType fonts in saved PDFs.
plt.rcParams['pdf.fonttype'] = 42

In [None]:
from numpy.random import default_rng

# One seeded generator is shared by every random draw in this notebook so
# that a fresh "Restart & Run All" reproduces the same experiment selections.
rng = default_rng(123)

# Define simulation parameters and import full dataset

In [None]:
# ---- simulation budget ----
n_trials = 30   # number of trials
n_dtl = 5       # number of DTL cycles per trial
n_init = 5      # number of initial samples to train on
n_test = 5      # number of samples picked for each next experiment

# ---- system size ----
n_s = 5         # number of species in the model
n_r = 7         # number of resources

# ---- variable names ----
# system variables: every species followed by the measured output(s)
species = [f"s{k}" for k in range(1, n_s + 1)]
outputs = ['product']
sys_var = [*species, *outputs]

# parameters that enter the objective function
obj_params = ['volume']

# control inputs: one per resource plus the feed
controls = [f"rf{k}" for k in range(1, n_r + 1)]
controls.append('feed')
system_variables = [*species, *outputs, *controls]


def objective(pred, vol):
    """Return end-point product concentration times end-point volume.

    `pred[-1, -1]` is the last time point of the last predicted column
    (the product concentration) and `vol[-1]` is the last volume entry.
    """
    return pred[-1, -1] * vol[-1]

# Load the full dataset from disk. The per-row values of the "Experiments"
# column are kept as an array; later cells use them to select the rows that
# belong to a chosen set of experiments.
main_df = pd.read_csv("Data/reactor_ubiome.csv")
all_exp_names = main_df["Experiments"].values

In [None]:
# Draw, for every trial, a random set of `n_init` distinct experiment names
# (sampling without replacement from the unique names in the dataset).
_distinct_exp_names = np.unique(all_exp_names)
initial_exps = [
    rng.choice(_distinct_exp_names, n_init, replace=False)
    for _ in range(n_trials)
]

In [None]:
def sse(a, b):
    """Return the sum of squared errors between `a` and `b`.

    Both arguments may be any array-like (NumPy array, list, tuple, scalar);
    they are converted with `np.asarray` so plain Python sequences work too,
    and the usual NumPy broadcasting rules apply to the subtraction.

    Parameters
    ----------
    a, b : array-like
        Values to compare element-wise.

    Returns
    -------
    scalar
        Sum over all elements of `(a - b) ** 2`.
    """
    diff = np.asarray(a) - np.asarray(b)
    return np.sum(diff ** 2)

# Loop over each trial

In [None]:
# Result tables, one row per trial and one 'DTL j' column per cycle
dtl_df = pd.DataFrame()      # best objective value found so far
dtl_df_R = pd.DataFrame()    # R value of the end-point product predictions
dtl_df_sse = pd.DataFrame()  # SSE of the end-point product predictions
elapsed_time = []            # seconds spent selecting each batch of experiments

# Column of the formatted data that holds the product concentration. The
# objective treats the last system-variable column as the product, so the same
# column is used when scoring predictions (this is index 5 with the settings
# above) instead of a hard-coded number.
product_col = len(sys_var) - 1


def _trial_row(trial, values):
    """Return a one-row DataFrame: 'Trial' followed by one 'DTL j' column per value."""
    row = {'Trial': [trial]}
    row.update({f'DTL {j}': [value] for j, value in enumerate(values)})
    return pd.DataFrame(row)


def _score_endpoint(measured, predicted):
    """Scatter-plot predicted vs. measured end-point product and return (R, SSE).

    R is the `rvalue` reported by `scipy.stats.linregress` (measured on x,
    predicted on y); SSE is computed with `sse`.
    """
    rvalue = linregress(measured, predicted).rvalue
    sse_value = sse(measured, predicted)
    plt.scatter(measured, predicted, label="R = {:.3f}\nSSE = {:.3f}".format(rvalue, sse_value))
    plt.xlabel("Measured end-point product")
    plt.ylabel("Predicted end-point product")
    plt.legend()
    plt.show()
    return rvalue, sse_value


for trial in range(n_trials):

    # format data
    main_data, main_obj_params, unique_exp_names, N_total = format_data(main_df, species, outputs, controls, obj_params=obj_params)

    # keep track of objective
    objective_found = []

    # choose random set of training samples
    # (np.isin replaces np.in1d, which is deprecated since NumPy 2.0)
    train_df = main_df.iloc[np.isin(all_exp_names, initial_exps[trial])].copy()
    train_data, train_obj_params, new_experiments, N = format_data(train_df, species, outputs, controls, obj_params=obj_params)

    # remove training samples from main dataset so that they're not selected more than once
    train_inds = np.isin(unique_exp_names, new_experiments)
    main_data = main_data[~train_inds]
    main_obj_params  = main_obj_params[~train_inds]
    unique_exp_names = unique_exp_names[~train_inds]

    # compute objectives of the initial training samples
    target_found = []
    for sample, train_obj_param in zip(train_data, train_obj_params):
        target_found.append(objective(sample[:, :len(sys_var)], train_obj_param))
    target_found = np.array(target_found)
    objective_found.append(np.max(target_found))
    objective_rval = []
    objective_sse  = []

    # Search over full factorial and update model
    for dtl in range(n_dtl):
        print(f"Running trial {trial+1}, cycle {dtl+1}")

        # scale train and design space data
        scaler = ZeroMaxScaler().fit(train_data)
        train_data = scaler.transform(train_data)
        main_data_scaled = scaler.transform(main_data)

        # fit model
        brnn = RNN(n_species=n_s, n_metabolites=1, n_controls=len(controls), n_hidden=16, N=N)
        brnn.fit(train_data)

        # assess prediction performance of end-point product on the
        # experiments that have not been selected yet
        pred, stdv, cov = brnn.predict(main_data_scaled)
        pred = scaler.inverse_transform(pred)
        stdv = scaler.inverse_transform(stdv)
        rvalue, sse_value = _score_endpoint(np.array(main_data[:, -1, product_col]), pred[:, -1, -1])
        objective_rval.append(rvalue)
        objective_sse.append(sse_value)

        # randomly search over design space (only the selection step is timed)
        t0 = time.time()
        new_experiments = rng.choice(unique_exp_names, n_test, replace=False)
        elapsed_time.append(time.time()-t0)

        # collect new data
        new_df   = main_df.iloc[np.isin(all_exp_names, new_experiments)].copy()
        new_data, new_obj_params, new_experiments, N = format_data(new_df, species, outputs, controls, obj_params=obj_params)

        # remove the newly selected samples from main dataset
        train_inds = np.isin(unique_exp_names, new_experiments)
        main_data = main_data[~train_inds]
        main_obj_params  = main_obj_params[~train_inds]
        unique_exp_names = unique_exp_names[~train_inds]

        # compute objectives of the newly selected samples
        target_found = []
        for sample, new_obj_param in zip(new_data, new_obj_params):
            target_found.append(objective(sample[:, :len(sys_var)], new_obj_param))
        target_found = np.array(target_found)

        # store the best objective found (so far)
        objective_found.append(np.max([np.max(objective_found), np.max(target_found)]))

        # Update dataset (re-formatting also resets train_data to unscaled values
        # and N to the value reported for the enlarged training set)
        train_df = pd.concat((train_df, new_df))
        train_data, train_obj_params, train_experiments, N = format_data(train_df, species, outputs, controls, obj_params=obj_params)

    ### fit model one last time to assess final prediction performance ###
    # scale train and design space data
    scaler = ZeroMaxScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    main_data_scaled = scaler.transform(main_data)

    # fit model -- use the same RNN class as inside the cycles so the final
    # R/SSE column is comparable with the per-cycle columns (this line
    # previously instantiated miRNN, unlike every other fit in this loop)
    brnn = RNN(n_species=n_s, n_metabolites=1, n_controls=len(controls), n_hidden=16, N=N)
    brnn.fit(train_data)

    # assess prediction performance of end-point product
    pred, stdv, cov = brnn.predict(main_data_scaled)
    pred = scaler.inverse_transform(pred)
    stdv = scaler.inverse_transform(stdv)
    rvalue, sse_value = _score_endpoint(np.array(main_data[:, -1, product_col]), pred[:, -1, -1])
    objective_rval.append(rvalue)
    objective_sse.append(sse_value)

    # append this trial's results; the tables are updated after every trial so
    # partial results remain available if the run is interrupted
    dtl_df = pd.concat((dtl_df, _trial_row(trial, objective_found)))
    dtl_df_R = pd.concat((dtl_df_R, _trial_row(trial, objective_rval)))
    dtl_df_sse = pd.concat((dtl_df_sse, _trial_row(trial, objective_sse)))

In [None]:
# Write the three per-trial result tables to CSV files.
# The output folder is created first: to_csv raises if "results/" is missing,
# which would otherwise happen only after the whole simulation has finished.
import os

os.makedirs("results", exist_ok=True)
dtl_df.to_csv("results/RNN_random.csv", index=False)
dtl_df_R.to_csv("results/RNN_random_rvals.csv", index=False)
dtl_df_sse.to_csv("results/RNN_random_sse.csv", index=False)