In [1]:
# import plotting, io, stats, linalg libraries
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.integrate import odeint, solve_ivp
from scipy.stats import linregress
import scipy.io as sio
import pandas as pd

# import DOE library
from doepy import build
import itertools
from scipy.stats.qmc import Sobol

# set plot parameters
params = {'legend.fontsize': 18,
          'figure.figsize': (16, 12),
          'axes.labelsize': 24,
          'axes.titlesize':24,
          'axes.linewidth':5,
          'xtick.labelsize':20,
          'ytick.labelsize':20}
plt.rcParams.update(params)

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were removed in 3.8; use the new name when present and fall back to
# the legacy name on older installs
style_name = 'seaborn-v0_8-colorblind'
if style_name not in plt.style.available:
    style_name = 'seaborn-colorblind'
plt.style.use(style_name)

# embed TrueType (Type 42) fonts in PDF output
plt.rcParams['pdf.fonttype'] = 42

# seed the legacy global numpy RNG used for parameter draws and measurement noise
np.random.seed(123)



# Define simulation parameters

In [2]:
# number of species in gLV model
numspecies = 6


# compression functions applied to a simulated trajectory
def compressor0(x):
    """Sum over axis 1 (the outputs), giving one total per row."""
    return np.sum(x, 1)


def compressor1(x):
    """Identity: return the trajectory unchanged."""
    return x


# index 0 -> summed signal, index 1 -> full signal
compressors = [compressor0, compressor1]

# number of time points to sample for each dataset
NTs = [16, 4]

# time span to take samples
TF = 16

# number of low fidelity samples
NS_LF = 64

# names of the measured values: s1, s2, ..., s<numspecies>
species_names = [f"s{idx}" for idx in range(1, numspecies + 1)]

# dataset names
data_names = ['sum', 'full']

# std. dev. of Gaussian noise to add to data
noise = 0.005

In [3]:
# Randomly draw the parameters that define the gLV system.
# Interaction coefficients a_ij are normally distributed; the diagonal is forced
# to be strictly negative.
# The statistics below were calculated from the parameter set in
# Venturelli et al, Molecular Systems Biology, 2018.
# Note: these statistics frequently give unbounded growth for pairs of species
# (positive feedback loops). Making mu_aij more negative makes this less frequent.
mu_aii, sigma_aii = -1.5, 0.25
mu_aij, sigma_aij = -0.22, 0.33

# draw order matters for reproducibility: diagonal terms first, then off-diagonal
params_ii = np.random.normal(mu_aii, sigma_aii, numspecies)
params_ij = np.random.normal(mu_aij, sigma_aij, numspecies**2 - numspecies)

# boolean-mask assignment fills the off-diagonal entries in row-major order
A = np.zeros((numspecies, numspecies))
off_diagonal = ~np.eye(numspecies, dtype=bool)
A[off_diagonal] = params_ij
np.fill_diagonal(A, -np.abs(params_ii))

# Normally distributed basal growth rates; negative draws are reflected to positive
mu_r = .36
sigma_r = 0.16
r = np.abs(np.random.normal(mu_r, sigma_r, numspecies))
        
# define gLV ODE model
def runGLV(x, r, A, t_eval):
    """Integrate the generalized Lotka-Volterra model dx/dt = x * (r + A @ x).

    Parameters
    ----------
    x : array_like, shape (n,)
        Initial species abundances at time 0.
    r : array_like, shape (n,)
        Basal growth rates.
    A : array_like, shape (n, n)
        Interaction matrix.
    t_eval : array_like
        Increasing times at which to report the solution. Integration runs
        over (0, t_eval[-1]).

    Returns
    -------
    t : ndarray, shape (len(t_eval),)
        Times at which the solution was evaluated.
    y : ndarray, shape (len(t_eval), n)
        Abundances, one row per time point.

    Raises
    ------
    RuntimeError
        If the integrator reports failure (e.g. the trajectory blows up in
        finite time). Without this check a truncated trajectory would be
        returned and only fail later as a shape mismatch.
    """
    # column vector so that r broadcasts against the (n, k) state array that
    # solve_ivp passes when vectorized=True
    growth = np.reshape(r, (-1, 1))

    def system(t, state, growth_rates, interactions):
        # derivative of the state (gLV equation)
        return state * (growth_rates + interactions @ state)

    # solve system
    soln = solve_ivp(system, (0, t_eval[-1]), x, t_eval=t_eval, args=(growth, A),
                     method='RK45', vectorized=True)
    if not soln.success:
        raise RuntimeError(f"gLV integration failed: {soln.message}")
    return soln.t, soln.y.T

# Low fidelity design matrix

In [4]:
# Alternative (not used): a latin hyper-cube design over [0, 2] per species could
# be built with doepy's build.space_filling_lhs(design_dict, num_samples=NS_LF).

# use Sobol sequence to generate the random part of the design.
# The sampler is seeded explicitly: scipy's QMC engines create their own
# numpy Generator and do not draw from the legacy global np.random state, so
# np.random.seed() alone does not make this design reproducible.
sampler = Sobol(d=numspecies, seed=123)
X = sampler.random(NS_LF)

# enumerate every presence/absence community as a (1, numspecies) row
Xlist = [np.reshape(np.array(i), (1, numspecies)) for i in itertools.product([0, 1], repeat = numspecies)]
# drop the all-zeros community (first row), reverse the order, and append the
# binary communities after the Sobol samples
X = np.concatenate((X, np.array(np.concatenate(Xlist)[1:, :][::-1], float)))

In [5]:
# low fidelity data: first entry of NTs / compressors (summed signal)
NT = NTs[0]
compressor = compressors[0]

# time grid and pre-allocated data matrix
# (column 0: compressed signal, remaining columns: individual species)
N_samples = X.shape[0]
tspan = (0, TF)
teval = np.linspace(0, tspan[-1], NT)
n_times = len(teval)
D = np.zeros([N_samples * n_times, 1 + numspecies])

for idx, community in enumerate(X):
    # simulate from the scaled initial condition
    IC = community * .1
    t, y = runGLV(IC, r, A, teval)

    # Gaussian noise on every time point except the initial condition
    y[1:] += noise * np.random.randn(y.shape[0] - 1, y.shape[1])

    # compress the noisy trajectory
    c_y = compressor(y)

    # store compressed and full trajectories in this experiment's rows
    rows = slice(idx * n_times, (idx + 1) * n_times)
    D[rows, 0] = c_y
    D[rows, 1:] = y

# time column: the evaluation grid repeated once per experiment
time = list(teval) * N_samples

# treatment label per row; communities with exactly one species present are
# tagged as monocultures
all_treatments = []
for idx, community in enumerate(X):
    is_mono = np.count_nonzero(community > 0) == 1
    exp_name = f"mono_exp_{idx+1}" if is_mono else f"exp_{idx+1}"
    all_treatments.extend([exp_name] * n_times)

# determine names of unique treatments
unique_treatments = np.unique(all_treatments)

# assemble dataframe: labels, time, summed signal ('OD'), then one column per species
df = pd.DataFrame({'Treatments': all_treatments, 'Time': time, 'OD': D[:, 0]})
for col, name in enumerate(species_names, start=1):
    df[name] = D[:, col]

# save
df.to_csv("Simulated_gLV_data_sum.csv", index=False)

In [6]:
# display the low fidelity dataframe that was just written to CSV
df

Unnamed: 0,Treatments,Time,OD,s1,s2,s3,s4,s5,s6
0,exp_1,0.000000,0.333630,0.048940,0.084298,0.069792,0.069608,0.000660,0.060331
1,exp_1,1.066667,0.413867,0.054126,0.118841,0.087627,0.086900,0.012687,0.053687
2,exp_1,2.133333,0.475434,0.066119,0.163460,0.099013,0.098643,0.009487,0.038713
3,exp_1,3.200000,0.567411,0.063799,0.195641,0.132898,0.127369,0.008261,0.039443
4,exp_1,4.266667,0.607873,0.059446,0.231095,0.155406,0.123725,0.007853,0.030347
...,...,...,...,...,...,...,...,...,...
2027,mono_exp_127,11.733333,0.089289,-0.004891,-0.004472,-0.001478,0.002989,0.005576,0.091565
2028,mono_exp_127,12.800000,0.078395,-0.000673,-0.005390,0.000435,-0.007088,0.004728,0.086382
2029,mono_exp_127,13.866667,0.078881,0.000489,-0.009140,-0.004241,0.003246,0.001666,0.086860
2030,mono_exp_127,14.933333,0.077371,-0.005571,0.005544,0.001655,-0.000371,-0.003725,0.079840


# High fidelity data matrix

In [7]:
# high fidelity design: every presence/absence combination of the species
# (each combination stored as a (1, numspecies) row)
Xlist = [np.array(combo).reshape(1, numspecies)
         for combo in itertools.product([0, 1], repeat=numspecies)]
# stack the rows, drop the all-zeros community (first row) and reverse the order
all_communities = np.concatenate(Xlist)
X = np.array(all_communities[1:, :][::-1], dtype=float)

In [8]:
# high fidelity data: second entry of NTs / compressors (full signal)
NT = NTs[1]
compressor = compressors[1]

# time grid and pre-allocated data matrix (one column per species)
N_samples = X.shape[0]
tspan = (0, TF)
teval = np.linspace(0, tspan[-1], NT)
n_times = len(teval)
D = np.zeros([N_samples * n_times, numspecies])

for idx, community in enumerate(X):
    # simulate from the scaled initial condition
    IC = community * .1
    t, y = runGLV(IC, r, A, teval)

    # Gaussian noise on every time point except the initial condition
    y[1:] += noise * np.random.randn(y.shape[0] - 1, y.shape[1])

    # store the noisy trajectory in this experiment's rows
    D[idx * n_times:(idx + 1) * n_times] = y

# time column: the evaluation grid repeated once per experiment
time = list(teval) * N_samples

# treatment label per row; communities with exactly one species present are
# tagged as monocultures
all_treatments = []
for idx, community in enumerate(X):
    is_mono = np.count_nonzero(community > 0) == 1
    exp_name = f"mono_exp_{idx+1}" if is_mono else f"exp_{idx+1}"
    all_treatments.extend([exp_name] * n_times)

# determine names of unique treatments
unique_treatments = np.unique(all_treatments)

# assemble dataframe: labels, time, then one column per species
# (the noise was already added inside the simulation loop)
df = pd.DataFrame({'Treatments': all_treatments, 'Time': time})
for col, name in enumerate(species_names):
    df[name] = D[:, col]

# save
df.to_csv("Simulated_gLV_data_full.csv", index=False)

In [9]:
# display the high fidelity dataframe that was just written to CSV
df

Unnamed: 0,Treatments,Time,s1,s2,s3,s4,s5,s6
0,exp_1,0.000000,0.100000,0.100000,0.100000,0.100000,0.100000,0.100000
1,exp_1,5.333333,0.070143,0.251241,0.219458,0.148464,0.048577,0.022528
2,exp_1,10.666667,0.033895,0.352250,0.274270,0.156035,0.024668,0.001089
3,exp_1,16.000000,0.016436,0.384658,0.300375,0.141680,0.011583,0.009051
4,exp_2,0.000000,0.100000,0.100000,0.100000,0.100000,0.100000,0.000000
...,...,...,...,...,...,...,...,...
247,mono_exp_62,16.000000,-0.003499,-0.001338,-0.013623,-0.007559,0.136129,-0.002683
248,mono_exp_63,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.100000
249,mono_exp_63,5.333333,-0.001771,0.003506,0.002205,-0.007235,0.000837,0.087807
250,mono_exp_63,10.666667,0.000661,0.001382,0.002492,0.002918,0.002202,0.089749
