In [1]:
import numpy as np
import itertools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from scipy.stats import linregress

from sklearn.model_selection import KFold, LeaveOneOut

# from autode.autode import *
from bamf.bamf import *

import time

np.random.seed(12345)

# Define number of K-Folds 

In [2]:
# Number of cross-validation folds; consumed by KFold(n_splits=K) in the fitting cell.
# NOTE(review): the original hint was "comment out if using Leave One Out", but the
# fitting cell always builds KFold(n_splits=K), so K must stay defined unless that
# cell is switched to LeaveOneOut as well.
K = 12

# Import sequenced data

## REU 3 community data

In [3]:
# REU03 community composition at the initial (t0) and final (tf) time points
t0_data = pd.read_csv("DTL0/REU03_table_t0_20220811.csv")
tf_data = pd.read_csv("DTL0/REU03_table_tf_20220811.csv")

# column groups: experiment metadata, control inputs, per-species abundances
exp_info = ['Treatments', 'Rep', 'Time']
inputs = ['Inulin', 'Starch', 'Pectin', 'ArGal', 'Gum', 'AmAc', 'pH']
species = ['BAabs', 'BPabs', 'BTabs', 'BUabs', 'PCabs', 'PJabs',
       'ACabs', 'CGabs', 'CHabs', 'FPabs', 'ERabs', 'BHabs', 'RIabs',
       'CSabs', 'EHabs']

# stack the t0 and tf tables, then split by replicate with identical row ordering
reps_data = pd.concat((t0_data[exp_info+inputs+species], tf_data[exp_info+inputs+species]))
rep1_data = reps_data.iloc[reps_data['Rep'].values == 1].sort_values(by=['Treatments', 'Time'])
rep2_data = reps_data.iloc[reps_data['Rep'].values == 2].sort_values(by=['Treatments', 'Time'])

# The average below pairs rows purely by position, so both replicates must list
# the same treatments in the same order; fail loudly rather than average
# abundances that belong to different treatments.
assert np.array_equal(rep1_data['Treatments'].values, rep2_data['Treatments'].values), \
    "REU03 replicate 1 and replicate 2 rows are not aligned by treatment"

# average replicates (metadata, inputs and Time are taken from replicate 1)
avg_data_3 = rep1_data.drop(['Rep'], axis=1)
avg_data_3[species] = (rep1_data[species].values + rep2_data[species].values)/2.

# Abundances are left unnormalized: max_od is the scale factor used by the
# prediction and compression code further down and is fixed to 1 here.
max_od = 1.

# boolean mask over the columns of avg_data_3 marking the species columns
species_inds = np.isin(avg_data_3.columns.values, species)

avg_data_3.describe()

Unnamed: 0,Time,Inulin,Starch,Pectin,ArGal,Gum,AmAc,pH,BAabs,BPabs,...,PJabs,ACabs,CGabs,CHabs,FPabs,ERabs,BHabs,RIabs,CSabs,EHabs
count,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,...,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0
mean,12.904468,0.199856,0.20005,0.221409,0.189319,0.189365,0.521277,0.521277,0.008277,0.031115,...,0.00299,0.003367,0.002559,0.003708,0.000429,0.000488,0.000559,0.000444,0.00361,0.000353
std,13.252697,0.262279,0.262375,0.285047,0.249499,0.249526,0.496844,0.496844,0.018884,0.052038,...,0.004039,0.0068,0.00395,0.005937,0.000263,0.000492,0.000847,0.000243,0.009219,0.000323
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000123,0.000667,...,0.000624,1.9e-05,6.6e-05,6.4e-05,8e-06,2.3e-05,1.6e-05,2.9e-05,0.000115,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000667,0.000667,...,0.000667,0.000667,0.000667,0.000667,0.000156,0.000176,0.000121,0.0002,0.000585,2.1e-05
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.000667,0.000667,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
75%,26.37,0.333455,0.334476,0.334147,0.333349,0.333326,1.0,1.0,0.004558,0.0461,...,0.003589,0.002087,0.002399,0.004621,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
max,26.37,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.110081,0.241349,...,0.023777,0.04467,0.015978,0.027096,0.000667,0.004437,0.007572,0.000667,0.050191,0.000667


## REU 4 community data

In [4]:
# REU04 community composition at the initial (t0) and final (tf) time points
t0_data = pd.read_csv("DTL0/REU04_table_t0_20220811.csv")
tf_data = pd.read_csv("DTL0/REU04_table_tf_20220811.csv")

# column groups: experiment metadata, control inputs, per-species abundances
exp_info = ['Treatments', 'Rep', 'Time']
inputs = ['Inulin', 'Starch', 'Pectin', 'ArGal', 'Gum', 'AmAc', 'pH']
species = ['BAabs', 'BPabs', 'BTabs', 'BUabs', 'PCabs', 'PJabs',
       'ACabs', 'CGabs', 'CHabs', 'FPabs', 'ERabs', 'BHabs', 'RIabs',
       'CSabs', 'EHabs']

# stack the t0 and tf tables, then split by replicate with identical row ordering
reps_data = pd.concat((t0_data[exp_info+inputs+species], tf_data[exp_info+inputs+species]))
rep1_data = reps_data.iloc[reps_data['Rep'].values == 1].sort_values(by=['Treatments', 'Time'])
rep2_data = reps_data.iloc[reps_data['Rep'].values == 2].sort_values(by=['Treatments', 'Time'])

# The average below pairs rows purely by position, so both replicates must list
# the same treatments in the same order; fail loudly rather than average
# abundances that belong to different treatments.
assert np.array_equal(rep1_data['Treatments'].values, rep2_data['Treatments'].values), \
    "REU04 replicate 1 and replicate 2 rows are not aligned by treatment"

# average replicates (metadata, inputs and Time are taken from replicate 1)
# NOTE(review): the rendered output of the concatenated table shows negative
# final-time abundances for some REU04 treatments; they are kept as-is here.
# Confirm that is intended.
avg_data_4 = rep1_data.drop(['Rep'], axis=1)
avg_data_4[species] = (rep1_data[species].values + rep2_data[species].values)/2.

# Abundances are left unnormalized: max_od is the scale factor used by the
# prediction and compression code further down and is fixed to 1 here.
max_od = 1.

# boolean mask over the columns of avg_data_4 marking the species columns
species_inds = np.isin(avg_data_4.columns.values, species)

## Concatenate REU3 and REU4 data

In [5]:
# Stack the REU03 rows on top of the REU04 rows.
# Row labels of the two source frames are kept, so labels may repeat.
avg_data = pd.concat([avg_data_3, avg_data_4], axis=0)
avg_data

Unnamed: 0,Treatments,Time,Inulin,Starch,Pectin,ArGal,Gum,AmAc,pH,BAabs,...,PJabs,ACabs,CGabs,CHabs,FPabs,ERabs,BHabs,RIabs,CSabs,EHabs
0,REU03_1,0.000000,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.000667,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
0,REU03_1,26.370000,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.034408,...,0.002912,0.011221,0.002455,0.000894,0.000135,0.000111,0.000089,0.000265,0.000321,0.000014
9,REU03_10,0.000000,0.0,0.5,0.0,0.5,0.0,1.0,0.0,0.000667,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
8,REU03_10,26.370000,0.0,0.5,0.0,0.5,0.0,1.0,0.0,0.001794,...,0.006062,0.000493,0.000515,0.012082,0.000130,0.000177,0.000071,0.000302,0.000264,0.000011
10,REU03_11,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000667,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,REU04_ER,24.004444,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.008392,0.000000,0.000000,0.000000,0.000000
30,REU04_ER-FP,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.002000,0.002000,0.000000,0.000000,0.000000,0.000000
30,REU04_ER-FP,24.004444,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,-0.001419,-0.009376,0.000000,0.000000,0.000000,0.000000
4,REU04_FP,0.000000,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.002000,0.000000,0.000000,0.000000,0.000000,0.000000


## REU 3 sum-of-OD data

In [6]:
# REU03 total-OD time series
sum_data_3 = pd.read_csv("DTL0/REU03_table_timeSeriesData_20220811.csv")

# keep experiment metadata, inputs and the OD reading, then split by replicate
od_columns = exp_info + inputs + ['OD']
reps_data = sum_data_3[od_columns].copy()
rep_ids = reps_data['Rep'].values
rep1_data = reps_data.iloc[rep_ids == 1].sort_values(by=['Treatments', 'Time'])
rep2_data = reps_data.iloc[rep_ids == 2].sort_values(by=['Treatments', 'Time'])

# NOTE(review): despite the name, no averaging happens here. Only replicate 1
# is carried forward and rep2_data is left unused. Confirm that is intended.
avg_sum_data_3 = rep1_data.drop(columns=['Rep'])
avg_sum_data_3.describe()

Unnamed: 0,Time,Inulin,Starch,Pectin,ArGal,Gum,AmAc,pH,OD
count,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0,672.0
mean,13.350218,0.195693,0.195882,0.216797,0.195792,0.195836,0.510417,0.510417,0.138112
std,8.113401,0.259915,0.260012,0.282544,0.259966,0.25999,0.495023,0.495023,0.124472
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.001833
25%,6.375556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038898
50%,13.376944,0.0,0.0,0.0,0.0,0.0,0.75,0.75,0.098247
75%,20.378611,0.333369,0.334456,0.333762,0.333404,0.333616,1.0,1.0,0.204613
max,26.379167,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.547167


In [7]:
# Attach an initial species composition to every REU03 OD row.
# The values come from the first Time == 0 row of avg_data and are broadcast
# to all rows.
# NOTE(review): this presumes every REU03 treatment starts from the same
# inoculum. Confirm against the t0 table.
t0_inds = avg_data['Time'].values == 0.
first_t0_abundances = avg_data.iloc[t0_inds, species_inds].values[0]
avg_sum_data_3[species] = first_t0_abundances
avg_sum_data_3.head()

Unnamed: 0,Treatments,Time,Inulin,Starch,Pectin,ArGal,Gum,AmAc,pH,OD,...,PJabs,ACabs,CGabs,CHabs,FPabs,ERabs,BHabs,RIabs,CSabs,EHabs
0,REU03_1,0.0,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.01,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
1,REU03_1,2.375,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.042733,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
2,REU03_1,4.375278,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.04798,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
3,REU03_1,6.375556,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.055872,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667
4,REU03_1,8.375833,0.5,0.0,0.0,0.0,0.5,0.0,1.0,0.070998,...,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667,0.000667


## REU 4 sum-of-OD data

In [None]:
# REU04 total-OD time series
sum_data_4 = pd.read_csv("DTL0/REU04_table_timeSeriesData_20220811.csv")

# keep experiment metadata, inputs and the OD reading, then split by replicate
reps_data = sum_data_4[exp_info+inputs+['OD']].copy()
rep1_data = reps_data.iloc[reps_data['Rep'].values == 1].sort_values(by=['Treatments', 'Time'])
rep2_data = reps_data.iloc[reps_data['Rep'].values == 2].sort_values(by=['Treatments', 'Time'])

# NOTE(review): despite the name, no averaging happens here. Only replicate 1
# is carried forward and rep2_data is left unused. Confirm that is intended.
avg_sum_data_4 = rep1_data.drop(['Rep'], axis=1)

# Look up the initial species composition of each treatment in the community
# table and attach it to every OD row of that treatment. Rows are addressed
# through a boolean mask, so the result does not depend on how the OD table
# happens to be ordered.
initial_conditions = np.zeros([avg_sum_data_4.shape[0], len(species)])
all_treatments = avg_sum_data_4['Treatments'].values
unique_treatments = np.unique(all_treatments)
community_treatments = avg_data['Treatments'].values
for treatment in unique_treatments:
    od_rows = np.isin(all_treatments, treatment)
    comm_rows = np.isin(community_treatments, treatment)
    if not comm_rows.any():
        # same exception type the empty lookup produced before, with context
        raise IndexError(
            "treatment {!r} has OD data but no community data".format(treatment))

    # initial condition = composition at the earliest time point of the treatment
    comm_data = avg_data.iloc[comm_rows]
    initial_condition = comm_data[species].values[np.argmin(comm_data['Time'].values)]

    initial_conditions[od_rows] = initial_condition

# store initial conditions in dataframe
avg_sum_data_4[species] = initial_conditions

avg_sum_data_4.head()

## Concatenate sum of OD data

In [None]:
# Combine the REU03 and REU04 total-OD tables into a single frame
avg_sum_data = pd.concat([avg_sum_data_3, avg_sum_data_4], axis=0)
avg_sum_data.head()

# Define function to make predictions on test data

In [None]:
# Define function to make predictions on test data

def test_model(model, df_test, max_od, species, plot=False):
    all_treatments = df_test.Treatments.values
    unique_treatments = np.unique(all_treatments)
    numspecies = len(species)

    # save true and predicted values
    y_true = []
    y_pred = []
    y_std  = []
    test_treatments = []
    test_times = []
    all_species_names = []

    # pull a random community trajectory
    for treatment in unique_treatments:
        comm_inds = np.in1d(df_test['Treatments'].values, treatment)
        comm_data = df_test.iloc[comm_inds].copy()

        # make sure comm_data is sorted in chronological order
        comm_data.sort_values(by='Time', ascending=True, inplace=True)
        tspan = comm_data.Time.values

        # pull just the community data
        output_true = comm_data[species].values

        # run model using parameters
        x_test = np.copy(output_true[0, :])
        
        # control parameters 
        ctrl_params = comm_data[inputs].values #[0]

        # test full community
        output, stdv, COV = model.predict(x_test, tspan, ctrl_params=ctrl_params)
        
        # un-normalize
        output_true *= max_od
        output *= max_od
        stdv   *= max_od

        # save predictions after initial value 
        for i, (true, pred, std) in enumerate(zip(output_true[1:], output[1:], stdv[1:])):
            y_true += list(true)
            y_pred += list(pred)
            y_std  += list(std)
            test_times += [tspan[i+1]]*numspecies
            all_species_names += list(species)
            test_treatments += [treatment]*numspecies

        if plot:
            # increase teval
            t_eval = np.linspace(0, tspan[-1]+5)
            steps = len(t_eval)
            output, stdv, COV = model.predict(x_test, t_eval, ctrl_params=ctrl_params)   
            
            # un-normalize
            output *= max_od
            stdv   *= max_od

            # plot the results
            plt.figure(figsize=(9, 6))
            ylim = 0
            for i in range(numspecies):
                out = output[:,i]
                out_true = output_true[:, i]
                std = stdv[:, i]
                if ylim < np.max([np.max(out) + np.max(std)+.1, np.max(out_true)+.1]):
                    ylim = np.max([np.max(out) + np.max(std)+.1, np.max(out_true)+.1])
                if out[0] > 0:
                    plt.scatter(tspan, out_true, color='C{}'.format(i))
                    plt.plot(t_eval, out, label="Predicted species " + str(i+1), color='C{}'.format(i))
                    plt.fill_between(t_eval, out-std, out+std, color='C{}'.format(i), alpha=0.2)

            plt.xlabel("time", fontsize=16)
            plt.ylabel("Abundance", fontsize=16)
            plt.legend(loc='upper left')
            plt.ylim([0, np.min([ylim, 3])])
            plt.title(f"Treatment {treatment} predictions")
            #plt.savefig("Kfold/Figures/{}_{}.pdf".format(dataset.replace("_",""), treatment.replace("<","")))
            #plt.close()
            plt.show()

    return test_treatments, test_times, all_species_names, y_true, y_pred, y_std

# Initialize model parameters

In [None]:
# --- problem sizes ---
ns = len(species)   # number of species (state variables)
nu = len(inputs)    # number of control inputs
nx = ns + nu        # network input size: state stacked with controls

# width of the hidden layer
nh = 5

# input -> hidden weights and bias, drawn from U(0, 1/sqrt(nh*nx))
stdv = 1./np.sqrt(nh*nx)
A = np.random.uniform(low=0, high=stdv, size=(nh, nx))
a = np.random.uniform(low=0, high=stdv, size=nh)

# hidden -> species weights, drawn from U(-1/sqrt(ns*nh), 0)
stdv = 1./np.sqrt(ns*nh)
B = np.random.uniform(low=-stdv, high=0, size=(ns, nh))

# growth-rate offsets, drawn from U(0, 1/sqrt(ns*nh))
b = np.random.uniform(low=0, high=stdv, size=ns)

# carrying-capacity coefficients: reciprocal of each species' largest abundance
t0_inds = avg_data.Time.values == 0.
c = 1./avg_data[species].values.max(axis=0)

# flat parameter vector in the order [A, a, B, b, c]; the prior is zero
# everywhere except the trailing carrying-capacity entries
params = np.concatenate([A.ravel(), a, B.ravel(), b, c])
prior = np.zeros_like(params)
prior[-ns:] = c

n_params = params.size
n_params

# Define model

In [None]:
# using NODE model 
def system(t, s, params, ctrl_params):
    """Right-hand side of the species ODE.

    ``params`` is a flat vector laid out as [A, a, B, b, c] with sizes taken
    from the notebook-level ``nh``, ``nx`` and ``ns``. ``t`` is accepted but
    not used. Only the first row of ``ctrl_params`` enters the computation.
    Returns ``s * (B @ tanh(A @ [s, u] + a) + b) * (1 - c * s)``.
    """
    # end offset of each parameter group inside the flat vector
    end_A = nh*nx
    end_a = end_A + nh
    end_B = end_a + nh*ns
    end_b = end_B + ns
    end_c = end_b + ns

    # unpack: input->hidden map, hidden->species map, carrying-capacity term
    A = jnp.reshape(params[:end_A], [nh, nx])
    a = params[end_A:end_a]
    B = jnp.reshape(params[end_a:end_B], [ns, nh])
    b = params[end_B:end_b]
    c = params[end_b:end_c]

    # network input: species state followed by the control inputs
    x = jnp.concatenate((s, ctrl_params[0]))

    # hidden activation and the resulting per-species rate
    h = jnp.tanh(A@x + a)
    rate = B@h + b

    return s * rate * (1. - c*s)

# Define compression functions

In [None]:
# define compression functions 
def compressor0(x):
    """Sum ``x*max_od`` over all entries and return it with a leading axis of length 1."""
    total = jnp.sum(x*max_od)
    return jnp.expand_dims(total, 0)


def compressor1(x):
    """Return ``x`` unchanged."""
    return x


# same order as before: summed output first, identity second
compressors = [compressor0, compressor1]

## Define and fit the ODE model with K-fold cross-validation (system arguments: time, state, parameters, control parameters)

In [None]:
# treatment label of every row in the OD table and in the community table
all_sum_treatments = avg_sum_data['Treatments'].values
all_treatments = avg_data['Treatments'].values
unique_treatments = np.unique(all_treatments)

# K-fold splitter over the unique community treatments
kf = KFold(n_splits=K)

# held-out measurements and predictions pooled over all folds
kfold_species_names, kfold_y_true, kfold_y_pred, kfold_y_stdv = [], [], [], []

for train_index, test_index in kf.split(unique_treatments):
    # treatment names assigned to each side of this fold
    train_names = unique_treatments[train_index]
    test_names = unique_treatments[test_index]

    # row masks: training rows of both tables, test rows of the community table
    train_inds_sum = np.isin(all_sum_treatments, train_names)
    train_inds = np.isin(all_treatments, train_names)
    test_inds = np.isin(all_treatments, test_names)

    df_train_sum = avg_sum_data.iloc[train_inds_sum].copy()
    df_train = avg_data.iloc[train_inds].copy()
    df_test = avg_data.iloc[test_inds].copy()

    # build the ODE model on two data sources: total OD and per-species abundances
    # (measured_vars and compressors are listed in the same order as dataframes)
    model = ODE(
        system=system,
        dataframes=[df_train_sum, df_train],
        compressors=compressors,
        params=params,
        prior=prior,
        sys_vars=species,
        measured_vars=[['OD'], species],
        controls=inputs,
        verbose=True,
    )

    # fit and report wall-clock time
    t0 = time.time()
    model.fit(evidence_tol=1e-3, beta_tol=1e-4)
    print("Elapsed time {:.2f}s".format(time.time()-t0))

    # predict the held-out treatments and pool the results
    fold_results = test_model(model, df_test, max_od, species, plot=False)
    test_treatments, test_times, all_species_names, y_true, y_pred, y_std = fold_results
    kfold_species_names.extend(all_species_names)
    kfold_y_true.extend(y_true)
    kfold_y_pred.extend(y_pred)
    kfold_y_stdv.extend(y_std)

In [None]:
# Run the model left over from the last cross-validation fold on every
# treatment in avg_data and draw one trajectory figure per treatment.
(test_treatments, test_times, all_species_names,
 y_true, y_pred, y_std) = test_model(model, avg_data, max_od, species, plot=True)

In [None]:
# Measured vs. predicted abundance per species, using the y_true / y_pred
# lists returned by the preceding test_model call on avg_data.
plt.figure(figsize=(9,8))
r_vals = []
measured_all = np.array(y_true)
predicted_all = np.array(y_pred)
for s in species:
    # entries that belong to this species
    y_inds = np.isin(all_species_names, s)
    y_s_true = measured_all[y_inds]
    y_s_pred = predicted_all[y_inds]

    # correlation between measured and predicted values
    r = linregress(y_s_true, y_s_pred).rvalue
    r_vals.append(r)
    plt.scatter(y_s_true, y_s_pred, label=s.replace("abs","")+" R={:.2f}".format(r))

plt.xlabel("Measured", fontsize=18)
plt.ylabel("Predicted", fontsize=18)
plt.legend()

plt.savefig("Results/node_fit_mf.pdf", dpi=200)
plt.show()

In [None]:
# Measured vs. predicted abundance per species for the held-out predictions
# pooled over all cross-validation folds.
plt.figure(figsize=(9,8))
r_vals = []
kfold_measured = np.array(kfold_y_true)
kfold_predicted = np.array(kfold_y_pred)
for s in species:
    # entries that belong to this species
    y_inds = np.isin(kfold_species_names, s)
    y_s_true = kfold_measured[y_inds]
    y_s_pred = kfold_predicted[y_inds]

    # correlation between measured and predicted values
    r = linregress(y_s_true, y_s_pred).rvalue
    r_vals.append(r)
    plt.scatter(y_s_true, y_s_pred, label=s.replace("abs","")+" R={:.2f}".format(r))

plt.legend()
plt.show()

# median correlation across species
print(np.median(r_vals))

In [None]:
# Per-species correlation coefficients (linregress rvalue) collected by the
# K-fold scatter-plot cell above, one entry per name in `species`.
r_vals