In [1]:
import pickle
import numpy as np
from scipy.special import logsumexp

In [2]:
def simulate_clv(ndays, A, B, g, N):
    """Simulate one time series under compositional Lotka-Volterra (cLV).

    Let p = (p_1, ..., p_D) be the relative proportions of D taxa
    (species) and x = alr(p) the additive log-ratio of p, taken with the
    last taxon as reference. x is in R^{D-1} and p is in S^D.

    The state space model is:
        x_0     ~ Normal(0, I)
        x_{t+1} ~ Normal(x_t + g + A p_t + B v_t, 0.1 * I)

    where v_t is an external input drawn independently per dimension:
        v_t ~ Normal(0, 0.1^2)

    The observation model is:
        y_t ~ Multinomial(C_t, p_t = alr^{-1}(x_t))

    The count parameter C_t is drawn as Poisson(exp(Normal(log N, 0.5^2)))
    to simulate the varying sequencing depths observed across real
    samples. C_t is clamped to at least one read so that the observed
    proportions are always well defined (a zero-read draw would otherwise
    give 0/0 = NaN proportions).

    All randomness comes from the global ``np.random`` state.

    Parameters
    ----------
        ndays : number of days (time points) to simulate
        A     : interaction matrix, shape (D-1, D)
        B     : input-effect matrix, shape (D-1, number of inputs)
        g     : growth rate vector, length D-1
        N     : typical sequencing depth (scale of reads per sample)

    Returns
    -------
        x            : list of ndays latent states, each of length D-1
        y_count      : list of ndays read-count vectors (float), each of length D
        y_percentage : list of ndays observed proportion vectors
                       (y_count normalised to sum to 1), each of length D
        v            : list of ndays input vectors, each of length B.shape[1]

    """
    latent_dim = A.shape[0]
    input_dim = B.shape[1]

    x = []
    y_count = []
    y_percentage = []
    v = []

    mu = np.random.multivariate_normal(mean=np.zeros(latent_dim), cov=np.eye(latent_dim))
    for _ in range(ndays):
        xt = mu

        # inverse alr: append the reference coordinate (0) and apply a
        # numerically stable softmax
        xt1 = np.concatenate((xt, np.array([0])))
        pt = np.exp(xt1 - logsumexp(xt1))

        # simulate total number of reads with over-dispersion
        logN = np.random.normal(loc=np.log(N), scale=0.5)
        # at least one read, otherwise the proportions below are 0/0
        Nt = max(np.random.poisson(np.exp(logN)), 1)
        yt_count = np.random.multinomial(Nt, pt).astype(float)
        yt_percentage = yt_count / np.sum(yt_count)

        # external input observed on this day
        vt = np.random.normal(loc=0, scale=0.1, size=input_dim)

        x.append(xt)
        y_count.append(yt_count)
        y_percentage.append(yt_percentage)
        v.append(vt)

        # latent state for the next day
        transition_noise = np.random.multivariate_normal(mean=np.zeros(latent_dim), cov=0.1 * np.eye(latent_dim))
        mu = xt + g + A.dot(pt) + B.dot(vt) + transition_noise
    return x, y_count, y_percentage, v

In [3]:
# Seed the global NumPy RNG so the ground-truth parameters below (and all
# data simulated afterwards from the same global state) are reproducible
# under Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

ntaxa = 11   # number of taxa D; A has D-1 rows and D columns
ninput = 15  # number of external input dimensions (columns of B)
ndays = 50   # number of days simulated per series
n_train, n_test = 200, 40  # number of simulated series in each split

# Ground-truth dynamics parameters shared by every simulated series.
A = np.random.normal(loc=0,scale=0.2,size=(ntaxa-1, ntaxa))   # interaction matrix
B = np.random.normal(loc=0,scale=0.2,size=(ntaxa-1, ninput))  # input-effect matrix
g = np.random.normal(loc=0,scale=0.1,size=ntaxa-1)            # growth rates
N = 10000 # sequencing reads parameter

In [7]:
# Simulate n_train + n_test independent series with the shared parameters.
# Observations (counts and proportions) and inputs get the day index
# prepended as their first column; latent states are stored as-is.
days = np.arange(ndays)[:, np.newaxis]

x_all, y_count_all, y_percentage_all, v_all = [], [], [], []
for i in range(n_train + n_test):
    x, y_count, y_percentage, v = map(np.asarray, simulate_clv(ndays, A, B, g, N))
    y_count = np.concatenate((days, y_count), axis=1)
    y_percentage = np.concatenate((days, y_percentage), axis=1)
    v = np.concatenate((days, v), axis=1)

    x_all.append(x)
    y_count_all.append(y_count)
    y_percentage_all.append(y_percentage)
    v_all.append(v)

# The first n_train series form the training split, the rest the test split.
x_train, x_test = x_all[:n_train], x_all[n_train:]
y_count_train, y_count_test = y_count_all[:n_train], y_count_all[n_train:]
y_percentage_train, y_percentage_test = y_percentage_all[:n_train], y_percentage_all[n_train:]
v_train, v_test = v_all[:n_train], v_all[n_train:]

# Two dataset variants that differ only in the observation type stored in Y.
data_count_obs = {
    "Xtrain": x_train, "Xtest": x_test,
    "Ytrain": y_count_train, "Ytest": y_count_test,
    "Vtrain": v_train, "Vtest": v_test,
    "A": A, "B": B, "g": g, "N": N,
}
data_percentage_obs = {
    "Xtrain": x_train, "Xtest": x_test,
    "Ytrain": y_percentage_train, "Ytest": y_percentage_test,
    "Vtrain": v_train, "Vtest": v_test,
    "A": A, "B": B, "g": g, "N": N,
}

In [8]:
# Persist the fully observed count and proportion datasets.
for path, payload in (("clv_count.p", data_count_obs),
                      ("clv_percentage.p", data_percentage_obs)):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

In [10]:
T = ndays


def _subsample_observed_days(count_series, percentage_series, n_days, n_observed):
    """Keep a random subset of days for every series.

    For each (count, proportion) pair, ``n_observed`` distinct day indices
    are drawn from ``range(n_days)`` without replacement, sorted, and used
    to select the same rows from both arrays. Days are drawn independently
    for each series from the global ``np.random`` state.

    Returns the subsampled counts and proportions, each stacked into one
    array with the series along the first axis.
    """
    counts_kept, percentages_kept = [], []
    for series_count, series_percentage in zip(count_series, percentage_series):
        kept_days = np.sort(np.random.choice(np.arange(n_days), n_observed, replace=False))
        counts_kept.append(series_count[kept_days])
        percentages_kept.append(series_percentage[kept_days])
    return np.stack(counts_kept), np.stack(percentages_kept)


# Write one pair of datasets per observation rate. Only the observations Y
# are subsampled; latent states X and inputs V are stored for all days.
for obs_percentage in [0.8, 0.6, 0.5, 0.4]:
    n_observed = int(T * obs_percentage)

    y_count_train_tmp, y_percentage_train_tmp = _subsample_observed_days(
        y_count_train, y_percentage_train, T, n_observed)
    y_count_test_tmp, y_percentage_test_tmp = _subsample_observed_days(
        y_count_test, y_percentage_test, T, n_observed)

    data_count_obs = {"Xtrain": x_train, "Xtest": x_test,
                      "Ytrain": y_count_train_tmp, "Ytest": y_count_test_tmp,
                      "Vtrain": v_train, "Vtest": v_test,
                      "A": A, "B": B, "g": g, "N": N}
    # The original referenced misspelled names (y_percetage_*_tmp) here,
    # which raised NameError on a fresh kernel.
    data_percentage_obs = {"Xtrain": x_train, "Xtest": x_test,
                           "Ytrain": y_percentage_train_tmp, "Ytest": y_percentage_test_tmp,
                           "Vtrain": v_train, "Vtest": v_test,
                           "A": A, "B": B, "g": g, "N": N}

    with open("clv_count_{}_obs.p".format(obs_percentage), "wb") as f:
        pickle.dump(data_count_obs, f)
    with open("clv_percentage_{}_obs.p".format(obs_percentage), "wb") as f:
        pickle.dump(data_percentage_obs, f)