In [1]:
import os
import pickle

import numpy as np
from scipy.special import logsumexp

In [2]:
def simulate_clv(ndays, A, g, N):
    """Simulate one trajectory under compositional Lotka-Volterra (cLV).

        Let p = (p_1, ..., p_D) be the relative proportions
        of D taxa (species).

        Let x = alr(p), the additive log-ratio of p with the last
        taxon as reference. Note x is in R^{D-1} and p is in S^D.

        The initial state is drawn as:
            x_0 ~ Normal(0, I)

        The latent transition implemented here is deterministic
        (no process noise is added):
            x_t = x_{t-1} + g + A p_{t-1}

        The observation model is:
            C_t ~ Poisson(exp(Normal(log N, 0.5)))
            y_t ~ Multinomial(C_t, p_t = alr^{-1}(x_t))

        The count parameter C_t is chosen to simulate the
        varying sequencing depths observed across real samples.

        All draws use the global ``np.random`` state; seed it
        beforehand for reproducible output.

    Parameters
    ----------
        ndays : number of days (time points) to simulate
        A     : interaction matrix in R^{D-1 x D}
        g     : growth rate vector in R^{D-1}
        N     : sequencing depth parameter; log(N) is the mean of the
                log-normal rate used to draw each day's read total

    Returns
    -------
        x : list of ndays arrays of shape (D-1,), the latent states
        y : list of ndays float arrays of shape (D,), the observed
            sequencing counts

    """
    latent_dim = A.shape[0]
    x = []
    y = []

    # Initial latent state; afterwards `mu` carries the next state forward.
    mu = np.random.multivariate_normal(mean=np.zeros(latent_dim), cov=np.eye(latent_dim))
    for _ in range(ndays):
        xt = mu

        # Inverse alr: append the reference coordinate (0) and apply a
        # numerically stable softmax to obtain proportions on the simplex.
        xt1 = np.concatenate((xt, np.array([0])))
        pt = np.exp(xt1 - logsumexp(xt1))

        # Simulate the total number of reads with over-dispersion:
        # a Poisson draw whose rate is itself log-normally distributed.
        logN = np.random.normal(loc=np.log(N), scale=0.5)
        Nt = np.random.poisson(np.exp(logN))
        yt = np.random.multinomial(Nt, pt).astype(float)

        x.append(xt)
        y.append(yt)

        # cLV update in alr coordinates.
        mu = xt + g + A.dot(pt)
    return x, y

In [3]:
# Simulation configuration.
SEED = 0  # fixed seed so the generated (and pickled) dataset is reproducible
np.random.seed(SEED)

ntaxa = 11                   # number of taxa D
ndays = 50                   # time points per simulated trajectory
n_train, n_test = 200, 40    # number of trajectories in each split

# Ground-truth cLV parameters shared by every simulated trajectory.
A  = np.random.normal(loc=0,scale=0.2,size=(ntaxa-1, ntaxa))  # interaction matrix, (D-1) x D
g  = np.random.normal(loc=0,scale=0.1,size=ntaxa-1)           # growth rates, length D-1
N  = 10000 # sequencing reads parameter

In [35]:
# Simulate n_train + n_test independent trajectories, then split them in
# generation order: the first n_train go to the training set, the rest to test.
x_all, y_all, v_all = [], [], []
for _ in range(n_train + n_test):
    x, y = simulate_clv(ndays, A, g, N)
    x = np.asarray(x)
    y = np.asarray(y)
    # Counts are kept raw (not normalised to proportions); the day index is
    # prepended as column 0 of the observation matrix.
    y = np.concatenate([np.arange(ndays)[:, np.newaxis], y], axis=-1)
    # Keep only the first column of the stacked pair, i.e. the day index.
    v = np.stack([np.arange(ndays), np.zeros(ndays)], axis=-1)[:, :1]
    x_all.append(x)
    y_all.append(y)
    v_all.append(v)

x_train, x_test = x_all[:n_train], x_all[n_train:]
y_train, y_test = y_all[:n_train], y_all[n_train:]
v_train, v_test = v_all[:n_train], v_all[n_train:]


In [47]:
# Per-row totals for each training observation matrix, summing every column
# except column 0.
counts_train = [obs[:, 1:].sum(axis=-1) for obs in y_train]

In [49]:
# Per-row totals for each test observation matrix, summing every column
# except column 0.
counts_test = [obs[:, 1:].sum(axis=-1) for obs in y_test]

In [50]:
# Bundle every split into a single dictionary for plotting and pickling.
res = {
    "Xtrain": x_train,
    "Xtest": x_test,
    "Ytrain": y_train,
    "Ytest": y_test,
    "Vtrain": v_train,
    "Vtest": v_test,
    "counts_train": counts_train,
    "counts_test": counts_test,
}

In [7]:
from src.rslts_saving.rslts_saving import plot_obs_bar_plot

In [11]:
# Plot the first five training observation sequences without normalisation.
plot_obs_bar_plot(
    res["Ytrain"][:5],
    to_normalize=False,
    rslt_dir="clv_data_train_count",
)

In [12]:
# Plot the last five training observation sequences without normalisation.
plot_obs_bar_plot(
    res["Ytrain"][-5:],
    to_normalize=False,
    rslt_dir="clv_data_train_2_count",
)

In [51]:
# Persist the simulated dataset. Create the output directory first so that a
# fresh checkout without "data/" does not fail with FileNotFoundError.
os.makedirs("data", exist_ok=True)
with open("data/count_clv.p", "wb") as f:
    pickle.dump(res, f)