In [1]:
import GPy
import pickle
import numpy as np
from scipy.special import logsumexp

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# generate binary input vector
def simulate_single_input(time, Dv):
    """Simulate a binary (0/1) input matrix for a single subject.

    Columns ``0 .. Dv - 2`` are candidate perturbation columns: a random
    subset of them is switched on for one contiguous interval each. The last
    column (``-1``) is the "surgery" indicator: it switches on at a random
    time step and stays on until the end of the series.

    Parameters
    ----------
    time : int
        Number of time steps (rows). Must be at least 2, otherwise the random
        draws for the start indices have an empty range and numpy raises.
    Dv : int
        Number of input dimensions (columns), including the surgery column.
        ``Dv == 0`` yields an empty ``(time, 0)`` array.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(time, Dv)`` containing only 0.0 and 1.0.
    """
    inputs = np.zeros((time, Dv))
    if Dv > 0:
        # Draw the requested number of active columns first so that the random
        # stream is identical to the previous implementation whenever that
        # implementation succeeded.
        number_of_non_zero_dimenions = np.random.choice(np.arange(3, 8), 1)[0]

        # Only Dv - 1 columns are available (the last one is reserved for
        # surgery). Sampling without replacement raises ValueError when more
        # columns are requested than exist, so clamp the request.
        n_candidates = Dv - 1
        n_active = min(number_of_non_zero_dimenions, n_candidates)

        if n_active > 0:
            non_zero_dimensions = np.random.choice(n_candidates, n_active, replace=False)

            for k in non_zero_dimensions:
                start = np.random.choice(time - 1, 1)[0]
                # Interval length is 1 + U{40..59}, truncated at the series end.
                end = start + 1 + np.random.choice(np.arange(40, 60), 1)[0]
                end = min(end, time)
                inputs[start:end, k] = np.ones(end - start)

        # surgery is at the last axis
        start = np.random.randint(int(0.3 * time), int(0.6 * time))
        end = time
        inputs[start:end, -1] = np.ones(end - start)

    return inputs

In [3]:
def simulate_clv_with_inputs(A, g, Wv, f_cov, N, inputs, X_cov, beta):
    """Simulate one latent trajectory and its count observations.

    Latent mean dynamics (one step per row of ``inputs``)::

        mu_{t+1} = mu_t + g + Wv v_t + A p_t + noise_t

    where ``p_t`` is the softmax of ``mu_t`` with a reference coordinate 0
    appended, and ``noise_t ~ N(0, diag(f_cov))``. Each latent dimension then
    receives an additive Gaussian perturbation over time with covariance
    ``X_cov``. Observations are multinomial counts with probabilities
    ``beta @ softmax([x_t, 0])`` and a log-normal/Poisson total read depth.

    Parameters
    ----------
    A : array, shape (latent_dim, latent_dim + 1)
        Interaction matrix applied to the proportions ``p_t``.
    g : array, shape (latent_dim,)
        Constant drift.
    Wv : array, shape (latent_dim, input_dim)
        Input weights.
    f_cov : array, shape (latent_dim,)
        Diagonal of the transition-noise covariance.
    N : float
        Read-depth parameter; ``log`` of the depth is drawn from
        ``Normal(log(N), 0.5)`` before the Poisson draw.
    inputs : array, shape (ndays, input_dim)
        External inputs ``v_t``.
    X_cov : array, shape (ndays, ndays)
        Covariance over time used to sample each latent dimension around its
        mean path.
    beta : array, shape (Dy, latent_dim + 1)
        Maps latent proportions to observation probabilities.

    Returns
    -------
    X : array, shape (ndays, latent_dim)
    y_count : array, shape (ndays, Dy), float-valued counts
    y_percentage : array, shape (ndays, Dy), rows of ``y_count`` normalised
        by their sum
    """

    def _proportions(latent):
        # Append the reference coordinate (0) and apply a stable softmax.
        padded = np.concatenate((latent, np.array([0])))
        return np.exp(padded - logsumexp(padded))

    n_latent = A.shape[0]
    n_steps, _ = inputs.shape
    zero_mean = np.zeros(n_latent)
    noise_cov = np.diag(f_cov)

    # --- latent mean path -------------------------------------------------
    state = np.random.multivariate_normal(mean=zero_mean, cov=2 * np.eye(n_latent))
    mean_path = []
    for step in range(n_steps):
        mean_path.append(state)
        props = _proportions(state)
        # A noise vector is drawn on every step, including the last one whose
        # successor state is never recorded; this keeps the random stream
        # aligned with the number of time steps.
        noise = np.random.multivariate_normal(mean=zero_mean, cov=noise_cov)
        state = state + g + Wv.dot(inputs[step]) + A.dot(props) + noise
    mean_path = np.array(mean_path)

    # --- temporally correlated perturbation, one draw per latent dimension -
    X = np.zeros_like(mean_path)
    for dim in range(mean_path.shape[1]):
        X[:, dim] = np.random.multivariate_normal(mean_path[:, dim], X_cov)

    # --- observations -------------------------------------------------------
    count_rows = []
    fraction_rows = []
    for latent in X:
        probs = beta.dot(_proportions(latent))

        # total number of reads with over-dispersion
        log_depth = np.random.normal(loc=np.log(N), scale=0.5)
        depth = np.random.poisson(np.exp(log_depth))

        counts = np.random.multinomial(depth, probs).astype(float)
        count_rows.append(counts)
        fraction_rows.append(counts / np.sum(counts))

    return np.array(X), np.array(count_rows), np.array(fraction_rows)

In [50]:
# ---------------------------------------------------------------------------
# Simulation configuration and ground-truth parameters.
# NOTE(review): no random seed is set in the visible notebook, so the values
# drawn below differ between runs -- confirm whether that is intended.
# ---------------------------------------------------------------------------
Dx = 5            # number of taxa groups; the latent state has Dx - 1 dims
Dy = 2 * Dx       # number of observed categories
Dv = 0            # number of input columns, including surgery
n_train, n_test = 600, 150
time = 90         # time steps simulated per subject
obs_percentage = 1


def _format_row(values, spec):
    """Join the elements of ``values``, each rendered with format ``spec``."""
    return " ".join(spec.format(ele) for ele in values)


# Interaction matrix: negative diagonal (self-limitation); every upper-triangle
# entry A[i, j] gets the opposite sign of its mirror entry A[j, i].
A = np.random.normal(loc=0, scale=0.3, size=(Dx, Dx))
diag_idx = np.arange(Dx)
A[diag_idx, diag_idx] = -np.abs(A[diag_idx, diag_idx])
upper_rows, upper_cols = np.triu_indices(Dx, k=1)
A[upper_rows, upper_cols] = -np.sign(A[upper_cols, upper_rows]) * np.abs(A[upper_rows, upper_cols])

# Input weights: the surgery column (last) is forced non-positive, and every
# other column keeps only a random quarter (rounded) of its entries non-zero.
Wv = np.random.normal(loc=0, scale=0.02, size=(Dx, Dv))
if Dv > 0:
    Wv[:, -1] = -np.abs(Wv[:, -1])
n_zero_items = int(Dx * 0.75)
for i in range(Dv - 1):
    idxs = np.random.choice(np.arange(Dx), n_zero_items, replace=False)
    Wv[idxs, i] = 0

g = np.random.gamma(shape=1.1, scale=0.01, size=(Dx,))
# scale=0.0 makes every draw exactly zero, i.e. no transition noise.
f_cov = np.abs(np.random.normal(loc=0, scale=0.0, size=(Dx - 1,)))
N = 10000  # sequencing reads parameter

# Emission matrix: latent group i puts 0.6 / 0.3 on observed categories
# 2i / 2i + 1 and spreads the remainder evenly over the other Dy - 2.
beta = np.ones((Dy, Dx)) * (1 - 0.6 - 0.3) / (Dy - 2)
for i in range(Dx):
    beta[2 * i, i] = 0.6
    beta[2 * i + 1, i] = 0.3

k_var = 0.0
k_len = 1
kernel = GPy.kern.RBF(1, variance=k_var, lengthscale=k_len)

print("A\n" + "\n".join(_format_row(row, "{:>5.2f}") for row in A))
print("g\n" + _format_row(g, "{:>5.3f}"))
print("Wv\n" + "\n".join(_format_row(row, "{:>5.2f}") for row in Wv))
print("f_cov\n" + _format_row(f_cov, "{:>5.3f}"))

# Reduced parameters: every row expressed relative to the last (reference) row.
A_r = A[:-1] - A[-1:]
g_r = g[:-1] - g[-1:]
Wv_r = Wv[:-1] - Wv[-1:]

A
-0.07 -0.19  0.22 -0.12 -0.46
 0.32 -0.14  0.39 -1.08 -0.05
-0.13 -0.36 -0.19  0.13 -0.02
 0.13  0.07 -0.02 -0.28  0.02
 0.12  0.10  0.01 -0.35 -0.08
g
0.003 0.005 0.004 0.007 0.010
Wv





f_cov
0.000 0.000 0.000 0.000


In [51]:
# Optionally replace the freshly drawn parameters with ones saved by an
# earlier run, so that several datasets share the same ground truth.
overwrite_params = True
if overwrite_params:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # produced by a trusted run of this project.
    with open("data/clv_count_Dx_5_Dy_10_ntrain_20_obp_10_noinput.p", "rb") as f:
        d = pickle.load(f)

    A, g, Wv, f_cov, N, k_var, k_len, beta = (
        d[key] for key in ("A", "g", "Wv", "f_cov", "N", "k_var", "k_len", "beta")
    )

    # Reduced parameters relative to the last (reference) row.
    A_r, g_r, Wv_r = (full[:-1] - full[-1:] for full in (A, g, Wv))

    # The kernel hyper-parameters read from the file are deliberately replaced
    # right away: this dataset uses its own GP variance / lengthscale.
    k_var = 0.1
    k_len = 1
    kernel = GPy.kern.RBF(1, variance=k_var, lengthscale=k_len)

In [52]:
# Sparsity label written into the output file name (rendered with two decimals).
# NOTE(review): presumably the fraction of simulated days that are kept; it is
# not used to build `selected_time`, so keep the two in sync by hand.
n_sparsity = 1.0 / 3.0

In [53]:
# Indices of the simulated time steps that are kept: every third step, 0 .. 87.
selected_time = list(range(0, 90, 3))

In [54]:
# Show the retained time indices as a quick visual sanity check.
selected_time

[0,
 3,
 6,
 9,
 12,
 15,
 18,
 21,
 24,
 27,
 30,
 33,
 36,
 39,
 42,
 45,
 48,
 51,
 54,
 57,
 60,
 63,
 66,
 69,
 72,
 75,
 78,
 81,
 84,
 87]

In [55]:
# Build the train / test datasets: simulate every subject on the full time
# grid, then keep only the rows listed in `selected_time`.
x_train, x_test = [], []
y_count_train, y_count_test = [], []
y_percentage_train, y_percentage_test = [], []
v_train, v_test = [], []

# All inputs are drawn before any trajectory so the order of random draws is:
# inputs for every subject first, then one simulation per subject.
batch_inputs = [simulate_single_input(time, Dv) for _ in range(n_train + n_test)]
X_cov = kernel.K(np.arange(time)[:, np.newaxis])

# Time-stamp column prepended to every observation / input matrix.
# NOTE(review): the stamps are consecutive positions 0 .. len(selected_time) - 1,
# not the original time indices stored in `selected_time` -- confirm that
# downstream consumers expect the re-indexed axis.
# (An earlier variant drew a random subset of observed days, always keeping
# day 0, based on `obs_percentage`; fixed-stride selection replaces it.)
days = np.arange(len(selected_time))[:, np.newaxis]

for i, v in enumerate(batch_inputs):  # v: (time, Dv)
    x, y_count, y_percentage = simulate_clv_with_inputs(A_r, g_r, Wv_r, f_cov, N, v, X_cov, beta)

    # Sub-sample the time axis, then attach the time-stamp column.
    x = x[selected_time]
    y_count = np.concatenate([days, y_count[selected_time]], axis=-1)
    y_percentage = np.concatenate([days, y_percentage[selected_time]], axis=-1)
    v = np.concatenate([days, v[selected_time]], axis=-1)

    # The first n_train subjects form the training set, the rest the test set.
    is_train = i < n_train
    (x_train if is_train else x_test).append(x)
    (y_count_train if is_train else y_count_test).append(y_count)
    (y_percentage_train if is_train else y_percentage_test).append(y_percentage)
    (v_train if is_train else v_test).append(v)

In [56]:
# Total read count per retained time step for every subject. Column 0 of each
# observation matrix is the time stamp, so it is excluded from the sum.
counts_train = [single_obs[:, 1:].sum(axis=-1) for single_obs in y_count_train]
counts_test = [single_obs[:, 1:].sum(axis=-1) for single_obs in y_count_test]

In [58]:
# Number of training subjects; should equal n_train.
len(x_train)

600

In [57]:
# Shape of one latent trajectory: (retained time steps, latent dimensions).
x_train[0].shape

(30, 4)

In [59]:
# Bundle the simulated datasets together with the ground-truth parameters and
# write them to a single pickle file whose name encodes the configuration.
c_data = {
    # simulated data
    "Xtrain": x_train,
    "Xtest": x_test,
    "Ytrain": y_count_train,
    "Ytest": y_count_test,
    "Vtrain": v_train,
    "Vtest": v_test,
    "counts_train": counts_train,
    "counts_test": counts_test,
    # ground-truth parameters (full, non-reduced form)
    "A": A,
    "g": g,
    "Wv": Wv,
    "f_cov": f_cov,
    "N": N,
    "k_var": k_var,
    "k_len": k_len,
    "beta": beta,
}

output_path = "data/clv_gp_Dx_{}_Dy_{}_Dv_{}_ntrain_{}_sparse_{:0.2f}_ls_{}.p".format(
    Dx, Dy, Dv, n_train, n_sparsity, k_len
)
with open(output_path, "wb") as f:
    pickle.dump(c_data, f)

1800