# GPcounts Regression on synthetic data

Nuha BinTayyash, 2020

GPcounts is a Gaussian process regression package for count data with negative binomial and zero-inflated negative binomial likelihoods, described in the paper "Gaussian process modelling of count data with
application to bulk and single-cell RNA-Seq".

This notebook shows how to build a GPcounts model and plot the posterior model fit on synthetic data.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import scipy.stats as ss
import gpflow
%pylab inline 

### Sample synthetic count data from a Gaussian process with a negative binomial likelihood

In [None]:
# Reset TensorFlow's v1-compat default graph and fix every seed used in this
# notebook to 0, so that repeated runs start from the same random state.
tf.compat.v1.reset_default_graph()
tf.compat.v1.set_random_seed(0)  # seed for the TF1-compat API
# Make float64 the default floating-point type for GPflow.
gpflow.config.set_default_float(np.float64)
np.random.seed(0)  # NumPy global RNG (np.random.randn in sample_GP draws from it)
tf.random.set_seed(0)  # seed for the TF2 API

def sample_GP():
    """Draw one latent function from a GP prior and plot it.

    Reads the module-level inputs ``x`` and the point count ``N``. The prior
    uses an RBF kernel (lengthscale 4, variance 1) and a constant mean of 1.
    A 1e-6 diagonal jitter is added to the kernel matrix before the Cholesky
    factorisation.

    Returns
    -------
    The sampled function values: the Cholesky factor applied to an
    ``(N, 1)`` standard-normal draw, shifted by the mean.
    """
    rbf = gpflow.kernels.RBF(variance=1., lengthscale=4.)
    jitter = np.eye(N) * 1e-6
    cov = rbf(x) + jitter
    chol = np.linalg.cholesky(cov)
    prior_mean = np.ones((N, 1))

    # f = L z + mu with z ~ N(0, I) has covariance L L^T = cov.
    white_noise = np.random.randn(N, 1)
    f = np.dot(chol, white_noise) + prior_mean

    plt.plot(x, f, label='latent function f(x)', color='green')
    plt.legend()
    plt.show()
    return f

def sample_from_NegativeBinom(r, mean):
    """Draw one negative binomial count for every row of ``mean``.

    ``r`` is handed to ``scipy.stats.nbinom.rvs`` as its first shape
    parameter and ``r / (mean + r)`` as its probability parameter, one
    ``rvs`` call (``size=1``) per row of ``mean``.

    Returns
    -------
    The individual draws stacked row-wise with ``np.vstack``, in the same
    order as the rows of ``mean``.
    """
    success_prob = r / (mean + r)
    # One rvs call per row, in row order, so the random stream is consumed
    # exactly as a plain for-loop would consume it.
    draws = [
        ss.nbinom.rvs(r, success_prob[row], size=1)
        for row in range(mean.shape[0])
    ]
    return np.vstack(draws)

# Simulation design: D genes, 10 time points, S samples per time point.
D = 2  # genes
S = 3  # samples per time point
x = np.arange(10.)  # time points 0., 1., ..., 9.
x = np.repeat(x, S).reshape(-1, 1)  # each time point repeated S times, as a column
# Derive the cell count from x rather than hard-coding 30: sample_GP() adds
# np.eye(N) to kern(x), so N has to equal the number of rows of x or the
# shapes no longer match (e.g. after changing S).
N = x.shape[0]  # cells
f = sample_GP()
link_f = np.exp(f)  # exponential link: latent f -> positive mean
alphas = [.5, 3.]  # one value per gene; 1/alpha is passed as r
y = []

for i in range(D):
    # The sampler's output is appended as-is. An earlier `[: None]` slice on
    # this call selected the whole array (a no-op) and has been dropped.
    y.append(sample_from_NegativeBinom(1. / alphas[i], link_f))
    plt.plot(x, y[i], 'kx')
    plt.xlabel('Times - NB', fontsize=16)
    plt.ylabel('Gene Expression', fontsize=16)
    plt.show()

##### Create dataframes for gene expression count data and pseudotime points

In [None]:
# Stack the per-gene samples and keep only the first two axes.
y = np.array(y)
y = y.reshape(*y.shape[:2])

# Row labels gene_1..gene_G and column labels cell_1..cell_C, numbered from 1.
genes_name = ['gene_%s' % k for k in range(1, y.shape[0] + 1)]
cells_name = ['cell_%s' % k for k in range(1, y.shape[1] + 1)]

# Gene expression count data: one row per gene, one column per cell.
Y = pd.DataFrame(y, index=genes_name, columns=cells_name)
# Time points: one row per entry of x, keyed by its position in x.
X = pd.DataFrame.from_dict(
    dict(enumerate(x)),
    orient='index',
    columns=['time_point'],
)

### Fit GPcounts

The available likelihoods are: Negative_binomial, Zero_inflated_negative_binomial, Poisson and Gaussian.  

### Infer trajectory with RBF kernel

In [None]:
from GPcounts.GPcounts_Module import Fit_GPcounts
# Likelihood name handed to Infer_trajectory (and reused by the plotting cell).
likelihood = 'Negative_binomial' 
# Build the fitter from the time points X and the count matrix Y.
gp_counts = Fit_GPcounts(X,Y) 
# NOTE(review): presumably fits one model per gene and returns a table of
# per-gene log likelihoods indexed by gene name — confirm against GPcounts.
log_likelihood = gp_counts.Infer_trajectory(likelihood)
# Bare expression so the notebook displays the result as the cell output.
log_likelihood

### Save the genes' log likelihoods

In [None]:
# Write the Infer_trajectory result to log_likelihood.csv (relative path, so
# the file lands in the current working directory).
log_likelihood.to_csv("log_likelihood.csv")

## Plot GPcounts fitting 

Load the fitted GP models for the selected genes and plot samples from the posterior predictive distribution, using the negative binomial likelihood or whichever likelihood was selected.

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm

def plot(likelihood,xtest,model,mean,var):
    """Plot a model's prediction, an uncertainty band and the training data.

    Parameters
    ----------
    likelihood : str
        ``'Gaussian'`` draws a mean +/- 2 standard deviation band; any other
        value draws a LOWESS-smoothed 5th-95th percentile band.
    xtest : array
        Prediction inputs; column 0 is the x-axis of the band.
    model : object
        Its ``data`` attribute is read as ``(inputs, observations)`` for the
        scatter plot.
    mean : array
        Predicted mean, drawn as a line over ``xtest``.
    var : array
        Gaussian case: column 0 is read as the variance. Otherwise the
        percentiles are taken along axis 0 — presumably a stack of posterior
        samples rather than a variance (TODO confirm against the caller).
    """
    # A new figure per call. The handle was never used, so it is not kept.
    plt.figure()
    plt.ylabel('Gene Expression', fontsize=16)
    plt.xlabel('Times', fontsize=16)

    band_x = xtest[:, 0]
    if likelihood == 'Gaussian':
        half_width = 2 * np.sqrt(var[:, 0])
        lower = mean[:, 0] - half_width
        upper = mean[:, 0] + half_width
    else:
        # Smooth the pointwise percentiles so the band edges are not jagged.
        lower, upper = (
            sm.nonparametric.lowess(
                np.percentile(var, q, axis=0),
                band_x,
                frac=1./5,
                return_sorted=False,
            )
            for q in (5, 95)
        )
    plt.fill_between(band_x, lower, upper, alpha=0.2)

    # NOTE(review): an unused local `y` used to be computed here
    # (log(data + 1) for 'Gaussian', the raw data otherwise) but the scatter
    # always used model.data[1]. The dead assignment was removed; confirm
    # whether the Gaussian case should scatter log-transformed data instead.
    plt.scatter(model.data[0], model.data[1], s=10, color='black', alpha=0.6)  # data
    plt.plot(xtest, mean, lw=2)
    plt.show()
    
# Genes to plot: every row label of the log-likelihood table.
indexes = log_likelihood.index.values
test = 'Infer_trajectory'  # name of the test
# 100 evenly spaced prediction points spanning the observed times, as a column.
xtest = np.linspace(X.values.min(), X.values.max(), 100).reshape(-1, 1)
sample = True  # sample or/and load model
params = gp_counts.load_and_sample_models(indexes, test, xtest, likelihood, sample)

for i, gene in enumerate(indexes):
    print(gene)
    # Entry i of each params list holds the results for gene i; every
    # (mean, var, model) triple gets its own figure and parameter summary.
    for mean, var, model in zip(params['means'][i],
                                params['vars'][i],
                                params['models'][i]):
        mean = np.array(mean)
        var = np.array(var)
        plot(likelihood, xtest, model, mean, var)
        gpflow.utilities.print_summary(model, fmt='notebook')