## GPcounts with zero-inflated negative binomial likelihood

Nuha BinTayyash, 2020

This notebook comapre GPcount fit with zero-inflated negative binomial, negative binomial and Gaussian likelihoods on ScRNA-seq gene expression data for highly expressed genes in Islet $\alpha$ cell from [GSE8737 single cell RNA-seq ](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE87375) dataset.

In [1]:
import pandas as pd
import numpy as np
import gpflow 

Load $\alpha$ dataset and pseudotime information

In [2]:
Y = pd.read_csv('normalized_alpha_counts.csv',index_col=[0]).astype(float)
X = pd.read_csv('alpha_time_points.csv',index_col=[0])

#### Fit GPcount with zero inflated negative binomial likelihood

In [3]:
from GPcounts.GPcounts_Module import Fit_GPcounts

In [4]:
gene_name = ['Fam184b','Pde1a' ]
gp_counts = Fit_GPcounts(X,Y.iloc[[0]]) 

Probabilty of zeros in Fam184b and Pde1a genes

In [5]:
#Y.loc[gene_name][Y.loc[gene_name] == 0].count(axis=1)/len(Y.columns)
Y.iloc[32:35,:][Y.iloc[34:35,:] == 0].count(axis=1)/len(Y.columns)

ENSMUSG00000000266    0.000000
ENSMUSG00000000275    0.000000
ENSMUSG00000000276    0.167683
dtype: float64

In [6]:
Y.iloc[32:35,:]

Unnamed: 0,aE17.5_2_09,aE17.5_2_16,aE17.5_1_11,aE17.5_3_07,aE17.5_4_06,aE17.5_3_04,aE17.5_2_11,aE17.5_1_25,aE17.5_4_01,aE17.5_4_03,...,aP18_3_12,aP60_1_11,aP60_3_05,aP15_1_15,aP60_1_13,aP60_3_08,aP60_5_16,aP18_1_17,aP60_1_10,aP60_5_05
ENSMUSG00000000266,0.0,0.0,0.0,0.0,86.165206,0.812166,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.073315,30.10076,0.0,0.0,0.0,0.0,128.68414
ENSMUSG00000000275,1.042968,50.832833,0.0,4.215918,0.0,5.685164,20.856865,0.0,39.395133,47.436661,...,16.262649,0.0,3.734786,0.0,38.310058,86.792367,0.0,0.0,54.082832,0.0
ENSMUSG00000000276,36.503888,3.986889,21.378473,65.34673,61.546575,0.812166,25.247784,39.644143,6.354054,58.323763,...,27.104415,0.0,109.24248,145.759571,1.368216,59.01881,1.043644,0.0,0.0,1.063505


In [7]:
from matplotlib import pyplot as plt
import statsmodels.api as sm

def plot():
    plt.tick_params(labelsize='large', width=2)     
    plt.ylabel('Gene Expression', fontsize=16)
    plt.xlabel('Times', fontsize=16)
    
    if model_index == 1 or model_index == 2:
        c = 'blue'

    else:
        c = 'salmon'
    
    
    if likelihood == 'Gaussian':
        plt.fill_between(xtest[:,0],
                            mean[:,0] - 1*np.sqrt(var[:,0]),
                            mean[:,0] + 1*np.sqrt(var[:,0]), alpha=0.2) # one standard deviation
        plt.fill_between(xtest[:,0],
                            mean[:,0] - 2*np.sqrt(var[:,0]),
                            mean[:,0] + 2*np.sqrt(var[:,0]),color='light'+c, alpha=0.2)# two standard deviation
    else:
        
        lowess = sm.nonparametric.lowess
        
        # one standard deviation 68%
        percentile_16 = lowess(np.percentile(var, 16, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        percentile_84 = lowess(np.percentile(var, 84, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        plt.fill_between(xtest[:,0],percentile_16,percentile_84,alpha = 0.2)
        
        # two standard deviation 95%
        percentile_5 = lowess(np.percentile(var, 5, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        percentile_95 = lowess(np.percentile(var,95, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        plt.fill_between(xtest[:,0],percentile_5,percentile_95,color='light'+c,alpha = 0.2)
        
    plt.plot(xtest, mean, lw=2) 
    plt.scatter(model.data[0],model.data[1], s=10, color= c, alpha=0.6) #data
    
    if test == 'Two_samples_test':
        if model_index == 1  or model_index ==3:
            plt.scatter(model.data[0][int(model.data[0].shape[0]/2)::],model.data[1][int(model.data[0].shape[0]/2)::], s=10, color= 'salmon', alpha=0.6) #data
            plt.show()
    
    else:
        plt.show()

In [9]:
likelihood = 'Zero_inflated_negative_binomial' 
log_likelihood = gp_counts.One_sample_test(likelihood)
log_likelihood

100%|██████████| 1/1 [00:23<00:00, 23.19s/it]


Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio
ENSMUSG00000000001,-1818.722041,-1832.525235,13.803194


In [None]:
#likelihood = 'Negative_binomial' 
indexes = log_likelihood.index.values.tolist() # list of genes to be plotted 
test = 'One_sample_test' # name of the test
xtest = np.linspace(np.min(X.values),np.max(X.values),100)[:,None]

params = gp_counts.load_models(indexes,test,xtest,likelihood)

for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

#### Fit GPcount with negative binomial likelihood

In [None]:
likelihood = 'Negative_binomial' 
log_likelihood_nb = gp_counts.Infer_trajectory(likelihood)
log_likelihood_nb

In [None]:
params = gp_counts.load_models(indexes,test,xtest,likelihood)
for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

#### Fit GPcount with Gaussian likelihood

In [None]:
likelihood = 'Gaussian'
log_likelihood = gp_counts.Infer_trajectory(likelihood)
log_likelihood

In [None]:
params = gp_counts.load_models(indexes,test,xtest,likelihood)

for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

In [None]:
test = float('nan')
test

In [None]:

if np.isnan(test):
    print('yes')