## GPcounts with zero-inflated negative binomial likelihood

Nuha BinTayyash, 2020

This notebook compares GPcounts fits with zero-inflated negative binomial, negative binomial and Gaussian likelihoods on scRNA-seq gene expression data for highly expressed genes in islet $\alpha$ cells from the [GSE87375 single-cell RNA-seq](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE87375) dataset.

In [1]:
import pandas as pd
import numpy as np
import gpflow 

Load $\alpha$ dataset and pseudotime information

In [2]:
# Expression matrix: the first CSV column is used as the row index (gene IDs in the
# stored output below), the remaining columns are cells.
Y = pd.read_csv('normalized_alpha_counts.csv',index_col=[0])
# Time / pseudotime information for the cells (per the notebook text); the first CSV
# column is used as the row index. NOTE(review): presumably indexed by the same cell
# names as Y's columns -- confirm against the CSV.
X = pd.read_csv('alpha_time_points.csv',index_col=[0])

#### Fit GPcounts with zero-inflated negative binomial likelihood

In [3]:
from GPcounts.GPcounts_Module import Fit_GPcounts

In [4]:
# Candidate gene IDs. In this cell the list is only referenced by commented-out code;
# the fit below does NOT use it.
gene_name = ['ENSMUSG00000038943', 'ENSMUSG00000000394', 'ENSMUSG00000039396',
       'ENSMUSG00000040856', 'ENSMUSG00000023944', 'ENSMUSG00000062510',
       'ENSMUSG00000012443', 'ENSMUSG00000066366', 'ENSMUSG00000017716',
       'ENSMUSG00000027326', 'ENSMUSG00000027447', 'ENSMUSG00000021270',
       'ENSMUSG00000036928', 'ENSMUSG00000027419', 'ENSMUSG00000071178',
       'ENSMUSG00000022034', 'ENSMUSG00000001403', 'ENSMUSG00000038224',
       'ENSMUSG00000079015', 'ENSMUSG00000087594', 'ENSMUSG00000016319',
       'ENSMUSG00000073530', 'ENSMUSG00000024552', 'ENSMUSG00000025386',
       'ENSMUSG00000027322', 'ENSMUSG00000022285', 'ENSMUSG00000027469',
       'ENSMUSG00000007892', 'ENSMUSG00000052854', 'ENSMUSG00000049517']
# Build the GPcounts fitting object on three rows of Y selected by position (22, 23, 24).
gp_counts = Fit_GPcounts(X,Y.iloc[22:25,:])
# Disabled alternative selection: Y.loc[gene_name] (fit the genes listed above instead).

Probability of zeros in Fam184b and Pde1a genes

In [5]:
# Display the rows of Y at positions 22, 23 and 24 -- the same slice passed to Fit_GPcounts.
# NOTE(review): the stored output under this cell lists ten rows, so it looks stale
# relative to this three-row slice -- re-run to refresh.
Y.iloc[22:25,:]

Unnamed: 0,aE17.5_2_09,aE17.5_2_16,aE17.5_1_11,aE17.5_3_07,aE17.5_4_06,aE17.5_3_04,aE17.5_2_11,aE17.5_1_25,aE17.5_4_01,aE17.5_4_03,...,aP18_3_12,aP60_1_11,aP60_3_05,aP15_1_15,aP60_1_13,aP60_3_08,aP60_5_16,aP18_1_17,aP60_1_10,aP60_5_05
ENSMUSG00000000167,0.0,0.996722,0.0,0.0,0.94687,0.812166,0.0,4.757297,0.635405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000168,63.621062,68.773833,71.261578,74.48122,69.121539,49.542146,117.457081,35.679729,99.123239,14.775353,...,0.0,0.0,92.435945,0.0,147.767368,108.779767,0.0,0.0,1.081657,121.239603
ENSMUSG00000000171,201.29287,355.829831,387.48483,273.332023,273.645543,287.506881,267.846054,159.369455,223.662693,241.849206,...,229.845437,50.631054,107.375087,85.026416,183.340994,127.295472,129.411838,164.126858,97.349098,47.857738
ENSMUSG00000000184,213.808488,954.859882,555.840308,181.284478,223.461413,119.388451,217.350487,357.59017,99.123239,305.61652,...,948.654515,881.175082,1660.112216,1155.144598,1509.142659,1816.853557,1226.281526,2461.902866,799.344257,853.994745
ENSMUSG00000000194,27.117174,24.918055,24.050783,87.831627,37.874816,38.983984,0.0,27.7509,30.499458,45.881361,...,23.851885,103.209457,174.601229,1.214663,2.736433,1.157232,1.043644,0.0,16.22485,0.0
ENSMUSG00000000197,4.171873,0.996722,0.0,9.837142,0.0,0.0,3.293189,0.0,0.0,0.0,...,27.104415,0.0,0.0,71.665122,38.310058,17.358473,82.447864,4.689339,0.0,144.636719
ENSMUSG00000000202,0.0,32.891833,0.0,38.645916,52.077872,0.0,0.0,19.822071,30.499458,31.106007,...,0.0,70.104537,5.602178,0.0,0.0,55.547115,30.265672,5.861673,0.0,30.841653
ENSMUSG00000000214,0.0,145.521443,0.0,0.0,0.0,17.867659,0.0,0.0,33.04108,54.435513,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000215,2.085936,926.95166,0.0,53.401629,11.362445,22.740657,97.697946,5125.194804,131.528913,73.876767,...,0.0,0.0,28.010892,20.649273,1.368216,16.201242,1.043644,4.689339,154.6769,1157.093752
ENSMUSG00000000216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Fraction of cells with an exactly-zero count, per gene, for the rows of Y at
# positions 20..29: count the zeros along each row and divide by the number of cells.
Y.iloc[20:30].eq(0).sum(axis=1).div(Y.shape[1])

ENSMUSG00000000167    0.814024
ENSMUSG00000000168    0.274390
ENSMUSG00000000171    0.015244
ENSMUSG00000000184    0.000000
ENSMUSG00000000194    0.213415
ENSMUSG00000000197    0.368902
ENSMUSG00000000202    0.503049
ENSMUSG00000000214    0.896341
ENSMUSG00000000215    0.268293
ENSMUSG00000000216    0.969512
dtype: float64

In [7]:
from matplotlib import pyplot as plt
import statsmodels.api as sm

def plot():
    """Draw one fitted model: predictive mean, two shaded uncertainty bands and the data.

    The function takes no arguments. It reads the following notebook-level globals,
    which the calling cell must set before every call:

    * ``xtest``       -- test inputs; column 0 is used as the x axis.
    * ``mean``        -- predictive mean at ``xtest``.
    * ``var``         -- for the Gaussian likelihood, the predictive variance (column 0
                         is used); for any other likelihood it is reduced with
                         ``np.percentile(var, q, axis=0)``, i.e. presumably an array of
                         samples with one row per sample -- TODO confirm against
                         ``load_models``.
    * ``model``       -- fitted model; ``model.data[0]`` / ``model.data[1]`` are
                         scattered as the observed inputs / outputs.
    * ``likelihood``  -- likelihood name; ``'Gaussian'`` selects the mean +/- k*std bands.
    * ``test``        -- test name; ``'Two_samples_test'`` changes when the figure is shown.
    * ``model_index`` -- 1-based position of the model within the current gene.

    The inner band covers roughly 68% (one standard deviation) and the outer band
    roughly 95% (two standard deviations). For non-Gaussian likelihoods the bands are
    LOWESS-smoothed percentiles: 16/84 for the inner band and 2.5/97.5 for the outer
    band (previously 5/95, which is only a 90% interval and did not match the
    two-standard-deviation band of the Gaussian branch).

    Returns nothing; the figure is drawn on the current matplotlib axes.
    """
    plt.tick_params(labelsize='large', width=2)
    plt.ylabel('Gene Expression', fontsize=16)
    plt.xlabel('Times', fontsize=16)

    # Models 1 and 2 are drawn in blue, any other model in salmon. The outer band uses
    # the matching 'light' + c named colour ('lightblue' / 'lightsalmon').
    c = 'blue' if model_index in (1, 2) else 'salmon'

    x = xtest[:,0]

    if likelihood == 'Gaussian':
        mu = mean[:,0]
        std = np.sqrt(var[:,0])
        # one standard deviation (~68%)
        plt.fill_between(x, mu - 1*std, mu + 1*std, alpha=0.2)
        # two standard deviations (~95%)
        plt.fill_between(x, mu - 2*std, mu + 2*std, color='light'+c, alpha=0.2)
    else:
        lowess = sm.nonparametric.lowess

        def smoothed_percentile(q):
            # q-th percentile across axis 0 of `var` at every test point, smoothed along x.
            return lowess(np.percentile(var, q, axis=0), x, frac=1./5, return_sorted=False)

        # central 68% band (equivalent of one standard deviation)
        plt.fill_between(x, smoothed_percentile(16), smoothed_percentile(84), alpha=0.2)

        # central 95% band (equivalent of two standard deviations)
        plt.fill_between(x, smoothed_percentile(2.5), smoothed_percentile(97.5),
                         color='light'+c, alpha=0.2)

    plt.plot(xtest, mean, lw=2)
    plt.scatter(model.data[0], model.data[1], s=10, color=c, alpha=0.6)  # data

    if test == 'Two_samples_test':
        # Only models 1 and 3 finish a figure; model 2 is left on the current figure so
        # that the next model is drawn on top of it.
        if model_index == 1 or model_index == 3:
            # Re-colour the second half of the observations in salmon.
            # NOTE(review): presumably the second half holds the second sample -- confirm.
            half = int(model.data[0].shape[0]/2)
            plt.scatter(model.data[0][half::], model.data[1][half::],
                        s=10, color='salmon', alpha=0.6)  # data
            plt.show()

    else:
        plt.show()

In [None]:
# Likelihood name for this section; later cells pass it to load_models() and plot() reads it.
likelihood = 'Zero_inflated_negative_binomial' 
# Run the one-sample test with that likelihood. The index of the returned object is
# used by the next cell to choose the genes to plot.
log_likelihood = gp_counts.One_sample_test(likelihood)
# Bare last expression: display the result.
log_likelihood

 12%|█▎        | 1/8 [00:22<02:38, 22.63s/it]

In [None]:
# `likelihood` is the value set by the fitting cell that was run last.
indexes = log_likelihood.index.values.tolist()  # genes to plot
test = 'One_sample_test'  # test name handed to load_models(); plot() reads it too
# 100 evenly spaced inputs spanning the observed range of X, as a column vector.
xtest = np.linspace(np.min(X.values), np.max(X.values), 100).reshape(-1, 1)

params = gp_counts.load_models(indexes, test, xtest, likelihood)

# One figure per gene. plot() takes no arguments and reads the globals
# model, mean, var and model_index, so the loop variables must keep these names.
for i, gene_id in enumerate(indexes):
    fig = plt.figure()
    print(gene_id)
    model_index = 1
    fitted = zip(params['models'][i], params['means'][i], params['vars'][i])
    for model, mean, var in fitted:
        plot()
        model_index += 1

#### Fit GPcounts with negative binomial likelihood

In [None]:
# Switch the likelihood for this section; load_models() and plot() read this global.
likelihood = 'Negative_binomial' 
# Fit with the negative binomial likelihood via Infer_trajectory.
# NOTE(review): the zero-inflated section used One_sample_test, and the following cell
# still loads models with test = 'One_sample_test' -- confirm that load_models accepts
# that test name for models produced by Infer_trajectory.
log_likelihood_nb = gp_counts.Infer_trajectory(likelihood)
# Bare last expression: display the result.
log_likelihood_nb

In [None]:
# Reuses indexes, test and xtest from the earlier plotting cell, with the current `likelihood`.
params = gp_counts.load_models(indexes, test, xtest, likelihood)

# One figure per gene. plot() takes no arguments and reads the globals
# model, mean, var and model_index, so the loop variables must keep these names.
for i, gene_id in enumerate(indexes):
    fig = plt.figure()
    print(gene_id)
    model_index = 1
    fitted = zip(params['models'][i], params['means'][i], params['vars'][i])
    for model, mean, var in fitted:
        plot()
        model_index += 1

#### Fit GPcounts with Gaussian likelihood

In [None]:
# Switch the likelihood for this section; plot() takes its mean +/- k*std branch for 'Gaussian'.
likelihood = 'Gaussian'
# Fit with the Gaussian likelihood via Infer_trajectory.
# NOTE(review): this rebinds `log_likelihood`, which previously held the zero-inflated
# negative binomial results; re-running the earlier cell that derives `indexes` after
# this one would read the Gaussian results instead.
log_likelihood = gp_counts.Infer_trajectory(likelihood)
# Bare last expression: display the result.
log_likelihood

In [None]:
# Reuses indexes, test and xtest from the earlier plotting cell, with the current `likelihood`.
params = gp_counts.load_models(indexes, test, xtest, likelihood)

# One figure per gene. plot() takes no arguments and reads the globals
# model, mean, var and model_index, so the loop variables must keep these names.
for i, gene_id in enumerate(indexes):
    fig = plt.figure()
    print(gene_id)
    model_index = 1
    fitted = zip(params['models'][i], params['means'][i], params['vars'][i])
    for model, mean, var in fitted:
        plot()
        model_index += 1