## GPcounts with zero-inflated negative binomial likelihood

Nuha BinTayyash, 2020

This notebook comapre GPcount fit with zero-inflated negative binomial, negative binomial and Gaussian likelihoods on ScRNA-seq gene expression data for highly expressed genes in Islet $\alpha$ cell from [GSE8737 single cell RNA-seq ](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE87375) dataset.

In [1]:
import pandas as pd
import numpy as np
import gpflow 

Load $\alpha$ dataset and pseudotime information

In [2]:
Y = pd.read_csv('normalized_alpha_counts26.csv',index_col=[0]).astype(float)
X = pd.read_csv('alpha_time_points.csv',index_col=[0])

#### Fit GPcount with zero inflated negative binomial likelihood

In [3]:
from GPcounts.GPcounts_Module import Fit_GPcounts

In [5]:
gene_name = ['ENSMUSG00000038943', 'ENSMUSG00000000394', 'ENSMUSG00000039396',
       'ENSMUSG00000040856', 'ENSMUSG00000023944', 'ENSMUSG00000062510',
       'ENSMUSG00000012443', 'ENSMUSG00000066366', 'ENSMUSG00000017716',
       'ENSMUSG00000027326', 'ENSMUSG00000027447', 'ENSMUSG00000021270',
       'ENSMUSG00000036928', 'ENSMUSG00000027419', 'ENSMUSG00000071178',
       'ENSMUSG00000022034', 'ENSMUSG00000001403', 'ENSMUSG00000038224',
       'ENSMUSG00000079015', 'ENSMUSG00000087594', 'ENSMUSG00000016319',
       'ENSMUSG00000073530', 'ENSMUSG00000024552', 'ENSMUSG00000025386',
       'ENSMUSG00000027322', 'ENSMUSG00000022285', 'ENSMUSG00000027469',
       'ENSMUSG00000007892', 'ENSMUSG00000052854', 'ENSMUSG00000049517']
gp_counts = Fit_GPcounts(X,Y)
                         #.loc[gene_name]) 

Probabilty of zeros in Fam184b and Pde1a genes

In [6]:
#Y.loc[gene_name][Y.loc[gene_name] == 0].count(axis=1)/len(Y.columns)
Y.iloc[32:35,:][Y.iloc[34:35,:] == 0].count(axis=1)/len(Y.columns)

ENSMUSG00000021147    0.000000
ENSMUSG00000021149    0.000000
ENSMUSG00000021156    0.115854
dtype: float64

In [7]:
Y.iloc[32:35,:]

Unnamed: 0,aE17.5_2_09,aE17.5_2_16,aE17.5_1_11,aE17.5_3_07,aE17.5_4_06,aE17.5_3_04,aE17.5_2_11,aE17.5_1_25,aE17.5_4_01,aE17.5_4_03,...,aP18_3_12,aP60_1_11,aP60_3_05,aP15_1_15,aP60_1_13,aP60_3_08,aP60_5_16,aP18_1_17,aP60_1_10,aP60_5_05
ENSMUSG00000021147,1.042968,17.941,91.749282,56.212241,4.734352,0.0,15.368216,6.343063,18.426756,27.217756,...,0.0,45.762684,4.668482,0.0,25.996111,0.0,0.0,0.0,0.0,64.873822
ENSMUSG00000021149,66.749967,112.62961,97.0939,28.106121,23.67176,150.250771,64.766054,56.294683,38.759728,37.327209,...,0.0,0.0,0.0,0.0,0.0,68.276662,115.844468,216.881919,0.0,1.063505
ENSMUSG00000021156,117.855411,119.606666,307.315555,72.373261,150.552392,121.82495,290.898378,119.725312,172.830263,431.595849,...,35.777827,0.0,1.867393,60.733154,158.713099,0.0,0.0,121.922809,0.0,38.28619


In [8]:
from matplotlib import pyplot as plt
import statsmodels.api as sm

def plot():
    plt.tick_params(labelsize='large', width=2)     
    plt.ylabel('Gene Expression', fontsize=16)
    plt.xlabel('Times', fontsize=16)
    
    if model_index == 1 or model_index == 2:
        c = 'blue'

    else:
        c = 'salmon'
    
    
    if likelihood == 'Gaussian':
        plt.fill_between(xtest[:,0],
                            mean[:,0] - 1*np.sqrt(var[:,0]),
                            mean[:,0] + 1*np.sqrt(var[:,0]), alpha=0.2) # one standard deviation
        plt.fill_between(xtest[:,0],
                            mean[:,0] - 2*np.sqrt(var[:,0]),
                            mean[:,0] + 2*np.sqrt(var[:,0]),color='light'+c, alpha=0.2)# two standard deviation
    else:
        
        lowess = sm.nonparametric.lowess
        
        # one standard deviation 68%
        percentile_16 = lowess(np.percentile(var, 16, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        percentile_84 = lowess(np.percentile(var, 84, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        plt.fill_between(xtest[:,0],percentile_16,percentile_84,alpha = 0.2)
        
        # two standard deviation 95%
        percentile_5 = lowess(np.percentile(var, 5, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        percentile_95 = lowess(np.percentile(var,95, axis=0),xtest[:,0],frac=1./5, return_sorted=False)
        plt.fill_between(xtest[:,0],percentile_5,percentile_95,color='light'+c,alpha = 0.2)
        
    plt.plot(xtest, mean, lw=2) 
    plt.scatter(model.data[0],model.data[1], s=10, color= c, alpha=0.6) #data
    
    if test == 'Two_samples_test':
        if model_index == 1  or model_index ==3:
            plt.scatter(model.data[0][int(model.data[0].shape[0]/2)::],model.data[1][int(model.data[0].shape[0]/2)::], s=10, color= 'salmon', alpha=0.6) #data
            plt.show()
    
    else:
        plt.show()

In [None]:
likelihood = 'Zero_inflated_negative_binomial' 
log_likelihood = gp_counts.One_sample_test(likelihood)
log_likelihood

  2%|▏         | 2/100 [00:59<51:32, 31.56s/it]

local Optima
1
y_mean 2.451219512195122
mean_mean 17.677572005956844
abs(round((mean_mean-y_mean)/y_mean)) 6.0
local Optima
1
y_mean 2.451219512195122
mean_mean 18.37409456328539
abs(round((mean_mean-y_mean)/y_mean)) 6.0
Fit Cholesky decomposition was not successful.
local Optima
1
y_mean 2.451219512195122
mean_mean 30.579500374877703
abs(round((mean_mean-y_mean)/y_mean)) 11.0
local Optima
1
y_mean 2.451219512195122
mean_mean 22.182523732201915
abs(round((mean_mean-y_mean)/y_mean)) 8.0
local Optima
1
y_mean 2.451219512195122
mean_mean 17.677572005956844
abs(round((mean_mean-y_mean)/y_mean)) 6.0
local Optima
1
y_mean 2.451219512195122
mean_mean 18.37409456328539
abs(round((mean_mean-y_mean)/y_mean)) 6.0
Fit Cholesky decomposition was not successful.
local Optima
1
y_mean 2.451219512195122
mean_mean 30.579500374877703
abs(round((mean_mean-y_mean)/y_mean)) 11.0
local Optima
1
y_mean 2.451219512195122
mean_mean 22.182523732201915
abs(round((mean_mean-y_mean)/y_mean)) 8.0


  9%|▉         | 9/100 [04:45<33:21, 21.99s/it]  

local Optima
1
y_mean 0.9359756097560976
mean_mean 2.4310297972886956
abs(round((mean_mean-y_mean)/y_mean)) 2.0


In [None]:
#likelihood = 'Negative_binomial' 
indexes = log_likelihood.index.values.tolist() # list of genes to be plotted 
test = 'One_sample_test' # name of the test
xtest = np.linspace(np.min(X.values),np.max(X.values),100)[:,None]

params = gp_counts.load_models(indexes,test,xtest,likelihood)

for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

#### Fit GPcount with negative binomial likelihood

In [None]:
likelihood = 'Negative_binomial' 
log_likelihood_nb = gp_counts.Infer_trajectory(likelihood)
log_likelihood_nb

In [None]:
params = gp_counts.load_models(indexes,test,xtest,likelihood)
for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

#### Fit GPcount with Gaussian likelihood

In [None]:
likelihood = 'Gaussian'
log_likelihood = gp_counts.Infer_trajectory(likelihood)
log_likelihood

In [None]:
params = gp_counts.load_models(indexes,test,xtest,likelihood)

for i in range(len(indexes)):
    fig = plt.figure()
    print(indexes[i])
    model_index = 1
    for model,mean,var in zip(params['models'][i],params['means'][i],params['vars'][i]):
        plot()
        model_index = model_index + 1

In [None]:
test = float('nan')
test

In [None]:

if np.isnan(test):
    print('yes')