# GPcounts on bulk dataset

Nuha BinTayyash, 2020

This notebook shows how to run the [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) R package to normalize [fission yeast](https://bioconductor.org/packages/release/data/experiment/html/fission.html) gene expression data. It then compares GPcounts with a negative binomial likelihood against GPcounts with a Gaussian likelihood for finding differentially expressed genes in one-sample and two-sample tests.

In [1]:
import numpy as np
import pandas as pd
import gpflow

In [2]:
# Read the two CSV inputs; the first column of each file becomes the index.
Y = pd.read_csv('exons_counts.csv', index_col=[0])
X = pd.read_csv('time_points.csv', index_col=[0])

# Min-max scale each column of X: subtract the column minimum, then divide
# by the column range (max - min).
col_min = X.min(axis=0)
col_span = X.max(axis=0) - col_min
X = (X - col_min) / col_span
X

Unnamed: 0,time
E11_1_F,0.0
E12_1_F,0.08
E13A_1_F,0.16
E13B_1_F,0.24
E14A_1_F,0.4
E14B_1_F,0.52
E14C_1_F,0.64
E14D_1_F,0.76
E14E_1_F,0.88
ECF_1_F,1.0


In [3]:
# Ten percent of the number of rows in X (same operation order as before,
# so the floating-point result is unchanged).
10 * len(X) / 100

3.0

Fit normalized fission gene expression data using GPcounts -- one-sample test

In [4]:
from matplotlib import pyplot as plt
import statsmodels.api as sm

def plot():
    """Draw the current model's predictive mean, uncertainty bands and data.

    The function takes no arguments. It reads these notebook-level names,
    which the calling cell must set before every call:

    * ``xtest``       -- prediction inputs; column 0 is used as the x axis
    * ``mean``        -- predictive mean, plotted against ``xtest``
    * ``var``         -- predictive variance (Gaussian likelihood), otherwise
                         an array whose percentiles along axis 0 give the bands
                         (presumably predictive samples -- TODO confirm)
    * ``model``       -- fitted model; ``model.data[0]`` / ``model.data[1]``
                         are scattered as the observed x / y values
    * ``model_index`` -- 1-based position of the model within the current test
    * ``likelihood``  -- likelihood name; ``'Gaussian'`` selects +/- sd bands
    * ``test``        -- test name; ``'Two_samples_test'`` changes the scatter
                         colouring and when the figure is shown

    Returns:
        None. The function only draws on the current matplotlib figure and,
        depending on ``test`` / ``model_index``, calls ``plt.show()``.
    """
    plt.tick_params(labelsize='large', width=2)
    #plt.ylabel('Gene Expression', fontsize=16)
    #plt.xlabel('Times', fontsize=16)

    # The third model is drawn in green, every other model in blue.
    c = 'green' if model_index == 3 else 'royalblue'

    plt.plot(xtest, mean, color=c, lw=2)

    x_axis = xtest[:, 0]

    if likelihood == 'Gaussian':
        sd = np.sqrt(var[:, 0])
        centre = mean[:, 0]
        # one standard deviation
        plt.fill_between(x_axis, centre - 1*sd, centre + 1*sd, color=c, alpha=0.2)
        # two standard deviations
        plt.fill_between(x_axis, centre - 2*sd, centre + 2*sd, color=c, alpha=0.1)
    else:
        lowess = sm.nonparametric.lowess

        def _smoothed_percentile(q):
            """LOWESS-smooth the q-th percentile of ``var`` and clip it at zero."""
            smoothed = lowess(np.percentile(var, q, axis=0), x_axis,
                              frac=1./5, return_sorted=False)
            # Smoothing can dip below zero; floor the band at 0.
            return np.maximum(smoothed, 0)

        # ~68% band (one standard deviation): 16th to 84th percentile
        plt.fill_between(x_axis, _smoothed_percentile(16), _smoothed_percentile(84),
                         color=c, alpha=0.2)

        # ~95% band (two standard deviations): 2.5th to 97.5th percentile.
        # The previous 5th/95th pair only covered 90%, contradicting the
        # "two standard deviation 95%" label.
        plt.fill_between(x_axis, _smoothed_percentile(2.5), _smoothed_percentile(97.5),
                         color=c, alpha=0.1)

    x_obs, y_obs = model.data[0], model.data[1]
    if test == 'Two_samples_test' and model_index == 1:
        # First half of the rows in blue, second half in green
        # (presumably the two samples stacked one after the other -- verify).
        half = x_obs.shape[0] // 2
        plt.scatter(x_obs[:half], y_obs[:half], s=30, marker='o', color='royalblue', alpha=1.)  # data
        plt.scatter(x_obs[half:], y_obs[half:], s=30, marker='o', color='green', alpha=1.)  # data
    else:
        plt.scatter(x_obs, y_obs, s=30, marker='o', color=c, alpha=1.)

    # Model 2 of a two-sample test is not shown immediately, so the next call
    # keeps drawing on the same figure.
    if not (test == 'Two_samples_test' and model_index == 2):
        plt.show()
    

In [5]:
# Gene used by the cells below.
genes_name = ['FBgn0003638']
# The rest of the gene list is disabled by wrapping it in a string literal.
# Being the cell's last expression, that string is echoed as the cell output.
'''
,
 'FBgn0004237',
 'FBgn0010226',
 'FBgn0010504',
 'FBgn0011888',
 'FBgn0011944',
 'FBgn0011994',
 'FBgn0013679',
 'FBgn0013681',
 'FBgn0013684',
 'FBgn0015791',
 'FBgn0022710']
 '''

"\n,\n 'FBgn0004237',\n 'FBgn0010226',\n 'FBgn0010504',\n 'FBgn0011888',\n 'FBgn0011944',\n 'FBgn0011994',\n 'FBgn0013679',\n 'FBgn0013681',\n 'FBgn0013684',\n 'FBgn0015791',\n 'FBgn0022710']\n "

In [6]:
from GPcounts.GPcounts_Module import Fit_GPcounts

# One-sample test on the selected genes with a negative binomial likelihood.
likelihood = 'Negative_binomial'
gp_counts = Fit_GPcounts(X, Y.loc[genes_name])
one_sample_results = gp_counts.One_sample_test(likelihood)
# The test returns a pair; the first element is the table displayed below.
log_likelihood_ratio, var_ratio = one_sample_results
log_likelihood_ratio

  0%|          | 0/1 [00:00<?, ?it/s]

{'ls': 3.0, 'var': 1.5, 'alpha': 5.0, 'km': 35.0}
{'ls': 1000.0, 'var': 1.5, 'alpha': 0.024501052284063463, 'km': 35.0}


100%|██████████| 1/1 [00:06<00:00,  6.55s/it]


Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio
FBgn0003638,-116.184762,-116.184008,-0.000754


In [None]:
test = 'One_sample_test'  # name of the test
likelihood = 'Negative_binomial'
indexes = genes_name  # list of genes to be plotted

# points to make prediction: 100 values spanning [min(X) - 0.1, max(X) + 0.1]
x_lo = np.min(X.values) - .1
x_hi = np.max(X.values) + .1
xtest = np.linspace(x_lo, x_hi, 100)[:, None]
params = gp_counts.load_models(indexes, test, xtest, likelihood)

for i, gene in enumerate(indexes):
    fig = plt.figure()
    print(gene)
    # plot() reads mean, var, model and model_index from the notebook namespace,
    # so these loop names must stay exactly as they are.
    model_index = 1
    fitted = zip(params['means'][i], params['vars'][i], params['models'][i])
    for mean, var, model in fitted:
        plot()
        model_index += 1
        gpflow.utilities.print_summary(model, fmt='notebook')

In [7]:
# Repeat the one-sample test on the existing gp_counts object, now with a
# Gaussian likelihood.
likelihood = 'Gaussian'
gaussian_results = gp_counts.One_sample_test(likelihood)
log_likelihood_ratio, var_ratio = gaussian_results
log_likelihood_ratio

  0%|          | 0/1 [00:00<?, ?it/s]

{'ls': 3.0, 'var': 1.5, 'alpha': 5.0, 'km': 35.0}


100%|██████████| 1/1 [00:00<00:00,  1.25it/s]

{'ls': 1000.0, 'var': 1.5, 'alpha': 5.0, 'km': 35.0}





Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio
FBgn0003638,1.230009,1.230142,-0.000133


In [None]:
# Reload the fitted models for the current likelihood; indexes, test and xtest
# are the values set by the earlier plotting cell.
params = gp_counts.load_models(indexes, test, xtest, likelihood)

for i, gene in enumerate(indexes):
    fig = plt.figure()
    print(gene)
    # plot() reads model, mean, var and model_index from the notebook namespace.
    model_index = 1
    per_model = zip(params['models'][i], params['means'][i], params['vars'][i])
    for model, mean, var in per_model:
        plot()
        model_index += 1
        gpflow.utilities.print_summary(model, fmt='notebook')

In [None]:
# Larger gene set for the one-sample test.
genes_name = [
 'FBgn0000083',
 'FBgn0000108',
 'FBgn0000109',
 'FBgn0000115',
 'FBgn0000221',
 'FBgn0000289',
 'FBgn0000318',
 'FBgn0000464',
 'FBgn0000618',
 'FBgn0001099',
 'FBgn0001202',
 'FBgn0001276',
 'FBgn0001977',
 'FBgn0001995',
 'FBgn0003189']
from GPcounts.GPcounts_Module import Fit_GPcounts
likelihood = 'Negative_binomial'
gp_counts = Fit_GPcounts(X, Y.loc[genes_name])
# One_sample_test returns a pair (the executed cells above unpack it into two
# names). Binding the whole pair to log_likelihood_ratio would break the
# later `log_likelihood_ratio.index.values` lookup, so unpack it here too.
log_likelihood_ratio, var_ratio = gp_counts.One_sample_test(likelihood)
log_likelihood_ratio

In [None]:
test = 'One_sample_test'  # name of the test
likelihood = 'Negative_binomial'
indexes = log_likelihood_ratio.index.values  # list of genes to be plotted

# points to make prediction: 100 values spanning [min(X) - 0.1, max(X) + 0.1]
x_lo = np.min(X.values) - .1
x_hi = np.max(X.values) + .1
xtest = np.linspace(x_lo, x_hi, 100)[:, None]
params = gp_counts.load_models(indexes, test, xtest, likelihood)

for i, gene in enumerate(indexes):
    fig = plt.figure()
    print(gene)
    # plot() reads mean, var, model and model_index from the notebook namespace.
    model_index = 1
    fitted = zip(params['means'][i], params['vars'][i], params['models'][i])
    for mean, var, model in fitted:
        plot()
        model_index += 1
        gpflow.utilities.print_summary(model, fmt='notebook')

In [None]:
# Gaussian-likelihood one-sample test on the existing gp_counts object.
likelihood = 'Gaussian'
# One_sample_test returns a pair (the executed cells above unpack it into two
# names); unpack it so log_likelihood_ratio is the results table rather than
# the whole pair.
log_likelihood_ratio, var_ratio = gp_counts.One_sample_test(likelihood)
log_likelihood_ratio

In [None]:
# Reload the fitted models for the current likelihood; indexes, test and xtest
# are the values set by the preceding plotting cell.
params = gp_counts.load_models(indexes, test, xtest, likelihood)

for i, gene in enumerate(indexes):
    fig = plt.figure()
    print(gene)
    # plot() reads model, mean, var and model_index from the notebook namespace.
    model_index = 1
    per_model = zip(params['models'][i], params['means'][i], params['vars'][i])
    for model, mean, var in per_model:
        plot()
        model_index += 1
        gpflow.utilities.print_summary(model, fmt='notebook')

In [None]:
# Poisson-likelihood one-sample test on the current gene list.
likelihood = 'Poisson'
#genes_name = ['FBgn0004185','FBgn0001977']
gp_counts = Fit_GPcounts(X, Y.loc[genes_name])
# One_sample_test returns a pair (the executed cells above unpack it into two
# names). Binding the whole pair to log_likelihood_ratio would break the
# later `log_likelihood_ratio.index.values` lookup, so unpack it here too.
log_likelihood_ratio, var_ratio = gp_counts.One_sample_test(likelihood)
log_likelihood_ratio

In [None]:
test = 'One_sample_test'  # name of the test
likelihood = 'Poisson'
indexes = log_likelihood_ratio.index.values  # list of genes to be plotted

# points to make prediction: 100 values spanning [min(X) - 0.1, max(X) + 0.1]
x_lo = np.min(X.values) - .1
x_hi = np.max(X.values) + .1
xtest = np.linspace(x_lo, x_hi, 100)[:, None]
params = gp_counts.load_models(indexes, test, xtest, likelihood)

for i, gene in enumerate(indexes):
    fig = plt.figure()
    print(gene)
    # plot() reads mean, var, model and model_index from the notebook namespace.
    model_index = 1
    fitted = zip(params['means'][i], params['vars'][i], params['models'][i])
    for mean, var, model in fitted:
        plot()
        model_index += 1
        gpflow.utilities.print_summary(model, fmt='notebook')