# One sample test on simulated bulk RNA-seq

Nuha BinTayyash, 2020

This notebook shows how to run the GPcounts one-sample test on simulated bulk RNA-seq datasets.

In [1]:
import numpy as np
import pandas as pd
from GPcounts.GPcounts_Module import Fit_GPcounts

In [None]:
# Simulated count tables to process, one CSV per counts/dispersion setting
# (low/high counts x low/high dispersion, as encoded in the file names).
files = [
    'low_counts_low_dispersion_non_differentially_expressed_genes_DE_non_DE_genes.csv',
    'low_counts_high_dispersion_non_differentially_expressed_genes_DE_non_DE_genes.csv',
    'high_counts_low_dispersion_non_differentially_expressed_genes_DE_non_DE_genes.csv',
    'high_counts_high_dispersion_non_differentially_expressed_genes_DE_non_DE_genes.csv',
]

# The same X table is paired with every count table below.
X = pd.read_csv('time_points.csv', index_col=[0])

for file in files:
    print(file)
    Y = pd.read_csv(file, index_col=[0])
    gp_counts = Fit_GPcounts(X, Y)
    # Run the one-sample test once per likelihood and save each result table
    # next to the input as "ll_<likelihood>_<input file name>".
    likelihoods = ['Negative_binomial', 'Gaussian', 'Poisson']
    for likelihood in likelihoods:
        log_likelihood = gp_counts.One_sample_test(likelihood)
        log_likelihood.to_csv(f"ll_{likelihood}_{file}")

  0%|          | 0/300 [00:00<?, ?it/s]

low_counts_low_dispersion_non_differentially_expressed_genes_DE_non_DE_genes.csv


 82%|████████▏ | 245/300 [23:39<03:17,  3.59s/it] 

In [None]:
# Load the X table and the count table used by the single-gene cells below.
# NOTE: "disperison" is spelled this way in the path on purpose; it has to
# match the name of the CSV file on disk.
X = pd.read_csv('time_points.csv', index_col=[0])
Y = pd.read_csv(
    'low_counts_low_disperison_samples_dynamic_constant.csv',
    index_col=[0],
)

In [None]:
from GPcounts.GPcounts_Module import Fit_GPcounts

# Run the one-sample test on a single gene with the negative binomial
# likelihood. Further candidate genes (currently disabled):
# 'gene_215', 'gene_219', 'gene_263'.
genes = ['gene_137']
likelihood = 'Negative_binomial'

gp_counts = Fit_GPcounts(X, Y.loc[genes])
log_likelihood = gp_counts.One_sample_test(likelihood)
log_likelihood  # bare expression so the notebook displays the result table

In [None]:
# Repeat the one-sample test for the same gene with the Gaussian likelihood.
# The gp_counts object built in the previous cell is reused here; it is
# deliberately not re-created from X and Y.loc[genes].
# Further candidate genes (currently disabled):
# 'gene_215', 'gene_219', 'gene_263'.
genes = ['gene_137']
likelihood = 'Gaussian'

log_likelihood = gp_counts.One_sample_test(likelihood)
log_likelihood  # bare expression so the notebook displays the result table

In [None]:
import numpy as np
from matplotlib import pyplot as plt
import statsmodels.api as sm

import statsmodels.api as sm

def plot(likelihood, xtest, mean, var, gene=None):
    """Plot the observed counts of one gene together with a fitted curve.

    The observations are read from the notebook-level tables ``X`` and ``Y``;
    the curve and its shaded band come from the arguments.

    Parameters
    ----------
    likelihood : str
        ``'Gaussian'`` draws the data on a ``log(count + 1)`` scale with a
        ``mean +/- 2 * sqrt(var)`` band. Any other value draws the raw counts
        with a band between the LOWESS-smoothed 5th and 95th percentiles of
        ``var`` taken along axis 0.
    xtest : numpy.ndarray
        2-D array whose first column holds the x positions of the curve.
    mean : numpy.ndarray
        Curve values at ``xtest``; indexed as ``mean[:, 0]`` in the Gaussian
        branch.
    var : numpy.ndarray
        Gaussian branch: indexed as ``var[:, 0]`` and used as a variance.
        Other branches: presumably an array of samples with one row per draw
        and one column per ``xtest`` point -- TODO confirm against
        ``load_and_sample_models``.
    gene : hashable, optional
        Row label of ``Y`` to plot. When omitted, the function falls back to
        the notebook globals ``indexes[i]`` (the driver loop's current gene),
        which keeps existing four-argument calls working.
    """
    if gene is None:
        # Backward-compatible fallback: earlier versions always read the gene
        # from the notebook-level list `indexes` and loop counter `i`.
        gene = indexes[i]

    plt.figure()
    plt.ylabel('Gene Expression', fontsize=16)
    plt.xlabel('Times', fontsize=16)

    counts = Y.loc[[gene]].values

    if likelihood == 'Gaussian':
        # Show the observations on the log(count + 1) scale in this branch.
        y = np.log(counts + 1)
        plt.fill_between(xtest[:, 0],
                         mean[:, 0] - 2*np.sqrt(var[:, 0]),
                         mean[:, 0] + 2*np.sqrt(var[:, 0]), alpha=0.2)
    else:
        y = counts
        lowess = sm.nonparametric.lowess
        # Smooth the percentile curves before shading the band between them.
        percentile_5 = lowess(np.percentile(var, 5, axis=0), xtest[:, 0],
                              frac=1./5, return_sorted=False)
        percentile_95 = lowess(np.percentile(var, 95, axis=0), xtest[:, 0],
                               frac=1./5, return_sorted=False)
        plt.fill_between(xtest[:, 0], percentile_5, percentile_95, alpha=0.2)

    plt.scatter(X.values, y, s=10, color='black', alpha=0.6)  # observed data
    plt.plot(xtest, mean, lw=2)  # fitted curve
    plt.show()

# Plot configuration.
indexes = genes            # genes to plot
test = 'One_sample_test'   # name of the test passed to load_and_sample_models
sample = True              # flag passed to load_and_sample_models (sample and/or load model)

# 100 evenly spaced prediction points spanning the range of X, as a column vector.
xtest = np.linspace(X.values.min(), X.values.max(), 100).reshape(-1, 1)

# NOTE: keep the loop counter named `i` at notebook level -- plot() looks up
# the current gene through the globals `indexes` and `i`.
for i, gene in enumerate(indexes):
    print(gene)
    params = gp_counts.load_and_sample_models(gene, test, xtest, likelihood, sample)
    # One figure per returned (mean, var, model) triple.
    for mean, var, model in zip(params['means'], params['vars'], params['models']):
        mean = np.array(mean)
        var = np.array(var)
        plot(likelihood, xtest, mean, var)
        