Nuha BinTayyash, 2020

This notebook shows the ROC and precision-recall curves resulting from running the GPcounts one-sample test on simulated bulk RNA-seq datasets.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
import pandas as pd
from scipy import stats

In [2]:
# Load the DESeq2 results and the GPcounts tables for the negative binomial and
# Gaussian likelihoods; the first CSV column becomes the row index of each frame.
DESeq2 = pd.read_csv('alpha_DESeq2.csv', index_col=[0])
GPcounts_NB = pd.read_csv('ll_Negative_binomial_normalized_alpha_counts.csv', index_col=[0])
GPcounts_G = pd.read_csv('ll_Gaussian_normalized_alpha_counts.csv', index_col=[0])

# Report (rows, columns) for each table, one per line.
print(DESeq2.shape, GPcounts_NB.shape, GPcounts_G.shape, sep='\n')

(18274, 6)
(18274, 5)
(18274, 5)


Remove the genes whose adjusted p-value (`padj`) is NaN from the DESeq2 results.

In [3]:
# Collect the row labels of genes whose DESeq2 adjusted p-value is missing, then
# drop those rows. nan_gene is kept so other tables can be filtered the same way.
nan_gene = list(DESeq2.index[DESeq2['padj'].isna()].values)
DESeq2 = DESeq2.drop(index=nan_gene)
DESeq2.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
ENSMUSG00000000001,123.914854,-1.450563,0.332834,20.681661,5e-06,0.000102
ENSMUSG00000000028,8.918803,-4.498371,1.855694,6.261511,0.012339,0.069156
ENSMUSG00000000031,0.515605,0.154787,1.565667,0.005131,0.942894,0.984825
ENSMUSG00000000037,1.4689,-2.194759,2.30994,1.013488,0.314069,0.606066
ENSMUSG00000000049,0.479602,-4.598312,4.380796,1.90978,0.166988,0.425824


In [None]:
# Remove the genes listed in nan_gene from the negative binomial GPcounts table.
GPcounts_NB = GPcounts_NB.drop(index=nan_gene)
GPcounts_NB.head()

In [None]:
# Remove the genes listed in nan_gene from the Gaussian GPcounts table.
GPcounts_G = GPcounts_G.drop(index=nan_gene)
GPcounts_G.head()

DESeq2 and GPcounts with the NB likelihood identify a similar set of top four genes.

In [None]:
# Rank genes from most to least significant under each method: ascending adjusted
# p-value for DESeq2, descending log-likelihood ratio for GPcounts. The sorted
# frames are real variables (not commented-out code) because later cells use them.
DESeq_sorted = DESeq2.sort_values(by=['padj'])
GPcounts_NB_sorted = GPcounts_NB.sort_values(by=['log_likelihood_ratio'], ascending=False)
GPcounts_G_sorted = GPcounts_G.sort_values(by=['log_likelihood_ratio'], ascending=False)

# Show the top of each ranking together so the gene lists can be compared.
display(DESeq_sorted.head(), GPcounts_NB_sorted.head(), GPcounts_G_sorted.head())

#### Label genes as differentially expressed using the DESeq2 adjusted p-value (padj)

In [None]:
padj = [.05, .2, .5]  # DESeq2 adjusted p-value thresholds that define the positive class

# GPcounts log-likelihood ratios serve as the ranking score, one Series per likelihood.
dfs = [GPcounts_NB['log_likelihood_ratio'], GPcounts_G['log_likelihood_ratio']]
titles = ['GPcounts with NB likelihood', 'GPcounts with Gaussian likelihood']

for scores, title in zip(dfs, titles):
    # Select scores by gene ID in DESeq2's row order so that each label is paired with
    # the score of the same gene; a gene missing from the GPcounts table raises KeyError.
    scores = scores.loc[DESeq2.index]

    # One figure per likelihood, holding one precision-recall curve per threshold.
    fig, ax = plt.subplots()
    for cutoff in padj:
        # A gene is labelled differentially expressed (1) when DESeq2 gives it an
        # adjusted p-value at or below the cutoff, otherwise 0. The comparison is
        # vectorised, so no integer indexing into the gene-labelled Series is needed.
        true_label = (DESeq2['padj'] <= cutoff).astype(int)

        precision, recall, thresholds = metrics.precision_recall_curve(true_label, scores)
        ax.plot(recall, precision, label='adjusted p-value=' + str(cutoff))

    ax.set_xlabel('recall')
    ax.set_ylabel('precision')
    ax.set_title(title)
    ax.set_ylim([-.1, 1.1])
    ax.tick_params(labelsize='large', width=2)
    ax.legend(loc='best')
    plt.show()


In [None]:
# Fractions of the DESeq2 ranking to compare: top 10%, 20%, ..., 100% of genes.
percentage = np.arange(1, 11) * .10

# Rank genes from most to least significant under each method. The rankings are
# rebuilt here from the filtered tables so this cell also runs on a fresh kernel.
DESeq_sorted = DESeq2.sort_values(by=['padj'])
GPcounts_NB_sorted = GPcounts_NB.sort_values(by=['log_likelihood_ratio'], ascending=False)
GPcounts_G_sorted = GPcounts_G.sort_values(by=['log_likelihood_ratio'], ascending=False)

DESeq_id = list(DESeq_sorted.index.values)
GPcounts_NB_id = list(GPcounts_NB_sorted.index)
GPcounts_G_id = list(GPcounts_G_sorted.index)

# Map gene ID -> position in each GPcounts ranking once, instead of scanning the
# ranked list for every gene. setdefault keeps the first position of a repeated ID,
# which is the position list.index would return.
rank_NB = {}
for position, gene in enumerate(GPcounts_NB_id):
    rank_NB.setdefault(gene, position)
rank_G = {}
for position, gene in enumerate(GPcounts_G_id):
    rank_G.setdefault(gene, position)

corr_NB, corr_G = [], []    # Spearman correlation of the two rankings, per fraction
inter_NB, inter_G = [], []  # share of the top-k DESeq2 genes that are also in the GPcounts top k

methods = [
    ('GPcounts with NB likelihood', rank_NB, corr_NB, inter_NB),
    ('GPcounts with Gaussian likelihood', rank_G, corr_G, inter_G),
]

for fraction in percentage:
    top_k = int(len(DESeq_id) * fraction)
    DESeq_id_part = DESeq_id[:top_k]
    # Positions of the selected genes in the DESeq2 ranking are simply 0..top_k-1.
    DESeq_index_part = list(range(top_k))

    for name, rank_lookup, corr, inter in methods:
        if top_k == 0:
            # Too few genes for this fraction: record NaN rather than dividing by zero.
            corr.append(np.nan)
            inter.append(np.nan)
            continue

        # Position of each selected gene in the GPcounts ranking; a gene that is
        # absent from the GPcounts table raises KeyError.
        GPcounts_index_part = [rank_lookup[gene] for gene in DESeq_id_part]

        # A GPcounts position below top_k means the gene is in both top-k sets.
        overlap = len(set(DESeq_index_part).intersection(GPcounts_index_part))
        inter.append(overlap / top_k)

        rho, pval = stats.spearmanr(DESeq_index_part, GPcounts_index_part)
        corr.append(rho)
        print(f'top {fraction:.0%} | {name}: overlap = {overlap / top_k * 100.:.2f}%, Spearman rho = {rho:.4f}')

        fig, ax = plt.subplots()
        ax.scatter(DESeq_index_part, GPcounts_index_part, s=5, alpha=0.5)
        ax.set_xlabel('DESeq2 rank')
        ax.set_ylabel('GPcounts rank')
        ax.set_title(f'{name}, top {fraction:.0%} of DESeq2 genes')
        plt.show()
        plt.close(fig)  # release the figure; this loop creates 20 of them

In [None]:
# Spearman correlation between the DESeq2 ranking and each GPcounts ranking, plotted
# against the percentage of top DESeq2 genes included in the comparison.
# One figure per likelihood; color=None keeps matplotlib's default colour.
correlation_series = [
    (corr_NB, None, 'GPcount with Negative Binomial likelihood'),
    (corr_G, 'darkorange', 'GPcount with Gaussian likelihood'),
]

for corr, color, label in correlation_series:
    fig, ax = plt.subplots()
    ax.scatter(percentage * 100, corr, s=30, color=color, label=label)
    ax.set_xticks(percentage * 100)
    ax.set_ylim(0., 1.)
    ax.set_xlabel('percentage of dataset')
    ax.set_ylabel('Spearman correlation')
    ax.tick_params(labelsize='large', width=2)
    ax.legend(loc='lower left')
    plt.show()