Nuha BinTayyash, 2020

This notebook shows the ROC and precision-recall curves resulted from running GPcounts with one sample test on simulated bulk RNA-seq datasets.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
import pandas as pd
from scipy import stats

In [2]:
DESeq2 = pd.read_csv('fission_DESeq2_tst.csv',index_col=[0])
print(DESeq2.shape)
GPcounts_NB = pd.read_csv('ll_Negative_binomial_fission_normalized_counts.csv',index_col=[0])
GPcounts_G = pd.read_csv('ll_Gaussian_fission_normalized_counts.csv',index_col=[0])
print(GPcounts_NB.shape)
print(GPcounts_G.shape)

(6459, 6)
(6459, 4)
(6459, 4)


In [3]:
DESeq_sorted = DESeq2.sort_values(by=['pvalue'])
DESeq_sorted

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
SPBC2F12.09c,174.671162,-2.657185,0.751020,98.267454,1.224945e-19,7.911920e-16
SPAC1002.18,444.504950,-0.051168,0.203134,57.672042,3.675626e-11,1.187043e-07
SPAC1002.19,336.373207,-0.392611,0.576457,42.999342,3.696211e-08,7.957941e-05
SPAC1002.17c,261.773133,-1.138898,0.608548,38.940974,2.440696e-07,3.941114e-04
SPNCRNA.1628,21.638834,0.201416,0.511589,28.035566,3.582068e-05,4.189223e-02
...,...,...,...,...,...,...
SPCPJ732.01,228.644608,0.105057,0.334712,0.139283,9.996335e-01,9.999412e-01
SPAC25H1.08c,1329.688464,-0.011105,0.183171,0.114618,9.997729e-01,9.999412e-01
SPAC29A4.05,456.659427,0.025396,0.161020,0.113559,9.997780e-01,9.999412e-01
SPCC1919.12c,741.087681,0.042403,0.210459,0.102344,9.998281e-01,9.999412e-01


In [4]:
GPcounts_NB_sorted = GPcounts_NB.sort_values(by=['log_likelihood_ratio'], ascending=False)
GPcounts_NB_sorted

Unnamed: 0,Shared_log_likelihood,model_1_log_likelihood,model_2_log_likelihood,log_likelihood_ratio
SPNCRNA.1091,-3.232678e+29,-76.218042,-6.546611e+01,3.232678e+29
SPBC3F6.01c,-6.330996e+08,-110.821979,-1.124976e+02,6.330994e+08
SPCC1827.04,-5.055079e+08,-106.768377,-1.121822e+02,5.055077e+08
SPAC22F8.11,-7.953822e+07,-121.042404,-1.220908e+02,7.953798e+07
SPBC691.01,-1.608270e+02,761.747301,-8.607271e+01,8.365016e+02
...,...,...,...,...
SPAC22A12.07c,-2.585124e+02,-151.619864,-1.445514e+02,-3.765890e+01
SPCC970.07c,-1.964614e+02,-109.148345,-1.255955e+02,-3.828249e+01
SPAC3C7.01c,-2.070735e+02,-107.583349,-1.529333e+02,-5.344315e+01
SPNCRNA.1108,-9.790178e+01,-43.716623,-1.335765e+08,-1.335764e+08


In [18]:
GPcounts_NB_sorted.index.values

array(['SPNCRNA.1091', 'SPBC3F6.01c', 'SPCC1827.04', ..., 'SPAC3C7.01c',
       'SPNCRNA.1108', 'SPAC3G6.11'], dtype=object)

In [None]:
genes_name = ['SPNCRNA.750','SPBTRNAARG.07','SPNCRNA.735','SPBPB2B2.06c','SPAC869.06c','SPCC1281.04','SPBC1711.14','SPAC11D3.01c']
GPcounts_NB.loc[genes_name]

In [None]:
GPcounts_G.loc[genes_name]

In [5]:
D = GPcounts_NB.shape[0]
true_label = np.zeros(D)
for j in range(D):
    if GPcounts_NB['log_likelihood_ratio'][j] > 0:
        true_label[j] = 1
labels = pd.DataFrame(true_label, index =GPcounts_NB.index.values, columns =['label'])
GPcounts_NB = pd.concat([GPcounts_NB,labels],axis = 1)

In [23]:
TP_genes_NB = list(GPcounts_NB.loc[(GPcounts_NB['log_likelihood_ratio'] > 0)&(GPcounts_NB['label']==1.0)].index.values)
len(TP_genes_NB)


700

In [24]:
FP_genes_G = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] < 0)&(GPcounts_NB['label']==1.0)].index.values)
FP_genes_G

['SPAC1F8.06',
 'SPAC11D3.01c',
 'SPAC11D3.02c',
 'SPNCRNA.610',
 'SPAC806.11',
 'SPNCRNA.627',
 'SPNCRNA.638',
 'SPAC12G12.03',
 'SPATRNAPRO.01',
 'SPAC31A2.06',
 'SPNCRNA.653',
 'SPAC227.10',
 'SPNCRNA.676',
 'SPNCRNA.696',
 'SPNCRNA.220',
 'SPNCRNA.704',
 'SPNCRNA.710',
 'SPNCRNA.717',
 'SPNCRNA.730',
 'SPNCRNA.741',
 'SPAC5D6.10c',
 'SPAC5D6.02c',
 'SPAC167.08',
 'SPATRNALYS.01',
 'SPAC17A5.14',
 'SPAC1399.04c',
 'SPAP11E10.02c',
 'SPAC110.04c',
 'SPNCRNA.786',
 'SPAC3C7.04',
 'SPAC23C11.07',
 'SPNCRNA.187',
 'SPAC13F5.04c',
 'SPNCRNA.809',
 'SPAC6C3.05',
 'SPNCRNA.827',
 'SPAC823.02',
 'SPAC644.18c',
 'SPNCRNA.847',
 'SPACUNK4.20',
 'SPAPB24D3.10c',
 'SPNCRNA.874',
 'SPATRNATHR.04',
 'SPAC6G9.15c',
 'SPNCRNA.900',
 'SPNCRNA.903',
 'SPAC17A2.11',
 'SPAC17G6.04c',
 'SPAC17G6.06',
 'SPNCRNA.921',
 'SPAC15A10.06',
 'SPNCRNA.927',
 'SPNCRNA.941',
 'SPAC2F3.11',
 'SPNCRNA.240',
 'SPNCRNA.995',
 'SPAC637.04',
 'SPAC637.06',
 'SPNCRNA.1000',
 'SPNCRNA.1006',
 'SPAC11H11.02c',
 'SPAC11H11.

In [25]:
# SPAPB1A11.02,SPBC20F10.08c,SPNCRNA.750,SPAC15F9.02,SPNCRNA.827,SPAC1B3.16c
best =set(FP_genes_G).intersection(TP_genes_NB)
list(best)

['SPNCRNA.627',
 'SPAC2F3.11',
 'SPBC3B8.08',
 'SPBC1D7.02c',
 'SPNCRNA.1068',
 'SPNCRNA.51',
 'SPCC18B5.11c',
 'SPAC6F12.05c',
 'SPSNRNA.06',
 'SPCC830.06',
 'SPATRNATHR.04',
 'SPNCRNA.1655',
 'SPNCRNA.1620',
 'SPAC1783.06c',
 'SPNCRNA.918',
 'SPBC14C8.16c',
 'SPNCRNA.873',
 'SPAC4D7.01c',
 'SPCC70.05c',
 'SPBC31A8.02',
 'SPNCRNA.1242',
 'SPNCRNA.696',
 'SPAC3C7.04',
 'SPNCRNA.187',
 'SPNCRNA.1558',
 'SPAC17G6.04c',
 'SPNCRNA.1043',
 'SPCC1840.12',
 'SPCC757.02c',
 'SPBC36.12c',
 'SPNCRNA.463',
 'SPAC688.12c',
 'SPNCRNA.68',
 'SPCC794.16',
 'SPNCRNA.1460',
 'SPNCRNA.939',
 'SPAC4D7.07c',
 'SPBC557.03c',
 'SPNCRNA.874',
 'SPBP4H10.13',
 'SPAC212.08c',
 'SPNCRNA.1649',
 'SPNCRNA.556',
 'SPBC1348.11',
 'SPBC18E5.01',
 'SPNCRNA.1495',
 'SPAPB1A11.02',
 'SPNCRNA.786',
 'SPNCRNA.1090',
 'SPAC644.18c',
 'SPBC2D10.07c',
 'SPNCRNA.417',
 'SPNCRNA.710',
 'SPNCRNA.244',
 'SPNCRNA.730',
 'SPNCRNA.1201',
 'SPAPB2B4.02',
 'SPNCRNA.1304',
 'SPNCRNA.729',
 'SPBC119.04',
 'SPNCRNA.1070',
 'SPBC211.03c

In [26]:
GPcounts_NB.loc[best]

Unnamed: 0,Shared_log_likelihood,model_1_log_likelihood,model_2_log_likelihood,log_likelihood_ratio,label
SPNCRNA.627,-82.278874,-37.912559,-43.241619,1.124695,1.0
SPAC2F3.11,-225.027408,-115.619943,-109.284148,0.123317,1.0
SPBC3B8.08,-180.073407,-92.163217,-84.654302,3.255888,1.0
SPBC1D7.02c,-209.209378,-106.990019,-101.753165,0.466194,1.0
SPNCRNA.1068,-105.815114,-56.145403,-47.881333,1.788378,1.0
...,...,...,...,...,...
SPCC338.10c,-252.315904,-115.192832,-121.005598,16.117474,1.0
SPAC212.06c,-61.254787,-20.552141,-38.471932,2.230713,1.0
SPAC17A2.11,-53.692605,-25.788862,-24.356107,3.547636,1.0
SPAC29A4.09,-208.445163,-101.566743,-91.500948,15.377473,1.0


In [27]:
GPcounts_G.loc[best]

Unnamed: 0,Shared_log_likelihood,model_1_log_likelihood,model_2_log_likelihood,log_likelihood_ratio
SPNCRNA.627,-38.958886,-20.390502,-21.056779,-2.488395
SPAC2F3.11,12.590653,1.761873,6.110737,-4.718042
SPBC3B8.08,-7.086775,-13.859269,4.458896,-2.313599
SPBC1D7.02c,-25.791366,-16.126325,-14.011708,-4.346667
SPNCRNA.1068,-44.390026,-23.225708,-21.303965,-0.139647
...,...,...,...,...
SPCC338.10c,18.977713,9.702005,5.323112,-3.952596
SPAC212.06c,-44.318343,-21.386138,-23.811690,-0.879485
SPAC17A2.11,-36.924379,-22.419619,-14.984181,-0.479421
SPAC29A4.09,28.114055,8.204902,16.412739,-3.496414


In [11]:
GPcounts_G_sorted = GPcounts_G.sort_values(by=['log_likelihood_ratio'], ascending=False)
GPcounts_G_sorted

Unnamed: 0,Shared_log_likelihood,model_1_log_likelihood,model_2_log_likelihood,log_likelihood_ratio
SPAC186.05c,-71.918127,-37.138320,1.571749,36.351556
SPBC23G7.10c,-73.918640,-35.620948,-20.115951,18.181741
SPAC823.10c,-43.952972,-24.726198,-1.137123,18.089652
SPAC869.01,-42.050219,-25.453980,1.000095,17.596334
SPNCRNA.1628,-48.976339,-27.811865,-4.185531,16.978942
...,...,...,...,...
SPBC14C8.14c,17.279136,-3.442629,-4.419805,-25.141570
SPBC651.08c,22.856545,-2.674657,0.256785,-25.274417
SPAC4A8.16c,21.463603,-3.284543,-0.626030,-25.374176
SPBC4C3.05c,22.426259,-0.398700,-2.817688,-25.642648


In [None]:
percentage = np.array(list(range(1,11)))
percentage = percentage * .10
DESeq_id = list(DESeq_sorted.index.values)
GPcounts_NB_id = list(GPcounts_NB_sorted.index)
GPcounts_G_id = list(GPcounts_G_sorted.index)
corr_NB = []
corr_G = []

for count in range(10):
    print(percentage[count])
    print(int(len(DESeq_id) * percentage[count]))
    DESeq_id_part= DESeq_id[int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])]
    '''
    GPcounts_NB_index_part = []
    GPcounts_G_index_part = []
    for i in DESeq_id_part:
        GPcounts_NB_index_part.append(GPcounts_NB_id.index(i))
        GPcounts_G_index_part.append(GPcounts_G_id.index(i))
    
    DESeq_index_part = list(range(0,len(DESeq_id_part)))
    '''
    rho, pval = stats.spearmanr(DESeq_sorted['padj'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])],GPcounts_NB_sorted['log_likelihood_ratio'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])])
    print(rho,pval)
    corr_NB.append(rho)
    plt.scatter(DESeq_sorted['padj'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])] ,GPcounts_NB_sorted['log_likelihood_ratio'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])],s=5,alpha=0.5)
    plt.show() 
    
    rho, pval = stats.spearmanr(DESeq_sorted['padj'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])],GPcounts_G_sorted['log_likelihood_ratio'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])])
    print(rho,pval)
    corr_G.append(rho)
    plt.scatter(DESeq_sorted['padj'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])] ,GPcounts_G_sorted['log_likelihood_ratio'][int(len(DESeq_id) * .0) : int(len(DESeq_id) * percentage[count])],s=5,alpha=0.5)
    plt.show() 
    '''
    rho, pval = stats.spearmanr(DESeq_index_part,GPcounts_G_index_part)
    print(rho)
    corr_G.append(rho)
    plt.scatter(DESeq_index_part ,GPcounts_G_index_part,s=5,alpha=0.5)
    plt.show() 
    '''

In [None]:
plt.scatter(percentage,corr_NB,s=10,label = 'NB')
plt.scatter(percentage,corr_G,s=10,label = 'Gaussian')
plt.xlabel('percentage')
plt.ylabel('Spearman correlation')
plt.legend(bbox_to_anchor=(1, 0), loc='lower left', ncol=1)

sort GPcounts results according to DESeq2 

In [None]:
GPcounts_NB_DESeq_sorted = GPcounts_NB.reindex(list(DESeq_sorted.index.values))
GPcounts_NB_DESeq_sorted 

In [None]:
GPcounts_G_DESeq_sorted = GPcounts_G.reindex(list(DESeq_sorted.index.values))
GPcounts_G_DESeq_sorted

In [None]:
percentage = np.array(list(range(1,11)))
percentage = percentage * .10
fig = plt.figure()

dfs = [GPcounts_G_DESeq_sorted['log_likelihood_ratio']]#,GPcounts_G_DESeq_sorted['log_likelihood_ratio']]
for df in range(len(dfs)):
    for i in range(10):
        D =int(len(DESeq_id) * percentage[i])
        true_label = np.zeros(D)

        for j in range(D):
            if DESeq_sorted['padj'][j] <= .005 :
                true_label[j] = 1

        labels = pd.DataFrame(true_label, index =DESeq_sorted.index.values[0:D], columns =['label'])

        precision_NB, recall_NB, thresholds_NB = metrics.precision_recall_curve(labels['label'],dfs[df][0:D])

        plt.plot( recall_NB,precision_NB,label=str(int(percentage[i]*100))+'%')
        plt.tick_params(labelsize='large', width=2) 
        #plt.xlabel('recall')
        #plt.ylabel('precision')
        #plt.legend(loc='best', bbox_to_anchor=(1.22, .75))
        '''
        if df == 0:
            title = 'GPcounts with NB likelihood'
        else:
            title = 'GPcounts with Gaussian likelihood'
            
        plt.title(title )
        '''
        
    plt.show()


In [None]:
padj = [.05,.1,.2]
DESeq_id = list(DESeq_sorted.index.values)
fig = plt.figure()

dfs = [GPcounts_NB_DESeq_sorted['log_likelihood_ratio'],GPcounts_G_DESeq_sorted['log_likelihood_ratio']]
D = len(DESeq_sorted)

for df in range(len(dfs)):
    D = len(DESeq_sorted)
    for i in range(3):
        true_label = np.zeros(D)
  

        for j in range(D):
            if DESeq_sorted['padj'][j] <= padj[i]:
                true_label[j] = 1

        labels = pd.DataFrame(true_label, index =DESeq_sorted.index.values, columns =['label'])
        
        unique, counts = np.unique(true_label, return_counts=True)
        
        print(dict(zip(unique, counts)))
        precision, recall, thresholds = metrics.precision_recall_curve(labels['label'],dfs[df])

        plt.plot( recall,precision,label='padj='+str(padj[i]))
        plt.tick_params(labelsize='large', width=2) 
        '''
        plt.xlabel('recall')
        plt.ylabel('precision')
        plt.legend(loc='best', bbox_to_anchor=(1.22, .75))
        '''
        if df == 0:
            plt.legend(loc='best', bbox_to_anchor=(1., .65))
        
            #title = 'GPcounts with NB likelihood'
        '''
        else:
            title = 'GPcounts with Gaussian likelihood'
            
        plt.title(title )
        '''
        plt.ylim([-.1,1.1])
        
    plt.show()
