### Nuha BinTayyash, 2020

This notebook shows the ROC and precision-recall curves resulted from running GPcounts with one sample test on simulated bulk RNA-seq datasets.

In [1]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn import metrics
import pandas as pd
from scipy import stats

In [2]:
GPcounts_NB = pd.read_csv('ll_Negative_binomial_exons_counts.csv',index_col=[0])
GPcounts_G = pd.read_csv('ll_Gaussian_exons_counts.csv',index_col=[0])
print(GPcounts_NB.shape)
print(GPcounts_G.shape)

(8217, 5)
(8217, 5)


In [6]:
NegativeBinomial = pd.read_csv('ll_Negative_binomial_exons_counts.csv',index_col=[0])
NegativeBinomial['log_likelihood_ratio'] = pd.to_numeric(NegativeBinomial['log_likelihood_ratio'], errors='coerce')
nan_gene = list(NegativeBinomial[NegativeBinomial['log_likelihood_ratio'].isnull()].index.values)
GPcounts_NB = NegativeBinomial.drop(nan_gene)
GPcounts_NB

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0000003,-73.7899363362756,-68.52892952423885,-5.261007,0:00:09.277584,0
FBgn0000008,-115.45385720660255,-147.467852809965,32.013996,0:00:05.823610,0
FBgn0000014,-49.159336901470965,-79.54083562055467,30.381499,0:00:06.675063,0
FBgn0000015,-26.47604613053277,-38.467021705493,11.990976,0:00:05.362831,0
FBgn0000017,-165.56235815619738,-201.7558911894686,36.193533,0:00:06.847503,0
...,...,...,...,...,...
FBgn0285954,-94.9151034656098,-108.68849549863636,13.773392,0:00:04.923832,1
FBgn0285955,-32.422456924308484,-46.31973778917264,13.897281,0:00:02.990246,0
FBgn0285962,-94.03234694846505,-89.88310397447434,-4.149243,0:00:05.222474,1
FBgn0285971,-131.3216404477003,-473.31464760876383,341.993007,0:00:02.596877,0


In [7]:
Gaussian = pd.read_csv('ll_Gaussian_exons_counts.csv',index_col=[0])
Gaussian['log_likelihood_ratio'] = pd.to_numeric(Gaussian['log_likelihood_ratio'], errors='coerce')
GPcounts_G = Gaussian.drop(nan_gene)
GPcounts_G

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0000003,-12.043035,-11.454129,-0.588906,0:00:00.583296,0
FBgn0000008,-20.419973,-51.580668,31.160696,0:00:00.549918,0
FBgn0000014,-9.007350,-54.453686,45.446335,0:00:00.571156,0
FBgn0000015,-19.377525,-40.122539,20.745015,0:00:00.566677,0
FBgn0000017,-6.891059,-18.298687,11.407628,0:00:00.580477,0
...,...,...,...,...,...
FBgn0285954,-3.320550,-13.022536,9.701986,0:00:00.890007,0
FBgn0285955,-15.405277,-43.393667,27.988391,0:00:00.568858,0
FBgn0285962,16.249576,16.277114,-0.027538,0:00:00.564914,0
FBgn0285971,3.360024,-32.029308,35.389332,0:00:00.590313,0


In [8]:
best  = set(GPcounts_NB.index.values).intersection(GPcounts_G.index.values)
len(list(best))

8207

In [9]:
GPcounts_NB_sorted = GPcounts_NB.sort_values(by=['log_likelihood_ratio'], ascending=False)
GPcounts_NB_sorted

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0037045,2099.3297155155938,-2961968635.71972,2.961971e+09,0:00:06.592187,5
FBgn0031766,2071.367648138938,-1765603119.9278157,1.765605e+09,0:00:14.926573,9
FBgn0036461,1554.9385857865796,-1764000000.0859141,1.764002e+09,0:00:07.509770,3
FBgn0039234,2509.2350672575108,-55981.639525337116,5.849087e+04,0:00:15.432820,5
FBgn0262560,16.839380020378265,-17540.514007082245,1.755735e+04,0:00:08.265485,7
...,...,...,...,...,...
FBgn0034950,-30.646694276037326,39.55562211137547,-7.020232e+01,0:00:02.821623,0
FBgn0263967,-5.792196074903567,81.78189342359987,-8.757409e+01,0:00:08.022736,10
FBgn0083970,-38.31337625726975,73.13915974353716,-1.114525e+02,0:00:08.489511,9
FBgn0263750,-1.5557892889641227,119.06189947541992,-1.206177e+02,0:00:04.584085,2


True Positives - dynamic genes

In [10]:
D = GPcounts_NB.shape[0]
print(D)
true_label = np.zeros(D)
for j in range(D):
    if GPcounts_NB['log_likelihood_ratio'][j] > 0:
        true_label[j] = 1
labels = pd.DataFrame(true_label, index =GPcounts_NB.index.values, columns =['label'])
GPcounts_NB = pd.concat([GPcounts_NB,labels],axis = 1)
GPcounts_NB.head()

8207


Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter,label
FBgn0000003,-73.7899363362756,-68.52892952423885,-5.261007,0:00:09.277584,0,0.0
FBgn0000008,-115.45385720660256,-147.467852809965,32.013996,0:00:05.823610,0,1.0
FBgn0000014,-49.159336901470965,-79.54083562055467,30.381499,0:00:06.675063,0,1.0
FBgn0000015,-26.47604613053277,-38.467021705493,11.990976,0:00:05.362831,0,1.0
FBgn0000017,-165.56235815619738,-201.7558911894686,36.193533,0:00:06.847503,0,1.0


In [12]:
TP_genes_NB = list(GPcounts_NB.loc[(GPcounts_NB['log_likelihood_ratio'] > 0)&(GPcounts_NB['label']==1.0)].index.values)
len(TP_genes_NB)

7254

In [13]:
TP_genes_G =list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] > 0)& (GPcounts_NB['label']==1.0).index.values].index.values)
len(TP_genes_G)

7913

In [14]:
FP_genes_G = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] < 0)&(GPcounts_NB['label']==1.0).index.values].index.values)
len(FP_genes_G)

294

In [15]:
GPcounts_NB.loc[FP_genes_G]

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter,label
FBgn0000003,-73.7899363362756,-68.52892952423885,-5.261007,0:00:09.277584,0,0.0
FBgn0000100,-228.16814037630238,-226.60378016490608,-1.564360,0:00:09.551720,0,0.0
FBgn0000221,-104.63356175182057,-102.2761191582751,-2.357443,0:00:06.211985,0,0.0
FBgn0000416,-192.1956357191592,-189.6846462535236,-2.510989,0:00:19.795151,0,0.0
FBgn0000541,-191.65140359745163,-190.77287263340202,-0.878531,0:00:10.428141,1,0.0
...,...,...,...,...,...,...
FBgn0278598,-72.17552795881366,-66.88103426577555,-5.294494,0:00:02.557526,0,0.0
FBgn0283427,-99.45063616021473,-99.45034236297151,-0.000294,0:00:02.915365,0,0.0
FBgn0284251,-149.786333835635,-138.82726855705886,-10.959065,0:00:11.245870,0,0.0
FBgn0285950,-207.779052923762,-205.9816824928,-1.797370,0:00:06.970728,0,0.0


In [16]:
GPcounts_G.loc[FP_genes_G]

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0000003,-12.043035,-11.454129,-0.588906,0:00:00.583296,0
FBgn0000100,-6.017218,-6.017201,-0.000017,0:00:00.612348,0
FBgn0000221,-8.449259,-6.435773,-2.013487,0:00:00.551754,0
FBgn0000416,8.286033,8.286082,-0.000049,0:00:00.585862,0
FBgn0000541,6.497887,7.196130,-0.698243,0:00:00.769905,0
...,...,...,...,...,...
FBgn0278598,1.121638,1.121660,-0.000022,0:00:00.976666,0
FBgn0283427,-30.194080,-30.194038,-0.000041,0:00:00.600265,0
FBgn0284251,23.673065,25.948161,-2.275096,0:00:00.514875,0
FBgn0285950,17.630836,18.316040,-0.685204,0:00:00.587945,0


True Negatives- Constant genes

In [17]:
TN_genes_NB = list(GPcounts_NB.loc[(GPcounts_NB['log_likelihood_ratio'] < 0)&(GPcounts_NB['label']==0.0)].index.values)
len(TN_genes_NB)

953

In [18]:
TN_genes_G = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] < 0)&(GPcounts_NB['label']==0.0)].index.values)
len(TN_genes_G)

252

In [19]:
FN_genes_G = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] > 0)&(GPcounts_NB['label']==0.0)].index.values)
len(FN_genes_G)

701

In [20]:
FN_genes_G

['FBgn0000045',
 'FBgn0000047',
 'FBgn0000083',
 'FBgn0000109',
 'FBgn0000115',
 'FBgn0000274',
 'FBgn0000283',
 'FBgn0000318',
 'FBgn0000559',
 'FBgn0000618',
 'FBgn0000928',
 'FBgn0001105',
 'FBgn0001197',
 'FBgn0001276',
 'FBgn0001942',
 'FBgn0001977',
 'FBgn0001995',
 'FBgn0002022',
 'FBgn0002466',
 'FBgn0002521',
 'FBgn0002645',
 'FBgn0002772',
 'FBgn0002989',
 'FBgn0003008',
 'FBgn0003149',
 'FBgn0003189',
 'FBgn0003513',
 'FBgn0003520',
 'FBgn0003710',
 'FBgn0003716',
 'FBgn0003923',
 'FBgn0003933',
 'FBgn0003934',
 'FBgn0003935',
 'FBgn0003938',
 'FBgn0004132',
 'FBgn0004186',
 'FBgn0004373',
 'FBgn0004404',
 'FBgn0004811',
 'FBgn0005278',
 'FBgn0005355',
 'FBgn0005411',
 'FBgn0005775',
 'FBgn0010113',
 'FBgn0010220',
 'FBgn0010247',
 'FBgn0010287',
 'FBgn0010409',
 'FBgn0010470',
 'FBgn0011206',
 'FBgn0011217',
 'FBgn0011227',
 'FBgn0011230',
 'FBgn0011290',
 'FBgn0011335',
 'FBgn0011455',
 'FBgn0011476',
 'FBgn0011570',
 'FBgn0011741',
 'FBgn0011762',
 'FBgn0011987',
 'FBgn00

In [21]:
best  = set(TP_genes_NB).intersection(TP_genes_G)
len(list(best))

7212

In [40]:
G_plus = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] > 0)].index.values)
len(G_plus)

7913

In [41]:
G_minus = list(GPcounts_G.loc[(GPcounts_G['log_likelihood_ratio'] < 0)].index.values)
len(G_minus)

294

In [42]:
NB_plus = list(GPcounts_NB.loc[(GPcounts_NB['log_likelihood_ratio'] > 0)].index.values)
len(NB_plus)

7254

In [43]:
NB_minus = list(GPcounts_NB.loc[(GPcounts_NB['log_likelihood_ratio'] < 0)].index.values)
len(NB_minus)

953

In [44]:
best  = set(NB_plus).intersection(G_plus)
len(list(best))

7212

In [45]:
best  = set(NB_plus).intersection(G_minus)
len(list(best))

42

In [46]:
best  = set(NB_minus).intersection(G_plus)
len(list(best))

701

In [47]:
best  = set(NB_minus).intersection(G_minus)
len(list(best))

252

NB + GP + --> 7212
NB + GP -  --> 42
NB - GP + --> 701
NB - GP - -->  252

Introns 

In [48]:
NegativeBinomial = pd.read_csv('ll_Negative_binomial_introns_counts.csv',index_col=[0])
NegativeBinomial['log_likelihood_ratio'] = pd.to_numeric(NegativeBinomial['log_likelihood_ratio'], errors='coerce')
nan_gene = list(NegativeBinomial[NegativeBinomial['log_likelihood_ratio'].isnull()].index.values)
GPcounts_NB_in = NegativeBinomial.drop(nan_gene)
GPcounts_NB_in

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0000008,-24.731196286803552,-83.88832263437664,59.157126,0:00:03.332731,0
FBgn0000014,-48.47719797601507,-76.35713831970791,27.879940,0:00:03.626980,0
FBgn0000015,-17.633441046368414,-26.076545327975627,8.443104,0:00:03.901272,0
FBgn0000097,-51.26028488234646,-85.4980136548894,34.237729,0:00:03.643114,0
FBgn0000108,-31.588300243347767,-31.609192415599622,0.020892,0:00:03.914944,0
...,...,...,...,...,...
FBgn0285948,-27.83774498560247,-35.545178620060284,7.707434,0:00:03.307070,0
FBgn0285952,-33.19810353477502,-71.79022933323674,38.592126,0:00:03.574617,0
FBgn0285954,-44.767011987372534,-99.94233960470586,55.175328,0:00:03.999506,0
FBgn0285955,-47.72839546993644,-78.87006608856014,31.141671,0:00:08.692293,0


In [49]:
Gaussian = pd.read_csv('ll_Gaussian_introns_counts.csv',index_col=[0])
Gaussian['log_likelihood_ratio'] = pd.to_numeric(Gaussian['log_likelihood_ratio'], errors='coerce')
GPcounts_G_in = Gaussian.drop(nan_gene)
GPcounts_G_in 

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0000008,-6.461049,-39.248623,32.787574,0:00:00.598716,0
FBgn0000014,-14.046489,-53.412492,39.366003,0:00:00.596016,0
FBgn0000015,-10.112098,-24.654526,14.542428,0:00:00.580449,0
FBgn0000097,-9.725929,-59.216374,49.490445,0:00:00.591606,0
FBgn0000108,-14.687515,-16.715880,2.028365,0:00:00.574437,0
...,...,...,...,...,...
FBgn0285948,-3.964415,-22.484862,18.520447,0:00:00.631537,0
FBgn0285952,7.267011,-38.273207,45.540218,0:00:00.615517,0
FBgn0285954,-11.399486,-44.496660,33.097173,0:00:00.637541,0
FBgn0285955,-11.553415,-58.652894,47.099479,0:00:00.666757,0


In [50]:
GPcounts_NB_in.sort_values(by=['log_likelihood_ratio'], ascending=False)

Unnamed: 0,Dynamic_model_log_likelihood,Constant_model_log_likelihood,log_likelihood_ratio,time,failure counter
FBgn0052685,525.2086545748789,-270719685.571038,2.707202e+08,0:00:05.600907,6
FBgn0039286,-68.13618898038972,-818.8205113629238,7.506843e+02,0:00:04.248007,0
FBgn0264857,-72.25573400850561,-754.9886266948872,6.827329e+02,0:00:03.065764,1
FBgn0003118,-73.85652084381762,-716.0146917934552,6.421582e+02,0:00:22.454836,1
FBgn0027528,-67.08070608877044,-586.2084514267976,5.191277e+02,0:00:03.704006,0
...,...,...,...,...,...
FBgn0026058,-66.76691339100852,-63.06435016390847,-3.702563e+00,0:00:02.768783,0
FBgn0000183,-18.30159610785631,3.385623557516242,-2.168722e+01,0:00:02.366577,0
FBgn0034321,-78.35095298993974,-45.822009454382616,-3.252894e+01,0:00:08.599023,0
FBgn0041160,-32.68373256308725,2.449179491542754,-3.513291e+01,0:00:03.321407,0


In [51]:
D = GPcounts_NB_in.shape[0]
true_label = np.zeros(D)
for j in range(D):
    if GPcounts_NB_in['log_likelihood_ratio'][j] > 0:
        true_label[j] = 1
labels = pd.DataFrame(true_label, index =GPcounts_NB_in.index.values, columns =['label'])
GPcounts_NB_in = pd.concat([GPcounts_NB_in,labels],axis = 1)

In [52]:
TP_genes_NB_in = list(GPcounts_NB_in.loc[(GPcounts_NB_in['log_likelihood_ratio'] > 0)&(GPcounts_NB_in['label']==1.0)].index.values)
len(TP_genes_NB_in)

1747

In [53]:
best  = set(TP_genes_NB).intersection(TP_genes_NB_in)
len(list(best))

1587

In [54]:
TP_genes_G_in = list(GPcounts_G_in.loc[(GPcounts_G_in['log_likelihood_ratio'] > 0)].index.values)
len(TP_genes_G_in)

1771

In [55]:
best  = set(TP_genes_G).intersection(TP_genes_G_in)
len(list(best))

1666