# Examine overlaps with published data

In [1]:
import numpy as np
import pandas as pd
import session_info
from pyhere import here

## Public si-eQTL analysis

In [2]:
# Gene symbols from three published studies (named in the labels printed below).
shen = [
    "GDAP2", "AIM2", "SLAMF6", "RLF", "ATG4C", "FUT7", "TMEM218",
    "C11orf74", "RAB35", "TMEM5", "HNRNPK", "CDCA3", "ERCC5", "GJB6",
    "SNTB2", "SPNS3", "XAF1", "RBBP8", "RUFY4", "CA2", "RAPGEF1",
]
kukurba = ["NOD2", "WDR36", "BSCL2", "MAP7D3", "RHOXF1", "DNAH1"]
# "HLA-DRB5" is listed twice here, which is why unique entries are counted below.
yao = [
    "NOD2", "HLA-DRB5", "HLA-DRB5", "KIAA0586", "PPP2R5A", "TSNAXIP1",
    "MUT", "GRIK2", "C15orf37", "LIMA1", "IL6ST", "HCG8", "BLOC1S3",
    "NKX3-1", "CXorf23",
]

# Report the size of each list (label on one line, count on the next).
print("Shen et al.:", len(shen), sep="\n")
print("Kukurba et al.:", len(kukurba), sep="\n")
print("Yao et al.:", len(np.unique(yao)), sep="\n")

# Size of the union of the two lists; "NOD2" is present in both.
print("Total of Yao + Kukurba:")
len(set(yao).union(kukurba))

Shen et al.:
21
Kukurba et al.:
6
Yao et al.:
14
Total of Yao + Kukurba:


19

## Load BrainSeq si-eQTL results

### Interacting variant-gene pairs

In [3]:
# Read the tab-separated BrainSeq results and add a version-less Ensembl ID
# (everything from the first "." in gene_id onward is removed).
bs0 = pd.read_csv(
    "../../_m/BrainSeq_sexGenotypes_4features_3regions.txt.gz",
    sep="\t",
)
bs0["ensembl_gene_id"] = bs0["gene_id"].str.replace("\\..*", "", regex=True)

# Annotation table; its first CSV column is used as the index.
biomart = pd.read_csv("../_h/biomart.csv", index_col=0)

# Default (inner) merge on the version-less ID, then keep only the first row
# seen for each gene_id.
bs = pd.merge(bs0, biomart, on="ensembl_gene_id").drop_duplicates(subset="gene_id")
print(bs.shape)
bs.tail(2)

(692, 15)


Unnamed: 0,region,gene_id,variant_id,gencode_id,gene_name,seqnames,start,end,lfsr,posterior_mean,feature_type,ensembl_gene_id,external_gene_name,entrezgene,description
8285,Caudate,ENSG00000270605.1,chr1:28102893:G:C,ENSG00000270605.1,ENSG00000270605,chr1,28239509,28241453,0.049873,-0.261352,Gene,ENSG00000270605,AL353622.1,,
8286,DLPFC,ENSG00000187498.16,chr13:109650494:C:T,ENSG00000187498.16,COL4A1,chr13,110148963,110307157,0.048863,0.193807,Gene,ENSG00000187498,COL4A1,1282.0,collagen type IV alpha 1 chain [Source:HGNC Sy...


In [4]:
# Save the rows whose gene symbol appears in the Shen list.
bs.loc[bs["external_gene_name"].isin(shen)].to_csv(
    "siEQTL_Shen_comparison.csv", index=False
)

In [5]:
# Rows whose gene symbol appears in the Kukurba list.
bs.loc[bs["external_gene_name"].isin(kukurba)]

Unnamed: 0,region,gene_id,variant_id,gencode_id,gene_name,seqnames,start,end,lfsr,posterior_mean,feature_type,ensembl_gene_id,external_gene_name,entrezgene,description


In [6]:
# Rows whose gene symbol appears in the Yao list.
bs.loc[bs["external_gene_name"].isin(yao)]

Unnamed: 0,region,gene_id,variant_id,gencode_id,gene_name,seqnames,start,end,lfsr,posterior_mean,feature_type,ensembl_gene_id,external_gene_name,entrezgene,description


In [7]:
# Rows whose gene symbol appears in any of the three published lists.
bs.loc[bs["external_gene_name"].isin([*shen, *kukurba, *yao])]

Unnamed: 0,region,gene_id,variant_id,gencode_id,gene_name,seqnames,start,end,lfsr,posterior_mean,feature_type,ensembl_gene_id,external_gene_name,entrezgene,description
5787,Caudate,ENSG00000125703.15,chr1:63060301:G:A,ENSG00000125703.15,ATG4C,chr1,62784132,62865516,0.019839,0.094136,Gene,ENSG00000125703,ATG4C,84938.0,autophagy related 4C cysteine peptidase [Sourc...
8130,Caudate,ENSG00000104267.10,chr8:84966439:A:T,ENSG00000104267.10,CA2,chr8,85463968,85481493,0.045578,0.289557,Gene,ENSG00000104267,CA2,760.0,carbonic anhydrase 2 [Source:HGNC Symbol;Acc:H...


## GTEx comparison

In [8]:
# Read the tab-separated GTEx v8 sbeQTL table from the project tree
# (path resolved through `here`), then preview the first ten columns.
gtex = pd.read_csv(
    here(
        "input/public_results/gtex_results/_m",
        "GTEx_Analysis_v8_sbeQTLs/GTEx_Analysis_v8_sbeQTLs.txt",
    ),
    sep="\t",
)
gtex.iloc[:2, :10]

Unnamed: 0,ensembl_gene_id,hugo_gene_id,gene_type,variant_id,rs_id,Tissue,maf,pval_nominal_sb,slope_sb,slope_se_sb
0,ENSG00000241860.6,RP11-34P13.13,processed_transcript,chr1_14677_G_A_b38,rs201327123,Adipose_Subcutaneous,0.051635,0.847114,0.05508,0.285537
1,ENSG00000227232.5,WASH7P,unprocessed_pseudogene,chr1_64764_C_T_b38,rs769952832,Adipose_Subcutaneous,0.061102,0.316881,0.222928,0.222511


In [9]:
# Preview columns 10-13 for the same two rows.
gtex.iloc[:2, 10:14]

Unnamed: 0,numtested,pvals.corrected,qval,pval_nominal_f
0,1,0.847114,1.0,0.022302
1,1,0.316881,0.981254,0.003978


In [10]:
# Brain tissues with qval < 0.25.
# NOTE(review): per the original note, this cutoff was presumably chosen so the
# number of significant hits equals the published sb-eQTL count -- confirm.
gtex.loc[
    (gtex["qval"] < 0.25) & gtex["Tissue"].str.contains("Brain"),
    ["ensembl_gene_id", "hugo_gene_id", "Tissue", "pvals.corrected", "qval"],
].head(10)

Unnamed: 0,ensembl_gene_id,hugo_gene_id,Tissue,pvals.corrected,qval
62155,ENSG00000026025.15,VIM,Brain_Amygdala,4e-06,0.012836
116842,ENSG00000160818.16,GPATCH4,Brain_Nucleus_accumbens_basal_ganglia,8.8e-05,0.198445
121904,ENSG00000141562.17,NARF,Brain_Nucleus_accumbens_basal_ganglia,5.6e-05,0.198445
122123,ENSG00000267174.5,CTC-510F12.4,Brain_Nucleus_accumbens_basal_ganglia,8.3e-05,0.198445


In [11]:
# Tissues whose name contains "Whole", with qval < 0.25.
# NOTE(review): per the original note, this cutoff was presumably chosen so the
# number of significant hits equals the published sb-eQTL count -- confirm.
gtex.loc[
    (gtex["qval"] < 0.25) & gtex["Tissue"].str.contains("Whole"),
    ["ensembl_gene_id", "hugo_gene_id", "Tissue", "pvals.corrected", "qval"],
].head(10)

Unnamed: 0,ensembl_gene_id,hugo_gene_id,Tissue,pvals.corrected,qval
362961,ENSG00000221571.3,RNU6ATAC35P,Whole_Blood,3.9e-05,0.139762
365043,ENSG00000196743.8,GM2A,Whole_Blood,1.1e-05,0.116825
367164,ENSG00000148459.15,PDSS1,Whole_Blood,2.7e-05,0.139762


In [12]:
# All GTEx rows, across every tissue, with qval < 0.25.
gtex_sig = gtex.loc[gtex["qval"] < 0.25]
gtex_sig.shape

(369, 22)

In [13]:
# Show the first ten significant rows.
gtex_sig.iloc[:10]

Unnamed: 0,ensembl_gene_id,hugo_gene_id,gene_type,variant_id,rs_id,Tissue,maf,pval_nominal_sb,slope_sb,slope_se_sb,...,qval,pval_nominal_f,slope_f,slope_se_f,pval_nominal_m,slope_m,slope_se_m,pval_nominal,slope,slope_se
1096,ENSG00000076356.6,PLXNA2,protein_coding,chr1_208030492_G_A_b38,rs3811383,Adipose_Subcutaneous,0.123924,5.3916e-05,0.338278,0.083064,...,0.121068,1.71888e-08,0.456729,0.075705,0.91557,0.009739,0.091682,2.7474e-05,0.17183,0.040604
5262,ENSG00000170632.13,ARMC10,protein_coding,chr7_103076937_C_T_b38,rs6958836,Adipose_Subcutaneous,0.169535,5.01113e-05,0.357403,0.087384,...,0.1929,0.493324,-0.054539,0.079379,3.21922e-07,-0.4298,0.079545,8.79753e-08,-0.216374,0.039857
5644,ENSG00000120907.17,ADRA1A,protein_coding,chr8_26839198_G_A_b38,rs117380715,Adipose_Subcutaneous,0.216867,1.04589e-05,-0.323552,0.072676,...,0.084548,4.63741e-18,-0.779707,0.076596,3.97666e-10,-0.469672,0.069091,5.637370000000001e-52,-0.568916,0.033334
6414,ENSG00000136830.11,FAM129B,protein_coding,chr9_127584339_G_A_b38,rs10739693,Adipose_Subcutaneous,0.304647,7.38701e-07,-0.28366,0.056579,...,0.004976,1.978e-06,-0.333315,0.066772,0.165338,-0.082625,0.059205,1.39316e-08,-0.168762,0.02926
7220,ENSG00000166787.3,SAA3P,transcribed_unprocessed_pseudogene,chr11_18269355_T_C_b38,rs34068567,Adipose_Subcutaneous,0.27883,2.20729e-05,0.32303,0.075427,...,0.074347,6.4094e-08,0.453034,0.078725,0.3138,0.063002,0.062292,2.4336e-08,0.21191,0.037395
8540,ENSG00000183463.5,URAD,protein_coding,chr13_27990205_T_A_b38,rs7335293,Adipose_Subcutaneous,0.5,9.0787e-09,-0.444892,0.076123,...,0.000122,9.98265e-21,-0.887723,0.078738,1.89229e-09,-0.457733,0.070571,3.07731e-53,-0.640604,0.036976
9191,ENSG00000282651.2,IGHV5-10-1,IG_V_gene,chr14_106114510_A_G_b38,rs4573838,Adipose_Subcutaneous,0.419105,2.02515e-05,-0.40676,0.094541,...,0.074347,5.24871e-12,-0.682629,0.089412,0.00280506,-0.289091,0.094806,3.45842e-21,-0.445408,0.045073
14611,ENSG00000143933.16,CALM2,protein_coding,chr2_46225349_C_T_b38,rs12477148,Adipose_Visceral_Omentum,0.072495,4.49793e-05,-0.480557,0.116471,...,0.161955,0.000474715,-0.491287,0.134732,0.916574,0.013043,0.124116,2.19775e-05,-0.246023,0.057281
15082,ENSG00000144410.4,CPO,protein_coding,chr2_206822186_C_T_b38,rs12470278,Adipose_Visceral_Omentum,0.097015,3.20412e-05,0.682291,0.162191,...,0.11537,0.116543,0.280837,0.176978,4.43106e-06,-0.558002,0.113158,7.8964e-06,-0.320288,0.070745
17452,ENSG00000211698.2,TRGV4,TR_V_gene,chr7_38361995_A_C_b38,rs10233345,Adipose_Visceral_Omentum,0.335821,6.4381e-05,0.427491,0.105837,...,0.139089,7.01123e-06,-0.481758,0.100091,8.85767e-15,-1.06884,0.112111,1.6303899999999999e-49,-0.838766,0.04909


### mashr

In [14]:
# BrainSeq genes that also have a significant GTEx sb-eQTL.
#
# The comparison is made on version-less Ensembl IDs. The ".N" suffix of a
# gene ID changes between annotation releases, so comparing the full versioned
# IDs from two independently annotated tables only finds genes whose version
# happens to be identical in both, and silently misses the rest.
# `bs.ensembl_gene_id` already has the suffix removed; the GTEx IDs are
# stripped here with the same pattern.
gtex_sig_gene_ids = gtex_sig["ensembl_gene_id"].str.replace("\\..*", "", regex=True)
gtex_overlap = bs[bs["ensembl_gene_id"].isin(gtex_sig_gene_ids)].drop_duplicates()
print(gtex_overlap.shape)
gtex_overlap

(2, 15)


Unnamed: 0,region,gene_id,variant_id,gencode_id,gene_name,seqnames,start,end,lfsr,posterior_mean,feature_type,ensembl_gene_id,external_gene_name,entrezgene,description
4943,Caudate,ENSG00000272977.1,chr22:25059120:A:C,ENSG00000272977.1,ENSG00000272977,chr22,25476218,25479971,0.011928,0.323847,Gene,ENSG00000272977,AL008721.2,,
8285,Caudate,ENSG00000270605.1,chr1:28102893:G:C,ENSG00000270605.1,ENSG00000270605,chr1,28239509,28241453,0.049873,-0.261352,Gene,ENSG00000270605,AL353622.1,,


In [15]:
# Percentage of BrainSeq rows that are also in the GTEx overlap.
len(gtex_overlap) / len(bs) * 100

0.2890173410404624

In [16]:
# Significant GTEx rows whose gene is also present in the BrainSeq results.
# Matching uses version-less Ensembl IDs: the ".N" suffix changes between
# annotation releases, so comparing full versioned IDs across the two tables
# would miss genes whose version differs.
gtex_sig[
    gtex_sig["ensembl_gene_id"]
    .str.replace("\\..*", "", regex=True)
    .isin(bs["ensembl_gene_id"])
]

Unnamed: 0,ensembl_gene_id,hugo_gene_id,gene_type,variant_id,rs_id,Tissue,maf,pval_nominal_sb,slope_sb,slope_se_sb,...,qval,pval_nominal_f,slope_f,slope_se_f,pval_nominal_m,slope_m,slope_se_m,pval_nominal,slope,slope_se
297207,ENSG00000270605.1,RP5-1092A3.4,antisense,chr1_28223937_C_T_b38,rs481640,Skin_Not_Sun_Exposed_Suprapubic,0.323985,0.000237,-0.241524,0.065161,...,0.208195,4.04659e-21,-0.760755,0.063142,6.39588e-13,-0.599143,0.07247,9.91708e-60,-0.591873,0.031047
338770,ENSG00000272977.1,CTA-390C10.10,sense_intronic,chr22_25459662_G_A_b38,rs6004655,Spleen,0.167401,3.8e-05,0.413128,0.097856,...,0.212883,7.93121e-14,-1.12903,0.107327,5.15526e-16,-1.27659,0.10469,4.8966399999999995e-51,-1.19367,0.056659


In [17]:
# Save the significant GTEx rows whose gene is also present in the BrainSeq
# results. Matching uses version-less Ensembl IDs: the ".N" suffix changes
# between annotation releases, so comparing full versioned IDs across the two
# tables would leave out genes whose version differs.
gtex_sig[
    gtex_sig["ensembl_gene_id"]
    .str.replace("\\..*", "", regex=True)
    .isin(bs["ensembl_gene_id"])
].to_csv("siEQTL_gtex_comparison.csv", index=False)

# Session information

In [18]:
# Record the software environment used for this run.
session_info.show()