# Examine overlaps with published data

In [None]:
import numpy as np
import pandas as pd
import session_info
from pyhere import here

## Public si-eQTL analysis

In [None]:
# Gene symbols taken from the published si-eQTL studies compared below.
shen = [
    "GDAP2", "AIM2", "SLAMF6", "RLF", "ATG4C", "FUT7", "TMEM218",
    "C11orf74", "RAB35", "TMEM5", "HNRNPK", "CDCA3", "ERCC5", "GJB6",
    "SNTB2", "SPNS3", "XAF1", "RBBP8", "RUFY4", "CA2", "RAPGEF1",
]
print("Shen et al.:", len(shen), sep="\n")

kukurba = ["NOD2", "WDR36", "BSCL2", "MAP7D3", "RHOXF1", "DNAH1"]
print("Kukurba et al.:", len(kukurba), sep="\n")

# "HLA-DRB5" is listed twice, so the count below is of distinct symbols.
yao = [
    "NOD2", "HLA-DRB5", "HLA-DRB5", "KIAA0586", "PPP2R5A", "TSNAXIP1",
    "MUT", "GRIK2", "C15orf37", "LIMA1", "IL6ST", "HCG8", "BLOC1S3",
    "NKX3-1", "CXorf23",
]
print("Yao et al.:", len(set(yao)), sep="\n")

# Distinct symbols across both lists ("NOD2" is shared and counted once).
print("Total of Yao + Kukurba:")
len(set(yao).union(kukurba))

## Load BrainSeq si-eQTL results

### Interacting variant-gene pairs

In [None]:
# Load the BrainSeq si-eQTL table (tab-separated, gzip-compressed).
bs0 = pd.read_csv(
    "../../_m/BrainSeq_sexGenotypes_4features_3regions.txt.gz",
    sep="\t",
)
# Remove everything from the first "." onward in gene_id so the result can be
# joined against the biomart "ensembl_gene_id" column.
bs0["ensembl_gene_id"] = bs0["gene_id"].str.replace("\\..*", "", regex=True)
biomart = pd.read_csv("../_h/biomart.csv", index_col=0)
# Inner join on the stripped ID, then keep the first row for each gene_id.
bs = pd.merge(bs0, biomart, on="ensembl_gene_id").drop_duplicates(
    subset="gene_id"
)
print(bs.shape)
bs.tail(2)

In [None]:
# Write the BrainSeq rows whose gene symbol is in the Shen et al. list.
bs.loc[bs["external_gene_name"].isin(shen)].to_csv(
    "siEQTL_Shen_comparison.csv",
    index=False,
)

In [None]:
# BrainSeq rows whose gene symbol is in the Kukurba et al. list.
bs.loc[bs["external_gene_name"].isin(kukurba)]

In [None]:
# BrainSeq rows whose gene symbol is in the Yao et al. list.
bs.loc[bs["external_gene_name"].isin(yao)]

In [None]:
# BrainSeq rows whose gene symbol is in any of the three published lists.
bs.loc[bs["external_gene_name"].isin([*shen, *kukurba, *yao])]

## GTEx comparison

In [None]:
# Read the GTEx v8 sbeQTL table (tab-separated); the path is built by here().
gtex = pd.read_csv(
    here(
        "input/public_results/gtex_results/_m",
        "GTEx_Analysis_v8_sbeQTLs/GTEx_Analysis_v8_sbeQTLs.txt",
    ),
    sep="\t",
)
# Preview the first two rows of the first ten columns.
gtex.iloc[:2, :10]

In [None]:
# Preview the same two rows for the columns at positions 10 through 13.
gtex.iloc[:2, 10:14]

In [None]:
## q-value cutoff (0.25) is the one that reproduces the number of published
## sb-eQTL; show the first ten passing rows from tissues matching "Brain".
(
    gtex.loc[
        (gtex["qval"] < 0.25) & (gtex["Tissue"].str.contains("Brain")),
        ["ensembl_gene_id", "hugo_gene_id", "Tissue", "pvals.corrected", "qval"],
    ]
    .head(10)
)

In [None]:
## q-value cutoff (0.25) is the one that reproduces the number of published
## sb-eQTL; show the first ten passing rows from tissues matching "Whole".
(
    gtex.loc[
        (gtex["qval"] < 0.25) & (gtex["Tissue"].str.contains("Whole")),
        ["ensembl_gene_id", "hugo_gene_id", "Tissue", "pvals.corrected", "qval"],
    ]
    .head(10)
)

In [None]:
# Keep every GTEx row, across all tissues, with a q-value below 0.25.
gtex_sig = gtex.loc[gtex["qval"] < 0.25]
gtex_sig.shape

In [None]:
# First ten rows of the q-value-filtered GTEx table.
gtex_sig.iloc[:10]

### mashr

In [None]:
# BrainSeq rows whose gene_id also appears among the filtered GTEx results.
# NOTE(review): the match uses bs["gene_id"], not the "."-stripped
# "ensembl_gene_id" column -- confirm the GTEx IDs use the same format.
gtex_overlap = bs.loc[
    bs["gene_id"].isin(gtex_sig["ensembl_gene_id"])
].drop_duplicates()
print(gtex_overlap.shape)
gtex_overlap

In [None]:
# Percentage of BrainSeq rows that overlap the filtered GTEx results.
len(gtex_overlap) / len(bs) * 100

In [None]:
# Filtered GTEx rows whose gene ID is present in the BrainSeq table.
gtex_sig.loc[gtex_sig["ensembl_gene_id"].isin(bs["gene_id"])]

In [None]:
# Write the filtered GTEx rows that are also present in BrainSeq to CSV.
gtex_sig.loc[gtex_sig["ensembl_gene_id"].isin(bs["gene_id"])].to_csv(
    "siEQTL_gtex_comparison.csv",
    index=False,
)

# Session information

In [None]:
# Display the session information for this notebook run.
session_info.show()