# OmicSynth NDD Drug Target Analysis

## Library Load and Data Read in

In [28]:
import pandas as pd
import subprocess
import sys
import numpy as np
import os
import glob
from scipy.stats import rankdata
#from omicsynth_func import *
import warnings
warnings.simplefilter(action='ignore')


# Display all rows from data frame using pandas
#pd.set_option('display.max_rows', None)

In [2]:
def extract_topsig_genes(main_df,  sig = 0.05):
    # create df to hold top genes
    top_gene_df = pd.DataFrame()
 
    df_adj = main_df.query(f'p_SMR_multi <= {sig} & p_HEIDI > 0.01')
    top_gene = df_adj[['Omic', 'Disease', 'Gene','probeID','ProbeChr', 'Probe_bp', 'topRSID','topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'b_SMR', 'se_SMR','p_SMR', 'p_SMR_multi','p_HEIDI']] # filter out any genes whose fdr pval does not meet defined significance; default = 0.05

    top_gene_df = pd.concat([top_gene_df,top_gene])
    top_gene_df = top_gene_df.drop_duplicates()
        
    return top_gene_df

In [3]:
def extract_top_genes(main_df,dx_list, omic_list, pcol):
    
    # create df to hold top genes
    top_gene_df = pd.DataFrame()

    for disease in dx_list:
        for omic in omic_list:

            # filter for hits that have  FDR_pval < 0.05 and p_HEIDI > 0.01
            df = main_df.query(f"Disease == '{disease}' & Omic == '{omic}'")
            
            df = df.query("p_SMR_multi < 0.05 & p_HEIDI > 0.01")
            top_gene = df[df['p_SMR_multi'] == df['p_SMR_multi'].min()]
            top_gene = top_gene[['Omic', 'Disease', 'Gene','probeID','ProbeChr', 'Probe_bp', 'topRSID','topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'b_SMR', 'se_SMR','p_SMR', 'p_SMR_multi','p_HEIDI']]

            top_gene_df = pd.concat([top_gene_df,top_gene])
    return top_gene_df

In [4]:
# load in SMR results for NDDs
ndd_df = pd.read_csv("NDD_SMR_genes.csv")

# read in therapeutic drug data - from Finan et al and DGidb
drug_df = pd.read_csv('/../omicSynth/GTEx/drug_genome_dgidb.csv', sep = ',')


# list of unique gene targets from drug data
thera_genes = list(drug_df['gene_name'].unique())


In [5]:
# read in protein coding genes
coding = pd.read_csv('/../omicSynth/GTEx/proteincoding_genesym.csv')

# remove novel_or_none
coding = coding.query('Gene_symbol != "novel_or_none"')

# get list
cgenes = list(coding.Gene_symbol)

# print shape of df before removing non protein coding genes
ndd_df.shape

(1732786, 25)

In [6]:
# remove all non protein coding genes
ndd_df_pc = ndd_df.query('Gene == @cgenes')

# print shape of df after removing non protein coding genes
ndd_df_pc.shape

(1156428, 25)

In [7]:
# difference
dif = ndd_df.shape[0] - ndd_df_pc.shape[0]

print(f'Removed {dif} non protein coding genes')

Removed 576358 non protein coding genes


In [29]:
omic_map = {'Cerebellum_metaBrain': 'Cerebellum eQTL',
 'Basalganglia_metaBrain': 'Basal Ganglia eQTL',
 'Spinalcord_metaBrain': 'Spinalcord eQTL',
 'Hippocampus_metaBrain': 'Hippocampus eQTL',
 'brain_mMeta': 'Whole Brain meta-analysis mQTL',
 'blood_mcrae': 'Whole Blood mQTL',
 'Cortex_metaBrain': 'Cortex eQTL metaBrain',
 'Brain_Frontal_Cortex_BA9': 'Frontal Cortex BA9 eQTL',
 'Brain_Cerebellar_Hemisphere': 'Cerebellar Hemisphere eQTL',
 'psychEncode_prefrontal_cortex': 'Prefrontal Cortex eQTL',
 'Brain_Cortex': 'Cortex eQTL GTEx',
 'Brain_Caudate_basal_ganglia': 'Caudate Basal Ganglia eQTL',
 'Nerve_Tibial': 'Tibial Nerve eQTL',
 'Muscle_Skeletal': 'Skeletal Muscle eQTL',
 'Brain_Hippocampus': 'Hippocampus eQTL',
 'multiancestry': 'Multi Ancestry Whole Brain Meta-analysis eQTL',
 'Brain_Substantia_nigra': 'Substantia nigra eQTL',
 'atlas_csf': 'Serebral Spinal Fluid pQTL',
 'Brain_Hypothalamus': 'Hypothalamus eQTL',
 'Liver': 'Liver eQTL',
 'Brain_Anterior_cingulate_cortex_BA24': 'Anterior Cingulate Cortex BA24 eQTL',
 'blood_eQTLgen': 'Whole Blood eQTL eQTLgen',
 'Brain_Putamen_basal_ganglia': 'Putamen Basal Ganglia eQTL',
 'Brain_Amygdala': 'Amygdala eQTL',
 'brain_eMeta': 'Whole Brain eQTL',
 'Whole_Blood': 'Whole Blood eQTL GTEx',
 'Brain_Cerebellum': 'Cerebellum eQTL',
 'Brain_Nucleus_accumbens_basal_ganglia': 'Nucleus Accumbens Basal Ganglia'}

In [30]:
# change omics
ndd_df_pc['Omic Annotated'] = ndd_df_pc.Omic.map(omic_map)
ndd_df_pc = ndd_df_pc[['Omic Annotated', 'Disease', 'Gene', 'probeID', 'ProbeChr', 'Probe_bp', 'topRSID',
       'topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'b_SMR', 'se_SMR', 'p_SMR',
       'p_SMR_multi', 'p_HEIDI']]
ndd_df_pc.rename({'Omic Annotated': 'Omic'}, axis = 1, inplace = True)
ndd_df_pc

Unnamed: 0,Omic,Disease,Gene,probeID,ProbeChr,Probe_bp,topRSID,topSNP_chr,topSNP_bp,A1,A2,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI
0,Cerebellum eQTL,AD,GOLGA8S,ENSG00000261739,15,23354846,rs139243742,15,23610305,C,T,-0.008140,0.028169,0.772598,0.772598,-9999.000000
1,Cerebellum eQTL,AD,GABRG3,ENSG00000182256,15,26971181,rs4887534,15,27234173,G,C,0.023336,0.019121,0.222284,0.246380,0.078280
2,Cerebellum eQTL,AD,GOLGA6L7,ENSG00000261649,15,28848675,rs4572359,15,29075650,G,C,0.007375,0.013230,0.577194,0.149214,0.095128
3,Cerebellum eQTL,AD,CHRFAM7A,ENSG00000166664,15,30393849,rs2079192,15,31058693,C,T,0.011640,0.016104,0.469785,0.404434,0.372044
4,Cerebellum eQTL,AD,GOLGA8R,ENSG00000186399,15,30414162,rs143536437,15,30898332,C,T,0.018233,0.016386,0.265817,0.483425,0.582195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1732780,Nucleus Accumbens Basal Ganglia,PSP,ALG12,ENSG00000182858,22,50293877,rs1321,22,50297435,C,T,0.135554,0.125697,0.280849,0.278519,0.962902
1732782,Nucleus Accumbens Basal Ganglia,PSP,TUBGCP6,ENSG00000128159,22,50656118,rs11703226,22,50658424,C,T,-0.044539,0.243659,0.854962,0.942211,-9999.000000
1732783,Nucleus Accumbens Basal Ganglia,PSP,CPT1B,ENSG00000205560,22,51007290,rs470117,22,51009953,T,C,0.020012,0.172706,0.907753,0.929421,-9999.000000
1732784,Nucleus Accumbens Basal Ganglia,PSP,CHKB,ENSG00000100288,22,51017378,rs3213445,22,51015838,C,T,0.088450,0.193157,0.647013,0.647013,-9999.000000


In [31]:
# get desired NDDs from Gtex
gtex_brain = glob.glob('/../omicSynth/GTEx/Brain*')
gtex_liver = glob.glob('/../omicSynth/GTEx/Liver*')
gtex_nerve = glob.glob('/../omicSynth/GTEx/Nerve*')
gtex_muscle = glob.glob('/../omicSynth/GTEx/Muscle*')
gtex_blood = glob.glob('/../omicSynth/GTEx/*Blood*')

gtex_list = gtex_brain + gtex_liver + gtex_nerve + gtex_blood + gtex_muscle

gtex = []
for x in gtex_list:
    shortname = x.split('/')[-1].rsplit('.',2)[0] 
    if shortname not in gtex:
        gtex.append(x.split('/')[-1].rsplit('.',2)[0])
        
gtex = list(map(lambda x: x.replace('Brain_Spinal_cord_cervical_c-1', 'Brain_Spinal_cord_cervical_c1'), gtex))

ndd_list = ['AD', 'ALS', 'FTD', 'LBD', 'PD', 'PSP']

# add non-GTEx xQTL sources
ndd_omic = gtex + ['Cerebellum_metaBrain', 'Spinalcord_metaBrain', 'brain_eMeta', 'Cortex_metaBrain', 
            'Basalganglia_metaBrain', 'Hippocampus_metaBrain', 'blood_eQTLgen', 'brain_mMeta', 'blood_Bryois', 'blood_mcrae', 'psychEncode_prefrontal_cortex', 'atlas_csf', 'atlas_plasma', 'atlas_brain', 'multiancestry']

# list of eqtl omics except multiancestry
eqtl_omics = ['Brain_Amygdala', 'Brain_Hippocampus',
       'Brain_Anterior_cingulate_cortex_BA24',
       'Brain_Nucleus_accumbens_basal_ganglia', 'Brain_Hypothalamus',
       'Brain_Cerebellar_Hemisphere', 'Brain_Substantia_nigra',
       'Brain_Caudate_basal_ganglia', 'Brain_Putamen_basal_ganglia',
       'Brain_Cerebellum', 'Brain_Cortex', 'Brain_Frontal_Cortex_BA9',
       'Liver', 'Nerve_Tibial', 'Whole_Blood', 'Muscle_Skeletal',
       'Cerebellum_metaBrain', 'Spinalcord_metaBrain', 'brain_eMeta',
       'Cortex_metaBrain', 'Basalganglia_metaBrain',
       'Hippocampus_metaBrain', 'blood_eQTLgen', 'psychEncode_prefrontal_cortex', 'Brain_Spinal_cord_cervical_c1']

## 1. Significant Genes <a id='siggen'></a>

### Significant Therapeutic Genes p_SMR_multi < 0.05/16875 & p_HEIDI > 0.01 - primary significance threshold for paper

In [32]:
def extract_topsig_genes2(main_df, sig = 0.05): # no fdr just P SMR multi < 0.01/20000 and HEIDI > 0.01.
    # create df to hold top genes
    top_gene_df = pd.DataFrame()
    # pval based on largest number of unique protein coding genes amongst NDDs
    pval = sig/16875
    df = main_df.query(f"p_SMR_multi < {pval} & p_HEIDI > 0.01") # initial filter by each dx/omic combo  
    top_gene = df[['Omic', 'Disease', 'Gene','probeID','ProbeChr', 'Probe_bp', 'topRSID','topSNP_chr', 'topSNP_bp', 'A1', 'A2', 'b_SMR', 'se_SMR','p_SMR', 'p_SMR_multi','p_HEIDI']] # filter out any genes whose fdr pval does not meet defined significance; default = 0.05

    top_gene_df = pd.concat([top_gene_df,top_gene])
    top_gene_df = top_gene_df.drop_duplicates()
        
    return top_gene_df

In [33]:
ndd_adj = extract_topsig_genes2(ndd_df_pc)

ndd_adj

Unnamed: 0,Omic,Disease,Gene,probeID,ProbeChr,Probe_bp,topRSID,topSNP_chr,topSNP_bp,A1,A2,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI
271,Cerebellum eQTL,AD,GPC2,ENSG00000213420,7,100177381,rs17309333,7,99813092,C,A,-0.005518,0.018258,7.624718e-01,8.862247e-07,0.272504
2440,Cerebellum eQTL,AD,CR1,ENSG00000203710,1,207496147,rs1830763,1,207800389,A,C,0.245292,0.045868,8.901474e-08,1.723238e-08,0.109188
2657,Cerebellum eQTL,AD,LRRC37A,ENSG00000176681,17,46292733,rs2696466,17,44289832,A,G,-0.065755,0.011450,9.315664e-09,2.127122e-06,0.519064
2658,Cerebellum eQTL,AD,ARL17B,ENSG00000228696,17,46361797,rs7225002,17,44189067,A,G,-0.062603,0.010536,2.822204e-09,4.203789e-07,0.593753
2701,Cerebellum eQTL,AD,ACE,ENSG00000159640,17,63477061,rs4459609,17,61548948,C,A,-0.130730,0.022878,1.102263e-08,6.234866e-08,0.550246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1717384,Nucleus Accumbens Basal Ganglia,AD,LRRC37A2,ENSG00000238083,17,44588877,rs413917,17,43723189,A,G,-0.043294,0.008585,4.586552e-07,6.966281e-07,0.484854
1728224,Nucleus Accumbens Basal Ganglia,PD,CD38,ENSG00000004468,4,15779898,rs4698413,4,15737882,T,C,-0.317415,0.060710,1.710260e-07,4.311773e-07,0.446134
1728684,Nucleus Accumbens Basal Ganglia,PD,GPNMB,ENSG00000136235,7,23275586,rs199357,7,23288309,A,G,0.121781,0.024914,1.018557e-06,8.078376e-07,0.100483
1730055,Nucleus Accumbens Basal Ganglia,PD,ARHGAP27,ENSG00000159314,17,43471275,rs753236,17,43946318,C,T,-0.569009,0.089788,2.339136e-10,3.652036e-10,0.945530


#### Per Disease Breakdown for significant hits at p_SMR_multi < 0.05 & p_HEIDI > 0.01

In [34]:
for dx in ndd_list:
    df = ndd_adj.query(f'Disease == "{dx}"')
    uni = len(df.Gene.unique())
    print(f'{dx} has {df.shape[0]} significant associations and {uni} unique genes')

AD has 317 significant associations and 116 unique genes
ALS has 4 significant associations and 3 unique genes
FTD has 0 significant associations and 0 unique genes
LBD has 13 significant associations and 5 unique genes
PD has 184 significant associations and 46 unique genes
PSP has 22 significant associations and 9 unique genes


In [35]:
# grab most significant omic association for each gene and put into a df
top_genes_list_adj = ndd_adj.Gene.unique()
single_sig_adj = pd.DataFrame()
for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    tmp = df[df['p_SMR_multi'] == df['p_SMR_multi'].min()]
    single_sig_adj = pd.concat([single_sig_adj, tmp])

# export single most significant omic association for each gene
#single_sig_adj[['Gene', 'Omic', 'Disease', 'topRSID', 'b_SMR','se_SMR', 'p_SMR', 'p_SMR_multi', 'p_HEIDI']].to_csv('top_sig_genetx_adj.csv', index = False)

In [36]:
# find all diseases a gene is associated with
gene_dx_adj = pd.DataFrame()
dx_list_adj = []
for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # get list of diseases associated with a significant gene
    dxs = list(df.Disease.unique())
    
    # turn list into string to use as a column in output df
    dx_string = ', '.join(dxs)
    
    # append string to a list var 
    dx_list_adj.append(dx_string)

# create df
gene_dx_adj['Gene'] = top_genes_list_adj
gene_dx_adj['Diseases'] = dx_list_adj

# export
#gene_dx_adj.to_csv('top_gene_dx_combo_adj.csv', index = None)

In [37]:
# final Gene-Disease-Omic combos
gene_dxtx_adj = pd.DataFrame()
tmp_list_adj = []

for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # create column that has every dx-omic combo for that gene
    df['dxtx'] = df.Disease + '-' + df.Omic
    #df = df.assign(dxtx=lambda x: str(x.Disease + '-' + x.Omic))
    
    # get list of each unique combo
    combos = list(set(df['dxtx'].unique()))
    
    # turn list into string to use as a column in output df
    dxtx_string = ', '.join(combos)
    
    # append string to a list var 
    tmp_list_adj.append(dxtx_string)
    
# create df
gene_dxtx_adj['Gene'] = top_genes_list_adj
gene_dxtx_adj['Diseases-Omic Associations'] = tmp_list_adj

#export
#gene_dxtx_adj.to_csv('gene_dxtx_combos_adj.csv', index = None)

In [38]:
# final Gene-Disease-Omic combos and seperate by disease
gene_dxtx_adj = pd.DataFrame()
for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # groupby
    df2 = df.groupby(['Gene', 'Disease']).agg({'Gene':'first','Disease':'first', 'Omic': ', '.join})
  

    # append resulting dict into larger df for genes
    gene_dxtx_adj = pd.concat([gene_dxtx_adj, df2])

# export
#gene_dxtx_adj.to_csv('gene_dxtx_combos_adj.csv', index = None)
gene_dxtx_adj

Unnamed: 0_level_0,Unnamed: 1_level_0,Gene,Disease,Omic
Gene,Disease,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GPC2,AD,GPC2,AD,Cerebellum eQTL
CR1,AD,CR1,AD,"Cerebellum eQTL, Basal Ganglia eQTL, Cortex eQ..."
LRRC37A,AD,LRRC37A,AD,"Cerebellum eQTL, Cortex eQTL metaBrain, Fronta..."
ARL17B,AD,ARL17B,AD,"Cerebellum eQTL, Cortex eQTL metaBrain"
ARL17B,PD,ARL17B,PD,Spinalcord eQTL
...,...,...,...,...
SETD1A,PD,SETD1A,PD,Whole Blood eQTL eQTLgen
ITGAX,PD,ITGAX,PD,Whole Blood eQTL eQTLgen
DCAKD,PSP,DCAKD,PSP,Whole Blood eQTL eQTLgen
ZNF668,PD,ZNF668,PD,Whole Blood eQTL GTEx


### Genes Significant in 2 Diseases

In [39]:
# grab most significant omic association for each gene and put into a df
top_genes_list_adj = ndd_adj.Gene.unique()
two_sig_adj = pd.DataFrame()
dx_2plus_adj = []
for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # get list of diseases associated with a significant gene
    dxs = list(df.Disease.unique())
    
    # only get info on genes that are isgnificant in 3+ dx

    if len(dxs) == 2:
        two_sig_adj = pd.concat([two_sig_adj, df])
        # append genes that meet criteria to a list
        dx_2plus_adj.append(gene)

# find all diseases a gene is associated with
gene_dx_adj2 = pd.DataFrame()
dx_list_adj2 = []
for gene in dx_2plus_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # get list of diseases associated with a significant gene
    dxs = list(df.Disease.unique())
    
    # turn list into string to use as a column in output df
    dx_string = ', '.join(dxs)
    
    # append string to a list var 
    dx_list_adj2.append(dx_string)

# create df
gene_dx_adj2['Gene'] = dx_2plus_adj
gene_dx_adj2['Diseases'] = dx_list_adj2

thera_sig_adj2 = []
non_sig_adj2 = []
for x in gene_dx_adj2['Gene']:
    if x in thera_genes: # thera_genes = list of thera genes from drug dbs
        thera_sig_adj2.append(x)
    else:
        non_sig_adj2.append(x)

In [40]:
gene_dx_adj2.query('Gene == @thera_sig_adj2')

Unnamed: 0,Gene,Diseases
1,KAT8,"PD, AD"


In [41]:
gene_dx_adj2.query('Gene == @non_sig_adj2')  

Unnamed: 0,Gene,Diseases
0,ARL17B,"AD, PD"
2,LRRC37A2,"PD, AD"
3,PRSS36,"AD, PD"
4,IDUA,"LBD, PD"
5,TMEM175,"PD, LBD"
6,FMNL1,"PSP, AD"
7,PLEKHM1,"PD, PSP"
8,WNT3,"PD, AD"
9,SPPL2C,"PD, AD"


In [42]:
# use top gene list to pull all hits
dx2_df = ndd_adj.query('Gene == @dx_2plus_adj')
dx2_df

Unnamed: 0,Omic,Disease,Gene,probeID,ProbeChr,Probe_bp,topRSID,topSNP_chr,topSNP_bp,A1,A2,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI
2658,Cerebellum eQTL,AD,ARL17B,ENSG00000228696,17,46361797,rs7225002,17,44189067,A,G,-0.062603,0.010536,2.822204e-09,4.203789e-07,0.593753
20690,Cerebellum eQTL,PD,KAT8,ENSG00000103510,16,31114489,rs1549293,16,31141993,C,T,-0.175210,0.039031,7.155658e-06,1.206698e-06,0.041075
27184,Basal Ganglia eQTL,PD,LRRC37A2,ENSG00000238083,17,46511511,rs199530,17,44836653,G,A,-0.204835,0.031658,9.779892e-11,5.010068e-10,0.010354
30029,Spinalcord eQTL,PD,ARL17B,ENSG00000228696,17,46361797,rs199451,17,44801784,G,A,-0.202836,0.033261,1.071756e-09,6.417524e-09,0.012236
30030,Spinalcord eQTL,PD,LRRC37A2,ENSG00000238083,17,46511511,rs58879558,17,44095467,T,C,-0.223417,0.035782,4.268185e-10,4.692435e-09,0.267978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1690043,Cerebellum eQTL,AD,SPPL2C,ENSG00000185294,17,43922256,rs17573607,17,44083081,A,G,-0.071854,0.014682,9.875698e-07,2.714257e-06,0.920137
1690052,Cerebellum eQTL,AD,LRRC37A2,ENSG00000238083,17,44588877,rs17564020,17,43991781,T,G,-0.045558,0.008779,2.108390e-07,1.635340e-06,0.857088
1707341,Cerebellum eQTL,PD,TMEM175,ENSG00000127419,4,926175,rs73211813,4,975238,T,C,-0.275567,0.057472,1.628208e-06,7.710669e-07,0.330365
1710059,Cerebellum eQTL,PD,KAT8,ENSG00000103510,16,31127075,rs9972727,16,31149142,G,A,-0.176591,0.039045,6.102740e-06,1.039952e-06,0.012644


In [43]:
# export 
dx2_df.to_csv('gene_2dx_hits.csv', index = None)

### Genes Significant in 3+ Diseases

In [44]:
# grab most significant omic association for each gene and put into a df
top_genes_list_adj = ndd_adj.Gene.unique()
three_sig_adj = pd.DataFrame()
dx_3plus_adj = []
for gene in top_genes_list_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # get list of diseases associated with a significant gene
    dxs = list(df.Disease.unique())
    
    # only get info on genes that are isgnificant in 3+ dx

    if len(dxs) >= 3:
        three_sig_adj = pd.concat([three_sig_adj, df])
        # append genes that meet criteria to a list
        dx_3plus_adj.append(gene)

# find all diseases a gene is associated with
gene_dx_adj3 = pd.DataFrame()
dx_list_adj3 = []
for gene in dx_3plus_adj:
    df = ndd_adj.query(f'Gene == "{gene}"')
    
    # get list of diseases associated with a significant gene
    dxs = list(df.Disease.unique())
    
    # turn list into string to use as a column in output df
    dx_string = ', '.join(dxs)
    
    # append string to a list var 
    dx_list_adj3.append(dx_string)

# create df
gene_dx_adj3['Gene'] = dx_3plus_adj
gene_dx_adj3['Diseases'] = dx_list_adj3

thera_sig_adj3 = []
non_sig_adj3 = []
for x in gene_dx_adj3['Gene']:
    if x in thera_genes: # thera_genes = list of thera genes from drug dbs
        thera_sig_adj3.append(x)
    else:
        non_sig_adj3.append(x)

In [45]:
gene_dx_adj3

Unnamed: 0,Gene,Diseases
0,KANSL1,"PD, PSP, AD"
1,ARL17A,"PSP, PD, AD"
2,MAPT,"AD, PD, PSP"
3,ARHGAP27,"PD, AD, PSP"
4,CRHR1,"PD, PSP, AD"


In [46]:
# use top gene list to pull all hits
dx3_df = ndd_adj.query('Gene == @dx_3plus_adj')
dx3_df

Unnamed: 0,Omic,Disease,Gene,probeID,ProbeChr,Probe_bp,topRSID,topSNP_chr,topSNP_bp,A1,A2,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI
30027,Spinalcord eQTL,PD,KANSL1,ENSG00000120071,17,46225389,rs199451,17,44801784,G,A,-0.262279,0.054263,1.341968e-06,1.277333e-06,0.013012
30480,Spinalcord eQTL,PSP,KANSL1,ENSG00000120071,17,46225389,rs169201,17,44790203,A,G,1.838150,0.345331,1.021386e-07,1.021386e-07,0.040308
30484,Spinalcord eQTL,PSP,ARL17A,ENSG00000185829,17,46579682,rs242557,17,44019712,G,A,1.053060,0.197783,1.013176e-07,1.114835e-10,0.044074
33017,Hippocampus eQTL,PD,ARL17A,ENSG00000185829,17,46579682,rs3785884,17,44057595,A,G,-0.269115,0.043518,6.251873e-10,2.771664e-09,0.272407
115182,Whole Brain meta-analysis mQTL,AD,MAPT,cg17569492,17,44026659,rs17650872,17,44039516,T,G,-0.081413,0.016108,4.321768e-07,4.321768e-07,0.640374
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1594931,Putamen Basal Ganglia eQTL,PD,ARL17A,ENSG00000185829,17,44594068,rs757055,17,43946875,T,C,-0.312190,0.054573,1.061206e-08,4.337545e-08,0.651490
1604719,Amygdala eQTL,PSP,ARL17A,ENSG00000185829,17,44594068,rs393152,17,43719143,G,A,1.951560,0.325598,2.050152e-09,6.079622e-09,0.068247
1710357,Cerebellum eQTL,PD,ARL17A,ENSG00000185829,17,44594068,rs55974014,17,43757450,A,C,-0.226764,0.029954,3.718993e-14,2.936298e-12,0.209266
1730055,Nucleus Accumbens Basal Ganglia,PD,ARHGAP27,ENSG00000159314,17,43471275,rs753236,17,43946318,C,T,-0.569009,0.089788,2.339136e-10,3.652036e-10,0.945530


In [47]:
# export 
dx3_df.to_csv('gene_3dx_hits.csv', index = None)

In [26]:
gene_dx_adj3.query('Gene == @thera_sig_adj3')

Unnamed: 0,Gene,Diseases
2,MAPT,"AD, PD, PSP"
4,CRHR1,"PD, PSP, AD"


In [27]:
gene_dx_adj3.query('Gene == @non_sig_adj3')  

Unnamed: 0,Gene,Diseases
0,KANSL1,"PD, PSP, AD"
1,ARL17A,"PSP, PD, AD"
3,ARHGAP27,"PD, AD, PSP"


## 2. Summary Statistics <a id='sumstats'></a>

### 2.1 Number of Unique Genes in each Disease & Overall <a id='sumstats2.1'></a>

In [22]:
# pull numbers from all genes used in SMR
for dx in ndd_list:
    df = ndd_df_pc.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total genes')

AD has 16833 total genes
ALS has 16875 total genes
FTD has 16788 total genes
LBD has 16797 total genes
PD has 16872 total genes
PSP has 16042 total genes


In [23]:
# pull numbers from all genes used in SMR
for dx in ndd_list:
    df = ndd_df_pc.query(f"Disease == '{dx}' & Omic == 'Liver'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total liver genes')

AD has 1597 total liver genes
ALS has 1610 total liver genes
FTD has 1537 total liver genes
LBD has 1540 total liver genes
PD has 1596 total liver genes
PSP has 1033 total liver genes


In [24]:
ndd_thera = ndd_df_pc.query('Gene == @thera_genes')
ndd_nonthera = ndd_df_pc.query('Gene != @thera_genes')

In [25]:
for dx in ndd_list:
    df = ndd_thera.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} therapeutic genes')

AD has 3562 therapeutic genes
ALS has 3565 therapeutic genes
FTD has 3551 therapeutic genes
LBD has 3554 therapeutic genes
PD has 3566 therapeutic genes
PSP has 3420 therapeutic genes


In [26]:
for dx in ndd_list:
    df = ndd_nonthera.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} nontherapeutic genes')

AD has 13271 nontherapeutic genes
ALS has 13310 nontherapeutic genes
FTD has 13237 nontherapeutic genes
LBD has 13243 nontherapeutic genes
PD has 13306 nontherapeutic genes
PSP has 12622 nontherapeutic genes


In [27]:
# calculate how which genes are significant
all_sig = extract_topsig_genes(ndd_df_pc)

# pull numbers from significant genes used in SMR
for dx in ndd_list:
    df = all_sig.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique genes')

AD has 4948 total unique genes
ALS has 3188 total unique genes
FTD has 2318 total unique genes
LBD has 2530 total unique genes
PD has 3592 total unique genes
PSP has 2275 total unique genes


In [28]:
# pull numbers from significant genes used in SMR and split by therapeutic status
for dx in ndd_list:
    df = all_sig.query(f"Disease == '{dx}'")
    thera = df.query('Gene == @thera_genes')
    non_thera = df.query('Gene != @thera_genes')
    num1 = len(thera.Gene.unique())
    num2 = len(non_thera.Gene.unique())
    print(f'{dx} has {num1} thera genes and {num2} nonthera genes')

AD has 1142 thera genes and 3806 nonthera genes
ALS has 715 thera genes and 2473 nonthera genes
FTD has 542 thera genes and 1776 nonthera genes
LBD has 580 thera genes and 1950 nonthera genes
PD has 811 thera genes and 2781 nonthera genes
PSP has 574 thera genes and 1701 nonthera genes


In [29]:
# pull numbers for liver genes used in SMR
for dx in ndd_list:
    df = all_sig.query(f"Disease == '{dx}' & Omic == 'Liver'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique liver genes')

AD has 175 total unique liver genes
ALS has 83 total unique liver genes
FTD has 78 total unique liver genes
LBD has 82 total unique liver genes
PD has 108 total unique liver genes
PSP has 30 total unique liver genes


### 2.2 eQTL <a id='sumstats2.2'></a>

#### All Genes

In [None]:
# filter out for all eQTL omics
ndd_eQTL = ndd_df_pc.query('Omic == @eqtl_omics')

multi_eqtl = ndd_df_pc.query('Omic == "multiancestry"')

ma_genes = list(multi_eqtl.Gene.unique())

In [None]:
# get number of eQTLS that aren't multiancestry for each disease
for dx in ndd_list:
    df = ndd_eQTL.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} genes in {dx} eQTLs')

There are 15112 genes in AD eQTLs
There are 15163 genes in ALS eQTLs
There are 15038 genes in FTD eQTLs
There are 15069 genes in LBD eQTLs
There are 15159 genes in PD eQTLs
There are 13839 genes in PSP eQTLs


In [None]:
# get number of eQTLS that are multiancestry for each disease
for dx in ndd_list:
    df = multi_eqtl.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} unique genes in {dx} multiancestry eQTLs')

There are 8516 unique genes in AD multiancestry eQTLs
There are 8528 unique genes in ALS multiancestry eQTLs
There are 8411 unique genes in FTD multiancestry eQTLs
There are 8404 unique genes in LBD multiancestry eQTLs
There are 8507 unique genes in PD multiancestry eQTLs
There are 6889 unique genes in PSP multiancestry eQTLs


In [None]:
# find how many hits replicated
for dx in ndd_list:
    eqtl_tmp = ndd_eQTL.query(f'Disease == "{dx}"')
    
    eqtl = eqtl_tmp.query("Gene == @ma_genes")
    
    num = len(eqtl.Gene.unique())
    print(f'There are {num} replicated multiancestry genes in {dx} eQTLs')

There are 8404 replicated multiancestry genes in AD eQTLs
There are 8408 replicated multiancestry genes in ALS eQTLs
There are 8394 replicated multiancestry genes in FTD eQTLs
There are 8388 replicated multiancestry genes in LBD eQTLs
There are 8407 replicated multiancestry genes in PD eQTLs
There are 8073 replicated multiancestry genes in PSP eQTLs


### 2.3 Significant genes <a id='sumstats2.3'></a>
#### 2.3.1 p_SMR_multi < 0.05 + HEIDI > 0.01 

In [None]:
# filter out for all eQTL omics
ndd_sigeQTL = ndd_top.query('Omic == @eqtl_omics')

multi_sigeqtl = ndd_top.query('Omic == "multiancestry"')

ma_genes = list(multi_eqtl.Gene.unique())

In [None]:
# get number of eQTLS that aren't multiancestry for each disease
for dx in ndd_list:
    df = ndd_sigeQTL.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} genes in {dx} eQTLs')

There are 3189 genes in AD eQTLs
There are 1857 genes in ALS eQTLs
There are 1243 genes in FTD eQTLs
There are 1384 genes in LBD eQTLs
There are 2161 genes in PD eQTLs
There are 1270 genes in PSP eQTLs


In [None]:
# get number of eQTLS that are multiancestry for each disease
for dx in ndd_list:
    df = multi_sigeqtl.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} unique genes in {dx} multiancestry eQTLs')

There are 843 unique genes in AD multiancestry eQTLs
There are 484 unique genes in ALS multiancestry eQTLs
There are 314 unique genes in FTD multiancestry eQTLs
There are 350 unique genes in LBD multiancestry eQTLs
There are 582 unique genes in PD multiancestry eQTLs
There are 356 unique genes in PSP multiancestry eQTLs


In [None]:
# find how many hits replicated using all multiancestry gene
# use list of all multiancestry eqtls(regardless of significance) to find any replicated hits in significant NDDs
for dx in ndd_list:
    eqtl_tmp = ndd_sigeQTL.query(f'Disease == "{dx}"')
    
    eqtl = eqtl_tmp.query("Gene == @ma_genes")
    
    num = len(eqtl.Gene.unique())
    print(f'There are {num} replicated multiancestry genes in {dx} eQTLs')

There are 2079 replicated multiancestry genes in AD eQTLs
There are 1260 replicated multiancestry genes in ALS eQTLs
There are 810 replicated multiancestry genes in FTD eQTLs
There are 900 replicated multiancestry genes in LBD eQTLs
There are 1434 replicated multiancestry genes in PD eQTLs
There are 842 replicated multiancestry genes in PSP eQTLs


#### 2.3.2 p < 0.05/16875 & HEIDI > 0.01 <a id='sumstats2.3.2'></a>

In [34]:
sig2 = extract_topsig_genes2(ndd_df)

# filter out for all eQTL omics
ndd_sigeQTL = sig2.query('Omic == @eqtl_omics')

multi_sigeqtl = sig2.query('Omic == "multiancestry"')

ma_genes = list(multi_eqtl.Gene.unique())

In [35]:
# pull numbers from significant genes used in SMR
for dx in ndd_list:
    df = sig2.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique genes')
    
# pull numbers from significant genes used in SMR and split by therapeutic status
for dx in ndd_list:
    df = sig2.query(f"Disease == '{dx}'")
    thera = df.query('Gene == @thera_genes')
    non_thera = df.query('Gene != @thera_genes')
    num1 = len(thera.Gene.unique())
    num2 = len(non_thera.Gene.unique())
    print(f'{dx} has {num1} thera genes and {num2} nonthera genes')

AD has 159 total unique genes
ALS has 4 total unique genes
FTD has 0 total unique genes
LBD has 6 total unique genes
PD has 71 total unique genes
PSP has 14 total unique genes
AD has 31 thera genes and 128 nonthera genes
ALS has 0 thera genes and 4 nonthera genes
FTD has 0 thera genes and 0 nonthera genes
LBD has 1 thera genes and 5 nonthera genes
PD has 15 thera genes and 56 nonthera genes
PSP has 2 thera genes and 12 nonthera genes


In [36]:
# pull numbers for liver genes used in SMR
for dx in ndd_list:
    df = sig2.query(f"Disease == '{dx}' & Omic == 'Liver'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique liver genes')

AD has 4 total unique liver genes
ALS has 0 total unique liver genes
FTD has 0 total unique liver genes
LBD has 0 total unique liver genes
PD has 5 total unique liver genes
PSP has 1 total unique liver genes


In [37]:
# get number of eQTLS that aren't multiancestry for each disease
for dx in ndd_list:
    df = ndd_sigeQTL.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} genes in {dx} eQTLs')

There are 97 genes in AD eQTLs
There are 3 genes in ALS eQTLs
There are 0 genes in FTD eQTLs
There are 2 genes in LBD eQTLs
There are 56 genes in PD eQTLs
There are 8 genes in PSP eQTLs


In [38]:
# find how many hits replicated using all multiancestry gene
# use list of all multiancestry eqtls(regardless of significance) to find any replicated hits in significant NDDs
for dx in ndd_list:
    eqtl_tmp = ndd_sigeQTL.query(f'Disease == "{dx}"')
    
    eqtl = eqtl_tmp.query("Gene == @ma_genes")
    
    num = len(eqtl.Gene.unique())
    print(f'There are {num} replicated multiancestry genes in {dx} eQTLs')

There are 44 replicated multiancestry genes in AD eQTLs
There are 3 replicated multiancestry genes in ALS eQTLs
There are 0 replicated multiancestry genes in FTD eQTLs
There are 0 replicated multiancestry genes in LBD eQTLs
There are 26 replicated multiancestry genes in PD eQTLs
There are 5 replicated multiancestry genes in PSP eQTLs


#### 2.3.3 Significance p_SMR_multi < 0.01/16875*[N Omic-Disease pairs]  & p_HEIDI >  0.01 <a id='sumstats2.3.3'></a>

In [39]:
def extract_topsig_genes3(main_df,dx_list, omic_list, sig = 0.05): # no fdr just P SMR multi < 0.01/20000 and HEIDI > 0.01.
    # create df to hold top genes
    top_gene_df = pd.DataFrame()
    all_df = main_df.query(f"Disease == '{dx_list[0]}'")
    txdx = len(all_df[['Omic', 'Disease']].value_counts()) # of omic-dx pairs for each disease
    pval = 0.05/(16875*txdx)
    for disease in dx_list:
        for omic in omic_list:
            df = main_df.query(f"Disease == '{disease}' & Omic == '{omic}' & p_SMR_multi < {pval} & p_HEIDI > 0.01") # initial filter by each dx/omic combo  
            top_gene = df[['Omic', 'Disease', 'Gene','probeID', 'topRSID', 'b_SMR', 'se_SMR','p_SMR', 'p_SMR_multi','p_HEIDI']] # filter out any genes whose fdr pval does not meet defined significance; default = 0.05

            top_gene_df = pd.concat([top_gene_df,top_gene])
            top_gene_df = top_gene_df.drop_duplicates()
        
    return top_gene_df

In [40]:
sig3 = extract_topsig_genes3(ndd_df, ndd_list, ndd_omic)

# filter out for all eQTL omics
ndd_sigeQTL = sig3.query('Omic == @eqtl_omics')

multi_sigeqtl = sig3.query('Omic == "multiancestry"')

ma_genes = list(multi_eqtl.Gene.unique())

In [41]:
# pull numbers from significant genes used in SMR
for dx in ndd_list:
    df = sig3.query(f"Disease == '{dx}'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique genes')
    
# pull numbers from significant genes used in SMR and split by therapeutic status
for dx in ndd_list:
    df = sig3.query(f"Disease == '{dx}'")
    thera = df.query('Gene == @thera_genes')
    non_thera = df.query('Gene != @thera_genes')
    num1 = len(thera.Gene.unique())
    num2 = len(non_thera.Gene.unique())
    print(f'{dx} has {num1} thera genes and {num2} nonthera genes')

AD has 60 total unique genes
ALS has 1 total unique genes
FTD has 0 total unique genes
LBD has 2 total unique genes
PD has 42 total unique genes
PSP has 13 total unique genes
AD has 14 thera genes and 46 nonthera genes
ALS has 0 thera genes and 1 nonthera genes
FTD has 0 thera genes and 0 nonthera genes
LBD has 1 thera genes and 1 nonthera genes
PD has 8 thera genes and 34 nonthera genes
PSP has 2 thera genes and 11 nonthera genes


In [42]:
# pull numbers for liver genes used in SMR
for dx in ndd_list:
    df = sig3.query(f"Disease == '{dx}' & Omic == 'Liver'")
    num = len(df.Gene.unique())
    print(f'{dx} has {num} total unique liver genes')

AD has 1 total unique liver genes
ALS has 0 total unique liver genes
FTD has 0 total unique liver genes
LBD has 0 total unique liver genes
PD has 2 total unique liver genes
PSP has 0 total unique liver genes


In [43]:
# get number of eQTLS that aren't multiancestry for each disease
for dx in ndd_list:
    df = ndd_sigeQTL.query(f'Disease == "{dx}"')
    num = len(df.Gene.unique())
    print(f'There are {num} genes in {dx} eQTLs')

There are 39 genes in AD eQTLs
There are 0 genes in ALS eQTLs
There are 0 genes in FTD eQTLs
There are 1 genes in LBD eQTLs
There are 36 genes in PD eQTLs
There are 8 genes in PSP eQTLs


In [44]:
# find how many hits replicated using all multiancestry gene
# use list of all multiancestry eqtls(regardless of significance) to find any replicated hits in significant NDDs
for dx in ndd_list:
    eqtl_tmp = ndd_sigeQTL.query(f'Disease == "{dx}"')
    
    eqtl = eqtl_tmp.query("Gene == @ma_genes")
    
    num = len(eqtl.Gene.unique())
    print(f'There are {num} replicated multiancestry genes in {dx} eQTLs')

There are 19 replicated multiancestry genes in AD eQTLs
There are 0 replicated multiancestry genes in ALS eQTLs
There are 0 replicated multiancestry genes in FTD eQTLs
There are 0 replicated multiancestry genes in LBD eQTLs
There are 14 replicated multiancestry genes in PD eQTLs
There are 5 replicated multiancestry genes in PSP eQTLs


## 4. Tier Nomination <a id='tier'></a>

In [45]:
# read in therapeutic drug data - from Finan et al and DGidb
drugs_df = pd.read_csv('/../omicSynth/drug_genome_dgidb.csv', sep = ',')
# drop nans in gene_name
drugs_df = drugs_df.query('gene_name != "nan"')

# fill in any NaN bc theyre annoying
drugs_df.drug_concept_id = drugs_df.drug_concept_id.fillna('none')

# clean chemblid col since we need
drugs_df['chemblid'] = drugs_df.drug_concept_id.apply(lambda x: str(x.split(':')[1]) if ':'in x else x)


# remove any rows that do not have chembl id
drugs_df_red = drugs_df.query('chemblid != "none"')

drugs_df_red['drug_claim_primary_name'] = drugs_df_red['drug_claim_primary_name'].astype('str')

drugs_df_red['drug_claim_primary_name'] = drugs_df_red['drug_claim_primary_name'].apply(lambda x: x.lower())

In [65]:
# pull genes that have medications
drugs_df_no = drugs_df.query('drug_name ! = "none"')

# list of unique gene targets from drug data
thera_genes = list(drugs_df_no['gene_name'].unique())

drugs_df_no

Unnamed: 0,gene_name,gene_claim_name,entrez_id,interaction_claim_source,interaction_types,drug_claim_name,drug_claim_primary_name,drug_name,drug_concept_id,interaction_group_score,...,chr_b37,start_b37,end_b37,strand,description,no_of_gwas_regions,small_mol_druggable,bio_druggable,adme_gene,chemblid
0,CDK7,CDK7,1022.0,CancerCommons,inhibitor,SNS-032,SNS-032,BMS-387032,chembl:CHEMBL296468,0.82,...,5,68530668.0,68573250.0,1.0,cyclin-dependent kinase 7 [Source:HGNC Symbol;...,0.0,Y,N,N,CHEMBL296468
1,VDR,VDR,7421.0,DTC,,NIFEKALANT,NIFEKALANT,NIFEKALANT,chembl:CHEMBL360861,0.14,...,12,48235320.0,48336831.0,-1.0,"vitamin D (1,25- dihydroxyvitamin D3) receptor...",0.0,Y,N,N,CHEMBL360861
2,GPR55,GPR55,9290.0,DTC,,BISACODYL,BISACODYL,BISACODYL,chembl:CHEMBL942,0.94,...,2,231772033.0,231825781.0,-1.0,G protein-coupled receptor 55 [Source:HGNC Sym...,0.0,Y,N,N,CHEMBL942
3,NR1H4,NR1H4,9971.0,DTC,,PENTACHLOROPHENOL,PENTACHLOROPHENOL,PENTACHLOROPHENOL,chembl:CHEMBL75967,0.07,...,12,100867486.0,100958191.0,1.0,"nuclear receptor subfamily 1, group H, member ...",0.0,Y,N,N,CHEMBL75967
4,FGFR2,FGFR2,2263.0,JAX-CKB,,AZ6089,AZ6089,,none,,...,10,123237848.0,123357972.0,-1.0,fibroblast growth factor receptor 2 [Source:HG...,10.0,Y,Y,N,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85455,PIK3C3,PIK3C3,5289.0,MyCancerGenome,inhibitor,BGJ398,BGJ398,INFIGRATINIB,chembl:CHEMBL1852688,0.17,...,18,39535171.0,39667794.0,1.0,"phosphatidylinositol 3-kinase, catalytic subun...",0.0,Y,N,N,CHEMBL1852688
85456,ABL1,Tyrosine-protein kinase ABL1,25.0,TTD,,Azaindole derivative 2,Azaindole derivative 2,,none,,...,9,133589333.0,133763062.0,1.0,"c-abl oncogene 1, non-receptor tyrosine kinase...",0.0,Y,N,N,none
85457,CACNA1D,776,776.0,GuideToPharmacology,inhibitor,135651166,[3H](+)-ISRADIPINE,,none,,...,3,53528683.0,53847760.0,1.0,"calcium channel, voltage-dependent, L type, al...",2.0,Y,N,N,none
85458,PRKCA,PRKCA,5578.0,DTC,,RESVERATROL,RESVERATROL,RESVERATROL,chembl:CHEMBL165,0.12,...,17,64298754.0,64806861.0,1.0,"protein kinase C, alpha [Source:HGNC Symbol;Ac...",3.0,Y,N,N,CHEMBL165


In [46]:
# extract all genes that meet significance, not just top single gene ( only need to run this cell if you didn't run it in sections before)
ndd_adj = extract_topsig_genes2(ndd_df_pc)

# number of unique genes that are significant at p < 0.05/16875 & p_HEIDI > 0.01
print(len(ndd_adj.Gene.unique()))

### 4.1 Tier 1 - Novel Genes Multi Ancestry p_SMR_multi < 0.05/16875 and p_HEIDI > 0.01<a id='eqtl4.1'></a>

In [48]:
# split data frame into therapeutic vs non-therapeutic
therapeutic_genes = ndd_adj.query('Gene == @thera_genes')
nonthera_genes = ndd_adj.query('Gene != @thera_genes')

# split by multiancestry
# filter for all non-multiancestry eQTLs
ndd_other = therapeutic_genes.query("Omic != 'multiancestry'")

# filter for multiancestry
ndd_ma = therapeutic_genes.query("Omic == 'multiancestry'")

# extract list of genes from multiancestry hits to compare against all other eQTLs
ma_genes = list(ndd_ma['Gene'].unique())
print(len(ma_genes))

other_genes = list(ndd_other.Gene.unique())
print(len(other_genes))

common_genes = []

for x in ma_genes:
    if x in other_genes:
        common_genes.append(x)
len(common_genes)

1
44


1

In [49]:
# pull any hits from all other eqtls that passed inital filter
merge_hits_other = ndd_other.query("Gene == @common_genes")
merge_hits_ma = ndd_ma.query("Gene == @common_genes")

# combine ma hits df and merge_hits
merged_hits_all = pd.concat([merge_hits_other, merge_hits_ma])


# merge results with drug data
merged_drugs = merged_hits_all.merge(drugs_df[['gene_name', 'interaction_types', 'drug_claim_name', 'drug_claim_primary_name', 'drug_name', 'chemblid']], left_on = 'Gene', right_on = 'gene_name')

# sig genes replicated in MA
len(merged_drugs.Gene.unique())

1

In [50]:
# find drugs with chemblid's and export for opentargets API
chembl_drugs = merged_drugs.query('chemblid != "none"')

# export df
chembl_drugs.to_csv('sig_genes_ma_chemblthresh2.csv', index = None)

# find drugs with no chemblid's and export 
nochembl_drugs = merged_drugs.query('chemblid == "none"')

# export df
nochembl_drugs.to_csv('sig_genes_ma_nochemblthresh2.csv', index = None)

### Filter for genes that have no known therapeutic drug

In [59]:
# list of genes with drugs 
drug_genes = list(drugs_df_no['gene_name'].unique())

# filter SMR hits identified against ones with known thera
t1_novel = merged_hits_all.query('Gene != @drug_genes')

t1_novel.shape

(0, 10)

### 4.2 T1 Novel - all Significant hits regardless of multiancestry replication

In [66]:
# split data frame into therapeutic vs non-therapeutic
therapeutic_genes = ndd_adj.query('Gene == @thera_genes')
nonthera_genes = ndd_adj.query('Gene != @thera_genes')

len(therapeutic_genes.Gene.unique())

# list of genes with drugs 
drug_genes = list(drugs_df['gene_name'].unique())
# filter SMR hits identified against ones with known thera
t1_novel = therapeutic_genes.query('Gene != @drug_genes')

t1_novel

Unnamed: 0,Omic,Disease,Gene,probeID,topRSID,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI


In [67]:
# merge thera genes on drug data
thera_drug_data = therapeutic_genes.merge(drugs_df[['gene_name', 'interaction_types', 'drug_claim_name', 'drug_claim_primary_name', 'drug_name', 'chemblid']], left_on = 'Gene', right_on = 'gene_name')

# find drugs with chemblid's and export for opentargets API
chembl_drugs_all = thera_drug_data.query('chemblid != "none"')

len(chembl_drugs_all.Gene.unique())

38

In [68]:
# export df
chembl_drugs_all.to_csv('all_sig_genes_chemblthresh2.csv', index = None)

# find drugs with no chemblid's and export
nochembl_drugs_all = thera_drug_data.query('chemblid == "none"')
nochembl_drugs_all

len(nochembl_drugs_all.Gene.unique())

# export df
nochembl_drugs_all.to_csv('all_sig_genes_nochemblthresh2.csv', index = None)

### Tier 3 - No Therapeutic Status

In [69]:
nonthera_genes

Unnamed: 0,Omic,Disease,Gene,probeID,topRSID,b_SMR,se_SMR,p_SMR,p_SMR_multi,p_HEIDI
271,Cerebellum_metaBrain,AD,GPC2,ENSG00000213420,rs17309333,-0.005518,0.018258,7.624718e-01,8.862247e-07,0.272504
2657,Cerebellum_metaBrain,AD,LRRC37A,ENSG00000176681,rs2696466,-0.065755,0.011450,9.315664e-09,2.127122e-06,0.519064
2658,Cerebellum_metaBrain,AD,ARL17B,ENSG00000228696,rs7225002,-0.062603,0.010536,2.822204e-09,4.203789e-07,0.593753
2875,Cerebellum_metaBrain,AD,INO80E,ENSG00000169592,rs9932196,0.077817,0.014521,8.370481e-08,8.807658e-07,0.198869
3736,Cerebellum_metaBrain,AD,SNX31,ENSG00000174226,rs1693568,0.067412,0.013318,4.153519e-07,4.685159e-07,0.716067
...,...,...,...,...,...,...,...,...,...,...
1710357,Brain_Cerebellum,PD,ARL17A,ENSG00000185829,rs55974014,-0.226764,0.029954,3.718993e-14,2.936298e-12,0.209266
1716109,Brain_Nucleus_accumbens_basal_ganglia,AD,PVRIG,ENSG00000213413,rs7811662,-0.097927,0.020056,1.045882e-06,5.477972e-07,0.162387
1717384,Brain_Nucleus_accumbens_basal_ganglia,AD,LRRC37A2,ENSG00000238083,rs413917,-0.043294,0.008585,4.586552e-07,6.966281e-07,0.484854
1730055,Brain_Nucleus_accumbens_basal_ganglia,PD,ARHGAP27,ENSG00000159314,rs753236,-0.569009,0.089788,2.339136e-10,3.652036e-10,0.945530


In [70]:
# number of unique genes
len(nonthera_genes.Gene.unique())

115