# Annotated Gene Networks via Open Targets
1. Obtain network partners for Tier 1/ Novel Gene targets

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore')

In [2]:
def ot_query(query_string, ensg, base_url):
    # Build query string to get general information about AR and genetic constraint and tractability assessments 
    # Set variables object of arguments to be passed to endpoint
    variables = {"ensgId": ensg, "sourceDatabase": "signor"}
    
    # Perform POST request and check status code of response
    r = requests.post(base_url, json={"query": query_string, "variables": variables})
    
    
    # Transform API response from JSON into Python dictionary and print in console
    api_response = json.loads(r.text)
    
    if r.status_code != 200:
        return r.status_code
    else:
        return api_response

## 1. Load in Novel Gene data

In [3]:
novel_hits = pd.read_csv('tier1hits_all_thresh2.csv')

# remove MAPT, ADORA2B, KCNN4 which are tier 2 gene targets
rem = ['MAPT', 'ADORA2B', 'KCNN4']

hits = novel_hits.query('Gene != @rem')

In [4]:
# number of unique genes in tier 1 hits 
len(hits.Gene.unique())

41

In [5]:
# number of genes we have ENSG IDs for
len(hits[hits.probeID.str.contains('ENSG')].Gene.unique())

27

In [6]:
# need to hand annotate ensgIDs for missing genes 
need_ensg = list(set(hits.Gene.unique()) - set(hits[hits.probeID.str.contains('ENSG')].Gene.unique())) # checking for any genes that may have ensg but also cg probe id
need_ensg

['CLU',
 'MICB',
 'MS4A2',
 'EPHX2',
 'FBXL19',
 'MAT1A',
 'PSORS1C1',
 'CDSN',
 'SLC44A4',
 'STX1B',
 'STK39',
 'POU5F1',
 'PSMC3',
 'EPHA1']

In [7]:
ensg_gene = {'ENSG00000146904': 'EPHA1',
 'ENSG00000099365': 'STX1B',
 'ENSG00000165916': 'PSMC3',
 'ENSG00000204531': 'POU5F1',
 'ENSG00000204539': 'CDSN',
 'ENSG00000204516': 'MICB',
 'ENSG00000198648': 'STK39',
 'ENSG00000149534': 'MS4A2',
 'ENSG00000120885': 'CLU',
 'ENSG00000204540': 'PSORS1C1',
 'ENSG00000120915': 'EPHX2',
 'ENSG00000204385': 'SLC44A4',
 'ENSG00000151224': 'MAT1A',
 'ENSG00000099364': 'FBXL19'}

In [8]:
# df that only has genes with ensgids
ensg_hits = hits[hits.probeID.str.contains('ENSG')]

# create dictionary for ensg we already have
hits_ensg = dict(zip(ensg_hits['probeID'], ensg_hits['Gene']))

ensg_gene_dict = {**hits_ensg, **ensg_gene}

In [9]:
# query to get list of approved diseases where drug can be used to treat
query_string = """
     query InteractionsSectionQuery(
  $ensgId: String!
  $sourceDatabase: String
  $index: Int = 0
  $size: Int = 10
) {
  target(ensemblId: $ensgId) {
    id
    approvedName
    approvedSymbol

    interactions(
      sourceDatabase: $sourceDatabase
      page: { index: $index, size: $size }
    ) {
      count
      rows {
        intB
        targetB {
          id
          approvedSymbol
        }
      }
    }
  }
}
    """

# Set base URL of GraphQL API endpoint
base_url = "https://api.platform.opentargets.org/api/v4/graphql"

In [10]:
gene_net_dict = {}
g_ensg = [] # list to hold all_network partners gene symbols
g_ensg_id = [] # list to hold all_network partners gene ENSG ID
for eid, gene in ensg_gene_dict.items():
    # create list to hold any network partners
    gene_part = []
    # run ot query
    out = ot_query(query_string, eid, base_url)
    if isinstance(out, int):
        print('something broke')  
    elif isinstance(out, dict):
        if isinstance(out['data']['target']['interactions'], type(None)):
            gene_part.append('No curated network partners')
        else:
            # number of curated network partners
            num_dx = len(out['data']['target']['interactions']['rows'])
            # get symbol names + ensgids
            for x in range(num_dx):
                symbol = out['data']['target']['interactions']['rows'][x]['targetB']
                if isinstance(symbol, type(None)):
                    continue
                else:
                    gene_part.append(symbol['approvedSymbol'])
                    g_ensg.append(symbol['approvedSymbol'])
                    g_ensg_id.append(symbol['id'])
    # remove duplicates
    partners = list(set(gene_part))
    # turn list into string
    partners_string = ', '.join(partners)
    # add all gene symbols to dictionary for gene
    gene_net_dict[gene] = partners_string
                                     

In [11]:
gene_net_dict

{'CR1': 'C5AR2, C5AR1',
 'ACE': 'AGT, CSNK2A1',
 'GPNMB': 'No curated network partners',
 'STAG3': 'No curated network partners',
 'ADAM10': 'ELAVL4, ELAVL3, NOTCH1, CD44, CDH1, ERBB2, ELAVL1, SP1, ELAVL2, EGF',
 'SNCA': 'PRKN, PLK1, PPP2CA, GRK2, PPP2CB, PLK3, SYK, CSNK1A1, CADPS2, DYRK1A',
 'GAK': 'SRC',
 'CRHR1': 'GNAI1, GNAQ, CRH, GNAS, POMC',
 'CHRNE': 'No curated network partners',
 'INPP5D': 'GRB2, PLCG2, TRAF6, SYK, ARRB2, SHC1',
 'NDUFS2': 'No curated network partners',
 'STX4': 'STXBP4',
 'EGFR': 'GALNT8, PTPRJ, HGS, LRRFIP1, ADAM17, SOX2, MAPK3, CAMK2A, MUC1, NCK1',
 'APH1B': 'PSEN1, NCSTN, PSENEN, PSEN2',
 'CD38': 'OXT, PECAM1',
 'FCER1G': 'No curated network partners',
 'VKORC1': 'No curated network partners',
 'EPHB4': 'EFNB1',
 'DNTT': 'No curated network partners',
 'MINK1': 'KRAS, KCNH2, PRICKLE1',
 'RABEP1': 'RABGEF1, RAB5A',
 'CKM': 'No curated network partners',
 'ERCC2': 'ERCC3, ERCC5',
 'KAT8': 'KMT2A, FASN',
 'HSD3B7': 'No curated network partners',
 'BST1': 'No 

In [12]:
# create dict for gene partners and their ensg_ids
net_gene_ensg = dict(zip(g_ensg, g_ensg_id)) # dictionary to hold ensg ids for any partner genes

In [13]:
table2 = pd.DataFrame()
gene_dx_list = [] # list to hold diseases associated with a gene
gene_omic_list = [] # list to hold omics associated with a gene
omic_num_list = [] # list to hold number of omics associated with a gene


for g in ensg_gene_dict.values():
    df = hits.query(f'Gene == "{g}"')
    
    # get list of diseases associated with the gene
    dx_list = list(df.Disease.unique())
    
    # turn list into string
    dx_string = ', '.join(dx_list)
    
    # append strign to list that will be used as column in df
    gene_dx_list.append(dx_string)
    
    # find omics that each gene is associated with
    tx_list = list(df.Omic.unique())
    
    # turn list into string
    tx_string = ', '.join(tx_list)
    
    # append strign to list that will be used as column in df
    gene_omic_list.append(tx_string)
    
    # append number of omics per gene to list
    omic_num_list.append(len(tx_list))
    
    
table2['Gene'] = list(ensg_gene_dict.values())
table2['Diseases'] = gene_dx_list
table2['Omics'] = gene_omic_list
table2['Number of Omics'] = omic_num_list

In [14]:
# add network partners
table2['Network Partners'] = table2.Gene.map(gene_net_dict)

In [15]:
# add number of network partners
gene_num_list = [] # list to hold number of netowrk partners associated with a gene

for ix, row in table2.iterrows():
    # turn network genes column entry into list from string
    val_list = row['Network Partners'].split(',')
    val_list_clean = [x.strip() for x in val_list]
    if 'No curated network partners' in val_list_clean:
        gene_num_list.append(0)
    else:
        # append number of network partners per gene to list
        gene_num_list.append(len(val_list_clean))
table2['Number of Network Partners'] = gene_num_list

In [16]:
table2

Unnamed: 0,Gene,Diseases,Omics,Number of Omics,Network Partners,Number of Network Partners
0,CR1,AD,"Cerebellum_metaBrain, Basalganglia_metaBrain, ...",6,"C5AR2, C5AR1",2
1,ACE,AD,"Cerebellum_metaBrain, blood_mcrae, Cortex_meta...",6,"AGT, CSNK2A1",2
2,GPNMB,PD,"Cerebellum_metaBrain, brain_mMeta, Cortex_meta...",14,No curated network partners,0
3,STAG3,AD,"brain_mMeta, blood_mcrae, Cortex_metaBrain, Br...",14,No curated network partners,0
4,ADAM10,AD,"brain_mMeta, blood_eQTLgen",2,"ELAVL4, ELAVL3, NOTCH1, CD44, CDH1, ERBB2, ELA...",10
5,SNCA,LBD,"brain_mMeta, blood_mcrae, blood_eQTLgen",3,"PRKN, PLK1, PPP2CA, GRK2, PPP2CB, PLK3, SYK, C...",10
6,GAK,PD,"brain_mMeta, brain_eMeta",2,SRC,1
7,CRHR1,"PD, PSP, AD","brain_mMeta, blood_mcrae, Cortex_metaBrain, Mu...",4,"GNAI1, GNAQ, CRH, GNAS, POMC",5
8,CHRNE,AD,"blood_mcrae, Brain_Cortex",2,No curated network partners,0
9,INPP5D,AD,"blood_mcrae, blood_eQTLgen, Whole_Blood",3,"GRB2, PLCG2, TRAF6, SYK, ARRB2, SHC1",6


In [19]:
# export
table2.to_csv('manuscript_table3.csv', index = None)

In [18]:
# get list of all genes in order to see if any are therapeutic
net_genes = [x for x in gene_net_dict.values()]
net_genes

['C5AR1, C5AR2',
 'AGT, CSNK2A1',
 'No curated network partners',
 'No curated network partners',
 'CD44, EGF, SP1, CDH1, ELAVL4, ERBB2, NOTCH1, ELAVL3, ELAVL1, ELAVL2',
 'PRKN, CADPS2, GRK2, SYK, CSNK1A1, PLK1, PLK3, DYRK1A, PPP2CB, PPP2CA',
 'SRC',
 'GNAS, CRH, GNAI1, POMC, GNAQ',
 'No curated network partners',
 'SYK, ARRB2, SHC1, TRAF6, PLCG2, GRB2',
 'No curated network partners',
 'STXBP4',
 'NCK1, ADAM17, MUC1, LRRFIP1, MAPK3, SOX2, HGS, PTPRJ, CAMK2A, GALNT8',
 'NCSTN, PSENEN, PSEN1, PSEN2',
 'PECAM1, OXT',
 'No curated network partners',
 'No curated network partners',
 'EFNB1',
 'No curated network partners',
 'KRAS, PRICKLE1, KCNH2',
 'RABGEF1, RAB5A',
 'No curated network partners',
 'ERCC5, ERCC3',
 'FASN, KMT2A',
 'No curated network partners',
 'No curated network partners',
 'ICAM1, FGA',
 'EFNA1',
 'No curated network partners',
 'No curated network partners',
 'HLX, MAPK1, DKK1, AKT1, EOMES, NANOG, MAPK3, THY1, LEFTY2, AKT2',
 'No curated network partners',
 'No curat

In [19]:
net_genes2 = [x.split(',') for x in net_genes]
net_genes3 = [x.strip() for sub in net_genes2 for x in sub]

In [20]:
# list of unique network partner genes
final_net_genes = list(set(net_genes3))

# how many netowrk partners overall
len(final_net_genes)

87

In [21]:
# read in therapeutic drug data - from Finan et al and DGidb
drugs_df = pd.read_csv('/data/CARD_AA/projects/omicSynth/v8/analysis/drug_genome_dgidb.csv', sep = ',')

# ensure gene_names are all clean
drugs_df.gene_name = drugs_df.gene_name.astype('str') # treat each anme as string even if all numbers
drugs_df.gene_name = drugs_df.gene_name.apply(lambda x: x.strip()) # strip any leading/trailing white space

# drop nans in gene_name
drugs_df = drugs_df.query('gene_name != "nan"')

# fill in any NaN bc theyre annoying
drugs_df.drug_concept_id = drugs_df.drug_concept_id.fillna('none')

# clean chemblid col since we need
drugs_df['chemblid'] = drugs_df.drug_concept_id.apply(lambda x: str(x.split(':')[1]) if ':'in x else x)

# remove any rows that do not have chembl id
drugs_df_red = drugs_df.query('chemblid != "none"')

drugs_df_red['drug_claim_primary_name'] = drugs_df_red['drug_claim_primary_name'].astype('str')

drugs_df_red['drug_claim_primary_name'] = drugs_df_red['drug_claim_primary_name'].apply(lambda x: x.lower())

In [22]:
# list of unique gene targets from drug data
thera_genes = list(drugs_df['gene_name'].unique())

In [23]:
# list of network genes that are therapeutic
net_genes_thera = [x for x in final_net_genes if x in thera_genes]
net_genes_thera

['ELAVL1',
 'CRH',
 'ARRB2',
 'CD44',
 'MAPK1',
 'MUC1',
 'DKK1',
 'KMT2A',
 'GRK2',
 'PECAM1',
 'PSEN1',
 'SLC12A1',
 'MAPK14',
 'CAMK2A',
 'AGT',
 'GRB2',
 'SYK',
 'CDH1',
 'PLK3',
 'ERCC3',
 'CSNK2A1',
 'FASN',
 'RAB5A',
 'NCSTN',
 'NOTCH1',
 'CFTR',
 'PLCG2',
 'ERCC5',
 'SLC12A2',
 'THY1',
 'ICAM1',
 'GDNF',
 'DYRK1A',
 'KCNH2',
 'EGF',
 'ADAM17',
 'PSEN2',
 'AKT2',
 'SLC12A3',
 'OXT',
 'C5AR1',
 'MAPK3',
 'GNAQ',
 'NCK1',
 'SP1',
 'AKT1',
 'ERBB2',
 'SRC',
 'LRP2',
 'C5AR2',
 'KRAS',
 'PSENEN',
 'FGA',
 'CSNK1A1',
 'PLK1',
 'GNAS',
 'WNK1',
 'PPP2CA']

In [24]:
len(net_genes_thera)

58

In [25]:
drug_gene_partners = {}
for index,row in table2.iterrows():
    # pull gene name
    g_name = row['Gene']
    # turn network genes column entry into list from string
    val_list = row['Network Partners'].split(',')
    val_list_clean = [x.strip() for x in val_list]
    # create list to hold any there net genes associated with novel gene
    asc_genes = []
    
    # iterate through therapeutic network genes
    for gene in net_genes_thera:
        
        # check if ther net gene is in val_list
        if gene in val_list_clean:
            asc_genes.append(gene)
    # once checked all ther net genes append associated gene list to dict for novel gene it belongs to
    drug_gene_partners[g_name] = asc_genes

In [26]:
drug_gene_partners

{'CR1': ['C5AR1', 'C5AR2'],
 'ACE': ['AGT', 'CSNK2A1'],
 'GPNMB': [],
 'STAG3': [],
 'ADAM10': ['ELAVL1', 'CD44', 'CDH1', 'NOTCH1', 'EGF', 'SP1', 'ERBB2'],
 'SNCA': ['GRK2', 'SYK', 'PLK3', 'DYRK1A', 'CSNK1A1', 'PLK1', 'PPP2CA'],
 'GAK': ['SRC'],
 'CRHR1': ['CRH', 'GNAQ', 'GNAS'],
 'CHRNE': [],
 'INPP5D': ['ARRB2', 'GRB2', 'SYK', 'PLCG2'],
 'NDUFS2': [],
 'STX4': [],
 'EGFR': ['MUC1', 'CAMK2A', 'ADAM17', 'MAPK3', 'NCK1'],
 'APH1B': ['PSEN1', 'NCSTN', 'PSEN2', 'PSENEN'],
 'CD38': ['PECAM1', 'OXT'],
 'FCER1G': [],
 'VKORC1': [],
 'EPHB4': [],
 'DNTT': [],
 'MINK1': ['KCNH2', 'KRAS'],
 'RABEP1': ['RAB5A'],
 'CKM': [],
 'ERCC2': ['ERCC3', 'ERCC5'],
 'KAT8': ['KMT2A', 'FASN'],
 'HSD3B7': [],
 'BST1': [],
 'ITGAX': ['ICAM1', 'FGA'],
 'EPHA1': [],
 'STX1B': [],
 'PSMC3': [],
 'POU5F1': ['MAPK1', 'DKK1', 'THY1', 'AKT2', 'MAPK3', 'AKT1'],
 'CDSN': [],
 'MICB': [],
 'STK39': ['SLC12A1', 'MAPK14', 'CFTR', 'SLC12A2', 'SLC12A3', 'WNK1'],
 'MS4A2': [],
 'CLU': ['GDNF', 'LRP2'],
 'PSORS1C1': [],
 'EPH

In [27]:
# genes with no therapeutic gene partners
no_partners = list({k: v for k, v in drug_gene_partners.items() if not v}.keys())

# genes with therapeutic gene partners
thera_partners = {k: v for k, v in drug_gene_partners.items() if v}

In [28]:
# how many of t1/ novel genes have therapeutic partner genes
len(thera_partners)

18

In [29]:
# use therapeutic gene partners list to see what drugs target them
len(net_genes_thera)

58

In [30]:
# list of ensg to search from therapeutic netowrk partners
ensg_search =[]
for g in net_genes_thera:
    ensg_search.append(net_gene_ensg[g])
#ensg_search

In [31]:
# query to get list of approved diseases where drug can be used to treat
query_string = """
     query InteractionsSectionQuery(
  $ensgId: String!) {
  target(ensemblId: $ensgId) {
    id
    approvedSymbol
    knownDrugs{
      rows{
        drugId
        prefName
        disease {
          name
        }
      }
    }
  }
}
    """

In [32]:
pg_drugdx = {} # dictionary to hold gene and resulting drug-disease interactions
for ensg in ensg_search:
    out = ot_query(query_string,ensg, base_url)
    
    # list to hold any druggable gene partners drugs
    part_druggability = []
    if isinstance(out, int):
        print('something broke')
    
    elif isinstance(out, dict):
        if isinstance(out['data']['target']['knownDrugs'], type(None)):
            part_druggability.append('No known drugs')
        else:
            # number of curated network partners
            num_dx = len(out['data']['target']['knownDrugs']['rows'])
            
            # 
            for x in range(num_dx):
                drug_name = out['data']['target']['knownDrugs']['rows'][x]['prefName']
                
                if isinstance(drug_name, type(None)):
                    continue
                else:
                    # get disease name
                    dx_name = out['data']['target']['knownDrugs']['rows'][x]['disease']['name']
                    part_druggability.append(drug_name + '-' + dx_name)
                    
    # merge into string
    part_druggability = ', '.join(part_druggability)
    # combine into dict key:value pair
    pg_drugdx[ensg] = part_druggability

In [33]:
map_swap = {v: k for k, v in net_gene_ensg.items()}
df_pg_drug_dx = pd.DataFrame(pg_drugdx.items())
df_pg_drug_dx['Gene'] = df_pg_drug_dx[0].map(map_swap)
df_pg_drug_dx

Unnamed: 0,0,1,Gene
0,ENSG00000066044,No known drugs,ELAVL1
1,ENSG00000147571,No known drugs,CRH
2,ENSG00000141480,No known drugs,ARRB2
3,ENSG00000026508,BIVATUZUMAB-upper aerodigestive tract neoplasm...,CD44
4,ENSG00000100030,"ULIXERTINIB-Uveal Melanoma, TEMUTERKIB-cancer,...",MAPK1
5,ENSG00000185499,"TECEMOTIDE-breast cancer, HUHMFG1-peritoneum c...",MUC1
6,ENSG00000107984,"BHQ-880-multiple myeloma, DKN-01-biliary tract...",DKK1
7,ENSG00000118058,"SNDX-5613 FREE BASE-acute myeloid leukemia, SN...",KMT2A
8,ENSG00000173020,No known drugs,GRK2
9,ENSG00000261371,No known drugs,PECAM1


In [34]:
have_known = df_pg_drug_dx[df_pg_drug_dx[1] != 'No known drugs']
have_known

Unnamed: 0,0,1,Gene
3,ENSG00000026508,BIVATUZUMAB-upper aerodigestive tract neoplasm...,CD44
4,ENSG00000100030,"ULIXERTINIB-Uveal Melanoma, TEMUTERKIB-cancer,...",MAPK1
5,ENSG00000185499,"TECEMOTIDE-breast cancer, HUHMFG1-peritoneum c...",MUC1
6,ENSG00000107984,"BHQ-880-multiple myeloma, DKN-01-biliary tract...",DKK1
7,ENSG00000118058,"SNDX-5613 FREE BASE-acute myeloid leukemia, SN...",KMT2A
10,ENSG00000080815,"TARENFLURBIL-Alzheimer disease, TARENFLURBIL-A...",PSEN1
11,ENSG00000074803,"FUROSEMIDE-congestive heart failure, FUROSEMID...",SLC12A1
12,ENSG00000112062,"LOSMAPIMOD-acute coronary syndrome, ARRY-797-d...",MAPK14
14,ENSG00000135744,"GSK-2586881-COVID-19, GSK-2586881-pulmonary ar...",AGT
15,ENSG00000177885,PREXIGEBERSEN-acute myeloid leukemia,GRB2


In [36]:
#
dx_name_list =['Alzheimer disease', 'Dementia','Parkinson disease', 'Lewy body dementia', 'parkinsonian disorder', 'supranuclear palsy, progressive, 1']

In [37]:
thera_ndd_part = [] # list to hold genes that have NDD targeting drugs

for ix,row in have_known.iterrows():
    for dx in dx_name_list:
        if dx in row[1]:
            thera_ndd_part.append(row['Gene'] + '-' + dx)

In [38]:
thera_ndd_part

['PSEN1-Alzheimer disease',
 'PSEN1-Dementia',
 'MAPK14-Alzheimer disease',
 'NCSTN-Alzheimer disease',
 'NCSTN-Dementia',
 'PSEN2-Alzheimer disease',
 'PSEN2-Dementia',
 'PSENEN-Alzheimer disease',
 'PSENEN-Dementia']

## Liver Toxicity Companion Genes

In [18]:
# get list of all companion genes
comp_genes = list(net_gene_ensg.keys())
comp_genes[:5]

# load in SMR hits sig when p_SMR_multi < 0.05 to check if any network partners show up
sig5 = pd.read_csv('sig_noheidi_05.csv')

# look for any matches
sig_net_partner = sig5.query('Gene == @comp_genes ')

sig_net_partner.query('Omic == "Liver"')

In [28]:
sig_net_partner['Gene'].unique()

array(['GNAQ', 'EFNA1', 'PSEN2', 'ADAM17', 'EOMES', 'NCK1', 'GDNF',
       'CFTR', 'CADPS2', 'SYK', 'NOTCH1', 'WNK1', 'MAPK3', 'PLCG2',
       'ERBB2', 'CD44', 'PTPRJ', 'AKT1', 'AKT2', 'GNAS', 'DYRK1A', 'POMC',
       'SP1', 'AATK', 'SLC4A4', 'GALNT8', 'PRICKLE1', 'HGS', 'FASN',
       'SLC12A1', 'CAMK2A', 'MAPK14', 'PPP2CB', 'DKK1', 'THY1', 'GNAI1',
       'PSEN1', 'ERCC3', 'WNK4', 'EGF', 'OXT', 'C5AR2', 'SLC12A3',
       'SLC12A2', 'LRRFIP1', 'ICAM1', 'FGA', 'RAB5A', 'AGT', 'PLK1',
       'TRAF6', 'KRAS', 'ARRB2', 'STXBP4', 'SRC', 'CSNK1A1', 'SHC1',
       'MUC1', 'RABGEF1', 'GRB2', 'MAPK1', 'NCSTN'], dtype=object)