## ATC_FDA Protein Target Drug-Set Library
### Drug-set labels: Entrez Gene Symbols
#### ALL DATABASES ACCESSED 05/20/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import csv
import numpy as np

### Matching ATC_FDA drug names to UniProt IDs
#### Input File : approved_drugtargets.csv (http://www.drugbank.ca)

In [2]:
# Import the protein-ID and drug-ID columns from the DrugBank target table #
uniprot_ids = []
drugbank_ids = []

# `with` closes the file handle even if a row fails to parse. newline='' is
# what the csv module asks for so that quoted fields containing line breaks
# are read correctly.
with open('input/approved_drugtargets.csv', newline='') as proteins:
    proteins_csv = csv.reader(proteins)
    # Skip the header row; the default of None avoids an error on an empty file.
    next(proteins_csv, None)
    for row in proteins_csv:
        uniprot_ids.append(row[5])    # sixth column: stored as the UniProt ID
        drugbank_ids.append(row[-1])  # last column: stored as the drug ID string

In [3]:
# Sanity check: print the number of entries read into each column list #
for column in (uniprot_ids, drugbank_ids):
    print(len(column))

2908
2908


In [36]:
# De-duplicate the UniProt IDs. Sorting makes the order reproducible between
# runs (plain set iteration order depends on string hash randomization), which
# in turn fixes the order of anything that iterates over this list.
unique_uniprot_ids = sorted(set(uniprot_ids))
len(unique_uniprot_ids)

2694

### Querying UniProt IDs through BioDBnet API to Retrieve Entrez Gene Symbols
#### Resource Accessed : https://biodbnet-abcc.ncifcrf.gov/db/dbFind.php

In [38]:
# Using biodbnet API to query Uniprot IDs and convert them to Entrez Gene Symbols #
BIODBNET_URL = 'https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json'


def _lookup_gene_symbol(uniprot_id):
    """Return the 'Gene Symbol' reported for *uniprot_id*, or None.

    None is returned when the request fails, the body is not valid JSON, the
    payload is not a non-empty list of records, the first record contains a
    '-' value (the condition this notebook treats as "no match"), or the
    record has no 'Gene Symbol' key.
    """
    try:
        response = requests.get(
            BIODBNET_URL,
            params={
                'method': 'dbfind',
                'inputValues': str(uniprot_id),
                'output': 'genesymbol',
                'format': 'row',
            },
            # Without a timeout a stalled connection would block the whole run.
            timeout=30,
        )
        # Parse once; .json() raises ValueError on a non-JSON body.
        records = response.json()
    except (requests.RequestException, ValueError):
        return None

    if not isinstance(records, list) or not records:
        return None
    record = records[0]
    if not isinstance(record, dict) or '-' in record.values():
        return None
    return record.get('Gene Symbol')


failed = 0            # number of IDs that could not be converted
failed_list = []      # the IDs themselves, in query order
protein_dict = {}     # UniProt ID -> gene symbol

for protein in unique_uniprot_ids:
    gene_id = _lookup_gene_symbol(protein)
    if gene_id is not None:
        protein_dict[protein] = gene_id
    else:
        # A failed request is recorded for this ID instead of silently
        # reusing the previous iteration's response.
        failed += 1
        failed_list.append(protein)
    time.sleep(0.2)  # pause between requests

print(failed)

42


In [55]:
# Export failed list as TXT file, one ID per line #
# The context manager flushes and closes the file even if a write raises.
with open('input/failed_uniprotids.txt', 'w') as dataFile:
    dataFile.writelines("%s\n" % eachitem for eachitem in failed_list)

In [39]:
# Display the number of entries in protein_dict (UniProt IDs that were given a gene symbol) #
len(protein_dict)

2652

In [40]:
# Collect every gene symbol stored as a value in protein_dict (duplicates kept) #
entrez_symbols = list(protein_dict.values())

### Creating Drug-Set Library of Entrez Gene Symbols matched to sets of corresponding drugs 

In [41]:
# Importing Drugbank accession numbers and corresponding drug names #
atc_fda_drugbank_ids = []
atc_fda = []

# `with` closes the file handle even if a row fails to parse. newline='' is
# what the csv module asks for so that quoted fields containing line breaks
# are read correctly.
with open('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/ATC_FDA/atc_fda_attributes.csv', newline='') as attributes:
    attributes_csv = csv.reader(attributes)
    # Skip the header row; the default of None avoids an error on an empty file.
    next(attributes_csv, None)
    for row in attributes_csv:
        atc_fda_drugbank_ids.append(row[-1])  # last column: accession number
        atc_fda.append(row[1].lower())        # second column: name, lower-cased

In [42]:
# Map each DrugBank accession number to its lower-cased ATC_FDA drug name #
# (if an accession number repeats, the last name seen is the one kept)
atc_fda_dict = {
    accession: drug_name
    for accession, drug_name in zip(atc_fda_drugbank_ids, atc_fda)
}

In [43]:
# Display the number of distinct DrugBank accession numbers in atc_fda_dict #
len(atc_fda_dict)

1834

In [44]:
# The same protein ID can appear on several rows of the input file, each with its own drug-ID string #
# Pair every protein ID with its row's drug-ID string, then collect all strings for a protein under one key #

# NOTE: despite its name, id_dict is a tuple of (protein ID, drug-ID string) pairs.
id_dict = tuple(zip(uniprot_ids, drugbank_ids))

drugsetlibrary = defaultdict(list)
for uniprot_id, drug_id_string in id_dict:
    drugsetlibrary[uniprot_id].append(drug_id_string)

In [45]:
# Splitting every "; "-separated string and flattening the pieces into one list per key #
# Each key may hold several strings (one per input row). All of them are
# split and concatenated; assigning the key once per string would keep only
# the last row's drugs, because a dict comprehension overwrites repeated keys.
drugsetlibrary = {
    uniprot_id: [
        accession
        for drug_id_string in drug_id_strings
        for accession in drug_id_string.split("; ")
    ]
    for uniprot_id, drug_id_strings in drugsetlibrary.items()
}

In [46]:
# Swap each DrugBank accession number for its drug name where one is known #
# (accession numbers missing from atc_fda_dict are left unchanged)
_named_library = {}
for target, accessions in drugsetlibrary.items():
    _named_library[target] = [atc_fda_dict.get(acc, acc) for acc in accessions]
drugsetlibrary = _named_library

In [48]:
# Replacing every UniProt ID with an Entrez Gene Symbol #
# Several UniProt IDs can map to the same gene symbol. Their drug lists are
# merged under that symbol rather than letting the last one overwrite the
# others. IDs without a symbol in protein_dict keep their original key.
_by_symbol = {}
for uniprot_id, drugs in drugsetlibrary.items():
    symbol = protein_dict.get(uniprot_id, uniprot_id)
    # setdefault creates a fresh list, so the input lists are never mutated.
    _by_symbol.setdefault(symbol, []).extend(drugs)
drugsetlibrary = _by_symbol

In [49]:
# Retaining only matched drug names and removing unpaired Drugbank accession numbers #
# Build the set of known drug names once instead of once per key.
_known_drug_names = set(atc_fda)
# The intersection also removes duplicates. Sorting gives every drug list a
# reproducible order (set iteration order can differ between runs).
drugsetlibrary = {
    key: sorted(set(drugs) & _known_drug_names)
    for key, drugs in drugsetlibrary.items()
}

In [50]:
# Removing Uniprot IDs not matched to an Entrez Gene Symbol #
# Membership is tested against a set built once; testing against the
# entrez_symbols list would scan the whole list for every key.
_entrez_symbol_set = set(entrez_symbols)
drugsetlibrary = {
    key: drugs
    for key, drugs in drugsetlibrary.items()
    if key in _entrez_symbol_set
}

In [51]:
# Drop every key whose drug list is empty #
_non_empty = {}
for symbol, drugs in drugsetlibrary.items():
    if drugs:
        _non_empty[symbol] = drugs
drugsetlibrary = _non_empty

In [52]:
# Display the number of keys (drug sets) remaining in the library #
len(drugsetlibrary)

1811

In [54]:
# Write the library to CSV: one row per key, with its drugs joined by commas #
def _join_drugs(row):
    """Join the non-missing cells of one row into a comma-separated string."""
    # Rows are padded with missing values up to the longest drug list;
    # dropna() removes that padding before joining.
    return ','.join(row.dropna().astype(str))


# One row per key, one column per list position.
df = pd.DataFrame.from_dict(drugsetlibrary, orient='index')
# Collapse each row to a single string; df is a Series from here on.
df = df.apply(_join_drugs, axis=1)
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/ATC_FDA/atc_fda_target_drugsetlibrary.csv')