## DrugBank Experimental Drug Protein Target Drug-Set Library
### Drug-set labels: Entrez Gene Symbols
#### ALL DATABASES ACCESSED 05/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [8]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import csv

### Matching Experimental Drug Names To UniProt IDs
#### Input File : experimental_drugtargets.csv (http://www.drugbank.ca)

In [9]:
# Import all protein names and ids matched to drugbank drugs #
# For every data row, column index 5 is collected into `uniprot_ids` and the
# last column into `drugbank_ids`, so the two lists stay row-aligned.
# NOTE(review): the column meanings are inferred from how later cells use
# these lists -- confirm against the CSV header.
uniprot_ids = []
drugbank_ids = []

# `with` closes the handle even if a row fails to parse; newline='' is the
# mode the csv module documents for files handed to csv.reader.
with open('input/experimental_drugtargets.csv', newline='') as proteins:
    proteins_csv = csv.reader(proteins)
    # Skip the header row up front; the default keeps an empty file from raising.
    next(proteins_csv, None)
    for row in proteins_csv:
        uniprot_ids.append(row[5])
        drugbank_ids.append(row[-1])

In [10]:
# Sanity check: both lists are filled once per CSV row, so the two counts
# printed here (one per line) should be identical.
print(len(uniprot_ids), len(drugbank_ids), sep='\n')

2933
2933


In [7]:
# De-duplicate the UniProt IDs while keeping first-seen order.
# dict.fromkeys is used instead of set() so the order (and therefore the order
# of anything derived from iterating this list) is the same on every run;
# set() ordering of strings changes between interpreter sessions.
unique_uniprot_ids = list(dict.fromkeys(uniprot_ids))
len(unique_uniprot_ids)

2862

### Querying UniProt IDs through BioDBnet API to Retrieve Entrez Gene Symbols
#### Resource Accessed : https://biodbnet-abcc.ncifcrf.gov/db/dbFind.php

In [27]:
# Using biodbnet API to query Uniprot IDs and convert them to Entrez Gene Symbols #
# Every ID in `unique_uniprot_ids` ends up in exactly one of:
#   protein_dict : UniProt ID -> value of the 'Gene Symbol' field in the first
#                  record returned by the service
#   failed_list  : IDs with no usable answer (request/HTTP error, timeout,
#                  unparsable or unexpectedly shaped payload, or a '-'
#                  placeholder among the first record's values)
BIODBNET_URL = 'https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json'
REQUEST_TIMEOUT_SECONDS = 30

failed_list = []
protein_dict = {}

for protein in unique_uniprot_ids:
    # Reset per iteration so a failed request can never reuse the previous
    # protein's response and record a gene symbol under the wrong ID.
    gene_id = None
    try:
        response = requests.get(
            BIODBNET_URL,
            # Let requests build and escape the query string.
            params={
                'method': 'dbfind',
                'inputValues': str(protein),
                'output': 'genesymbol',
                'format': 'row',
            },
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        record = response.json()[0]  # parse the body once
        if '-' not in record.values():
            gene_id = record['Gene Symbol']
    except (requests.RequestException, ValueError, LookupError,
            TypeError, AttributeError):
        # Network/HTTP failure, non-JSON body, or a payload that is not a
        # non-empty list of records: treat the ID as unmapped.
        gene_id = None

    if gene_id is None:
        failed_list.append(protein)
    else:
        protein_dict[protein] = gene_id

    # Pause between calls so the public service is not hammered.
    time.sleep(0.2)

print(len(failed_list))

163


In [None]:
# Export failed list as TXT file #
# One ID per line. The context manager flushes and closes the file even if a
# write raises part-way through.
with open('input/failed_uniprotids.txt', 'w') as dataFile:
    dataFile.writelines("%s\n" % eachitem for eachitem in failed_list)

In [28]:
# Creating a list of all entrez gene symbols #
# These are the values of protein_dict, in its iteration order; duplicates
# are kept, exactly as stored in the dictionary.
entrez_symbols = list(protein_dict.values())

### Creating Drug-Set Library of Entrez Gene Symbols matched to sets of corresponding drugs 

In [29]:
# Importing Drugbank accession numbers and corresponding drug names #
# Column 0 of each data row goes to `experimental_drugbank_ids` and column 1,
# lower-cased, to `experimental_drugs`; the two lists stay row-aligned.
experimental_drugbank_ids = []
experimental_drugs = []

# `with` closes the handle even if a row fails to parse; newline='' is the
# mode the csv module documents for files handed to csv.reader.
with open('input/drug links-3.csv', newline='') as attributes:
    attributes_csv = csv.reader(attributes)
    # Skip the header row up front; the default keeps an empty file from raising.
    next(attributes_csv, None)
    for row in attributes_csv:
        experimental_drugbank_ids.append(row[0])
        experimental_drugs.append(row[1].lower())

In [30]:
# Creating a dictionary of DrugBank accession numbers to drug names #
# The two lists are row-aligned; if an accession number occurs more than once,
# the name from its last occurrence is the one kept.
experimental_dict = {
    accession: drug_name
    for accession, drug_name in zip(experimental_drugbank_ids, experimental_drugs)
}

In [31]:
# Number of distinct DrugBank accession numbers that have a drug name #
len(experimental_dict)

5764

In [35]:
# The same protein id can appear on several rows of the input file, each with
# its own drug id field, and a dictionary cannot hold duplicate keys.
# Pair each protein id with the drug id field from its row, then collect every
# field belonging to one protein id in a list under that single key.

id_dict = tuple(zip(uniprot_ids, drugbank_ids))

drugsetlibrary = defaultdict(list)
for uniprot_id, drug_field in id_dict:
    drugsetlibrary[uniprot_id].append(drug_field)

In [36]:
# Splitting each string within each list so that each list element is a single id #
# Each value is a list of ';'-separated id strings, one string per input row.
# All rows are flattened into ONE list per key: assigning `string.split(...)`
# per row would rebind the same key repeatedly and keep only the last row's
# ids for any protein that appears on more than one row.
# Splitting on ';' and stripping tolerates missing or extra spaces after the
# delimiter; empty fragments are dropped.
drugsetlibrary = {
    uniprot_id: [
        accession.strip()
        for drug_field in drug_fields
        for accession in drug_field.split(';')
        if accession.strip()
    ]
    for uniprot_id, drug_fields in drugsetlibrary.items()
}

In [37]:
# Replacing every Drugbank accession number with a drug name #
# An accession number missing from experimental_dict is left in place unchanged.
drugsetlibrary = {
    target: [experimental_dict.get(accession, accession) for accession in accessions]
    for target, accessions in drugsetlibrary.items()
}

In [38]:
# Replacing every UniProt ID with an Entrez Gene Symbol #
# A UniProt ID with no entry in protein_dict keeps its own ID as the key.
# Several UniProt IDs can resolve to the same gene symbol; their drug lists
# are concatenated under that symbol. Re-keying with a plain dict
# comprehension would let the last such ID overwrite the drugs of the others.
merged_by_symbol = defaultdict(list)
for uniprot_id, drugs in drugsetlibrary.items():
    merged_by_symbol[protein_dict.get(uniprot_id, uniprot_id)].extend(drugs)
drugsetlibrary = dict(merged_by_symbol)

In [39]:
# Retaining only matched drug names and removing unpaired Drugbank accession numbers #
# Only entries that appear in experimental_drugs are kept; the intersection
# also removes duplicate names within a set.
# The lookup set is built once rather than once per key, and each result is
# sorted so the exported library is identical from run to run.
known_drug_names = set(experimental_drugs)
drugsetlibrary = {
    i: sorted(set(v) & known_drug_names)
    for i, v in drugsetlibrary.items()
}

In [40]:
# Removing Uniprot IDs not matched to an Entrez Gene Symbol #
# Keys that are not among the retrieved gene symbols are dropped.
# Membership is tested against a set built once, instead of scanning the
# entrez_symbols list for every key.
matched_symbols = set(entrez_symbols)
drugsetlibrary = {k: v for k, v in drugsetlibrary.items() if k in matched_symbols}

In [41]:
# Removing Entrez Gene Symbols not matched to a drug-set #
# Any key whose value is empty (falsy) is discarded.
drugsetlibrary = {
    gene_symbol: drugs
    for gene_symbol, drugs in drugsetlibrary.items()
    if drugs
}

In [42]:
# Number of keys (drug-sets) remaining in the final library #
len(drugsetlibrary)

2455

In [44]:
# Export the library as CSV: one row per key, with that key's drugs joined
# into a single comma-separated string.
# from_dict(orient='index') pads shorter drug lists with missing values, so
# each row drops them before joining.
drug_table = pd.DataFrame.from_dict(drugsetlibrary, orient='index')


def _join_row(row):
    """Return the non-missing cells of one row as a comma-separated string."""
    return ','.join(row.dropna().astype(str))


df = drug_table.apply(_join_row, axis=1)
# NOTE(review): absolute, machine-specific output path -- adjust before
# running on another machine.
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/Experimental/drugbank_experimental_target_drugsetlibrary.csv')