## ATC_FDA Protein Target Drug-Set Library
### Drug-set labels: Protein targets
#### ALL DATABASES ACCESSED 05/20/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import xml.etree.ElementTree as et
import requests
import time
from collections import defaultdict
import csv
import numpy as np

### Matching Small Molecule Names to UniProt IDs
#### Input File : smallmolecule_drugtargets.csv (http://www.drugbank.ca)

In [7]:
# Import the protein (UniProt) IDs and the DrugBank IDs listed for every DrugBank target row #
uniprot_ids = []
drugbank_ids = []

# The context manager closes the file even if a row fails to parse;
# newline='' is the mode the csv module recommends for reader input.
with open('input/smallmolecule_drugtargets.csv', newline='') as proteins:
    proteins_csv = csv.reader(proteins)
    next(proteins_csv, None)  # skip the header row (no-op on an empty file)
    for row in proteins_csv:
        uniprot_ids.append(row[5])    # sixth column, collected as the UniProt ID
        drugbank_ids.append(row[-1])  # last column, collected as the DrugBank ID field

In [8]:
# Report the row count of each parallel column, one per line
for column in (uniprot_ids, drugbank_ids):
    print(len(column))

4846
4846


In [9]:
# De-duplicate the UniProt IDs. Sorting gives a reproducible order: iteration
# order of a set of strings can differ between interpreter runs, which would
# otherwise change the order in which IDs are processed downstream.
unique_uniprot_ids = sorted(set(uniprot_ids))
len(unique_uniprot_ids)

4625

### Querying UniProt IDs through BioDBnet API to Retrieve Entrez Gene Symbols
#### Resource Accessed : https://biodbnet-abcc.ncifcrf.gov/db/dbFind.php

In [11]:
# Using the bioDBnet API to query UniProt IDs and convert them to Entrez Gene Symbols #
BIODBNET_URL = 'https://biodbnet-abcc.ncifcrf.gov/webServices/rest.php/biodbnetRestApi.json'


def _extract_gene_symbol(payload):
    """Return the gene symbol from a decoded dbfind response, or None.

    None is returned when the payload is not a non-empty list of records,
    when any value of the first record is '-' (the marker this notebook
    treats as "no match"), or when the record has no 'Gene Symbol' entry.
    """
    if not isinstance(payload, list) or not payload:
        return None
    record = payload[0]
    if not isinstance(record, dict) or '-' in record.values():
        return None
    return record.get('Gene Symbol')


failed_list = []
protein_dict = {}

for protein in unique_uniprot_ids:
    # Let requests build and escape the query string instead of concatenating it.
    query = {
        'method': 'dbfind',
        'inputValues': str(protein),
        'output': 'genesymbol',
        'format': 'row',
    }
    try:
        response = requests.get(BIODBNET_URL, params=query, timeout=30)
        response.raise_for_status()
        gene_id = _extract_gene_symbol(response.json())
    except (requests.RequestException, ValueError):
        # Network error, HTTP error status or a non-JSON body: record the ID as
        # failed rather than silently reusing the previous iteration's response.
        gene_id = None

    if gene_id is not None:
        protein_dict[protein] = gene_id
    else:
        failed_list.append(protein)
    time.sleep(0.2)  # pause between requests to avoid hammering the service

print(len(failed_list))

190


In [27]:
# Export failed list as TXT file, one UniProt ID per line #
# The context manager guarantees the file is flushed and closed even if a write fails.
with open('input/failed_uniprotids.txt', 'w') as dataFile:
    dataFile.writelines("%s\n" % eachitem for eachitem in failed_list)

In [12]:
# Creating a list of all Entrez Gene Symbols (the values of protein_dict, in insertion order) #
entrez_symbols = list(protein_dict.values())

### Creating a Drug-Set Library: Entrez Gene Symbols Matched to Sets of Corresponding Drugs

In [13]:
# Importing DrugBank accession numbers and corresponding drug names #
smallmolecule_drugbank_ids = []
smallmolecules = []

# The context manager closes the file even if a row fails to parse;
# newline='' is the mode the csv module recommends for reader input.
with open('input/drug links-3.csv', newline='') as attributes:
    attributes_csv = csv.reader(attributes)
    next(attributes_csv, None)  # skip the header row (no-op on an empty file)
    for row in attributes_csv:
        smallmolecule_drugbank_ids.append(row[0])  # first column: accession number
        smallmolecules.append(row[1].lower())      # second column: drug name, lower-cased

In [14]:
# Map each DrugBank accession number to the drug name read alongside it #
smallmolecule_dict = {
    accession: drug_name
    for accession, drug_name in zip(smallmolecule_drugbank_ids, smallmolecules)
}

In [15]:
# Number of distinct DrugBank accession numbers that have a drug name
len(smallmolecule_dict)

10254

In [17]:
# The input file repeats protein IDs across rows, each row carrying its own drug IDs, and a dictionary cannot hold duplicate keys #
# Pair every protein ID with the DrugBank accession field from the same row, then gather all fields sharing a protein ID under one dictionary key #

id_dict = tuple(zip(uniprot_ids, drugbank_ids))

drugsetlibrary = defaultdict(list)
for uniprot_id, accession_field in id_dict:
    drugsetlibrary[uniprot_id].append(accession_field)

In [18]:
# Splitting each "; "-separated string so that every list element is a single accession number #
# Every string grouped under a protein is split and the pieces are concatenated.
# Assigning one split result per string to the same key would keep only the
# last string's accession numbers and drop those from the protein's other rows.
drugsetlibrary = {
    k: [accession for string in v for accession in string.split("; ")]
    for k, v in drugsetlibrary.items()
}

In [19]:
# Translate each DrugBank accession number into its drug name; accession numbers
# absent from smallmolecule_dict are left as they are #
named_library = {}
for target, accessions in drugsetlibrary.items():
    named_library[target] = [
        smallmolecule_dict.get(accession, accession) for accession in accessions
    ]
drugsetlibrary = named_library

In [20]:
# Replacing every UniProt ID with an Entrez Gene Symbol #
# Several UniProt IDs can resolve to the same gene symbol; their drug lists are
# concatenated so a later ID does not overwrite the drugs of an earlier one.
# IDs with no entry in protein_dict keep their UniProt ID as the key.
merged_by_symbol = defaultdict(list)
for uniprot_id, drugs in drugsetlibrary.items():
    merged_by_symbol[protein_dict.get(uniprot_id, uniprot_id)].extend(drugs)
drugsetlibrary = dict(merged_by_symbol)

In [21]:
# Retaining only matched drug names and removing unpaired DrugBank accession numbers #
# The set of known drug names is built once rather than once per key, and each
# de-duplicated drug list is sorted so the result has a reproducible order.
known_drug_names = set(smallmolecules)
drugsetlibrary = {i: sorted(known_drug_names.intersection(v)) for i, v in drugsetlibrary.items()}

In [22]:
# Removing UniProt IDs not matched to an Entrez Gene Symbol #
# Membership is tested against a set built once instead of scanning the
# entrez_symbols list for every key.
mapped_symbols = set(entrez_symbols)
drugsetlibrary = {k: v for k, v in drugsetlibrary.items() if k in mapped_symbols}

In [23]:
# Dropping keys whose drug set is empty #
nonempty_library = {}
for gene_symbol, drugs in drugsetlibrary.items():
    if drugs:
        nonempty_library[gene_symbol] = drugs
drugsetlibrary = nonempty_library

In [24]:
# Number of keys left in the drug-set library after the filtering steps
len(drugsetlibrary)

4075

In [26]:
# Lay the library out one row per key (shorter drug lists are padded with NaN),
# then collapse each row into a single comma-separated string of its drugs.
def _join_drugs(row):
    """Join the non-missing cells of one row into a comma-separated string."""
    return ','.join(row.dropna().astype(str))


drugset_table = pd.DataFrame.from_dict(drugsetlibrary, orient='index')
df = drugset_table.apply(_join_drugs, axis=1)

# NOTE(review): absolute, machine-specific output path; the other cells use relative 'input/...' paths.
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/Small molecules/drugbank_smallmolecule_target_drugsetlibrary.csv')