## Drugbank Small Molecules Geneshot Drug-Set Library
### Drug-set labels: Genes
#### ALL DATABASES ACCESSED 05/01/19
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import csv
import pubchempy as pcp
import time 
import requests
import json
from collections import defaultdict
import pandas as pd
import numpy as np

In [2]:
# Load the small-molecule names from the DrugBank export, lower-cased.
# Names are taken from the second CSV column (index 1); the first row is
# skipped (presumably the CSV header).
# newline='' is the mode the csv module asks for, and the context manager
# closes the handle once the rows have been consumed.
with open('input/drug links-3.csv', newline='') as db_drugs:
    db_drugs_csv = csv.reader(db_drugs)
    next(db_drugs_csv, None)  # drop the first row; harmless on an empty file
    smallmolecules = [row[1].lower() for row in db_drugs_csv]

In [3]:
# Number of small-molecule names loaded (the first CSV row is not counted)
len(smallmolecules)

10254

### Querying Small Molecule Names Through Geneshot
#### PLATFORM : https://amp.pharm.mssm.edu/geneshot
#### OUTPUT FILE: drugbank_smallmolecules_geneset_autorif.json

In [5]:
# Querying each drug through geneshot(AUTORIF) #
# Writes a single JSON array holding one Geneshot response per drug that
# could be retrieved; drugs whose request or JSON decoding fails are reported
# and left out instead of re-dumping the previous drug's response.
from urllib.parse import quote

i = 0  # number of responses written to the array so far
# 'w' rather than 'a': appending a second "[ ... ]" array on a re-run would
# leave a file that json.load can no longer parse.
with open('input/drugbank_smallmolecules_geneset_autorif.json', 'w') as outfile:
    outfile.write("[")
    for drug in smallmolecules:
        try:
            # Percent-encode the name so characters such as '/', '?' or '#'
            # stay inside the final path segment of the URL.
            res = requests.get(
                'https://amp.pharm.mssm.edu/geneshot/api/search/auto/'
                + quote(drug, safe=''),
                timeout=60,
            )
            smallmolecule_geneset_autorif = res.json()
        except (ValueError, requests.RequestException) as err:
            print('Skipping {0!r}: {1}'.format(drug, err))
        else:
            # Separator goes before every element except the first, so the
            # array stays valid no matter which drugs were skipped.
            if i:
                outfile.write(",")
            json.dump(smallmolecule_geneset_autorif, outfile, indent = 4)
            i += 1
        time.sleep(1)  # pause between consecutive requests
    # Always close the array, including when there were no drugs at all.
    outfile.write("]")

### Creating Geneset Libraries From Drug-Gene Associations Collected From Geneshot
#### INPUT FILE : drugbank_smallmolecules_geneset_autorif.json
#### OUTPUT FILES: drugbank_smallmolecules_genesetlibrary.txt | drugbank_smallmolecules_geneshot_drugsetlibrary.csv

In [6]:
# Build a dictionary keyed by small molecule whose values are gene lists,
# then render it as GMT-formatted text #
genelist = {}

with open('input/drugbank_smallmolecules_geneset_autorif.json') as data_file:
    data = json.load(data_file)

# One entry per saved response: lower-cased search term -> the first 51 keys
# of its "gene_count" mapping. A later response for the same term replaces
# the earlier one.
for record in data:
    gene_names = list(record["gene_count"].keys())[:51]
    genelist[record["search_term"].lower()] = gene_names

# Terms that ended up without any gene are dropped.
genelist = {
    molecule: members
    for molecule, members in genelist.items()
    if members
}

# GMT line layout: term, an empty second column, then tab-separated genes #
GSL_SM = [
    '{0}\t\t{1}'.format(molecule, '\t'.join(members))
    for molecule, members in genelist.items()
]

GSL_SM_out = '\n'.join(GSL_SM)

In [9]:
# Exporting Drugbank Small Molecule Geneset library as TXT file #
# The context manager flushes and closes the file, and the already-joined
# text is written in a single call rather than one character at a time.
with open('/Users/maayanlab/Documents/DrugSetEnrichment/GeneSetLibraries/Drugbank/Small molecules/drugbank_smallmolecules_genesetlibrary.txt', 'w') as dataFile:
    dataFile.write(GSL_SM_out)

In [10]:
# Transposing the 'genelist' dictionary so that genes are terms and small molecules are set members #
# Each value in 'genelist' is a list of genes; every gene in that list becomes
# its own term and collects the small molecules it was listed under #
d = defaultdict(list)
for drug, drug_genes in genelist.items():
    for gene in drug_genes:
        d[gene].append(drug)

# Plain dict, so later lookups of unknown genes do not create empty entries.
drugsetlibrary = dict(d)

In [12]:
# Turning drugsetlibrary into a table (one row per gene) and exporting it #
def _join_drugs(row):
    """Return the non-missing cells of one row as a comma-separated string."""
    return ','.join(row.dropna().astype(str))


# orient='index' makes every dictionary key a row; rows with fewer members
# than the widest one carry missing values, which _join_drugs discards.
drug_table = pd.DataFrame.from_dict(drugsetlibrary, orient='index')
df = drug_table.apply(_join_drugs, axis = 1)
df.to_csv('/Users/maayanlab/Documents/DrugSetEnrichment/Drugsetlibraries/Drugbank/Small molecules/drugbank_smallmolecules_geneshot_drugsetlibrary.csv')