## Geneshot Associated Gene Drug-Set Library
### Drug-set labels: Genes
#### ALL DATABASES ACCESSED 11/01/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import csv
import json
import os
import time
from collections import defaultdict
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests

In [2]:
# Step into ../scripts so the project-local helper modules can be imported by bare module name.
os.chdir('../scripts')
from export_script import *
from gene_resolver import *
# Switch to ../Geneshot: the relative 'input/...' paths used by later cells are resolved from here.
os.chdir('../Geneshot')
# NOTE(review): the star imports hide where library_counts and gmt_formatter (called near the end
# of the notebook) are defined — presumably export_script; confirm and import them by name.

#### Input file : merged_drug_metadata.tsv (generated from Drug Metadata Aggregation.ipynb)

In [3]:
# Import all small molecules #
# Tab-separated drug metadata table; its Drug_name column supplies the Geneshot search terms below.
# NOTE(review): the preview shows an 'Unnamed: 0' column, so the file was presumably saved with its
# index — consider reading it with index_col=0.
df = pd.read_csv('../mapping_files/merged_drug_metadata.tsv', sep = '\t')

In [4]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,Drug_name,PubChemID,Synonyms,Canonical_SMILES,InChiKey,Molecular_formula
0,goserelin,5311128,"['goserelin', 'goserelin acetate', 'zoladex', ...",CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...,BLCLNMBMMGCOAS-URPVMXJPSA-N,C59H84N18O14
1,cetrorelix,25074887,"['cetrorelix', '120287-85-6', 'cetrorelix acet...",CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NC...,SBNPWPIBESPSIF-MHWMIDJBSA-N,C70H92ClN17O14
2,ciclosporin,5284373,"['cyclosporin a', 'cyclosporine', 'ciclosporin...",CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,PMATZTZNYRCHOR-CGLBZJNRSA-N,C62H111N11O12
3,octreotide,448601,"['octreotide', 'octreotide acetate', '83150-76...",CC(C1C(=O)NC(CSSCC(C(=O)NC(C(=O)NC(C(=O)NC(C(=...,DEQANNDTNATYII-OULOTJBUSA-N,C49H66N10O10S2
4,choline,305,"['choline', 'choline ion', 'bilineurine', 'cho...",C[N+](C)(C)CCO,OEYIOHPDSNJKLS-UHFFFAOYSA-N,C5H14NO+


In [5]:
# Drug names as a plain list; each one is sent to Geneshot as a search term in the query cell.
smallmolecules = df['Drug_name'].tolist()

In [6]:
# Number of drug names that will be queried (one HTTP request each).
len(smallmolecules)

43945

### Querying Small Molecule Names Through Geneshot
#### PLATFORM : https://amp.pharm.mssm.edu/geneshot

In [8]:
# Queries a drug list through the Geneshot API to retrieve gene terms associated with each drug
# and saves the results as a json in the input folder

GENESHOT_SEARCH_URL = 'https://amp.pharm.mssm.edu/geneshot/api/search/auto/'

feeds = []
for entry in smallmolecules:
    # Percent-encode the drug name so characters such as '/', '?', '#' or spaces
    # cannot change the path of the request URL.
    response = requests.get(GENESHOT_SEARCH_URL + quote(entry, safe=''))
    time.sleep(0.50)  # pause between requests, whether or not the reply was usable
    try:
        data = response.json()
    except ValueError:
        # The reply was not JSON. Skip this drug: appending `data` here would either
        # duplicate the previous drug's result or fail with NameError on the first drug.
        continue
    feeds.append(data)

# Write with 'w' once all results are collected: appending ('a') on a re-run would put a
# second JSON array in the file, which json.load can no longer parse.
with open('input/geneset_autorif.json', 'w') as outfile:
    json.dump(feeds, outfile, indent = 4)

### Creating geneset library from drug-gene associations collected from Geneshot

In [52]:
# Building a long-format table with one row per (compound, gene) pair found in the Geneshot results #
df_counts = pd.DataFrame()
gene_list, compound_list = [], []
total_mentions, normalized_mentions = [], []

with open ('input/geneset_autorif.json') as data_file:
    data = json.load(data_file)

for item in data:
    genes = item["gene_count"]
    compound = item["search_term"].lower()
    # Each gene maps to a sequence: element 0 is stored as the total mention count,
    # element 1 as the normalized mention count.
    for gene, counts in genes.items():
        gene_list.append(gene)
        compound_list.append(compound)
        total_mentions.append(counts[0])
        normalized_mentions.append(counts[1])

# Attach the collected columns in a fixed order
for column, values in (
    ('Compound_name', compound_list),
    ('Gene', gene_list),
    ('Total_mentions', total_mentions),
    ('Normalized_mentions', normalized_mentions),
):
    df_counts[column] = values

In [53]:
# Mean and median of the raw mention counts over all (compound, gene) rows
total_col = df_counts['Total_mentions']
print(total_col.mean())
print(total_col.median())

19.72057647457125
2.0


In [54]:
# Mean and median of the normalized mention values over all (compound, gene) rows
normalized_col = df_counts['Normalized_mentions']
print(normalized_col.mean())
print(normalized_col.median())

0.11665613584652752
0.05136138613861386


In [55]:
# Number of distinct compounds in the table. Rows are only created per returned gene,
# so compounds for which Geneshot returned no genes are not counted here.
len(set(df_counts['Compound_name']))

12466

In [58]:
# Save the (compound, gene, mentions) table as a tab-separated file without the index column.
# NOTE(review): absolute, machine-specific path — this cell fails on any machine without
# /Users/maayanlab/Desktop; consider a path relative to the project instead.
df_counts.to_csv('/Users/maayanlab/Desktop/geneshot_drug_gene_mentions.tsv', sep = '\t', index = False)

In [28]:
# Mapping each lower-cased search term to at most the first 50 genes Geneshot returned for it #
# The 50 genes are taken in the key order of the stored JSON; a search term that occurs more
# than once after lower-casing keeps only its last entry.
genelist = {}

with open ('input/geneset_autorif.json') as data_file:
    data = json.load(data_file)

for item in data:
    genes = list(item["gene_count"])[:50]
    compound = item["search_term"].lower()
    genelist[compound] = list(genes)

# Drop search terms that came back without any gene
genelist = {name: members for name, members in genelist.items() if members}

### Validating gene names

In [29]:
# Approved gene symbols: the 'Symbol' column of the tab-separated gene_info table
df_lookup = pd.read_csv('input/Homo_sapiens.gene_info', sep = '\t')
approved_symbols = df_lookup['Symbol'].tolist()

# Synonym lookup used in the next cell to translate gene names
with open('input/gene_symbol_lookup.json') as f:
    synonym_lookup = json.load(f)

In [30]:
# Confirming all gene names in dictionary are valid
# Build the approved-symbol set once; constructing it inside the comprehension rebuilt it
# from the full symbol list for every single compound.
approved_symbol_set = set(approved_symbols)

# Matching synonyms with approved symbols (names without a lookup entry are kept unchanged)
genelist = {k: [synonym_lookup.get(x, x) for x in v] for k, v in genelist.items()}
# Removing unmatched/unapproved symbols and duplicates; sorted so the exported library
# does not depend on set iteration order and is reproducible between runs
genelist = {k: sorted(set(v) & approved_symbol_set) for k, v in genelist.items()}

### Creating genesetlibrary with drugs as terms

In [31]:
# Converting dictionary into GMT format #
# One line per drug: the term, an empty second column, then its genes, all tab-separated.
GSL = [
    '{0}\t\t{1}'.format(term, '\t'.join(genes))
    for term, genes in genelist.items()
]

GSL_out = '\n'.join(GSL)

In [32]:
# Exporting the Geneshot drug geneset library as TXT file #
# GSL_out is one string, so it is written in a single call (iterating over it wrote one
# character at a time); the with-block closes the file even if the write fails.
with open('input/geneshot_drug_genesetlibrary.txt', 'w') as dataFile:
    dataFile.write(GSL_out)

### Creating drugsetlibrary from gene-drug associations collected from Geneshot

In [33]:
# Transposing the 'genelist' dictionary so that genes are terms and small molecules are set members #
# Each value in 'genelist' is a list of genes; every gene in it becomes its own term and
# collects the drugs it was listed under.
d = defaultdict(list)
for k, v in genelist.items():
    for gene in v:
        # (the former gene.split(',') call discarded its result and had no effect)
        d[gene].append(k)

drugsetlibrary = dict(d)

In [34]:
# Number of gene terms before the minimum-size filter is applied.
len(drugsetlibrary)

8235

In [35]:
# Keeping only the terms that are paired with at least 5 drugs #
drugsetlibrary = {
    gene_term: drugs
    for gene_term, drugs in drugsetlibrary.items()
    if len(drugs) >= 5
}

In [36]:
# Number of gene terms left after dropping those with fewer than 5 drugs.
len(drugsetlibrary)

4145

### Library counts

In [37]:
# Print summary counts for the filtered library.
# NOTE(review): library_counts is not defined in this notebook — presumably provided by the
# star import from export_script; confirm.
library_counts(drugsetlibrary)

12386 unique drugs
4145 unique association terms
273468 unique associations
65.97539203860072 average drugs per term


### Exporting the drugsetlibrary in DMT format

In [38]:
# Hand the filtered library and the target path to the export helper.
# NOTE(review): gmt_formatter is not defined in this notebook — presumably provided by the
# star import from export_script; confirm what it writes and that ../data/Geneshot exists.
gmt_formatter(drugsetlibrary, '../data/Geneshot/Geneshot_associated_drugsetlibrary.txt')