## Geneshot Predicted Gene Drug-Set Library
### Drug-set labels: Genes
#### ALL DATABASES ACCESSED 11/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import csv
import time 
import requests
import json
from collections import defaultdict
import pandas as pd
import numpy as np
import os

In [2]:
# Temporarily switch the working directory to ../scripts so the two local
# helper modules below can be imported by bare name, then switch to
# ../Geneshot, which is the directory the relative 'input/...' and
# '../data/...' paths used later in this notebook are resolved against.
# NOTE(review): assumes the notebook is launched from a directory that sits
# next to 'scripts' (presumably 'Geneshot' itself) - confirm.
os.chdir('../scripts')
# NOTE(review): star imports hide where names come from; library_counts and
# gmt_formatter (called later) are presumably defined in one of these
# modules - confirm and consider importing them explicitly.
from export_script import *
from gene_resolver import *
os.chdir('../Geneshot')

#### Input file : geneshot_drug_genesetlibrary.txt (generated from Geneshot Associated Gene Drugsetlibrary.ipynb)

In [3]:
# Build genedict: first column of each tab-separated row -> comma-joined
# string of the non-empty values from the third column onward (the second
# column is skipped). The comma-joined form is what the Geneshot API expects.
genedict = {}
with open('input/geneshot_drug_genesetlibrary.txt', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for fields in reader:
        # csv.reader already yields strings, so no conversion is needed
        genes = [symbol for symbol in fields[2:] if symbol != '']
        genedict[fields[0]] = ','.join(genes)

In [4]:
# Number of entries loaded into genedict (one per distinct first-column term)
len(genedict)

12421

In [5]:
# All drug names, i.e. the keys of genedict in insertion order
drugs = list(genedict)

### Querying each small molecule's gene list through the Geneshot API

In [6]:
# Base URL of the Geneshot API; every request path is appended to it.
GENESHOT_URL = 'http://amp.pharm.mssm.edu/geneshot/api'
# Path template appended to GENESHOT_URL. The two placeholders are filled
# with (similarity matrix name, comma-separated gene list) for each query.
query_string = '/associate/%s/%s'
# Names of the similarity matrices to query. Each name is also used as the
# prefix of the corresponding '<name>_predicted_genes.json' file.
similarity_matrix = ['autorif','enrichr','generif','tagger','coexpression']

In [8]:
# For every similarity matrix, query the Geneshot "associate" endpoint once
# per drug and save all responses to input/<matrix>_predicted_genes.json.
# One request is sent per (matrix, drug) pair with a 2 second pause after
# each, so a full run is long.
for matrix in similarity_matrix:
    feeds = []
    # The output file is named after the loop variable `matrix`. The original
    # cell referenced an undefined name `query` here, which raised NameError
    # on a fresh kernel.
    with open("input/" + matrix + "_predicted_genes.json", "w") as outfile:
        for drug, genelist in genedict.items():
            response = requests.get(
                GENESHOT_URL + query_string % (matrix, genelist)
            )
            # Surface HTTP errors directly instead of failing later with a
            # less informative JSON-decoding or KeyError.
            response.raise_for_status()
            data = response.json()
            # Re-key the "association" payload under the drug name so the
            # saved record identifies which drug it belongs to.
            data[drug] = data.pop("association")
            feeds.append(data)
            # Pause between requests (presumably to avoid overloading the API).
            time.sleep(2)
        json.dump(feeds, outfile, indent = 4)
    # No explicit close(): the `with` block closes the file on exit.

### Creating drugsetlibraries for each similarity matrix query

In [9]:
# Load the two tables used to validate gene names:
#   synonym_lookup   - mapping read from gene_symbol_lookup.json
#   approved_symbols - values of the 'Symbol' column of Homo_sapiens.gene_info
with open('input/gene_symbol_lookup.json', 'r') as f:
    synonym_lookup = json.load(f)

df_lookup = pd.read_csv('input/Homo_sapiens.gene_info', sep = '\t')
approved_symbols = df_lookup['Symbol'].tolist()

In [10]:
# Function for converting each json file into drugsetlibrary

def drugsetlibrary_converter(similarity_matrix, top_n=50, min_drugs=5):
    """Turn one matrix's saved Geneshot predictions into a drug-set library.

    Reads ``input/<similarity_matrix>_predicted_genes.json``, keeps the first
    ``top_n`` predicted genes of every drug, maps gene names through
    ``synonym_lookup``, drops names that are not in ``approved_symbols``,
    transposes the result into ``{gene: [drugs]}``, discards genes paired
    with fewer than ``min_drugs`` drugs, prints the library counts and
    exports the library to
    ``../data/Geneshot/Geneshot_predicted_<similarity_matrix>_drugsetlibrary.txt``.

    Parameters
    ----------
    similarity_matrix : str
        Name of the similarity matrix whose JSON file is converted. Inside
        this function the name shadows the notebook-level list of the same
        name.
    top_n : int, default 50
        Number of leading predicted genes kept per drug.
    min_drugs : int, default 5
        Minimum number of drugs a gene must be paired with to be kept.

    Returns
    -------
    None

    Notes
    -----
    Uses the notebook globals ``drugs``, ``synonym_lookup`` and
    ``approved_symbols``, and the helpers ``library_counts`` and
    ``gmt_formatter`` (not defined in this notebook; presumably provided by
    the star imports at the top - confirm).
    """
    with open('input/' + similarity_matrix + '_predicted_genes.json') as data_file:
        data = json.load(data_file)

    # Index the saved records by key once instead of rescanning `data` for
    # every drug. Later records overwrite earlier ones, which matches the
    # "last matching record wins" outcome of the original nested scan.
    predictions = {}
    for item in data:
        predictions.update(item)

    # Creating genesetlibrary out of input json file
    genesetlibrary = {}
    for drug in drugs:
        # Start from a fresh list for every drug: a drug without a saved
        # record now gets an empty gene list instead of silently inheriting
        # the previous drug's genes.
        genelist = list(predictions.get(drug, ()))
        genesetlibrary[drug] = genelist[:top_n]

    # Validating gene names
    approved = set(approved_symbols)  # built once, not once per drug
    validated = {}
    for drug, genes in genesetlibrary.items():
        # Matching synonyms with approved symbols
        resolved = (synonym_lookup.get(gene, gene) for gene in genes)
        # Removing unmatched/unapproved symbols and duplicates. dict.fromkeys
        # keeps first-seen order, so the output is reproducible across runs
        # (a plain set intersection yields an arbitrary order).
        validated[drug] = list(dict.fromkeys(g for g in resolved if g in approved))

    # Transposing the genesetlibrary into a drugsetlibrary
    d = defaultdict(list)
    for drug, genes in validated.items():
        for gene in genes:
            d[gene].append(drug)

    # Removing all terms paired with fewer than `min_drugs` drugs
    drugsetlibrary = {gene: paired for gene, paired in d.items() if len(paired) >= min_drugs}

    # Printing library counts
    print(similarity_matrix)
    library_counts(drugsetlibrary)

    # Exporting drugsetlibrary
    gmt_formatter(drugsetlibrary, '../data/Geneshot/Geneshot_predicted_'+similarity_matrix+'_drugsetlibrary.txt')

In [11]:
# Build and export one drug-set library per similarity matrix
for matrix_name in similarity_matrix:
    drugsetlibrary_converter(matrix_name)

autorif
12421 unique drugs
11556 unique association terms
613342 unique associations
53.07563170647283 average drugs per term
enrichr
12421 unique drugs
7348 unique association terms
557068 unique associations
75.81219379422973 average drugs per term
generif
12421 unique drugs
8406 unique association terms
600675 unique associations
71.45788722341184 average drugs per term
tagger
12421 unique drugs
13279 unique association terms
606470 unique associations
45.6713607952406 average drugs per term
coexpression
12421 unique drugs
8837 unique association terms
591984 unique associations
66.9892497453887 average drugs per term
