## Drugmonizome ETL : Geneshot

##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [None]:
#%%appyter init
# Initialise appyter's notebook magics; the lambda hands appyter a callable that
# returns this notebook's globals().
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime
import time
import json
from collections import defaultdict

import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

import pandas as pd
import numpy as np
from tqdm import tqdm
import drugmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the execution date and the interpreter version in the notebook output.
run_date = datetime.date.today()
python_version = sys.version
print('This notebook was run on:', run_date, '\nPython version:', python_version)

### Initializing Notebook

In [None]:
%%appyter hide_code
# Declares the form section named 'data' (titled 'Upload Data'); the input
# fields of this appyter attach to it through section='data'.

{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval

# Descriptive text displayed in the 'data' section of the form.
{% do DescriptionField(
    name='description',
    text='This appyter queries the Geneshot API with user-submitted small molecule names to retrieve lists of associated genes.\
    Then using the associated gene lists, Geneshot returns predicted lists of genes based on co-occurrence and co-expression methods.',
    section='data'
) %}

# Uploaded list of small molecule names; the file name must end in .txt.
{% set data_file = FileField(
    constraint='.*\.txt$',
    name='smallmolecule_names', 
    label='List of small molecule names (.txt)', 
    default='geneshot_smallmolecules.txt',
    examples={
        'geneshot_smallmolecules.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ETL_Appyters/geneshot_smallmolecules.txt'
    },
    section='data'
) %}

# Gene-gene similarity matrices used for predicted genes. The default is an
# empty selection, in which case the prediction cell only prints a notice.
{% set similarity_matrix = MultiChoiceField(
    name='identifier',
    label='Choose gene-gene similarity matrix for determining predicted genes',
    description='',
    choices=['autorif',
             'enrichr',
             'generif',
             'tagger',
             'coexpression'],
    default= [],
    section='data'
) %}

### Load Gene Mapping Dictionaries

In [None]:
# Load the two lookup tables returned by harmonizome.lookup.get_lookups();
# symbol_lookup is later passed to uf.map_symbols.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Create Output Path

In [None]:
%%appyter code_exec

output_name = 'geneshot_associated'
path = 'output/drugmonizome_geneshot_associated'
if not os.path.exists(path):
    os.makedirs(path)

### Load Data

In [None]:
%%appyter code_exec

with open({{data_file}}) as f:
    smallmolecules = [line.strip() for line in f]

### Query small molecule names through Geneshot to retrieve associated genes

In [None]:
# Retry policy: up to 5 attempts with a 0.5 backoff factor whenever the server
# answers with one of the listed 5xx status codes.
retries = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[500, 502, 503, 504],
)

# Session whose plain-http requests go through the retrying adapter.
s = requests.Session()
retrying_adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', retrying_adapter)

In [None]:
# Query the Geneshot search endpoint once per small molecule and collect the
# parsed JSON responses in `feeds`.
feeds = []
for entry in tqdm(smallmolecules):
    response = s.get('http://amp.pharm.mssm.edu/geneshot/api/search/auto/' + entry)
    try:
        data = response.json()
    except ValueError:
        # The body was not JSON. Skip this molecule: appending here would either
        # raise NameError (first query) or duplicate the previous molecule's result.
        print('Skipping', entry, '- response was not valid JSON')
    else:
        feeds.append(data)
    # Pause between requests to avoid hammering the public API.
    time.sleep(0.5)

### Create dataframe of small molecules, associated genes, and mention scores

In [None]:
# Flatten the Geneshot responses into three parallel columns:
# compound name, gene name and mention score.
df_counts = pd.DataFrame()
gene_list, compound_list, mention_score = [], [], []

for feed in feeds:
    gene_counts = feed["gene_count"]
    search_term = feed["search_term"]
    for symbol, counts in gene_counts.items():
        gene_list.append(symbol)
        compound_list.append(search_term)
        # total mentions * frequency in literature
        score = float(counts[0]) * float(counts[1])
        mention_score.append(score)

df_counts['Compound_name'] = compound_list
df_counts['Gene_name'] = gene_list
df_counts['Mention_score'] = mention_score

### Filter dataframe by top 200 returned associated genes from each unique small molecule query, and top 1000 small molecule associations for each unique gene returned across all queries

In [None]:
# For every small molecule keep only its 200 highest-scoring associated genes.
ranked_by_compound = df_counts.sort_values(
    by=['Compound_name', 'Mention_score'],
    ascending=False,
)
df_counts = ranked_by_compound.groupby('Compound_name').head(200)

In [None]:
# For every gene keep only its 1000 highest-scoring small molecule associations.
ranked_by_gene = df_counts.sort_values(
    by=['Gene_name', 'Mention_score'],
    ascending=False,
)
df_filtered = ranked_by_gene.groupby('Gene_name').head(1000)

In [None]:
# Preview the first rows of the filtered compound-gene associations.
df_filtered.head()

In [None]:
# Keep only the compound/gene pairs, indexed by compound name, and show the size.
pair_columns = ['Compound_name', 'Gene_name']
df_output = df_filtered[pair_columns].set_index('Compound_name')
df_output.shape

### Matching Gene Symbols to Approved Entrez Gene Symbols

In [None]:
# Pass the edge list through uf.map_symbols with symbol_lookup, then show the
# resulting shape so it can be compared with the shape before mapping.
# NOTE(review): presumably unmapped gene names are dropped - confirm in uf.map_symbols.
df_output = uf.map_symbols(df_output, symbol_lookup)
df_output.shape

## Analyze Geneshot Associated Data

### Export Edge List

In [None]:
# Save the compound-gene edge list under `path` as a gzip-compressed TSV.
uf.save_data(df_output, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')

### Create Binary Matrix

In [None]:
# Build the binary matrix from the edge list and preview its first rows.
# NOTE(review): presumably rows are compounds and columns are genes - confirm in uf.binary_matrix.
binary_matrix = uf.binary_matrix(df_output)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix under `path`, requesting npz compression and uint8 values.
uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

### Create Drug and Attribute Set Library

In [None]:
# Export the 'drug' set library derived from the binary matrix.
uf.save_setlib(binary_matrix, 'drug', path, output_name + '_drug_setlibrary')

In [None]:
# Export the 'attribute' set library derived from the binary matrix.
uf.save_setlib(binary_matrix, 'attribute', path, output_name + '_attribute_setlibrary')

### Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrix (attribute-wise),
# with sparse=True passed to uf.similarity_matrix; preview the first rows.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix under `path` (npz compression, float32);
# symmetric=True is forwarded to uf.save_data.
uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

### Create Drug Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrix as-is (drug-wise), with
# sparse=True passed to uf.similarity_matrix; preview the first rows.
drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
drug_similarity_matrix.head()

In [None]:
# Save the drug similarity matrix under `path` (npz compression, float32);
# symmetric=True is forwarded to uf.save_data.
uf.save_data(drug_similarity_matrix, path,
            output_name + '_drug_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

## Predicted Genes

### Match each small molecule to associated genelists retrieved from Geneshot

In [None]:
# Pair every compound with the genes it was associated with in df_filtered.
genes = df_filtered['Gene_name'].tolist()
compounds = df_filtered['Compound_name'].tolist()
id_list = tuple(zip(compounds, genes))

genedict = defaultdict(list)
for compound, gene in id_list:
    genedict[compound].append(gene)

# Ensure each compound is matched with a gene list of length 5 or greater
# (counted after removing duplicate genes).
deduplicated = {}
for compound, symbols in genedict.items():
    unique_symbols = set(symbols)
    if len(unique_symbols) >= 5:
        deduplicated[compound] = list(unique_symbols)
genedict = deduplicated

### The next lines will create outputs specific to the selected predicted gene similarity metrics
**1)** Query each unique small molecule's associated gene list through Geneshot to retrieve predicted genes

**2)** Create dataframe of small molecules, predicted genes, and mention scores

**3)** Create edge lists, binary matrices, drug and attribute set libraries, and similarity matrices for each output

In [None]:
# Retry policy for the prediction queries: up to 5 attempts with a 0.5 backoff
# factor on the listed 5xx status codes.
retries = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[500, 502, 503, 504],
)

# Fresh session whose plain-http requests go through the retrying adapter.
s = requests.Session()
retrying_adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', retrying_adapter)

In [None]:
# Base URL of the Geneshot API and the path template of its associate endpoint.
# The two %s placeholders are filled with the similarity matrix name and the
# gene list when the prediction queries are built.
GENESHOT_URL = 'http://amp.pharm.mssm.edu/geneshot/api'
query_string = '/associate/%s/%s'

In [None]:
%%appyter code_exec
{% if similarity_matrix.value %}
for matrix in {{ similarity_matrix }}:
    feeds = []
    for drug, genelist in tqdm(genedict.items()):
        response = s.get(
            GENESHOT_URL + query_string % (matrix,genelist)
        )
        data = response.json()
        data[drug] = data.pop("association") # label each set of returned results with small molecule name
        feeds.append(data)
        time.sleep(0.5)
    
    # Create list of queried drugs to match dict_keys in json object
    drugs = [k for k,v in genedict.items()]

    # Iterate through json object and retrieve compounds matched to genes and similarity scores
    df = pd.DataFrame()

    simscores = []
    drug_list = []
    gene_list = []

    for drug in drugs:
        for item in feeds:
            if drug in item:
                for gene in item[drug]:
                    simscores.append((item[drug][gene]["simScore"]))
                    drug_list.append(drug)
                    gene_list.append(gene)

    df['Compound_name'] = drug_list 
    df['Gene_name'] = gene_list
    df['Similarity_score'] = simscores
    
    # Filter each gene by top 1000 small molecule associations by similarity score
    df_filtered = df.sort_values(by = ['Gene_name', 'Similarity_score'], ascending = False)\
        .groupby('Gene_name')\
        .head(1000)
    
    # Set compound name as dataframe index
    df_output = df_filtered[['Compound_name','Gene_name']]
    df_output.set_index('Compound_name', inplace = True)
    df_output.shape
    
    # Matching gene symbols to approved entrez gene symbols
    df_output = uf.map_symbols(df_output, symbol_lookup)
    
    # Create output path specific to similarity matrix
    output_name = 'geneshot_' + matrix
    path = 'output/drugmonizome_geneshot_' + matrix
    if not os.path.exists(path):
        os.makedirs(path)
    
    # Export edge list
    uf.save_data(df_output, path, output_name + '_edge_list', 
        ext='tsv', compression='gzip')
    
    # Create binary matrix
    binary_matrix_down = uf.binary_matrix(df_output)
    uf.save_data(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)
    
    # Create drug and attribute set library
    uf.save_setlib(binary_matrix_down, 'drug', path, output_name + '_drug_setlibrary')
    uf.save_setlib(binary_matrix_down, 'attribute', path, output_name + '_attribute_setlibrary')
    
    # Create attribute similarity matrix
    attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
    uf.save_data(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)
    
    # Create drug similarity matrix
    drug_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
    uf.save_data(drug_similarity_matrix, path,
            output_name + '_drug_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

        
{% else %} 
print('No similarity methods specified for generating predicted genes')
{% endif %}

### Create download folder with all outputs

In [None]:
# Bundle everything under ./output into a single downloadable archive.
# The context manager closes the archive, which is what writes the zip central
# directory; without it the file stays incomplete while the kernel is alive.
# A single os.walk already visits every sub-folder, so each file is added once.
with zipfile.ZipFile("output_archive.zip", "w") as zipf:
    for root, _, files in os.walk('output'):
        for f in files:
            zipf.write(os.path.join(root, f))

### Link to the output folder: [Download](./output_archive.zip)