# DrugShot
DrugShot searches PubMed for articles that co-mention any search term and terms that describe drugs.

It then prioritizes these drugs using various methods, and predicts additional drugs based on shared properties among drugs and other small molecules.


In [None]:
#%%appyter init
# Set up the appyter cell magics used by the `%%appyter ...` cells below.
# The lambda's default argument binds the built-in `globals`, so calling the
# lambda returns this notebook's global namespace.
# NOTE(review): presumably appyter evaluates the templated cells in that namespace -- confirm.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
import zipfile
import datetime

import pandas as pd
import numpy as np

# Display / graphing
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go

# API access
import requests
import io
import time

In [None]:
# Notebook display util functions
def make_clickable(link):
    """Return an HTML anchor, opening in a new tab, that points at `link`.

    `link` is used both as the ``href`` target and as the visible text. It is
    HTML-escaped first because the links built in this notebook are derived
    from the user-supplied search term; without escaping, characters such as
    ``"``, ``<`` or ``&`` would break out of the attribute or inject markup.

    Parameters
    ----------
    link : str
        Path or URL to link to.

    Returns
    -------
    str
        An ``<a target="_blank" ...>`` element as a string.
    """
    import html  # local import keeps this helper self-contained

    safe_link = html.escape(str(link), quote=True)
    return f'<a target="_blank" href="{safe_link}">{safe_link}</a>'

# Running counters for numbered captions; figure_header advances them and
# figure_legend reads the current value.
table_number = 0
figure_number = 0
def figure_header(label, title):
    """Display a large heading for the next table or figure.

    When `label` is exactly 'Table' or 'Figure' the matching module-level
    counter is incremented and the label becomes e.g. 'Table 3'. Any other
    label is shown unchanged and no counter is touched.

    `title` is inserted into the markup as-is, so it may contain HTML.
    """
    global table_number, figure_number
    if label == 'Figure':
        figure_number += 1
        label = f'Figure {figure_number}'
    elif label == 'Table':
        table_number += 1
        label = f'Table {table_number}'
    banner = HTML(f"<div style='font-size:2rem; padding:1rem 0;'><b>{label}</b>: {title}</div>")
    display(banner)
    
def figure_legend(label, title, content=''):
    """Display a centred caption for the most recently announced table or figure.

    For a `label` of exactly 'Table' or 'Figure' the caption is numbered with
    the current value of the matching module-level counter; the counters are
    only read here, never advanced. Any other label is shown unchanged.
    `title` is rendered in italics and `content` follows it as free text.
    """
    if label == 'Figure':
        label = f'Figure {figure_number}'
    elif label == 'Table':
        label = f'Table {table_number}'
    caption = HTML(f'<style>div.caption {{text-align: center;}}</style><div class=caption><b>{label}</b>: <i>{title}</i>. {content} </div>')
    display(caption)

In [None]:
%%appyter hide_code

{# Input-form sections; the fields declared in the next cell refer to them by name. #}
{% do SectionField(name='section1',
                   title = '1. Submit Your Biomedical Term of Interest:')%}

{% do SectionField(name='section2',
                   title = '2. Choose Number of Top Associated Drugs to Make Predictions:')%}

In [None]:
%%appyter hide_code

{# Search term; rendered into the code cells below wherever `term` is interpolated. #}
{% set term = StringField(
    name='input_term',
    label='Biomedical Term',
    default='Lung Cancer',
    description='Input your biomedical term of interest.',
    section='section1'
) %}

{# Number of top-scoring associated drugs that form the drug set used for predictions. #}
{% set set_size = IntField(
    name='set_size',
    label='Associated drug set size',
    min=50,
    max=200,
    default=50,
    description='Size of drug set used for predictions of additional compounds',
    section='section2'
) %}

### Load DrugRIF

In [None]:
# Read only the drug name and PMID columns of the DrugRIF table, then index by drug name
drugrif_url = 'https://appyters.maayanlab.cloud/storage/DrugShot/DrugRIF.tsv.gz'
DrugRIF = pd.read_csv(drugrif_url, sep = '\t', usecols = ['name','PMID'])
DrugRIF = DrugRIF.set_index('name')

### Query Term Using PubMed Eutils API

In [None]:
%%appyter code_exec
# Collect the PMIDs of every PubMed article matching the search term by paging
# through the ESearch endpoint.
pubmed_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

results = []
i = 0  # retstart offset of the next page to request
while True:
    # Sending the term through `params` lets requests URL-encode it, so terms
    # containing characters such as '&' or '#' cannot corrupt the query string.
    # NOTE(review): NCBI documents a limit on how many PubMed UIDs ESearch will
    # return -- confirm that retmax=100000 is honoured for large result sets.
    response = requests.get(pubmed_url, params = dict(db = 'pubmed',
                                                      term = {{term}},
                                                      retmax = 100000,
                                                      retstart = i,
                                                      retmode = 'json'))
    response.raise_for_status()
    res = response.json()
    esearch = res.get('esearchresult', {})
    page_ids = esearch.get('idlist', [])
    results.extend(page_ids)
    # Advance by the size of the page just received (not of the first page).
    i += len(page_ids)
    # An empty page means nothing more can be fetched; without this check a
    # term with zero hits would loop forever.
    if not page_ids or i >= int(esearch.get('count', 0)):
        break
    # Pause between consecutive requests to the API
    time.sleep(0.2)

### Associated Drug Table

In [None]:
# Retrieve top 500 associated drugs from DrugRIF based on search term co-mentions in the literature.
# The PMIDs collected from the ESearch JSON are cast to the dtype of the
# DrugRIF PMID column so that isin() compares values of the same type.
# NOTE(review): recent pandas versions do not coerce mismatched types in isin -- confirm the column dtype.
pmid_hits = pd.Series(results, dtype = object).astype(DrugRIF['PMID'].dtype)
term_counts = DrugRIF[DrugRIF['PMID'].isin(pmid_hits)].index.value_counts()

# Name the counts explicitly and clear the axis name: what value_counts() calls
# its result differs between pandas versions, and later cells expect this exact
# column name and an unnamed index.
df_associated = term_counts.rename('Publications with Search Term').rename_axis(None).to_frame()[0:500]

# Get counts of each chemical in DrugRIF with a single pass over the index
# (instead of rebuilding and scanning a Python list once per chemical)
total_counts = DrugRIF.index.value_counts().reindex(df_associated.index)

df_associated['Publications with Search Term / Total Publications'] = df_associated['Publications with Search Term'].div(total_counts)
df_associated['Score'] = df_associated['Publications with Search Term'] * df_associated['Publications with Search Term / Total Publications']

# DrugRIF is not used after this cell; release it.
# Re-running this cell therefore requires re-running the load cell first.
del(DrugRIF)

In [None]:
%%appyter code_exec
# Order the associated drugs by publication count, save the full table and preview it
associated_table = df_associated.sort_values(by = 'Publications with Search Term', ascending = False)
associated_csv = {{term}}.replace(' ','_')+'_associated_drug_table.csv'
associated_table.to_csv(associated_csv)

figure_header('Table', 'Top Associated Compounds ({})'.format(make_clickable(associated_csv)))
# Only the first two columns (count and fraction) are shown; Score stays in the CSV
display(associated_table.iloc[:, 0:2].head(20))
figure_legend('Table', 'Top 20 Drugs associated with '+{{term}})

del df_associated

### Scatter Plot of Drug Frequency in Literature

In [None]:
%%appyter code_exec
# Scatter of publication count against the fraction of each drug's publications
# that mention the search term. rename_axis() labels the index 'chemical'
# whether or not it already has a name, so the hover column is always present
# (reset_index() only produces a column called 'index' for an unnamed index).
scatter_data = associated_table.rename_axis('chemical').reset_index()
fig = px.scatter(scatter_data,
                 x = 'Publications with Search Term',
                 y = 'Publications with Search Term / Total Publications',
                 hover_name = 'chemical',
                 title={{term}})
fig.show()

#### Create list of top associated compounds ranked by (Publications with Search Term) * (Publications with Search Term / Total Publications)

In [None]:
%%appyter code_exec
# Names of the highest-Score drugs; this list is the drug set used by the prediction sections
ranked_by_score = associated_table.sort_values(by = 'Score', ascending = False)
associated_compounds = ranked_by_score.iloc[0:{{set_size}}].index.tolist()

In [None]:
%%appyter markdown

### Predicted DrugRIF Co-occurrence
The top {{ set_size }} drugs from the associated drug table are ranked by the product of (Publications with Search Term) * (Publications with Search Term / Total Publications) and treated as an unweighted drug set.

Predicted compounds are computed based on average co-occurrence counts of PubMed IDs between the unweighted drug set and other drugs & small molecules within DrugRIF.

In [None]:
# Read the drug-by-drug co-occurrence matrix; the first column of the file becomes the row index
cooccurrence_url = 'https://appyters.maayanlab.cloud/storage/DrugShot/DrugRIF_cooccurence_matrix.tsv.gz'
df_cooccurence = pd.read_csv(cooccurrence_url, sep = '\t', index_col = 0)

In [None]:
%%appyter code_exec
# Keep only the rows of the associated drug set, append their column-wise mean
# as a 'Score' row, then order the columns from highest to lowest Score
in_drug_set = df_cooccurence.index.isin(associated_compounds)
df_cooccurence = df_cooccurence.loc[in_drug_set]
df_cooccurence.loc['Score'] = df_cooccurence.mean()
df_cooccurence = df_cooccurence.sort_values(by = 'Score', axis = 1, ascending = False)

In [None]:
%%appyter code_exec
# Calculate rank of each of the compounds in the associated drug set.
# Compounds that are not columns of the co-occurrence matrix are skipped (the
# co-expression section filters in the same way) rather than raising a
# KeyError from get_loc / .loc.
matrix_columns = df_cooccurence.columns
compounds_in_matrix = [x for x in associated_compounds if x in matrix_columns]
ranksum_associated = sorted([(len(matrix_columns)-matrix_columns.get_loc(x))\
                      for x in compounds_in_matrix], reverse = True)

# Create dict for labeling each of the associated compounds with its 1-based
# column position, visiting them from highest to lowest Score
plot_text_dict = {}
for x in df_cooccurence.loc['Score'].loc[compounds_in_matrix].sort_values(ascending = False).index:
    plot_text_dict[x] = matrix_columns.get_loc(x)+1
# Plot
# NOTE(review): x has one value per matrix column while y has one per
# associated compound -- confirm these are the intended x coordinates.
fig = go.Figure(data=go.Scatter(x= [x for x in reversed(range(1,len(df_cooccurence.columns)+1))],
                                y= ranksum_associated,
                                mode='lines+markers',
                                text=[f'{x}<br>Rank: {plot_text_dict[x]}' for x in plot_text_dict],
                                hoverinfo='text'),
               layout = {"title": {"text": "Retrieval of Top Associated Drugs Based on Ranking in Co-occurence Prediction Matrix"},
                         "xaxis": {"title": "Ranking of All {} Compounds in Prediction Matrix".format(len(df_cooccurence.columns)), "showticklabels": False},
                         "yaxis": {"title": "Ranking of Top {} Associated Compounds".format({{ set_size }}), "showticklabels": False}})
fig.show()

In [None]:
%%appyter code_exec
# Predicted compounds are the Score entries that are not part of the drug set.
# A boolean mask is used (as in the co-expression section) because .drop()
# raises a KeyError when any associated compound is missing from the matrix.
predicted_table = pd.DataFrame(df_cooccurence.loc['Score'])
predicted_table = predicted_table[~predicted_table.index.isin(associated_compounds)]
predicted_table.to_csv({{term}}.replace(' ','_')+'_cooccurence_similarity_predicted_drug_table.csv')
figure_header('Table', 'Top Predicted Compounds ({})'.format(make_clickable({{term}}.replace(' ','_')+'_cooccurence_similarity_predicted_drug_table.csv')))
display(predicted_table.head(20))
figure_legend('Table', 'Top 20 drugs predicted to be associated with {} based on DrugRIF co-occurence'.format({{term}}))

# The matrix is not used after this cell; release it
del(df_cooccurence)

In [None]:
%%appyter markdown

### Predicted L1000 Co-expression
The top {{ set_size }} drugs from the associated drug table are ranked by the product of (Publications with Search Term) * (Publications with Search Term / Total Publications) and treated as an unweighted drug set.

Predicted compounds are computed based on average cosine similarity of drug-induced L1000 gene expression signatures between the unweighted drug set and thousands of approved and experimental small molecules from [SEP-L1000](https://maayanlab.net/SEP-L1000/index.html)

In [None]:
# Load correlation matrix into pandas DataFrame
response = requests.get('https://appyters.maayanlab.cloud/storage/DrugShot/L1000_similarity_matrix.npz')
# Stop here on an HTTP error instead of handing an error page to np.load
response.raise_for_status()
# SECURITY NOTE: allow_pickle=True lets np.load unpickle objects stored in the
# archive, which can execute arbitrary code; keep it only while the storage
# host is trusted.
# NOTE(review): presumably needed because 'index' is stored as an object array -- confirm.
coexpression_matrix = np.load(io.BytesIO(response.content), allow_pickle = True)

# The same 'index' labels are used for both rows and columns
df_coexpression = pd.DataFrame(data = coexpression_matrix['correlations'], columns = coexpression_matrix['index'],
                      index = coexpression_matrix['index'])

# The raw archive is no longer needed once the DataFrame exists
del(coexpression_matrix)

In [None]:
%%appyter code_exec
# Keep only the rows of the associated drug set, append their column-wise mean
# as a 'Score' row, then order the columns from highest to lowest Score
in_drug_set = df_coexpression.index.isin(associated_compounds)
df_coexpression = df_coexpression.loc[in_drug_set]
df_coexpression.loc['Score'] = df_coexpression.mean()
df_coexpression = df_coexpression.sort_values(by = 'Score', axis = 1, ascending = False)

In [None]:
%%appyter code_exec
# Rank (counted from the end of the Score-ordered columns) of every associated
# compound that has a row in the co-expression matrix, largest first
n_compounds = len(df_coexpression.columns)
ranksum_associated = [n_compounds - df_coexpression.columns.get_loc(x)
                      for x in associated_compounds if x in df_coexpression.index]
ranksum_associated.sort(reverse = True)

# Hover labels: 1-based column position of each associated compound, visited
# from highest to lowest Score
score_row = df_coexpression.loc['Score']
associated_scores = score_row[df_coexpression.columns.isin(associated_compounds)].sort_values(ascending = False)
plot_text_dict = dict((x, score_row.index.get_loc(x)+1) for x in associated_scores.index)

# Plot
hover_text = [f'{x}<br>Rank: {plot_text_dict[x]}' for x in plot_text_dict]
fig = go.Figure(data=go.Scatter(x= list(reversed(range(1,n_compounds+1))),
                                y= ranksum_associated,
                                mode='lines+markers',
                                text=hover_text,
                                hoverinfo='text'),
               layout = {"title": {"text": "Retrieval of Top Associated Drugs Based on Ranking in Co-Expression Prediction Matrix"},
                         "xaxis": {"title": "Ranking of All {} Compounds in Prediction Matrix".format(n_compounds), "showticklabels": False},
                         "yaxis": {"title": "Ranking of Top {} Associated Compounds".format({{ set_size }}), "showticklabels": False}})
fig.show()

In [None]:
%%appyter code_exec
# Predicted compounds: Score entries for everything outside the associated drug set
score_frame = df_coexpression.loc['Score'].to_frame()
predicted_table = score_frame[~score_frame.index.isin(associated_compounds)]
coexpression_csv = {{term}}.replace(' ','_')+'_coexpression_similarity_predicted_drug_table.csv'
predicted_table.to_csv(coexpression_csv)
figure_header('Table', 'Top Predicted Compounds ({})'.format(make_clickable(coexpression_csv)))
display(predicted_table.head(20))
figure_legend('Table', 'Top 20 drugs predicted to be associated with {} based on coexpression'.format({{term}}))

# The matrix is not used after this cell; release it
del df_coexpression

In [None]:
%%appyter markdown

### Predicted Tanimoto Structural Similarity
The top {{ set_size }} drugs from the associated drug table are ranked by the product of (Publications with Search Term) * (Publications with Search Term / Total Publications) and treated as an unweighted drug set.

Predicted compounds are computed based on average Tanimoto similarity between the unweighted drug set and all other drugs & small molecules included in DrugRIF

In [None]:
# Load tanimoto similarity matrix into pandas DataFrame
response = requests.get('https://appyters.maayanlab.cloud/storage/DrugShot/Tanimoto_similarity_matrix.npz')
# Stop here on an HTTP error instead of handing an error page to np.load
response.raise_for_status()
# SECURITY NOTE: allow_pickle=True lets np.load unpickle objects stored in the
# archive, which can execute arbitrary code; keep it only while the storage
# host is trusted.
# NOTE(review): presumably needed because 'index' is stored as an object array -- confirm.
tanimoto_matrix = np.load(io.BytesIO(response.content), allow_pickle = True)

# The same 'index' labels are used for both rows and columns
df_tanimoto = pd.DataFrame(data = tanimoto_matrix['correlations'], columns = tanimoto_matrix['index'],
                      index = tanimoto_matrix['index'])
# Every similarity equal to 1 becomes NaN so it is ignored by the mean taken later.
# NOTE(review): this masks all values of 1, not only the diagonal -- confirm
# that distinct compounds with similarity 1 should also be excluded.
df_tanimoto = df_tanimoto.replace({1:np.nan})

# The raw archive is no longer needed once the DataFrame exists
del(tanimoto_matrix)

In [None]:
%%appyter code_exec
# Calculate average similarity for each drug with the associated drug set.
# The drug set is `associated_compounds` (top drugs by Score), the same set the
# other prediction sections use and that the later Tanimoto cells rank and
# remove. Slicing `associated_table` directly would pick the top drugs by
# publication count instead, because that table is sorted by that column.
df_tanimoto = df_tanimoto.loc[df_tanimoto.index.isin(associated_compounds)]
df_tanimoto.loc['Score'] = df_tanimoto.mean()
df_tanimoto = df_tanimoto.sort_values(by = 'Score', axis = 1, ascending = False)

In [None]:
%%appyter code_exec
# Calculate rank of each of the compounds in the associated drug set.
# Compounds that are not columns of the Tanimoto matrix are skipped (the
# co-expression section filters in the same way) rather than raising a
# KeyError from get_loc / .loc.
matrix_columns = df_tanimoto.columns
compounds_in_matrix = [x for x in associated_compounds if x in matrix_columns]
ranksum_associated = sorted([(len(matrix_columns)-matrix_columns.get_loc(x))\
                      for x in compounds_in_matrix], reverse = True)

# Create dict for labeling each of the associated compounds with its 1-based
# column position, visiting them from highest to lowest Score
plot_text_dict = {}
for x in df_tanimoto.loc['Score'].loc[compounds_in_matrix].sort_values(ascending = False).index:
    plot_text_dict[x] = matrix_columns.get_loc(x)+1
# Plot
# NOTE(review): x has one value per matrix column while y has one per
# associated compound -- confirm these are the intended x coordinates.
fig = go.Figure(data=go.Scatter(x= [x for x in reversed(range(1,len(df_tanimoto.columns)+1))],
                                y= ranksum_associated,
                                mode='lines+markers',
                                text=[f'{x}<br>Rank: {plot_text_dict[x]}' for x in plot_text_dict],
                                hoverinfo='text'),
               layout = {"title": {"text": "Retrieval of Top Associated Drugs Based on Ranking in Tanimoto Similarity Matrix"},
                         "xaxis": {"title": "Ranking of All {} Compounds in Prediction Matrix".format(len(df_tanimoto.columns)), "showticklabels": False},
                         "yaxis": {"title": "Ranking of Top {} Associated Compounds".format({{ set_size }}), "showticklabels": False}})
fig.show()

In [None]:
%%appyter code_exec
# Predicted compounds are the Score entries that are not part of the drug set.
# A boolean mask is used (as in the co-expression section) because .drop()
# raises a KeyError when any associated compound is missing from the matrix.
predicted_table = pd.DataFrame(df_tanimoto.loc['Score'])
predicted_table = predicted_table[~predicted_table.index.isin(associated_compounds)]
predicted_table.to_csv({{term}}.replace(' ','_')+'_tanimoto_similarity_predicted_drug_table.csv')
figure_header('Table', 'Top Predicted Compounds ({})'.format(make_clickable({{term}}.replace(' ','_')+'_tanimoto_similarity_predicted_drug_table.csv')))
display(predicted_table.head(20))
figure_legend('Table', 'Top 20 drugs predicted to be associated with {} based on chemical structure similarity'.format({{term}}))