# DeepCoverMOA Drug MOAs
[DeepCoverMOA](http://wren.hms.harvard.edu/DeepCoverMOA/#) is a resource detailing protein expression by mass spectrometry 24 hours after small molecule perturbation in order to study drug mechanisms of action (MOAs). The effect of 875 small molecules was studied across 9960 proteins in the human HCT116 cancerous cell line in a high-throughput, data-driven study. The researchers leveraged protein-protein and compound-compound correlation networks to uncover previously unknown MOAs for several compounds.

Mitchell, D. C., et al. (2023). "A proteome-wide atlas of drug mechanism of action." Nature Biotechnology.  
The paper is available [here](https://www.nature.com/articles/s41587-022-01539-0) and the data processed herein is available as Supplementary Table 1.


In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

In [None]:
# Route Bokeh output to the notebook (the setup cell already calls this once).
output_notebook()

## Load Data

In [None]:
# Read the DeepCoverMOA table from a local CSV copy.
# Presumably exported from Supplementary Table 1 of the paper — TODO confirm.
drugmoa = pd.read_csv('newdata/DeepCoverMOA/DeepCoverMOA.csv')
drugmoa

## Pre-Process Data

In [None]:
# Rows without a gene symbol cannot be indexed by gene, so discard them first.
drugmoa = drugmoa.dropna(subset=['Gene Name'])
# Index by gene symbol and discard the UniProt accession column.
drugmoa = drugmoa.set_index('Gene Name').drop(columns='UniprotID')
# Drop the final underscore-delimited token of every column name
# (presumably a replicate/plate suffix — TODO confirm against the raw header).
drugmoa.columns = drugmoa.columns.map(lambda x: '_'.join(x.split(sep='_')[:-1]))
# Missing measurements are set to 0; zeros are excluded from both the
# positive (>0) and negative (<0) selections made later.
drugmoa = drugmoa.replace(np.nan, 0)
drugmoa

In [None]:
# Display-only: the result is not assigned back to `drugmoa`.
# NOTE(review): NaNs were already replaced by 0 in the previous cell, so this
# filter cannot drop any row; if sparse rows should be removed it has to run
# before the NaN replacement and be assigned.
drugmoa.dropna(thresh=drugmoa.shape[1]/2)

## Process Data

In [None]:
# Reshape the gene x drug matrix into a long table with one row per
# (gene, drug) pair and its score.
associations = drugmoa.stack().to_frame().reset_index()
associations.columns = ['Gene', 'Drug', 'Score']
associations

In [None]:
# Report how many distinct drugs and genes the long-format table contains.
print(len(associations['Drug'].unique()), 'drugs,', len(associations['Gene'].unique()), 'genes')

In [None]:
# Positive scores only, ordered within each drug from highest to lowest.
up = (
    associations[associations['Score'] > 0]
    .sort_values(['Drug', 'Score'], ascending=[True, False])
    .set_index('Drug')
)
# Keep at most the 100 highest-scoring genes per drug. groupby().head() does
# this in one pass and also handles drugs that have a single row, which the
# previous per-drug `.loc[drug][:100]` + concat loop did not.
positive = (
    up.groupby(level='Drug', sort=False)
    .head(100)
    .sort_values('Score', ascending=False)
    .reset_index()
)
positive.columns = ['Drug', 'Gene', 'Score']
positive

In [None]:
# Negative scores only, ordered within each drug from lowest to highest.
down = (
    associations[associations['Score'] < 0]
    .sort_values(['Drug', 'Score'])
    .set_index('Drug')
)
# Keep at most the 100 lowest-scoring genes per drug. groupby().head() does
# this in one pass and also handles drugs that have a single row, which the
# previous per-drug `.loc[drug][:100]` + concat loop did not.
negative = (
    down.groupby(level='Drug', sort=False)
    .head(100)
    .sort_values('Score')
    .reset_index()
)
negative.columns = ['Drug', 'Gene', 'Score']
negative

In [None]:
def threshold(x):
    """Collapse a score to its sign.

    Returns 1 for a positive score and -1 for a negative one. A value that is
    neither above nor below zero (zero itself, or NaN) is handed back
    unchanged.
    """
    is_positive = x > 0
    if is_positive:
        return 1
    return -1 if x < 0 else x

In [None]:
# Merge the per-drug up- and down-regulated selections into a single table
# ordered by descending score, then add the ternary call (+1 / -1) and
# upper-case the gene symbols.
associations = (
    pd.concat([positive, negative])
    .sort_values('Score', ascending=False)
    .reset_index(drop=True)
)
associations['Threshold'] = associations['Score'].map(threshold)
associations['Gene'] = associations['Gene'].map(str.upper)
associations

In [None]:
# Display-only view grouped by drug with the highest scores first; the result
# is not assigned, so `associations` keeps its score-descending order.
associations.sort_values(['Drug','Score'],ascending=[True,False]).reset_index(drop=True)

In [None]:
# Report how many distinct drugs and genes remain after the per-drug selection.
print(len(associations['Drug'].unique()), 'drugs,', len(associations['Gene'].unique()), 'genes')

## Harmonizome Additions

#### Resource

In [None]:
# Row for the Harmonizome resource table; 103 is the resource id that the
# dataset row references as its resource fk.
# NOTE(review): the meaning of the remaining positional fields is not
# documented in this notebook — confirm against the resource table schema.
(103, 'DeepCoverMOA', None, 'Here we quantify "proteome fingerprints" depicting proteome-wide effects of 875 small molecule perturbagens as a resource for MOA deconvolution and compound repurposing', 'A proteome-wide atlas of drug mechanism of action', 'http://wren.hms.harvard.edu/DeepCoverMOA/#', 874, 1, None)

#### Dataset

In [None]:
# Row for the Harmonizome dataset table. Field order:
# id, name, name wo resource, desc, association, gene set desc, gene sets desc, attribute set desc, positive association, negative association, signed, continuous, last_update, directory, num page views, resource fk, measurement fk, dataset group fk, attribute type fk, attribute group fk, evidence type, evidence group, measurement bias, attribute type plural
# The measurement-bias value previously read 'high thoroughput' (typo).
(133, 'DeepCoverMOA Drug Mechanisms of Action', 'Drug Mechanisms of Action', 'protein expression profiles in HCT116 cell line following drug treatment', 'gene-small molecule associations by differential expression of gene following small molecule perturbation', 'proteins with high or low expression in {0} relative to other proteins from the DeepCoverMOA Drug Mechanisms of Action dataset.', 'sets of proteins with high or low expression relative to other proteins from the DeepCoverMOA Drug Mechanisms of Action dataset.', 'small molecule perturbations with high or low expression of {0} protein relative to other small molecule perturbations from the DeepCoverMOA Drug Mechanisms of Action dataset.', 'high expression', 'low expression', 1, 1, '01-09-23', 'deepcovermoa', 0, 103, 25, 5, 12, 2, 'protein expression by mass spectrometry', 'primary experimental data', 'high throughput, data driven', 'small molecule perturbations')

### Publication

In [None]:
# Presumably the publication id for the Mitchell et al. paper — TODO confirm.
# Note that `(138)` is a plain int, not a one-element tuple.
(138)

### Genes To Add

In [None]:
# Gene table read from tables/gene.csv (presumably the current production
# gene table, going by the variable name), indexed by its `id` column.
# Symbols are upper-cased so they compare equal to the upper-cased
# association symbols.
production = pd.read_csv('tables/gene.csv', index_col='id')
production['symbol'] = production['symbol'].apply(str.upper)
# Tab-separated gene_info file (presumably the NCBI gene_info dump — TODO
# confirm), reduced to the three columns used below and indexed by
# upper-cased symbol.
# NOTE(review): upper-casing may make symbols non-unique in this index —
# verify, since later `.loc[gene, ...]` lookups assume a single match.
geneinfo = pd.read_csv('tables/gene_info', sep='\t').get(['GeneID','Symbol','description'])
geneinfo['Symbol'] = geneinfo['Symbol'].apply(str.upper)
geneinfo = geneinfo.set_index('Symbol')

In [None]:
# Symbols present in the associations that are missing from the production
# gene table but can be resolved through gene_info. The production symbols
# are collected into a set once instead of rebuilding a list on every
# iteration. Association symbols are already upper-cased, so no further
# case normalisation is needed here.
known_symbols = set(production['symbol'])
newgenes = [
    gene
    for gene in associations['Gene'].unique().tolist()
    if gene not in known_symbols and gene in geneinfo.index
]

In [None]:
# Print one row per new gene in the gene-table layout:
#(id,symbol,ncbi_entrez_gene_id,name,ncbi_entrez_gene_url)
# and record the newly assigned primary key in `genefk` (symbol -> gene id).
genefk = production.reset_index().set_index('symbol')['id'].to_dict()
url = 'https://ncbi.nlm.nih.gov/gene/'
# First primary key handed to a new gene.
# NOTE(review): hard-coded — must be above the current maximum gene id.
index = 57029
for gene in newgenes:
    # Named `entrez_id` rather than `id` so the builtin is not shadowed.
    entrez_id = geneinfo.loc[gene, 'GeneID']
    print((index, gene, entrez_id, geneinfo.loc[gene, 'description'], url+str(entrez_id)), end=',\n')
    genefk[gene] = index
    index += 1

In [None]:
# Symbol -> Entrez gene id lookups from the production table and from gene_info.
proddict = production.set_index('symbol')['ncbi_entrez_gene_id'].to_dict()
geneinfodict = geneinfo['GeneID'].to_dict()

# One row per distinct association gene, flagged by where its id can be found.
genes = associations['Gene'].to_frame().drop_duplicates().reset_index(drop=True)
genes['production'] = genes['Gene'].apply(lambda x: x in proddict)
genes['new'] = genes['Gene'].apply(lambda x: x in newgenes)
# 0 marks "no id found"; such rows are filtered out below.
genes['id'] = 0

# The production table takes precedence; gene_info is used only for new genes.
for gene in genes.index:
    name = genes.loc[gene,'Gene']
    prod = genes.loc[gene, 'production']
    new = genes.loc[gene, 'new']
    if prod:
        genes.loc[gene,'id'] = proddict[name]
    elif new:
        genes.loc[gene,'id'] = geneinfodict[name]

genes = genes.get(['Gene','id'])
# Keep only genes that resolved to a positive Entrez id.
genes = genes[genes['id']>0].reset_index(drop=True)
genes.columns = ['Gene Symbol', 'Gene ID']
# Attach the gene-table primary key (existing or newly assigned) per symbol.
genes['Gene FK'] = genes['Gene Symbol'].apply(lambda x: genefk[x])
genes

### Attributes To Add

In [None]:
# Compound name -> integer id map read from a tab-separated file. Rows with
# any missing value are dropped and only the first row per compound name is
# kept. The ids sit in the unnamed second column ('Unnamed: 1'); presumably
# they are PubChem compound ids — TODO confirm.
pubchem = pd.read_csv('tables/pubchem.txt', sep='\t').dropna(how='any').drop_duplicates('Compound Name').set_index('Compound Name')['Unnamed: 1'].astype(int).to_dict()

In [None]:
def decodererror(drug):
    """Return *drug* with known mis-decoded compound names repaired.

    Three compound names carry a trailing '�' character; these are mapped to
    their clean spelling. Any other value is returned unchanged.
    """
    # Named `repaired_names` rather than `dict` so the builtin is not shadowed.
    repaired_names = {
        'Veliparib�': 'Veliparib',
        'Sapropterin�': 'Sapropterin',
        'Resmetirom�': 'Resmetirom',
    }
    return repaired_names.get(drug, drug)

In [None]:
# One row per distinct drug, with mis-decoded names repaired.
attributes = pd.DataFrame(associations['Drug'].unique(), columns=['Compound Name'])
attributes['Compound Name'] = attributes['Compound Name'].apply(decodererror)
# Direct dict indexing: raises KeyError for a compound missing from `pubchem`.
attributes['Pubchem'] = attributes['Compound Name'].apply(lambda x: pubchem[x])
# Two reset_index calls prepend two identical 0..n-1 counter columns, which
# are renamed and then offset into the two key ranges below.
attributes = attributes.reset_index().reset_index()
attributes.columns = ['attribute_fk','gene_set_fk','Compound Name', 'Pubchem']
# NOTE(review): hard-coded starting keys — confirm they do not collide with
# existing attribute / gene set ids.
attributes['attribute_fk'] += 297012
attributes['gene_set_fk'] += 133200000
# Compound name -> gene set key, used when building the association rows.
genesetfk = attributes.set_index('Compound Name')['gene_set_fk'].to_dict()
attributes

In [None]:
# NOTE: `production` is rebound here from the gene table to the attribute
# table, restricted to naming authority 19 (the same value used as the last
# field of the row template below).
production = pd.read_csv('production/attribute.csv')
production = production[production['naming_authority_fk']==19]
url = 'https://pubchem.ncbi.nlm.nih.gov/compound/'
# The print that emits the attribute rows is currently disabled, so this loop
# only rebinds `drug` and `id` (the latter shadows the builtin).
for drug in attributes.index:
    drug = attributes.loc[drug]
    id = drug['Pubchem']
    #print((drug['attribute_fk'], drug['Compound Name'], id, url+str(id), 19), end=',\n')

### Gene Sets To Add

In [None]:
# TODO: replace with the DeepCover URL (still pointing at PubChem).
url = 'https://pubchem.ncbi.nlm.nih.gov/compound/' # need to replace with DeepCover url, 
# The print that emits the gene set rows is currently disabled, so this loop
# only rebinds `drug` and `id` (the latter shadows the builtin).
for drug in attributes.index:
    drug = attributes.loc[drug]
    id = drug['Pubchem']
    #print((drug['gene_set_fk'], drug['Compound Name'], id, url+str(id), 133, 8, drug['attribute_fk']), end=',\n')

### Associations

In [None]:
# Association genes that did not resolve to an Entrez id and therefore have
# no row in `genes`. The valid symbols are collected into a set once instead
# of rebuilding a list on every iteration.
valid_symbols = set(genes['Gene Symbol'])
dropgenes = [
    gene
    for gene in associations['Gene'].unique().tolist()
    if gene not in valid_symbols
]

In [None]:
# Remove every row whose gene could not be resolved to an Entrez id.
associations = associations.set_index('Gene').drop(dropgenes, axis=0).reset_index()
associations['Drug'] = associations['Drug'].apply(decodererror)
# Snapshot with Gene / Drug / Score / Threshold columns, reused by the
# Downloads section after `associations` is reduced to key columns below.
deepcovermoa = associations.copy()
associations['gene_fk'] = associations['Gene'].apply(lambda x: genefk[x])
associations['gene_set_fk'] = associations['Drug'].apply(lambda x: genesetfk[x])
# Keep the first row per (gene, gene set) pair.
associations = associations.drop_duplicates(subset=['gene_fk', 'gene_set_fk']).reset_index(drop=True)
# The offset index is written out as the first CSV column by to_csv below.
# NOTE(review): hard-coded starting value — confirm it does not collide with
# existing association ids.
associations.index += 12000000
associations = associations[['gene_fk', 'gene_set_fk', 'Score', 'Threshold']]
associations.columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']
associations.to_csv('harmonizome-update/deepcovermoa.csv')
associations

## Downloads

In [None]:
# Directory that receives every download file written below.
output_path = 'newdata/DeepCoverMOA/downloads/'
# Keep the first row per (drug, gene) pair so each matrix cell has one value.
deepcovermoa = deepcovermoa.drop_duplicates(subset=['Drug', 'Gene']).reset_index(drop=True)

### Ternary Matrix

In [None]:
# Gene x drug matrix of ternary calls (+1 up, -1 down, 0 no association).
# aggfunc is given by name: passing np.max is deprecated in recent pandas.
ternarymatrix = pd.crosstab(deepcovermoa['Gene'], deepcovermoa['Drug'], deepcovermoa['Threshold'], aggfunc='max').replace(np.nan, 0).astype(int)
ternarymatrixT = ternarymatrix.T
# This is the gene-attribute matrix, not a similarity matrix. It was
# previously written to 'gene_similarity_matrix.txt.gz', which mislabelled
# the file next to the real 'gene_similarity_matrix_cosine.txt.gz'.
ternarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene-Attribute Edge List

In [None]:
# Long-format gene-drug table annotated with external identifiers.
edgelist = deepcovermoa.copy()
# Symbol -> Entrez gene id and compound name -> Pubchem id lookups.
geneid = genes.set_index('Gene Symbol')['Gene ID'].to_dict()
drugid = attributes.set_index('Compound Name')['Pubchem'].to_dict()
# Direct dict indexing: a gene or drug missing from the lookup raises KeyError.
edgelist['Gene ID'] = edgelist['Gene'].apply(lambda x: geneid[x])
edgelist['Pubchem ID'] = edgelist['Drug'].apply(lambda x: drugid[x])
edgelist = edgelist.get(['Gene', 'Gene ID', 'Drug', 'Pubchem ID', 'Score', 'Threshold'])

edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (gene symbol, Entrez gene id) pairs found in the edge list.
genelist = edgelist[['Gene', 'Gene ID']].drop_duplicates().reset_index(drop=True)
genelist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
genelist

### Attribute List

In [None]:
# Distinct (drug, Pubchem id) pairs found in the edge list.
# drop=True matches the gene list and keeps the old edge-list row numbers out
# of the exported file (they used to appear as a stray 'index' column).
attributelist = edgelist[['Drug', 'Pubchem ID']].drop_duplicates().reset_index(drop=True)
attributelist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributelist

### Gene Up Set Library

In [None]:
# Plain integer array of the ternary matrix (rows = genes, columns = drugs);
# the down-library cell below reuses it.
arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per drug: the drug name followed by every gene whose
# ternary call for that drug is +1.
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    for col_pos, term in enumerate(ternarymatrix.columns):
        members = ternarymatrix.index[arr[:, col_pos] == 1]
        print(term, *members, sep='\t', end='\n', file=f)

### Gene Down Set Library

In [None]:
# One tab-separated line per drug: the drug name followed by every gene whose
# ternary call for that drug is -1 (`arr` comes from the up-library cell).
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    for col_pos, term in enumerate(ternarymatrix.columns):
        members = ternarymatrix.index[arr[:, col_pos] == -1]
        print(term, *members, sep='\t', end='\n', file=f)

### Attribute Up Set Library

In [None]:
# Plain integer array of the transposed matrix (rows = drugs, columns =
# genes); the down-library cell below reuses it.
arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

# One tab-separated line per gene: the gene symbol followed by every drug
# whose ternary call for that gene is +1.
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    for col_pos, term in enumerate(ternarymatrixT.columns):
        members = ternarymatrixT.index[arr[:, col_pos] == 1]
        print(term, *members, sep='\t', end='\n', file=f)

### Attribute Down Set Library

In [None]:
# One tab-separated line per gene: the gene symbol followed by every drug
# whose ternary call for that gene is -1 (`arr` comes from the up-library cell).
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    for col_pos, term in enumerate(ternarymatrixT.columns):
        members = ternarymatrixT.index[arr[:, col_pos] == -1]
        print(term, *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows of the ternary matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
gene_vectors = ternarymatrix.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
# Clear the axis names on the similarity frame.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between drug rows of the transposed ternary
# matrix: condensed cosine distances -> square matrix -> 1 - distance.
drug_vectors = ternarymatrixT.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(drug_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
# Clear the axis names on the similarity frame.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Standardized Gene-Attribute Matrix

In [None]:
# Gene x drug matrix of the raw scores; pairs without an association are 0.
# aggfunc is given by name: passing np.max is deprecated in recent pandas.
standardmatrix = pd.crosstab(deepcovermoa['Gene'], deepcovermoa['Drug'], deepcovermoa['Score'], aggfunc='max').replace(np.nan, 0)
standardmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardmatrix

### Knowledge Graph Serializations

In [None]:
# Build the node and edge collections consumed by the RDF, JSON and TSV
# serialization cells that follow.
nodes = {}
edges = []

# Gene nodes are keyed by their integer Entrez gene id.
for gene_label, gene_id in zip(genelist['Gene'], genelist['Gene ID']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties":{
            "id":gene_id,
            "label":gene_label
        }}

# Drug nodes are keyed by compound name; the Pubchem id is kept as a property.
for drug_label, pubchem_id in zip(attributelist['Drug'], attributelist['Pubchem ID']):
    nodes[drug_label] = {
        "type":"drug",
        "properties": {
            "id":int(pubchem_id),
            "label":drug_label
        }}

# Ternary call -> relation name; rows with any other call produce no edge.
relation_by_call = {1: "positively regulates", -1: "negatively regulates"}

# Columns are iterated directly rather than looking each row up with `.loc`.
# The loop variables avoid the names `threshold` and `id`, so neither the
# threshold() helper defined earlier nor the builtin gets overwritten.
for source, pubchem_id, target, gene_id, score, call in zip(
        edgelist['Drug'], edgelist['Pubchem ID'], edgelist['Gene'],
        edgelist['Gene ID'], edgelist['Score'], edgelist['Threshold']):
    relation = relation_by_call.get(call)
    if relation is None:
        continue
    sourceid = int(pubchem_id)
    targetid = int(gene_id)
    edges.append({
        "source": source,
        "relation": relation,
        "target": targetid,
        "properties":{
            "id":source+":"+str(targetid),
            # "source_label" holds the Pubchem id and "source" the drug name;
            # the TSV export cell swaps the two when naming its columns.
            "source_label":sourceid,
            "target_label":target,
            "directed":True,
            "score":score,
            "threshold":int(call)
        }})

### RDF

In [None]:
# Relation name -> Relation Ontology (RO) local id; built once, not per edge.
relation_ids = {'positively regulates':'0002213', 'negatively regulates':'0002212'}

# Write one "drug relation gene ." triple per edge in Turtle syntax.
# Prefix declarations need the IRI in angle brackets and a terminating " .";
# the earlier bare-URL form could not be read by a Turtle parser.
with open(output_path+'serializations/deepcovermoa.rdf', 'w') as f:
    print('@prefix pubchem: <https://pubchem.ncbi.nlm.nih.gov/compound/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)

    for edge in edges:
        # The Pubchem id is stored under the edge's "source_label" property.
        sourceid = str(edge['properties']['source_label'])
        relation = relation_ids[edge['relation']]
        targetid = str(edge['target'])
        print('pubchem:'+sourceid, 'RO:'+relation, 'gene:'+targetid, end=' .\n', file=f)

### JSON

In [None]:
# Dump the graph as pretty-printed JSON. json.dump writes to the file and
# returns None, so its result is not bound to a name.
with open(output_path+'serializations/deepcovermoa.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

### TSV

In [None]:
# One row per node: transpose so node keys become rows with 'type' and
# 'properties' columns, then flatten the properties dict.
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = nodeframe['properties'].apply(lambda x: x['id'])
nodeframe['label'] = nodeframe['properties'].apply(lambda x: x['label'])
# Identifier namespace is derived from the node type.
nodeframe['namespace'] = nodeframe['type'].apply(lambda x: {'gene':'NCBI Entrez', 'drug':'Pubchem'}[x])
# Drop the mixed int/str node keys in favour of a plain 0..n-1 index.
nodeframe = nodeframe.get(['namespace','id','label']).reset_index(drop=True)
nodeframe.to_csv(output_path+'serializations/deepcovermoa_tsv/nodes.tsv', sep='\t')

In [None]:
# One row per edge, with the nested properties flattened into columns
# (the per-edge "id" property is not exported).
edgeframe = pd.DataFrame(edges)
edgeframe['source_label'] = edgeframe['properties'].apply(lambda x: x['source_label'])
edgeframe['target_label'] = edgeframe['properties'].apply(lambda x: x['target_label'])
edgeframe['directed'] = edgeframe['properties'].apply(lambda x: x['directed'])
edgeframe['score'] = edgeframe['properties'].apply(lambda x: x['score'])
edgeframe['threshold'] = edgeframe['properties'].apply(lambda x: x['threshold'])
edgeframe = edgeframe.drop(columns=['properties'])
# In the edge dicts "source" is the drug name and "source_label" the Pubchem
# id. Selecting them in swapped order and then renaming positionally makes
# the exported `source` column the Pubchem id and `source_label` the drug name.
edgeframe = edgeframe.get(['source_label', 'relation', 'target', 'source', 'target_label', 'directed', 'score', 'threshold'])
edgeframe.columns = ['source', 'relation', 'target', 'source_label', 'target_label', 'directed', 'score', 'threshold']
edgeframe.to_csv(output_path+'serializations/deepcovermoa_tsv/edges.tsv', sep='\t')

## Visualizations
### Gene-Attribute Heatmap

In [None]:
# Clustered heatmap of the ternary gene x drug matrix, colour scale centred on 0.
sns.clustermap(ternarymatrix, cmap='seismic', center=0)

### Gene Similarity Heatmap

In [None]:
# Clustered heatmap of gene-gene cosine similarity, colour scale centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Heatmap

In [None]:
# Clustered heatmap of drug-drug cosine similarity, colour scale centred on 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file, dir='up'):
    """Parse tab-separated gene set lines into an ordered term -> genes map.

    Args:
        file: Iterable of text lines, each of the form
            ``term<TAB>gene<TAB>gene...``.
        dir: Suffix appended to every term after an underscore, so that up
            and down libraries can share one dictionary.

    Returns:
        An ``OrderedDict`` mapping ``"<term>_<dir>"`` to a single
        space-separated string of the term's distinct genes, in the order they
        first appear on the line. A term without genes maps to ``""``.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        # A blank line carries no term; skip it instead of creating "_<dir>".
        if not line:
            continue
        term, *geneset = line.split('\t')
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is reproducible between runs (set() ordering is not).
        gmt[term+'_'+dir] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Load the up and down gene set libraries into one dictionary; load_gmt
# suffixes the terms with _up / _dn so they do not collide. The files are
# opened in `with` blocks so the handles are closed after reading.
with open('newdata/DeepCoverMOA/downloads/gene_set_library_up_crisp.gmt', 'r') as up_file:
    libdict = load_gmt(up_file)
with open('newdata/DeepCoverMOA/downloads/gene_set_library_dn_crisp.gmt', 'r') as dn_file:
    libdict.update(load_gmt(dn_file, 'dn'))
# Directory that receives the saved UMAP HTML.
scatterdir = 'newdata/DeepCoverMOA/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with UMAP and label them with Leiden clusters.

    Args:
        libdict: Mapping of term name -> space-separated gene string, as
            produced by ``load_gmt``.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with one row per term and columns ``x``, ``y`` (UMAP
        coordinates), ``cluster`` ("Cluster <label>"), ``term`` and ``genes``.
    """
    print("\tTF-IDF vectorizing gene set data...")
    # Each gene set string is treated as a document, tokenised with the
    # vectorizer's default settings.
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    # Fixed random_state is passed to the UMAP layout.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by their Leiden label so that rows of the
    # returned frame are grouped by cluster.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    # Attach the original gene string for each term.
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 colour to every distinct value of ``df['cluster']``.

    Clusters are taken in order of first appearance. The palette is
    rearranged so that its even-indexed entries come first, followed by its
    odd-indexed entries, and it is reused cyclically when there are more than
    20 clusters.

    Returns:
        dict mapping cluster label -> colour string.
    """
    palette = Category20[20]
    ordered_colors = list(palette[0::2]) + list(palette[1::2])
    mapping = {}
    for position, cluster in enumerate(pd.unique(df['cluster']).tolist()):
        mapping[cluster] = ordered_colors[position % 20]
    return mapping

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh UMAP scatter plot of the gene sets.

    Args:
        scatterdf: DataFrame with ``x``, ``y``, ``cluster`` and ``term``
            columns, as returned by ``process_scatterplot``. It is copied,
            not modified.

    Returns:
        The Bokeh figure, with points coloured by cluster and a hover tooltip
        showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every inner <div> is closed explicitly; the earlier markup left the
    # first two open, nesting the later rows inside the first one.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones the tooltip (@gene_set, @x, @y, @cluster)
    # and the scatter glyph ('x', 'y', 'colors') refer to.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis ticks and tick labels (grid lines are left untouched)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the DeepCoverMOA Drug Mechanisms of Action Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Compute the embedding; the commented-out keyword lines are alternative
# settings that can be enabled by uncommenting them.
scatter_df = process_scatterplot(libdict, nneighbors=30,mindist=0.1
    #,spread=0.25, 
    #,maxdf=0.7 
    #,mindf=0.055
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# Writes the figure to <scatterdir>umap.html with the given page title.
output_file(filename=f"{scatterdir}umap.html", title = 'Gene Sets in the DeepCoverMOA Drug Mechanisms of Action Library')
save(plot)