In [None]:
#%%appyter init
from appyter import magic
# Pass Appyter a callable that returns this notebook's global namespace
# (the lambda simply calls `globals()`); presumably this is what lets the
# `%%appyter` cell magics below read and define notebook variables --
# confirm against the Appyter documentation.
magic.init(lambda _=globals: _())

# Dr.Emb Appyter: A Web Platform for Drug Discovery using Embedding Vectors
Dr.Emb Appyter is a web-based platform that integrates various embedding methods for compound search. For each query compound, it retrieves the library compounds located closest to it in the embedding space. <br> <br>
The inputs for this Appyter are (a) Library, (b) Query Compound Data (Name, SMILES), and (c) Parameters (top k candidates, embedding method, similarity method). If users want to choose different data or parameters, users can simply go back, make the changes, and then run it again. <br> <br>
The default libraries and embedding vectors used to run this Appyter can be found <a href="https://github.com/KU-MedAI/The-Dr.Emb-Appyter.git" target="_blank" rel="noopener noreferrer">here</a>.

In [None]:
import os
import sys
import json
import shlex
import requests
from tqdm import tqdm
import utils

import deepchem as dc
from rdkit.Chem import AllChem, Lipinski, Descriptors, Crippen
from rdkit import Chem, DataStructs
import torch
from sklearn.metrics.pairwise import cosine_similarity
import gseapy as gp
import methods.moable.model
from macaw import *

import pandas as pd
import numpy as np
import pickle
import faiss

from upsetplot import plot, from_contents, UpSet
from sklearn.manifold import TSNE
import umap
import plotly.io as po
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from itertools import combinations

from IPython.display import HTML, display, Markdown, IFrame, FileLink, Image, HTML

import certifi
import urllib3
urllib3.disable_warnings()

In [None]:
%%appyter hide_code_exec
{# Declares the three sections of the input form: library selection
   (LB_Section), query input (Data_Section) and search parameters
   (Parameter_Section). The fields declared in the following cells attach
   themselves to these sections through their `section=` argument. #}
{% do SectionField(
    name='LB_Section',
    title='Select a Library',
    subtitle='Option 1: Select your library: KCB(Korean Chemical Bank, 605K), ZINC(20K), MCE(4.3K), Selleckchem(9K).
    Option 2: Upload your custom library.',
    img='library.png'
) %}

{% do SectionField(
    name='Data_Section',
    title='Enter Your Query',
    subtitle='Load your query as tab separated formats without headers and index columns (One drug per row). 
    Option 1: One query compound. 
    Option 2: Multiple query compounds (maximum 10 compounds) as tab-separated formats without headers and index columns (One drug per row)',
    img='data.png'
) %}

{% do SectionField(
    name='Parameter_Section',
    title='Set Parameters',
    subtitle='Top K candidates: the maximum number(K) of top candidates to be returned. 
        Embedding Methods: Choose an embedding method among ReSimNet, MoAble, Mol2vec, MACAW, ECFP and MACCS Keys. 
        Similarity Metric: Choose similarity measurement. 
        We recommend cosine similarity for ReSimNet and MoAble, Euclidean distance for Mol2vec and MACAW and Jaccard similarity for ECFP and MACCS Keys.',
    img='setting.png'
) %}

In [None]:
%%appyter hide_code_exec

{# Library selection tab.
   - 'Select a Library': one of the pre-built libraries; the chosen value
     (kcb / zinc / mce / selleck) is read later as LB_field.value[0].
   - 'Custom Library': an uploaded TSV (LB_field.value[0]) plus the name of
     a column to highlight (LB_field.value[1]). #}
{% set LB_field = TabField(
    name='lb_field',
    label='Library Selection',
    default='Select a Library',
    choices={
        'Select a Library': [
            ChoiceField(
                name='lb_kind',
                label='Select your library',
                default='MCE',
                choices={'KCB':'kcb','ZINC':'zinc','MCE':'mce','Selleckchem':'selleck'}
            )
        ],
        'Custom Library': [
            FileField(
                name='custom',
                label='Load your Custom Library',
                default='example_custom_library.tsv',
                examples={"example_custom_library.tsv":url_for('static', filename='custom_library.tsv')},
                description='Upload a tab-delimited file. Column names should correspond to drug_name and drug_smiles, without an index column'),
            StringField(
                name='custom_info', 
                label='Custom Information', 
                default='Description', 
                description='Column name of the information you want to highlight in the custom library.'
            )
        ]
    },
    section = 'LB_Section')
 %}

In [None]:
%%appyter hide_code_exec
{# Query input tab.
   - 'One Query Compound': a name (input_types.value[0]) and a SMILES string
     (input_types.value[1]) typed into the form.
   - 'Multiple Query Compounds': an uploaded file (input_types.value[0]);
     the embedding cell reads it as headerless, tab-separated name/SMILES rows. #}
{% set input_types = TabField(
    name='input_types',
    label='Data file',
    default='Multiple Query Compounds',
    description='Upload your drugs information files',
    choices={        
        'One Query Compound': [
            StringField(
                name='drug1_name', 
                label='Query Compound Name', 
                default='Tenofovir', 
                description='', 
                section='Data_Section'
            ),
            StringField(
                name='drug1_smiles', 
                label='Query Compound SMILES', 
                default='C(CN1C=NC2=C(N=CN=C21)N)OCP(=O)(O)O', 
                description='', 
                section='Data_Section'
            )
        ],
        'Multiple Query Compounds':[
            FileField(
                name='drug1_list_filename', 
                label='List of Candidates (.tsv or .txt)', 
                default='example_drugs_antiviral.txt', 
                examples={"example_drugs_antiviral.txt":url_for('static', filename='antiviral_drugs.txt')},
                description='List of Candidates. Columns should correspond to drug name and drug smiles without any header and index columns. One drug per row',
                section='Data_Section'
            )]

    },
    section = 'Data_Section',
) %}

In [None]:
%%appyter hide_code_exec

{# Search parameters. The rendered values are read by the configuration
   cell as topk_candidate.value, embedding_method.value and
   similarity_method.value. #}
{% set topk_candidate = IntField(
    name='topk_candidate', 
    label='Top K Candidates', 
    min=1, 
    max=10000, 
    default=30, 
    description='The maximum number of Top candidates', 
    section='Parameter_Section')
%}

{# The dictionary values (ReSimNet, MoAble, ECFP, MACCSKeys, Mol2vec, MACAW)
   are the identifiers the Python cells compare `embed_method` against. #}
{% set embedding_method = ChoiceField(
    name='embedding_method',
    label='Embedding Methods',
    choices={'ReSimNet (Expression-based)':'ReSimNet','MoAble (Expression-based)':'MoAble','ECFP (Fingerprint-based)':'ECFP','MACCS Keys (Fingerprint-based)':'MACCSKeys',
             'Mol2vec (Sequence-based)':'Mol2vec', 'MACAW (Sequence-based)':'MACAW'},
    default='ReSimNet (Expression-based)',
    description='Select an embedding method',
    section='Parameter_Section')
%}

{# The dictionary values (Cosine, Euclidean, Jaccard) are the identifiers
   the Python cells compare `sim_method` against. #}
{% set similarity_method = ChoiceField(
    name='similarity_method',
    label='Similarity Methods',
    choices={'Cosine similarity':'Cosine', 'Euclidean distance':'Euclidean', 'Jaccard similarity':'Jaccard'},
    default='Cosine similarity',
    description='Select a similarity method to calculate',
    section='Parameter_Section')
%}

In [None]:
%%appyter code_exec

# Working directories, relative to the notebook's working directory:
# per-query input pickles go to `input_path`, generated embeddings and
# result files to `output_path`.
input_path = 'input_data/'
output_path = 'output_data/'

# `exist_ok=True` creates each directory only when it is missing, without the
# separate existence check (and the race window between check and creation).
os.makedirs(input_path, exist_ok=True)
os.makedirs(output_path, exist_ok=True)

In [None]:
%%appyter code_exec

# database_field
{% if LB_field.raw_value == "Select a Library" %}
input_db = "{{ LB_field.value[0] }}"
library_df = utils.library_npl(input_db)[0]
npl = utils.library_npl(input_db)[1]

{% elif LB_field.raw_value == "Custom Library" %}
input_db = "custom"
custom_df = pd.read_csv({{ LB_field.value[0] }}, sep='\t')
label = {{ LB_field.value[1] }}
{% endif %}

# prototypes_field
{% if input_types.raw_value == "One Query Compound" %}
drug1_name = {{ input_types.value[0] }}
drug1_smiles = {{ input_types.value[1] }}
input_mode = "single"

{% elif input_types.raw_value == "Multiple Query Compounds" %}
query_filename = {{ input_types.value[0] }}
input_mode = "multiple"
{% endif %}

# set parameters
embed_method = "{{embedding_method.value}}"
sim_method = "{{similarity_method.value}}"
topk_candidate = {{topk_candidate.value}}

In [None]:
%%appyter code_exec
{% if similarity_method.value == "Jaccard" %}
if embed_method not in ["ECFP", "MACCSKeys"]:
    try:
        raise ValueError("Error: Invalid embed_method. Please use 'ECFP' or 'MACCSKeys' for Embedding Methods.")
    except ValueError as e:
        print(e)
        assert False    
    
{% endif %}

In [None]:
%%appyter markdown
{% if LB_field.raw_value == "Custom Library" %}

## Custom Library
If users have their own libraries, users can upload a library file to the ‘Custom Library’ field in the input page. The file should contain compound names and SMILES information. <br>
Once uploaded, the custom library is preprocessed to generate embedding vectors. <br> <br>
Please note that generating embedding vectors for a custom library that contains a large number of compounds may take a long time.

{% endif %}

In [None]:
%%appyter code_exec
{% if LB_field.raw_value == "Custom Library" %}

library_df, npl = utils.custom_npl(custom_df)
custom_dict = dict(zip(library_df["drug_name"], library_df["drug_smiles"]))

if embed_method == 'ReSimNet':
    input_embed_filename = "./input_drugs.pkl"
    with open(input_embed_filename, "wb") as f:
        pickle.dump(custom_dict, f)

    resimnet_model = f"ReSimNet7.mdl"
    custom_output_file = 'Custom_ReSimNet_embedding_vectors_7.pkl'

    !python3 ../../../methods/ReSimNet/main.py --save-embed True --embed-d 2 --drug-file $input_embed_filename --output-filename $custom_output_file --checkpoint-dir ../../../methods/ReSimNet/results/ --model-name $resimnet_model --data-path ../../../methods/ReSimNet/tasks/data/drug\(v0.6\).pkl

    custom_embed_dict = pd.read_pickle(custom_output_file)
    
    embedding_html = "<a href=\"./{}\" target='_blank'>{}</a>".format(custom_output_file, f"Download {embed_method} embedding vectors for custom library")
    display(HTML(embedding_html))

else:
    custom_output_file = f'Custom_{embed_method}_embedding_vectors.pkl'

    if embed_method == 'MACAW':
        mcw = MACAW()
        mcw.fit(library_df['drug_smiles'])
        X = mcw.transform(library_df['drug_smiles'])
        drug_names = library_df['drug_name']
        with open(output_path + custom_output_file, 'wb') as f:
            custom_embed_dict = dict(zip(drug_names, X))
            pickle.dump(custom_embed_dict, f)
    
    else:
        custom_embed_dict = utils.custom_embedding(custom_dict, embed_method, output_path, custom_output_file)

    embedding_html = "<a href=\"./{}\" target='_blank'>{}</a>".format(output_path+custom_output_file, f"Download {embed_method} embedding vectors for custom library")
    display(HTML(embedding_html))
        
{% endif %}

## Generating Embedding Vectors
The embedding vectors of the query compounds are generated based on the selected embedding method. Dr.Emb Appyter offers six embedding models: ReSimNet, MoAble, ECFP, MACCS Keys, Mol2vec, and MACAW. <br>
The embedding vectors are available for download by clicking the link below:

In [None]:
filenames, combined = [], []
embed_dict = {}

if input_mode == 'multiple':
    df = pd.read_csv(query_filename, sep="\t", header=None)
    df.columns = ["drug1_name", "drug1_smiles"]
    drug_dict = dict(zip(df["drug1_name"], df["drug1_smiles"]))
    output_file = f'{embed_method}_embedding_vectors.pkl'

else:
    df = pd.DataFrame([{'drug1_name':drug1_name, 'drug1_smiles':drug1_smiles}])
    output_file = f'{embed_method}_embedding_vectors.pkl'
    drug_dict = dict(zip(df["drug1_name"], df["drug1_smiles"]))

# Gene expression
if embed_method == 'ReSimNet':
    for name, smiles in drug_dict.items():
        globals()[f'input_embed_{name}'] = {name:smiles}
        with open(input_path + f'input_{name}.pkl', "wb") as f:
            pickle.dump(globals()[f'input_embed_{name}'], f)
            
        input_embed_filename = input_path + f"input_{name}.pkl"
        output_embed_filename = output_path + f"output_{name}_embeddings.pkl"
        resimnet_model_filename = "ReSimNet7.mdl"
        filenames.append(output_embed_filename)
        
        !python3 ../../../methods/ReSimNet/main.py --save-embed True --embed-d 2 --drug-file $input_embed_filename --output-filename $output_embed_filename --checkpoint-dir ../../../methods/ReSimNet/results/ --model-name $resimnet_model_filename --data-path ../../../methods/ReSimNet/tasks/data/drug\(v0.6\).pkl
        
    for file in filenames:
        with open(file, 'rb') as file:
            data = pickle.load(file)
            combined.append(data)
    with open(output_path+output_file, 'wb') as file:
        pickle.dump(combined, file)
    embed_dict = combined.copy()

elif embed_method == 'MoAble':
    embed_dict = utils.drug_embeddings(drug_dict)
    with open(output_path + output_file,'wb') as f:
        pickle.dump(embed_dict, f)
    for k, v in embed_dict.items():
        with open(output_path + f'{k}_output_embedding.pkl', 'wb') as f:
            pickle.dump({k:v}, f)
        
# Sequence
elif embed_method == 'Mol2vec':
    featurizer = dc.feat.Mol2VecFingerprint('../../../methods/mol2vec/mol2vec_model_300dim.pkl')
    for name, smiles in drug_dict.items():
        with open(output_path + f'{name}_output_embedding.pkl', 'wb') as f:
            pickle.dump({name: featurizer.featurize(smiles)}, f)
        with open(output_path + output_file, 'wb') as f:
            globals()[f'input_{name}'] = {name: featurizer.featurize(smiles)}
            embed_dict[name] = featurizer.featurize(smiles)
            pickle.dump(embed_dict, f)
    
# Fingerprint
elif embed_method == 'ECFP':
    for name, smiles in drug_dict.items():
        with open(output_path + f'{name}_output_embedding.pkl', 'wb') as f:
            pickle.dump({name:utils.smiles2fp(smiles)}, f)
        with open(output_path + output_file, 'wb') as f:
            globals()[f'input_{name}'] = {name:utils.smiles2fp(smiles)}
            embed_dict[name] = utils.smiles2fp(smiles)
            pickle.dump(embed_dict, f)


elif embed_method == 'MACCSKeys':
    featurizer = dc.feat.MACCSKeysFingerprint()
    for name, smiles in drug_dict.items():
        with open(output_path + f'{name}_output_embedding.pkl', 'wb') as f:
            pickle.dump({name:featurizer.featurize(smiles)}, f)
        with open(output_path + output_file, 'wb') as f:
            globals()[f'input_{name}'] = {name:featurizer.featurize(smiles)}
            embed_dict[name] = featurizer.featurize(smiles)
            for drug, values in embed_dict.items():
                embed_dict[drug] = values.flatten()
            pickle.dump(embed_dict, f)


elif embed_method == 'MACAW':
    if input_db != 'custom':
        mcw = utils.pretrained_MACAW(input_db)
    for name, smiles in drug_dict.items():
        with open(output_path + f'{name}_output_embedding.pkl', 'wb') as f:
            pickle.dump({name:mcw.transform([smiles])}, f)
        with open(output_path + output_file, 'wb') as f:
            globals()[f'input_{name}'] = {name:mcw.transform([smiles])}
            embed_dict[name] = mcw.transform([smiles])
            pickle.dump(embed_dict, f)

embedding_html = "<a href=\"./{}\" target='_blank'>{}</a>".format(output_path+output_file, f"Download {embed_method} embedding vectors for query compounds")
display(HTML(embedding_html))

## Compounds Search
Dr.Emb Appyter uses the Faiss-based search system to calculate and compare distances between query embedding vectors and library embedding vectors. Faiss efficiently measures the distances between the library and query compounds and returns the top closest compounds as a result. <br> <br>
The distance between embedding vectors is calculated based on the selected similarity method. Please note that Jaccard similarity is calculated only for binary vectors such as ECFP and MACCS Keys.

In [None]:
# custom_result_df
# Nearest-neighbour search against a custom library (cosine / Euclidean).
# For every query this publishes `<name>_simliarity`, `<name>_index` and
# `<name>_result_df` as notebook variables and writes `<name>_candidates.tsv`.
if sim_method != 'Jaccard':
    for name, smiles in drug_dict.items():
        # Set for every query, even when the custom branch is skipped,
        # because this name lives at notebook scope.
        data_filename = f"{name}_candidates.tsv"
        if input_db != 'custom':
            continue

        # ReSimNet query embeddings are written under a different file name.
        if embed_method == 'ReSimNet':
            query_embedding_file = output_path + f'output_{name}_embeddings.pkl'
        else:
            query_embedding_file = output_path + f'{name}_output_embedding.pkl'

        sim_scores, hit_indices, neighbours = utils.custom_finder(custom_embed_dict, embed_method, npl, sim_method, query_embedding_file, topk_candidate, name)
        globals()[f'{name}_simliarity'] = sim_scores
        globals()[f'{name}_index'] = hit_indices

        # Attach the library metadata to the retrieved compounds.
        hits = pd.concat(neighbours, axis=1).reset_index().rename(columns={"index": "drug_name"})
        hits = pd.merge(hits, library_df)
        hits[f'{sim_method} similarity'] = sim_scores[0]
        # Rank by the rows actually present rather than assuming exactly
        # `topk_candidate` rows came back.
        hits['rank'] = range(1, len(hits) + 1)
        hits = hits.set_index(['rank'])
        hits['drug_smiles'] = hits['drug_smiles'].map(utils.make_clickable)
        hits = hits.rename(columns={'drug_name':'compound_name', 'drug_smiles':'compound_smiles'})

        hits.to_csv(data_filename, sep='\t')
        globals()[f'{name}_result_df'] = hits

In [None]:
# Jaccard search: one result table per query compound, published as
# `<query_name>_result_df` and written to `<query_name>_candidates.tsv`.
if sim_method == 'Jaccard':
    query_dataframes = utils.jaccard_dataframes(input_db, custom_embed_dict if input_db=='custom' else embed_dict, embed_method, embed_dict, topk_candidate)

    # Custom libraries use drug_name/drug_smiles; the pre-built ones use
    # drug2_name/drug2_smiles.
    if input_db == 'custom':
        name_col, smiles_col = 'drug_name', 'drug_smiles'
    else:
        name_col, smiles_col = 'drug2_name', 'drug2_smiles'

    # The loop variable is deliberately not called `df`, so the query
    # DataFrame defined earlier in the notebook is not overwritten.
    for query_name, jaccard_hits in query_dataframes.items():
        if input_db != 'custom':
            jaccard_hits = jaccard_hits.rename(columns={'drug_name':'drug2_name'})
        hits = pd.merge(jaccard_hits, library_df)

        # Move the score to the last column.
        hits['Jaccard Similarity'] = hits.pop('Jaccard Similarity')
        # Rank by the rows actually present rather than assuming exactly
        # `topk_candidate` rows came back.
        hits['rank'] = range(1, len(hits) + 1)
        hits = hits.set_index(['rank'])
        hits[smiles_col] = hits[smiles_col].map(utils.make_clickable)
        hits = hits.rename(columns={name_col:'compound_name', smiles_col:'compound_smiles'})

        # One file per query. The original built the file name once from a
        # leaked loop variable, so every query overwrote the same file.
        data_filename = f"{query_name}_candidates.tsv"
        hits.to_csv(data_filename, sep='\t')
        globals()[f'{query_name}_result_df'] = hits

In [None]:
# result_df
if sim_method != 'Jaccard':
    for name, smiles in drug_dict.items():
        if input_db != 'custom':
            if embed_method == 'ReSimNet':
                globals()[f'{name}_simliarity'], globals()[f'{name}_index'], globals()[f'{name}_result_df'] = utils.resimnet_finder(input_db, npl, output_path+f'output_{name}_embeddings.pkl', topk_candidate, name, "ReSimNet7")

                globals()[f'{name}_result_df'] = pd.concat(globals()[f'{name}_result_df'], axis=1).reset_index()
                globals()[f'{name}_result_df'] = globals()[f'{name}_result_df'].rename(columns={"index":"drug2_name"})
                globals()[f'{name}_result_df'] = pd.merge(globals()[f'{name}_result_df'], library_df)
                
                globals()[f'{name}_df'] = globals()[f'{name}_result_df'].copy()
                globals()[f'{name}_df']["drug1_name"] = name
                globals()[f'{name}_df']["drug1_smiles"] = smiles

                input_filename = f"./input_pairs_{name}.tsv"
                output_filename = f"output_data/output_pairs_{name}.tsv"
                globals()[f'{name}_df'].to_csv(input_filename, index=None, sep="\t")

                # resimnet score calculation
                !python3 ../../../methods/ReSimNet/main.py --save-prediction-new-pairs True --new-drug-pair-filename $input_filename --output-filename $output_filename --checkpoint-dir ../../../methods/ReSimNet/results/ --data-path ../../../methods/ReSimNet/tasks/data/drug\(v0.6\).pkl
            
            else:
                if embed_method in ['MoAble', 'ECFP']:
                    globals()[f'{name}_simliarity'], globals()[f'{name}_index'], globals()[f'{name}_result_df'] = utils.MA_finder(input_db, embed_method, npl, sim_method, output_path+f'{name}_output_embedding.pkl', topk_candidate, name)
                else:
                    globals()[f'{name}_simliarity'], globals()[f'{name}_index'], globals()[f'{name}_result_df'] = utils.finder(input_db, embed_method, npl, sim_method, output_path+f'{name}_output_embedding.pkl', topk_candidate, name)

                globals()[f'{name}_result_df'] = pd.concat(globals()[f'{name}_result_df'], axis=1).reset_index()
                globals()[f'{name}_result_df'] = globals()[f'{name}_result_df'].rename(columns={"index":"drug_name" if input_db == 'custom' else 'drug2_name'})
                globals()[f'{name}_result_df'] = pd.merge(globals()[f'{name}_result_df'], library_df)
                globals()[f'{name}_result_df'][f'{sim_method} similarity' if sim_method!='Euclidean' else f'{sim_method} distance'] = globals()[f'{name}_simliarity'][0]
                globals()[f'{name}_result_df']['rank'] = [x for x in range(1,topk_candidate+1)]
                globals()[f'{name}_result_df'] = globals()[f'{name}_result_df'].set_index(['rank'])
                if input_db == 'custom':
                    globals()[f'{name}_result_df']['drug_smiles'] = globals()[f'{name}_result_df'].apply(lambda x: utils.make_clickable(x['drug_smiles']), axis=1)
                    globals()[f'{name}_result_df'].rename(columns={'drug_name':'compound_name', 'drug_smiles':'compound_smiles'}, inplace=True)
                else:
                    globals()[f'{name}_result_df']['drug2_smiles'] = globals()[f'{name}_result_df'].apply(lambda x: utils.make_clickable(x['drug2_smiles']), axis=1)
                    globals()[f'{name}_result_df'].rename(columns={'drug2_name':'compound_name', 'drug2_smiles':'compound_smiles'}, inplace=True)
                globals()[f'{name}_result_df'].to_csv(data_filename, sep='\t')

In [None]:
# Post-process the ReSimNet pair predictions written by the previous cell:
# aggregate the per-model scores, rank the candidates, and publish
# `<name>_result_df` (full table) and `<name>_result_df1` (display table
# without the individual model columns).
if sim_method != 'Jaccard':
    if input_db != 'custom' and embed_method == 'ReSimNet':
        for name, smiles in drug_dict.items():
            scores = pd.read_csv(output_path + f'output_pairs_{name}.tsv', sep="\t")
            scores['drug2_smiles'] = scores['drug2_smiles'].map(utils.make_clickable)
            scores = scores.set_index(['drug2_name', 'drug2_smiles'])
            scores = scores.drop(['drug1_name','drug1_smiles'], axis=1)

            # Fix the list of score columns BEFORE adding the aggregates, so
            # the standard deviation is taken over the model scores only.
            # The original recomputed the list after adding `ReSimNet_avg`,
            # which folded the mean into the standard deviation.
            model_columns = [x for x in scores.columns if x.startswith("ReSimNet")]
            scores["ReSimNet_avg"] = scores[model_columns].mean(axis=1)
            scores['ReSimNet_std'] = scores[model_columns].std(axis=1)

            # order columns
            scores = scores.reindex(sorted(scores.columns), axis=1)
            scores = scores.sort_values(["ReSimNet_avg", "jaccard_similarity"], ascending=False)
            scores = scores.reset_index(drop=False)

            # rank
            # Use the number of rows actually read rather than assuming
            # exactly `topk_candidate` rows.
            scores['rank'] = range(1, len(scores) + 1)
            scores = scores.set_index(['rank'])
            scores = scores.rename(columns={'drug2_name':'compound_name', 'drug2_smiles':'compound_smiles'})
            globals()[f'{name}_result_df'] = scores

            # html
            # Hide the individual ensemble scores (ReSimNet_0, ReSimNet_1, ...)
            # in the displayed table, whatever their number.
            ensemble_columns = [x for x in model_columns if x.startswith('ReSimNet_') and x[len('ReSimNet_'):].isdigit()]
            globals()[f'{name}_result_df1'] = scores.drop(columns=ensemble_columns)

## Search Results
The search results for the query compound(s) are presented as follows:
-	The top k compounds are retrieved from the library and ranked from most to least similar to the query compound.
-	The names and SMILES of the compounds in the library are displayed in the 'compound_name' and 'compound_smiles' columns, respectively.
-	The selected similarity (or distance) results are shown in the similarity (or distance) column.
-	For custom libraries, user-specified column information is displayed accordingly in the result table.
-	Additionally, the SMILES information is interactive, with clickable links redirecting to PubChem.
-	The results can be downloaded as a .tsv (tab-separated values) file.

In [None]:
css = """
<style>
table.dataframe td{
    white-space: pre-wrap;
    max-width: 450px;
    overflow: hidden;
}
</style>
"""

tb_number = 1
styles = [dict(selector="caption", props=[("text-align", "left")])]

# The library embeddings do not depend on the query, so load them once
# instead of once per query compound.
if input_db != 'custom':
    lib_embed_dict = utils.embed_vector_lib(input_db, embed_method)

# For every query: show its result table, offer the table as a TSV download,
# and save/offer the embedding vectors of its top-k compounds.
for name in drug_dict.keys():
    result_table = globals()[f'{name}_result_df']

    # For ReSimNet on a pre-built library the display table
    # (`<name>_result_df1`) was already prepared by the previous cell;
    # in every other case the full table is displayed.
    if not (input_db != 'custom' and embed_method == 'ReSimNet'):
        globals()[f'{name}_result_df1'] = result_table.copy()

    results_file = f'{name}_results.tsv'
    caption = f"Table {tb_number}. Top {topk_candidate} compounds similar to {name} based on {embed_method} embedding"
    tb_number += 1
    result_table.to_csv(results_file, sep='\t')

    result_html = "<a href=\"./{}\" target='_blank'>{}</a>".format(results_file, "Download results: {}".format(results_file))
    globals()[f'{name}_html'] = result_html
    html_table = css+globals()[f'{name}_result_df1'].to_html(classes='styled-table', escape=False)
    styled_html = f"<caption>{caption}</caption>{html_table}"
    display(HTML(styled_html))
    display(HTML(result_html))

    # Keep only the vectors of compounds that made it into this query's table.
    topk_embedding_file = f'{name}_top_{topk_candidate}_embedding_vectors.pkl'
    source_vectors = custom_embed_dict if input_db == 'custom' else lib_embed_dict
    globals()[f'{name}_final_vectors'] = {key: source_vectors[key] for key in result_table['compound_name'] if key in source_vectors}

    with open(output_path+topk_embedding_file, 'wb') as f:
        pickle.dump(globals()[f'{name}_final_vectors'], f)

    embedding_html = "<a href=\"./{}\" target='_blank'>{}</a>".format(output_path+topk_embedding_file, f"Download results: {name} Top {topk_candidate} {embed_method} embedding vectors")
    display(HTML(embedding_html))

In [None]:
%%appyter markdown
{% if input_types.raw_value == "Multiple Query Compounds" %}

### Overlapping Compounds Information
Dr.Emb Appyter checks for any overlapping compounds between the top k compounds of query compounds and displays their information in a table. <br> <br>
If there are overlapping compounds, their names, SMILES, and the query compounds they overlap with are displayed in a table. The column names are compound_name, compound_smiles, and overlap, respectively. However, if no overlapping compounds are detected, the table is omitted, and the message "There are no overlapping compounds." is displayed.

{% endif %}

In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}

upset_dict = {}
rdf = []

for name in drug_dict.keys():
    upset_dict[f"{name}"] = list(globals()[f'{name}_result_df']['compound_name'])

result_keys = utils.find_duplicate_names(upset_dict)

for compound_name, values in result_keys.items():
    if compound_name in library_df['drug2_name' if input_db != "custom" else "drug_name"].values:
        compound_smiles = library_df.loc[library_df['drug2_name' if input_db != "custom" else "drug_name"] == compound_name, 'drug2_smiles' if input_db != "custom" else "drug_smiles"].values[0]
        rdf.append({'compound_name': compound_name, 'compound_smiles': compound_smiles, 'overlap': values})

ovlap_df = pd.DataFrame(rdf)

if not ovlap_df.empty:
    caption = f"Table {tb_number}. Overlapping compounds results based on {embed_method} embedding"
    tb_number += 1
    ovlap_df['compound_smiles'] = ovlap_df.apply(lambda x: utils.make_clickable(x['compound_smiles']), axis=1)
    ovlap_table = ovlap_df.to_html(classes='styled-table', escape=False)
    ovlap_df_html = f"<caption>{caption}</caption>{ovlap_table}"
    ovlap_html = "<a href=\"./{}\" target='_blank'>{}</a>".format('overlap_results.tsv', "Download results: {}".format(f'overlap_results.tsv'))

    display(HTML(ovlap_df_html))
    display(HTML(ovlap_html))
else:
    print("There are no overlapping compounds.")

{% endif %}

## Plots

### Scatter Plot
A scatter plot by UMAP visualizes all the compounds in the library (denoted as L, in gray, with a circle shape), along with the query compounds (Q, diamond) and the top k compounds (Q_topk, circle) in the embedding space. <br> <br>
-	Node size: Nodes increase in size with the number of overlapping compounds. The default node size is 0.3, the size of query compound nodes is 0.5, and the size of overlapping compounds increases incrementally by 1.
-	Color: The default node color is gray. For custom libraries, node color varies based on the user-specified column. The top k compounds for each query compound are distinctively colored.
-	Shape: Query compounds are represented by diamond-shaped nodes, while the top k compounds and the library compounds are shown as circles.
-	Hover Information: Hovering over a node reveals details such as compound name, SMILES, coordinates of the compound embedding vector, node size, and overlapping query compounds.

In [None]:
# Select the library embedding vectors used for plotting: the vectors computed
# for the custom library, or the pickled vectors of the chosen pre-built library.
if input_db != 'custom':
    # ReSimNet library files carry an extra "_7" suffix.
    if embed_method == 'ReSimNet':
        library_pickle = f'../../../Library/ReSimNet_{input_db}/ReSimNet_{input_db}_7.pkl'
    else:
        library_pickle = f'../../../Library/{embed_method}_{input_db}/{embed_method}_{input_db}.pkl'
    library_dr = pd.read_pickle(library_pickle)

    # MACCS key vectors are flattened after loading.
    if embed_method == 'MACCSKeys':
        for drug, values in library_dr.items():
            library_dr[drug] = values.flatten()
else:
    library_dr = custom_embed_dict

In [None]:
# Build one row per compound: L holds the library embeddings, Q the query embeddings.
# For a custom library, library_dr is the very same object as custom_embed_dict
# (assigned in the library-loading cell), so a single code path serves both the
# built-in and the custom libraries. Any method not listed explicitly falls through
# to the plain dict -> DataFrame conversion.
if embed_method in ['Mol2vec', 'MACAW']:
    # v[0] takes the first element of each stored value
    # (presumably a (1, dim) array -- TODO confirm against the embedding step).
    data_array_l = np.array([v[0] for v in library_dr.values()])
    L = pd.DataFrame(data_array_l, columns=list(range(data_array_l.shape[1])), index=library_dr.keys())
    data_array_q = np.array([v[0] for v in embed_dict.values()])
    Q = pd.DataFrame(data_array_q, columns=list(range(data_array_q.shape[1])), index=embed_dict.keys())

elif embed_method == 'ReSimNet':
    L = pd.DataFrame.from_dict(library_dr).T
    # For ReSimNet, embed_dict is iterated as a sequence of {name: vector} mappings;
    # each mapping entry becomes one column, and the transpose turns them into rows.
    dataframes = [
        pd.DataFrame({key: value})
        for d in embed_dict
        for key, value in d.items()
    ]
    Q = pd.concat(dataframes, axis=1).T

else:
    L = pd.DataFrame.from_dict(library_dr).T
    Q = pd.DataFrame.from_dict(embed_dict).T

In [None]:
# Move the compound names out of the index into a 'compound_name' column.
L.reset_index(inplace=True)
Q.reset_index(inplace=True)
L.rename(columns={'index':'compound_name'}, inplace=True)
Q.rename(columns={'index':'compound_name'}, inplace=True)

# node_shape is the grouping label used by the scatter plot: each query keeps its
# own name, every library compound starts out labelled with the library name.
Q['node_shape'] = Q['compound_name']
L['node_shape'] = f'{input_db}'
Q['node_size'] = 0.5
L['node_size'] = 0.3

# Custom libraries are coloured by the user-selected column of library_df.
if input_db == 'custom':
    Q['color'] = Q['compound_name']
    L['color'] = library_df[label]

# Relabel the top-k hits of every query. A compound that is a hit for several
# queries ends up with the label of the last query processed.
all_labels = []
for name in drug_dict.keys():
    # The per-query result table lives in a dynamically named global created earlier;
    # the hit list itself is kept in a plain local.
    toplist = list(globals()[f'{name}_result_df']['compound_name'])
    L.loc[L['compound_name'].isin(toplist), 'node_shape'] = f'{name}_top{topk_candidate}'
    all_labels.append(toplist)
all_top_list = [item for sublist in all_labels for item in sublist]

# Count in how many top-k lists each compound appears (single pass).
label_counts = {}
for compound in all_top_list:
    label_counts[compound] = label_counts.get(compound, 0) + 1

# Node size: 0.3 for a compound hit by exactly one query, otherwise the hit count.
# The sizes are built as floats from the start so that no float is ever assigned
# into an integer column (pandas deprecates that implicit upcast).
data = {
    'compound_name': list(label_counts.keys()),
    'node_size': [0.3 if count == 1 else float(count) for count in label_counts.values()],
}
dataframe = pd.DataFrame(data)
dataframe.set_index('compound_name', inplace=True)

# DataFrame.update aligns on the index, so temporarily index L by compound name.
L.set_index('compound_name', inplace=True)
L.update(dataframe)
L.reset_index(inplace=True)

library_query = pd.concat([L, Q], axis=0, ignore_index=True)

# Label of the last embedding column per method (inclusive, because .loc slices
# by label); any other method uses 255.
last_feature_col = {
    'ReSimNet': 299,
    'Mol2vec': 299,
    'ECFP': 2047,
    'MACAW': 14,
    'MACCSKeys': 166,
}.get(embed_method, 255)
features = library_query.loc[:, 0:last_feature_col]



In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}

# Hover text: the comma-separated result_keys entry for compounds that have one,
# "-" for every other compound.
library_query['Overlap'] = [
    ', '.join(result_keys[compound_name]) if compound_name in result_keys else "-"
    for compound_name in library_query['compound_name']
]
        
{% endif %}

In [None]:
# Reduce the embedding columns to three UMAP dimensions using cosine distance.
# No random_state is passed, so the layout is not pinned to a seed.
Umap = umap.UMAP(metric='cosine', n_components=3)
umap_projections = Umap.fit_transform(features)

In [None]:
# Query compounds are the node_shape labels that are neither the library label nor a
# "<query>_top<k>" label. The library label is compared for equality: a substring
# test would also discard any query whose name merely contains the library name.
special_labels = [
    label for label in library_query['node_shape'].unique()
    if label != f'{input_db}' and not label.endswith(f'_top{topk_candidate}')
]
color_discrete_map = {f'{input_db}': 'gray'}
hover_data_config = {}

# The Overlap column only exists when multiple query compounds were entered.
if 'Overlap' in library_query.columns:
    hover_data_config['Overlap'] = library_query.Overlap.tolist()

# Custom libraries are coloured by the user-selected column, built-in libraries by
# the node_shape label; everything else about the figure is identical.
color_values = library_query.color if input_db == 'custom' else library_query.node_shape

fig = px.scatter_3d(
    umap_projections, x=0, y=1, z=2,
    color=color_values,
    labels={'color': 'label'},
    color_discrete_map=color_discrete_map,
    hover_name=library_query['compound_name'],
    size=library_query.node_size,
    opacity=0.3,
    hover_data=hover_data_config
)

# Remove the marker outlines.
fig.update_traces(marker=dict(line=dict(width=0)))

fig.update_layout(
    width=800,
    height=600,
    title=f'Scatter plot of {input_db}_UMAP'
)

# Draw the traces that belong to query compounds as diamonds.
for trace in fig.data:
    if trace.name in special_labels:
        trace.marker.symbol = 'diamond'

fig.show()

In [None]:
%%appyter markdown
{% if input_types.raw_value == "Multiple Query Compounds" %}

### UpSet Plot
In this section, Dr.Emb Appyter visualizes overlapping compounds graphically. An UpSet plot illustrates any overlapping compounds found for each query compound. <br> <br>
When there are overlapping compounds, the black dots under the query compounds in the plot are connected. These connections indicate that the compounds have shared results. <br> <br>
If there are no overlapping compounds, the black dots are not connected, and the bar graph represents the number of compounds.

{% endif %}

In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}

# Turn the per-query hit lists into set-membership data and draw the UpSet plot.
upsetplot = from_contents(upset_dict)
upset_figure = UpSet(upsetplot, show_counts=True, subset_size='count')
upset_figure.plot()
plt.title('UpSet Plot')
# The PNG is written before the caption is added, so the saved file has no caption.
plt.savefig('Upset_plot.png')
plt.figtext(0.5, 0.01, "figure 1. UpSet Plot for the Entered Query Compounds Results", ha="center", fontsize=10)
display(HTML('<a href="Upset_plot.png" download>Download: UpSet Plot</a>'))

{% endif %}

In [None]:
%%appyter markdown
{% if input_types.raw_value == "Multiple Query Compounds" %}

### Heatmap
Dr.Emb Appyter shows similarities between user-queried compounds. The heatmap visualizes the similarity between the embedding vectors of the query compounds entered by the user, facilitating an assessment of the likelihood that the query compounds share similar properties.
<br> <br>
Please note that ReSimNet produces two heatmaps based on ReSimNet score and Jaccard similarity.

{% endif %}

In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}
# Pairwise similarity between the query compounds.
# ReSimNet: write one input TSV per query and run the ReSimNet script on it; the
# outputs are read and plotted in the following cells.
# Other methods: compute the similarity matrix here and draw the heatmap directly.
if embed_method == 'ReSimNet':
    # The query file has no header row; its two columns become the "drug2" side of each pair.
    df = pd.read_csv(query_filename, sep="\t", header=None)
    df.columns = ["drug2_name", "drug2_smiles"]

    for name, smiles in drug_dict.items():
        # Every input_<name>_df global refers to the same df object; the drug1_*
        # columns are overwritten on each pass before the TSV is written.
        globals()[f'input_{name}_df'] = df
        globals()[f'input_{name}_df']['drug1_name'] = name
        globals()[f'input_{name}_df']['drug1_smiles'] = smiles
        
        input_filename = f"./input_heatmap_{name}.tsv"
        output_filename = f"./output_heatmap_{name}.tsv"
        globals()[f'input_{name}_df'].to_csv(input_filename, index=None, sep="\t")
        
        # NOTE(review): both filenames embed the query name and are interpolated into a
        # shell command by IPython -- confirm names cannot contain shell metacharacters.
        !python3 ../../../methods/ReSimNet/main.py --save-prediction-new-pairs True --new-drug-pair-filename $input_filename --output-filename $output_filename --checkpoint-dir ../../../methods/ReSimNet/results/ --data-path ../../../methods/ReSimNet/tasks/data/drug\(v0.6\).pkl

else:
    drug_names = list(embed_dict.keys())
    if sim_method == 'Cosine':
        # Stack the query vectors and drop singleton axes before computing the matrix.
        vectors = np.array(list(embed_dict.values()))
        vectors = vectors.squeeze()
        cosine_similarity_matrix = utils.calculate_cosine_similarity(vectors)
        df = pd.DataFrame(cosine_similarity_matrix, index=drug_names, columns=drug_names)
        
    elif sim_method == 'Euclidean':
        # df is a plain ndarray in this branch; the diagonal keeps its initial 0.
        num_drugs = len(drug_names)
        df = np.zeros((num_drugs, num_drugs))
        for i in range(num_drugs):
            for j in range(num_drugs):
                if i != j:
                    df[i, j] = utils.euclidean_distance(embed_dict[drug_names[i]], embed_dict[drug_names[j]])

    elif sim_method == 'Jaccard':
        # Fill both triangles from each unordered pair, then set the diagonal to 1.
        embed_df = pd.DataFrame(embed_dict)
        df = pd.DataFrame(np.nan, index=embed_df.columns, columns=embed_df.columns)
        for drug1, drug2 in combinations(embed_df.columns, 2):
            similarity = utils.jaccard_similarity(embed_df[drug1], embed_df[drug2])
            df.at[drug1, drug2] = similarity
            df.at[drug2, drug1] = similarity
        np.fill_diagonal(df.values, 1)

    # Re-wrap df (ndarray or DataFrame, depending on the branch) with the query names as labels.
    labels = list(embed_dict.keys())
    df = pd.DataFrame(df, index=labels, columns=labels)
    sns.set(font_scale=1.0)
    plt.figure(figsize=(6, 6))
    sns.heatmap(df, cmap="OrRd", annot=True, fmt=".2f", cbar_kws={'label': f'{sim_method}'})
    plt.title(f"{embed_method}_{sim_method}")
    # Saved before the caption is added, so the PNG does not contain the caption.
    plt.savefig(f'{embed_method}_{sim_method}.png')
    plt.figtext(0.5, 0.01, f"figure 2. Heatmap for the Entered Query Compounds", ha="center", fontsize=10)
    plt.show()
    display(HTML(f'<a href="./{embed_method}_{sim_method}.png" download>Download: {embed_method}_{sim_method} heatmap</a>')) 
{% endif %}

In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}
# Assemble the ReSimNet heatmap matrices from the per-query prediction files:
# heatmap_avg holds the mean of all "ReSimNet*" columns, heatmap_jac the
# 'jaccard_similarity' column, both as drug1 x drug2 tables.
data_avg = []
data_jac = []
if embed_method == 'ReSimNet':
    for name in drug_dict.keys():
        output_filename = f'./output_heatmap_{name}.tsv'
        result_hp = pd.read_csv(output_filename, sep="\t")

        # Select the score columns before adding the average so that the new
        # "ReSimNet_avg" column is not averaged into itself.
        resimnet_cols = [x for x in result_hp.columns if x.startswith("ReSimNet")]
        result_hp["ReSimNet_avg"] = result_hp[resimnet_cols].mean(axis=1)

        # One pivoted table per query: rows are drug1_name, columns are drug2_name.
        data_avg.append(result_hp.pivot(index='drug1_name', columns='drug2_name', values='ReSimNet_avg'))
        data_jac.append(result_hp.pivot(index='drug1_name', columns='drug2_name', values='jaccard_similarity'))

    # Stack the per-query tables once, after the loop, and sort both axes.
    # With no queries the matrices are left undefined, as before.
    if data_avg:
        heatmap_avg = pd.concat(data_avg, axis=0).sort_index(axis=0).sort_index(axis=1)
        heatmap_jac = pd.concat(data_jac, axis=0).sort_index(axis=0).sort_index(axis=1)
{% endif %}

In [None]:
%%appyter code_exec
{% if input_types.raw_value == "Multiple Query Compounds" %}
# For ReSimNet, draw one clustered heatmap per similarity measure, save each as a
# PNG, then show the two saved images side by side with download links.
if embed_method == 'ReSimNet':
    clustermap_specs = [
        (heatmap_avg, 'Heatmap_ReSimNet avg', 'heatmap_resimnet_avg.png'),
        (heatmap_jac, 'Heatmap_Jaccard similarity', 'heatmap_jaccard_similarity.png'),
    ]

    for matrix, title, png_name in clustermap_specs:
        grid = sns.clustermap(matrix, annot=True, cmap='OrRd', figsize=(6,6))
        grid.ax_heatmap.set_xlabel('')
        grid.ax_heatmap.set_ylabel('')
        grid.fig.suptitle(title, y=1, x=0.5)
        plt.savefig(png_name, dpi=300)
        # Close the figure; it is re-displayed below from the saved PNG.
        plt.close()

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    for axis, (_, _, png_name) in zip(axes, clustermap_specs):
        axis.imshow(plt.imread(png_name))
        axis.axis('off')

    plt.figtext(0.5, 0.01, "figure 2. Heatmap for the Entered Query Compounds", ha="center", fontsize=10)
    plt.show()

    display(HTML('<a href="./heatmap_resimnet_avg.png" download>Download: Heatmap_ReSimNet avg</a>'))
    display(HTML('<a href="./heatmap_jaccard_similarity.png" download>Download: Heatmap_Jaccard similarity</a>'))
{% endif %}

## Drug-Set Enrichment Analysis (DSEA)

### Drug-Set Enrichment Analysis Results by Drug Enrichr
Drug Enrichr is a tool that performs drug set enrichment analysis (DSEA) in terms of targets, modes of action, side effects, pathways, or ontologies. Dr.Emb Appyter performs DSEA by entering the names of the top k compounds into Drug Enrichr. <br> <br>
Warning: If the compound names in the library are not common, it may affect the quality of the analysis results. <br> <br>
To explore additional drug set results, please click the link provided below:

In [None]:
# Map each query compound to the compound names in its result table, which is
# stored in a dynamically named global "<query>_result_df".
upset_dict = {
    f"{name}": list(globals()[f'{name}_result_df']['compound_name'])
    for name in drug_dict.keys()
}

In [None]:
def drug_enrichr_link(dataset, title="Drug Enrichr Overall Results: {}", drug_name=None):
    """Return an HTML link to the Drug Enrichr results page of a submitted list.

    Args:
        dataset: The 'shortId' returned by Drug Enrichr when the list was added.
        title: Link text; its '{}' placeholder is filled with the drug name.
        drug_name: Name shown in the link text. When omitted, the module-level
            loop variable `drug` is used, matching the previous behaviour.

    Returns:
        An IPython HTML object containing the anchor tag.
    """
    if drug_name is None:
        drug_name = drug
    url = f"https://maayanlab.cloud/DrugEnrichr/enrich?dataset={dataset}"
    html = "<a href=\"{}\" target='_blank'>{}</a>".format(url, title.format(drug_name))
    return HTML(html)


ENRICHR_BASE_URL = 'http://amp.pharm.mssm.edu/DrugEnrichr'
DRUG_SET_LIBRARY = 'Drug_Repurposing_Hub_Mechanism_of_Action'
# Seconds to wait for each Drug Enrichr request; without a timeout a stalled
# connection would block the notebook indefinitely.
ENRICHR_TIMEOUT = 60

for drug, candidates in upset_dict.items():
    # 1) Submit the top-k compound names of this query as a new list.
    #    Names are coerced to str so that non-string identifiers can be joined.
    drugs_str = '\n'.join(str(candidate) for candidate in candidates)
    description = f'{drug} candidates'
    payload = {
        'list': (None, drugs_str),
        'description': (None, description)
    }

    response = requests.post(
        f'{ENRICHR_BASE_URL}/addList', files=payload, verify=False, timeout=ENRICHR_TIMEOUT
    )
    if not response.ok:
        raise Exception('Error analyzing drug list')

    data = response.json()
    user_list_id = data['userListId']
    dataset = data['shortId']

    # 2) Fetch the stored list; only the status of this call is checked.
    response = requests.get(
        f'{ENRICHR_BASE_URL}/view?userListId={user_list_id}', verify=False, timeout=ENRICHR_TIMEOUT
    )
    if not response.ok:
        raise Exception('Error getting drug list')

    # 3) Request the enrichment of the list against the mechanism-of-action library.
    response = requests.get(
        f'{ENRICHR_BASE_URL}/enrich?userListId={user_list_id}&backgroundType={DRUG_SET_LIBRARY}',
        verify=False, timeout=ENRICHR_TIMEOUT
    )
    if not response.ok:
        raise Exception('Error fetching enrichment results')

    # Each result row is indexed positionally: 1 = term, 2 = p-value,
    # 3 = z-score, 4 = combined score.
    results = response.json()[DRUG_SET_LIBRARY]
    df = pd.DataFrame({
        'moa': [item[1] for item in results],
        'p-value': [item[2] for item in results],
        'z-score': [item[3] for item in results],
        'combined score': [item[4] for item in results],
    })

    caption = f"Table {tb_number}. {drug}'s Mode of Action (MoA) Analysis Results Using Drug Enrichr"
    if df.empty:
        caption += " ※ If you didn't get any results from Drug Enrichr, you need to configure the name of your custom library to a common name." 
    table_html = df.to_html(classes='styled-table', escape=False)
    styled_html = f"<caption>{caption}</caption>{table_html}"
    tb_number += 1
    display(HTML(styled_html))
    display(drug_enrichr_link(dataset, drug_name=drug))


## Acknowledgement
The chemical library used in this study was kindly provided by Korea Chemical Bank (http://www.chemicalbank.org/) of Korea Research Institute of Chemical Technology, Selleckchem (https://www.selleckchem.com/) of Selleck Chemicals LLC, and MCE (https://www.medchemexpress.com/) of MedChemExpress LLC.
This work was supported by the National Research Foundation of Korea Grant funded by the Korean Government (NRF-2022R1F1A1070111) and the MSIT(Ministry of Science and ICT), Korea, under the ICAN (ICT Challenge and Advanced Network of HRD) program(IITP-2023-RS-2022-00156439) supervised by the IITP (Institute of Information & Communications Technology Planning & Evaluation).

## References
1. Clarke, D. J. et al. (2021). Appyters: Turning jupyter notebooks into data-driven web apps. Patterns, 2(3).
2. Jeon, M. et al. (2019). Resimnet: drug response similarity prediction using siamese neural networks. Bioinformatics, 35(24), 5249–5256.
3. Jang, G. et al. (2021). Predicting mechanism of action of novel compounds using compound structure and transcriptomic signature coembedding. Bioinformatics, 37(Supplement_1), i376–i382.
4. Rogers, D. and Hahn, M. (2010). Extended-connectivity fingerprints. Journal of chemical information and modeling, 50(5), 742–754.
5. Durant, J. L. et al. (2002). Reoptimization of mdl keys for use in drug discovery. Journal of chemical information and computer sciences, 42(6), 1273–1280.
6. Jaeger, S. et al. (2018). Mol2vec: unsupervised machine learning approach with chemical intuition. Journal of chemical information and modeling, 58(1), 27–35.
7. Blay, V. et al. (2022). Macaw: an accessible tool for molecular embedding and inverse molecular design. Journal of chemical information and modeling, 62(15), 3551–3564.
8. Johnson, J. et al. (2019). Billion-scale similarity search with GPUs. IEEE Transactions on Big Data, 7(3), 535–547.
9. McInnes, L. et al. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction. ArXiv e-prints.
10. Kuleshov, M. V. et al. (2019). modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Research, 47(W1), W183–W190.