In [None]:
#%%appyter init
# Put the parent directory at the front of the import path
# (presumably so helper modules living there can be imported — confirm).
import os, sys; sys.path.insert(0, os.path.realpath('..'))
from appyter import magic
# Initialise appyter, handing it a callable that returns this notebook's globals().
magic.init(lambda _=globals: _())

In [None]:
%matplotlib inline
# Imports
## Data processing
import pandas as pd
import numpy as np
import scipy as sp
## Machine Learning
import sklearn as sk
from sklearn import (
    calibration,
    decomposition,
    ensemble,
    feature_selection,
    linear_model,
    manifold,
    metrics,
    model_selection,
    multioutput,
    pipeline,
    preprocessing,
    svm,
    tree,
    feature_extraction,
    neural_network,
)
from split import StratifiedGroupKFold, RepeatedStratifiedGroupKFold
import umap
## Plotting
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
## Drugmonizome API
from drugmonizome import Drugmonizome
## SEP-L1000 data retrieval
from sepl1000 import SEPL1000
## L1000FWD queries
import querysepl1000fwd
## Match drug name inputs using PubChem API
from DrugNameConverter import DrugNameConverter
# Utility
import os
import re
import json
from functools import reduce
from IPython.display import display, HTML
from tqdm import tqdm
# Interactive tables
from itables import show
# Plotly fix
import plotly.io as pio
pio.renderers.default = 'notebook'

In [None]:
# Fixed seed so stochastic steps are repeatable.
# Note: `rng` is a plain integer seed, not a numpy Generator object.
rng = 2020
np.random.seed(rng)

In [None]:
# Notebook display util functions (adapted from Nicole Moiseyev's Patient Cohorts RNA-Seq Viewer appyter)

def make_clickable(link):
    """Return an HTML anchor for `link` that opens in a new tab and shows the URL as its text."""
    return '<a target="_blank" href="{0}">{0}</a>'.format(link)

# Running counters shared by figure_header (increments) and figure_legend (reads)
table_number = 0
figure_number = 0
def figure_header(label,title):
    """
    Display a bold heading above a table or figure and advance its numbering.

    A label of exactly 'Table' or 'Figure' bumps the matching module-level
    counter and is expanded to e.g. 'Table 3'; any other label is shown as given.
    """
    global table_number, figure_number
    if label == 'Figure':
        figure_number += 1
        label = f'Figure {figure_number}'
    elif label == 'Table':
        table_number += 1
        label = f'Table {table_number}'
    heading = f"<div style='font-size:1.5rem; padding:1rem 0;'><b>{label}</b>: {title}</div>"
    display(HTML(heading))
    
def figure_legend(label,title,content=''):
    """
    Display a centered caption below a table or figure.

    A label of exactly 'Table' or 'Figure' is expanded with the current value of
    the matching counter (the counters are only read here, never advanced).
    `content` is optional free text appended after the italic title.
    """
    if label == 'Figure':
        label = f'Figure {figure_number}'
    elif label == 'Table':
        label = f'Table {table_number}'
    caption = f'<style>div.caption {{text-align: center;}}</style><div class=caption><b>{label}</b>: <i>{title}</i>. {content} </div>'
    display(HTML(caption))

## Select Input Datasets and Target Classes

Selected drug set libraries and phenotypic datasets are downloaded and joined on the compound InChI Key to produce a large input feature matrix. A machine learning model will be trained to predict the specified target labels from these features. This is a binary classification task that can be used to predict compounds that are likely to be associated with the target class.

In [None]:
%%appyter hide
{# Declares the 'ATTRIBUTES' form section; the input-dataset fields in this cell attach to it via section='ATTRIBUTES'. #}
{% do SectionField(
    title='Input Dataset Selection',
    subtitle='Select the input datasets to use for learning and classification. \
              A model will be trained to predict the target labels from the selected features. \
              If no datasets are selected, default features will be used.',
    name='ATTRIBUTES',
    img='attributes.png',
) %}

{# SEP-L1000 phenotypic feature matrices offered to the user; nothing is pre-selected (default=[]).
   The choice labels are also the keys of the `phenotypic_datasets` filename map in a later cell, so the two must stay in sync. #}
{% set sepl1000_phenotypic_datasets = MultiCheckboxField(
    name='sepl1000_phenotypic_datasets',
    label='Transcriptomic and Imaging Datasets after Perturbation (From the SEP-L1000 project)',
    description='These input datasets were used previously for side effect prediction (https://maayanlab.net/SEP-L1000/).',
    choices=[
        'LINCS Gene Expression Signatures',
        'GO Transformed Signatures (PAEA)',
        'MLPCN Cell Morphological Profiling',
    ],
    descriptions={
        'LINCS Gene Expression Signatures': 'Gene expression signatures for drugs/small molecule compounds in the landmark gene space. The Characteristic Direction (CD) method was used to compute gene expression signatures. Contains 20338 compounds with 978 features (genes).',
        'GO Transformed Signatures (PAEA)': 'Gene Ontology (GO) transformed gene expression profiles of drug/small molecule compound perturbations. Principal Angle Enrichment Analysis (PAEA) was used to compute enrichment p-values for each CD signature in the space of all genes against gene sets created from the Gene Ontology including Biological Processes, Cellular Components and Molecular Function. Contains 20337 compounds with 4438 features (GO terms).',
        'MLPCN Cell Morphological Profiling': 'Drug/small molecule compound induced cell morphological profiles. Contains 19864 compounds with 812 features (from imaging).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# SEP-L1000 chemical fingerprint matrices offered to the user; nothing is pre-selected (default=[]).
   The choice labels are also the keys of the `structural_datasets` filename map in a later cell, so the two must stay in sync. #}
{% set sepl1000_structural_datasets = MultiCheckboxField(
    name='sepl1000_structural_datasets',
    label='Chemical Fingerprints Generated for Compounds from SEP-L1000',
    description='These input datasets were used previously for side effect prediction (https://maayanlab.net/SEP-L1000/).',
    choices=[
        'MACCS Chemical Fingerprint',
        'Morgan Chemical Fingerprint',
    ],
    descriptions={
        'MACCS Chemical Fingerprint': '166-bit MACCS chemical fingerprint matrix for drugs/small molecule compounds computed using Open Babel. Contains 41701 compounds with 166 binary features (structural keys).',
        'Morgan Chemical Fingerprint': '2048-bit Morgan chemical fingerprints (circular fingerprints) computed using RDKIT with a radius of 4. Contains 19878 compounds and 2048 binary features (hashed bits).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# Drugmonizome drug set libraries derived from L1000FWD; nothing is pre-selected (default=[]).
   The selected choice strings are concatenated into `attribute_datasets` and passed verbatim to Drugmonizome.download_df.
   NOTE(review): 'L1000FWD Upregulated GO Biological Process' is singular while its Downregulated counterpart is plural — confirm it matches the upstream dataset name before "fixing" it. #}
{% set exprdatasets = MultiCheckboxField(
    name='exprdatasets',
    label='L1000FWD (Drug set libraries from Drugmonizome)',
    description='Binary features were generated from Drugmonizome drug sets based on top up- and down-regulated genes after perturbation, along with enriched pathways, using data from the L1000 fireworks display (L1000FWD). L1000FWD is a web application that provides interactive visualization of over 16,000 drug and small-molecule induced gene expression signatures.',
    choices=[
        'L1000FWD Downregulated Signatures',
        'L1000FWD Upregulated Signatures',
        'L1000FWD Downregulated GO Biological Processes',
        'L1000FWD Upregulated GO Biological Process',
        'L1000FWD Downregulated GO Cellular Components',
        'L1000FWD Upregulated GO Cellular Components',
        'L1000FWD Downregulated GO Molecular Function',
        'L1000FWD Upregulated GO Molecular Function',
        'L1000FWD Downregulated KEGG Pathways',
        'L1000FWD Upregulated KEGG Pathways',
        'L1000FWD Predicted Side Effects',
    ],
    descriptions={
        'L1000FWD Downregulated GO Biological Processes': 'Downregulated Gene Ontology (GO) Biological Process terms retrieved from querying gene signatures of drugs through Enrichr. Contains 4013 compounds with 1068 binary features (GO terms).',
        'L1000FWD Downregulated GO Cellular Components': 'Downregulated Gene Ontology (GO) Cellular Component terms retrieved from querying gene signatures of drugs through Enrichr. Contains 3246 compounds with 157 binary features (GO terms).',
        'L1000FWD Downregulated GO Molecular Function': 'Downregulated Gene Ontology (GO) Molecular Function terms retrieved from querying gene signatures of drugs through Enrichr. Contains 2158 compounds with 158 binary features (GO terms).',
        'L1000FWD Downregulated KEGG Pathways': 'Downregulated KEGG pathways retrieved from querying gene signatures of drugs through Enrichr. Contains 3309 compounds with 236 binary features (KEGG pathways).',
        'L1000FWD Downregulated Signatures': 'Drug-induced downregulated genes extracted from L1000FWD. Contains 4884 compounds with 7622 binary features (genes).',
        'L1000FWD Predicted Side Effects': 'Side effect associations predicted by drug-induced gene expression signatures. Contains 4852 compounds with 1013 binary features (predicted side effects).',
        'L1000FWD Upregulated GO Biological Process': 'Upregulated Gene Ontology (GO) Biological Process terms retrieved from querying gene signatures of drugs through Enrichr. Contains 4195 compounds with 1228 binary features (GO terms).',
        'L1000FWD Upregulated GO Cellular Components': 'Upregulated Gene Ontology (GO) Cellular Component terms retrieved from querying gene signatures of drugs through Enrichr. Contains 3366 compounds with 153 binary features (GO terms).',
        'L1000FWD Upregulated GO Molecular Function': 'Upregulated Gene Ontology (GO) Molecular Function terms retrieved from querying gene signatures of drugs through Enrichr. Contains 2427 compounds with 183 binary features (GO terms).',
        'L1000FWD Upregulated KEGG Pathways': 'Upregulated KEGG pathways retrieved from querying gene signatures of drugs through Enrichr. Contains 3662 compounds with 245 binary features (KEGG pathways).',
        'L1000FWD Upregulated Signatures': 'Drug-induced upregulated genes extracted from L1000FWD. Contains 4884 compounds with 7611 binary features (genes).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# Drugmonizome drug set libraries describing drug targets and associated genes; nothing is pre-selected (default=[]).
   The selected choice strings are concatenated into `attribute_datasets` and passed verbatim to Drugmonizome.download_df. #}
{% set targetdatasets = MultiCheckboxField(
    name='targetdatasets',
    label='Drug Targets and Associated Genes (Drug set libraries from Drugmonizome)',
    description='Binary features were generated from Drugmonizome drug sets based on known drug targets and associated genes from literature.',
    choices=[
        'Downregulated CREEDS Signatures',
        'Upregulated CREEDS Signatures',
        'DrugCentral Targets',
        'DrugRepurposingHub Drug Targets',
        'Drugbank Small Molecule Carriers',
        'Drugbank Small Molecule Enzymes',
        'Drugbank Small Molecule Targets',
        'Drugbank Small Molecule Transporters',
        'Geneshot Associated Genes',
        'Geneshot Predicted AutoRIF Genes',
        'Geneshot Predicted Coexpression Genes',
        'Geneshot Predicted Enrichr Genes',
        'Geneshot Predicted GeneRIF Genes',
        'Geneshot Predicted Tagger Genes',
        'KinomeScan Kinases',
        'PharmGKB Single Nucleotide Polymorphisms',
        'STITCH Targets',
    ],
    descriptions={
        'Downregulated CREEDS Signatures': 'Downregulated drug-induced gene expression signatures from CREEDS, a crowdsourcing resource for the curation and reanalysis of gene expression profiles from GEO. Contains 72 compounds with 2532 binary features (genes).',
        'Upregulated CREEDS Signatures': 'Upregulated drug-induced gene expression signatures from CREEDS, a crowdsourcing resource for the curation and reanalysis of gene expression profiles from GEO. Contains 71 compounds with 2535 binary features (genes).',
        'DrugCentral Targets': 'Drug targets for approved and unapproved drugs curated from the literature. Contains 1555 compounds with 540 binary features (genes).',
        'DrugRepurposingHub Drug Targets': 'Associated drug targets of approved drugs and drugs in clinical trials. Contains 1720 compounds with 375 binary features (genes).',
        'Drugbank Small Molecule Carriers': 'Genes encoding carriers associated with Drugbank small molecules. Contains 458 compounds with 14 binary features (genes).',
        'Drugbank Small Molecule Enzymes': 'Genes encoding enzymes associated with Drugbank small molecules. Contains 1473 compounds with 72 binary features (genes).',
        'Drugbank Small Molecule Targets': 'Drug targets of Drugbank small molecules. Contains 4467 compounds with 611 binary features (genes).',
        'Drugbank Small Molecule Transporters': 'Genes encoding transporters associated with Drugbank small molecules. Contains 832 compounds with 51 binary features (genes).',
        'Geneshot Associated Genes': 'Associated genes based on co-mentions with drugs in the literature. Contains 3938 compounds with 7503 binary features (genes).',
        'Geneshot Predicted AutoRIF Genes': 'Predicted genes based on AutoRIF co-occurrence. Contains 3938 compounds with 11695 binary features (genes).',
        'Geneshot Predicted Coexpression Genes': 'Predicted genes based on ARCHS4 coexpression. Contains 3938 compounds with 9087 binary features (genes).',
        'Geneshot Predicted Enrichr Genes': 'Predicted genes based on Enrichr co-occurrence. Contains 3938 compounds with 11845 binary features (genes).',
        'Geneshot Predicted GeneRIF Genes': 'Predicted genes based on GeneRIF co-occurrence. Contains 3938 compounds with 9193 binary features (genes).',
        'Geneshot Predicted Tagger Genes': 'Predicted genes based on Tagger co-occurrence. Contains 3938 compounds with 13882 binary features (genes).',
        'KinomeScan Kinases': 'Kinases associated with drugs elucidated from KINOMEscan kinase profiling assay. KINOMEscan is a biochemical kinase profiling assay that measures drug binding using a panel of ~440 purified kinases. Contains 54 compounds with 301 binary features (genes).',
        'PharmGKB Single Nucleotide Polymorphisms': 'Potentially clinically actionable gene-SNP associations. Contains 483 compounds with 554 binary features (SNPs).',
        'STITCH Targets': 'Gene-drug interactions from computational prediction and aggregation from primary databases. Contains 7303 compounds with 9063 binary features (genes).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# Drugmonizome drug set libraries describing indications, mechanisms of action and side effects; nothing is pre-selected (default=[]).
   The selected choice strings are concatenated into `attribute_datasets` and passed verbatim to Drugmonizome.download_df,
   so the choice names and description keys must not be reworded. #}
{% set indicationdatasets = MultiCheckboxField(
    name='indicationdatasets',
    label='Indications, Modes of Action, and Side Effects (Drug set libraries from Drugmonizome)',
    description='Binary features were generated from Drugmonizome drug sets based on known indications, mechanisms of action, and side effects.',
    choices=[
        'ATC Codes Drugsetlibrary',
        'DrugRepurposingHub Mechanisms of Action',
        'PharmGKB OFFSIDES Side Effects',
        'SIDER Indications',
        'SIDER Side Effects',
    ],
    descriptions={
        'ATC Codes Drugsetlibrary': 'A classification system used to organize chemicals by chemical, therapeutic, pharmacological subgroups, cut off at the fourth level. Contains 2233 compounds with 308 binary features (ATC codes).',
        'DrugRepurposingHub Mechanisms of Action': 'Associated mechanisms of action of approved drugs and drugs in clinical trials. Contains 1854 compounds with 154 binary features (mechanisms of action).',
        'PharmGKB OFFSIDES Side Effects': 'Side effects mined from adverse event reporting databases predicted by a unique detection algorithm. Contains 1435 compounds with 7137 binary features (side effects).',
        'SIDER Indications': 'Approved drug indications mined from FDA package inserts and public documents. Contains 1546 compounds with 867 binary features (indications).',
        'SIDER Side Effects': 'Approved drug side effects mined from FDA package inserts and public documents. Contains 1635 compounds with 2078 binary features (side effects).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# Drugmonizome drug set libraries built from molecular fingerprints; nothing is pre-selected (default=[]).
   The selected choice strings are concatenated into `attribute_datasets` and passed verbatim to Drugmonizome.download_df,
   so the choice names and description keys must not be reworded. #}
{% set structuraldatasets = MultiCheckboxField(
    name='structuraldatasets',
    label='Structural Features (Drug set libraries from Drugmonizome)',
    description='Binary features were generated from Drugmonizome drug sets based on molecular fingerprints.',
    choices=[
        'RDKIT MACCS Chemical Fingerprints',
        'PubChem Chemical Fingerprints',
    ],
    descriptions={
        'RDKIT MACCS Chemical Fingerprints': 'Chemical structure motifs generated from SMILES strings of small molecules. Computed for Drugmonizome compounds using RDKIT. Contains 14308 compounds with 163 binary features (chemical structure motifs).',
        'PubChem Chemical Fingerprints': '881-bit PubChem chemical structure motifs generated from SMILES strings of small molecules. Contains 13379 compounds with 669 binary features (chemical structure motifs).',
    },
    default=[],
    section='ATTRIBUTES'
) %}

{# Off by default. Later cells switch every dataset join between how='outer' (checked) and how='inner' (unchecked) based on this flag. #}
{% set keepmissing = BoolField(
    name='keepmissing',
    label='Keep drugs with missing data when joining datasets',
    description='Keep drugs that appear in some datasets and not in others. \
                 Missing data is filled in with zeros. Otherwise, only drugs \
                 that are present in all datasets are preserved.',
    default=False,
    section='ATTRIBUTES',
) %}

{# On by default. When set, later cells pass the binary feature matrices through sklearn's TfidfTransformer. #}
{% set tfidf = BoolField(
    name='tfidf',
    label='Apply tf–idf normalization to binary inputs',
    description='For binary drug-attribute associations in the input matrix, \
                 apply tf-idf transformation to normalize data.',
    default=True,
    section='ATTRIBUTES',
) %}

{# All selected Drugmonizome libraries as one list; the two SEP-L1000 fields are deliberately not part of it and are handled separately. #}
{% set attribute_datasets = exprdatasets.value +
                             targetdatasets.value +
                             indicationdatasets.value +
                             structuraldatasets.value %}

In [None]:
%%appyter markdown

To construct the input matrix, we download drug set libraries and phenotypic datasets and join them on the InChI Key.
{% if keepmissing.value %} Drugs that appear in some datasets and not in others are retained, and missing data is filled in with zeros.
{% else %} Only drugs that are present in all datasets are retained.
{% endif %}

In [None]:
%%appyter hide
{# Declares the 'TARGET' form section; the target-selection fields in this cell attach to it via section='TARGET'. #}
{% do SectionField(
    title='Target Label Selection',
    subtitle='Upload a list of compounds or select an attribute from Drugmonizome to be assigned a positive class label for binary classification.',
    name='TARGET',
    img='target.png',
) %}

{# Two-tab selector for how the positive class is defined; `raw_value` is compared against the tab names 'List' / 'Attribute' elsewhere.
   Other cells index the active tab's fields by position, so the order inside each tab is part of the contract:
     'List'      -> value[0] = drugformat (identifier format), value[1] = drughitlist (uploaded file)
     'Attribute' -> value[0] = target_attribute ("<term> (from <dataset>)") #}
{% set target_field = TabField(
    name='target_field',
    label='Target Selection',
    default='Attribute',
    description='Select input method',
    choices={
        'List': [
            ChoiceField(
                name='drugformat',
                label='Drug Identifier Format',
                description='Compounds can be specified by either drug name or InChI Key.',
                default='InChI Key',
                choices=[
                    'Drug Name',
                    'InChI Key'
                ],
                section='TARGET'
            ),
            FileField(
                name='drughitlist',
                label='Upload List of Compounds',
                description='Upload a list of compounds to be assigned positive class labels for binary classification. \
                             Compounds should be in a text file, specified by either drug name or InChI Key and separated by newlines.',
                default='COVID19ScreenHitsInChIKeys.txt',
                examples={
                    'COVID19ScreenHits.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/COVID19ScreenHits.txt',
                    'COVID19ScreenHitsInChIKeys.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/COVID19ScreenHitsInChIKeys.txt',
                },
                section='TARGET'
            ),
        ],
        'Attribute': [
            AutocompleteField(
                name='target_attribute',
                description='Enter a small molecule attribute from one of the Drugmonizome datasets that should be predicted.',
                file_path="https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/drugmonizome_terms.json",
                label='Attribute',
                hint='Enter Drugmonizome term...',
                default='neuropathy peripheral (from SIDER Side Effects)',
                constraint='(^(.+) \\(from (.+)\\)$|^$)',
        )],
    },
    section='TARGET',
) %}

{# Off by default. When set, hits are matched to dataset entries on the first 14 characters of the InChI Key instead of the full key. #}
{% set includestereo = BoolField(
    name='includestereo',
    label='Include stereoisomers',
    description='If true, compounds are matched to entries in the datasets by the first 14 characters of their InChI Keys, \
                 so stereoisomers of the compounds in the input list or with a particular attribute are also counted as hits. \
                 Note that different resources record different details for charge and stereochemistry, \
                 causing some compounds to have different full-length InChI Keys in different datasets. \
                 Selecting this option may allow such drugs to be better matched to entries in the datasets.',
    default=False,
    section='TARGET',
) %}

{# Split the selected "<term> (from <dataset>)" string into its term and dataset parts; both stay empty when the 'List' tab is active. #}
{% set target_name, target_dataset = '', '' %}
{% if target_field.raw_value == 'Attribute' %}
{# NOTE(review): the field's constraint also admits an empty string, which this pattern cannot match — confirm what re_match yields in that case. #}
{% set target_name, target_dataset = target_field.value[0].value|re_match('^(.+) \\(from (.+)\\)$') %}
{% endif %}

In [None]:
%%appyter code_exec

{# Fall back to a default feature set when nothing was ticked in any dataset group. #}
{% if sepl1000_phenotypic_datasets.value == [] and sepl1000_structural_datasets.value == [] and attribute_datasets == [] %}
# No datasets selected, so use default datasets
{# NOTE(review): this rebinds both template variables from form fields to plain lists, while later cells still test `.value` on them — confirm those checks behave as intended for a list. #}
{% set sepl1000_phenotypic_datasets, sepl1000_structural_datasets = ['LINCS Gene Expression Signatures'], ['Morgan Chemical Fingerprint'] %}
sepl1000_phenotypic_datasets = {{ sepl1000_phenotypic_datasets }}
sepl1000_structural_datasets = {{ sepl1000_structural_datasets }}
{% else %}
# Use the selected SEP-L1000 datasets
sepl1000_phenotypic_datasets = {{ sepl1000_phenotypic_datasets }}
sepl1000_structural_datasets = {{ sepl1000_structural_datasets }}
{% endif %}

# Collects (dataset name, number of feature columns) pairs as each group of datasets is downloaded
dataset_sizes = []

In [None]:
%%appyter code_exec
{% if sepl1000_phenotypic_datasets.value != [] %}
phenotypic_datasets = {
    'LINCS Gene Expression Signatures': 'LINCS_Gene_Experssion_signatures_CD.csv.gz',
    'GO Transformed Signatures (PAEA)': 'GO_transformed_signatures_PAEA.csv.gz',
    'MLPCN Cell Morphological Profiling': 'MLPCN_morplological_profiles.csv.gz'
}

df_sepl1000_phenotypic = list(SEPL1000.download_df(list(phenotypic_datasets[dataset] for dataset in sepl1000_phenotypic_datasets),
                                             index_col=0))
dataset_sizes += list(zip(sepl1000_phenotypic_datasets, [dataset.shape[1] for dataset in df_sepl1000_phenotypic]))

# Assemble all phenotypic SEP-L1000 datasets
if len(df_sepl1000_phenotypic) > 1:
    # Obtain merged dataframe with omics and target data
    df_sepl1000 = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_sepl1000_phenotypic,
    )
else:
    df_sepl1000 = df_sepl1000_phenotypic[0]
    
# Mean-fill infinite and missing values
df_sepl1000 = df_sepl1000.replace([np.inf, -np.inf], np.nan)
df_sepl1000 = df_sepl1000.fillna(np.mean(df_sepl1000))
{% endif %}

In [None]:
%%appyter code_exec
{% if sepl1000_structural_datasets.value != [] %}
# Structural dataset processing
structural_datasets = {
    'MACCS Chemical Fingerprint': 'MACCS_bitmatrix.csv.gz',
    'Morgan Chemical Fingerprint': 'Morgan_bitmatrix.csv.gz',
}

df_sepl1000_structural = list(SEPL1000.download_df(list(structural_datasets[dataset] for dataset in sepl1000_structural_datasets),
                                             index_col=0))
dataset_sizes += list(zip(sepl1000_structural_datasets, [dataset.shape[1] for dataset in df_sepl1000_structural]))

# Assemble all structural SEP-L1000 datasets
if len(df_sepl1000_structural) > 1:
    # Obtain merged dataframe with omics and target data
    df_sepl1000_fingerprints = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_sepl1000_structural,
    )
else:
    df_sepl1000_fingerprints = df_sepl1000_structural[0]

{% if tfidf.value %}
# Apply tf-idf normalization
transformer = feature_extraction.text.TfidfTransformer()
X_tfidf = transformer.fit_transform(df_sepl1000_fingerprints).toarray()
df_sepl1000_fingerprints = pd.DataFrame(X_tfidf, columns=df_sepl1000_fingerprints.columns, index=df_sepl1000_fingerprints.index)
{% endif %}
{% if sepl1000_phenotypic_datasets.value != [] %}
# Concatenate structural features with phenotypic features
{% if keepmissing.value %}
df_sepl1000 = pd.merge(df_sepl1000, df_sepl1000_fingerprints, left_index=True, right_index=True, how='outer') # Keep mis-matched indices
{% else %}
df_sepl1000 = pd.merge(df_sepl1000, df_sepl1000_fingerprints, left_index=True, right_index=True) # Keep only matched indices
{% endif %}
{% else %}
df_sepl1000 = df_sepl1000_fingerprints
{% endif %}
{% endif %}

In [None]:
%%appyter code_exec

# Use the selected attribute datasets
attribute_datasets = {{ attribute_datasets }}

{% if attribute_datasets == [] %}
X = df_sepl1000
{% else %}
df_attributes = list(Drugmonizome.download_df(
    [dataset
     for dataset in attribute_datasets]
))
dataset_sizes += list(zip(attribute_datasets, [dataset.shape[1] for dataset in df_attributes]))

# Assemble all attribute datasets
if len(df_attributes) > 1:
    # Obtain merged dataframe with omics and target data
    df = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_attributes,
    )
else:
    df = df_attributes[0]

del(df_attributes)

df = df.fillna(0)
X = df.applymap(lambda f: 1 if f!=0 else 0)
{% if tfidf.value %}
# Apply tf-idf normalization
transformer = feature_extraction.text.TfidfTransformer()
X_tfidf = transformer.fit_transform(X).toarray()
X = pd.DataFrame(X_tfidf, columns=X.columns, index=X.index)
{% if sepl1000_phenotypic_datasets.value != [] or sepl1000_structural_datasets.value != [] %}
{% if keepmissing.value %}
X = pd.merge(df_sepl1000, X, left_index=True, right_index=True, how='outer') # Keep mis-matched indices
{% else %}
X = pd.merge(df_sepl1000, X, left_index=True, right_index=True) # Keep only matched indices
{% endif %}
{% endif %}
{% endif %}
{% endif %}

In [None]:
# Preview the assembled input matrix with a numbered heading and caption
figure_header('Table', 'Input data')
display(X.head())
input_dataset_names = sepl1000_phenotypic_datasets + sepl1000_structural_datasets + attribute_datasets
figure_legend('Table', 'Input data',
              f'The input data contain {X.shape[0]} compounds and {X.shape[1]} features per compound, \
              taken from the following datasets: {", ".join(input_dataset_names)}.')

In [None]:
%%appyter markdown

{% if target_field.raw_value == 'List' %}
The target labels are produced from the uploaded list of hits: 1 if the drug is specified as a hit, 0 otherwise.
{% if target_field.value[0].value == 'Drug Name' %} Drug names are matched to InChI Keys from PubChem, L1000FWD, and the Drugmonizome metadata.
{% endif %}
{% endif %}

In [None]:
%%appyter code_exec

{% if target_field.raw_value == 'List' %}
{# value[1] is the uploaded-file field of the 'List' tab; an empty value means no file was provided. #}
{% if target_field.value[1].value == '' %}
# Using default list of hits from COVID-19 in vitro drug screens
{# NOTE(review): judging by its name this fallback is the drug-name list, yet it is used for either identifier format — confirm it suits the 'InChI Key' format. #}
hits_filename = '../../COVID19ScreenHits.txt'
{% else %}
# Using user-specified list of positive drug hits
hits_filename = {{target_field.value[1]}}
{% endif %}

{% if target_field.value[0].value == 'InChI Key' %}
def save_items(out_file, items):
    """
    Saves items as rows in a file.

    Args:
        out_file: Path of the file to write; an existing file is overwritten.
        items: Iterable of strings, written one per line in iteration order.

    The rows are separated by newlines with no trailing newline after the
    last one; an empty iterable produces an empty file.
    """
    with open(out_file, 'w') as f:
        # A single join handles the "no newline after the last row" rule
        # and works for any iterable, not only indexable sequences.
        f.write('\n'.join(items))
                
# Load the InChI Keys from the hits file: one per line, upper-cased, blank lines ignored
with open(hits_filename, 'r') as hits_file:
    hit_lines = (line.strip() for line in hits_file.read().strip().split('\n'))
    drug_hits = {line.upper() for line in hit_lines if line}

{% elif target_field.value[0].value == 'Drug Name' %}
# Helper functions
def merge(A, B, f):
    """
    Combines two dictionaries into a new one.

    Keys found in only one input keep that input's value; keys found in
    both get f(A[key], B[key]). Neither input is modified.
    """
    combined = {}
    for key in A.keys() | B.keys():
        if key in A and key in B:
            combined[key] = f(A[key], B[key])
        elif key in A:
            combined[key] = A[key]
        else:
            combined[key] = B[key]
    return combined

def save_items(out_file, items):
    """
    Saves items as rows in a file.

    Args:
        out_file: Path of the file to write; an existing file is overwritten.
        items: Iterable of strings, written one per line in iteration order.

    The rows are separated by newlines with no trailing newline after the
    last one; an empty iterable produces an empty file.
    """
    with open(out_file, 'w') as f:
        # A single join handles the "no newline after the last row" rule
        # and works for any iterable, not only indexable sequences.
        f.write('\n'.join(items))

def save_gmt(out_file, keys_to_sets, sep='\t'):
    """
    Writes a key -> set mapping in gmt layout via save_items.

    One row per key, keys in sorted order: the key, two separators,
    then the sorted members of its set joined by the separator.
    """
    rows = [
        key + sep*2 + sep.join(sorted(keys_to_sets[key]))
        for key in sorted(keys_to_sets)
    ]
    save_items(out_file, rows)

# Load the drug names from the hits file: one per line, lower-cased, blank lines ignored
with open(hits_filename, 'r') as hits_file:
    hit_lines = (line.strip() for line in hits_file.read().strip().split('\n'))
    drug_hits = {line.lower() for line in hit_lines if line}

# Query PubChem API to map drug names to InChI Keys
# (presumably each lookup returns {drug name: set of InChI Keys}; the merges below
#  union the values with `|` — confirm against the helper modules)
print('Querying PubChem API...')
drug_hits_inchi_pubchem = DrugNameConverter.batch_to_inchi_keys(drug_hits)
# Query Drugmonizome API to map drug names to InChI Keys
print('Querying Drugmonizome API...')
drug_hits_inchi_drugmonizome = Drugmonizome.map_names_to_inchi_keys(drug_hits)
# Query L1000FWD API to map drug names to InChI Keys
print('Querying L1000FWD API...')
drug_hits_inchi_l1000fwd = querysepl1000fwd.map_names_to_inchi_keys(drug_hits)

# Combine InChI Keys from all resources
# (a name known to several resources gets the union of their keys)
drug_hits_inchi = merge(drug_hits_inchi_pubchem, drug_hits_inchi_drugmonizome, lambda s1, s2: s1 | s2)
drug_hits_inchi = merge(drug_hits_inchi, drug_hits_inchi_l1000fwd, lambda s1, s2: s1 | s2)
save_gmt('hits_drug_name_to_inchi_keys.gmt', drug_hits_inchi)
# Unmatched drug names
# (names absent from the combined mapping or mapped to an empty collection)
unmatched_drugs = set(drug for drug in drug_hits
                      if drug not in drug_hits_inchi or len(drug_hits_inchi[drug]) == 0)
print(f'Drugs without InChI Keys ({ len(unmatched_drugs) }/{ len(drug_hits) }):', unmatched_drugs)

# Set of InChI Keys for user-specified hits
# NOTE: from here on `drug_hits` holds InChI Keys instead of drug names
drug_hits = set(key for drug in drug_hits_inchi
                    for key in drug_hits_inchi[drug])
save_items('hits_inchi_keys.txt', sorted(drug_hits))
{% endif %}

{% else %}

df_target = list(Drugmonizome.download_df(
    ['{{ target_dataset }}']
))
df = df_target[0]
df = df.fillna(0)
Y = df.applymap(lambda f: 1 if f!=0 else 0)
drug_hits = set(Y[Y['{{ target_name }}'] == 1].index)

# Helper function
def save_items(out_file, items):
    """
    Write a sequence of strings to `out_file`, one per row.

    Rows are separated by a newline; no newline is written after the last
    row. An empty sequence produces an empty file.
    """
    with open(out_file, 'w') as handle:
        handle.write('\n'.join(items))
# Persist the identifiers of the hit compounds in sorted order, one per row
save_items('hits_inchi_keys.txt', sorted(drug_hits))
{% endif %}

In [None]:
%%appyter markdown

{% if target_field.raw_value == 'List' %}
{% if target_field.value[0].value == 'Drug Name' %}
For the user-inputted drug names:
* Mapping of drug name to InChI Key: [hits_drug_name_to_inchi_keys.gmt](./hits_drug_name_to_inchi_keys.gmt)
* List of InChI Keys: [hits_inchi_keys.txt](./hits_inchi_keys.txt)
{% endif %}
{% endif %}

In [None]:
%%appyter markdown

{% if target_field.raw_value == 'List' %}
We produce a target array containing 1 if the compound is specified as a hit and 0 otherwise.
{% else %}
We produce a target array containing 1 if the compound is associated with the attribute _{{ target_name }}_ in the Drugmonizome resource _{{ target_dataset }}_ and 0 otherwise.
{% endif %}

In [None]:
%%appyter code_exec

{% if includestereo.value %}
# Match first 14 characters of InChI Keys (hash of InChI connectivity information)
drug_hits_inchi_main_layer = set(key[:14] for key in drug_hits)
# y[i] is 1 when the prefix of the i-th input compound is among the hit prefixes
y = np.array([drug[:14] in drug_hits_inchi_main_layer for drug in X.index]).astype(np.int8)
# Hit prefixes that do not occur in the input; reuses the prefix set built above and
# is sorted so that the saved file does not depend on set iteration order
unmatched = sorted(drug_hits_inchi_main_layer - set(drug[:14] for drug in X.index))
{% else %}
# Match full InChI Keys
# y[i] is 1 when the i-th input compound is one of the hits
y = np.array([drug in drug_hits for drug in X.index]).astype(np.int8)
# Hits that do not occur in the input; sorted so that the saved file does not
# depend on set iteration order
unmatched = sorted(set(drug_hits) - set(X.index))
{% endif %}
save_items('unmatched_inchikeys.txt', unmatched)
# Summarize how many input compounds were labelled as hits and how many hits found no match
n_hits_matched = y.sum()
pct_hits_matched = 100*n_hits_matched/len(y)
print('Number of hits matched in input: %d (%0.3f %%)' % (n_hits_matched, pct_hits_matched))
print('Number of unmatched hits: %d' % len(unmatched))

In [None]:
%%appyter markdown
* File of unmatched InChI Keys: [unmatched_inchikeys.txt](./unmatched_inchikeys.txt)

In [None]:
# Output data shapes: the feature matrix X and the 0/1 target array y computed above
print('Input shape:', X.shape)
print('Target shape:', y.shape)

## Dimensionality Reduction and Visualization

In [None]:
%%appyter hide
{# Declares the 'SETTINGS' form section; fields attach to it by passing section='SETTINGS'. #}
{% do SectionField(
    title='Machine Learning Pipeline',
    subtitle='Select from available machine learning algorithms, their unique settings, and methods to use to evaluate the classifier.',
    name='SETTINGS',
    img='settings.png',
) %}

{# Each choice value is a Python expression string; the selected one is rendered into the code cell that builds clf_dimensionality_reduction. #}
{% set visualization_reduction = ChoiceField(
    name='visualization_reduction',
    label='Data Visualization Method',
    description='Select a dimensionality reduction algorithm for data visualization.',
    default='UMAP',
    choices={
        'UMAP': 'umap.UMAP(low_memory=True, random_state=rng)',
        'NMF': 'sk.decomposition.NMF(n_components=2)',
        'PCA': 'sk.decomposition.PCA(n_components=2)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=2)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=2)',
        'ICA': 'sk.decomposition.FastICA(n_components=2)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=2)',
    },
    section='SETTINGS'
) %}

In [None]:
%%appyter markdown

We reduce the dimensionality of our omics feature space for visualization with {{ visualization_reduction.raw_value }}.

In [None]:
%%appyter code_exec
# The template expression below renders to the constructor call chosen in the form;
# the reducer is then fitted on the raw feature values and used to project them
clf_dimensionality_reduction = {{ visualization_reduction }}
X_reduced = clf_dimensionality_reduction.fit_transform(X.values)
{% if visualization_reduction.raw_value == 'PCA' %}
# Only emitted for PCA: total of the per-component explained_variance_ values
# (these are variances, not the fractions found in explained_variance_ratio_)
print('Explained variance:', np.sum(clf_dimensionality_reduction.explained_variance_))
{% endif %}

In [None]:
# Assemble the two reduced components together with the per-compound metadata used for plotting
X_reduced_df = pd.DataFrame(X_reduced, columns=['Component 1', 'Component 2'])
X_reduced_df['Drug Name'] = querysepl1000fwd.get_drug_names(X.index)
X_reduced_df['InChI Key'] = X.index
X_reduced_df['Label'] = y
# Compounds with a non-zero label are drawn as crosses, all others as circles
X_reduced_df['marker symbol'] = X_reduced_df['Label'].map(lambda label: 'x' if label else 'circle')
# Hover text: one '<br>'-separated entry per metadata field
X_reduced_df['text'] = [
    '<br>'.join(('Drug Name: ' + str(name),
                 'InChI Key: ' + str(inchi),
                 'Label: ' + str(label)))
    for name, inchi, label
    in X_reduced_df[['Drug Name', 'InChI Key', 'Label']].itertuples(index=False, name=None)
]

In [None]:
%%appyter code_exec

fig = go.Figure()
# Draw one trace per label value so that each class gets its own colour and legend entry
for label in set(X_reduced_df['Label']):
    X_plot = X_reduced_df.loc[X_reduced_df['Label'] == label].sort_values('Label')
    marker_style = dict(
        color=['#0d0887', '#f0f921'][label%2],
        size=8,
        symbol=X_plot['marker symbol'],
        line_width=1,
        line_color='white',
    )
    fig.add_trace(go.Scatter(
        mode='markers',
        x=X_plot['Component 1'],
        y=X_plot['Component 2'],
        text=X_plot['text'],
        name=label,
        marker=marker_style,
    ))
fig.update_layout(
    height=600,
    width=800,
    xaxis_title='Component 1',
    yaxis_title='Component 2',
    title_text='Known Labels ({{ visualization_reduction.raw_value }})',
    legend_title_text='Target Label',
    template='simple_white',
)
# Numbered caption above the figure, the figure itself, then the descriptive legend below it
figure_header('Figure', 'Input feature space with {{ visualization_reduction.raw_value }} dimensionality reduction')
fig.show()
figure_legend('Figure', 'Input feature space with {{ visualization_reduction.raw_value }} dimensionality reduction',
              f'Each point represents one of {X.shape[0]} compounds, with {X.shape[1]} features per compound, \
              taken from the following datasets: {", ".join(sepl1000_phenotypic_datasets + sepl1000_structural_datasets + attribute_datasets)}. \
              Compounds with known positive labels are marked by X\'s.')

## Machine Learning

We train and evaluate a machine learning model across multiple cross-validation splits by randomly dividing the input dataset into training and validation sets. For each round of cross-validation, a model is trained on the training set and is then used to make predictions for the compounds in the validation set. Each compound appears in at least one validation set, so the validation set predictions are used to assess model performance based on existing labels and to suggest novel predictions.

In [None]:
%%appyter hide
{# Optional reduction step for the ML pipeline. Choice values are Python expression strings ('None' means no step); presumably rendered into the pipeline construction further down - confirm. #}
{% set dimensionality_reduction = ChoiceField(
    name='dimensionality_reduction',
    label='Dimensionality Reduction Algorithm',
    description='Optionally select a dimensionality reduction algorithm as a data preprocessing step in the ML pipeline.',
    default='None',
    choices={
        'None': 'None',
        'PCA': 'sk.decomposition.PCA(n_components=64)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=64)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=64)',
        'ICA': 'sk.decomposition.FastICA(n_components=64)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=64)',
    },
    section='SETTINGS'
) %}
{# Optional feature selection step. Choice values are Python expression strings ('None' means no step). SelectKBest requires a callable score function, so the three SelectKBest variants reference the functions in sk.feature_selection rather than passing their names as strings. #}
{% set feature_selection = ChoiceField(
    name='feature_selection',
    label='Machine Learning Feature Selection',
    description='Optionally select a feature selection algorithm to include in the ML pipeline. \
                 If RecursiveSelectionFromExtraTrees is chosen, additional information can be obtained \
                 on the relative importance of different features based on which features are eliminated.',
    default='None',
    choices={
        'None': 'None',
        'SelectFromLinearSVC': 'sk.feature_selection.SelectFromModel(sk.svm.LinearSVC(loss="squared_hinge", penalty="l1", dual=False, class_weight="balanced"))',
        'SelectFromExtraTrees': 'sk.feature_selection.SelectFromModel(sk.ensemble.ExtraTreesClassifier(class_weight="balanced"))',
        'RecursiveSelectionFromExtraTrees': 'sk.feature_selection.RFE(sk.ensemble.ExtraTreesClassifier(class_weight="balanced"), n_features_to_select=256, step=0.1)',
        'SelectKBest': 'sk.feature_selection.SelectKBest(sk.feature_selection.f_classif)',
        'SelectKBestChi2': 'sk.feature_selection.SelectKBest(sk.feature_selection.chi2)',
        'SelectKBestMultiInfo': 'sk.feature_selection.SelectKBest(sk.feature_selection.mutual_info_classif)',
    },
    section='SETTINGS'
) %}
{% set algorithm = TabField(
    name='algorithm',
    label='Machine Learning Algorithm',
    default='ExtraTreesClassifier',
    description='Select a machine learning algorithm to construct the predictive model. \
                 (See scikit-learn User Guide for details.)',
    choices={
        'GradientBoostingClassifier': [
            ChoiceField(
                name='loss_gb',
                label='loss',
                description='Loss function to be optimized.',
                default="deviance",
                choices=["deviance", "exponential"],
            ),
            FloatField(
                name='learning_rate_gb',
                label='learning_rate',
                description='Shrinks the contribution of each tree by learning_rate.',
                default=0.1,
            ),
            IntField(
                name='n_estimators_gb',
                label='n_estimators',
                description='Number of boosting stages to perform.',
                default=100,
            ),
            FloatField(
                name='subsample_gb',
                label='subsample',
                description='Fraction of samples to be used for fitting individual base learners.',
                default=1.0,
            ),
            ChoiceField(
                name='criterion_gb',
                label='criterion',
                description='Function to measure the quality of a split.',
                default="friedman_mse",
                choices=["friedman_mse", "mse", "mae"],
            ),
            FloatField(
                name='tol_gb',
                label='tol',
                description='Tolerance for early stopping.',
                default=1e-4,
            ),
        ],
        'RandomForestClassifier': [
            IntField(
                name='n_estimators_rf',
                label='n_estimators',
                description='Number of trees in the forest.',
                default=100,
            ),
            ChoiceField(
                name='criterion_rf',
                label='criterion',
                description='Function to measure the quality of a split.',
                default="gini",
                choices=["gini", "entropy"],
            ),
            FloatField(
                name='min_samples_split_rf',
                label='min_samples_split',
                description='Minimum number of samples required to split an internal node. \
                             If int, then min_samples_split specifies the minimum number. \
                             If float, then min_samples_split specifies a fraction of the total number of samples.',
                default=2,
            ),
            FloatField(
                name='min_samples_leaf_rf',
                label='min_samples_leaf',
                description='Minimum number of samples required to be at a leaf node. \
                             If int, then min_samples_leaf specifies the minimum number. \
                             If float, then min_samples_leaf specifies a fraction of the total number of samples.',
                default=1,
            ),
            ChoiceField(
                name='max_features_rf',
                label='max_features',
                description='The number of features to consider when looking for the best split.',
                default="None",
                choices=["None", '"auto"', '"sqrt"', '"log2"'],
            ),
            FloatField(
                name='min_impurity_decrease_rf',
                label='min_impurity_decrease',
                description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value.',
                default=0.0,
            ),
            ChoiceField(
                name='class_weight_rf',
                label='class_weight',
                description='Weights associated with classes. If None, then all classes have weight one. \
                             The balanced mode adjusts weights inversely proportional to class frequencies in the input data. \
                             The balanced_subsample mode is the same as balanced except weights are computed based on the bootstrap sample for each tree.',
                default='"balanced"',
                choices=["None", '"balanced"', '"balanced_subsample"'],
            ),
            FloatField(
                name='ccp_alpha_rf',
                label='ccp_alpha',
                description='Complexity parameter used for Minimal Cost-Complexity Pruning. \
                             The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. \
                             By default, no pruning is performed.',
                default=0.0,
            ),
        ],
        'AdaBoostClassifier': [
            IntField(
                name='max_depth_ab',
                label='max_depth',
                description='Maximum depth of the decision tree used as the base estimator.',
                default=1,
            ),
            IntField(
                name='n_estimators_ab',
                label='n_estimators',
                description='Maximum number of estimators at which boosting is terminated.',
                default=50,
            ),
            FloatField(
                name='learning_rate_ab',
                label='learning_rate',
                description='Shrinks the contribution of each classifier by learning_rate.',
                default=1.0,
            ),
            ChoiceField(
                name='algorithm_ab',
                label='algorithm',
                description='Select the real or discrete boosting algorithm to use.',
                default="SAMME.R",
                choices=["SAMME", "SAMME.R"],
            ),
        ],
        'ExtraTreesClassifier': [
            IntField(
                name='n_estimators_et',
                label='n_estimators',
                description='Number of trees in the forest.',
                default=1250,
            ),
            ChoiceField(
                name='criterion_et',
                label='criterion',
                description='Function to measure the quality of a split.',
                default="entropy",
                choices=["gini", "entropy"],
            ),
            FloatField(
                name='min_samples_split_et',
                label='min_samples_split',
                description='Minimum number of samples required to split an internal node. \
                             If int, then min_samples_split specifies the minimum number. \
                             If float, then min_samples_split specifies a fraction of the total number of samples.',
                default=2,
            ),
            FloatField(
                name='min_samples_leaf_et',
                label='min_samples_leaf',
                description='Minimum number of samples required to be at a leaf node. \
                             If int, then min_samples_leaf specifies the minimum number. \
                             If float, then min_samples_leaf specifies a fraction of the total number of samples.',
                default=1,
            ),
            ChoiceField(
                name='max_features_et',
                label='max_features',
                description='The number of features to consider when looking for the best split.',
                default='"log2"',
                choices=["None", '"auto"', '"sqrt"', '"log2"'],
            ),
            FloatField(
                name='min_impurity_decrease_et',
                label='min_impurity_decrease',
                description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value.',
                default=0.0,
            ),
            ChoiceField(
                name='class_weight_et',
                label='class_weight',
                description='Weights associated with classes. If None, then all classes have weight one. \
                             The balanced mode adjusts weights inversely proportional to class frequencies in the input data. \
                             The balanced_subsample mode is the same as balanced except weights are computed based on the bootstrap sample for each tree.',
                default='"balanced"',
                choices=["None", '"balanced"', '"balanced_subsample"'],
            ),
            FloatField(
                name='ccp_alpha_et',
                label='ccp_alpha',
                description='Complexity parameter used for Minimal Cost-Complexity Pruning. \
                             The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. \
                             By default, no pruning is performed.',
                default=0.0,
            ),
        ],
        'DecisionTreeClassifier': [
            ChoiceField(
                name='criterion_dt',
                label='criterion',
                description='Function to measure the quality of a split.',
                default="gini",
                choices=["gini", "entropy"],
            ),
            ChoiceField(
                name='splitter_dt',
                label='splitter',
                description='Strategy used to choose the split at each node.',
                default="best",
                choices=["best", "random"],
            ),
            FloatField(
                name='min_samples_split_dt',
                label='min_samples_split',
                description='Minimum number of samples required to split an internal node. \
                             If int, then min_samples_split specifies the minimum number. \
                             If float, then min_samples_split specifies a fraction of the total number of samples.',
                default=2,
            ),
            FloatField(
                name='min_samples_leaf_dt',
                label='min_samples_leaf',
                description='Minimum number of samples required to be at a leaf node. \
                             If int, then min_samples_leaf specifies the minimum number. \
                             If float, then min_samples_leaf specifies a fraction of the total number of samples.',
                default=1,
            ),
            ChoiceField(
                name='max_features_dt',
                label='max_features',
                description='The number of features to consider when looking for the best split.',
                default="None",
                choices=["None", '"auto"', '"sqrt"', '"log2"'],
            ),
            FloatField(
                name='min_impurity_decrease_dt',
                label='min_impurity_decrease',
                description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value.',
                default=0.0,
            ),
            ChoiceField(
                name='class_weight_dt',
                label='class_weight',
                description='Weights associated with classes. If None, then all classes have weight one. \
                             The balanced mode adjusts weights inversely proportional to class frequencies in the input data. \
                             The balanced_subsample mode is the same as balanced except weights are computed based on the bootstrap sample for each tree.',
                default='"balanced"',
                choices=["None", '"balanced"', '"balanced_subsample"'],
            ),
            FloatField(
                name='ccp_alpha_dt',
                label='ccp_alpha',
                description='Complexity parameter used for Minimal Cost-Complexity Pruning. \
                             The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. \
                             By default, no pruning is performed.',
                default=0.0,
            ),
        ],
        'KNeighborsClassifier': [
            IntField(
                name='n_neighbors_knn',
                label='n_neighbors',
                description='Number of neighbors to use for queries.',
                default=5,
            ),
            ChoiceField(
                name='weights_knn',
                label='weights',
                description='Weight function used in prediction. \
                             If uniform, all points in each neighborhood are weighted equally. \
                             If distance, points are weighted by the inverse of their distance.',
                default="uniform",
                choices=["uniform", "distance"],
            ),
            ChoiceField(
                name='algorithm_knn',
                label='algorithm',
                description='Algorithm used to compute the nearest neighbors.',
                default="auto",
                choices=["auto", "ball_tree", "kd_tree", "brute"],
            ),
            IntField(
                name='leaf_size_knn',
                label='leaf_size',
                description='Leaf size passed to BallTree or KDTree.',
                default=30,
            ),
            IntField(
                name='p_knn',
                label='p',
                description='Power parameter for the Minkowski metric.',
                default=2,
            ),
            ChoiceField(
                name='metric_knn',
                label='metric',
                description='Distance metric to use for the tree.',
                default="minkowski",
                choices=["minkowski", "euclidean", "manhattan", "chebyshev"],
            ),
        ],
        'RadiusNeighborsClassifier': [
            FloatField(
                name='radius_rn',
                label='radius',
                description='Range of parameter space to use for queries.',
                default=1.0,
            ),
            ChoiceField(
                name='weights_rn',
                label='weights',
                description='Weight function used in prediction. \
                             If uniform, all points in each neighborhood are weighted equally. \
                             If distance, points are weighted by the inverse of their distance.',
                default="uniform",
                choices=["uniform", "distance"],
            ),
            ChoiceField(
                name='algorithm_rn',
                label='algorithm',
                description='Algorithm used to compute the nearest neighbors.',
                default="auto",
                choices=["auto", "ball_tree", "kd_tree", "brute"],
            ),
            IntField(
                name='leaf_size_rn',
                label='leaf_size',
                description='Leaf size passed to BallTree or KDTree.',
                default=30,
            ),
            IntField(
                name='p_rn',
                label='p',
                description='Power parameter for the Minkowski metric.',
                default=2,
            ),
            ChoiceField(
                name='metric_rn',
                label='metric',
                description='Distance metric to use for the tree.',
                default="minkowski",
                choices=["minkowski", "euclidean", "manhattan", "chebyshev"],
            ),
        ],
        'MLPClassifier': [
            StringField(
                name='hidden_layer_sizes_mlp',
                label='hidden_layer_sizes',
                description='Enter a tuple, where the ith element represents the number of neurons in the ith hidden layer.',
                hint='Enter a tuple: e.g. (128, 64)',
                default='(100,)',
                constraint='^\\(\\s*(?:\\d+,\\s*)+(?:\\d+,?\\s*)?\\)$',
            ),
            ChoiceField(
                name='activation_mlp',
                label='activation',
                description='Activation function for the hidden layer.',
                default="relu",
                choices=["identity", "logistic", "tanh", "relu"],
            ),
            ChoiceField(
                name='solver_mlp',
                label='solver',
                description='Solver for weight optimization.',
                default="adam",
                choices=["lbfgs", "sgd", "adam"],
            ),
            FloatField(
                name='alpha_mlp',
                label='alpha',
                description='L2 penalty (regularization term) parameter.',
                default=0.0001,
            ),
            ChoiceField(
                name='learning_rate_mlp',
                label='learning_rate',
                description='Learning rate schedule for weight updates. Only used for sgd solver.',
                default="constant",
                choices=["constant", "invscaling", "adaptive"],
            ),
            FloatField(
                name='learning_rate_init_mlp',
                label='learning_rate_init',
                description='The initial learning rate used. Controls the step-size in updating the weights. Only used for sgd or adam solver.',
                default=0.001,
            ),
            FloatField(
                name='power_t_mlp',
                label='power_t',
                description='Exponent for inverse scaling learning rate. Only used for sgd solver with invscaling for learning_rate.',
                default=0.5,
            ),
            IntField(
                name='max_iter_mlp',
                label='max_iter',
                description='Maximum number of iterations. The solver iterates until convergence (determined by tol) or this number of iterations.',
                default=200,
            ),
            FloatField(
                name='tol_mlp',
                label='tol',
                description='Tolerance for the optimization.',
                default=1e-4,
            ),
            BoolField(
                name='early_stopping_mlp',
                label='early_stopping',
                description='Whether to use early stopping to terminate training when validation score is not improving.',
                default=False,
            ),
            FloatField(
                name='validation_fraction_mlp',
                label='validation_fraction',
                description='The proportion of training data to set aside as validation set for early stopping.',
                default=0.1,
            ),
        ],
        'SVC': [
            FloatField(
                name='C_svm',
                label='C',
                description='Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty.',
                default=1.0,
            ),
            ChoiceField(
                name='kernel_svm',
                label='kernel',
                description='Specifies the kernel type to be used in the algorithm.',
                default="rbf",
                choices=["linear", "poly", "rbf", "sigmoid", "precomputed"],
            ),
            IntField(
                name='degree_svm',
                label='degree',
                description='Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.',
                default=3,
            ),
            ChoiceField(
                name='gamma_svm',
                label='gamma',
                description='Kernel coefficient for rbf, poly and sigmoid kernels.',
                default="scale",
                choices=["scale", "auto"],
            ),
            FloatField(
                name='coef0_svm',
                label='coef0',
                description='Independent term in kernel function. It is only significant in poly and sigmoid.',
                default=0.0,
            ),
            BoolField(
                name='shrinking_svm',
                label='shrinking',
                description='Whether to use the shrinking heuristic.',
                default=True,
            ),
            FloatField(
                name='tol_svm',
                label='tol',
                description='Tolerance for stopping criterion.',
                default=1e-3,
            ),
            ChoiceField(
                name='class_weight_svm',
                label='class_weight',
                description='Weights associated with classes. If None, then all classes have weight one. \
                             The balanced mode adjusts weights inversely proportional to class frequencies in the input data.',
                default='"balanced"',
                choices=["None", '"balanced"'],
            ),
            IntField(
                name='max_iter_svm',
                label='max_iter',
                description='Hard limit on iterations within solver, or -1 for no limit.',
                default=-1,
            ),
        ],
    },
    section='SETTINGS'
) %}
{# Boolean switch, on by default, for calibrating the predicted probabilities. #}
{% set calibrated = BoolField(
    name='calibrated',
    label='Calibrate algorithm predictions',
    description='Calibrate the prediction probabilities, eliminating model-imparted bias.',
    default=True,
    section='SETTINGS',
) %}
{# Choice values are splitter class names, not instances; the two *GroupKFold stratified variants without the sk. prefix are the classes imported from the local split module. Presumably instantiated later with the fold and repeat counts - confirm. #}
{% set cv_algorithm = ChoiceField(
    name='cv_algorithm',
    label='Cross-Validation Algorithm',
    description='Select a cross-validation method for training and evaluating the pipeline, and for making predictions. \
                 StratifiedGroupKFold or RepeatedStratifiedGroupKFold are recommended because they will maintain class ratios \
                 across train/validation splits (stratification of labels) and will group compounds by the first 14 characters of their \
                 InChI Keys to avoid compounds with multiple entries from appearing in both the train and validation sets.',
    default='RepeatedStratifiedGroupKFold',
    choices={
        'KFold': 'sk.model_selection.KFold',
        'GroupKFold': 'sk.model_selection.GroupKFold',
        'RepeatedKFold': 'sk.model_selection.RepeatedKFold',
        'StratifiedKFold': 'sk.model_selection.StratifiedKFold',
        'StratifiedGroupKFold': 'StratifiedGroupKFold',
        'RepeatedStratifiedKFold': 'sk.model_selection.RepeatedStratifiedKFold',
        'RepeatedStratifiedGroupKFold': 'RepeatedStratifiedGroupKFold'
    },
    section='SETTINGS',
) %}
{# Number of cross-validation folds, restricted to the range 2-10. #}
{% set cross_validation_n_folds = IntField(
    name='cross_validation_n_folds',
    label='Number of Cross-Validated Folds',
    description='Cross-validation is employed as a strategy to evaluate the model on data that it has not seen during training. With more folds, each model is trained on a larger fraction of the data.',
    default=10,
    min=2,
    max=10,
    section='SETTINGS'
) %}
{# Number of cross-validation repetitions (minimum 2); per the description it only applies to the Repeated* cross-validation algorithms. #}
{% set cross_validation_n_repeats = IntField(
    name='cross_validation_n_repeats',
    label='Number of Cross-Validated Repetitions',
    description='Number of repetitions of cross-validation to perform. \
                 Only used for RepeatedKFold, RepeatedStratifiedKFold, or RepeatedStratifiedGroupKFold cross-validation algorithms, \
                 which repeat cross-validation with different randomizations. This yields multiple predictions per compound, which can be evaluated for consistency.',
    default=3,
    min=2,
    section='SETTINGS'
) %}
{# Single metric used to assess the model (default roc_auc). The names are presumably passed on as scikit-learn scoring identifiers - confirm against the consuming cell. #}
{% set primary_metric = ChoiceField(
    name='primary_metric',
    label='Primary Evaluation Metric',
    default='roc_auc',
    description='The primary evaluation metric is used for deciding how we assess the performance of our model. \
                 Area under the receiver operating characteristic curve (roc_auc) is recommended for most tasks.',
    choices=[
        'accuracy',
        'adjusted_mutual_info_score',
        'adjusted_rand_score',
        'average_precision',
        'balanced_accuracy',
        'completeness_score',
        'explained_variance',
        'f1',
        'f1_macro',
        'f1_micro',
        'f1_weighted',
        'fowlkes_mallows_score',
        'homogeneity_score',
        'jaccard',
        'jaccard_macro',
        'jaccard_micro',
        'jaccard_weighted',
        'max_error',
        'mutual_info_score',
        'neg_brier_score',
        'neg_log_loss',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_mean_squared_log_error',
        'neg_median_absolute_error',
        'neg_root_mean_squared_error',
        'normalized_mutual_info_score',
        'precision',
        'precision_macro',
        'precision_micro',
        'precision_weighted',
        'r2',
        'recall',
        'recall_macro',
        'recall_micro',
        'recall_weighted',
        'roc_auc',
        'roc_auc_ovo',
        'roc_auc_ovo_weighted',
        'roc_auc_ovr',
        'roc_auc_ovr_weighted',
        'v_measure_score'
    ],
    section='SETTINGS'
) %}
{# Optional additional scorer names (same choices as `primary_metric`) that are computed on every cross-validation split alongside the primary metric. #}
{% set evaluation_metrics = MultiChoiceField(
    name='evaluation_metrics',
    label='Evaluation Metrics',
    default=[],
    description='Additional evaluation metrics can be specified, these metrics will also be reported for all models trained.',
    value=[],
    choices=[
        'accuracy',
        'adjusted_mutual_info_score',
        'adjusted_rand_score',
        'average_precision',
        'balanced_accuracy',
        'completeness_score',
        'explained_variance',
        'f1',
        'f1_macro',
        'f1_micro',
        'f1_weighted',
        'fowlkes_mallows_score',
        'homogeneity_score',
        'jaccard',
        'jaccard_macro',
        'jaccard_micro',
        'jaccard_weighted',
        'max_error',
        'mutual_info_score',
        'neg_brier_score',
        'neg_log_loss',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_mean_squared_log_error',
        'neg_median_absolute_error',
        'neg_root_mean_squared_error',
        'normalized_mutual_info_score',
        'precision',
        'precision_macro',
        'precision_micro',
        'precision_weighted',
        'r2',
        'recall',
        'recall_macro',
        'recall_micro',
        'recall_weighted',
        'roc_auc',
        'roc_auc_ovo',
        'roc_auc_ovo_weighted',
        'roc_auc_ovr',
        'roc_auc_ovr_weighted',
        'v_measure_score'
    ],
    section='SETTINGS',
) %}
{# Ordered list of every metric scored per split: the primary metric first, followed by the additional ones. #}
{% set all_metrics = [primary_metric.value] + evaluation_metrics.value %}

In [None]:
%%appyter code_hide

{#
  Maps each classifier choice to a Python constructor expression containing
  positional format placeholders. The training cell fills the placeholders with
  str.format from the selected algorithm's field values, so the placeholder
  order of an entry has to match the order in which those values are passed.

  SVC: scikit-learn only provides predict_proba on an SVC constructed with
  probability=True, and the training cell always calls model.predict_proba.
  The flag is therefore appended whenever the pipeline is not wrapped in a
  calibrator (the calibrator provides predict_proba itself, so the extra
  internal cross-validation that probability=True triggers is avoided there).
#}
{% set algorithm_code = {
    'GradientBoostingClassifier': 'sk.ensemble.GradientBoostingClassifier(loss="{}", learning_rate={}, n_estimators={}, subsample={}, criterion="{}", tol={})',
    'RandomForestClassifier': 'sk.ensemble.RandomForestClassifier(n_estimators={}, criterion="{}", min_samples_split={}, min_samples_leaf={}, max_features={}, min_impurity_decrease={}, n_jobs=-1, class_weight={}, ccp_alpha={})',
    'AdaBoostClassifier': 'sk.ensemble.AdaBoostClassifier(sk.tree.DecisionTreeClassifier(max_depth={}), n_estimators={}, learning_rate={}, algorithm="{}")',
    'ExtraTreesClassifier': 'sk.ensemble.ExtraTreesClassifier(n_estimators={}, criterion="{}", min_samples_split={}, min_samples_leaf={}, max_features={}, min_impurity_decrease={}, n_jobs=-1, class_weight={}, ccp_alpha={})',
    'DecisionTreeClassifier': 'sk.tree.DecisionTreeClassifier(criterion="{}", splitter="{}", min_samples_split={}, min_samples_leaf={}, max_features={}, min_impurity_decrease={}, class_weight={}, ccp_alpha={})',
    'KNeighborsClassifier': 'sk.neighbors.KNeighborsClassifier(n_neighbors={}, weights="{}", algorithm="{}", leaf_size={}, p={}, metric="{}", n_jobs=-1)',
    'RadiusNeighborsClassifier': 'sk.neighbors.RadiusNeighborsClassifier(radius={}, weights="{}", algorithm="{}", leaf_size={}, p={}, metric="{}", outlier_label="most_frequent", n_jobs=-1)',
    'MLPClassifier': 'sk.neural_network.MLPClassifier(hidden_layer_sizes={}, activation="{}", solver="{}", alpha={}, learning_rate="{}", learning_rate_init={}, power_t={}, max_iter={}, tol={}, early_stopping={}, validation_fraction={})',
    'SVC': 'sk.svm.SVC(C={}, kernel="{}", degree={}, gamma="{}", coef0={}, shrinking={}, tol={}, class_weight={}, max_iter={}' ~ ('' if calibrated.value else ', probability=True') ~ ')',
} %}

In [None]:
%%appyter markdown

We apply a sklearn pipeline with a dimensionality reduction step of {{ dimensionality_reduction.raw_value }}
{% if feature_selection.value != 'None' %}and a feature selection step of {{ feature_selection.raw_value }}
{% endif %} and a{% if calibrated.value %} calibrated{%endif %} {{ algorithm.raw_value }} classifier
using {{ cross_validation_n_folds.value }}-fold {{ cv_algorithm.raw_value }} cross-validation,
optimizing {{ primary_metric.value }}{% if evaluation_metrics.value %} and computing {{ ', '.join(evaluation_metrics.value) }}{% endif %}.

Note that training can take a long time as we are training a model for each of multiple cross-validation splits.

In [None]:
%%appyter code_exec

cv = {{ cv_algorithm }}(
    n_splits={{ cross_validation_n_folds }},
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    n_repeats={{ cross_validation_n_repeats }},
    {% else %}
    shuffle=True,
    {% endif %}
    random_state=rng,
)

{% if cv_algorithm.raw_value in ['GroupKFold', 'StratifiedGroupKFold', 'RepeatedStratifiedGroupKFold'] %}
groups=[key[:14] for key in X.index]    # Group compounds by atom connectivity
{% endif %}

# Scoring parameters
primary_metric = '{{ primary_metric }}'
evaluation_metrics = {{ evaluation_metrics }}
scoring_params = {k: metrics.get_scorer(k)
                  for k in [primary_metric, *evaluation_metrics]}

In [None]:
%%appyter code_exec

# Train one pipeline per cross-validation split and collect, for each split, the metric
# scores, the ROC / precision-recall curve points, the test-set predictions and the fitted model.

# One row per split, one column per metric
df_results = pd.DataFrame()

# Store performance on each split for computing ROC and PRC curves
fprs = []
tprs = []
precs = []
recs = []

# Store cross-validation test predictions and folds
# (one list per compound; an entry is appended each time the compound falls in a test set)
y_proba_cv = [[] for _ in range(len(y))]
folds_cv = [[] for _ in range(len(y))]

# Store models
models = []

{% if cv_algorithm.raw_value in ['GroupKFold', 'StratifiedGroupKFold', 'RepeatedStratifiedGroupKFold'] %}
groups=[key[:14] for key in X.index]    # Group compounds by atom connectivity
for fold, (train, test) in tqdm(enumerate(cv.split(X.values, y, groups=groups))):
{% else %}
for fold, (train, test) in tqdm(enumerate(cv.split(X.values, y))):
{% endif %}
    # Pipeline: optional dimensionality reduction, optional feature selection, then the classifier
    model = sk.pipeline.Pipeline([
                {%- if dimensionality_reduction.value != 'None' %}
                ('reduce_dim', {{ dimensionality_reduction }}),
                {% endif %}
                {%- if feature_selection.value != 'None' %}
                ('feature_selection', {{ feature_selection }}),
                {% endif %}
                ('clf', {% if algorithm.raw_value == 'MLPClassifier' %}{{ algorithm_code.get(algorithm.raw_value).format(algorithm.value[0].value|str_to_tuple, *algorithm.value[1:]) }}
                        {% elif algorithm.raw_value in ['DecisionTreeClassifier', 'RandomForestClassifier', 'ExtraTreesClassifier'] %}{{ algorithm_code.get(algorithm.raw_value).format(algorithm.value[0].value, algorithm.value[1].value, algorithm.value[2].value|int_or_float, algorithm.value[3].value|int_or_float, *algorithm.value[4:]) }}
                        {% else %}{{ algorithm_code.get(algorithm.raw_value).format(*algorithm.value) }}{% endif %}
                ),
            ])
    model.fit(X.values[train], y[train])
    
    {% if calibrated.value %}
    # Wrap the already-fitted pipeline in a probability calibrator.
    # NOTE(review): the calibrator is fitted on this split's test samples and the metrics and
    # predictions below are computed on those same samples, so the reported performance may be
    # optimistic -- confirm that this is intended.
    calibrator = sk.calibration.CalibratedClassifierCV(model, cv='prefit')
    calibrator.fit(X.values[test], y[test])
    model = calibrator
    {% endif %}
    
    # Score the model on the held-out samples with every requested metric
    {% for metric in all_metrics %}
    df_results.loc[fold, '{{ metric }}'] = scoring_params['{{ metric }}'](model, X.values[test], y[test])
    {% endfor %}
    
    y_proba = model.predict_proba(X.values[test]) # Probability prediction will be True
    for i in range(len(test)):
        y_proba_cv[test[i]].append(y_proba[i, 1])
        # The modulo turns the running split counter into the fold index within its repetition
        folds_cv[test[i]].append(fold % {{ cross_validation_n_folds }})
    model_fpr, model_tpr, _ = metrics.roc_curve(y[test], y_proba[:, 1])
    model_prec, model_rec, _ = metrics.precision_recall_curve(y[test], y_proba[:, 1])
    fprs.append(model_fpr)
    tprs.append(model_tpr)
    precs.append(model_prec)
    recs.append(model_rec)
    models.append(model)

# Every compound must have been part of a test set at least once
assert not(any(len(probs) == 0 for probs in y_proba_cv)), 'All probabilities should have been calculated'

# Summarise each metric across splits
display(df_results.agg(['mean', 'std']))

This visualization shows the cross-validated performance of the model. Low fold variance and high AUC are desired in a well-generalized model.
* ROC curve: [roc.svg](./roc.svg)
* Precision-recall curve: [prc.svg](./prc.svg)
* Confusion matrix: [confusion_matrix.svg](./confusion_matrix.svg)

In [None]:
%%appyter code_exec

fig, ax = plt.subplots()

tprs_interp = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, (fpr, tpr) in enumerate(zip(fprs, tprs)):
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.
    roc_auc = metrics.auc(fpr, tpr)
    tprs_interp.append(tpr_interp)
    aucs.append(roc_auc)
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    ax.plot(fpr, tpr, alpha=0.4)
    {% else %}
    ax.plot(fpr, tpr, alpha=0.4, label='ROC Fold %d (AUC=%0.3f)' % (fold, roc_auc))
    {% endif %}

mean_tpr = np.mean(tprs_interp, axis=0)
mean_tpr[-1] = 1.0
mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs_interp, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2)

ax.plot([0,1],[0,1],'--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.savefig('roc.svg')
figure_header('Figure', 'Receiver operating characteristic (ROC) curves across cross-validation splits ({})'.format(make_clickable('roc.svg')))
plt.show()
figure_legend('Figure', 'Receiver operating characteristic (ROC) curves across cross-validation splits ({})'.format(make_clickable('roc.svg')),
              'Individual curves are shown for each {{ cross_validation_n_folds }}-fold cross-validation split{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}, repeated with {{ cross_validation_n_repeats }} different randomizations{% endif %}. \
               Mean ROC shows the average and standard deviation across cross-validation splits.')

z = (mean_auc - 0.5)/std_auc
cl = sp.stats.norm.cdf(z) * 100
ci = sp.stats.norm.interval(0.95, loc=mean_auc, scale=std_auc)
print('Confidence interval (95%)', ci)
print("We are %0.3f %% confident the model's results are not just chance." % (cl))
if cl > 95:
    print('This is statistically significant. These results can be trusted.')
else:
    print('This is not statistically significant. These results should not be trusted.')

In [None]:
%%appyter code_exec

fig, ax = plt.subplots()

precs_interp = []
prc_aucs = []
mean_rec = np.linspace(0, 1, 100)

for fold, (rec, prec) in enumerate(zip(recs, precs)):
    prec_interp = np.interp(mean_rec, rec[::-1], prec[::-1])
    prc_auc = metrics.auc(rec, prec)
    precs_interp.append(prec_interp)
    prc_aucs.append(prc_auc)
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    ax.plot(rec, prec, alpha=0.4)
    {% else %}
    ax.plot(rec, prec, alpha=0.4, label='PRC Fold %d (AUC=%0.3f)' % (fold, prc_auc))
    {% endif %}
    
mean_prec = np.mean(precs_interp, axis=0)
mean_auc = sk.metrics.auc(mean_rec, mean_prec)
std_auc = np.std(prc_aucs)
ax.plot(mean_rec, mean_prec, color='b',
         label=r'Mean PRC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_prec = np.std(precs_interp, axis=0)
precs_upper = np.minimum(mean_prec + std_prec, 1)
precs_lower = np.maximum(mean_prec - std_prec, 0)
plt.fill_between(mean_rec, precs_lower, precs_upper, color='grey', alpha=.2)

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.savefig('prc.svg')
figure_header('Figure', 'Precision-recall curves (PRC) across cross-validation splits ({})'.format(make_clickable('prc.svg')))
plt.show()
figure_legend('Figure', 'Precision-recall curves (PRC) across cross-validation splits ({})'.format(make_clickable('prc.svg')),
              'Individual curves are shown for each {{ cross_validation_n_folds }}-fold cross-validation split{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}, repeated with {{ cross_validation_n_repeats }} different randomizations{% endif %}. \
               Mean PRC shows the average and standard deviation across cross-validation splits.')

In [None]:
# Classify each compound by thresholding its mean cross-validation prediction at 0.5,
# then draw the resulting confusion matrix against the known labels
mean_cv_probas = np.array([np.mean(probs) for probs in y_proba_cv])
confusion = metrics.confusion_matrix(y, mean_cv_probas > 0.5)
sns.heatmap(confusion, annot=True, cmap=plt.cm.Blues, fmt='g')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix.svg')

# Header above and legend below the figure share the same title
confusion_title = 'Confusion matrix for cross-validation predictions ({})'.format(make_clickable('confusion_matrix.svg'))
figure_header('Figure', confusion_title)
plt.show()
figure_legend(
    'Figure',
    confusion_title,
    'Note that the predicted probabilities can be greatly affected by imbalanced labels and by the model choice. \
               Thus, performance measures such as ROC and PRC, which evaluate performance across a range of prediction thresholds, \
               are more useful than the confusion-matrix, which uses an fixed cutoff of 0.5',
)

## Examine predictions

By examining the validation-set predictions, we can rank the positive compounds and identify additional compounds that were not known to be in the positive class, but nevertheless had high predictions. These may share similar properties with the known compounds.

First, we can compare the distribution of predictions for positive and negative labels.

In [None]:
%%appyter code_exec

# Calculate mean and deviation of predictions
y_probas = np.array([np.mean(probs) for probs in y_proba_cv])
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
y_probas_std = np.array([np.std(probs) for probs in y_proba_cv])
# Find minimum non-zero standard deviation to avoid dividing by zero when computing t-statistic
min_y_probas_std = max(np.min(y_probas_std[y_probas_std != 0]), 1e-10)
t_stats = (y_probas - np.mean(y_probas)) / (np.maximum(y_probas_std, min_y_probas_std)/np.sqrt({{ cross_validation_n_repeats }}))
# Calculate p-value using one-sample t-test
p_vals_t = 1-sp.stats.t({{ cross_validation_n_repeats }}-1).cdf(t_stats)
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
# Simulate mean predictions by 
y_probas_means_{{ cross_validation_n_repeats }} = []
y_probas_values = np.array(y_proba_cv).flatten()

np.random.seed(rng)
for i in tqdm(range(100000)):
    y_probas_means_{{ cross_validation_n_repeats }}.append(np.mean(np.random.choice(y_probas_values, {{ cross_validation_n_repeats }})))
    
y_probas_means_{{ cross_validation_n_repeats }} = np.array(sorted(y_probas_means_{{ cross_validation_n_repeats }}))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
y_probas_ts_{{ cross_validation_n_repeats }} = []
mean_y_probas = np.mean(y_probas)
y_probas_values = np.array(y_proba_cv).flatten()

np.random.seed(rng)
for i in tqdm(range(100000)):
    sample = np.random.choice(y_probas_values, {{ cross_validation_n_repeats }})
    y_probas_ts_{{ cross_validation_n_repeats }}.append((np.mean(sample) - mean_y_probas) / (np.maximum(np.std(sample), min_y_probas_std)/np.sqrt({{ cross_validation_n_repeats }})))
    
y_probas_ts_{{ cross_validation_n_repeats }} = np.array(sorted(y_probas_ts_{{ cross_validation_n_repeats }}))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
max_mean = np.max(y_probas_means_{{ cross_validation_n_repeats }})
p_vals = np.array(list(tqdm((1 - np.argwhere(y_probas_means_{{ cross_validation_n_repeats }} >= min(pred, max_mean))[0][0] / len(y_probas_means_{{ cross_validation_n_repeats }})
                             for pred in y_probas), total=len(y_probas))))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
max_t = np.max(y_probas_ts_{{ cross_validation_n_repeats }})
p_vals_t_sim = np.array(list(tqdm((1 - np.argwhere(y_probas_ts_{{ cross_validation_n_repeats }} >= min(t, max_t))[0][0] / len(y_probas_ts_{{ cross_validation_n_repeats }})
                                   for t in t_stats), total=len(t_stats))))
{% endif %}

In [None]:
%%appyter code_exec

sns.histplot(y_probas[y == 0], bins=int(np.sqrt(np.sum(y == 0))*10), kde_kws={'gridsize':2000}, stat = 'density', label='Not known positive compound', color='blue')
sns.histplot(y_probas[y == 1], bins=int(np.sqrt(np.sum(y == 1))*10), kde_kws={'gridsize':2000}, stat = 'density', label='Known positive compound', color='red')
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
sns.histplot(y_probas_means_{{ cross_validation_n_repeats }}, bins=int(np.sqrt(len(y_probas_means_{{ cross_validation_n_repeats }}))*10), kde_kws={'gridsize':2000}, label='Null distribution\n(simulated)', stat = 'density', color='green')
{% endif %}
plt.xlabel('Mean Predicted Probability')
plt.xlim([np.min(y_probas), np.percentile(y_probas, 99)])
plt.legend()
plt.savefig('mean-prediction-distribution.svg')
figure_header('Figure', 'Distribution of{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %} mean{% endif %} cross-validation predictions ({})'.format(make_clickable('mean-prediction-distribution.svg')))
plt.show()
figure_legend('Figure', 'Distribution of{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %} mean{% endif %} cross-validation predictions ({})'.format(make_clickable('mean-prediction-distribution.svg')),
              'Distribution of{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %} mean{% endif %} cross-validation predictions for all {number_of_compounds} compounds, \
               including both those with known positive labels and other small molecules.\
               {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %} The null distribution was simulated by drawing independent samples of predictions with replacement from the distribution of all predictions.{% endif %}'.format(number_of_compounds=X.shape[0]))

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
sns.histplot(t_stats[y == 0], bins=int(np.sqrt(np.sum(y == 0))*10), kde_kws={'gridsize':2000}, stat = 'density', label='Not known positive compound', color = 'blue')
sns.histplot(t_stats[y == 1], bins=int(np.sqrt(np.sum(y == 1))*10), kde_kws={'gridsize':2000}, stat = 'density', label='Known positive compound', color = 'red')
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
sns.histplot(y_probas_ts_{{ cross_validation_n_repeats }}, bins=int(np.sqrt(len(y_probas_ts_{{ cross_validation_n_repeats }}))*10), kde_kws={'gridsize':2000}, stat = 'density', label='Null distribution\n(simulated)', color = 'green')
{% endif %}
plt.xlabel('t-statistic')
plt.xlim([-20,20])
plt.legend()
plt.savefig('t-statistic-distribution.svg')
figure_header('Figure', 'Distribution of t-statistics ({})'.format(make_clickable('t-statistic-distribution.svg')))
plt.show()
figure_legend('Figure', 'Distribution of t-statistics ({})'.format(make_clickable('t-statistic-distribution.svg')),
              'Distributions of t-statistics for all {number_of_compounds} compounds, \
               including both those with known positive labels and other small molecules. \
               The null distribution was simulated by drawing independent samples of predictions with replacement from the distribution of all predictions.'.format(number_of_compounds=X.shape[0]))
{% endif %}

Overlaying the predictions on a visualization of the input space allows us to examine the predictions and may indicate groups of highly predicted compounds.

In [None]:
%%appyter code_exec

# Add attributes for plotting to Dataframe
# (columns are added to X_reduced_df in place and the result is written to X_reduced_df.csv)
X_reduced_df['Predicted Probability'] = y_probas
# The 1e-10 offset keeps log10 finite for predictions equal to zero
X_reduced_df['log10(pred)'] = np.log10(y_probas + 1e-10)
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
X_reduced_df['p-value'] = p_vals_t_sim
X_reduced_df['log10(p-value)'] = np.log10(X_reduced_df['p-value'])
X_reduced_df['Standard Deviation'] = y_probas_std
{% endif %}
X_reduced_df['Cross-validation fold'] = folds_cv
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
# Marker size grows as the p-value shrinks: 2 * (2 - log10(p)), capped at 10
X_reduced_df['marker size'] = 2*np.minimum(2-np.log10(X_reduced_df['p-value']), 5)
{% else %}
# Careful with the names: max_p holds the smallest -log10(pred) (i.e. the highest prediction)
# and min_p the largest (the lowest prediction). The rescaling below therefore maps the
# lowest prediction to marker size 4 and the highest prediction to marker size 10.
max_p, min_p = np.min(-X_reduced_df['log10(pred)']), np.max(-X_reduced_df['log10(pred)'])
X_reduced_df['marker size'] = (-X_reduced_df['log10(pred)'] - min_p) / (max_p - min_p) * 6 + 4
{% endif %}
# Multi-line (HTML <br>-separated) description of each compound, stored in the 'text' column
X_reduced_df['text'] = ['<br>'.join(['Drug Name: ' + str(name),
                                     'InChI Key: ' + str(inchi),
                                     'Predicted Probability: {:.1e}'.format(p),
                                     {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
                                     'Standard Deviation: {:.1e}'.format(s),
                                     'p-value: {:.1e}'.format(p_val),
                                     {% endif %}
                                     'Label: ' + str(label),
                                     'Cross-validation fold: ' + str(fold)])
                  for name, inchi, p, {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}s, p_val, {% endif %}label, fold in zip(X_reduced_df['Drug Name'],
                                                         X_reduced_df['InChI Key'],
                                                         X_reduced_df['Predicted Probability'],
                                                         {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
                                                         X_reduced_df['Standard Deviation'],
                                                         X_reduced_df['p-value'],
                                                         {% endif %}
                                                         X_reduced_df['Label'],
                                                         X_reduced_df['Cross-validation fold'])]
X_reduced_df.to_csv('X_reduced_df.csv')

# Helper function for formatting Plotly colorbar
def colorbar_param(values_log10, **kwargs):
    """Build colorbar tick settings for values that are plotted on a log10 scale.

    Ticks are placed at 1x every power of ten from floor(min) to ceil(max) of
    ``values_log10`` and at 3x every power of ten below the top one.

    Args:
        values_log10: log10-transformed values shown on the colour axis.
        **kwargs: extra colorbar options, returned unchanged in the result.

    Returns:
        dict with ``ticktext`` (tick values formatted like ``'1e-03'``),
        ``tickvals`` (log10 of the tick values) and the entries of ``kwargs``.
    """
    lowest_exponent = np.floor(np.min(values_log10))
    highest_exponent = np.ceil(np.max(values_log10))

    decade_ticks = 10**np.arange(lowest_exponent, highest_exponent+1)
    third_ticks = 3*10**np.arange(lowest_exponent, highest_exponent)
    tick_values = sorted(np.concatenate([decade_ticks, third_ticks]))

    return dict(
        ticktext=['{:.0e}'.format(tick) for tick in tick_values],
        tickvals=list(np.log10(tick_values)),
        **kwargs,
    )

# Interactive scatter plot of the reduced feature space, coloured by log10 predicted probability.
# One trace is added per label value, so every label gets its own legend entry.
fig = go.Figure()
for label in sorted(set(X_reduced_df['Label'])):
    # Points are ordered by ascending prediction within each trace
    # (presumably so that the highest predictions are drawn last -- TODO confirm)
    X_plot = X_reduced_df[X_reduced_df['Label'] == label].sort_values(['Predicted Probability'])
    fig.add_trace(go.Scatter(mode='markers',
                               x=X_plot['Component 1'], y=X_plot['Component 2'],
                               text=X_plot['text'],
                               name=label,
                               marker=dict(
                                   color=X_plot['log10(pred)'],
                                   # Colour range is shared by all traces: from the median to the
                                   # maximum log10 prediction over all compounds
                                   cmin=np.percentile(X_reduced_df['log10(pred)'], 50),
                                   cmax=np.max(X_reduced_df['log10(pred)']),
                                   size=X_plot['marker size'],
                                   colorbar=colorbar_param(X_plot['log10(pred)'], title='Predicted Probability'),
                                   symbol=X_plot['marker symbol'],
                                   line_width=1,
                                   colorscale='plasma'
                               )))
fig.update_layout(height=600, width=800,
                  xaxis_title='Component 1',
                  yaxis_title='Component 2',
                  title_text='Predicted Probabilities ({{ visualization_reduction.raw_value }})',
                  legend_title_text='Target Label',
                  # Anchor the legend inside the plot area, at its top-left corner
                  legend=dict(
                      yanchor="top",
                      y=0.98,
                      xanchor="left",
                      x=0.02
                  ),
                  template='simple_white')
figure_header('Figure', '{{ visualization_reduction.raw_value }} dimensionality reduction of the input feature space overlayed with predictions')
fig.show()
figure_legend('Figure', '{{ visualization_reduction.raw_value }} dimensionality reduction of the input feature space overlayed with predictions',
              f'Each point represents one of {X.shape[0]} compounds, with {X.shape[1]} features per compound, \
              taken from the following datasets: {", ".join(sepl1000_phenotypic_datasets + sepl1000_structural_datasets + attribute_datasets)}. \
              Compounds with known positive labels are marked by X\'s. The color and size of each point correspond to the{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %} mean{% endif %} predicted \
              probability {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}and its signficance (estimated from the simulated t-statistic null distribution), respectively{% endif %}.')

Full tables of top-predicted compounds with and without known positive labels are shown below.

In [None]:
%%appyter code_exec
# Obtain prediction results
# Each entry of the list below becomes one column of the table (the array is transposed),
# with one row per compound indexed like X. The array is built with dtype object because the
# columns hold mixed content, so the boolean and numeric columns are cast explicitly afterwards.
results = pd.DataFrame(np.array([
    querysepl1000fwd.get_drug_names(X.index),
    Drugmonizome.get_drug_names(X.index),
    folds_cv,
    y,
    y_probas,
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    y_probas_std,
    t_stats,
    p_vals,
    p_vals_t,
    p_vals_t_sim,
    {% endif %}
], dtype='object').T, columns=[
    'Name (L1000FWD)',
    'Name (Drugmonizome)',
    'Cross-validation fold',
    'Known',
    'Prediction Probability',
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    'Prediction Probability Std. Dev.',
    't statistic',
    'p value (simulated mean distribution)',
    'p value (one sample t test)',
    'p value (simulated t distribution)',
    {% endif %}
], index=X.index).astype({'Known': 'bool',
                          'Prediction Probability': 'float64',
                          {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
                          'Prediction Probability Std. Dev.': 'float64',
                          't statistic': 'float64',
                          'p value (simulated mean distribution)': 'float64',
                          'p value (one sample t test)': 'float64',
                          'p value (simulated t distribution)': 'float64',{% endif %}})

# Save the full table next to the notebook
results.to_csv('drug_cv_predictions.csv')

In [None]:
# Show every compound in an interactive table, initially sorted by predicted probability
predictions_table_title = 'Top-predicted compounds ({})'.format(make_clickable('drug_cv_predictions.csv'))
figure_header('Table', predictions_table_title)
show(
    results.reset_index(),
    maxBytes=0,
    order=[[5, "desc"]],    # column 5 of the reset frame is 'Prediction Probability'
    columnDefs=[{'width': '120px', 'targets': [0, 1]}],
)
figure_legend(
    'Table',
    predictions_table_title,
    f'All {X.shape[0]} compounds ranked by cross-validation prediction probability. \
                Search \'true\' or \'false\' to filter compounds with known positive labels or not, respectively. \
                The table can also be sorted by other columns by selecting the column name in the header.',
)

## Examine feature importances

The relative contribution of each input feature to the final model predictions can be estimated for recursive feature selection and for a variety of tree-based models. Note that this analysis is not available if a dimensionality reduction algorithm is used.

In [None]:
%%appyter markdown

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
When recursive feature selection is performed, the features are ranked by the stage at which they were removed.
Selected (i.e. estimated best) features have importance 1. The ranks are averaged across cross-validation
splits to produce an average importance score. The full feature importance table is available at
[feature_importance.csv](./feature_importance.csv).
{% endif %}

In [None]:
%%appyter code_exec

{% if dimensionality_reduction.raw_value == 'None' %}
# Collect, for every fitted model, the recursive feature selection rankings and/or the
# classifier's feature importances. Only rendered when no dimensionality reduction is used.
{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
all_rankings = []
{% endif %}
{% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
all_feature_importances = []
{% endif %}
# NOTE: every statement of the loop body is indented by 8 spaces on purpose, so that the same
# lines are valid Python whether or not the inner loop over calibrated classifiers is rendered.
# NOTE: the name `pipeline` assigned below shadows the `pipeline` module imported from sklearn
# at the top of the notebook.
for model in models:
    {% if calibrated.value %}
    # Unwrap the fitted pipeline from each calibrated classifier.
    # NOTE(review): `base_estimator` appears to have been renamed in newer scikit-learn
    # releases -- confirm against the installed version.
    for calibrated_clf in model.calibrated_classifiers_:
        pipeline = calibrated_clf.base_estimator
    {% else %}
        pipeline = model
    {% endif %}
        
        {% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' %}
        ranking = pipeline['feature_selection'].ranking_
        all_rankings.append(ranking)
        {% endif %}
        
        {% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
        {% if feature_selection.raw_value != 'None' %}
        # The classifier only saw the selected features: write its importances back into a
        # vector covering all input features, leaving the unselected ones at 0
        feature_importances = np.zeros(pipeline['feature_selection'].get_support().shape)
        feature_importances[pipeline['feature_selection'].get_support()] = pipeline['clf'].feature_importances_
        {% else %}
        feature_importances = pipeline['clf'].feature_importances_
        {% endif %}
        all_feature_importances.append(feature_importances)
        {% endif %}
{% endif %}

In [None]:
%%appyter code_exec

{% if dimensionality_reduction.raw_value == 'None' %}
df_feat_imp = pd.DataFrame({'Feature': X.columns,
                            'Dataset': reduce(lambda a,b: a+b, ([dataset]*size for dataset, size in dataset_sizes)),
                            {% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' %}
                            'Ranking Mean': np.mean(all_rankings, axis=0),
                            'Ranking Std. Dev.': np.std(all_rankings, axis=0),
                            {% endif %}
                            {% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
                            'Importance Mean': np.mean(all_feature_importances, axis=0),
                            'Importance Std. Dev.': np.std(all_feature_importances, axis=0),
                            {% endif %}
                            })
df_feat_imp = df_feat_imp.set_index('Feature').sort_values('Importance Mean', ascending=False)
{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' %}
figure_header('Table', 'Input features ranked by relative importance ({})'.format(make_clickable('feature_importance.csv')))
show(df_feat_imp.reset_index(), maxBytes=0, order=[[ 2, "asc"]])
figure_legend('Table', 'Input features ranked by relative importance ({})'.format(make_clickable('feature_importance.csv')),
              f'All {X.shape[1]} input features are ranked by their relative importance. \
                Feature ranking (Ranking Mean and Std. Dev.) specifies the round of recursive feature selection on which a given feature was eliminated. \
                A feature with lower ranking is more \
                important. {% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}Tree-based \
                models can also be used to calculate impurity-based feature importances (Importance Mean and Std. Dev.). {% endif %}Search a dataset \
                name to filter features from a given dataset. \
                The table can also be sorted by other columns by selecting the column name in the header.')
{% elif algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
figure_header('Table', 'Input features ranked by relative importance ({})'.format(make_clickable('feature_importance.csv')))
show(df_feat_imp.reset_index(), maxBytes=0, order=[[ 2, "desc"]])
figure_legend('Table', 'Input features ranked by relative importance ({})'.format(make_clickable('feature_importance.csv')),
              f'All {X.shape[1]} input features are ranked by their relative importance. \
                Tree-based models can be used to calculate impurity-based feature importances (Importance Mean and Std. Dev.). \
                Search a dataset name to filter features from a given dataset. \
                The table can also be sorted by other columns by selecting the column name in the header.')
{% else %}
figure_header('Table', 'Input features ({})'.format(make_clickable('feature_importance.csv')))
show(df_feat_imp.reset_index(), maxBytes=0)
figure_legend('Table', 'Input features ({})'.format(make_clickable('feature_importance.csv')),
              f'All {X.shape[1]} input features. No ranking of features was possible for this pipeline.')
{% endif %}
df_feat_imp.to_csv('feature_importance.csv')
{% endif %}

In [None]:
%%appyter code_exec

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
df_feat_imp = df_feat_imp.sort_values('Ranking Mean')
for dataset in set(df_feat_imp.Dataset):
    importance_scores = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Ranking Mean'].values
    importance_scores_std = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Ranking Std. Dev.'].values
    lower = importance_scores - importance_scores_std
    upper = importance_scores + importance_scores_std
    axs[0].plot(importance_scores, label=dataset)
    axs[0].fill_between(np.arange(len(importance_scores)), lower, upper, alpha=.2)
    axs[1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
    axs[1].fill_between(np.linspace(0, 1, len(importance_scores)), lower, upper, alpha=.2)
for i in [0, 1]:
    axs[i].legend()
    axs[i].set_title('Distribution of feature ranking from recursive feature elimination')
    axs[i].set_ylabel('Average feature ranking\n(lower ranking is more important)')
axs[0].set_xlabel('Ranked features (absolute count)')
axs[1].set_xlabel('Ranked features (relative count)')
axs[0].set_xlim([0,512])
plt.tight_layout()
plt.savefig('feature_importance_rfe.svg')
figure_header('Figure', 'Distribution of feature rankings from recursive feature elimination ({})'.format(make_clickable('feature_importance_rfe.svg')))
plt.show()
figure_legend('Figure', 'Distribution of feature rankings from recursive feature elimination ({})'.format(make_clickable('feature_importance_rfe.svg')),
              'The distribution of feature rankings from recursive feature elimination for each dataset. \
               Features with lower scores were retained for more rounds during recursive feature selection \
               and have greater relative importance.')
{% endif %}

In [None]:
%%appyter code_exec

{% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier']  and dimensionality_reduction.raw_value == 'None' %}
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
df_feat_imp = df_feat_imp.sort_values('Importance Mean', ascending=False)
for dataset in set(df_feat_imp.Dataset):
    importance_scores = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Mean'].values
    importance_scores_std = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Std. Dev.'].values
    lower = importance_scores - importance_scores_std
    upper = importance_scores + importance_scores_std
    axs[0][0].plot(importance_scores, label=dataset)
    axs[0][0].fill_between(np.arange(len(importance_scores)), lower, upper, alpha=.2)
    axs[0][1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
    axs[0][1].fill_between(np.linspace(0, 1, len(importance_scores)), lower, upper, alpha=.2)
    
    importance_scores = np.cumsum(df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Mean'].values)
    axs[1][0].plot(importance_scores, label=dataset)
    axs[1][1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
for i in [0, 1]:
    axs[0][i].legend()
    axs[0][i].set_title('Distribution of feature scores from model')
    axs[1][i].set_title('Cumulative distribution of feature scores from model')
    axs[i][0].set_xlabel('Ranked features (absolute count)')
    axs[i][1].set_xlabel('Ranked features (relative count)')
    axs[0][i].set_ylabel('Average feature importance\n(higher score is more important)')
    axs[1][i].set_ylabel('Cumulative sum of feature importance')
    axs[i][0].set_xlim([0,512])
plt.tight_layout()
plt.savefig('feature_importance.svg')
figure_header('Figure', 'Distribution of feature scores from model ({})'.format(make_clickable('feature_importance.svg')))
plt.show()
figure_legend('Figure', 'Distribution of feature scores from model ({})'.format(make_clickable('feature_importance.svg')),
              'The distribution of impurity-based feature importances for each dataset. \
               Features with higher scores have greater relative contribution to the overall tree-based model.')
{% endif %}