In [None]:
#%%appyter init
import os, sys; sys.path.insert(0, os.path.realpath('..'))
from appyter import magic
# Give appyter a callable that, when called with no arguments, returns this
# notebook's globals() (the `globals` builtin is bound as the lambda's default).
magic.init(lambda _=globals: _())

In [None]:
%matplotlib inline
# Imports
## Data processing
import pandas as pd
import numpy as np
import scipy as sp
## Machine Learning
import sklearn as sk
from sklearn import (
    calibration,
    decomposition,
    ensemble,
    feature_selection,
    linear_model,
    manifold,
    metrics,
    model_selection,
    multioutput,
    neighbors,
    neural_network,
    pipeline,
    preprocessing,
    svm,
    tree,
    feature_extraction,
)
from split import StratifiedGroupKFold, RepeatedStratifiedGroupKFold
import umap
## Plotting
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
## Drugmonizome API
# from drugmonizome import Drugmonizome
## SEP-L1000 data retrieval
from sepl1000 import SEPL1000
## L1000FWD queries
import querysepl1000fwd
## Match drug name inputs using PubChem API
# from DrugNameConverter import DrugNameConverter
# Utility
import os
import re
import json
from functools import reduce
from IPython.display import display
from tqdm import tqdm

In [None]:
# Fixed integer seed; passed as random_state to UMAP and to the cross-validation splitter
rng = 2020
# Remove pandas' cap on the number of rows shown when a frame is displayed
pd.set_option('display.max_rows', None)

## Select Input Datasets and Target Classes

Selected drug set libraries and omics datasets are downloaded and joined on the drug identifier (InChI Key) to produce a large association matrix. A machine learning model will be trained to predict the specified target labels from this association matrix. This is a binary classification task that can be used to predict drugs that are likely to be associated with the target class.

In [None]:
%%appyter hide
{# Form section that groups the attribute-dataset fields declared below (they all use section='ATTRIBUTES'). #}
{% do SectionField(
    title='Attribute Dataset Selection',
    subtitle='Select the input datasets to use for learning and classification. \
              A model will be trained to predict the target labels from the selected attributes. \
              If no datasets are selected, default attributes will be used.',
    name='ATTRIBUTES',
    img='attributes.png',
) %}

{# SEP-L1000 tables; each choice label is a key of the name_to_file dict in the download cell. #}
{% set sepl1000datasets = MultiChoiceField(
    name='sepl1000datasets',
    label='SEP-L1000',
    description='These input datasets were used previously for side effect prediction (https://maayanlab.net/SEP-L1000/).',
    choices=[
        'LINCS Gene Expression Signatures',
        'GO Transformed Signatures (PAEA)',
        'MLPCN Cell Morphological Profiling',
        'MACCS Chemical Fingerprint',
    ],
    default=['LINCS Gene Expression Signatures', 'GO Transformed Signatures (PAEA)'],
    section='ATTRIBUTES'
) %}

{# L1000FWD drug-set libraries; the selection is concatenated into attribute_datasets at the end of this cell. #}
{% set exprdatasets = MultiChoiceField(
    name='exprdatasets',
    label='L1000FWD (drug sets)',
    choices=[
        'L1000FWD Downregulated GO Biological Processes',
        'L1000FWD Downregulated GO Cellular Components',
        'L1000FWD Downregulated GO Molecular Function',
        'L1000FWD Downregulated KEGG Pathways',
        'L1000FWD Downregulated Signatures',
        'L1000FWD Predicted Side Effects',
        'L1000FWD Upregulated GO Biological Process',
        'L1000FWD Upregulated GO Cellular Components',
        'L1000FWD Upregulated GO Molecular Function',
        'L1000FWD Upregulated KEGG Pathways',
        'L1000FWD Upregulated Signatures',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Target / associated-gene drug-set libraries; also concatenated into attribute_datasets. #}
{% set targetdatasets = MultiChoiceField(
    name='targetdatasets',
    label='Drug Targets and Associated Genes (drug sets)',
    choices=[
        'Downregulated CREEDS Signatures',
        'Upregulated CREEDS Signatures',
        'DrugCentral Targets',
        'DrugRepurposingHub Drug Targets',
        'Drugbank Small Molecule Carriers',
        'Drugbank Small Molecule Enzymes',
        'Drugbank Small Molecule Targets',
        'Drugbank Small Molecule Transporters',
        'Geneshot Associated Genes',
        'Geneshot Predicted AutoRIF Genes',
        'Geneshot Predicted Coexpression Genes',
        'Geneshot Predicted Enrichr Genes',
        'Geneshot Predicted GeneRIF Genes',
        'Geneshot Predicted Tagger Genes',
        'KinomeScan Kinases',
        'PharmGKB Single Nucleotide Polymorphisms',
        'STITCH Targets',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Indication / mode-of-action / side-effect drug-set libraries; also concatenated into attribute_datasets. #}
{% set indicationdatasets = MultiChoiceField(
    name='indicationdatasets',
    label='Indications, Modes of Action, and Side Effects (drug sets)',
    choices=[
        'ATC Codes Drugsetlibrary',
        'DrugRepurposingHub Mechanisms of Action',
        'PharmGKB OFFSIDES Side Effects',
        'SIDER Indications',
        'SIDER Side Effects',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Structural-feature drug-set libraries; also concatenated into attribute_datasets. #}
{% set structuraldatasets = MultiChoiceField(
    name='structuraldatasets',
    label='Structural Features (drug sets)',
    choices=[
        'RDKIT MACCS Chemical Fingerprints'
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Switches the dataset joins in the code cells between how='outer' (True) and how='inner' (False). #}
{% set keepmissing = BoolField(
    name='keepmissing',
    label='Keep drugs with missing data when joining datasets',
    description='Keep drugs that appear in some datasets and not in others. \
                 Missing data is filled in with zeros. Otherwise, only drugs \
                 that are present in all datasets are preserved.',
    default=False,
    section='ATTRIBUTES',
) %}

{# When True, the binarized attribute matrix is passed through TfidfTransformer. #}
{% set tfidf = BoolField(
    name='tfidf',
    label='Apply tf–idf normalization to binary inputs',
    description='For binary drug-attribute associations in the input matrix, \
                 apply tf-idf transformation to normalize data.',
    default=True,
    section='ATTRIBUTES',
) %}

{# Flat list of every selected drug-set library; the SEP-L1000 selection stays separate in sepl1000datasets. #}
{% set attribute_datasets = exprdatasets.value +
                             targetdatasets.value +
                             indicationdatasets.value +
                             structuraldatasets.value %}

In [None]:
%%appyter markdown

To construct the input matrix, we download drug set libraries and omics datasets and join them on the InChI Key.
{% if keepmissing.value %} Drugs that appear in some datasets and not in others are retained, and missing data is filled in with zeros.
{% else %} Only drugs that are present in all datasets are retained.
{% endif %}

In [None]:
%%appyter hide
{# Form section that groups the target-label fields declared below (they all use section='TARGET'). #}
{% do SectionField(
    title='Target Label Selection',
    subtitle='Upload a list of drugs to be given positive class labels for binary classification. \
              Drugs should be in a text file, specified by either drug name or InChI Key and separated by newlines. \
              If no file is selected, a default list of hits from COVID-19 drug screens will be used.',
    name='TARGET',
    img='target.png',
) %}

{# Selects the parsing branch for the hit list: 'InChI Key' reads keys directly, 'Drug Name' maps names to keys first. #}
{% set drugformat = ChoiceField(
    name='drugformat',
    label='Drug Identifier Format',
    default='InChI Key',
    choices=[
        'Drug Name',
        'InChI Key'
    ],
    section='TARGET'
) %}

{# Uploaded hit list; rendered into the notebook as the value assigned to hits_filename. #}
{% set drughitlist = FileField(
    name='drughitlist',
    label='Upload List of Drug Hits',
    default='COVID19ScreenHitsInChIKeys.txt',
    examples={
        'COVID19ScreenHits.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/COVID19ScreenHits.txt',
        'COVID19ScreenHitsInChIKeys.txt': 'https://appyters.maayanlab.cloud/storage/Drugmonizome_ML/COVID19ScreenHitsInChIKeys.txt',
    },
    section='TARGET'
) %}

{# When True, hits are matched on the first 14 characters of the InChI Key instead of the full key. #}
{% set includestereo = BoolField(
    name='includestereo',
    label='Include stereoisomers',
    description='If true, drugs are matched to entries in the datasets by the first 14 characters of their InChI Keys, \
                 so stereoisomers of the drugs in the input list are also counted as hits. \
                 Note that different resources record different details for charge and stereochemistry, \
                 causing some drugs to have different full-length InChI Keys in different datasets. \
                 Selecting this option may allow such drugs to be better matched to entries in the datasets.',
    default=True,
    section='TARGET',
) %}

In [None]:
%%appyter code_exec

# Use the selected SEP-L1000 datasets
sepl1000datasets = {{ sepl1000datasets }}

name_to_file = {
    'LINCS Gene Expression Signatures': 'LINCS_Gene_Experssion_signatures_CD.csv.gz',
    'GO Transformed Signatures (PAEA)': 'GO_transformed_signatures_PAEA.csv.gz',
    'MLPCN Cell Morphological Profiling': 'MLPCN_morplological_profiles.csv.gz',
    'MACCS Chemical Fingerprint': 'MACCS_bitmatrix.csv.gz',
}

df_sepl1000_list = list(SEPL1000.download_df(list(name_to_file[dataset] for dataset in sepl1000datasets),
                                             index_col=0))
dataset_sizes = list(zip(sepl1000datasets, [dataset.shape[1] for dataset in df_sepl1000_list]))

# Assemble all SEP-L1000 datasets
if len(df_sepl1000_list) > 1:
    # Obtain merged dataframe with omics and target data
    df_sepl1000 = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_sepl1000_list,
    )
else:
    df_sepl1000 = df_sepl1000_list[0]

# del(df_sepl1000_list)

# Mean-fill infinite and missing values
df_sepl1000 = df_sepl1000.replace([np.inf, -np.inf], np.nan)
df_sepl1000 = df_sepl1000.fillna(np.mean(df_sepl1000))
print('Total shape:', df_sepl1000.shape)
display(df_sepl1000.head())

In [None]:
%%appyter code_exec

{% if attribute_datasets == [] and sepl1000datasets == [] %}
# No datasets selected, so use default datasets
attribute_datasets = ['L1000FWD Downregulated Signatures',
                      'L1000FWD Upregulated Signatures',
                      'RDKIT MACCS Chemical Fingerprints']
{% else %}
# Use the selected attribute datasets
attribute_datasets = {{ attribute_datasets }}
{% endif %}

{% if attribute_datasets == [] and sepl1000datasets != [] %}
X = df_sepl1000
{% else %}
df_attributes = list(Drugmonizome.download_df(
    [dataset
     for dataset in attribute_datasets]
))
dataset_sizes += list(zip(sepl1000datasets, [dataset.shape[1] for dataset in df_sepl1000_list]))

# Assemble all attribute datasets
if len(df_attributes) > 1:
    # Obtain merged dataframe with omics and target data
    df = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_attributes,
    )
else:
    df = df_attributes[0]

df = df.fillna(0)
X = df.applymap(lambda f: 1 if f!=0 else 0)
{% if tfidf.value %}
# Apply tf-idf normalization
transformer = feature_extraction.text.TfidfTransformer()
X_tfidf = transformer.fit_transform(X)
X = pd.DataFrame.sparse.from_spmatrix(X_tfidf, columns=X.columns, index=X.index)
X = pd.merge(df_sepl1000, X, left_index=True, right_index=True)
{% endif %}
{% endif %}

print('Total shape:', X.shape)
display(X.head())

In [None]:
%%appyter markdown

The target labels are produced from the uploaded list of hits: 1 if the drug is specified as a hit, 0 otherwise.
{% if drugformat.value == 'Drug Name' %} Drug names are mapped to InChI Keys by querying the PubChem, Drugmonizome, and L1000FWD APIs.
{% endif %}

In [None]:
%%appyter code_exec

{% if drughitlist.value == '' %}
# Using default list of hits from COVID-19 in vitro drug screens
# NOTE(review): this fallback path is used for both identifier formats; presumably the
# file holds drug names rather than InChI Keys -- confirm it matches the selected format.
hits_filename = '../../COVID19ScreenHits.txt'
{% else %}
# Using user-specified list of positive drug hits
# (the upload field is rendered unquoted, i.e. as the Python expression for the path)
hits_filename = {{drughitlist}}
{% endif %}

{% if drugformat.value == 'InChI Key' %}
# Collect the upper-cased InChI Keys, one per non-blank line of the hits file
with open(hits_filename, 'r') as hits_file:
    drug_hits = {line.strip().upper() for line in hits_file if line.strip()}

{% elif drugformat.value == 'Drug Name' %}
# Helper functions
def merge(A, B, f):
    """
    Combine two dictionaries into a new one.

    Keys found in only one input keep that input's value; keys found in both
    get ``f(A[key], B[key])``. Neither input is modified.
    """
    combined = {}
    # Keys that belong to exactly one of the two dictionaries
    for key in A.keys() ^ B.keys():
        combined[key] = A[key] if key in A else B[key]
    # Keys present in both are reconciled by the caller-supplied function
    for key in A.keys() & B.keys():
        combined[key] = f(A[key], B[key])
    return combined
def save_items(out_file, items):
    """
    Write each item on its own line of ``out_file``; the final line gets no
    trailing newline.
    """
    final_index = len(items) - 1
    with open(out_file, 'w') as handle:
        for index, item in enumerate(items):
            # Every row but the last is newline-terminated
            terminator = '' if index == final_index else '\n'
            handle.write(item + terminator)

def save_gmt(out_file, keys_to_sets, sep='\t'):
    """
    Write a key -> collection mapping as GMT-style rows via save_items.

    Each row is the key, two separators (an empty second column), then the
    sorted members joined by ``sep``. Rows are ordered by key.
    """
    rows = [
        key + sep*2 + sep.join(sorted(keys_to_sets[key]))
        for key in sorted(keys_to_sets)
    ]
    save_items(out_file, rows)

# Load the lower-cased drug names, one per non-blank line of the hits file
with open(hits_filename, 'r') as hits_file:
    drug_hits = {line.strip().lower() for line in hits_file if line.strip()}

# Resolve the names to InChI Keys through three resources in turn
# PubChem
print('Querying PubChem API...')
drug_hits_inchi_pubchem = DrugNameConverter.batch_to_inchi_keys(drug_hits)
# Drugmonizome
print('Querying Drugmonizome API...')
drug_hits_inchi_drugmonizome = Drugmonizome.map_names_to_inchi_keys(drug_hits)
# L1000FWD
print('Querying L1000FWD API...')
drug_hits_inchi_l1000fwd = querysepl1000fwd.map_names_to_inchi_keys(drug_hits)

# Union the per-name key sets from all three resources
drug_hits_inchi = merge(
    merge(drug_hits_inchi_pubchem, drug_hits_inchi_drugmonizome, lambda s1, s2: s1 | s2),
    drug_hits_inchi_l1000fwd,
    lambda s1, s2: s1 | s2,
)
save_gmt('hits_drug_name_to_inchi_keys.gmt', drug_hits_inchi)
# Names that are absent from the mapping or mapped to nothing
unmatched_drugs = {drug for drug in drug_hits
                   if len(drug_hits_inchi.get(drug, ())) == 0}
print(f'Drugs without InChI Keys ({ len(unmatched_drugs) }/{ len(drug_hits) }):', unmatched_drugs)

# From here on drug_hits holds InChI Keys rather than names
drug_hits = set().union(*drug_hits_inchi.values())
save_items('hits_inchi_keys.txt', sorted(drug_hits))
{% endif %}

In [None]:
%%appyter markdown

{% if drugformat.value == 'Drug Name' %}
For the user-inputted drug names:
* Mapping of drug name to InChI Key: [hits_drug_name_to_inchi_keys.gmt](./hits_drug_name_to_inchi_keys.gmt)
* List of InChI Keys: [hits_inchi_keys.txt](./hits_inchi_keys.txt)
{% endif %}

We produce a target array containing 1 if the drug is specified as a hit and 0 otherwise.

In [None]:
%%appyter code_exec

{% if includestereo.value %}
# Match first 14 characters of InChI Keys (hash of InChI connectivity information)
drug_hits_inchi_main_layer = set(key[:14] for key in drug_hits)
y = np.array([drug[:14] in drug_hits_inchi_main_layer for drug in X.index]).astype(np.int8)
{% else %}
# Match full InChI Keys
y = np.array([drug in drug_hits for drug in X.index]).astype(np.int8)
{% endif %}
print('Number of hits matched in input: %d (%0.3f %%)' % (y.sum(), 100*y.sum()/len(y)))

In [None]:
# Report the dimensions of the feature matrix X and of the label vector y
print('Input shape:', X.shape)
print('Target shape:', y.shape)

## Dimensionality Reduction and Visualization

In [None]:
%%appyter hide
{# Form section for the visualization and machine-learning fields (they use section='SETTINGS'). #}
{% do SectionField(
    title='Machine Learning Pipeline',
    subtitle='Select from available machine learning algorithms, their unique settings, and methods to use to evaluate the classifier.',
    name='SETTINGS',
    img='settings.png',
) %}

{# Reducer used for the 2-D scatter plot. Each value is the Python expression rendered into
   the notebook. Every estimator that accepts a seed receives random_state=rng so the
   embedding is reproducible; IncrementalPCA takes no random_state. #}
{% set visualization_reduction = ChoiceField(
    name='visualization_reduction',
    label='Data Visualization Method',
    description='A dimensionality reduction algorithm should be selected for data visualization.',
    default='UMAP',
    choices={
        'UMAP': 'umap.UMAP(random_state=rng)',
        'NMF': 'sk.decomposition.NMF(n_components=2, random_state=rng)',
        'PCA': 'sk.decomposition.PCA(n_components=2, random_state=rng)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=2, random_state=rng)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=2)',
        'ICA': 'sk.decomposition.FastICA(n_components=2, random_state=rng)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=2, random_state=rng)',
    },
    section='SETTINGS'
) %}

In [None]:
%%appyter markdown

We reduce the dimensionality of our omics feature space for visualization with {{ visualization_reduction.raw_value }}.

In [None]:
%%appyter code_exec
# Instantiate the reducer chosen in the form (the field renders as a constructor call)
clf_dimensionality_reduction = {{ visualization_reduction }}
# Fit on the raw values of X and keep the low-dimensional projection
X_reduced = clf_dimensionality_reduction.fit_transform(X.values)
{% if visualization_reduction.raw_value == 'PCA' %}
# Sum of the fitted explained_variance_ entries
# NOTE(review): explained_variance_ratio_ may have been intended here -- confirm
print('Explained variance:', np.sum(clf_dimensionality_reduction.explained_variance_))
{% endif %}

In [None]:
# Plotting table: the two projected coordinates plus per-drug annotations
X_reduced_df = pd.DataFrame(X_reduced, columns=['Component 1', 'Component 2'])
X_reduced_df['Drug Name'] = querysepl1000fwd.get_drug_names(X.index)
X_reduced_df['InChI Key'] = X.index
X_reduced_df['Label'] = y
# Hits are drawn as crosses, every other drug as a circle
X_reduced_df['marker symbol'] = [('x' if is_hit else 'circle')
                                 for is_hit in X_reduced_df['Label']]
# Hover text shown for each point
X_reduced_df['text'] = [
    '<br>'.join(('Drug Name: ' + str(name),
                 'InChI Key: ' + str(inchi),
                 'Label: ' + str(label)))
    for name, inchi, label in X_reduced_df[['Drug Name', 'InChI Key', 'Label']]
                              .itertuples(index=False, name=None)
]

In [None]:
%%appyter code_exec

# One scatter trace per class label so that each class gets its own legend entry
fig = go.Figure()
for label in set(X_reduced_df['Label']):
    X_plot = X_reduced_df.loc[X_reduced_df['Label'] == label].sort_values('Label')
    fig.add_trace(go.Scatter(
        x=X_plot['Component 1'],
        y=X_plot['Component 2'],
        mode='markers',
        name=label,
        text=X_plot['text'],
        marker={
            # Even labels use the first colour, odd labels the second
            'color': ('#0d0887', '#f0f921')[label%2],
            'size': 8,
            'symbol': X_plot['marker symbol'],
            'line': {'width': 1, 'color': 'white'},
        },
    ))
fig.update_layout(
    title_text='Known Labels ({{ visualization_reduction.raw_value }})',
    xaxis_title='Component 1',
    yaxis_title='Component 2',
    legend_title_text='Target Label',
    template='simple_white',
    height=600,
    width=800,
)
fig.show()

## Machine Learning

In [None]:
%%appyter hide
{# Optional reducer for the learning pipeline; each value is a Python expression building a
   64-component estimator, and 'None' maps to the literal None (presumably meaning the step
   is skipped downstream -- confirm where the field is rendered). #}
{% set dimensionality_reduction = ChoiceField(
    name='dimensionality_reduction',
    label='Dimensionality Reduction Algorithm',
    description='A dimensionality reduction algorithm should be selected to improve the quality of the classifier.',
    default='None',
    choices={
        'None': 'None',
        'PCA': 'sk.decomposition.PCA(n_components=64)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=64)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=64)',
        'ICA': 'sk.decomposition.FastICA(n_components=64)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=64)',
    },
    section='SETTINGS'
) %}
{# Optional feature-selection step; each value is the Python expression rendered into the
   notebook. SelectKBest must be given the scoring function itself (a callable), so the
   sklearn functions are referenced directly rather than by name as strings. #}
{% set feature_selection = ChoiceField(
    name='feature_selection',
    label='Machine Learning Feature Selection',
    default='None',
    choices={
        'None': 'None',
        'SelectFromLinearSVC': 'sk.feature_selection.SelectFromModel(sk.svm.LinearSVC(loss="squared_hinge", penalty="l1", dual=False, class_weight="balanced"))',
        'SelectFromExtraTrees': 'sk.feature_selection.SelectFromModel(sk.ensemble.ExtraTreesClassifier(class_weight="balanced"))',
        'RecursiveSelectionFromExtraTrees': 'sk.feature_selection.RFE(sk.ensemble.ExtraTreesClassifier(class_weight="balanced"), n_features_to_select=256, step=0.1)',
        'SelectKBest': 'sk.feature_selection.SelectKBest(sk.feature_selection.f_classif)',
        'SelectKBestChi2': 'sk.feature_selection.SelectKBest(sk.feature_selection.chi2)',
        'SelectKBestMultiInfo': 'sk.feature_selection.SelectKBest(sk.feature_selection.mutual_info_classif)',
    },
    section='SETTINGS'
) %}
{# Cross-validation splitter class; rendered as the callable that the CV cell invokes with
   n_splits. The two bare names are the classes imported from the local `split` module. #}
{% set cv_algorithm = ChoiceField(
    name='cv_algorithm',
    label='Cross Validation Algorithm',
    default='RepeatedStratifiedGroupKFold',
    choices={
        'KFold': 'sk.model_selection.KFold',
        'GroupKFold': 'sk.model_selection.GroupKFold',
        'RepeatedKFold': 'sk.model_selection.RepeatedKFold',
        'StratifiedKFold': 'sk.model_selection.StratifiedKFold',
        'StratifiedGroupKFold': 'StratifiedGroupKFold',
        'RepeatedStratifiedKFold': 'sk.model_selection.RepeatedStratifiedKFold',
        'RepeatedStratifiedGroupKFold': 'RepeatedStratifiedGroupKFold'
    },
    section='SETTINGS',
) %}
{# Classifier choice; each value is a Python expression that constructs the estimator. #}
{% set algorithm = ChoiceField(
    name='algorithm',
    label='Machine Learning Algorithm',
    default='RandomForestClassifier',
    description='A machine learning algorithm should be selected to construct the predictive model.',
    choices={
        'GradientBoostingClassifier': 'sk.ensemble.GradientBoostingClassifier()',
        'RandomForestClassifier': 'sk.ensemble.RandomForestClassifier(class_weight="balanced", n_jobs=-1)',
        'AdaBoostClassifier': 'sk.ensemble.AdaBoostClassifier()',
        'ExtraTreesClassifier': 'sk.ensemble.ExtraTreesClassifier(class_weight="balanced", n_jobs=-1)',
        'DecisionTreeClassifier': 'sk.tree.DecisionTreeClassifier(class_weight="balanced")',
        'KNeighborsClassifier': 'sk.neighbors.KNeighborsClassifier()',
        'RadiusNeighborsClassifier': 'sk.neighbors.RadiusNeighborsClassifier()',
        'MLPClassifier': 'sk.neural_network.MLPClassifier()',
        'OneClassSVM': 'sk.svm.OneClassSVM()',
    },
    section='SETTINGS'
) %}
{# Whether the classifier is described (and presumably wrapped downstream) as calibrated. #}
{% set calibrated = BoolField(
    name='calibrated',
    label='Calibrate algorithm predictions',
    description='Calibrate the prediction probabilities eliminating model-imparted bias.',
    default=True,
    section='SETTINGS',
) %}
{# Hyper-parameter search wrapper class; 'None' maps to the literal None. #}
{% set hyper_param_search = ChoiceField(
    name='hyper_param_search',
    label='Hyper Parameter Search Type',
    default='None',
    description='Hyper parameter searching is used to automatically select the best parameters (using the primary metric as the criteria).',
    choices={
        'None': 'None',
        'RandomizedSearchCV': 'sk.model_selection.RandomizedSearchCV',
        'GridSearchCV': 'sk.model_selection.GridSearchCV',
    },
    section='SETTINGS'
) %}
{# Number of folds (2-10); rendered as the n_splits argument of the CV splitter. #}
{% set cross_validation_n_folds = IntField(
    name='cross_validation_n_folds',
    label='Cross-Validated Folds',
    description='Cross validation is employed as a strategy to train the model on data that the model has not seen before, more folds will ensure that the model is generalizing well.',
    default=5,
    min=2,
    max=10,
    section='SETTINGS'
) %}
{# Metric name rendered into the notebook as the `primary_metric` string. #}
{% set primary_metric = ChoiceField(
    name='primary_metric',
    label='Primary Evaluation Metric',
    default='roc_auc',
    description='The primary evaluation metric is used for deciding how we assess the performance of our model.',
    choices=[
        'accuracy',
        'adjusted_mutual_info_score',
        'adjusted_rand_score',
        'average_precision',
        'balanced_accuracy',
        'completeness_score',
        'explained_variance',
        'f1',
        'f1_macro',
        'f1_micro',
        'f1_weighted',
        'fowlkes_mallows_score',
        'homogeneity_score',
        'jaccard',
        'jaccard_macro',
        'jaccard_micro',
        'jaccard_weighted',
        'max_error',
        'mutual_info_score',
        'neg_brier_score',
        'neg_log_loss',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_mean_squared_log_error',
        'neg_median_absolute_error',
        'neg_root_mean_squared_error',
        'normalized_mutual_info_score',
        'precision',
        'precision_macro',
        'precision_micro',
        'precision_weighted',
        'r2',
        'recall',
        'recall_macro',
        'recall_micro',
        'recall_weighted',
        'roc_auc',
        'roc_auc_ovo',
        'roc_auc_ovo_weighted',
        'roc_auc_ovr',
        'roc_auc_ovr_weighted',
        'v_measure_score'
    ],
    section='SETTINGS'
) %}
{# Additional metrics (same choice list as primary_metric); rendered as the `evaluation_metrics` list. #}
{% set evaluation_metrics = MultiChoiceField(
    name='evaluation_metrics',
    label='Evaluation Metrics',
    default=[],
    description='Additional evaluation metrics can be specified, these metrics will also be reported for all models trained.',
    value=[],
    choices=[
        'accuracy',
        'adjusted_mutual_info_score',
        'adjusted_rand_score',
        'average_precision',
        'balanced_accuracy',
        'completeness_score',
        'explained_variance',
        'f1',
        'f1_macro',
        'f1_micro',
        'f1_weighted',
        'fowlkes_mallows_score',
        'homogeneity_score',
        'jaccard',
        'jaccard_macro',
        'jaccard_micro',
        'jaccard_weighted',
        'max_error',
        'mutual_info_score',
        'neg_brier_score',
        'neg_log_loss',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_mean_squared_log_error',
        'neg_median_absolute_error',
        'neg_root_mean_squared_error',
        'normalized_mutual_info_score',
        'precision',
        'precision_macro',
        'precision_micro',
        'precision_weighted',
        'r2',
        'recall',
        'recall_macro',
        'recall_micro',
        'recall_weighted',
        'roc_auc',
        'roc_auc_ovo',
        'roc_auc_ovo_weighted',
        'roc_auc_ovr',
        'roc_auc_ovr_weighted',
        'v_measure_score'
    ],
    section='SETTINGS',
) %}
{# Primary metric first, followed by the additional metrics. #}
{% set all_metrics = [primary_metric.value] + evaluation_metrics.value %}

In [None]:
%%appyter markdown

We apply a {% if hyper_param_search.value != 'None' %}{{ hyper_param_search.raw_value }} search for the hyper parameters
of a {% endif %}sklearn pipeline with a dimensionality reduction step of {{ dimensionality_reduction.raw_value }}
{% if feature_selection.value != 'None' %}and a feature selection step of {{ feature_selection.raw_value }}
{% endif %} and a{% if calibrated.value %} calibrated{%endif %} {{ algorithm.raw_value }} classifier
using {{ cross_validation_n_folds.value }}-fold {{ cv_algorithm.raw_value }} cross-validation,
optimizing {{ primary_metric.value }}{% if evaluation_metrics.value %} and computing {{ ', '.join(evaluation_metrics.value) }}{% endif %}.

This can take a long time: a hyperparameter search fits `n_iter` candidate models, each on `n_splits` cross-validation splits, and computes every selected metric on a matrix of `X.shape[0] × X.shape[1]` values. Training time also grows with the size of each model, which is determined by the ranges specified in the parameter grid.

In [None]:
%%appyter code_exec
{% if algorithm.value == 'GradientBoostingClassifier' %}
## Early stopping function
def early_stopping(n_rounds, tol=0.001):
    """Build a callback that requests a stop once the training score stops changing.

    The returned callable takes ``(i, self, local)``: the iteration index, the
    estimator being fitted and a mapping of local variables (unused). It reads
    ``self.train_score_[i]`` and keeps its counters as attributes on the
    estimator itself.

    Args:
        n_rounds: number of consecutive "flat" iterations that are tolerated;
            the callback returns True on the next flat iteration after that.
        tol: two consecutive scores closer than this count as flat.

    Returns:
        A callable returning True to request a stop and False to continue.
    """
    def early_stopping_func(i, self, local):
        if i == 0:
            # Iteration 0 starts a new fit: ignore counters a previous fit left on the estimator
            rounds, last = 0, None
        else:
            rounds = getattr(self, '__rounds', 0)
            last = getattr(self, '__last', None)
        current = self.train_score_[i]
        # Test against None explicitly: a score of exactly 0.0 is a valid value
        # and must still be compared with the next one.
        if last is not None and current is not None and abs(current - last) < tol:
            rounds += 1
            if rounds > n_rounds:
                return True
        else:
            rounds = 0
        setattr(self, '__last', current)
        setattr(self, '__rounds', rounds)
        return False
    return early_stopping_func
{% endif %}

{#
param_grid = {
    'reduce_dim__n_components': randint(2, 1024),
{% if algorithm.value == 'GradientBoostingClassifier' %}
    'clf__loss': ['deviance', 'exponential'],
    'clf__learning_rate': randfloat(0.001, 1.),
    'clf__subsample': randfloat(0.01, 1.),
{% elif algorithm.value == 'RandomForestClassifier' %}
    'clf__oob_score': [True],
    'clf__criterion': ['gini', 'entropy'],
{% endif %}
    'clf__n_estimators': randint(10, 200),
    'clf__max_depth': randint(20, 50),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__min_impurity_decrease': randfloat(0., 0.2),
    'clf__min_weight_fraction_leaf': randfloat(0., 0.5),
}

fit_params = {
{% if algorithm.value == 'GradientBoostingClassifier' %}
    'clf__monitor': early_stopping(5),
{% endif %}
}
#}

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
# Number of repetitions handed to the repeated splitter below; later cells reuse the same
# value as the sample size when computing t-statistics and simulated null distributions.
n_repeats=5
{% endif %}
# Instantiate the cross-validation splitter chosen in the form. Repeated splitters receive
# n_repeats, every other splitter is asked to shuffle; both variants are seeded with rng.
# NOTE(review): shuffle=True / random_state are passed to every non-repeated splitter --
# confirm each selectable splitter (e.g. GroupKFold) accepts them in the pinned scikit-learn.
cv = {{ cv_algorithm }}(
    n_splits={{ cross_validation_n_folds }},
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    n_repeats=n_repeats,
    {% else %}
    shuffle=True,
    {% endif %}
    random_state=rng,
)

{% if cv_algorithm.raw_value in ['GroupKFold', 'StratifiedGroupKFold', 'RepeatedStratifiedGroupKFold'] %}
# One group label per row of X: the first 14 characters of the row's index key.
# presumably X.index holds InChI Keys (whose first block encodes connectivity) -- verify against the loader
groups=[key[:14] for key in X.index]    # Group compounds by atom connectivity
{% endif %}

# Scoring parameters
# primary_metric / evaluation_metrics carry the form selections into Python;
# scoring_params maps every selected metric name to the scorer returned by metrics.get_scorer.
primary_metric = '{{ primary_metric }}'
evaluation_metrics = {{ evaluation_metrics }}
scoring_params = {k: metrics.get_scorer(k)
                  for k in [primary_metric, *evaluation_metrics]}

In [None]:
%%appyter code_exec
{% if hyper_param_search.value == 'None' %}

df_results = pd.DataFrame()

# Store performance on each split for computing ROC and PRC curves
fprs = []
tprs = []
precs = []
recs = []

# Store cross-validation test predictions and folds
y_proba_cv = [[] for _ in range(len(y))]
folds_cv = [[] for _ in range(len(y))]

# Store models
models = []

{% if cv_algorithm.raw_value in ['GroupKFold', 'StratifiedGroupKFold', 'RepeatedStratifiedGroupKFold'] %}
groups=[key[:14] for key in X.index]    # Group compounds by atom connectivity
for fold, (train, test) in tqdm(enumerate(cv.split(X.values, y, groups=groups))):
{% else %}
for fold, (train, test) in tqdm(enumerate(cv.split(X.values, y))):
{% endif %}
    model =
    {%- if hyper_param_search.value != 'None' %} {{ hyper_param_search }}({% endif -%}
            sk.pipeline.Pipeline([
                {%- if dimensionality_reduction.value != 'None' %}
                ('reduce_dim', {{ dimensionality_reduction }}),
                {% endif %}
                {%- if feature_selection.value != 'None' %}
                ('feature_selection', {{ feature_selection }}),
                {% endif %}
                ('clf', {{ algorithm }}),
            ])
    {%- if hyper_param_search.value != 'None' %}){% endif %}
    model.fit(X.values[train], y[train])
    
    {% if calibrated.value %}
    calibrator = sk.calibration.CalibratedClassifierCV(model, cv='prefit')
    calibrator.fit(X.values[test], y[test])
    model = calibrator
    {% endif %}
    
    {% for metric in all_metrics %}
    df_results.loc[fold, '{{ metric }}'] = scoring_params['{{ metric }}'](model, X.values[test], y[test])
    {% endfor %}
    
    y_proba = model.predict_proba(X.values[test]) # Probability prediction will be True
    for i in range(len(test)):
        y_proba_cv[test[i]].append(y_proba[i, 1])
        folds_cv[test[i]].append(fold % {{ cross_validation_n_folds }})
    model_fpr, model_tpr, _ = metrics.roc_curve(y[test], y_proba[:, 1])
    model_prec, model_rec, _ = metrics.precision_recall_curve(y[test], y_proba[:, 1])
    fprs.append(model_fpr)
    tprs.append(model_tpr)
    precs.append(model_prec)
    recs.append(model_rec)
    models.append(model)

assert not(any(len(probs) == 0 for probs in y_proba_cv)), 'All probabilities should have been calculated'

display(df_results.agg(['mean', 'std']))
{% else %}
model.fit(X.values, y)
df_results = model.cv_results_
{% endif %}

This visualization shows the cross-validated performance of the model. Low fold variance and high AUC are desired in a well-generalized model.
* ROC curve: [roc.svg](./roc.svg)
* Precision-recall curve: [prc.svg](./prc.svg)
* Confusion matrix: [confusion_matrix.svg](./confusion_matrix.svg)

In [None]:
%%appyter code_exec

fig, ax = plt.subplots()

tprs_interp = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, (fpr, tpr) in enumerate(zip(fprs, tprs)):
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.
    roc_auc = metrics.auc(fpr, tpr)
    tprs_interp.append(tpr_interp)
    aucs.append(roc_auc)
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    ax.plot(fpr, tpr, alpha=0.4)
    {% else %}
    ax.plot(fpr, tpr, alpha=0.4, label='ROC Fold %d (AUC=%0.3f)' % (fold, roc_auc))
    {% endif %}

mean_tpr = np.mean(tprs_interp, axis=0)
mean_tpr[-1] = 1.0
mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs_interp, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2)

ax.plot([0,1],[0,1],'--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
plt.savefig('roc.svg')
plt.show()

z = (mean_auc - 0.5)/std_auc
cl = sp.stats.norm.cdf(z) * 100
ci = sp.stats.norm.interval(0.95, loc=mean_auc, scale=std_auc)
print('Confidence interval (95%)', ci)
print("We are %0.3f %% confident the model's results are not just chance." % (cl))
if cl > 95:
    print('This is statistically significant. These results can be trusted.')
else:
    print('This is not statistically significant. These results should not be trusted.')

In [None]:
%%appyter code_exec

fig, ax = plt.subplots()

precs_interp = []
prc_aucs = []
mean_rec = np.linspace(0, 1, 100)

for fold, (rec, prec) in enumerate(zip(recs, precs)):
    prec_interp = np.interp(mean_rec, rec[::-1], prec[::-1])
    prc_auc = metrics.auc(rec, prec)
    precs_interp.append(prec_interp)
    prc_aucs.append(prc_auc)
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    ax.plot(rec, prec, alpha=0.4)
    {% else %}
    ax.plot(rec, prec, alpha=0.4, label='PRC Fold %d (AUC=%0.3f)' % (fold, prc_auc))
    {% endif %}
    
mean_prec = np.mean(precs_interp, axis=0)
mean_auc = sk.metrics.auc(mean_rec, mean_prec)
std_auc = np.std(prc_aucs)
ax.plot(mean_rec, mean_prec, color='b',
         label=r'Mean PRC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_prec = np.std(precs_interp, axis=0)
precs_upper = np.minimum(mean_prec + std_prec, 1)
precs_lower = np.maximum(mean_prec - std_prec, 0)
plt.fill_between(mean_rec, precs_lower, precs_upper, color='grey', alpha=.2)

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()
plt.savefig('prc.svg')
plt.show()

In [None]:
# Average each sample's held-out probabilities and call it a positive above 0.5
y_pred_cv = np.array([np.mean(probs) for probs in y_proba_cv]) > 0.5
confusion = metrics.confusion_matrix(y, y_pred_cv)

plt.title('Confusion Matrix (Cross-Validation)')
sns.heatmap(confusion, annot=True, cmap=plt.cm.Blues, fmt='g')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix.svg')
plt.show()

## Examine drug predictions

Using the binary classification model, we can rank the drug hits by their predicted score. The model can also be used to identify additional drugs that are likely to share properties with the hits. The results table is available at [drug_cv_predictions.csv](./drug_cv_predictions.csv).

### Plot distribution of predictions for positive and negative classes

In [None]:
%%appyter code_exec

# Calculate mean and deviation of predictions
y_probas = np.array([np.mean(probs) for probs in y_proba_cv])
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
y_probas_std = np.array([np.std(probs) for probs in y_proba_cv])
# Find minimum non-zero standard deviation to avoid dividing by zero when computing t-statistic
min_y_probas_std = max(np.min(y_probas_std[y_probas_std != 0]), 1e-10)
t_stats = (y_probas - np.mean(y_probas)) / (np.maximum(y_probas_std, min_y_probas_std)/np.sqrt(n_repeats))
# Calculate p-value using one-sample t-test
p_vals_t = 1-sp.stats.t(n_repeats-1).cdf(t_stats)
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
y_probas_means_5 = []
y_probas_values = np.array(y_proba_cv).flatten()

np.random.seed(rng)
for i in tqdm(range(100000)):
    y_probas_means_5.append(np.mean(np.random.choice(y_probas_values, n_repeats)))
    
y_probas_means_5 = np.array(sorted(y_probas_means_5))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
y_probas_ts_5 = []
mean_y_probas = np.mean(y_probas)
y_probas_values = np.array(y_proba_cv).flatten()

np.random.seed(rng)
for i in tqdm(range(100000)):
    sample = np.random.choice(y_probas_values, n_repeats)
    y_probas_ts_5.append((np.mean(sample) - mean_y_probas) / (np.maximum(np.std(sample), min_y_probas_std)/np.sqrt(n_repeats)))
    
y_probas_ts_5 = np.array(sorted(y_probas_ts_5))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
max_mean = np.max(y_probas_means_5)
p_vals = np.array(list(tqdm((1 - np.argwhere(y_probas_means_5 >= min(pred, max_mean))[0][0] / len(y_probas_means_5)
                             for pred in y_probas), total=len(y_probas))))
{% endif %}

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
max_t = np.max(y_probas_ts_5)
p_vals_t_sim = np.array(list(tqdm((1 - np.argwhere(y_probas_ts_5 >= min(t, max_t))[0][0] / len(y_probas_ts_5)
                                   for t in t_stats), total=len(t_stats))))
{% endif %}

In [None]:
%%appyter code_exec

sns.kdeplot(y_probas[y == 0], shade=True, gridsize=2000, clip=[np.min(y_probas), np.percentile(y_probas, 99.9)], bw=0.01, label='Not known NSAID')
sns.kdeplot(y_probas[y == 1], shade=True, gridsize=2000, clip=[np.min(y_probas), np.percentile(y_probas, 99.9)], bw=0.002, label='Known NSAID')
{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
sns.kdeplot(y_probas_means_5, shade=True, gridsize=2000, clip=[np.min(y_probas), np.percentile(y_probas, 99.9)], bw=0.01, label='Null distribution\n(simulated)')
{% endif %}
plt.xlabel('Mean Predicted Probability')
plt.xlim([np.min(y_probas), np.percentile(y_probas, 99)])
plt.legend()
plt.savefig('mean-prediction-distribution-kde.svg')
plt.show()

In [None]:
%%appyter code_exec

{% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
sns.kdeplot(t_stats[y == 0], shade=True, gridsize=1000, clip=(-20, 20), bw=0.1, label='Not known NSAID')
sns.kdeplot(t_stats[y == 1], shade=True, gridsize=1000, clip=(-20, 20), bw=0.05, label='Known NSAID')
sns.kdeplot(y_probas_ts_5, shade=True, gridsize=1000, clip=(-20, 20), bw=0.1, label='Null distribution\n(simulated)')
plt.xlabel('t-statistic')
plt.xlim([-20,20])
plt.legend()
plt.savefig('t-statistic-distribution-kde.svg')
plt.show()
{% endif %}

### Overlay predictions on visualization of input space

In [None]:
%%appyter code_exec

# Add attributes for plotting to Dataframe
X_reduced_df['Predicted Probability'] = y_probas
X_reduced_df['log10(pred)'] = np.log10(y_probas + 1e-10)
X_reduced_df['p-value'] = p_vals_t_sim
X_reduced_df['log10(p-value)'] = np.log10(X_reduced_df['p-value'])
X_reduced_df['Predicted Probability'] = y_probas
X_reduced_df['Standard Deviation'] = y_probas_std
X_reduced_df['Cross-validation fold'] = folds_cv
X_reduced_df['marker size'] = 2*np.minimum(2-np.log10(X_reduced_df['p-value']), 5)
X_reduced_df['text'] = ['<br>'.join(['Drug Name: ' + str(name),
                                     'InChI Key: ' + str(inchi),
                                     'Predicted Probability: {:.1e}'.format(p),
                                     'Standard Deviation: {:.1e}'.format(s),
                                     'p-value: {:.1e}'.format(p_val),
                                     'Label: ' + str(label),
                                     'Cross-validation fold: ' + str(fold)])
                  for name, inchi, p, s, p_val, label, fold in zip(X_reduced_df['Drug Name'],
                                                         X_reduced_df['InChI Key'],
                                                         X_reduced_df['Predicted Probability'],
                                                         X_reduced_df['Standard Deviation'],
                                                         X_reduced_df['p-value'],
                                                         X_reduced_df['Label'],
                                                         X_reduced_df['Cross-validation fold'])]
X_reduced_df.to_csv('X_reduced_df.csv')

# Helper function for formatting Plotly colorbar
def colorbar_param(values_log10, **kwargs):
    """Build colorbar tick settings for values that are plotted on a log10 scale.

    Ticks are placed at 1x and 3x every power of ten spanning the data; they are
    positioned in log10 units and labelled with the un-logged value in
    scientific notation.

    Args:
        values_log10: log10-transformed values shown on the colour axis.
        **kwargs: extra entries copied into the returned dict (e.g. ``title``).

    Returns:
        dict with ``ticktext`` (list of str), ``tickvals`` (list of log10
        positions) and every keyword argument that was passed in.
    """
    lowest_decade = np.floor(np.min(values_log10))
    highest_decade = np.ceil(np.max(values_log10))

    # 1eK for every decade K in range, plus 3eK between consecutive decades
    decade_ticks = 10**np.arange(lowest_decade, highest_decade+1)
    third_ticks = 3*10**np.arange(lowest_decade, highest_decade)
    tick_values = np.sort(np.concatenate([decade_ticks, third_ticks]))

    return dict(
        ticktext=['{:.0e}'.format(value) for value in tick_values],
        tickvals=list(np.log10(tick_values)),
        **kwargs
    )

fig = go.Figure()
# The colour range is shared by every trace: from the median to the maximum of log10(prediction)
log_pred_cmin = np.percentile(X_reduced_df['log10(pred)'], 50)
log_pred_cmax = np.max(X_reduced_df['log10(pred)'])
# One trace per target label; points are sorted so the highest predictions are drawn last
for label in sorted(set(X_reduced_df['Label'])):
    X_plot = X_reduced_df[X_reduced_df['Label'] == label].sort_values(['Predicted Probability'])
    marker_style = dict(
        color=X_plot['log10(pred)'],
        cmin=log_pred_cmin,
        cmax=log_pred_cmax,
        size=X_plot['marker size'],
        colorbar=colorbar_param(X_plot['log10(pred)'], title='Predicted Probability'),
        symbol=X_plot['marker symbol'],
        line_width=1,
        colorscale='plasma'
    )
    fig.add_trace(go.Scatter(
        mode='markers',
        x=X_plot['Component 1'], y=X_plot['Component 2'],
        text=X_plot['text'],
        name=label,
        marker=marker_style,
    ))
# Legend is anchored inside the plot area, top-left
legend_position = dict(yanchor="top", y=0.98, xanchor="left", x=0.02)
fig.update_layout(
    height=600, width=800,
    xaxis_title='Component 1',
    yaxis_title='Component 2',
    title_text='Predicted Probabilities ({{ visualization_reduction.raw_value }})',
    legend_title_text='Target Label',
    legend=legend_position,
    template='simple_white',
)
fig.show()

### Tables of top-predicted compounds

In [None]:
%%appyter code_exec
# Obtain prediction results
results = pd.DataFrame(np.array([
    querysepl1000fwd.get_drug_names(X.index),
#     Drugmonizome.get_drug_names(X.index),
    folds_cv,
    y,
    y_probas,
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    y_probas_std,
    t_stats,
    p_vals,
    p_vals_t,
    p_vals_t_sim,
    {% endif %}
]).T, columns=[
    'Name (L1000FWD)',
#     'Name (Drugmonizome)',
    'Cross-validation fold',
    'Known',
    'Prediction Probability',
    {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
    'Prediction Probability Std. Dev.',
    't statistic',
    'p value (simulated mean distribution)',
    'p value (one sample t test)',
    'p value (simulated t distribution)',
    {% endif %}
], index=X.index).astype({'Known': 'bool',
                          'Prediction Probability': 'float64',
                          {% if cv_algorithm.raw_value in ['RepeatedStratifiedKFold', 'RepeatedStratifiedGroupKFold'] %}
                          'Prediction Probability Std. Dev.': 'float64',
                          't statistic': 'float64',
                          'p value (simulated mean distribution)': 'float64',
                          'p value (one sample t test)': 'float64',
                          'p value (simulated t distribution)': 'float64',{% endif %}})

results.to_csv('drug_cv_predictions.csv')

In [None]:
# Rank drug hits
# 'Known' is a boolean column, so it can be used directly as the row mask
results.loc[results['Known']].sort_values('Prediction Probability', ascending=False)

In [None]:
# Predict additional drugs
# Top 25 compounds that are not known positives, highest predicted probability first
results.loc[~results['Known']].sort_values('Prediction Probability', ascending=False).head(25)

## Examine feature importances

The relative contribution of each input feature to the final model predictions can be estimated for recursive feature selection and for a variety of tree-based models. Note that this analysis is not available if a dimensionality reduction algorithm is used.

In [None]:
%%appyter markdown

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
When recursive feature selection is performed, the features are ranked by the stage at which they were removed.
Selected (i.e. estimated best) features have rank 1. The ranks are averaged across cross-validation
splits to produce an average importance score. The full feature importance table is available at
[feature_importance.csv](./feature_importance.csv).
{% endif %}

In [None]:
%%appyter code_exec

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
all_rankings = []
{% endif %}
{% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
all_feature_importances = []
{% endif %}
for model in models:
    {% if calibrated.value %}
    for calibrated_clf in model.calibrated_classifiers_:
        pipeline = calibrated_clf.base_estimator
    {% else %}
        pipeline = model
    {% endif %}
        
        {% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' %}
        ranking = pipeline['feature_selection'].ranking_
        all_rankings.append(ranking)
        {% endif %}
        
        {% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier'] %}
        {% if feature_selection.raw_value != 'None' %}
        feature_importances = np.zeros(pipeline['feature_selection'].get_support().shape)
        feature_importances[pipeline['feature_selection'].get_support()] = pipeline['clf'].feature_importances_
        {% else %}
        feature_importances = pipeline['clf'].feature_importances_
        {% endif %}
        all_feature_importances.append(feature_importances)
        {% endif %}

In [None]:
%%appyter code_exec

{% if dimensionality_reduction.raw_value == 'None' %}
df_feat_imp = pd.DataFrame({'Feature': X.columns,
                            'Dataset': reduce(lambda a,b: a+b, ([dataset]*size for dataset, size in dataset_sizes)),
                            {% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' %}
                            'Ranking Mean': np.mean(all_rankings, axis=0),
                            'Ranking Std. Dev.': np.std(all_rankings, axis=0),
                            {% endif %}
                            'Importance Mean': np.mean(all_feature_importances, axis=0),
                            'Importance Std. Dev.': np.std(all_feature_importances, axis=0)})
df_feat_imp = df_feat_imp.set_index('Feature').sort_values('Importance Mean', ascending=False)
display(df_feat_imp.head(25))
df_feat_imp.to_csv('feature_importance.csv')
{% endif %}

In [None]:
%%appyter markdown

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
Plot the distribution of ranking scores for features in each dataset ([feature_importance_rfe.svg](./feature_importance_rfe.svg)).
Features with lower scores were retained for more rounds during recursive feature selection
and have greater relative importance.
{% endif %}

In [None]:
%%appyter code_exec

{% if feature_selection.raw_value == 'RecursiveSelectionFromExtraTrees' and dimensionality_reduction.raw_value == 'None' %}
fig, axs = plt.subplots(1, 2, figsize=(15, 5))
df_feat_imp = df_feat_imp.sort_values('Ranking Mean')
for dataset in set(df_feat_imp.Dataset):
    importance_scores = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Ranking Mean'].values
    importance_scores_std = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Ranking Std. Dev.'].values
    lower = importance_scores - importance_scores_std
    upper = importance_scores + importance_scores_std
    axs[0].plot(importance_scores, label=dataset)
    axs[0].fill_between(np.arange(len(importance_scores)), lower, upper, alpha=.2)
    axs[1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
    axs[1].fill_between(np.linspace(0, 1, len(importance_scores)), lower, upper, alpha=.2)
for i in [0, 1]:
    axs[i].legend()
    axs[i].set_title('Distribution of feature ranking from recursive feature elimination')
    axs[i].set_ylabel('Average feature ranking\n(lower ranking is more important)')
axs[0].set_xlabel('Ranked features (absolute count)')
axs[1].set_xlabel('Ranked features (relative count)')
axs[0].set_xlim([0,512])
plt.tight_layout()
plt.savefig('feature_importance_rfe.svg')
plt.show()
{% endif %}

In [None]:
%%appyter code_exec

{% if algorithm.raw_value in ['GradientBoostingClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier']  and dimensionality_reduction.raw_value == 'None' %}
fig, axs = plt.subplots(2, 2, figsize=(15, 10))
df_feat_imp = df_feat_imp.sort_values('Importance Mean', ascending=False)
for dataset in set(df_feat_imp.Dataset):
    importance_scores = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Mean'].values
    importance_scores_std = df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Std. Dev.'].values
    lower = importance_scores - importance_scores_std
    upper = importance_scores + importance_scores_std
    axs[0][0].plot(importance_scores, label=dataset)
    axs[0][0].fill_between(np.arange(len(importance_scores)), lower, upper, alpha=.2)
    axs[0][1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
    axs[0][1].fill_between(np.linspace(0, 1, len(importance_scores)), lower, upper, alpha=.2)
    
    importance_scores = np.cumsum(df_feat_imp.loc[df_feat_imp.Dataset == dataset]['Importance Mean'].values)
    axs[1][0].plot(importance_scores, label=dataset)
    axs[1][1].plot(np.linspace(0, 1, len(importance_scores)), importance_scores, label=dataset)
for i in [0, 1]:
    axs[0][i].legend()
    axs[0][i].set_title('Distribution of feature scores from model')
    axs[1][i].set_title('Cumulative distribution of feature scores from model')
    axs[i][0].set_xlabel('Ranked features (absolute count)')
    axs[i][1].set_xlabel('Ranked features (relative count)')
    axs[0][i].set_ylabel('Average feature importance\n(higher score is more important)')
    axs[1][i].set_ylabel('Cumulative sum of feature importance')
    axs[i][0].set_xlim([0,512])
plt.tight_layout()
plt.savefig('feature_importance.svg')
plt.show()
{% endif %}