In [None]:
#%%appyter init
# Put the parent directory first on the import path before importing appyter
import os, sys; sys.path.insert(0, os.path.realpath('..'))
from appyter import magic
# Initialise the appyter magics; the callback passed in returns the result of globals()
magic.init(lambda _=globals: _())

In [None]:
%matplotlib inline
# Imports
## Data processing
import pandas as pd
import numpy as np
import scipy as sp
## Machine Learning
import sklearn as sk
from sklearn import (
    calibration,
    decomposition,
    ensemble,
    feature_extraction,
    feature_selection,
    linear_model,
    manifold,
    metrics,
    model_selection,
    multioutput,
    neighbors,
    neural_network,
    pipeline,
    preprocessing,
    svm,
    tree,
)
import umap
## Plotting
from matplotlib import pyplot as plt
import seaborn as sns
## Drugmonizome API
from drugmonizome import Drugmonizome
# Utility
import os
import re
import json
from functools import reduce
from IPython.display import display

In [None]:
# Seed handed to the cross-validation splitter as `random_state` further down
rng = 2020
# Remove pandas' row-display limit so displayed frames are not truncated
pd.set_option('display.max_rows', None)

## Select Input Datasets and Target Classes

Selected drug set libraries and omics datasets are downloaded and joined on the drug identifier to produce a large association matrix. A machine learning model is then trained to predict the specified target labels from this matrix. This is a binary classification task: the trained model can be used to predict which drugs are likely to be associated with the target class.

In [None]:
%%appyter hide
{# Declares the 'ATTRIBUTES' form section; fields join it by passing section='ATTRIBUTES'. #}
{% do SectionField(
    title='Attribute Dataset Selection',
    subtitle='Select the input datasets to use for learning and classification. \
              A model will be trained to predict the target labels from the selected attributes. \
              If no datasets are selected, default attributes will be used.',
    name='ATTRIBUTES',
    img='attributes.png',
) %}

{# Multi-select of L1000FWD datasets; nothing is selected by default. #}
{% set exprdatasets = MultiChoiceField(
    name='exprdatasets',
    label='L1000',
    choices=[
        'L1000FWD Downregulated GO Biological Processes',
        'L1000FWD Downregulated GO Cellular Components',
        'L1000FWD Downregulated GO Molecular Function',
        'L1000FWD Downregulated KEGG Pathways',
        'L1000FWD Downregulated Signatures',
        'L1000FWD Predicted Side Effects',
        'L1000FWD Upregulated GO Biological Process',
        'L1000FWD Upregulated GO Cellular Components',
        'L1000FWD Upregulated GO Molecular Function',
        'L1000FWD Upregulated KEGG Pathways',
        'L1000FWD Upregulated Signatures',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Multi-select of drug-target and associated-gene datasets; nothing is selected by default. #}
{% set targetdatasets = MultiChoiceField(
    name='targetdatasets',
    label='Drug Targets and Associated Genes',
    choices=[
        'Downregulated CREEDS Signatures',
        'Upregulated CREEDS Signatures',
        'DrugCentral Targets',
        'DrugRepurposingHub Drug Targets',
        'Drugbank Small Molecule Carriers',
        'Drugbank Small Molecule Enzymes',
        'Drugbank Small Molecule Targets',
        'Drugbank Small Molecule Transporters',
        'Geneshot Associated Genes',
        'Geneshot Predicted AutoRIF Genes',
        'Geneshot Predicted Coexpression Genes',
        'Geneshot Predicted Enrichr Genes',
        'Geneshot Predicted GeneRIF Genes',
        'Geneshot Predicted Tagger Genes',
        'KinomeScan Kinases',
        'PharmGKB Single Nucleotide Polymorphisms',
        'STITCH Targets',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Multi-select of indication, mechanism-of-action and side-effect datasets; nothing is selected by default. #}
{% set indicationdatasets = MultiChoiceField(
    name='indicationdatasets',
    label='Indications, Modes of Action, and Side Effects',
    choices=[
        'ATC Codes Drugsetlibrary',
        'DrugRepurposingHub Mechanisms of Action',
        'PharmGKB OFFSIDES Side Effects',
        'SIDER Indications',
        'SIDER Side Effects',
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Multi-select of structural feature datasets (currently a single option); nothing is selected by default. #}
{% set structuraldatasets = MultiChoiceField(
    name='structuraldatasets',
    label='Structural Features',
    choices=[
        'RDKIT MACCS Chemical Fingerprints'
    ],
    default=[],
    section='ATTRIBUTES'
) %}

{# Read by the dataset-merge template: true renders an outer join, false an inner join. #}
{% set keepmissing = BoolField(
    name='keepmissing',
    label='Keep drugs with missing data when joining datasets',
    description='Keep drugs that appear in some datasets and not in others. \
                 Missing data is filled in with zeros. Otherwise, only drugs \
                 that are present in all datasets are preserved.',
    default=False,
    section='ATTRIBUTES',
) %}

{# When true, the rendered code runs TfidfTransformer over the binarised input matrix. #}
{% set tfidf = BoolField(
    name='tfidf',
    label='Apply tf–idf normalization to binary inputs',
    description='For binary drug-attribute associations in the input matrix, \
                 apply tf-idf transformation to normalize data.',
    default=True,
    section='ATTRIBUTES',
) %}

{# Concatenate the selections of all four pickers; an empty result makes the download cell use its default datasets. #}
{% set attribute_datasets = exprdatasets.value +
                             targetdatasets.value +
                             indicationdatasets.value +
                             structuraldatasets.value %}

In [None]:
%%appyter markdown

To construct the input matrix, we download drug set libraries and omics datasets and join them on the drug ID.
{% if keepmissing.value %}
Drugs that appear in some datasets and not in others are preserved, and missing data is filled in with zeros.
{% else %}
Only drugs that are present in all datasets are preserved.
{% endif %}

In [None]:
%%appyter hide
{# Declares the 'TARGET' form section; fields join it by passing section='TARGET'. #}
{% do SectionField(
    title='Target Label Selection',
    subtitle='Upload a list of drugs to be given positive class labels for binary classification. \
              Drugs should be in a text file, specified by either drug name or InChI Key and separated by newlines. \
              If no file is selected, a default list of hits from COVID-19 drug screens will be used.',
    name='TARGET',
    img='target.png',
) %}

{# Identifier type used in the uploaded hit list; 'Drug Name' makes the loading cell convert names to InChI Keys. #}
{% set drugformat = ChoiceField(
    name='drugformat',
    label='Drug Identifier Format',
    default='Drug Name',
    choices=[
        'Drug Name',
        'InChI Key'
    ],
    section='TARGET'
) %}

{# Upload field for the positive-class drug list. #}
{# NOTE(review): the default here is 'all_hits.txt', while the loading cell only falls back to its bundled list when the value is '' - confirm which is intended. #}
{% set drughitlist = FileField(
    name='drughitlist',
    label='Upload List of Drug Hits',
    default='all_hits.txt',
    section='TARGET'
) %}

In [None]:
%%appyter code_exec

{% if attribute_datasets == [] %}
# No attribute datasets selected, so use default datasets
attribute_datasets = ['L1000FWD Downregulated Signatures',
                      'L1000FWD Upregulated Signatures',
                      'RDKIT MACCS Chemical Fingerprints']
{% else %}
# Use the selected attribute datasets
attribute_datasets = {{ attribute_datasets }}
{% endif %}

df_attributes = list(Drugmonizome.download_df(
    [dataset
     for dataset in attribute_datasets]
))

# Assemble all attribute datasets
if len(df_attributes) > 1:
    # Obtain merged dataframe with omics and target data
    df = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            {% if keepmissing.value %}
            how='outer', # Keep mis-matched indices
            {% else %}
            how='inner', # Keep only matched indices
            {% endif %}
        ),
        df_attributes,
    )
else:
    df = df_attributes[0]

# Entries that are missing after the join count as "no association"
df = df.fillna(0)
# Binarise: every non-zero entry becomes 1, everything else 0. A vectorised
# comparison replaces the per-cell `applymap` call (deprecated in pandas 2.1+).
X = (df != 0).astype('int64')
{% if tfidf.value %}
# Apply tf-idf normalization
transformer = feature_extraction.text.TfidfTransformer()
X_tfidf = transformer.fit_transform(X)
# Rebuild a DataFrame from the sparse result, restoring the original row and column labels
X = pd.DataFrame.sparse.from_spmatrix(X_tfidf, columns=X.columns, index=X.index)
{% endif %}
# Summarise the final feature matrix
print('Total shape:', X.shape)
display(X.head())

In [None]:
%%appyter markdown

The target labels are produced from the uploaded list of hits: 1 if the drug is specified as a hit, 0 otherwise.
{% if drugformat.value == 'Drug Name' %}
Drug names are matched to InChI Keys from the Drugmonizome metadata.
{% endif %}

In [None]:
%%appyter code_exec

{% if drughitlist.value == '' %}
# Using default list of hits from COVID-19 in vitro drug screens
hits_filename = '../../all_hits.txt'
{% else %}
# Using user-specified list of positive drug hits
hits_filename = {{drughitlist}}
{% endif %}

with open(hits_filename, 'r') as hits_file:
    drug_hits = set(drug.strip() for drug in hits_file.read().strip().split('\n') 
                    if len(drug.strip()) > 0)

{% if drugformat.value == 'Drug Name' %}
drug_hits = Drugmonizome.get_InChI_keys(drug_hits)
{% endif %}

We produce a target array containing 1 if the drug is specified as a hit and 0 otherwise.

In [None]:
# Flag each row label of X that appears among the hits, then store the flags as 0/1
is_hit = [drug_id in drug_hits for drug_id in X.index]
y = np.array(is_hit).astype(np.int8)
n_matched = y.sum()
print('Number of hits matched in input: %d (%0.3f %%)' % (n_matched, 100*n_matched/len(y)))

In [None]:
# Report the dimensions of the feature matrix and of the label vector
print('Input shape: %s' % (X.shape,))
print('Target shape: %s' % (y.shape,))

## Dimensionality Reduction and Visualization

In [None]:
%%appyter hide
{# Declares the 'SETTINGS' form section; fields join it by passing section='SETTINGS'. #}
{% do SectionField(
    title='Machine Learning Pipeline',
    subtitle='Select from available machine learning algorithms, their unique settings, and methods to use to evaluate the classifier.',
    name='SETTINGS',
    img='settings.png',
) %}

{# Reducer used only for the 2-D scatter plot; each choice maps a label to the constructor expression rendered into the code. #}
{% set visualization_reduction = ChoiceField(
    name='visualization_reduction',
    label='Data Visualization Method',
    description='A dimensionality reduction algorithm should be selected for data visualization.',
    default='UMAP',
    choices={
        'UMAP': 'umap.UMAP()',
        'NMF': 'sk.decomposition.NMF(n_components=2)',
        'PCA': 'sk.decomposition.PCA(n_components=2)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=2)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=2)',
        'ICA': 'sk.decomposition.FastICA(n_components=2)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=2)',
    },
    section='SETTINGS'
) %}

In [None]:
%%appyter markdown

We reduce the dimensionality of our omics feature space for visualization with {{ visualization_reduction.raw_value }}.

In [None]:
%%appyter code_exec
clf_dimensionality_reduction = {{ visualization_reduction }}
X_reduced = clf_dimensionality_reduction.fit_transform(X.values)
{% if visualization_reduction.raw_value == 'PCA' %}
print('Explained variance:', np.sum(clf_dimensionality_reduction.explained_variance_))
{% endif %}
plt.title('Low dimension representation')
plt.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=y,
)
plt.show()

## Machine Learning

In [None]:
%%appyter hide
{# Reducer rendered as the 'reduce_dim' step of the model pipeline; each choice maps a label to a constructor expression. #}
{% set dimensionality_reduction = ChoiceField(
    name='dimensionality_reduction',
    label='Dimensionality Reduction Algorithm',
    description='A dimensionality reduction algorithm should be selected to improve the quality of the classifier.',
    default='PCA',
    choices={
        'PCA': 'sk.decomposition.PCA(n_components=64)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=64)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=64)',
        'ICA': 'sk.decomposition.FastICA(n_components=64)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=64)',
    },
    section='SETTINGS'
) %}
{# Optional 'feature_selection' pipeline step; 'None' omits the step. #}
{# SelectKBest needs a callable score function, so the scikit-learn functions are referenced directly rather than by name. #}
{# NOTE(review): chi2 expects non-negative features; confirm it is usable after the selected dimensionality reduction step. #}
{% set feature_selection = ChoiceField(
    name='feature_selection',
    label='Machine Learning Feature Selection',
    default='None',
    choices={
        'None': 'None',
        'SelectFromLinearSVC': 'sk.feature_selection.SelectFromModel(sk.svm.LinearSVC(loss="squared_hinge", penalty="l1", dual=False))',
        'SelectFromExtraTrees': 'sk.feature_selection.SelectFromModel(sk.ensemble.ExtraTreesClassifier())',
        'SelectKBest': 'sk.feature_selection.SelectKBest(sk.feature_selection.f_classif)',
        'SelectKBestChi2': 'sk.feature_selection.SelectKBest(sk.feature_selection.chi2)',
        'SelectKBestMultiInfo': 'sk.feature_selection.SelectKBest(sk.feature_selection.mutual_info_classif)',
    },
    section='SETTINGS'
) %}
{# Cross-validation splitter class; the rendered code calls it with n_splits and further keyword arguments. #}
{# NOTE(review): value='KFold' is passed alongside default='StratifiedKFold' - confirm which one is meant to be the initial selection. #}
{% set cv_algorithm = ChoiceField(
    name='cv_algorithm',
    label='Cross Validation Algorithm',
    default='StratifiedKFold',
    value='KFold',
    choices={
        'KFold': 'sk.model_selection.KFold',
        'GroupKFold': 'sk.model_selection.GroupKFold',
        'RepeatedKFold': 'sk.model_selection.RepeatedKFold',
        'StratifiedKFold': 'sk.model_selection.StratifiedKFold',
        'RepeatedStratifiedKFold': 'sk.model_selection.RepeatedStratifiedKFold',
    },
    section='SETTINGS',
) %}
{# Classifier rendered as the 'clf' step of the model pipeline; each choice maps a label to a constructor expression. #}
{# NOTE(review): the evaluation cell calls predict_proba on the fitted model - confirm every choice listed here supports it. #}
{% set algorithm = ChoiceField(
    name='algorithm',
    label='Machine Learning Algorithm',
    default='RandomForestClassifier',
    description='A machine learning algorithm should be selected to construct the predictive model.',
    choices={
        'GradientBoostingClassifier': 'sk.ensemble.GradientBoostingClassifier()',
        'RandomForestClassifier': 'sk.ensemble.RandomForestClassifier()',
        'AdaBoostClassifier': 'sk.ensemble.AdaBoostClassifier()',
        'ExtraTreesClassifier': 'sk.ensemble.ExtraTreesClassifier()',
        'DecisionTreeClassifier': 'sk.tree.DecisionTreeClassifier()',
        'KNeighborsClassifier': 'sk.neighbors.KNeighborsClassifier()',
        'RadiusNeighborsClassifier': 'sk.neighbors.RadiusNeighborsClassifier()',
        'MLPClassifier': 'sk.neural_network.MLPClassifier()',
        'OneClassSVM': 'sk.svm.OneClassSVM()',
    },
    section='SETTINGS'
) %}
{# When true, the rendered model wraps the pipeline in CalibratedClassifierCV. #}
{% set calibrated = BoolField(
    name='calibrated',
    label='Calibrate algorithm predictions',
    description='Calibrate the prediction probabilities eliminating model-imparted bias.',
    default=True,
    section='SETTINGS',
) %}
{# Optional search wrapper around the model; 'None' leaves the model unwrapped. #}
{% set hyper_param_search = ChoiceField(
    name='hyper_param_search',
    label='Hyper Parameter Search Type',
    default='None',
    description='Hyper parameter searching is used to automatically select the best parameters (using the primary metric as the criteria).',
    choices={
        'None': 'None',
        'RandomizedSearchCV': 'sk.model_selection.RandomizedSearchCV',
        'GridSearchCV': 'sk.model_selection.GridSearchCV',
    },
    section='SETTINGS'
) %}
{# Number of folds, rendered as `n_splits` of the cross-validation splitter; the form limits it to 2-10. #}
{% set cross_validation_n_folds = IntField(
    name='cross_validation_n_folds',
    label='Cross-Validated Folds',
    description='Cross validation is employed as a strategy to train the model on data that the model has not seen before, more folds will ensure that the model is generalizing well.',
    default=3,
    min=2,
    max=10,
    section='SETTINGS'
) %}
{# Scorer names offered by both metric pickers, kept in one list so the two fields cannot drift apart. #}
{% set metric_choices = [
    'accuracy',
    'adjusted_mutual_info_score',
    'adjusted_rand_score',
    'average_precision',
    'balanced_accuracy',
    'completeness_score',
    'explained_variance',
    'f1',
    'f1_macro',
    'f1_micro',
    'f1_weighted',
    'fowlkes_mallows_score',
    'homogeneity_score',
    'jaccard',
    'jaccard_macro',
    'jaccard_micro',
    'jaccard_weighted',
    'max_error',
    'mutual_info_score',
    'neg_brier_score',
    'neg_log_loss',
    'neg_mean_absolute_error',
    'neg_mean_squared_error',
    'neg_mean_squared_log_error',
    'neg_median_absolute_error',
    'neg_root_mean_squared_error',
    'normalized_mutual_info_score',
    'precision',
    'precision_macro',
    'precision_micro',
    'precision_weighted',
    'r2',
    'recall',
    'recall_macro',
    'recall_micro',
    'recall_weighted',
    'roc_auc',
    'roc_auc_ovo',
    'roc_auc_ovo_weighted',
    'roc_auc_ovr',
    'roc_auc_ovr_weighted',
    'v_measure_score'
] %}
{# Metric the model is optimised for; it is also the first column of the results table. #}
{% set primary_metric = ChoiceField(
    name='primary_metric',
    label='Primary Evaluation Metric',
    default='roc_auc',
    description='The primary evaluation metric is used for deciding how we assess the performance of our model.',
    choices=metric_choices,
    section='SETTINGS'
) %}
{# Extra metrics reported next to the primary one; none by default. #}
{% set evaluation_metrics = MultiChoiceField(
    name='evaluation_metrics',
    label='Evaluation Metrics',
    default=[],
    description='Additional evaluation metrics can be specified, these metrics will also be reported for all models trained.',
    value=[],
    choices=metric_choices,
    section='SETTINGS',
) %}
{# Every metric computed per fold: the primary metric followed by the additional ones. #}
{% set all_metrics = [primary_metric.value] + evaluation_metrics.value %}

In [None]:
%%appyter markdown

We apply a {% if hyper_param_search.value != 'None' %}{{ hyper_param_search.raw_value }} search for the hyper parameters
of a {% endif %}sklearn pipeline with a dimensionality reduction step of {{ dimensionality_reduction.raw_value }}
{% if feature_selection.value != 'None' %}and a feature selection step of {{ feature_selection.raw_value }}
{% endif %} and a{% if calibrated.value %} calibrated{%endif %} {{ algorithm.raw_value }} classifier
using {{ cross_validation_n_folds.value }}-fold {{ cv_algorithm.raw_value }} cross-validation,
optimizing {{ primary_metric.value }}{% if evaluation_metrics.value %} and computing {{ ', '.join(evaluation_metrics.value) }}{% endif %}.

This will take a long time: we evaluate `n_iter` different models, each `n_splits` times, computing all the metrics on `product(X.shape)` data points, and the size of each model is further dictated by the range of parameters specified in the params dict.

In [None]:
%%appyter code_exec
{% if algorithm.value == 'GradientBoostingClassifier' %}
## Early stopping function
def early_stopping(n_rounds, tol=0.001):
    """Build a monitor callback that signals a stop once the training score plateaus.

    Args:
        n_rounds: number of consecutive plateau iterations tolerated; the
            callback returns True on the iteration that exceeds this count.
        tol: two successive training scores closer than this count as a plateau.

    Returns:
        A callable ``(i, self, local) -> bool`` that reads ``self.train_score_[i]``
        and keeps its counters as attributes on ``self``.
    """
    def early_stopping_func(i, self, local):
        rounds = getattr(self, '__rounds', 0)
        last = getattr(self, '__last', None)
        current = self.train_score_[i]
        # Compare against None explicitly: a legitimate score of 0.0 is falsy
        # and would otherwise reset the plateau counter on every iteration.
        if last is not None and current is not None and abs(current - last) < tol:
            rounds += 1
            if rounds > n_rounds:
                return True
        else:
            rounds = 0
        # Remember this iteration's state for the next call
        setattr(self, '__last', current)
        setattr(self, '__rounds', rounds)
        return False
    return early_stopping_func
{% endif %}

{#
param_grid = {
    'reduce_dim__n_components': randint(2, 1024),
{% if algorithm.value == 'GradientBoostingClassifier' %}
    'clf__loss': ['deviance', 'exponential'],
    'clf__learning_rate': randfloat(0.001, 1.),
    'clf__subsample': randfloat(0.01, 1.),
{% elif algorithm.value == 'RandomForestClassifier' %}
    'clf__oob_score': [True],
    'clf__criterion': ['gini', 'entropy'],
{% endif %}
    'clf__n_estimators': randint(10, 200),
    'clf__max_depth': randint(20, 50),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__min_impurity_decrease': randfloat(0., 0.2),
    'clf__min_weight_fraction_leaf': randfloat(0., 0.5),
}

fit_params = {
{% if algorithm.value == 'GradientBoostingClassifier' %}
    'clf__monitor': early_stopping(5),
{% endif %}
}
#}
    
cv = {{ cv_algorithm }}(
    n_splits={{ cross_validation_n_folds }},
    shuffle=True,
    random_state=rng,
)

model =
{%- if hyper_param_search.value != 'None' %} {{ hyper_param_search }}({% endif -%}
    {%- if calibrated.value %} sk.calibration.CalibratedClassifierCV({% endif -%}
        sk.pipeline.Pipeline([
            ('reduce_dim', {{ dimensionality_reduction }}),
            {%- if feature_selection.value != 'None' %}('feature_selection', {{ feature_selection }}),{% endif %}
            ('clf', {{ algorithm }}),
        ]),
    cv=cv,
{% if calibrated.value %}){% endif -%}{%- if hyper_param_search.value != 'None' %}){% endif %}

# Scoring parameters
primary_metric = '{{ primary_metric }}'
evaluation_metrics = {{ evaluation_metrics }}
# Map each selected metric name to its scikit-learn scorer, primary metric first
scoring_params = {k: metrics.get_scorer(k)
                  for k in [primary_metric, *evaluation_metrics]}

In [None]:
%%appyter code_exec
{% if hyper_param_search.value == 'None' %}

df_results = pd.DataFrame()

# Store performance on each split for computing ROC and PRC curves
fprs = []
tprs = []
precs = []
recs = []

# Store cross-validation test predictions
y_proba_cv = np.empty(len(y))
y_proba_cv[:] = np.nan

for fold, (train, test) in enumerate(cv.split(X.values, y)):
    model.fit(X.values[train], y[train])
    {% for metric in all_metrics %}
    df_results.loc[fold, '{{ metric }}'] = scoring_params['{{ metric }}'](model, X.values[test], y[test])
    {% endfor %}
    y_proba = model.predict_proba(X.values[test]) # Probability prediction will be True
    y_proba_cv[test] = y_proba[:, 1]
    model_fpr, model_tpr, _ = metrics.roc_curve(y[test], y_proba[:, 1])
    model_prec, model_rec, _ = metrics.precision_recall_curve(y[test], y_proba[:, 1])
    fprs.append(model_fpr)
    tprs.append(model_tpr)
    precs.append(model_prec)
    recs.append(model_rec)

assert not(any(np.isnan(y_proba_cv))), 'All probabilities should have been calculated'

display(df_results.agg(['mean', 'std']))
{% else %}
model.fit(X.values, y)
df_results = model.cv_results_
{% endif %}

This visualization shows the cross-validated performance of the model. Low variance across folds and a high AUC are desired in a well-generalized model.

In [None]:
fig, ax = plt.subplots()

# Common false-positive-rate grid on which the per-fold curves are averaged
mean_fpr = np.linspace(0, 1, 100)
tprs_interp = []
aucs = []

for fold, (fpr, tpr) in enumerate(zip(fprs, tprs)):
    roc_auc = metrics.auc(fpr, tpr)
    aucs.append(roc_auc)
    # Resample this fold's curve onto the shared grid, pinned to start at 0
    tpr_interp = np.interp(mean_fpr, fpr, tpr)
    tpr_interp[0] = 0.
    tprs_interp.append(tpr_interp)
    ax.plot(fpr, tpr, alpha=0.4, label='ROC Fold %d (AUC=%0.3f)' % (fold, roc_auc))

# Mean curve across folds, pinned to end at 1
mean_tpr = np.mean(tprs_interp, axis=0)
mean_tpr[-1] = 1.0
mean_auc = metrics.auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1]
std_tpr = np.std(tprs_interp, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2)

# Diagonal reference line
ax.plot([0,1],[0,1],'--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()

# Compare the mean AUC with 0.5 under a normal approximation over the fold AUCs
z = (mean_auc - 0.5)/std_auc
cl = sp.stats.norm.cdf(z) * 100
ci = sp.stats.norm.interval(0.95, loc=mean_auc, scale=std_auc)
print('Confidence interval (95%)', ci)
print("We are %0.3f %% confident the model's results are not just chance." % (cl))
verdict = (
    'This is statistically significant. These results can be trusted.'
    if cl > 95 else
    'This is not statistically significant. These results should not be trusted.'
)
print(verdict)

In [None]:
fig, ax = plt.subplots()

precs_interp = []
prc_aucs = []
# Common recall grid on which the per-fold curves are averaged
mean_rec = np.linspace(0, 1, 100)

# A no-skill classifier's expected precision is the positive-class prevalence,
# so that (not 0.5) is the chance baseline for precision-recall analysis.
chance_level = y.mean()

for fold, (rec, prec) in enumerate(zip(recs, precs)):
    # Both arrays are reversed so that np.interp receives increasing x-coordinates
    prec_interp = np.interp(mean_rec, rec[::-1], prec[::-1])
    prc_auc = metrics.auc(rec, prec)
    precs_interp.append(prec_interp)
    prc_aucs.append(prc_auc)
    ax.plot(rec, prec, alpha=0.4, label='PRC Fold %d (AUC=%0.3f)' % (fold, prc_auc))

# Mean curve across folds
mean_prec = np.mean(precs_interp, axis=0)
mean_auc = sk.metrics.auc(mean_rec, mean_prec)
std_auc = np.std(prc_aucs)
ax.plot(mean_rec, mean_prec, color='b',
         label=r'Mean PRC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean curve, clipped to [0, 1]
std_prec = np.std(precs_interp, axis=0)
precs_upper = np.minimum(mean_prec + std_prec, 1)
precs_lower = np.maximum(mean_prec - std_prec, 0)
ax.fill_between(mean_rec, precs_lower, precs_upper, color='grey', alpha=.2)

# Horizontal reference line at the chance level, mirroring the ROC plot's diagonal
ax.plot([0, 1], [chance_level, chance_level], '--', label='Random')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend()

# Compare the mean AUC with the chance level under a normal approximation over the fold AUCs
z = (mean_auc - chance_level)/std_auc
cl = sp.stats.norm.cdf(z) * 100
ci = sp.stats.norm.interval(0.95, loc=mean_auc, scale=std_auc)
print('Confidence interval (95%)', ci)
print("We are %0.3f %% confident the model's results are not just chance." % (cl))
if cl > 95:
    print('This is statistically significant. These results can be trusted.')
else:
    print('This is not statistically significant. These results should not be trusted.')

In [None]:
# Threshold the out-of-fold probabilities at 0.5 to obtain hard predictions
cv_confusion = metrics.confusion_matrix(y, y_proba_cv > 0.5)

plt.title('Confusion Matrix (Cross-Validation)')
sns.heatmap(cv_confusion, annot=True, cmap=plt.cm.Blues, fmt='g')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

## Examine drug predictions

Using the binary classification model, we can rank the drug hits by their predicted score. The model can also be used to identify additional drugs that are likely to share properties with the hits.

In [None]:
# Obtain prediction results
y_probas = y_proba_cv
# Build the table column by column so each column keeps its native dtype,
# instead of packing names and numbers into one NumPy array (which turns every
# value into a string that then has to be parsed back to float).
results = pd.DataFrame({
    'Name': list(Drugmonizome.get_drug_names(X.index)),
    'Known': y,
    'Predicted': y_probas > 0.5,  # hard call at the 0.5 threshold
    'Prediction Probability': y_probas,
}, index=X.index).astype({'Known': 'float64', 'Predicted': 'float64', 'Prediction Probability': 'float64'})

In [None]:
# Known hits, ordered from highest to lowest predicted probability
results.loc[results['Known'] == 1].sort_values(by='Prediction Probability', ascending=False)

In [None]:
# The 25 drugs not in the hit list that receive the highest predicted probability
results.loc[results['Known'] == 0].sort_values(by='Prediction Probability', ascending=False).iloc[:25]