In [None]:
#%%appyter init
from appyter import magic
magic.init(lambda _=globals: _())

# Imputing Knowledge about Gene and Protein Function with Machine Learning

In [None]:
# Imports
## Data processing
import pandas as pd
import numpy as np
import scipy as sp
## Machine Learning
import sklearn as sk
from sklearn import (
    calibration, decomposition, ensemble, feature_selection,
    linear_model, manifold, metrics, model_selection, multioutput,
    pipeline, preprocessing, svm, tree, neural_network,
)
## Plotting
import plotly.express as px
from matplotlib import pyplot as plt
## Harmonizome API
from harmonizome import Harmonizome
## Utility
import re
import json
from functools import reduce
from IPython.display import display, Markdown

In [None]:
# Utility functions

def try_json_loads(s):
    ''' Parse `s` as JSON, falling back to '' when it cannot be parsed.

    The empty-string fallback lets callers iterate over the result
    unconditionally (iterating '' yields nothing).

    :param s: text expected to hold a JSON document
    :return: the decoded object, or '' if `s` is not valid JSON or is not a
             type `json.loads` accepts
    '''
    try:
        return json.loads(s)
    except (TypeError, ValueError):
        # json.JSONDecodeError subclasses ValueError; non-text input raises
        # TypeError. Anything else (e.g. KeyboardInterrupt) propagates.
        return ''

## Create custom "randfloat" that behaves like randint but for floats
from scipy.stats import uniform, randint
def randfloat(start, end):
    ''' Build a frozen uniform distribution spanning the interval from start to end '''
    width = end - start
    return uniform(loc=start, scale=width)

# reproducible random seed
rng = 42

## Inputs

Given a target attribute of interest, we will use machine learning to predict genes that are strongly correlated with that target. Using the Harmonizome data query API, we download the dataset containing the target attribute as well as a number of well-populated Omics datasets for more genes and features and build a large sparse dataframe.

Select Omics datasets are downloaded and joined on the Gene producing a large association matrix. Only association is preserved in order to create a binary classification task.

In [None]:
%%appyter hide_code
{% do SectionField(
    name='DATASETS',
    title='ATTRIBUTE AND PREDICTION CLASS DATASET SELECTION',
    subtitle='Select the datasets to use for learning and classification.',
    img='attributes.png',
) %}
{% set harmonizome_attribute_datasets = MultiCheckboxField(
    name='attribute_datasets',
    label='Attribute Selection (place cursor inside the box to add more datasets)',
    hint='Databases to use for prediction',
    description='The selected datasets will be concatenated and used to train the model.',
    default=[
        'CCLE Cell Line Gene Expression Profiles',
        'ENCODE Transcription Factor Targets',
    ],
    choices=[
        'CCLE Cell Line Gene Expression Profiles',
        'ENCODE Transcription Factor Targets',
        'Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles',
        'CHEA Transcription Factor Targets',
        'BioGPS Cell Line Gene Expression Profiles',
        'GTEx Tissue Gene Expression Profiles',
    ],
    descriptions={
        'CCLE Cell Line Gene Expression Profiles': 'MRNA expression profiles for cancer cell lines',
        'ENCODE Transcription Factor Targets': 'Target genes of transcription factors from transcription factor binding site profiles',
        'Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles': 'MRNA expression profiles for 6 adult human brain tissue samples spanning ~300 brain structures',
        'CHEA Transcription Factor Targets': 'Target genes of transcription factors from published ChIP-chip, ChIP-seq, and other transcription factor binding site profiling studies',
        'BioGPS Cell Line Gene Expression Profiles': 'MRNA expression profiles for the NCI-60 panel of cancer cell lines',
        'GTEx Tissue Gene Expression Profiles': 'MRNA expression profiles for tissues',
    },
    section='DATASETS',
) %}
{% set additional_attribute_dataset = FileField(
    name='additional_attribute_dataset',
    label='Custom Attribute Dataset (Optional)',
    description='We will use this on top of the harmonizome attribute data (or only if you deselect the harmonizome data)',
    default='',
    section='DATASETS',
) %}
{% if additional_attribute_dataset.value %}
{% set attribute_datasets = harmonizome_attribute_datasets.value + [additional_attribute_dataset.value] %}
{% else %}
{% set attribute_datasets = harmonizome_attribute_datasets.value %}
{% endif %}

{% set target = TabField(
    name='target',
    label='Target Selection',
    default='Harmonizome',
    choices={
        'Harmonizome': [AutocompleteField(
            name='harmonizome_class',
            label='Harmonizome Class',
            description='A class of genes annotated in select Harmonizome association datasets',
            default='cancer (DOID:162 from DISEASES Text-mining Gene-Disease Assocation Evidence Scores)',
            file_path='https://appyters.maayanlab.cloud/storage/Harmonizome_ML/class_list.json',
        )],
        'Custom': [TextListField(
            name='custom_class',
            label='Custom Geneset Class',
            hint='Newline separated geneset of genes in the class',
            description='A set of genes that make up your own class',
            default='',
        )],
    },
    section='DATASETS',
) %}
{% if target.raw_value == 'Harmonizome' %}
{% set target_label, target_group, target_dataset = target.value[0].value|re_match('^(.+) \\((.+) from (.+)\\)$') %}
{% set target_name = (target_label + ' ' + target_group).strip() %}
{% else %}
{% set target_name = 'target' %}
{% set target_dataset = 'custom' %}
{% endif %}

In [None]:
%%appyter code_exec
attribute_datasets = {{ attribute_datasets }}
if not attribute_datasets:
    raise ValueError('At least one attribute dataset must be selected')

# Download the gene-by-attribute matrix of every selected dataset
df_attributes = list(Harmonizome.download_df(
    list(attribute_datasets),
    ['gene_attribute_matrix.txt.gz'],
))
# Row/column headers are JSON-encoded lists; decode them in place
for name, df_attribute in zip(attribute_datasets, df_attributes):
    df_attribute.index.name = json.loads(df_attribute.index.name)[0]
    df_attribute.index = df_attribute.index.map(lambda s: json.loads(s)[0])
    df_attribute.columns = df_attribute.columns.map(lambda s: ' '.join(ss for ss in try_json_loads(s) if ss != 'na'))
    print('%s shape:' % (name), df_attribute.shape)
    display(df_attribute.head())

# Assemble all attribute datasets into one frame joined on the gene index.
# With a single dataset, reduce() simply returns that dataset unchanged.
df = reduce(
    lambda a, b: pd.merge( # Merge two dataframes item by item
        a, # left
        b, # right
        # Items with the same left and right index are merged
        left_index=True,
        right_index=True,
        how='outer', # Keep mis-matched index
    ),
    df_attributes,
)

# Binarize: 1 where an association is present, 0 otherwise.
# Genes missing from a dataset come out of the outer merge as NaN, and
# NaN != 0 is True, so fill with 0 first; otherwise missing data would be
# encoded as an association.
X = df.fillna(0).ne(0).astype(int)
print('Total Shape:', X.shape)
display(X.head())

In [None]:
%%appyter markdown
We download the dataset containing the target{% if target.raw_value == 'Harmonizome' %} ({{ target_name }}), {{ target_dataset }}{% endif %}.

In [None]:
%%appyter code_exec
{%if target.raw_value == 'Harmonizome' %}
target_datasets = ['{{ target_dataset }}']

# Download attribute datasets from Harmonizome
df_targets = list(Harmonizome.download_df(
    [dataset
     for dataset in target_datasets],
    ['gene_attribute_matrix.txt.gz'],
))

for name, df in zip(target_datasets, df_targets):
    df.index.name = json.loads(df.index.name)[0]
    df.index = df.index.map(lambda s: json.loads(s)[0])
    df.columns = df.columns.map(lambda s: ' '.join(ss for ss in try_json_loads(s) if ss != 'na'))
    print('%s shape:' % (name), df.shape)
    display(df.head())

# Assemble all target datasets
if len(df_targets) > 1:
    # Obtain merged dataframe with omics and target data
    df = reduce(
        lambda a, b: pd.merge( # Merge two dataframes item by item
            a, # left
            b, # right
            # Items with the same left and right index are merged
            left_index=True,
            right_index=True,
            how='outer', # Keep mis-matched index
        ),
        df_targets,
    )
else:
    df = df_targets[0]
{% else %}
target = {{ target.value[0].value }}
df = pd.Series(
    np.in1d(X.index, [gene.upper() for gene in target]),
    index=X.index,
).to_frame('{{ target_name }}')
{% endif %}

Y = df.applymap(lambda f: 1 if f!=0 else 0)
print('Total Shape:', Y.shape)
display(Y.head())

In [None]:
%%appyter markdown
For the target class, we build a list (1 if gene is associated, otherwise 0)

In [None]:
%%appyter code_exec
# For every gene (row of X): 1 if it is annotated with the target class in Y, else 0.
# np.isin is the supported replacement for the deprecated np.in1d.
y = np.isin(X.index, Y[Y['{{ target_name }}'] == 1].index).astype(np.int8)

print('Known Targets: %d (%0.3f %%)' % (y.sum(), 100*y.sum()/len(y)))

We produce a target array containing 1 if the gene is associated and 0 otherwise.

In [None]:
# Output data shapes: X is the binarized gene-by-attribute matrix, y the 0/1 target vector
print('Input shape:', X.shape)
print('Target shape:', y.shape)

In [None]:
%%appyter hide_code
{% do SectionField(
    name='SETTINGS',
    title='SETTINGS',
    subtitle='From here you can select the various available Machine Learning algorithms, their unique settings, and the methods to use to evaluate the classifier.',
    img='settings.png',
) %}
{# Each choice maps a label to the code that builds the reducer. Solvers that
   take a seed receive the notebook-wide `rng` so results are reproducible;
   IncrementalPCA has no random_state parameter. #}
{% set dimensionality_reduction = ChoiceField(
    name='dimensionality_reduction',
    label='Dimensionality Reduction Algorithm',
    description='A dimensionality reduction algorithm should be selected to improve the quality of the classifier.',
    default='PCA',
    choices={
        'PCA': 'sk.decomposition.PCA(n_components=64, random_state=rng)',
        'TruncatedSVD': 'sk.decomposition.TruncatedSVD(n_components=64, random_state=rng)',
        'IncrementalPCA': 'sk.decomposition.IncrementalPCA(n_components=64)',
        'ICA': 'sk.decomposition.FastICA(n_components=64, random_state=rng)',
        'SparsePCA': 'sk.decomposition.SparsePCA(n_components=64, random_state=rng)',
    },
    section='SETTINGS'
) %}
{# Both projections are stochastic; seed them with the notebook-wide `rng`
   so the 2D embedding is reproducible across runs. #}
{% set manifold_projection = ChoiceField(
    name='manifold_projection',
    label='Manifold Projection Algorithm',
    description='A an algorithm for projecting the reduced dimensionality data into 2 dimensions.',
    default='TSNE',
    choices={
        'TSNE': 'sk.manifold.TSNE(n_components=2, random_state=rng)',
        'UMAP': 'umap.UMAP(n_components=2, random_state=rng)',
    },
    section='SETTINGS'
) %}

In [None]:
%%appyter markdown
## Dimensionality Reduction

We reduce the dimensionality of our omics feature space with {{ dimensionality_reduction.raw_value }} and project it onto a manifold with {{ manifold_projection.raw_value }}.

In [None]:
%%appyter code_exec
# Fit the selected reducer on the full binary matrix; rows stay indexed by gene
clf_dimensionality_reduction = {{ dimensionality_reduction }}
X_reduced = pd.DataFrame(
    clf_dimensionality_reduction.fit_transform(X.values),
    index=X.index,
)
# 3D scatter of the reduced data, colored by the target vector y.
# NOTE(review): the axes use components 1-3 and skip component 0 - confirm this is intentional.
display(
    px.scatter_3d(
        X_reduced,
        x=X_reduced.columns[1],
        y=X_reduced.columns[2],
        z=X_reduced.columns[3],
        color=y,
        hover_data=[X_reduced.index],
    )
)

In [None]:
%%appyter code_exec
{% if manifold_projection.raw_value == 'UMAP' %}
import umap
{% endif %}
proj = {{ manifold_projection }}
X_transformed = pd.DataFrame(
    proj.fit_transform(X_reduced.iloc[:, :10].values),
    index=X_reduced.index,
)
display(
    px.scatter(
        X_transformed,
        x=X_reduced.columns[0],
        y=X_reduced.columns[1],
        color=y,
        hover_data=[X_transformed.index],
    )
)

In [None]:
%%appyter code_hide
{# Optional feature-selection step; each choice maps a label to the code that
   builds the selector inserted into the pipeline. #}
{% set feature_selection = ChoiceField(
    name='feature_selection',
    label='Machine Learning Feature Selection',
    default='None',
    choices={
        'None': 'None',
        'SelectFromLinearSVC': 'sk.feature_selection.SelectFromModel(sk.svm.LinearSVC(loss="squared_hinge", penalty="l1", dual=False))',
        'SelectFromExtraTrees': 'sk.feature_selection.SelectFromModel(sk.tree.ExtraTreeClassifier())',
        'SelectKBest': 'sk.feature_selection.SelectKBest(sk.feature_selection.f_classif)',
        'SelectKBestChi2': 'sk.feature_selection.SelectKBest(sk.feature_selection.chi2)',
        'SelectKBestMultiInfo': 'sk.feature_selection.SelectKBest(sk.feature_selection.mutual_info_classif)',
    },
    section='SETTINGS',
) %}
{% set cv_algorithm = ChoiceField(
    name='cv_algorithm',
    label='Cross Validation Algorithm',
    default='StratifiedKFold',
    value='KFold',
    choices={
        'KFold': 'sk.model_selection.KFold',
        'GroupKFold': 'sk.model_selection.GroupKFold',
        'RepeatedKFold': 'sk.model_selection.RepeatedKFold',
        'StratifiedKFold': 'sk.model_selection.StratifiedKFold',
        'RepeatedStratifiedKFold': 'sk.model_selection.RepeatedStratifiedKFold',
    },
    section='SETTINGS',
) %}
{# Each choice maps a label to the code that builds the classifier. Estimators
   that take a seed receive the notebook-wide `rng` for reproducible results;
   the neighbors-based models and OneClassSVM have no random_state parameter. #}
{% set algorithm = ChoiceField(
    name='algorithm',
    label='Machine Learning Algorithm',
    default='RandomForestClassifier',
    description='A machine learning algorithm should be selected to construct the predictive model.',
    choices={
        'GradientBoostingClassifier': 'sk.ensemble.GradientBoostingClassifier(random_state=rng)',
        'RandomForestClassifier': 'sk.ensemble.RandomForestClassifier(random_state=rng)',
        'AdaBoostClassifier': 'sk.ensemble.AdaBoostClassifier(random_state=rng)',
        'ExtraTreeClassifier': 'sk.tree.ExtraTreeClassifier(random_state=rng)',
        'DecisionTreeClassifier': 'sk.tree.DecisionTreeClassifier(random_state=rng)',
        'KNeighborsClassifier': 'sk.neighbors.KNeighborsClassifier()',
        'RadiusNeighborsClassifier': 'sk.neighbors.RadiusNeighborsClassifier()',
        'MLPClassifier': 'sk.neural_network.MLPClassifier(random_state=rng)',
        'OneClassSVM': 'sk.svm.OneClassSVM()',
    },
    section='SETTINGS',
) %}
{% set calibrated = BoolField(
    name='calibrated',
    label='Calibrate algorithm predictions',
    description='Calibrate the prediction probabilities eliminating model-imparted bias.',
    default=True,
    section='SETTINGS',
) %}
{% set hyper_param_search = ChoiceField(
    name='hyper_param_search',
    label='Hyper Parameter Search Type',
    default='None',
    description='Hyper parameter searching is used to automatically select the best parameters (using the primary metric as the criteria).',
    choices={
        'None': 'None',
        'RandomizedSearchCV': 'sk.model_selection.RandomizedSearchCV',
        'GridSearchCV': 'sk.model_selection.GridSearchCV',
    },
    section='SETTINGS',
) %}
{% set cross_validation_n_folds = IntField(
    name='cross_validation_n_folds',
    label='Cross-Validated Folds',
    description='Cross validation is employed as a strategy to train the model on data that the model has not seen before, more folds will ensure that the model is generalizing well.',
    default=3,
    min=2,
    max=10,
    section='SETTINGS',
) %}
{# available_metrics from sk.metrics.SCORERS.keys() #}
{% set primary_metric = ChoiceField(
    name='primary_metric',
    label='Primary Evaluation Metric',
    default='roc_auc',
    description='The primary evaluation metric is used for deciding how we assess the performance of our model.',
    choices=['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'],
    section='SETTINGS',
) %}
{% set evaluation_metrics = MultiChoiceField(
    name='evaluation_metrics',
    label='Evaluation Metrics',
    default=[],
    description='Additional evaluation metrics can be specified, these metrics will also be reported for all models trained.',
    value=['recall', 'f1'],
    choices=['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'],
    section='SETTINGS',
) %}
{% set all_metrics = [primary_metric.value] + evaluation_metrics.value %}

In [None]:
%%appyter markdown
## Machine Learning

We apply a {% if hyper_param_search.raw_value != 'None' %}{{ hyper_param_search.raw_value }} search for the hyper parameters
of a {% endif %}sklearn pipeline with a dimensionality reduction step of {{ dimensionality_reduction.raw_value }}
{% if feature_selection.raw_value != 'None' %}and a feature selection step of {{ feature_selection.raw_value }}
{% endif %} and a{% if calibrated %} calibrated{%endif %} {{ algorithm.raw_value }} classifier
using {{ cross_validation_n_folds.raw_value }}-fold {{ cv_algorithm.raw_value }}
cross-validation, optimizing {{ primary_metric.raw_value }}
{% if evaluation_metrics.raw_value %} and computing {{ ', '.join(evaluation_metrics.raw_value) }}{% endif %}.

In [None]:
%%appyter code_exec
{% if algorithm.raw_value == 'GradientBoostingClassifier' %}
## Early stopping function
def early_stopping(n_rounds, tol=0.001):
    ''' Build a monitor callback that signals a stop once the training score stalls.

    The returned callable takes `(i, self, local)` and reads `self.train_score_[i]`.
    It keeps its counters as the attributes `__last` and `__rounds` on `self`.

    :param n_rounds: number of consecutive stalled rounds tolerated; the callback
                     returns True once the count exceeds this value
    :param tol: two consecutive scores closer than this count as a stalled round
    :return: the callback; it returns True to request a stop, otherwise False
    '''
    def early_stopping_func(i, self, local):
        rounds = getattr(self, '__rounds', 0)
        last = getattr(self, '__last', None)
        current = self.train_score_[i]
        # Compare against None explicitly: a score of exactly 0.0 is a valid
        # value and must not be mistaken for "no previous score".
        if last is not None and current is not None and abs(current - last) < tol:
            rounds += 1
            if rounds > n_rounds:
                return True
        else:
            rounds = 0
        setattr(self, '__last', current)
        setattr(self, '__rounds', rounds)
        return False
    return early_stopping_func
{% endif %}

{#
param_grid = {
    'reduce_dim__n_components': randint(2, 1024),
{% if algorithm.raw_value == 'GradientBoostingClassifier' %}
    'clf__loss': ['deviance', 'exponential'],
    'clf__learning_rate': randfloat(0.001, 1.),
    'clf__subsample': randfloat(0.01, 1.),
{% elif algorithm.raw_value == 'RandomForestClassifier' %}
    'clf__oob_score': [True],
    'clf__criterion': ['gini', 'entropy'],
{% endif %}
    'clf__n_estimators': randint(10, 200),
    'clf__max_depth': randint(20, 50),
    'clf__max_features': ['sqrt', 'log2', None],
    'clf__min_impurity_decrease': randfloat(0., 0.2),
    'clf__min_weight_fraction_leaf': randfloat(0., 0.5),
}

fit_params = {
{% if algorithm.raw_value == 'GradientBoostingClassifier' %}
    'clf__monitor': early_stopping(5),
{% endif %}
}
#}
    
cv = {{ cv_algorithm }}(
    n_splits={{ cross_validation_n_folds }},
    shuffle=True,
    random_state=rng,
)

model =
{%- if hyper_param_search.raw_value != 'None' %} {{ hyper_param_search }}({% endif -%}
{%- if target.raw_value == 'Gene' %} multioutput.MultiOutputClassifier({% endif -%}
    {%- if calibrated %} sk.calibration.CalibratedClassifierCV({% endif -%}
        sk.pipeline.Pipeline([
            ('reduce_dim', {{ dimensionality_reduction }}),
            {%- if feature_selection.raw_value != 'None' %}('feature_selection', {{ feature_selection }}),{% endif %}
            ('clf', {{ algorithm }}),
        ]),
    cv=cv,
{% if calibrated %}){% endif -%}{% if target.raw_value == 'Gene' %}){% endif %}{%- if hyper_param_search.raw_value != 'None' %}){% endif %}

# Scoring parameters
primary_metric = '{{ primary_metric }}'
evaluation_metrics = {{ evaluation_metrics }}
# Resolve each metric name to its scorer through the public get_scorer API;
# the sk.metrics.SCORERS registry was removed from recent scikit-learn releases.
scoring_params = {k: sk.metrics.get_scorer(k)
                  for k in [primary_metric, *evaluation_metrics]}

In [None]:
%%appyter code_exec
{% if hyper_param_search.raw_value == 'None' %}
# Fit on each training fold and score on its held-out fold: one row per fold,
# one column per selected metric, then summarize with mean and std.
df_results = pd.DataFrame()
for fold, (train, test) in enumerate(cv.split(X.values, y)):
    model.fit(X.values[train], y[train])
    {% for metric in all_metrics %}
    df_results.loc[fold, '{{ metric }}'] = scoring_params['{{ metric }}'](model, X.values[test], y[test])
    {% endfor %}
display(df_results.agg(['mean', 'std']))
{% else %}
# With a search wrapper, fitting runs the search; keep its cv_results_ attribute
model.fit(X, y)
df_results = model.cv_results_
{% endif %}

This visualization illustrates the cross-validated performance of the model. Low fold variance and high AUC are desired in a well-generalized model.

In [None]:
%%appyter code_exec
{% if 'roc_auc' in all_metrics %}
fig, ax = plt.subplots()

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, (train, test) in enumerate(cv.split(X.values, y)):
    model.fit(X.values[train], y[train])
    y_proba = model.predict_proba(X.values[test]) # Probability prediction will be True
    fpr, tpr, _ = sk.metrics.roc_curve(y[test], y_proba[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = sk.metrics.auc(fpr, tpr)
    aucs.append(roc_auc)
    ax.plot(fpr, tpr, alpha=0.4, label='ROC Fold %d (AUC=%0.3f)' % (fold, roc_auc))

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2)

ax.plot([0,1],[0,1],'--', label='Luck')
ax.legend()

z = (mean_auc - 0.5)/std_auc
cl = sp.stats.norm.cdf(z) * 100
ci = sp.stats.norm.interval(0.95, loc=mean_auc, scale=std_auc)
print('Confidence interval (95%)', ci)
print("We are %0.3f %% confident the model's results are not just chance." % (cl))
if cl > 95:
    print('This is statistically significant')
else:
    print('This is not statistically significant')
{% endif %}

This will take a long time as we are evaluating n_iter different models n_splits different times each computing all the metrics on `product(X.shape)` data points--not to mention the size of each model dictated by the range of parameters specified in the params dict.

In [None]:
# Refit on the full dataset; the confusion matrix below is computed on the
# same data the model was fitted on.
model.fit(X.values, y)
# plot_confusion_matrix was removed from recent scikit-learn releases; prefer
# ConfusionMatrixDisplay.from_estimator and fall back where it is unavailable.
if hasattr(sk.metrics.ConfusionMatrixDisplay, 'from_estimator'):
    sk.metrics.ConfusionMatrixDisplay.from_estimator(model, X.values, y)
else:
    sk.metrics.plot_confusion_matrix(model, X.values, y)

In [None]:
# Obtain prediction results
# Predict on the same ndarray representation the model was fitted with
# (X.values), taking the probability of the positive class.
y_proba = model.predict_proba(X.values)[:, 1]
results = pd.DataFrame({
    'Known': y,
    'Predicted': (y_proba > 0.5).astype(int),
    'Prediction Probability': y_proba,
}, index=X.index).sort_values(
    'Prediction Probability',
    ascending=False,
)
# Show genes predicted positive whose prediction disagrees with the known label
results[((results['Known'] != results['Predicted']) & (results['Prediction Probability'] > 0.5))]

In [None]:
# Write the predictions to a tab-separated file and show a download link to it
results.to_csv('results.tsv', sep='\t')
display(Markdown('Download model predictions at [results.tsv](./results.tsv)'))