In [None]:
# install https://github.com/MarkusHaak/dlomix/ with pip
# OR uncomment to insert its path with sys:
#import os, sys
#sys.path.insert(0, os.path.abspath('../../dlomix/'))

In [None]:
# Set the global seeds up front so that a fresh "Restart & Run All" is reproducible.
# The same seed value (42) is applied again right before each model is built in the
# Monte Carlo Dropout cell further down.
from dlomix.utils import set_global_seed
set_global_seed(42)

In [None]:
import os
import traceback
from time import time

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from tqdm import tqdm

import dlomix
from dlomix import constants, data, eval, layers, models, pipelines, reports, utils
from dlomix.data import RetentionTimeDataset
from dlomix.models import PrositRetentionTimePredictor

In [None]:
# Alphabet using the same PTM identifiers as in the created datasets.
# The 20 amino-acid letters come first (alphabetical order), followed by the two
# modification symbols "^" and "}"; every symbol is mapped to its 1-based position.
_ALPHABET_SYMBOLS = "ACDEFGHIKLMNPQRSTVWY" + "^}"
ALPHABET_MOD = {symbol: index for index, symbol in enumerate(_ALPHABET_SYMBOLS, start=1)}

# Monte Carlo Dropout

Perform Monte Carlo Dropout (MCD) by loading the individual models from weights and calling them with active dropout on the calibration and test data.

In [None]:
def predict_with_dropout(model, test_data, n=50):
    """Run ``n`` stochastic forward passes of ``model`` over ``test_data``.

    The model is called with ``training=True`` on every batch, which is what keeps
    dropout active for Monte Carlo Dropout.

    Parameters
    ----------
    model : callable
        Called as ``model(inputs, training=True)``; the returned object must offer
        a ``.numpy()`` method.
    test_data : iterable
        Iterable of batches. Only the first element of each batch (``batch[0]``)
        is fed to the model.
    n : int, default 50
        Number of forward passes over the whole data.

    Returns
    -------
    numpy.ndarray
        The per-pass predictions (batches concatenated along axis 0) combined with
        ``np.column_stack`` in pass order.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")
    # Read the inputs once instead of re-iterating the data on every pass: this
    # avoids repeated work, also supports single-pass iterables (generators) and
    # guarantees that every pass sees the rows in the same order, which
    # np.column_stack relies on.
    batch_inputs = [batch[0] for batch in test_data]
    predictions = []
    for _ in tqdm(range(n)):
        single_pass = np.concatenate([model(inputs, training=True).numpy() for inputs in batch_inputs])
        predictions.append(single_pass)
    return np.column_stack(predictions)

In [None]:
# Numbers of Monte Carlo Dropout passes to evaluate in the later analysis.
# The prediction cell runs Ns[-1] passes and derives the smaller n's from them.
Ns = [3, 5, 10, 20, 30, 50, 100]

# Model label -> weights path template; "{}" is filled with the cross-validation split.
# NOTE(review): this name shadows the `models` module imported from dlomix above.
models = dict(
    PRT_med="./output_median/cv{}/best",
    PRT_sel10="./output_sel10/cv{}/best",
)

In [None]:
# Result containers: label ("<model>_n=<n>") -> cv split -> {'data': array([mean, std])}
res = {}               # median hold-out test set
res_calib = {}         # median calibration sets
res_sel10 = {}         # sel10 hold-out test set
res_calib_sel10 = {}   # sel10 calibration sets

BATCH_SIZE = 256


def _load_rt_dataset(data_source, target_col):
    """Build a test-mode RetentionTimeDataset for one CSV file with the notebook's shared settings."""
    return RetentionTimeDataset(data_source=data_source,
                                seq_length=30, batch_size=BATCH_SIZE, test=True,
                                sequence_col='modified_sequence_single_letter',
                                target_col=target_col)


# The hold-out test sets do not depend on the cross-validation split: load them once.
# median dataset
test_rtdata = _load_rt_dataset('../data/PROSPECT_median_holdout_cv.csv', 'median')
test_targets = test_rtdata.get_split_targets(split="test")
# sel10 dataset
test_rtdata_sel10 = _load_rt_dataset('../data/PROSPECT_sel10_holdout_cv.csv', 'indexed_retention_time')
test_targets_sel10 = test_rtdata_sel10.get_split_targets(split="test")

# Run the largest requested number of passes once; smaller n's are taken as prefixes.
n_passes = max(Ns)

for cv in range(1,6):
    # load the calibration data for the respective cross-validation split
    calibration_rtdata = _load_rt_dataset(f'../data/PROSPECT_median_calibration{cv}.csv', 'median')
    calibration_rtdata_sel10 = _load_rt_dataset(f'../data/PROSPECT_sel10_calibration{cv}.csv',
                                                'indexed_retention_time')
    # perform MCD
    for l, path_template in models.items():
        # weights of this model for the current split
        model_save_path = path_template.format(cv)
        for res_dict, data_save_path, dataset in [
            (res, f'MCD_{l}_cv{cv}_data.npy', test_rtdata.test_data),
            (res_calib, f'MCD_{l}_cv{cv}_calib.npy', calibration_rtdata.test_data),
            (res_sel10, f'MCD_{l}_cv{cv}_sel10_data.npy', test_rtdata_sel10.test_data),
            (res_calib_sel10, f'MCD_{l}_cv{cv}_sel10_calib.npy', calibration_rtdata_sel10.test_data),
        ]:
            print(l, cv, model_save_path, data_save_path)
            # skip if predictions were already performed (in case notebook crashed)
            if os.path.exists(data_save_path):
                with open(data_save_path, 'rb') as f:
                    pred = np.load(f)
            else:
                # re-seed and rebuild the model from its weights for every prediction run
                set_global_seed(42)
                model = PrositRetentionTimePredictor(seq_length=30, vocab_dict=ALPHABET_MOD)
                model.load_weights(model_save_path).expect_partial()
                try:
                    pred = predict_with_dropout(model, dataset, n=n_passes)
                except Exception:
                    # best effort: report the failure and go on with the next dataset;
                    # KeyboardInterrupt / SystemExit are deliberately not swallowed
                    print("ERROR !!!")
                    print(traceback.format_exc())
                    continue
                with open(data_save_path, 'wb') as f:
                    np.save(f, pred)
            # store results: mean and std over the first n passes, for every requested n
            for n in Ns:
                label = f"{l}_n={n}"
                first_n = pred[:,:n]
                res_dict.setdefault(label, {})[cv] = {
                    'data': np.array((first_n.mean(axis=1), first_n.std(axis=1)))
                }

# Conformal Prediction for scalar MCD results

Apply conformal prediction (scalar version) with alpha = 0.1 to ensure a marginal coverage of 0.9 for each model.
The same is done for a randomized background model that is identical to the original model with respect to the heuristic interval sizes, except that these are randomly reassigned between the test datapoints.

In [None]:
from dlomix.eval.scalar_conformal import ScalarConformalScore, ScalarConformalQuantile
from dlomix.reports.MonteCarloReport import MonteCarloReport
from scipy.stats import ks_2samp

In [None]:
alpha = 0.1
# NOTE(review): `alpha` is only reported in the printout below; ScalarConformalQuantile()
# is created with its default arguments -- confirm that its default matches this value.
for r, r_calib, ds in [(res_sel10, res_calib_sel10, 'sel10'), (res, res_calib, 'median')]:
    # the two datasets differ only in the file-name tag (ds) and in the target column
    target_col = 'median' if ds == 'median' else 'indexed_retention_time'
    # Targets depend only on the dataset and the cross-validation split, not on the
    # label, so every CSV is read at most once per dataset (and only when needed).
    test_targets = None
    calibration_targets_by_cv = {}
    for label in r:
        for cv in range(1,6):
            # skip in case computation failed / was not performed yet
            if cv not in r[label] or cv not in r_calib.get(label, {}):
                continue
            # load calibration data for the respective cross-validation split, and the test data
            if test_targets is None:
                TEST_DATAPATH = f'../data/PROSPECT_{ds}_holdout_cv.csv'
                test_rtdata = RetentionTimeDataset(data_source=TEST_DATAPATH,
                                                   seq_length=30, batch_size=BATCH_SIZE, test=True,
                                                   sequence_col='modified_sequence_single_letter',
                                                   target_col=target_col)
                test_targets = test_rtdata.get_split_targets(split="test")
            if cv not in calibration_targets_by_cv:
                CALIBRATION_DATAPATH = f'../data/PROSPECT_{ds}_calibration{cv}.csv'
                calibration_rtdata = RetentionTimeDataset(data_source=CALIBRATION_DATAPATH,
                                                   seq_length=30, batch_size=BATCH_SIZE, test=True,
                                                   sequence_col='modified_sequence_single_letter',
                                                   target_col=target_col)
                calibration_targets_by_cv[cv] = calibration_rtdata.get_split_targets(split="test")
            calibration_targets = calibration_targets_by_cv[cv]

            # MCD summaries: row 0 holds the means, row 1 the standard deviations
            mcd_calib = r_calib[label][cv]['data']
            mcd_test = r[label][cv]['data']

            # perform conformalization
            print(f'#### {label} {cv} ####')
            conf_scores = ScalarConformalScore(reduction='none')(calibration_targets, mcd_calib.T).numpy()
            conf_quantile = ScalarConformalQuantile()(calibration_targets, mcd_calib.T).numpy()
            print(f"alpha = {alpha}, conformal quantile: {conf_quantile:.2f}")
            avgs, stds = mcd_test[0], mcd_test[1]
            # heuristic interval: mean +/- std scaled by the conformal quantile
            intervals = np.array([avgs - stds * conf_quantile, avgs + stds * conf_quantile]).T
            interval_sizes = intervals[:,1] - intervals[:,0]
            within = (test_targets >= intervals[:,0]) & (test_targets <= intervals[:,1])

            # plot results
            MonteCarloReport.plot_conformal_scores(conf_scores, quantile=conf_quantile)
            MonteCarloReport.plot_predictions_with_intervals(test_targets, avgs, intervals)
            MonteCarloReport.plot_conformalized_interval_size(interval_sizes)

            # two-sample KS test: interval sizes of covered vs. not covered targets
            pvalue = ks_2samp(interval_sizes[within], interval_sizes[~within]).pvalue # prob. for Null: distr are identical
            print(f"p = {pvalue:.5f} : {'Reject' if pvalue < 0.01 else 'Accept'} Null Hypothesis (Distr. identical)")

            MonteCarloReport.plot_conformalized_interval_size_PDFs(interval_sizes, within, pvalue)

            # store results
            r[label][cv]['conf_scores'] = conf_scores
            r[label][cv]['conf_quantile'] = conf_quantile
            r[label][cv]['intervals'] = intervals
            r[label][cv]['within'] = within
            r[label][cv]['conf_scores_test'] = ScalarConformalScore(reduction='none')(test_targets, mcd_test.T).numpy()
            r[label][cv]['pvalue'] = pvalue

            # calculate a random background model, then perform conformalization as above:
            # the means stay in place while the standard deviations are shuffled
            np.random.seed(42)
            p = np.random.permutation(mcd_calib.T.shape[0])
            permuted_ivs = mcd_calib.T.copy()
            permuted_ivs = np.column_stack([permuted_ivs[:,0], permuted_ivs[:,1][p]])
            conf_scores = ScalarConformalScore(reduction='none')(calibration_targets, permuted_ivs).numpy()
            conf_quantile = ScalarConformalQuantile()(calibration_targets, permuted_ivs).numpy()
            # the test-set permutation is seeded with the split number
            np.random.seed(cv)
            p = np.random.permutation(mcd_test.T.shape[0])
            avgs, stds = mcd_test[0], mcd_test[1][p]
            intervals = np.array([avgs - stds * conf_quantile, avgs + stds * conf_quantile]).T
            interval_sizes = intervals[:,1] - intervals[:,0]
            within = (test_targets >= intervals[:,0]) & (test_targets <= intervals[:,1])
            r[label][cv]['rnd_conf_scores'] = conf_scores
            r[label][cv]['rnd_conf_quantile'] = conf_quantile
            r[label][cv]['rnd_intervals'] = intervals
            r[label][cv]['rnd_within'] = within

# Save results

In [None]:
import pickle

In [None]:
# persist the results for the median hold-out test set
with open("../data/MonteCarloDropout_results_with_rnd.pkl", 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(res))

In [None]:
# persist the results for the median calibration sets
with open("../data/MonteCarloDropout_results_calibration_with_rnd.pkl", 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(res_calib))

In [None]:
# persist the results for the sel10 hold-out test set
with open("../data/MonteCarloDropout_results_sel10_with_rnd.pkl", 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(res_sel10))

In [None]:
# persist the results for the sel10 calibration sets
with open("../data/MonteCarloDropout_results_calibration_sel10_with_rnd.pkl", 'wb') as pkl_file:
    pkl_file.write(pickle.dumps(res_calib_sel10))