In [None]:
from peak_performance import pipeline as pl
from peak_performance import models
from peak_performance import plots

User information

In [None]:
# ---------------------------------------------------------------------------
# Step 1: absolute path to the directory containing the raw data files.
# ---------------------------------------------------------------------------
path = r""

# Collect the names of the raw data files found under `path` and show them,
# so that the per-signal lists below can be filled in the same order.
raw_data_files = pl.detect_npy(path)
print(raw_data_files)

# ---------------------------------------------------------------------------
# Remaining information required from the user
# ---------------------------------------------------------------------------

# Step 2: one Boolean per entry of raw_data_files (same order).
#   True  -> the signal contains a double peak
#   False -> the signal contains a single peak
# Check the signals visually beforehand.
double_peak = []

# Step 3: set to True to check for peaks before fitting/sampling, which can
# save a lot of computation time. When True, an expected retention time has
# to be provided for every signal (step 4).
pre_filtering = True

# Step 4 (only relevant if pre_filtering is True): one retention time
# estimate (float) per signal in raw_data_files. For a double peak, give both
# retention times in chronological order as a tuple of two floats.
retention_time_estimate = []

# Step 5 (only relevant if pre_filtering is True): rough estimate of the
# average peak width in minutes expected for the LC-MS method.
peak_width_estimate = 1

# Step 6 (only relevant if pre_filtering is True): minimum signal-to-noise
# ratio a signal must reach to count as a peak during pre-filtering.
minimum_sn = 5

Pipeline

In [None]:
# Main pipeline: for every raw data file, optionally pre-filter the signal,
# fit a single- or double-peak model, check the sampling result, and collect
# the outcome in df_summary. Signals rejected after sampling are recorded as
# NaN rows so that they remain visible in the summary.

# create data structure and DataFrame(s) for results
# (note: `path` is rebound to the path returned by pl.initiate)
df_summary, path = pl.initiate(path)
for file in raw_data_files:
    # parse the data and extract information from the (standardized) file name
    (
        timeseries,
        acquisition,
        experiment,
        precursor_mz,
        product_mz_start,
        product_mz_end,
    ) = pl.parse_data(file)
    # instantiate the UserInput class with all given information
    ui = pl.UserInput(
        path,
        raw_data_files,
        double_peak,
        retention_time_estimate,
        peak_width_estimate,
        pre_filtering,
        minimum_sn,
        timeseries,
        acquisition,
        experiment,
        precursor_mz,
        product_mz_start,
        product_mz_end,
    )
    # calculate initial guesses for pre-filtering and defining prior probability distributions
    slope_guess, intercept_guess, noise_guess = models.initial_guesses(
        ui.timeseries[0], ui.timeseries[1]
    )
    # apply pre-sampling filter (if selected)
    if pre_filtering:
        prefilter = pl.prefiltering(file, ui, noise_guess)
        if not prefilter:
            # if no peak candidates were found, continue with the next signal
            continue
    # model selection
    # NOTE(review): double_peak is handed to UserInput as the complete list;
    # ui.double_peak presumably resolves to the flag of the current signal - confirm.
    if ui.double_peak:
        pmodel = models.define_model_doublepeak(ui.timeseries[0], ui.timeseries[1])
    else:
        pmodel = models.define_model_skew(ui.timeseries[0], ui.timeseries[1])
    # sample the chosen model
    idata = pl.sampling(pmodel)
    # apply post-sampling filter
    resample, discard = pl.postfiltering(idata, ui)
    # if peak was discarded, record it as NaN and continue with the next signal
    if discard:
        df_summary = pl.report_add_nan_to_summary(ui, df_summary)
        continue
    # if convergence was not yet reached, sample again with more tuning samples
    if resample:
        idata = pl.sampling(pmodel, tune=4000)
        resample, discard = pl.postfiltering(idata, ui)
        if discard:
            # record the discarded signal as NaN as well, so that a discard after
            # re-sampling is reported the same way as a discard after the first run
            df_summary = pl.report_add_nan_to_summary(ui, df_summary)
            continue
        if resample:
            # if signal was flagged for re-sampling a second time, discard it
            # TODO: should this really be discarded or should the contents of idata be added with an additional comment? (would need to add a comment column)
            df_summary = pl.report_add_nan_to_summary(ui, df_summary)
            continue
    # add inference data to df_summary and save it as an Excel file
    df_summary = pl.report_add_data_to_summary(idata, df_summary, ui)
    # perform posterior predictive sampling
    idata = pl.posterior_predictive_sampling(pmodel, idata)
    # save the inference data object in a zip file
    pl.report_save_idata(path, idata)
# save condensed Excel file with area data
pl.report_area_sheet(path, df_summary)