# Runner

This notebook shows an example AlphaPept run by calling the individual functions step by step.

In [None]:

from tqdm import tqdm as tqdm
import alphapept.constants

# `import alphapept.constants` binds the top-level name `alphapept` (not
# `constants`), so the mass dictionary has to be reached through the fully
# qualified module path.
mass_dict = alphapept.constants.mass_dict

# NOTE(review): generate_library, generate_spectra, save_library and kwargs
# are not defined or imported in this cell; presumably they come from an
# earlier import or cell -- confirm before running.

# One progress bar (total=1) is shared by both generation steps.
with tqdm(total=1) as pbar:

    def tqdm_wrapper(value):
        """Move the progress bar to the absolute position ``value``.

        tqdm's ``update`` expects an increment, so the current position
        ``pbar.n`` is subtracted from the requested one.
        """
        pbar.update(value - pbar.n)

    fasta_path = '/Users/mstrauss/Desktop/Human_July2019_with_isoforms_only_swissprot.fasta'

    # Build the library from the FASTA file, then derive the spectra from it.
    to_add, pept_dict, fasta_dict = generate_library(mass_dict, fasta_path, callback = tqdm_wrapper, **kwargs)
    spectra = generate_spectra(to_add, mass_dict, callback = tqdm_wrapper)

    db_path = 'testlib.hd5'

    # Persist spectra and lookup dictionaries to the database file.
    save_library(spectra, pept_dict, fasta_dict, db_path, **kwargs)

## Sanity Checks


In [None]:
import logging

def check_settings(settings):
    """
    Consistency check for settings.

    Looks at the file paths stored in ``settings`` and switches the
    ``convert_raw`` / ``create_library`` flags in ``settings["general"]``
    so that they match what is actually present on disk:

    * If ``settings["raw"]["raw_path_npz"]`` is an existing file, raw
      conversion is switched off. Otherwise, if
      ``settings["raw"]["raw_path"]`` is an existing file, conversion is
      switched on.
    * If ``settings["fasta"]["library_path"]`` is an existing file, library
      creation is switched off. Otherwise, if
      ``settings["fasta"]["fasta_path"]`` is an existing file, library
      creation is switched on.

    Args:
        settings (dict): Nested settings dictionary with the sections
            ``"raw"``, ``"fasta"`` and ``"general"``. It is modified in place.

    Returns:
        dict: The same ``settings`` object that was passed in.

    Raises:
        FileNotFoundError: If neither the converted nor the original file
            exists for the raw data or for the library.
    """
    # Imported locally: the cell defining this function only imports logging,
    # so relying on a module-level ``os`` would raise a NameError.
    import os

    logging.info('Checking raw path.')

    # Check if a valid raw file is provided. If a npz file is also provided do not convert
    raw_section = settings["raw"]
    if os.path.isfile(raw_section["raw_path_npz"]):
        if settings["general"]["convert_raw"]:
            settings["general"]["convert_raw"] = False
            logging.info('NPZ for raw file present. Skipping conversion step.')
    elif os.path.isfile(raw_section["raw_path"]):
        logging.info('No NPZ for raw present. Performing conversion step.')
        settings["general"]["convert_raw"] = True
    else:
        raise FileNotFoundError('No raw or converted raw file provided')

    logging.info('Raw path okay.')

    # Check library file
    logging.info('Checking library path.')

    fasta_section = settings["fasta"]
    if os.path.isfile(fasta_section["library_path"]):
        if settings["general"]["create_library"]:
            settings["general"]["create_library"] = False
            logging.info('NPZ for library file present. Skipping library creation step.')
    elif os.path.isfile(fasta_section["fasta_path"]):
        logging.info('No NPZ for library present. Creating library from scratch.')
        settings["general"]["create_library"] = True
    else:
        raise FileNotFoundError('No fasta or library file provided')

    logging.info('Library path okay.')

    return settings

In [None]:
if False:
    """
        runner
        ~~~~~~~~~~~~~~~~
        Library for running setting files.
        :authors: Maximilian Strauss, Matthias Mann, 2020
        :copyright: Copyright (c) 2020 Mann Lab
    """

    import numpy as np
    import pandas as pd

    from alphapept.fasta import fasta_to_npz

    from alphapept.io import raw_to_npz

    from alphapept.search import query_data_to_features, score_x_tandem, get_psms, get_score_columns

    from alphapept.calibration import get_calibration

    import os

    import logging

    from functools import partial


    def alpha_runner(settings, CALLBACK_GLOBAL = None, CALLBACK_LOCAL = None, CURRENT_TASK = print):
        """
        Run the workflow described by a settings dictionary.

        Steps, in order: settings check, optional library creation, optional
        raw conversion, loading of library and query data, MS1 feature
        finding, first search, score-column extraction and scoring. If
        ``settings['search']['calibrate']`` is set, the features are
        calibrated, the search bound ``settings["search"]["m_offset"]`` is
        overwritten with three times the calibrated offset standard
        deviation, and search, extraction and scoring are repeated.

        Args:
            settings (dict): Nested settings dictionary. It is modified in
                place (flags, ``query_path`` and possibly ``m_offset``).
            CALLBACK_GLOBAL (callable, optional): Receives the overall
                progress value (10, 20, ... 100). Skipped when None.
            CALLBACK_LOCAL (callable, optional): Receives the progress of the
                current step and is reset with 0 after a step. Skipped when
                None.
            CURRENT_TASK (callable): Receives a short description of the step
                that is about to start. Defaults to ``print``.

        Returns:
            tuple: ``(features, df_calib)``. ``features`` are the features
            found before calibration; ``df_calib`` is the scored result of
            the second search, or a copy of the first-search result when no
            calibration is performed.
        """

        def report_global(value):
            """Forward an overall progress value if a global callback was given."""
            if CALLBACK_GLOBAL:
                CALLBACK_GLOBAL(value)

        def report_local(value):
            """Forward a per-step progress value if a local callback was given."""
            if CALLBACK_LOCAL:
                CALLBACK_LOCAL(value)

        def CALLBACK_WRAPPER(global_progress, delta, progress):

            """
            Wrapper function to allow chaining the global callback with the local callback.

            ``global_progress`` is the overall value at the start of the step
            and ``delta`` the share of the overall progress the step covers.
            """
            report_global(global_progress+progress*delta)
            report_local(progress)

        CURRENT_TASK('Checking Settings.')
        settings = check_settings(settings)

        if settings["general"]["create_library"]:
            CURRENT_TASK('Generating library')
            n_sequences, db_path = fasta_to_npz(settings["fasta"])
            logging.info('A total of {:,} theoretical spectra generated. Database saved to {}.'.format(n_sequences, db_path))

        report_global(10)

        from alphapept.io import raw_to_npz

        if settings["general"]["convert_raw"]:
            CURRENT_TASK('Converting raw files.')
            out_path = raw_to_npz(settings["raw"])
            settings["raw"]["query_path"] = out_path[0]
            logging.info('Raw file(s) saved to {}'.format(out_path))

        # NOTE(review): this unconditionally replaces the query_path set from
        # the conversion output above; presumably both point to the same file,
        # otherwise this should be an else branch -- confirm.
        settings["raw"]["query_path"] = settings["raw"]["raw_path_npz"]

        db_data = np.load(settings["fasta"]["library_path"], allow_pickle=True)
        query_data = np.load(settings["raw"]["query_path"], allow_pickle=True)

        report_global(20)

        CURRENT_TASK('Finding MS1 features.')
        features = query_data_to_features(query_data)
        logging.info('A total of {:,} features found.'.format(len(features)))

        report_global(30)

        CURRENT_TASK('Running first search.')

        # The first search covers the overall progress range 30 -> 40.
        callback_wrap = partial(CALLBACK_WRAPPER, 30, 10)

        psms, num_specs_compared = get_psms(query_data, db_data, features, callback=callback_wrap, **settings["search"])
        logging.info('First search complete. Compared {:,} spectra and found {:,} psms.'.format(num_specs_compared, len(psms)))
        report_local(0)

        CURRENT_TASK('Extracting columns for scoring.')
        psms, num_specs_scored = get_score_columns(psms, query_data, db_data, features, **settings["search"])
        logging.info('Extracted columns for {:,} psms.'.format(num_specs_scored))

        CURRENT_TASK('Scoring psms.')
        df = score_x_tandem(pd.DataFrame(psms), plot=False, verbose=False, **settings["search"])
        logging.info('Scoring complete. For {} FDR found {:,} targets and {:,} decoys.'.format(settings["search"]["peptide_fdr"], df['target'].sum(), df['decoy'].sum()) )

        report_global(40)

        if settings['search']['calibrate']:

            CURRENT_TASK('Calibrating features.')
            logging.info('Precursor Offset (PPM) is {:.2f} (mean), {:.2f} (std)'.format(df['o_mass_ppm'].mean(), df['o_mass_ppm'].std()))
            # Calibrating MS1 spectra
            features_calib, df_sub = get_calibration(df, features, **settings["calibration"])

            o_mass_ppm_mean = df_sub['o_mass_ppm_calib'].mean()
            o_mass_ppm_std = df_sub['o_mass_ppm_calib'].std()

            logging.info('Calibration complete. Precursor Offset (PPM) is {:.2f} (mean), {:.2f} (std)'.format(o_mass_ppm_mean, o_mass_ppm_std))

            report_global(50)

            logging.info('Adjusting search bound to {:.2f} ppm.'.format(3*o_mass_ppm_std))

            # The second search uses three standard deviations of the
            # calibrated offset as its bound.
            settings["search"]["m_offset"] = 3*o_mass_ppm_std

            CURRENT_TASK('Running second search.')
            # The second search covers the overall progress range 50 -> 60.
            callback_wrap = partial(CALLBACK_WRAPPER, 50, 10)
            psms, num_specs_compared = get_psms(query_data, db_data, features_calib, callback=callback_wrap, **settings["search"])
            logging.info('Second search complete. Compared {:,} spectra and found {:,} psms.'.format(num_specs_compared, len(psms)))
            report_local(0)
            report_global(60)

            CURRENT_TASK('Extracting columns for scoring.')
            psms, num_specs_scored = get_score_columns(psms, query_data, db_data, features_calib, **settings["search"])
            logging.info('Extracted columns for {:,} psms.'.format(num_specs_scored))

            CURRENT_TASK('Scoring psms.')
            df_calib = score_x_tandem(pd.DataFrame(psms), plot=False, verbose=False, **settings["search"])
            logging.info('Scoring complete. For {} FDR found {:,} targets and {:,} decoys.'.format(settings["search"]["peptide_fdr"], df_calib['target'].sum(), df_calib['decoy'].sum()) )

        else:
            # Without calibration the first-search result is the final result.
            df_calib = df.copy()

        report_global(100)
        report_local(0)

        return features, df_calib


In [None]:
#hide
from nbdev.showdoc import *