# Runner

This notebook shows how to perform a MaxQuant-style run (library generation, feature finding, search and FDR control) with `alphapept`.

In [None]:
from time import time
%load_ext autoreload
%autoreload 2

## Logger

In [None]:
import sys
import logging
# Root logger: every module that logs through `logging` inherits this setup.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Timestamped single-line records, e.g. "2020-04-20 00:13:34 INFO - message".
formatter = logging.Formatter(
    fmt='%(asctime)s %(levelname)-s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Records are written to STDERR.
handler = logging.StreamHandler(stream=sys.stderr)
handler.setFormatter(formatter)

# Replace (rather than extend) the handler list so that re-running this cell
# does not attach a second handler and duplicate every message.
logger.handlers = [handler]

## Sanity Checks


In [None]:
import logging


def check_enviroment():
    """
    Verify that the installed numba version is at least 0.46.

    Only the major and minor components of ``numba.__version__`` are
    inspected. They are compared as a tuple of integers, not as a float:
    converting '0.100' to a float yields 0.1, which would wrongly be
    rejected as older than 0.46.

    Raises:
        RuntimeError: If the installed numba version is older than 0.46.
    """
    import re
    import numba

    min_version = (0, 46)

    parts = []
    for piece in numba.__version__.split('.')[0:2]:
        # Keep only the leading digits so suffixes such as '46rc1' still parse.
        digits = re.match(r'\d+', piece)
        parts.append(int(digits.group()) if digits else 0)

    # A version string with a single component (e.g. '1') counts as '1.0'.
    while len(parts) < len(min_version):
        parts.append(0)

    if tuple(parts) < min_version:
        raise RuntimeError('Numba version {} not sufficient'.format(numba.__version__))

def check_settings(settings):

    """
    Consistency check for settings.

    Runs the environment check and then validates the raw-data and library
    inputs. The ``general`` flags are updated in place so that later steps
    know whether a conversion / library creation is required:

    * If ``settings["raw"]["raw_path_npz"]`` is an existing file,
      ``convert_raw`` is switched off; otherwise ``settings["raw"]["raw_path"]``
      must exist and ``convert_raw`` is switched on.
    * If ``settings["fasta"]["library_path"]`` is an existing file,
      ``create_library`` is switched off; otherwise
      ``settings["fasta"]["fasta_path"]`` must exist and ``create_library``
      is switched on.

    Args:
        settings (dict): Nested settings with the ``general``, ``raw`` and
            ``fasta`` sections described above.

    Returns:
        dict: The same ``settings`` object that was passed in (mutated).

    Raises:
        RuntimeError: Propagated from ``check_enviroment``.
        FileNotFoundError: If neither a converted nor an original input file
            exists for the raw data or for the library.
    """
    # Imported locally: the defining cell does not import `os`, so relying on
    # another cell having done so would break on a fresh kernel.
    import os

    def _is_file(path):
        # os.path.isfile raises TypeError for None; treat unset paths as missing.
        return bool(path) and os.path.isfile(path)

    check_enviroment()

    logging.info('Checking raw path.')

    # Check if a valid raw file is provided. If a npz file is also provided do not convert
    if _is_file(settings["raw"]["raw_path_npz"]):
        if settings["general"]["convert_raw"]:
            settings["general"]["convert_raw"] = False
            logging.info('NPZ for raw file present. Skipping conversion step.')
    else:
        if _is_file(settings["raw"]["raw_path"]):
            logging.info('No NPZ for raw present. Performing conversion step.')
            settings["general"]["convert_raw"] = True
        else:
            raise FileNotFoundError('No raw or converted raw file provided')

    logging.info('Raw path okay.')

    # Check library file
    logging.info('Checking library path.')

    if _is_file(settings["fasta"]["library_path"]):
        if settings["general"]["create_library"]:
            settings["general"]["create_library"] = False
            logging.info('NPZ for library file present. Skipping library creation step.')
    else:
        if _is_file(settings["fasta"]["fasta_path"]):
            logging.info('No NPZ for library present. Creating library from FASTA.')
            settings["general"]["create_library"] = True
        else:
            raise FileNotFoundError('No FASTA or library file provided')

    logging.info('Library path okay.')

    return settings

## Callbacks

For progress reporting, we employ the following callbacks:

* `CURRENT_TASK`
* `CURRENT_PROGRESS`
* `OVERALL_PROGRESS`


How can we estimate how long a task will take? Maybe we don't need this, or we can add good estimates later.

General Tasks:

* File Conversion - 10%
* Library Generation - 10%
* Feature Finding - 40%
* Search, Recalibration, Search - 30%
* Output Tables - 10%


At a later stage we should be able to add things


**TODO:** fix the progress handling of long-running tasks.

In [None]:
# `tqdm.tqdm_notebook` is deprecated (tqdm emits "Please use `tqdm.notebook.tqdm`
# instead of `tqdm.tqdm_notebook`"), so import the notebook progress bar directly.
from tqdm.notebook import tqdm

from alphapept.settings import settings

# Input files for this run.
# NOTE: these are machine-specific absolute paths; adjust them to your setup.
# The commented-out *.npz entries can be enabled to point at a previously
# converted raw file / previously generated library.
#settings['raw']['raw_path_npz'] = 'F:/rawdata/04_hela_testrun/20190402_QX1_SeVW_MA_HeLa_500ng_LC11.npz'
settings['raw']['raw_path'] = 'F:/rawdata/04_hela_testrun/20190402_QX1_SeVW_MA_HeLa_500ng_LC11.raw'
#settings['fasta']['library_path'] = 'F:/rawdata/zz.database/uniprot_Human_reviewed_March_2019_manual.npz'
settings['fasta']['fasta_path'] = 'F:/rawdata/zz.database/uniprot_Human_reviewed_March_2019_manual.fasta'

# Request library creation from the FASTA file.
settings['general']['create_library'] = True

In [None]:
if False:  # Change to `if True:` to run the full pipeline (long-running)
    import numpy as np
    import os
    from functools import partial
    import pandas as pd

    # Sink for the name of the step that is currently running.
    CURRENT_TASK = print

    # Weight of every pipeline step. The value is handed to `progress_wrapper`
    # as `delta`, i.e. the amount the overall bar advances once the step is done.
    time_dict = {}

    time_dict['generate_library'] = 2
    time_dict['generate_spectra'] = 3
    time_dict['raw_to_npz'] = 5
    time_dict['raw_to_centroid'] = 1
    time_dict['get_hills'] = 1
    time_dict['split_hills'] = 1
    time_dict['filter_hills'] = 1
    time_dict['get_hill_data'] = 1
    time_dict['get_edges'] = 1
    time_dict['get_isotope_patterns'] = 1
    time_dict['feature_finder_report'] = 1
    time_dict['get_psms'] = 5

    with tqdm(total=100, desc='Current Progress', unit='%') as current_progress:
        with tqdm(total=100, desc='Overall Progress', unit='%') as overall_progress:

            # Overall progress contributed by the steps that already finished.
            progress = 0

            def progress_wrapper(current, delta=1):
                """
                Wrapper function to change the overall progress with the current progress

                Args:
                    current (float): Fraction of the running step that is done;
                        a value of exactly 1.0 marks the step as finished.
                    delta (float): Weight of the running step on the overall bar.
                """

                # This cell runs at notebook (module) scope, so the `progress`
                # assigned above is a module-level global.
                global progress

                # tqdm's `update` expects an increment, hence the difference
                # between the target value and the bar's current position.
                to_update = np.round(current*100-current_progress.n, 2)
                current_progress.update(to_update)

                overall = progress + current*delta
                to_update = np.round(overall-overall_progress.n, 2)

                overall_progress.update(to_update)

                # Book the finished step so the next one starts from here.
                if current == 1.0:
                    progress += delta


            from alphapept.constants import mass_dict


            CURRENT_TASK('Checking Settings')
            settings = check_settings(settings)

            # --- Library generation (only when no library file exists yet) ---
            if settings["general"]["create_library"]:
                from alphapept.fasta import generate_library, generate_spectra, save_library

                CURRENT_TASK('Digesting FASTA')
                to_add, pept_dict, fasta_dict = generate_library(mass_dict, callback = partial(progress_wrapper, delta=time_dict['generate_library']), **settings['fasta'])
                logging.info('Digested {:,} proteins and generated {:,} peptides'.format(len(fasta_dict), len(to_add)))

                CURRENT_TASK('Generating Spectra')
                spectra = generate_spectra(to_add, mass_dict, callback = partial(progress_wrapper, delta=time_dict['generate_spectra']))
                logging.info('Generated {:,} spectra'.format(len(spectra)))

                CURRENT_TASK('Saving library')
                # The library is stored next to the FASTA file, with an .npz extension.
                base = os.path.splitext(settings['fasta']['fasta_path'])[0]
                settings['fasta']['library_path'] = base + '.npz'
                library_path = save_library(spectra, pept_dict, fasta_dict, **settings['fasta'])
                logging.info('Database saved to {}. Filesize {:.2f} Gb'.format(library_path, os.stat(library_path).st_size/(1024**3)))


            # --- Raw file conversion (only when no converted file exists yet) ---
            if settings["general"]["convert_raw"]:
                # Imported only when a conversion is actually requested.
                from alphapept.io import raw_to_npz

                CURRENT_TASK('Converting raw files')
                out_path = raw_to_npz(settings["raw"], callback=partial(progress_wrapper, delta=time_dict['raw_to_npz']))
                # `out_path` is indexed below, i.e. it is a sequence of paths;
                # only the first entry is searched.
                settings["raw"]["query_path"] = out_path[0]
                logging.info('Raw file(s) saved to {}'.format(out_path))
            else:
                settings["raw"]["query_path"] = settings["raw"]["raw_path_npz"]

            # SECURITY NOTE: allow_pickle=True unpickles objects stored in the
            # archive, which can execute arbitrary code. Only load files that
            # were produced by this pipeline or come from a trusted source.
            db_data = np.load(settings["fasta"]["library_path"], allow_pickle=True)
            query_data = np.load(settings["raw"]["query_path"], allow_pickle=True)


            #Feature Finding Part

            from alphapept.feature_finding import raw_to_centroid, get_hills, split_hills, filter_hills, get_hill_data, get_edges, get_isotope_patterns, feature_finder_report
            from alphapept.constants import averagine_aa, isotopes


            CURRENT_TASK('Converting centroids')
            centroids = raw_to_centroid(query_data, callback=partial(progress_wrapper, delta=time_dict['raw_to_centroid']))
            logging.info('Loaded {:,} centroids.'.format(len(centroids)))

            CURRENT_TASK('Exctracting hills')
            completed_hills = get_hills(centroids, callback=partial(progress_wrapper, delta=time_dict['get_hills']))
            logging.info('A total of {:,} hills extracted. Average hill length {:.2f}'.format(len(completed_hills), np.mean([len(_) for _ in completed_hills])))

            CURRENT_TASK('Splitting hills')
            splitted_hills = split_hills(completed_hills, centroids, smoothing=1, callback=partial(progress_wrapper, delta=time_dict['split_hills']))
            logging.info('Split {:,} hills into {:,} hills'.format(len(completed_hills), len(splitted_hills)))

            CURRENT_TASK('Refining hills')
            filtered_hills = filter_hills(splitted_hills, centroids, callback=partial(progress_wrapper, delta=time_dict['filter_hills']))
            logging.info('Filtered {:,} hills. Remaining {:,} hills'.format(len(splitted_hills), len(filtered_hills)))

            CURRENT_TASK('Calculating hill statistics')
            sorted_hills, sorted_stats, sorted_data = get_hill_data(filtered_hills, centroids, callback=partial(progress_wrapper, delta=time_dict['get_hill_data']))
            logging.info('Extracting hill stats complete')

            CURRENT_TASK('Connecting pre isotope patterns')
            pre_isotope_patterns = get_edges(sorted_stats, sorted_data, callback=partial(progress_wrapper, delta=time_dict['get_edges']))
            logging.info('Found {} pre isotope patterns.'.format(len(pre_isotope_patterns)))

            CURRENT_TASK('Deisotope patterns')
            isotope_patterns, isotope_charges = get_isotope_patterns(pre_isotope_patterns, sorted_stats, sorted_data, averagine_aa, isotopes, callback=partial(progress_wrapper, delta=time_dict['get_isotope_patterns']))
            logging.info('Extracted {} isotope patterns.'.format(len(isotope_patterns)))

            CURRENT_TASK('Calculating feature statistics')
            feature_table = feature_finder_report(isotope_patterns, isotope_charges, sorted_stats, sorted_data, sorted_hills, query_data, callback=partial(progress_wrapper, delta=time_dict['feature_finder_report']))
            logging.info('Report complete.')

            from alphapept.matching import match_ms2
            features = match_ms2(feature_table, query_data)


            # Search part

            from alphapept.search import get_psms, get_score_columns
            from alphapept.score import score_x_tandem

            CURRENT_TASK('Running first search.')
            psms, num_specs_compared = get_psms(query_data, db_data, features, callback=partial(progress_wrapper, delta=time_dict['get_psms']), **settings["search"])
            logging.info('First search complete. Compared {:,} spectra and found {:,} psms.'.format(num_specs_compared, len(psms)))

            CURRENT_TASK('Extracting columns for scoring.')
            psms, num_specs_scored = get_score_columns(psms, query_data, db_data, features, **settings["search"])
            logging.info('Extracted columns for {:,} psms.'.format(num_specs_scored))


            CURRENT_TASK('Scoring psms.')
            df = score_x_tandem(pd.DataFrame(psms), plot=False, verbose=False, **settings["search"])
            logging.info('Scoring complete. For {} FDR found {:,} targets and {:,} decoys.'.format(settings["search"]["peptide_fdr"], df['target'].sum(), df['decoy'].sum()) )


            # --- Optional recalibration followed by a second, narrower search ---
            if settings['search']['calibrate']:

                from alphapept.recalibration import get_calibration

                CURRENT_TASK('Calibrating features.')
                logging.info('Precursor Offset (PPM) is {:.2f} (mean), {:.2f} (std)'.format(df['o_mass_ppm'].mean(), df['o_mass_ppm'].std()))
                # Calibrating MS1 spectra
                features_calib, df_sub = get_calibration(df, features, **settings["calibration"])

                o_mass_ppm_mean = df_sub['o_mass_ppm_calib'].mean()
                o_mass_ppm_std = df_sub['o_mass_ppm_calib'].std()

                logging.info('Calibration complete. Precursor Offset (PPM) is {:.2f} (mean), {:.2f} (std)'.format(o_mass_ppm_mean, o_mass_ppm_std))

                logging.info('Adjusting search bound to {:.2f} ppm.'.format(3*o_mass_ppm_std))

                # The second search uses three standard deviations of the
                # calibrated precursor offset as its mass offset setting.
                settings["search"]["m_offset"] = 3*o_mass_ppm_std

                CURRENT_TASK('Running second search.')
                psms, num_specs_compared = get_psms(query_data, db_data, features_calib, callback=partial(progress_wrapper, delta=time_dict['get_psms']), **settings["search"])
                logging.info('Second search complete. Compared {:,} spectra and found {:,} psms.'.format(num_specs_compared, len(psms)))

                CURRENT_TASK('Extracting columns for scoring.')
                psms, num_specs_scored = get_score_columns(psms, query_data, db_data, features_calib, **settings["search"])
                logging.info('Extracted columns for {:,} psms.'.format(num_specs_scored))

                CURRENT_TASK('Scoring psms.')
                df = score_x_tandem(pd.DataFrame(psms), plot=False, verbose=False, **settings["search"])
                logging.info('Scoring complete. For {} FDR found {:,} targets and {:,} decoys.'.format(settings["search"]["peptide_fdr"], df['target'].sum(), df['decoy'].sum()) )

            ## Protein Groups and FDR control

            from alphapept.score import cut_global_fdr, perform_protein_grouping, get_x_tandem_score, filter_score

            CURRENT_TASK('Scoring')
            # Start again from the raw PSMs of the most recent search; the
            # scored frame from above is replaced here.
            df = pd.DataFrame(psms)
            df['score'] = get_x_tandem_score(df)
            # A sequence whose last character is lower case is flagged as decoy.
            df['decoy'] = df['sequence'].str[-1].str.islower()
            df = filter_score(df)

            CURRENT_TASK('FDR control on peptides')
            df = cut_global_fdr(df, analyte_level='sequence',  plot=False, verbose=False)
            logging.info('Scoring peptides complete. For {} FDR found {:,} targets and {:,} decoys.'.format(settings["search"]["peptide_fdr"], df['target'].sum(), df['decoy'].sum()) )


            CURRENT_TASK('Perform protein grouping')
            # The dictionaries are stored as 0-d object arrays in the npz file; `.item()` unwraps them.
            df = perform_protein_grouping(df, db_data['pept_dict'].item(), db_data['fasta_dict'].item())
            df = cut_global_fdr(df, analyte_level='protein',  plot=False, verbose=False)
            logging.info('Scoring proteins complete. For {} FDR found {:,} targets and {:,} decoys. A total of {:,} proteins found.'.format(settings["search"]["protein_fdr"], df['target'].sum(), df['decoy'].sum(), len(set(df['protein']))))

            CURRENT_TASK('Saving')

            # Results are written next to the query file as <name>_ap.csv.
            base = os.path.splitext(settings['raw']['query_path'])[0]
            out_path = base+'_ap.csv'
            df.to_csv(out_path, index = False)
            logging.info('Saved to {}'.format(out_path))


    CURRENT_TASK('COMPLETE')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=100, desc='Current Progress', unit='%') as current_progress:


HBox(children=(FloatProgress(value=0.0, description='Current Progress', style=ProgressStyle(description_width=…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm(total=100, desc='Overall Progress', unit='%') as overall_progress:


HBox(children=(FloatProgress(value=0.0, description='Overall Progress', style=ProgressStyle(description_width=…

2020-04-20 00:13:34 INFO - Checking raw path.
2020-04-20 00:13:34 INFO - No NPZ for raw present. Performing conversion step.
2020-04-20 00:13:34 INFO - Raw path okay.
2020-04-20 00:13:34 INFO - Checking library path.
2020-04-20 00:13:34 INFO - No NPZ for library present. Creating library from FASTA.
2020-04-20 00:13:34 INFO - Library path okay.


Checking Settings
Digesting FASTA


2020-04-20 00:15:01 INFO - Digested 20,418 proteins and generated 5,331,318 peptides


Generating Spectra


2020-04-20 00:17:13 INFO - Generated 5,331,318 spectra


Saving library


2020-04-20 00:17:50 INFO - Database saved to F:/rawdata/zz.database/uniprot_Human_reviewed_March_2019_manual.npz. Filesize 3.31 Gb
2020-04-20 00:17:50 INFO - Imported existing <module 'comtypes.gen' from 'C:\\ProgramData\\Anaconda3\\envs\\alphap\\lib\\site-packages\\comtypes\\gen\\__init__.py'>
2020-04-20 00:17:50 INFO - Using writeable comtypes cache directory: 'C:\ProgramData\Anaconda3\envs\alphap\lib\site-packages\comtypes\gen'


Converting raw files
Raw File saved to F:/rawdata/04_hela_testrun/20190402_QX1_SeVW_MA_HeLa_500ng_LC11.npz


2020-04-20 00:19:33 INFO - Raw file(s) saved to ['F:/rawdata/04_hela_testrun/20190402_QX1_SeVW_MA_HeLa_500ng_LC11.npz']


Converting centroids


2020-04-20 00:19:50 INFO - Loaded 13,230 centroids.


Exctracting hills


Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'centroids' of function 'connect_centroids_forward'.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types
[1m
File "alphapept\feature_finding.py", line 189:[0m
[1m@njit
[1mdef connect_centroids_forward(centroids, max_centroids, max_gap, ppm_tol):
[0m[1m^[0m[0m
[0m
Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'centroids' of function 'connect_centroids_backward'.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types
[1m
File "alphapept\feature_finding.py", line 232:[0m
[1m@njit
[1mdef connect_centroids_backward(centroids, max_centroids, max_gap, ppm_tol):
[0m[1m^[0m[0m
[0m
2020-04-20 00:25:04 INFO - A total of 992,051 hills ex

Splitting hills


2020-04-20 00:26:37 INFO - Split 992,051 hills into 1,239,960 hills


Refining hills


2020-04-20 00:27:05 INFO - Filtered 1,239,960 hills. Remaining 1,221,467 hills


Calculating hill statistics


Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'hill_data' of function 'get_hill_data_numba'.

For more information visit http://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types
[1m
File "alphapept\feature_finding.py", line 581:[0m
[1m@njit
[1mdef get_hill_data_numba(hill_data):
[0m[1m^[0m[0m
[0m
2020-04-20 00:31:13 INFO - Extracting hill stats complete


Connecting pre isotope patterns


2020-04-20 00:32:42 INFO - Found 200012 pre isotope patterns.


Deisotope patterns


2020-04-20 00:34:47 INFO - Extracted 218786 isotope patterns.


Calculating feature statistics


2020-04-20 00:35:31 INFO - Report complete.
  query_mz = np.log(query_data['mono_mzs2'])*1e6/ppm_range
2020-04-20 00:35:32 INFO - Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-04-20 00:35:32 INFO - NumExpr defaulting to 8 threads.


Running first search.


2020-04-20 00:36:06 INFO - First search complete. Compared 52,615,866 spectra and found 225,653 psms.


Extracting columns for scoring.


2020-04-20 00:36:38 INFO - Extracted columns for 225,653 psms.


Scoring psms.


2020-04-20 00:36:39 INFO - Scoring complete. For 0.01 FDR found 56,515 targets and 565 decoys.
2020-04-20 00:36:39 INFO - Precursor Offset (PPM) is -3.81 (mean), 5.11 (std)


Calibrating features.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['o_mass_ppm_offset'] = f2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['o_mass_ppm_calib'] = (df_sub['o_mass_ppm']-df_sub['o_mass_ppm_offset'])
2020-04-20 00:37:12 INFO - Calibration complete. Precursor Offset (PPM) is 0.01 (mean), 1.74 (std)
2020-04-20 00:37:12 INFO - Adjusting search bound to 5.22 ppm.


Running second search.


2020-04-20 00:37:24 INFO - Second search complete. Compared 14,653,853 spectra and found 116,850 psms.


Extracting columns for scoring.


2020-04-20 00:37:43 INFO - Extracted columns for 116,850 psms.


Scoring psms.


2020-04-20 00:37:44 INFO - Scoring complete. For 0.01 FDR found 53,821 targets and 538 decoys.


Scoring
FDR control on peptides


2020-04-20 00:37:45 INFO - Scoring peptides complete. For 0.01 FDR found 53,366 targets and 451 decoys.


Perform protein grouping


2020-04-20 00:39:18 INFO - Scoring proteins complete. For 0.01 FDR found 51,463 targets and 72 decoys. A total of 5,829 proteins found.


Saving


2020-04-20 00:39:20 INFO - Saved to F:/rawdata/04_hela_testrun/20190402_QX1_SeVW_MA_HeLa_500ng_LC11_ap.csv




COMPLETE
