# Parameter Test

Calls AlphaPept for a parameter range and checks performance.

In [1]:
from time import time
%load_ext autoreload
%autoreload 2

## Logger

In [2]:
import sys
import logging
# Create logger
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Create STDERR handler
handler = logging.StreamHandler(sys.stderr)
# ch.setLevel(logging.DEBUG)

# Create formatter and add it to the handler
formatter = logging.Formatter('%(asctime)s %(levelname)-s - %(message)s', "%Y-%m-%d %H:%M:%S")
handler.setFormatter(formatter)

# Set STDERR handler as the only handler 
logger.handlers = [handler]

## Parameter Test

In [3]:
import os

test_dir = './integration_test/'

files = os.listdir('./integration_test/')

raw_files = None
fasta_file = None

for file in files:
    base, ext = os.path.splitext(file)
    
    if ext == '.fasta':
        fasta_file = os.path.abspath(os.path.join(test_dir, file))
        print('Found fasta file: {}'.format(fasta_file))
    elif ext == '.raw':
        if not raw_files:
            raw_files = []
            raw_files.append(os.path.abspath(os.path.join(test_dir, file)))
        else:
            raw_files.append(os.path.abspath(os.path.join(test_dir, file)))
        print('Found thermo file: {}'.format(file))
    elif ext == '.d':
        if not raw_files:
            raw_files = []
            raw_files.append(os.path.abspath(os.path.join(test_dir, file)))
        else:
            raw_files.append(os.path.abspath(os.path.join(test_dir, file)))
        print('Found bruker file: {}'.format(file))
    else:
        print('File not recognized: {}'.format(file))
        
        
#Todo: Later combine files that might belong together (i.e. in folder, or of same type)

File not recognized: 20190402_QX1_SeVW_MA_HeLa_500ng_LC11.npz
Found thermo file: 20190402_QX1_SeVW_MA_HeLa_500ng_LC11.raw
File not recognized: 20190402_QX1_SeVW_MA_HeLa_500ng_LC11_ap.csv
File not recognized: 20190402_QX1_SeVW_MA_HeLa_500ng_LC11_ap.yaml
Found bruker file: 20191209_TIMS05_QC_ADB_AfterSRIGUpgrade_200ng_03_A1_1_1428.d
File not recognized: 20191209_TIMS05_QC_ADB_AfterSRIGUpgrade_200ng_03_A1_1_1428.npz
File not recognized: 20191209_TIMS05_QC_ADB_AfterSRIGUpgrade_200ng_03_A1_1_1428_ap.csv
File not recognized: 20191209_TIMS05_QC_ADB_AfterSRIGUpgrade_200ng_03_A1_1_1428_ap.yaml
Found fasta file: F:\projects\alphapept\integration_test\uniprot_Human_reviewed_March_2019_manual.fasta
File not recognized: uniprot_Human_reviewed_March_2019_manual.npz


In [5]:
from alphapept.settings import load_settings

settings_template = load_settings('settings_template.yaml')


In [6]:
param_group = 'search'
param_key = 'm_tol'

minval = settings_template[param_group][param_key]['min']
maxval = settings_template[param_group][param_key]['max']

In [7]:
from alphapept.settings import load_settings

default_settings = load_settings('default_settings.yaml')

# Deactivate maxlfq and match between runs in the default case as they are not implemented
# Also deactivete library creation
# Otherwise: Activate all functions and deactivate to test

default_settings['quantification']['max_lfq'] = False
default_settings['misc']['match_between_runs'] = False
default_settings['general']['create_library'] = False
default_settings['general']['find_features'] = False
default_settings['raw']['convert_raw'] = False

#Fill with paths from above 
default_settings['fasta']['fasta_path'] = 'F:/projects/alphapept/integration_test/uniprot_Human_reviewed_March_2019_manual.fasta'
default_settings['fasta']['contaminants_path'] = 'F:/projects/alphapept/contaminants.fasta'
default_settings['fasta']['library_path'] =  'F:/projects/alphapept/integration_test/uniprot_Human_reviewed_March_2019_manual.npz'


In [11]:
import copy 
from tqdm.notebook import tqdm as tqdm
import numpy as np

n_checks = 30

test_cases = []

for point in np.linspace(10, maxval, n_checks): #todo: Change minval to get 0 psms
    for file in raw_files:        
        test_case = copy.deepcopy(default_settings)

        test_case['raw']['raw_path'] = file
        base, ext = os.path.splitext(file)
        test_case['raw']['raw_path_npz'] = base + '.npz'
        test_case[param_group][param_key] = point

        test_cases.append(test_case)

In [12]:
from time import time
import numpy as np
from alphapept.runner import alpha_runner

def run_test(settings):
    """
    Run a test with certain settings
    """

    
    def dummy(input):
        pass
    
    with tqdm(total=100, desc='Current Progress', unit='%') as current_pbar:
        with tqdm(total=100, desc='Overall Progress', unit='%') as overall_pbar:

            def current_progress(current):
                to_update = np.round(current*100-current_pbar.n, 2)
                current_pbar.update(to_update)

            def overall_progress(current):
                to_update = np.round(current-overall_pbar.n, 2)
                overall_pbar.update(to_update)
                
                
            start = time()
            df, features = alpha_runner(settings, overall_progress = overall_progress, current_progress = current_progress, CURRENT_TASK = dummy)
            try:
                df, features = alpha_runner(settings, overall_progress = overall_progress, current_progress = current_progress, CURRENT_TASK = dummy)
                error = False
            except Exception as e:
                print('Experienced the following exception:')
                print('-'*100)
                print(e)
                print('-'*100)
                df = None
                error = True
                
            end = time()
            
            
    if df is not None:
        n_proteins = len(set(df['protein']))
        n_sequences = len(set(df['sequence']))
        n_sequence_charges = len(set(df['sequence'] + df['charge'].astype('int').astype('str')))
    else:
        n_proteins = np.nan
        n_sequences = np.nan
        n_sequence_charges = np.nan      
    
    time_minutes = (end-start)/60
    
    return (n_proteins, n_sequences, n_sequence_charges, time_minutes, error)

In [13]:
if False:
    import pandas as pd
    from tqdm.notebook import tqdm as tqdm

    report = []
    start = time()
    for test_case in tqdm(test_cases):

        report.append(tuple([test_case["raw"]["raw_path_npz"], test_case[param_group][param_key]]) + run_test(test_case))

    end = time()

    import pandas as pd

    print('Integration test complete. Time elapsed {:.2f} hours'.format((end-start)/60/60))

    columns = [_[1] for _ in return_bool_settings(test_case)]
    columns.extend(['n_proteins', 'n_sequences', 'n_sequence_charges', 'time_minutes', 'error'])

    pd.DataFrame(report, columns = columns)

    print('Complete')

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Current Progress', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Overall Progress', style=ProgressStyle(description_width=…

2020-05-08 00:39:41 INFO - Checking raw path F:\projects\alphapept\integration_test\20190402_QX1_SeVW_MA_HeLa_500ng_LC11.raw.
2020-05-08 00:39:41 INFO - NPZ for raw file present. Skipping conversion step.
2020-05-08 00:39:41 INFO - Raw path okay.
2020-05-08 00:39:41 INFO - Checking library path.
2020-05-08 00:39:41 INFO - NPZ for library file present.
2020-05-08 00:39:41 INFO - Library path okay.
2020-05-08 00:39:41 INFO - Using Features from raw data
2020-05-08 00:39:59 INFO - First search complete. Compared 30,099,867 spectra and found 95,293 psms.
2020-05-08 00:40:10 INFO - Extracted columns for 95,293 psms.
2020-05-08 00:40:10 INFO - Scoring complete. For 0.01 FDR found 64,653 targets and 646 decoys.
2020-05-08 00:40:10 INFO - Precursor Offset (PPM) is -0.25 (mean), 2.51 (std)
2020-05-08 00:41:04 INFO - Calibration complete. Precursor Offset (PPM) is 0.00 (mean), 0.79 (std)
2020-05-08 00:41:04 INFO - Adjusting search bound to 2.38 ppm.
2020-05-08 00:41:11 INFO - Second search compl





HBox(children=(FloatProgress(value=0.0, description='Current Progress', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='Overall Progress', style=ProgressStyle(description_width=…

2020-05-08 00:43:36 INFO - Checking raw path F:\projects\alphapept\integration_test\20191209_TIMS05_QC_ADB_AfterSRIGUpgrade_200ng_03_A1_1_1428.d.
2020-05-08 00:43:36 INFO - NPZ for raw file present. Skipping conversion step.
2020-05-08 00:43:36 INFO - Raw path okay.
2020-05-08 00:43:36 INFO - Checking library path.
2020-05-08 00:43:36 INFO - NPZ for library file present.
2020-05-08 00:43:36 INFO - Library path okay.
2020-05-08 00:43:36 INFO - Using Features from raw data
2020-05-08 00:44:10 INFO - First search complete. Compared 61,364,862 spectra and found 256 psms.
2020-05-08 00:44:17 INFO - Extracted columns for 256 psms.
2020-05-08 00:44:17 INFO - Scoring complete. For 0.01 FDR found 0 targets and 1 decoys.
2020-05-08 00:44:17 INFO - Precursor Offset (PPM) is 4.39 (mean), nan (std)






  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: arange: cannot compute length