In [None]:
# default_exp settings

# Settings

> A template for settings

AlphaPept stores all settings in `*.yaml`-files. This notebook contains functions to load, save, and print settings. Additionally, a settings template is defined. For each parameter, the template specifies its type (e.g., float value, list, etc.), its default value, and, where applicable, its allowed range. The idea is to use these definitions to automatically create graphical user interfaces for the settings.

In [None]:
#hide
from nbdev.showdoc import *

## Settings

### Saving and Loading

By default, settings are saved as `*.yaml`-files. These files can easily be modified by opening them in a text editor.

In [None]:
#export
import yaml

def print_settings(settings):
    """
    Print settings to stdout as block-style yaml.

    Args:
        settings: Object to serialize, e.g. a (nested) settings dictionary.
    """
    yaml_text = yaml.dump(settings, default_flow_style=False)
    print(yaml_text)


def load_settings(path):
    """
    Read a yaml settings file and return its parsed content.

    Args:
        path: Location of the yaml file to read.

    Returns:
        The object the yaml parser produces for the file content.
    """
    with open(path, "r") as settings_file:
        # NOTE(review): FullLoader accepts more than plain yaml data; consider
        # yaml.safe_load if settings files can come from untrusted sources.
        loaded_settings = yaml.load(settings_file, Loader=yaml.FullLoader)
    return loaded_settings

def save_settings(settings, path):
    """
    Write settings to a yaml file, keeping the keys in their insertion order.

    An existing file at `path` is overwritten.

    Args:
        settings: Object to serialize, e.g. a (nested) settings dictionary.
        path: Location of the yaml file to write.
    """
    with open(path, "w") as settings_file:
        yaml.dump(settings, settings_file, sort_keys=False)

## Settings Template

The settings template defines individual settings. The idea is to provide a template so that a graphical user interface can be automatically generated. The list below represents what each item would be when using PyQt. This could be adapted for any kind of GUI library.

Each entry has a type, default values, and a description.

* spinbox -> QSpinBox with minimum and maximum values (int)
* doublespinbox -> QDoubleSpinBox with minimum and maximum values (float)
* path -> Clickable button to select a path to save / load files.
* combobox -> QComboBox, dropdown menu with values to choose from
* checkbox -> QCheckBox, checkbox that can be selected
* checkgroup -> Creates a list of QCheckBox options that can be selected
* list -> Creates a list that is displayed
* placeholder -> This just prints the parameter and cannot be changed

In [None]:
#hide
import pandas as pd
from alphapept.constants import protease_dict

# Root of the template: category name -> {setting name -> widget specification}.
SETTINGS_TEMPLATE = {}

# General
general = {}
# Registered once, up front; `general` is filled in place afterwards, so no
# second assignment is needed at the end of the section.
SETTINGS_TEMPLATE["general"] = general
general["ppm"] = {'type':'checkbox', 'default':True, 'description':"Use ppm instead of Dalton."}
general["score"] = {'type':'combobox', 'value':['x_tandem','random_forest'], 'default':'random_forest', 'description':"Scoring method."}
general['n_processes'] = {'type':'spinbox', 'min':1, 'max':60, 'default':60, 'description':"Maximum number of processes for multiprocessing. If larger than number of processors it will be capped."}

# One checkbox per workflow step.
general["create_database"] = {'type':'checkbox', 'default':True, 'description':"Flag to create a database."}
general["import_raw_data"] = {'type':'checkbox', 'default':True, 'description':"Flag to import the raw data."}
general["find_features"] = {'type':'checkbox', 'default':True, 'description':"Flag to perform feature finding."}
general["search_data"] = {'type':'checkbox', 'default':True, 'description':"Flag to perform search."}
general["recalibrate_data"] = {'type':'checkbox', 'default':True, 'description':"Flag to perform recalibration."}
general["align"] = {'type':'checkbox', 'default':True, 'description':"Flag to align the data."}
general["match"] = {'type':'checkbox', 'default':False, 'description':"Flag to perform match-between runs."}
general["lfq_quantification"] = {'type':'checkbox', 'default':True, 'description':"Flag to perform lfq normalization."}

# Experiment
# Result location plus per-file lists; the lists default to empty and the path to None.
experiment = {
    "results_path": {'type':'path','default': None, 'filetype':['hdf'], 'folder':False, 'description':"Path where the results should be stored."},
    "shortnames": {'type':'list','default':[], 'description':"List of shortnames for the raw files."},
    "file_paths": {'type':'list','default':[], 'description':"Filepaths of the experiments."},
    "fractions": {'type':'list','default':[], 'description':"List of fractions for the raw files."},
}

SETTINGS_TEMPLATE["experiment"] = experiment

# Raw
# Options for handling the raw spectra.
raw = {
    "most_abundant": {'type':'spinbox', 'min':1, 'max':1000, 'default':400, 'description':"Number of most abundant peaks to be isolated from raw spectra."},
    "use_profile_ms1": {'type':'checkbox', 'default':False, 'description':"Use profile data for MS1 and perform own centroiding."},
}
SETTINGS_TEMPLATE["raw"] = raw

# Fasta
fasta = {}
SETTINGS_TEMPLATE["fasta"] = fasta

## Read modifications from modifications file
# The table has to provide the columns 'Identifier', 'Type' and 'Description'.
# The path is relative to the working directory of the notebook.
mod_db = pd.read_csv('../modifications.tsv', sep='\t')

# Identifier -> description, split by where the modification applies.
mods = {}           # 'Type' does not mention a terminus
mods_terminal = {}  # peptide terminus
mods_protein = {}   # protein terminus

# Iterate the three needed columns directly instead of materializing one row
# Series per index with .iloc.
for identifier, mod_type, description in zip(mod_db['Identifier'], mod_db['Type'], mod_db['Description']):
    if 'terminus' not in mod_type:
        mods[identifier] = description
    elif 'peptide' in mod_type:
        mods_terminal[identifier] = description
    elif 'protein' in mod_type:
        mods_protein[identifier] = description
    else:
        # A terminus type that names neither peptide nor protein is reported and skipped.
        print('Not understood')
        print(mod_type)

# Every checkgroup receives its own copy of the choices so that the entries do
# not share one mutable dictionary.
fasta["mods_fixed"] = {'type':'checkgroup', 'value':mods.copy(), 'default':['cC'],'description':"Fixed modifications."}
fasta["mods_fixed_terminal"] = {'type':'checkgroup', 'value':mods_terminal.copy(), 'default':[],'description':"Fixed terminal modifications."}
fasta["mods_variable"] = {'type':'checkgroup', 'value':mods.copy(), 'default':['oxM'],'description':"Variable modifications."}
fasta["mods_variable_terminal"] = {'type':'checkgroup', 'value':mods_terminal.copy(), 'default':[], 'description':"Variable terminal modifications."}

fasta["mods_fixed_terminal_prot"] = {'type':'checkgroup', 'value':mods_protein.copy(), 'default':[],'description':"Fixed terminal modifications on proteins."}
fasta["mods_variable_terminal_prot"] = {'type':'checkgroup', 'value':mods_protein.copy(), 'default':[], 'description':"Variable terminal modifications on proteins."}

fasta["num_missed_cleavages"] = {'type':'spinbox', 'min':0, 'max':99, 'default':2, 'description':"Number of missed cleavages."}
fasta["min_length"] = {'type':'spinbox', 'min':6, 'max':99, 'default':6, 'description':"Minimum peptide length."}
fasta["max_length"] = {'type':'spinbox', 'min':6, 'max':99, 'default':27, 'description':"Maximum peptide length."}
fasta["max_isoforms"] = {'type':'spinbox', 'min':1, 'max':4096, 'default':1024, 'description':"Maximum number of isoforms per peptide."}

fasta["pseudo_reverse"] = {'type':'checkbox', 'default':True, 'description':"Use pseudo-reverse strategy instead of reverse."}
fasta["AL_swap"] = {'type':'checkbox', 'default':False, 'description':"Swap A and L for decoy generation."}
fasta["KR_swap"] = {'type':'checkbox', 'default':False, 'description':"Swap K and R (only if terminal) for decoy generation."}

# The combobox offers every key of protease_dict, in the dictionary's own order.
proteases = list(protease_dict)
fasta["protease"] = {'type':'combobox', 'value':proteases, 'default':'trypsin', 'description':"Protease for digestions."}

fasta["spectra_block"] = {'type':'spinbox', 'min':1000, 'max':1000000, 'default':100000, 'description':"Maximum number of sequences to be collected before theoretical spectra are generated."}
fasta["fasta_block"] = {'type':'spinbox', 'min':100, 'max':10000, 'default':1000, 'description':"Number of fasta entries to be processed in one block."}
fasta["save_db"] = {'type':'checkbox', 'default':True, 'description':"Save DB or create on the fly."}
fasta["db_size"] = {'type':'spinbox', 'min':1000, 'max':1000000, 'default':200000, 'description':"Maximum number of fasta entries that are stored in a db."}


fasta["database_path"] = {'type':'path','default':None, 'filetype':['hdf'], 'folder':False, 'description':"Path to library file (.hdf)."}
fasta["fasta_paths"] = {'type':'list','default':[], 'description':"List of paths for FASTA files."}

# Feature Settings
# NOTE(review): unlike the other categories, all entries except
# "search_unidentified" carry no 'description' key.
features = {
    "min_hill_length": {'type':'spinbox', 'min':1, 'max':10, 'default':3},
    "max_gap": {'type':'spinbox', 'min':1, 'max':99, 'default':2},
    "ppm_tol": {'type':'spinbox', 'min':1, 'max':99, 'default':8},
    "smoothing": {'type':'spinbox', 'min':1, 'max':10, 'default':1},
    "max_neighbors": {'type':'spinbox', 'min':1, 'max':10, 'default':4},
    "max_distance": {'type':'doublespinbox', 'min':0.0, 'max':1.0, 'default':0.4},
    "mass_importance": {'type':'spinbox', 'min':1, 'max':1000, 'default':100},
    "search_unidentified": {'type':'checkbox', 'default':False, 'description':"Search MSMS w/o feature."},
}
SETTINGS_TEMPLATE["features"] = features


# Search Settings
# Mass windows, hit thresholds, calibration switches and FDR levels for the search step.
search = {
    "m_offset": {'type':'spinbox', 'min':1, 'max':99, 'default':30, 'description':"Precursor mass offset."},
    "m_tol": {'type':'spinbox', 'min':1, 'max':99, 'default':30, 'description':"MSMS mass offset."},
    "min_frag_hits": {'type':'spinbox', 'min':1, 'max':99, 'default':7, 'description':"Minimum number of fragment hits."},
    "ppm": {'type':'checkbox', 'default':True, 'description':"Use ppm instead of Dalton."},
    "calibrate": {'type':'checkbox', 'default':True, 'description':"Recalibrate masses."},
    "calibration_std": {'type':'spinbox', 'min':1, 'max':5, 'default':3, 'description':"Std range for search after calibration."},
    "parallel": {'type':'checkbox', 'default':True, 'description':"Use parallel processing."},
    "peptide_fdr": {'type':'doublespinbox', 'min':0.0, 'max':1.0, 'default':0.01, 'description':"FDR level for peptides."},
    "protein_fdr": {'type':'doublespinbox', 'min':0.0, 'max':1.0, 'default':0.01, 'description':"FDR level for proteins."},
    "recalibration_min": {'type':'spinbox', 'min':100, 'max':10000, 'default':100, 'description':"Minimum number of datapoints to perform calibration."},
}

SETTINGS_TEMPLATE["search"] = search


# Calibration
# Outlier filtering, neighbor count and per-axis scaling factors.
# NOTE(review): "rt_range" is a doublespinbox whose 'max' is the int 10 while
# its other bounds are floats -- confirm whether 10.0 was intended.
calibration = {
    "outlier_std": {'type':'spinbox', 'min':1, 'max':5, 'default':3, 'description':"Number of std. deviations to filter outliers in psms."},
    "n_neighbors": {'type':'spinbox', 'min':1, 'max':1000, 'default':100, 'description':"Number of neighbors that are used for offset interpolation."},
    "ppm_range": {'type':'spinbox', 'min':1, 'max':1000, 'default':20, 'description':"Scaling factor for mz axis."},
    "rt_range": {'type':'doublespinbox', 'min':0.0, 'max':10, 'default':0.5, 'description':"Scaling factor for rt axis."},
    "mob_range": {'type':'doublespinbox', 'min':0.0, 'max':1.0, 'default':0.3, 'description':"Scaling factor for mobility axis."},
}
SETTINGS_TEMPLATE["calibration"] = calibration

# Matching
# Probability and distance cutoffs for matching.
# NOTE(review): "min_match_d" is a doublespinbox with the int default 3 --
# confirm whether 3.0 was intended.
matching = {
    "min_match_p": {'type':'doublespinbox', 'min':0.001, 'max':1.0, 'default':0.05, 'description':"Minimum probability cutoff for matching"},
    "min_match_d": {'type':'doublespinbox', 'min':0.001, 'max':10.0, 'default':3, 'description': "Minimum distance cutoff for matching"},
}
SETTINGS_TEMPLATE["matching"] = matching

# Quantification
# Switch for max lfq quantification and the column it operates on ('int_sum' is the only choice).
quantification = {
    "max_lfq": {'type':'checkbox', 'default':True, 'description':"Perform max lfq type quantification."},
    "mode": {'type':'combobox', 'value':['int_sum'], 'default':'int_sum', 'description':"Column to perform quantification on."},
}
SETTINGS_TEMPLATE["quantification"] = quantification

# Save everything
# Write the complete template to the package directory, then read it back and
# print it so that the stored result can be inspected in the cell output.
path = "../alphapept/settings_template.yaml"
save_settings(SETTINGS_TEMPLATE, path)

reloaded_template = load_settings(path)
print_settings(reloaded_template)

calibration:
  mob_range:
    default: 0.3
    description: Scaling factor for mobility axis.
    max: 1.0
    min: 0.0
    type: doublespinbox
  n_neighbors:
    default: 100
    description: Number of neighbors that are used for offset interpolation.
    max: 1000
    min: 1
    type: spinbox
  outlier_std:
    default: 3
    description: Number of std. deviations to filter outliers in psms.
    max: 5
    min: 1
    type: spinbox
  ppm_range:
    default: 20
    description: Scaling factor for mz axis.
    max: 1000
    min: 1
    type: spinbox
  rt_range:
    default: 0.5
    description: Scaling factor for rt axis.
    max: 10
    min: 0.0
    type: doublespinbox
experiment:
  file_paths:
    default: []
    description: Filepaths of the experiments.
    type: list
  fractions:
    default: []
    description: List of fractions for the raw files.
    type: list
  results_path:
    default: null
    description: Path where the results should be stored.
    filetype:
    - hdf
    f

In [None]:
#hide
# Reduce the template to plain values: for every category keep only the
# 'default' of each entry, preserving the template's ordering.
settings = {
    category: {key: entry['default'] for key, entry in entries.items()}
    for category, entries in SETTINGS_TEMPLATE.items()
}

path = "../default_settings.yaml"

save_settings(settings, path)

In [None]:
#hide
from nbdev.export import *
# Run nbdev's notebook export; it reports one "Converted <notebook>" line per
# processed notebook (see the cell output).
notebook2script()

Converted 00_settings.ipynb.
Converted 01_chem.ipynb.
Converted 02_io.ipynb.
Converted 03_fasta.ipynb.
Converted 04_feature_finding.ipynb.
Converted 05_search.ipynb.
Converted 06_score.ipynb.
Converted 07_recalibration.ipynb.
Converted 08_quantification.ipynb.
Converted 09_matching.ipynb.
Converted 10_constants.ipynb.
Converted 11_interface.ipynb.
Converted index.ipynb.
