In [None]:
#default_exp protein.fasta

In [None]:
#export
# for nbdev_build_docs
# import os
# __file__ = os.path.expanduser('~/Workspace/AlphaBase/alphabase/protein/fasta.py')

In [None]:
#hide
# notebook does not have __file__
import alphabase.protein.fasta
__file__ = alphabase.protein.fasta.__file__

# Fasta processing and peptide generation

In [None]:
#export

import regex as re
import numpy as np
import pandas as pd
import numba
import os
import itertools
from Bio import SeqIO
from typing import Union

from alphabase.yaml_utils import load_yaml
from alphabase.io.hdf import HDF_File
from alphabase.utils import explode_multiple_columns

# Protease name -> cleavage-site regex pattern, loaded from the
# `protease.yaml` file that sits next to this module.
protease_dict = load_yaml(
    os.path.join(
        os.path.dirname(
            __file__
        ), 
        'protease.yaml'
    )
)

In [None]:
#hide
p = re.compile(protease_dict['trypsin'])

idx = '0123456789012345678901234567890123456789012345'
seq = 'ABDNGKENGLANGIXHGRKTNGLANGKVHNAKHNARKANGKPFAAT'
cut_pos = np.array([m.start()+1 for m in p.finditer(seq)])
assert np.all(cut_pos==np.array([6, 18, 19, 27, 32, 36, 37]))

In [None]:
#export
def get_uniprot_gene_name(description:str):
    """Extract the gene name from a UniProt-style FASTA description.

    The gene name is the token that follows ` GN=`; it ends at the next
    space, or at the end of the description when `GN=` is the last field.

    Args:
        description (str): description part of a FASTA header.

    Returns:
        str: the gene name, or '' if the description has no ` GN=` field.
    """
    idx = description.find(' GN=')
    if idx == -1:
        return ''
    idx += 4
    end = description.find(' ', idx)
    # No space after the gene name: `GN=` is the last field, so take the
    # rest of the line instead of slicing with -1 (which drops a character).
    if end == -1:
        end = len(description)
    return description[idx:end]

def read_fasta_file(fasta_filename:str=""):
    """
    Read a FASTA file entry by entry.

    Args:
        fasta_filename (str): path of the FASTA file.

    Yields:
        dict: protein information with the keys `protein_id`, `full_name`,
            `gene_name`, `description` and `sequence`.
    """
    with open(fasta_filename, "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Headers such as `sp|Q9H9K5|MER34_HUMAN` carry the accession
            # in the second pipe-separated field; otherwise use the name.
            parts = record.id.split("|")
            if len(parts) > 1:
                protein_id = parts[1]
            else:
                protein_id = record.name

            yield {
                "protein_id": protein_id,
                "full_name": record.name,
                "gene_name": get_uniprot_gene_name(record.description),
                "description": record.description,
                "sequence": str(record.seq),
            }

def load_all_proteins(fasta_file_list:list):
    """Load the proteins of one or several FASTA files into a dict.

    Args:
        fasta_file_list (list): FASTA file paths. A single path given as
            a `str` is also accepted and treated as a one-element list.

    Returns:
        dict: protein_id -> protein entry as yielded by `read_fasta_file`.
            If a protein_id occurs more than once, the last entry wins.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(fasta_file_list, str):
        fasta_file_list = [fasta_file_list]
    protein_dict = {}
    for fasta in fasta_file_list:
        for protein in read_fasta_file(fasta):
            protein_dict[protein['protein_id']] = protein
    return protein_dict

def concat_proteins(protein_dict:dict, sep='$')->str:
    """Concatenate all protein sequences into a single sequence, 
    separated by `sep` ($ by default).

    The result starts and ends with `sep`. As a side effect, each entry
    of `protein_dict` gets an `offset` key holding the start position of
    its sequence in the concatenated string (valid for a one-character
    `sep`).

    Args:
        protein_dict (dict): protein_dict by read_fasta_file()
        sep (str): separator placed before, between and after sequences.

    Returns:
        str: concatenated sequence separated by `sep`.
    """
    seq_list = ['']
    seq_count = 1
    for key in protein_dict:
        protein_dict[key]['offset'] = seq_count
        seq_list.append(protein_dict[key]['sequence'])
        # Advance by the sequence length plus one separator character.
        seq_count += len(protein_dict[key]['sequence'])+1
    seq_list.append('')
    return sep.join(seq_list)

In [None]:
#hide
assert get_uniprot_gene_name('sp|Q9H9K5|MER34_HUMAN Endogenous retroviral envelope protein HEMO OS=Homo sapiens OX=9606 GN=ERVMER34-1 PE=1 SV=1') == 'ERVMER34-1'

In [None]:
#export
@numba.njit
def cleave_sequence_with_cut_pos(
    sequence:str,
    cut_pos:np.ndarray,
    n_missed_cleavages:int=2,
    pep_length_min:int=6,
    pep_length_max:int=45,
)->tuple:
    """
    Cleave a sequence at the given cut positions (cut_pos) and keep the
    peptides whose length lies within [pep_length_min, pep_length_max].

    Every cut position is paired, as a peptide start, with up to
    `n_missed_cleavages+1` of the cut positions that follow it.

    Args:
        sequence (str): protein sequence
        cut_pos (np.array): cut positions determined by a given protease.
            NOTE(review): the early `break` below assumes the positions
            are sorted in ascending order - confirm against callers.
        n_missed_cleavages (int): the number of max missed cleavages.
        pep_length_min (int): min peptide length.
        pep_length_max (int): max peptide length.
    Returns:
        list (str): cleaved peptide sequences with missed cleavages.
        list (int): number of missed cleavages of each peptide.
        list (bool): if the peptide starts at position 0 (N-term peptide)
        list (bool): if the peptide ends at len(sequence) (C-term peptide)
    """
    seq_list = []
    miss_list = []
    nterm_list = []
    cterm_list = []
    for i,start_pos in enumerate(cut_pos):
        # n_miss counts the cut positions skipped between start and end
        for n_miss,end_pos in enumerate(
            cut_pos[i+1:i+2+n_missed_cleavages]
        ):
            if end_pos > start_pos + pep_length_max:
                # too long; later end positions are not tried any more
                break
            elif end_pos < start_pos + pep_length_min:
                # too short; a later end position may still be long enough
                continue
            else:
                seq_list.append(sequence[start_pos:end_pos])
                miss_list.append(n_miss)
                if start_pos == 0:
                    nterm_list.append(True)
                else:
                    nterm_list.append(False)
                if end_pos == len(sequence):
                    cterm_list.append(True)
                else:
                    cterm_list.append(False)
    return seq_list, miss_list, nterm_list, cterm_list

class Digest(object):
    """In-silico digestion of protein sequences with a protease regex."""

    def __init__(self,
        protease:str='trypsin',
        max_missed_cleavages:int=2,
        peptide_length_min:int=6,
        peptide_length_max:int=45,
    ):
        """
        Args:
            protease (str): a key of `protease_dict`, or, if it is not a
                known key, a regular expression used directly as the
                cleavage pattern.
            max_missed_cleavages (int): max number of missed cleavages.
            peptide_length_min (int): min peptide length.
            peptide_length_max (int): max peptide length.
        """
        self.n_miss_cleave = max_missed_cleavages
        self.peptide_length_min = peptide_length_min
        self.peptide_length_max = peptide_length_max
        if protease in protease_dict:
            self.regex_pattern = re.compile(
                protease_dict[protease]
            )
        else:
            self.regex_pattern = re.compile(
                protease
            )

    def cleave_sequence(self,
        sequence:str,
    )->tuple:
        """
        Cleave a sequence.

        If the sequence starts with 'M', every peptide that is a prefix
        of the sequence and longer than `peptide_length_min` is added a
        second time without the leading 'M' (flagged as N-term peptide).

        Args:
            sequence (str): the given (protein) sequence.
        Returns:
            list (str): cleaved peptide sequences with missed cleavages.
            list (int): number of missed cleavages of each peptide.
            list (bool): if N-term peptide
            list (bool): if C-term peptide
        """

        cut_pos = [0]
        cut_pos.extend(
            m.start()+1 for m in 
            self.regex_pattern.finditer(sequence)
        )
        # A pattern that matches the last residue already yields
        # len(sequence) as a cut position. Adding it a second time would
        # emit the C-terminal peptides twice, the second copy with an
        # inflated missed-cleavage count.
        if cut_pos[-1] != len(sequence):
            cut_pos.append(len(sequence))
        cut_pos = np.array(cut_pos, dtype=np.int64)

        (
            seq_list, miss_list, nterm_list, cterm_list
        ) = cleave_sequence_with_cut_pos(
            sequence, cut_pos, 
            self.n_miss_cleave,
            self.peptide_length_min,
            self.peptide_length_max,
        )
        # Consider M loss at protein N-term
        if sequence.startswith('M'):
            # Walk only over the peptides produced by the cleavage itself:
            # the lists grow inside the loop and the appended M-loss
            # variants must not be visited again.
            for idx in range(len(seq_list)):
                seq = seq_list[idx]
                if (
                    sequence.startswith(seq) 
                    and len(seq)>self.peptide_length_min
                ):
                    seq_list.append(seq[1:])
                    miss_list.append(miss_list[idx])
                    nterm_list.append(True)
                    cterm_list.append(cterm_list[idx])
        return seq_list, miss_list, nterm_list, cterm_list

In [None]:
#hide
seq = 'MABCDEKHIJKLNOPQRST'
digest = Digest()
seq_list, miss_list, nterm_list, cterm_list = digest.cleave_sequence(seq)
assert len(seq_list) == len(miss_list) == len(nterm_list) == len(cterm_list)
M_start_seqs = [seq for seq in seq_list if seq.startswith('M')]
assert len(M_start_seqs)*2 == len([_ for _ in nterm_list if _])
assert np.all(nterm_list[-len(M_start_seqs):])
T_end_seqs = [seq for seq in seq_list if seq.endswith('T')]
assert len(T_end_seqs) == len([_ for _ in cterm_list if _])

In [None]:
#export
def get_fix_mods(
    sequence:str,
    fix_mod_aas:str,
    fix_mod_dict:dict
)->tuple:
    """
    Collect the fixed modifications of a sequence.

    Args:
        sequence (str): peptide sequence
        fix_mod_aas (str): AAs that carry a fixed modification
        fix_mod_dict (dict): AA -> modification name

    Returns:
        tuple: (mods, mod_sites), both `;`-joined strings; the sites are
            1-based positions of the modified AAs in the sequence.
    """
    hits = [
        (str(pos), fix_mod_dict[aa])
        for pos, aa in enumerate(sequence, 1)
        if aa in fix_mod_aas
    ]
    joined_mods = ';'.join(mod for _, mod in hits)
    joined_sites = ';'.join(pos for pos, _ in hits)
    return joined_mods, joined_sites

In [None]:
seq = 'ACBCDCK'
_fix_mod_dict = {}
_fix_mod_dict['C'] = 'mod@C'
mods, mod_sites = get_fix_mods(seq, 'C', _fix_mod_dict)
assert mods==';'.join(['mod@C']*3)
assert mod_sites=='2;4;6'
get_fix_mods(seq, 'C', _fix_mod_dict)

('mod@C;mod@C;mod@C', '2;4;6')

In [None]:
#export
def get_candidate_sites(
    sequence:str, target_mod_aas:str
)->list:
    """Find the positions of a sequence that may carry a modification.

    Args:
        sequence (str): peptide sequence
        target_mod_aas (str): AAs that may have modifications

    Returns:
        list: candidate mod sites in alphabase format 
            (0: N-term, -1: C-term, 1-n: others); this function only 
            returns the 1-n AA positions.
    """
    return [
        pos for pos, aa in enumerate(sequence, start=1)
        if aa in target_mod_aas
    ]

def get_var_mod_sites(
    sequence:str,
    target_mod_aas:str,
    max_var_mod: int,
    max_combs: int,
)->list:
    """get all combinations of variable modification sites

    Combinations are listed by increasing number of modified sites
    (all single sites first, then pairs, ...) and the list is cut off
    after `max_combs` entries. The cap also applies to the single-site
    combinations, and no combination is returned for `max_var_mod < 1`.

    Args:
        sequence (str): peptide sequence
        target_mod_aas (str): AAs that may have modifications
        max_var_mod (int): max number of mods in a sequence
        max_combs (int): max number of combinations for a sequence

    Returns:
        list: list of combinations of (tuple) modification sites 
    """
    candidate_sites = get_candidate_sites(
        sequence, target_mod_aas
    )
    # Lazily chain the combinations for 1..max_var_mod sites so that the
    # `max_combs` limit is enforced across all of them.
    all_combs = itertools.chain.from_iterable(
        itertools.combinations(candidate_sites, n_var_mod)
        for n_var_mod in range(1, max_var_mod+1)
    )
    # islice() rejects a negative stop; treat it as "no combinations".
    return list(itertools.islice(all_combs, max(max_combs, 0)))

In [None]:
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
assert np.all(np.array(candidate_sites)==np.array([2,4,5,6,7]))
get_var_mod_sites(seq, 'MSTY', 3, 20)

[(2,),
 (4,),
 (5,),
 (6,),
 (7,),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (4, 5),
 (4, 6),
 (4, 7),
 (5, 6),
 (5, 7),
 (6, 7),
 (2, 4, 5),
 (2, 4, 6),
 (2, 4, 7),
 (2, 5, 6),
 (2, 5, 7)]

In [None]:
#export
import copy
def get_var_mods_per_sites_multi_mods_on_aa(
    sequence:str,
    mod_sites:tuple,
    var_mod_dict:dict
)->list:
    """
    Used only when the var mod list contains 
    more than one mods on the same AA, for example:
    Mod1@A, Mod2@A ...

    Here `var_mod_dict` maps an AA to the list of its modification
    names. Returns one `;`-joined mod string for every way of choosing
    a modification at each of the `mod_sites` (1-based positions).
    """
    combos = ['']
    for site in mod_sites:
        mods_at_site = var_mod_dict[sequence[site-1]]
        # Extend every partial string by every mod available at this
        # site; the mod varies slowest so the result order is stable.
        combos = [
            prefix + mod_name + ';'
            for mod_name in mods_at_site
            for prefix in combos
        ]
    # strip the trailing ';'
    return [combo[:-1] for combo in combos]

def get_var_mods_per_sites_single_mod_on_aa(
    sequence:str,
    mod_sites:tuple,
    var_mod_dict:dict
)->list:
    """
    Used when the var mod list contains 
    only one mods on the each AA, for example:
    Mod1@A, Mod2@D ...

    Here `var_mod_dict` maps an AA to its single modification name.
    Returns a one-element list holding the `;`-joined mod names of the
    AAs at `mod_sites` (1-based positions).
    """
    mod_names = (
        var_mod_dict[sequence[site-1]] for site in mod_sites
    )
    return [';'.join(mod_names)]

get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa

def get_var_mods(
    sequence:str,
    var_mod_aas:str,
    mod_dict:dict,
    max_var_mod:int,
    max_combs:int,
    keep_unmodified:bool=False,
)->tuple:
    """
    Generate all modification combinations and associated sites
    for the sequence.

    The mod strings of each site combination come from the module-level
    `get_var_mods_per_sites`, which is looked up at call time.

    Returns:
        tuple: (list of `;`-joined mod strings, list of `;`-joined site
            strings), aligned by position. With `keep_unmodified`, an
            empty ('' / '') entry is appended at the end.
    """
    ret_mods = []
    ret_sites_list = []
    for site_comb in get_var_mod_sites(
        sequence, var_mod_aas, 
        max_var_mod, max_combs
    ):
        mods_of_comb = get_var_mods_per_sites(
            sequence, site_comb, mod_dict
        )
        sites_str = ';'.join(map(str, site_comb))
        ret_mods += mods_of_comb
        # one site string per mod string generated for this combination
        ret_sites_list += [sites_str]*len(mods_of_comb)
    if keep_unmodified:
        ret_mods.append('')
        ret_sites_list.append('')
    return ret_mods, ret_sites_list

In [None]:
#hide
get_var_mods_per_sites = get_var_mods_per_sites_multi_mods_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
_mod_dict = {
    'M':['mod@M'],
    'S':['mod@S','modX@S'],
    'T':['mod@T'],
    'Y':['mod@Y'],
}
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

assert 'mod@M;mod@M;mod@S' in _mods
assert _sites[_mods.index('mod@M;mod@M;mod@S')] == '2;4;5'
assert 'mod@M;mod@M;modX@S' in _mods
assert _sites[_mods.index('mod@M;mod@M;modX@S')] == '2;4;5'
assert 'mod@M;mod@S' in _mods
assert 'mod@M;modX@S' in _mods
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'modX@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'modX@S;mod@T',
  'mod@S;mod@Y',
  'modX@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S',
  'mod@M;mod@M;modX@S'],
 ['2',
  '4',
  '5',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;6',
  '5;7',
  '5;7',
  '6;7',
  '2;4;5',
  '2;4;5'])

In [None]:
#hide
get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
_mod_dict = {
    'M':'mod@M',
    'S':'mod@S',
    'T':'mod@T',
    'Y':'mod@Y',
}
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)
assert len(_mods) == len(_sites) == 16
assert _sites[_mods.index('mod@M;mod@M;mod@S')] == '2;4;5'
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'mod@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S'],
 ['2',
  '4',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;7',
  '6;7',
  '2;4;5'])

In [None]:
#export
def parse_term_mod(term_mod_name:str):
    """Split a terminal modification name into (site AA, terminus).

    `'Acetyl@Protein N-term'` gives `('', 'Protein N-term')` and
    `'Gln->pyro-Glu@Q^Any N-term'` gives `('Q', 'Any N-term')`.
    """
    _, term = term_mod_name.split('@')
    if '^' not in term:
        # no AA restriction on the terminus
        return '', term
    return tuple(term.split('^'))

In [None]:
#hide
assert parse_term_mod('Acetyl@Protein N-term') == ('', 'Protein N-term')
assert parse_term_mod('Gln->pyro-Glu@Q^Any N-term') == ('Q', 'Any N-term')

In [None]:
#export
def add_single_peptide_labeling(
    seq:str,
    mods:str,
    mod_sites:str, 
    label_aas:str, 
    label_mod_dict:dict, 
    nterm_label_mod:str, 
    cterm_label_mod:str
):
    """Append labeling modifications to the mods of a single peptide.

    A terminal label is only added if the peptide does not already
    carry a modification on that terminus (site '0' for the N-term,
    '-1' for the C-term). AA labels are added for every AA of `seq`
    found in `label_aas`.

    Returns:
        tuple: (mods, mod_sites) as `;`-joined strings, ordered as:
            existing mods, N-term label, C-term label, AA labels.
    """
    if mod_sites:
        occupied_sites = mod_sites.split(';')
        mod_list, mod_site_list = [mods], [mod_sites]
    else:
        occupied_sites = []
        mod_list, mod_site_list = [], []

    if nterm_label_mod and '0' not in occupied_sites:
        mod_list.append(nterm_label_mod)
        mod_site_list.append('0')
    if cterm_label_mod and '-1' not in occupied_sites:
        mod_list.append(cterm_label_mod)
        mod_site_list.append('-1')

    aa_labels, aa_label_sites = get_fix_mods(seq, label_aas, label_mod_dict)
    if aa_labels:
        mod_list.append(aa_labels)
        mod_site_list.append(aa_label_sites)
    return ';'.join(mod_list), ';'.join(mod_site_list)

def parse_labels(labels:list):
    """Sort label modifications into AA labels and terminal labels.

    Args:
        labels (list): label modification names, e.g. 
            ['Dimethyl@Any N-term','Dimethyl@K'].

    Returns:
        tuple: (label_aas, label_mod_dict, nterm_label_mod, 
            cterm_label_mod). Labels whose site is neither a single AA,
            'Any N-term' nor 'Any C-term' are ignored.
    """
    aa_to_label = {}
    aas = []
    nterm_label = ''
    cterm_label = ''
    for label in labels:
        _, target = label.split('@')
        if len(target) == 1:
            aas.append(target)
            aa_to_label[target] = label
        elif target == 'Any N-term':
            nterm_label = label
        elif target == 'Any C-term':
            cterm_label = label
    return ''.join(aas), aa_to_label, nterm_label, cterm_label
        
def create_labeling_peptide_df(peptide_df:pd.DataFrame, labels:list):
    """Return a copy of `peptide_df` with labeling modifications added.

    Args:
        peptide_df (pd.DataFrame): peptides with the columns `sequence`,
            `mods` and `mod_sites`. It is not modified.
        labels (list): label modification names, see `parse_labels`.

    Returns:
        pd.DataFrame: copy of `peptide_df` whose `mods` and `mod_sites`
            columns include the labels.
    """
    df = peptide_df.copy()
    # Nothing to label; also avoids unpacking an empty result below.
    if len(df) == 0:
        return df

    (
        label_aas, label_mod_dict, 
        nterm_label_mod, cterm_label_mod
    ) = parse_labels(labels)

    labeled = [
        add_single_peptide_labeling(
            seq, mods, mod_sites, label_aas, label_mod_dict, 
            nterm_label_mod, cterm_label_mod
        )
        for seq, mods, mod_sites in zip(
            df['sequence'].values, 
            df['mods'].values, 
            df['mod_sites'].values,
        )
    ]
    df['mods'] = [mods for mods, _ in labeled]
    df['mod_sites'] = [mod_sites for _, mod_sites in labeled]

    return df

In [None]:
#hide
labels = ['label@Any N-term','label@K']
(
    label_aas, label_mod_dict, 
    nterm_label_mod, cterm_label_mod
) = parse_labels(labels)
assert add_single_peptide_labeling(
    'ABCK','','', label_aas, label_mod_dict, 
    nterm_label_mod, cterm_label_mod
) == ('label@Any N-term;label@K', '0;4')
assert add_single_peptide_labeling(
    'ABCK','Mod@Any N-term','0', label_aas, label_mod_dict, 
    nterm_label_mod, cterm_label_mod
) == ('Mod@Any N-term;label@K', '0;4')
assert add_single_peptide_labeling(
    'KBCK','','', label_aas, label_mod_dict, 
    nterm_label_mod, cterm_label_mod
) == ('label@Any N-term;label@K;label@K', '0;1;4')
assert add_single_peptide_labeling(
    'KBCK','Mod@Any N-term','0', label_aas, label_mod_dict, 
    nterm_label_mod, cterm_label_mod
) == ('Mod@Any N-term;label@K;label@K', '0;1;4')
pep_df = pd.DataFrame({
    'sequence': ['ABCD','ABCK','KABK','EFGK'],
    'mods': ['']*3+['Mod@Any N-term'],
    'mod_sites': ['']*3+['0']
})
df = create_labeling_peptide_df(pep_df, labels)
assert np.all(df.mods.values!=pep_df.mods.values)
assert df[df.sequence=='ABCD'].mods.values[0] == 'label@Any N-term'
assert df[df.sequence=='ABCD'].mod_sites.values[0] == '0'
assert df[df.sequence=='ABCK'].mods.values[0] == 'label@Any N-term;label@K'
assert df[df.sequence=='ABCK'].mod_sites.values[0] == '0;4'
assert df[df.sequence=='KABK'].mods.values[0] == 'label@Any N-term;label@K;label@K'
assert df[df.sequence=='KABK'].mod_sites.values[0] == '0;1;4'
assert df[df.sequence=='EFGK'].mods.values[0] == 'Mod@Any N-term;label@K'
assert df[df.sequence=='EFGK'].mod_sites.values[0] == '0;4'
df = create_labeling_peptide_df(pep_df, [])
assert np.all(df.mods.values==pep_df.mods.values)

In [None]:
#export
def protein_idxes_to_names(protein_idxes:str, protein_names:list):
    """Translate `;`-joined protein indices into `;`-joined names.

    Empty names are left out of the result.
    """
    if len(protein_idxes) == 0:
        return ''
    names = [
        protein_names[int(idx)] 
        for idx in protein_idxes.split(';')
    ]
    return ';'.join(name for name in names if name)

In [None]:
#hide
assert protein_idxes_to_names('0;1', ['A','','B'])=='A'
assert protein_idxes_to_names('0;1', ['A','C','B'])=='A;C'

In [None]:
#export
def append_regular_modifications(df:pd.DataFrame, 
    var_mods = ['Phospho@S','Phospho@T','Phospho@Y'], 
    max_mod_num=1, max_combs=100,
    keep_unmodified=True,
)->pd.DataFrame:
    """
    Append regular (not N/C-term) variable modifications to the 
    existing modifications of each sequence in `df`.

    The input `df` is not modified; a new dataframe is returned.

    Args:
        df (pd.DataFrame): Precursor dataframe with the columns 
            `sequence`, `mods` and `mod_sites`.
        var_mods (list, optional): Considered variable modification list. 
            Only one modification per AA is kept. 
            Defaults to ['Phospho@S','Phospho@T','Phospho@Y'].
        max_mod_num (int, optional): Maximal modification number for 
            each sequence of the `var_mods`. Defaults to 1.
        max_combs (int, optional): One sequence is only allowed to explode 
            to `max_combs` number of modified peptides. Defaults to 100.
        keep_unmodified (bool, optional): If unmodified (only referring 
            to `var_mods`) peptides also remain in the returned 
            dataframe. Defaults to True.

    Returns:
        pd.DataFrame: The precursor_df with `var_mods` appended.
    """
    # The last character of a regular modification name is its AA.
    mod_dict = {mod[-1]: mod for mod in var_mods}
    var_mod_aas = ''.join(mod_dict.keys())

    # Work on a private copy: the helper columns and the row filtering
    # below must not leak into the caller's dataframe.
    df = df.copy()

    if len(df) > 0:
        (
            df['mods_app'],
            df['mod_sites_app']
        ) = zip(*df.sequence.apply(get_var_mods, 
                var_mod_aas=var_mod_aas, mod_dict=mod_dict, 
                max_var_mod=max_mod_num, max_combs=max_combs,
                keep_unmodified=keep_unmodified
            )
        )
        if not keep_unmodified:
            # sequences without any candidate site yield no peptide
            df = df[df.mods_app.apply(len) > 0]

    # Empty input, or nothing left after filtering: nothing to explode.
    if len(df) == 0:
        df = df.drop(
            columns=['mods_app', 'mod_sites_app'], errors='ignore'
        )
        return df.reset_index(drop=True)

    df = df.explode(['mods_app','mod_sites_app'])
    if keep_unmodified:
        # NOTE(review): fills NaN in every column, not only the appended
        # ones; kept as is for backward compatibility.
        df.fillna('', inplace=True)

    df['mods'] = df[['mods','mods_app']].apply(
        lambda x: ';'.join(i for i in x if i), axis=1
    )
    df['mod_sites'] = df[['mod_sites','mod_sites_app']].apply(
        lambda x: ';'.join(i for i in x if i), axis=1
    )
    df = df.drop(columns=['mods_app', 'mod_sites_app'])
    return df.reset_index(drop=True)

In [None]:
#export
from alphabase.spectral_library.library_base import SpecLibBase

class FastaLib(SpecLibBase):
    def __init__(self,
        charged_frag_types:list = [
            'b_z1','b_z2','y_z1', 'y_z2'
        ],
        protease:str = 'trypsin',
        max_missed_cleavages:int = 2,
        peptide_length_min:int = 7,
        peptide_length_max:int = 35,
        precursor_charge_min:int = 2,
        precursor_charge_max:int = 4,
        precursor_mz_min:float = 200.0, 
        precursor_mz_max:float = 2000.0,
        var_mods:list = ['Acetyl@Protein N-term','Oxidation@M'],
        max_var_mod_num:int = 2,
        fix_mods:list = ['Carbamidomethyl@C'],
        decoy: str = None, # or pseudo_reverse or diann
        I_to_L=False,
    ):
        """
        Args:
            charged_frag_types (list): forwarded to `SpecLibBase`.
            protease (str): protease name or regex, see `Digest`.
            max_missed_cleavages (int): max missed cleavages for `Digest`.
            peptide_length_min (int): min peptide length for `Digest`.
            peptide_length_max (int): max peptide length for `Digest`.
            precursor_charge_min (int): stored as `min_precursor_charge`.
            precursor_charge_max (int): stored as `max_precursor_charge`.
            precursor_mz_min (float): forwarded to `SpecLibBase`.
            precursor_mz_max (float): forwarded to `SpecLibBase`.
            var_mods (list): variable modification names. Names of the 
                form `Mod@X` (single AA) are AA mods, all others are 
                parsed as terminal mods.
            max_var_mod_num (int): max number of variable AA mods 
                per peptide.
            fix_mods (list): fixed modification names, same format 
                as `var_mods`.
            decoy (str): forwarded to `SpecLibBase`.
            I_to_L (bool): if True, proteins are digested from a copy of 
                their sequence with 'I' replaced by 'L'.
        """
        super().__init__(
            charged_frag_types=charged_frag_types,
            precursor_mz_min=precursor_mz_min,
            precursor_mz_max=precursor_mz_max,
            decoy=decoy
        )
        self.protein_df:pd.DataFrame = pd.DataFrame()
        self.I_to_L = I_to_L
        self.max_mod_combinations = 100
        self._digest = Digest(
            protease, max_missed_cleavages,
            peptide_length_min, peptide_length_max
        )
        self.min_precursor_charge = precursor_charge_min
        self.max_precursor_charge = precursor_charge_max

        # Copy the lists: the defaults are shared between all calls and
        # must not be reachable (and mutable) through an instance.
        self.var_mods = list(var_mods)
        self.fix_mods = list(fix_mods)
        self.max_var_mod_num = max_var_mod_num

        self.fix_mod_aas = ''
        self.fix_mod_prot_nterm_dict = {}
        self.fix_mod_prot_cterm_dict = {}
        self.fix_mod_pep_nterm_dict = {}
        self.fix_mod_pep_cterm_dict = {}
        self.fix_mod_dict = {}

        def _set_term_mod(term_mod,
            prot_nterm, prot_cterm, pep_nterm, pep_cterm,
            allow_conflicts
        ):
            # Register a terminal mod in the dict of its terminus type.
            def _set_dict(term_dict,site,mod,
                allow_conflicts
            ):
                # With allow_conflicts, several mods may share a site and
                # are collected in a list; otherwise the last one wins.
                if allow_conflicts:
                    if site in term_dict:
                        term_dict[site].append(mod)
                    else:
                        term_dict[site] = [mod]
                else:
                    term_dict[site] = mod
            site, term = parse_term_mod(term_mod)
            if term == "Any N-term":
                _set_dict(pep_nterm, site, term_mod, 
                    allow_conflicts
                )
            elif term == 'Protein N-term':
                _set_dict(prot_nterm, site, term_mod, 
                    allow_conflicts
                )
            elif term == 'Any C-term':
                _set_dict(pep_cterm, site, term_mod, 
                    allow_conflicts
                )
            elif term == 'Protein C-term':
                _set_dict(prot_cterm, site, term_mod, 
                    allow_conflicts
                )
        
        for mod in fix_mods:
            # `Mod@X`: the '@' is followed by exactly one character
            if mod.find('@')+2 == len(mod):
                self.fix_mod_aas += mod[-1]
                self.fix_mod_dict[mod[-1]] = mod
            else:
                _set_term_mod(
                    mod, 
                    self.fix_mod_prot_nterm_dict,
                    self.fix_mod_prot_cterm_dict,
                    self.fix_mod_pep_nterm_dict,
                    self.fix_mod_pep_cterm_dict,
                    allow_conflicts=False
                )

        self.var_mod_aas = ''
        self.var_mod_prot_nterm_dict = {}
        self.var_mod_prot_cterm_dict = {}
        self.var_mod_pep_nterm_dict = {}
        self.var_mod_pep_cterm_dict = {}
        self.var_mod_dict = {}

        # NOTE: `get_var_mods_per_sites` is module-level state. Rebinding
        # it here affects every caller of `get_var_mods` in the process,
        # including other FastaLib instances.
        global get_var_mods_per_sites
        multi_mods_on_aa = self._check_if_multi_mods_on_aa(var_mods)
        for mod in var_mods:
            if mod.find('@')+2 != len(mod): continue
            aa = mod[-1]
            # a fixed mod on this AA takes precedence
            if aa in self.fix_mod_dict: continue
            self.var_mod_aas += aa
            if multi_mods_on_aa:
                # AA -> list of mods
                if aa in self.var_mod_dict:
                    self.var_mod_dict[aa].append(mod)
                else:
                    self.var_mod_dict[aa] = [mod]
            else:
                # AA -> single mod
                self.var_mod_dict[aa] = mod
        if multi_mods_on_aa:
            get_var_mods_per_sites = get_var_mods_per_sites_multi_mods_on_aa
        else:
            get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa
        
        for mod in var_mods:
            if mod.find('@')+2 < len(mod):
                _set_term_mod(
                    mod, 
                    self.var_mod_prot_nterm_dict,
                    self.var_mod_prot_cterm_dict,
                    self.var_mod_pep_nterm_dict,
                    self.var_mod_pep_cterm_dict,
                    allow_conflicts=True
                )

    def _check_if_multi_mods_on_aa(self, var_mods):
        mod_set = set()
        for mod in var_mods:
            if mod.find('@')+2 == len(mod):
                if mod[-1] in mod_set: return True
                mod_set.add(mod[-1])
        return False

    def import_and_process_fasta(self, fasta_files:Union[str,list]):
        """Load proteins from fasta file(s) and build the peptide library.

        Args:
            fasta_files (Union[str,list]): a fasta path or a list of 
                fasta paths.
        """
        # A single path must be wrapped; otherwise the string would be 
        # iterated character by character as a list of file names.
        if isinstance(fasta_files, str):
            fasta_files = [fasta_files]
        protein_dict = load_all_proteins(fasta_files)
        self.import_and_process_protein_dict(protein_dict)

    def import_and_process_protein_dict(self, protein_dict:dict):
        """Digest the proteins of `protein_dict` into peptides, then run 
        `_process_after_load_pep_seqs` on them.

        Args:
            protein_dict (dict): proteins, e.g. as returned by 
                `load_all_proteins`.
        """
        self.get_peptides_from_protein_dict(protein_dict)
        self._process_after_load_pep_seqs()

    def import_and_process_peptide_sequences(self, 
        pep_seq_list:list, protein_list=None,
    ):
        """Load the given peptide sequences (no digestion), then run 
        `_process_after_load_pep_seqs` on them.

        Args:
            pep_seq_list (list): peptide sequences.
            protein_list (list, optional): value for the `protein_name` 
                column of the precursor dataframe. Defaults to None.
        """
        self.get_peptides_from_peptide_sequence_list(
            pep_seq_list, protein_list
        )
        self._process_after_load_pep_seqs()

    def _process_after_load_pep_seqs(self):
        """
        Called by `import_and_process_...` methods. 

        Runs, in this order: `append_decoy_sequence`, `add_modifications` 
        and `add_charge`. 
        NOTE(review): `append_decoy_sequence` and `add_charge` are not 
        defined in the visible part of this class - presumably inherited.
        """
        self.append_decoy_sequence()
        self.add_modifications()
        self.add_charge()

    def get_peptides_from_fasta(self, fasta_file:Union[str,list]):
        """Load peptide sequences from one or several fasta files.

        Args:
            fasta_file (Union[str,list]): could be a fasta path 
              or a list of fasta paths
        """
        fasta_files = (
            [fasta_file] if isinstance(fasta_file, str) else fasta_file
        )
        self.get_peptides_from_fasta_list(fasta_files)

    def get_peptides_from_fasta_list(self, fasta_files:list):
        """Load peptide sequences from fasta file list

        All proteins of the files are loaded with `load_all_proteins` 
        and passed on to `get_peptides_from_protein_dict`.

        Args:
            fasta_files (list): fasta file list
        """
        protein_dict = load_all_proteins(fasta_files)
        self.get_peptides_from_protein_dict(protein_dict)

    def get_peptides_from_protein_dict(self, protein_dict:dict):
        """Store the proteins as `self.protein_df` and cleave them.

        With `self.I_to_L` set, a `sequence_I2L` column (every 'I' 
        replaced by 'L') is added and used for the digestion instead of 
        the `sequence` column.

        Args:
            protein_dict (dict): key -> protein entry (a dict with at 
                least a `sequence` item).
        """
        proteins = pd.DataFrame.from_dict(
            protein_dict, orient='index'
        )
        self.protein_df = proteins.reset_index(drop=True)

        digest_seq = 'sequence'
        if self.I_to_L:
            digest_seq = 'sequence_I2L'
            self.protein_df[
                digest_seq
            ] = self.protein_df.sequence.str.replace('I','L')

        self._cleave_to_peptides(
            self.protein_df,
            protein_seq_column=digest_seq
        )

    def _cleave_to_peptides(self, 
        protein_df:pd.DataFrame,
        protein_seq_column:str='sequence'
    ):
        pep_dict = {}

        for i,prot_seq in enumerate(
            protein_df[protein_seq_column].values
        ):
            (
                seq_list, miss_list, nterm_list, cterm_list
            ) = self._digest.cleave_sequence(prot_seq)
            for seq,miss,nterm,cterm in zip(
                seq_list,miss_list,nterm_list, cterm_list
            ):
                prot_id = str(i)
                if seq in pep_dict:
                    if not pep_dict[seq][0].endswith(prot_id):
                        pep_dict[seq][0] += ';'+prot_id
                    if nterm:
                        pep_dict[seq][2] = nterm
                    if cterm:
                        pep_dict[seq][3] = cterm
                else:
                    pep_dict[seq] = [prot_id,miss,nterm,cterm]
        self._precursor_df = pd.DataFrame().from_dict(
            pep_dict, orient='index', columns = [
                'protein_idxes','miss_cleavage',
                'is_prot_nterm','is_prot_cterm'
            ]
        )
        self._precursor_df.reset_index(drop=False, inplace=True)
        self._precursor_df.rename(
            columns={'index':'sequence'}, inplace=True
        )
        self._precursor_df['mods'] = ''
        self._precursor_df['mod_sites'] = ''
        self.refine_df()

    def append_protein_name(self):
        """Add a `proteins` column (and a `genes` column, if 
        `protein_df` has gene names) to `self._precursor_df`.

        The `;`-joined indices in `protein_idxes` are translated with 
        `protein_idxes_to_names`. Does nothing if `protein_df` has no 
        `protein_id` or the precursors have no `protein_idxes`.
        """
        if not (
            'protein_id' in self.protein_df and 
            'protein_idxes' in self._precursor_df
        ): 
            return

        protein_idxes = self._precursor_df['protein_idxes']
        self._precursor_df['proteins'] = protein_idxes.apply(
            protein_idxes_to_names,
            protein_names=self.protein_df['protein_id'].values
        )

        if 'gene_name' not in self.protein_df.columns:
            return
        self._precursor_df['genes'] = protein_idxes.apply(
            protein_idxes_to_names,
            protein_names=self.protein_df['gene_name'].values
        )

    def get_peptides_from_peptide_sequence_list(self, 
        pep_seq_list:list,
        protein_list:list = None
    ):
        """Replace `self._precursor_df` by a dataframe built from the 
        given peptide sequences (no digestion).

        Both protein-terminus flags are set to False for all peptides, 
        and `self.refine_df()` is called at the end.

        Args:
            pep_seq_list (list): peptide sequences.
            protein_list (list, optional): stored as the `protein_name` 
                column if given. Defaults to None.
        """
        self._precursor_df = pd.DataFrame()
        self._precursor_df['sequence'] = pep_seq_list
        if protein_list is not None:
            self._precursor_df['protein_name'] = protein_list
        self._precursor_df['is_prot_nterm'] = False
        self._precursor_df['is_prot_cterm'] = False
        self.refine_df()

    def add_mods_for_one_seq(self, sequence:str, 
        is_prot_nterm, is_prot_cterm
    ):
        """Generate all modification variants of one peptide sequence.

        The variants are the product of the fixed AA mods, the N-terminal 
        variable mod choices (including "none") and the variable AA mod 
        combinations (including "none"). `is_prot_cterm` is currently 
        not used.

        Args:
            sequence (str): peptide sequence.
            is_prot_nterm (bool): if the peptide is at a protein N-term.
            is_prot_cterm (bool): if the peptide is at a protein C-term.

        Returns:
            tuple: (list of `;`-joined mod strings, list of `;`-joined 
                mod site strings), aligned by position.
        """
        fix_mod_str, fix_site_str = get_fix_mods(
            sequence, self.fix_mod_aas, self.fix_mod_dict
        )
        #TODO add prot and pep C-term fix mods
        #TODO add prot and pep N-term fix mods

        # a single "choice" for the fixed mods; sites are blanked 
        # whenever there is no fixed mod string
        if len(fix_mod_str) == 0:
            fix_site_str = ''
        fix_mods = [fix_mod_str]
        fix_mod_sites = [fix_site_str]

        var_mods_list, var_mod_sites_list = get_var_mods(
            sequence, self.var_mod_aas, self.var_mod_dict, 
            self.max_var_mod_num, self.max_mod_combinations-1, # 1 for unmodified
            keep_unmodified=True
        )

        # '' stands for "no N-terminal variable mod"
        nterm_var_mods = ['']
        nterm_dicts = []
        if is_prot_nterm:
            nterm_dicts.append(self.var_mod_prot_nterm_dict)
        nterm_dicts.append(self.var_mod_pep_nterm_dict)
        for term_dict in nterm_dicts:
            if len(term_dict) == 0:
                continue
            # key '': any first AA; otherwise the mod needs that first AA
            for site_aa in ('', sequence[0]):
                if site_aa in term_dict:
                    nterm_var_mods.extend(term_dict[site_aa])
        nterm_var_mod_sites = ['']+['0']*(len(nterm_var_mods)-1)

        #TODO add prot and pep C-term var mods

        mods_variants = [
            ';'.join([i for i in items if i])
            for items in itertools.product(
                fix_mods, nterm_var_mods, var_mods_list
            )
        ]
        sites_variants = [
            ';'.join([i for i in items if i])
            for items in itertools.product(
                fix_mod_sites, nterm_var_mod_sites, var_mod_sites_list
            )
        ]
        return mods_variants, sites_variants

    def add_modifications(self):
        """
        Expand `self._precursor_df` into one row per modification
        combination, in place.

        Missing 'is_prot_nterm' / 'is_prot_cterm' columns are created as
        False. For every row, `self.add_mods_for_one_seq` returns the list
        of modification strings and the list of site strings; both lists
        are stored in the 'mods' and 'mod_sites' columns and then expanded
        with `explode_multiple_columns`. The index is reset afterwards.

        An empty precursor table only receives empty 'mods' and
        'mod_sites' columns.
        """
        for flag_col in ('is_prot_nterm', 'is_prot_cterm'):
            if flag_col not in self._precursor_df.columns:
                self._precursor_df[flag_col] = False

        if len(self._precursor_df) == 0:
            # Without rows there is nothing to unpack into the two columns
            # below (the unpacking would raise ValueError), so just make
            # sure the expected columns exist.
            self._precursor_df['mods'] = ''
            self._precursor_df['mod_sites'] = ''
            self._precursor_df.reset_index(drop=True, inplace=True)
            return

        # Iterate the three columns directly instead of a row-wise apply.
        mods_and_sites = [
            self.add_mods_for_one_seq(sequence, is_prot_nterm, is_prot_cterm)
            for sequence, is_prot_nterm, is_prot_cterm in zip(
                self._precursor_df['sequence'].values,
                self._precursor_df['is_prot_nterm'].values,
                self._precursor_df['is_prot_cterm'].values,
            )
        ]
        (
            self._precursor_df['mods'],
            self._precursor_df['mod_sites']
        ) = zip(*mods_and_sites)
        self._precursor_df = explode_multiple_columns(
            self._precursor_df,
            ['mods','mod_sites']
        )
        self._precursor_df.reset_index(drop=True, inplace=True)

    def add_additional_modifications(self,
        var_mods = ['Phospho@S','Phospho@T','Phospho@Y'],
        max_mod_num=1, max_combs=100,
        keep_unmodified=True,
    ):
        """
        Replace `self._precursor_df` with the result of
        `append_regular_modifications` applied to `self.precursor_df`.

        Args:
            var_mods (list of str): modifications to append.
              Defaults to ['Phospho@S','Phospho@T','Phospho@Y'].
            max_mod_num (int): forwarded as `max_mod_num`. Defaults to 1.
            max_combs (int): forwarded as `max_combs`. Defaults to 100.
            keep_unmodified (bool): forwarded as `keep_unmodified`.
              Defaults to True.
        """
        forwarded_options = dict(
            var_mods=var_mods,
            max_mod_num=max_mod_num,
            max_combs=max_combs,
            keep_unmodified=keep_unmodified,
        )
        self._precursor_df = append_regular_modifications(
            self.precursor_df, **forwarded_options
        )

    def add_peptide_labeling(self, labeling_channel_dict:dict):
        """
        Add labeling onto the peptides of `self._precursor_df`.

        For every channel, `create_labeling_peptide_df` is applied to the
        current `self._precursor_df` with the labels of that channel, and
        the channel name is stored in the 'label_channel' column. The
        per-channel tables are concatenated into the new
        `self._precursor_df` with a fresh 0..n-1 index.

        Args:
            labeling_channel_dict (dict of list): for example:
              {
                  'reference': [], # not labelled for reference
                  'light': ['Dimethyl@Any N-term','Dimethyl@K'],
                  'medium': ['Dimethyl:2H(4)@Any N-term','Dimethyl:2H(4)@K'],
                  'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
              }.
              The keys can be arbitrary distinct strings, and every value
              must be a list of strings. If the dict is empty,
              `self._precursor_df` is left unchanged.
        """
        if len(labeling_channel_dict) == 0:
            # pd.concat([]) raises "No objects to concatenate"; with no
            # channel requested there is nothing to label.
            return

        channel_dfs = []
        for channel, labels in labeling_channel_dict.items():
            channel_df = create_labeling_peptide_df(self._precursor_df, labels)
            channel_df['label_channel'] = channel
            channel_dfs.append(channel_df)
        self._precursor_df = pd.concat(channel_dfs, ignore_index=True)

    def add_charge(self):
        """
        Expand `self._precursor_df` into one row per charge state.

        Every precursor is repeated for each charge from
        `self.min_precursor_charge` to `self.max_precursor_charge`
        (both included). The 'charge' column is stored as int8 and the
        index is reset.
        """
        charge_states = np.arange(
            self.min_precursor_charge,
            self.max_precursor_charge+1
        )
        n_precursors = len(self._precursor_df)
        # Give every row the full list of charges, then explode it into rows.
        self._precursor_df['charge'] = [
            charge_states for _ in range(n_precursors)
        ]
        self._precursor_df = self._precursor_df.explode('charge')
        # explode leaves an object column behind
        self._precursor_df['charge'] = self._precursor_df['charge'].astype(np.int8)
        self._precursor_df = self._precursor_df.reset_index(drop=True)

    def save_hdf(self, hdf_file):
        """
        Save the library with the parent class' `save_hdf`, then store
        `self.protein_df` under `library.protein_df` of the same file.

        Args:
            hdf_file (str): path of the HDF file.
        """
        super().save_hdf(hdf_file)
        # NOTE(review): `truncate=True` presumably lets an existing
        # protein_df be overwritten - confirm against HDF_File.
        open_options = dict(
            read_only=False,
            truncate=True,
            delete_existing=False,
        )
        protein_store = HDF_File(hdf_file, **open_options)
        protein_store.library.protein_df = self.protein_df

    def load_hdf(self, hdf_file, load_mod_seq=False):
        """
        Load the library with the parent class' `load_hdf`, then try to
        read `library.protein_df` of the same file into `self.protein_df`.

        If reading the protein table raises AttributeError, KeyError,
        ValueError or TypeError, a message is printed and
        `self.protein_df` is not assigned.

        Args:
            hdf_file (str): path of the HDF file.
            load_mod_seq (bool): forwarded to the parent `load_hdf`.
              Defaults to False.
        """
        super().load_hdf(hdf_file, load_mod_seq=load_mod_seq)
        missing_table_errors = (AttributeError, KeyError, ValueError, TypeError)
        try:
            protein_store = HDF_File(hdf_file)
            self.protein_df = protein_store.library.protein_df.values
        except missing_table_errors:
            print(f"No protein_df in {hdf_file}")

In [None]:
# Peptides with empty or pre-existing modifications; the displayed result
# shows the Phospho@S/T/Y variants that get appended to them.
df = pd.DataFrame({
    'sequence': ['ABSTY','ACXSX','ACDEFG'],
    'mods': ['', 'Acetyl@Protein N-term', ''],
    'mod_sites': ['', '0', ''],
})
df = append_regular_modifications(df, keep_unmodified=True)

# keep_unmodified=True: each input row is kept next to its new variants
for _seq, _n_rows in (('ABSTY', 4), ('ACXSX', 2), ('ACDEFG', 1)):
    assert np.sum(df.sequence==_seq)==_n_rows

_absty_rows = df[df.sequence=='ABSTY']
assert all(_absty_rows.mods.values == np.array(['Phospho@S','Phospho@T','Phospho@Y','']))
assert all(_absty_rows.mod_sites.values == np.array(['3','4','5','']))

# the new modification is appended after the existing one
_acxsx_rows = df[df.sequence=='ACXSX']
assert all(_acxsx_rows.mods.values == np.array(['Acetyl@Protein N-term;Phospho@S','Acetyl@Protein N-term']))
assert all(_acxsx_rows.mod_sites.values == np.array(['0;4','0']))
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ABSTY,,
4,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4
5,ACXSX,Acetyl@Protein N-term,0
6,ACDEFG,,


In [None]:
# Same input as before, but keep_unmodified=False: only rows that received
# a new modification remain, so 'ACDEFG' disappears completely.
df = append_regular_modifications(
    pd.DataFrame({
        'sequence': ['ABSTY','ACXSX','ACDEFG'],
        'mods': ['', 'Acetyl@Protein N-term', ''],
        'mod_sites': ['', '0', ''],
    }),
    keep_unmodified=False
)
for _seq, _n_rows in (('ABSTY', 3), ('ACXSX', 1), ('ACDEFG', 0)):
    assert np.sum(df.sequence==_seq)==_n_rows
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4


In [None]:
_lib = FastaLib(None, I_to_L=False, decoy='pseudo_reverse')
# prot2 also occurs inside prot1, so some peptides belong to both proteins
prot1 = 'MABCDESTKAFGHIJKLMNOPQRAFGHIJK'
prot2 = 'AFGHIJKLMNOPQR'
protein_dict = {
    protein_id: {
        'protein_id': protein_id,
        'gene_name': gene_name,
        'sequence': sequence,
    }
    for protein_id, gene_name, sequence in (
        ('xx', '', prot1),
        ('yy', 'gene', prot2),
    )
}
_lib.get_peptides_from_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,AFGHIJK,0;1,0,True,True,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,ABCDESTK,0,0,True,False,,,8
3,MABCDESTK,0,0,True,False,,,9
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14
5,LMNOPQRAFGHIJK,0,1,False,True,,,14
6,ABCDESTKAFGHIJK,0,1,True,False,,,15
7,MABCDESTKAFGHIJK,0,1,True,False,,,16
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22


In [None]:
# Display the protein table held by the library
_lib.protein_df

Unnamed: 0,protein_id,gene_name,sequence
0,xx,,MABCDESTKAFGHIJKLMNOPQRAFGHIJK
1,yy,gene,AFGHIJKLMNOPQR


In [None]:
# After append_protein_name() the precursor table must have a 'proteins' column
_lib.append_protein_name()
assert 'proteins' in _lib.precursor_df.columns
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
2,ABCDESTK,0,0,True,False,,,8,xx,
3,MABCDESTK,0,0,True,False,,,9,xx,
4,AFGHIJKLMNOPQR,0;1,1,True,True,,,14,xx;yy,gene
5,LMNOPQRAFGHIJK,0,1,False,True,,,14,xx,
6,ABCDESTKAFGHIJK,0,1,True,False,,,15,xx,
7,MABCDESTKAFGHIJK,0,1,True,False,,,16,xx,
8,AFGHIJKLMNOPQRAFGHIJK,0,2,False,True,,,21,xx,
9,ABCDESTKAFGHIJKLMNOPQR,0,2,True,False,,,22,xx,


In [None]:
#hide
# Cross-check every digested peptide against the two source proteins.
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    # test is_prot_nterm
    expect_nterm = (
        prot1.startswith(seq) or prot2.startswith(seq)
        or prot1[1:].startswith(seq) # M.xxxxx
    )
    assert bool(_lib.precursor_df.is_prot_nterm[i]) == expect_nterm, seq
    # test is_prot_cterm
    expect_cterm = prot1.endswith(seq) or prot2.endswith(seq)
    assert bool(_lib.precursor_df.is_prot_cterm[i]) == expect_cterm, seq
    # test protein_idxes
    protein_idxes = _lib.precursor_df.protein_idxes[i]
    proteins = _lib.precursor_df.proteins[i]
    genes = _lib.precursor_df.genes[i]
    if seq in prot1 and seq in prot2:
        assert (protein_idxes, proteins, genes) == ('0;1', 'xx;yy', 'gene')
    else:
        # peptides of a single protein carry no ';' separator
        assert all(';' not in joined for joined in (protein_idxes, proteins))
        assert genes == ''

In [None]:
# Expand the precursor table into one row per modification combination
_lib.add_modifications()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C,3,8,xx,
5,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,3;0,8,xx,
6,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,9,xx,
7,MABCDESTK,0,0,True,False,Carbamidomethyl@C,4,9,xx,
8,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,9,xx,
9,MABCDESTK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,9,xx,


In [None]:
#hide
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    mods = _lib.precursor_df.mods[i]
    sites = _lib.precursor_df.mod_sites[i]
    # Compare whole ';'-separated entries instead of substrings:
    # `'1' in '4;18'` is True although site 1 is not modified.
    mod_list = mods.split(';') if mods else []
    site_list = sites.split(';') if sites else []
    assert len(mod_list) == len(site_list), (mods, sites)
    # test fix mods
    if 'C' in seq:
        assert str(seq.find('C')+1) in site_list
        assert 'Carbamidomethyl@C' in mod_list
    else:
        assert 'Carbamidomethyl@C' not in mod_list
    # test Acetyl@Protein N-term
    if 'Acetyl@Protein N-term' in mod_list:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert '0' in site_list
    # site '0' (not the mod names) marks an N-terminal modification
    if '0' in site_list:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert 'Acetyl@Protein N-term' in mod_list
    if not _lib.precursor_df.is_prot_nterm[i]:
        assert 'Acetyl@Protein N-term' not in mod_list
    # test Oxidation@M: every oxidation site (1-based) must point at an M
    for mod, site in zip(mod_list, site_list):
        if mod == 'Oxidation@M':
            assert seq[int(site)-1] == 'M', (seq, sites)
    # test unmodified
    if mods == '':
        assert sites == ''
    if sites == '':
        assert mods == ''
df = _lib.precursor_df
nterm_has_site0 = df[df.is_prot_nterm].mod_sites.str.split(';').apply(
    lambda site_entries: '0' in site_entries
)
# at least one nterm peptide does not contain Acetyl@Protein N-term
assert not nterm_has_site0.all()
# at least one nterm peptide contains Acetyl@Protein N-term
assert nterm_has_site0.any()
# test var mod Oxidation@M
assert not df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').all()
assert df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').any()
assert '' in df.mods.values

In [None]:
# Append Phospho@S / Phospho@T to the precursors; at least one row must carry it afterwards
_lib.add_additional_modifications(['Phospho@S','Phospho@T'])
assert _lib.precursor_df.mods.str.contains('Phospho').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,
...,...,...,...,...,...,...,...,...,...,...
79,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8,23,xx,
80,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18,23,xx,
81,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@S,4;0;7,23,xx,
82,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phospho@T,4;0;8,23,xx,


In [None]:
#hide
# One copy of the precursor table is created per channel and tagged in 'label_channel'
_lib.add_peptide_labeling({
    'none': [], # not labelled for reference
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,proteins,genes,label_channel
0,AFGHIJK,0;1,0,True,True,,,7,xx;yy,gene,none
1,AFGHIJK,0;1,0,True,True,Acetyl@Protein N-term,0,7,xx;yy,gene,none
2,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7,xx;yy,gene,none
3,LMNOPQR,0;1,0,False,True,,,7,xx;yy,gene,none
4,ABCDESTK,0,0,True,False,Carbamidomethyl@C;Phospho@S,3;6,8,xx,,none
...,...,...,...,...,...,...,...,...,...,...,...
247,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;8;9;16,23,xx,,heavy
248,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1;18;9;16,23,xx,,heavy
249,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;7;9;16,23,xx,,heavy
250,MABCDESTKAFGHIJKLMNOPQR,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Phosph...,4;0;8;9;16,23,xx,,heavy


In [None]:
_lib = FastaLib(
    ['b_z1','y_z1'], I_to_L=False,
    decoy='pseudo_reverse'
)
prot1 = 'MACDESTYKBKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    'xx': {
        'protein_id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'protein_id': 'yy',
        'sequence': prot2
    }
}
_lib.import_and_process_protein_dict(protein_dict)
_lib.calc_precursor_isotope()
# both ends of the charge range must be present
assert (_lib.precursor_df.charge == _lib.min_precursor_charge).any()
assert (_lib.precursor_df.charge == _lib.max_precursor_charge).any()
assert (_lib.precursor_df.decoy==1).any()
# pseudo-reverse decoy: everything but the last residue is reversed
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_index' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# `not` instead of `~`: on a plain Python bool `~True` is -2, which is
# truthy, so the assertion could never fail.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,precursor_mz,isotope_intensity_m1,isotope_intensity_m2,isotope_apex_intensity,isotope_apex_index,isotope_mz_m1,isotope_mz_m2,isotope_apex_mz
0,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,2,481.739834,0.478814,0.183746,1.0,0,482.241484,482.743134,481.739834
1,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,3,321.495648,0.478814,0.183746,1.0,0,321.830081,322.164515,321.495648
2,LMNPQRST,0,1,False,True,Oxidation@M,2,8,0,4,241.373555,0.478814,0.183746,1.0,0,241.624380,241.875205,241.373555
3,LMNPQRST,0,1,False,True,,,8,0,2,473.742377,0.478433,0.181509,1.0,0,474.244027,474.745677,473.742377
4,LMNPQRST,0,1,False,True,,,8,0,3,316.164010,0.478433,0.181509,1.0,0,316.498443,316.832877,316.164010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,3,515.604920,0.828432,0.420789,1.0,0,515.939354,516.273787,515.604920
80,FGHIKLMNPQRST,0,2,False,True,Oxidation@M,7,13,0,4,386.955509,0.828432,0.420789,1.0,0,387.206334,387.457159,386.955509
81,FGHIKLMNPQRST,0,2,False,True,,,13,0,2,764.906285,0.828051,0.418418,1.0,0,765.407935,765.909585,764.906285
82,FGHIKLMNPQRST,0,2,False,True,,,13,0,3,510.273282,0.828051,0.418418,1.0,0,510.607715,510.942149,510.273282


In [None]:
# Rebuild the library from the same proteins, then label it before the
# isotope calculation.
_lib.import_and_process_protein_dict(protein_dict)
_lib.add_peptide_labeling({
    'light': ['Dimethyl@Any N-term','Dimethyl@K'],
    'heavy': ['Dimethyl:2H(6)13C(2)@Any N-term','Dimethyl:2H(6)13C(2)@K'],
})
_lib.calc_precursor_isotope()
assert (_lib.precursor_df.decoy==1).any()
assert ('MACDESTY'[::-1]+'K') in _lib.precursor_df.sequence.values
assert 'isotope_apex_index' in _lib.precursor_df.columns
assert 'isotope_apex_intensity' in _lib.precursor_df.columns
# `not` instead of `~`: on a plain Python bool `~True` is -2, which is
# truthy, so the assertion could never fail.
assert not _lib.precursor_df.sequence.str.contains('B').any()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,label_channel,precursor_mz,isotope_intensity_m1,isotope_intensity_m2,isotope_apex_intensity,isotope_apex_index,isotope_mz_m1,isotope_mz_m2,isotope_apex_mz
0,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,2,light,495.755484,0.500906,0.194451,1.0,0,496.257134,496.758784,495.755484
1,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,3,light,330.839415,0.500906,0.194451,1.0,0,331.173848,331.508282,330.839415
2,LMNPQRST,0,1,False,True,Oxidation@M;Dimethyl@Any N-term,2;0,8,0,4,light,248.381380,0.500906,0.194451,1.0,0,248.632205,248.883030,248.381380
3,LMNPQRST,0,1,False,True,Dimethyl@Any N-term,0,8,0,2,light,487.758027,0.500525,0.192205,1.0,0,488.259677,488.761327,487.758027
4,LMNPQRST,0,1,False,True,Dimethyl@Any N-term,0,8,0,3,light,325.507777,0.500525,0.192205,1.0,0,325.842210,326.176643,325.507777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,FGHIKLMNPQRST,0,2,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term;Di...,7;0;5,13,0,3,heavy,539.655367,0.788273,0.392103,1.0,0,539.989801,540.324234,539.655367
164,FGHIKLMNPQRST,0,2,False,True,Oxidation@M;Dimethyl:2H(6)13C(2)@Any N-term;Di...,7;0;5,13,0,4,heavy,404.993344,0.788273,0.392103,1.0,0,405.244169,405.494994,404.993344
165,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...,0;5,13,0,2,heavy,800.981955,0.787646,0.389779,1.0,0,801.483605,801.985255,800.981955
166,FGHIKLMNPQRST,0,2,False,True,Dimethyl:2H(6)13C(2)@Any N-term;Dimethyl:2H(6)...,0;5,13,0,3,heavy,534.323729,0.787646,0.389779,1.0,0,534.658162,534.992596,534.323729
