In [1]:
#default_exp protein.fasta

In [2]:
#hide
# Inside the notebook, point `__file__` at the alphadeep.protein.fasta module
# file so that code resolving paths via os.path.dirname(__file__) also works
# when the cells are executed interactively.
import alphadeep.protein.fasta
__file__ = alphadeep.protein.fasta.__file__

In [3]:
#export
import regex as re
import numpy as np
import pandas as pd
import numba
import os
import itertools
from Bio import SeqIO

from alphabase.yaml_utils import load_yaml
from alphabase.io.hdf import HDF_File
from alphadeep.spec_lib.predict_lib import PredictSpecLib
from alphadeep.pretrained_models import ModelManager

# Protease definitions loaded from 'protease.yaml', which is looked up in the
# directory of this module (__file__). The values are used as regular
# expression patterns: they are passed to re.compile() elsewhere in this file.
protease_dict = load_yaml(
    os.path.join(
        os.path.dirname(
            __file__
        ), 
        'protease.yaml'
    )
)

In [4]:
#hide
# Sanity check of the 'trypsin' pattern on a toy sequence.
p = re.compile(protease_dict['trypsin'])

# `idx` is a digit ruler aligned with `seq` for reading positions by eye;
# it is not used by the code below.
idx = '0123456789012345678901234567890123456789012345'
seq = 'ABDNGKENGLANGIXHGRKTNGLANGKVHNAKHNARKANGKPFAAT'
# m.start()+1 is the index right after the first character of each match.
cut_pos = np.array([m.start()+1 for m in p.finditer(seq)])
# Expected cuts follow every K/R in `seq`, except the K at index 40 that is
# directly followed by P.
assert np.all(cut_pos==np.array([6, 18, 19, 27, 32, 36, 37]))

In [5]:
#export

def read_fasta_file(fasta_filename:str=""):
    """
    Read a FASTA file record by record.
    Args:
        fasta_filename (str): path of the FASTA file to read.
    Yields:
        dict {id:str, full_name:str, description:str, sequence:str}:
            protein information of one FASTA record. `id` is the second
            '|'-separated field of the record id when the id contains
            a '|', otherwise it is the record name.
    """
    with open(fasta_filename, "rt") as handle:
        # SeqIO.parse returns an iterator over the records, so a plain
        # for-loop replaces the manual next()/StopIteration handling.
        for record in SeqIO.parse(handle, "fasta"):
            parts = record.id.split("|")  # pipe char
            if len(parts) > 1:
                protein_id = parts[1]
            else:
                protein_id = record.name
            yield {
                "id": protein_id,
                "full_name": record.name,
                "description": record.description,
                "sequence": str(record.seq),
            }

def load_all_proteins(fasta_file_list:list):
    """
    Collect the proteins of several FASTA files into one dict.
    Args:
        fasta_file_list (list): paths of the FASTA files to read.
    Returns:
        dict: protein information dicts (as yielded by `read_fasta_file`)
            keyed by their 'id'. When an id occurs more than once,
            the entry read last replaces the earlier one.
    """
    proteins_by_id = {}
    for fasta_path in fasta_file_list:
        proteins_by_id.update(
            (entry['id'], entry)
            for entry in read_fasta_file(fasta_path)
        )
    return proteins_by_id

def concat_proteins(protein_dict):
    """
    Concatenate all protein sequences into one '$'-delimited string.

    The result starts and ends with '$' and has one '$' between
    consecutive sequences. As a side effect, every protein entry gets an
    'offset' key: the index of its first residue in the returned string.
    Args:
        protein_dict (dict): protein information dicts keyed by protein id;
            each value must have a 'sequence' entry.
    Returns:
        str: the concatenated sequence string.
    """
    seq_list = ['']
    seq_count = 1 # index 0 is the leading '$'
    for protein in protein_dict.values():
        protein['offset'] = seq_count
        seq_list.append(protein['sequence'])
        # advance by the sequence length plus the '$' that follows it
        seq_count += len(protein['sequence'])+1
    seq_list.append('')
    return '$'.join(seq_list)

In [6]:
#export
@numba.njit
def cleave_sequence_with_cut_pos(
    sequence:str,
    cut_pos:np.ndarray,
    n_missed_cleavages:int=2,
    pep_length_min:int=6,
    pep_length_max:int=45,
)->tuple:
    """
    Cleave a sequence at the given cut positions (cut_pos).
    Only peptides within the minimum and maximum length are kept.
    Args:
        sequence (str): protein sequence
        cut_pos (np.ndarray): cut positions determined by a given protease.
            Every position is used as a peptide start and the following
            positions as peptide ends (callers in this file pass 0 and
            len(sequence) as first and last entries).
        n_missed_cleavages (int): the max number of missed cleavages.
        pep_length_min (int): min peptide length.
        pep_length_max (int): max peptide length.
    Returns:
        list (str): cleaved peptide sequences with missed cleavages.
        list (int): number of missed cleavages of each peptide.
        list (bool): if the peptide starts at position 0 (N-term peptide)
        list (bool): if the peptide ends at len(sequence) (C-term peptide)
    """
    seq_list = []
    miss_list = []
    nterm_list = []
    cterm_list = []
    for i,start_pos in enumerate(cut_pos):
        # The next n_missed_cleavages+1 cut positions are the candidate ends;
        # n_miss counts the cut positions skipped between start and end.
        for n_miss,end_pos in enumerate(
            cut_pos[i+1:i+2+n_missed_cleavages]
        ):
            if end_pos > start_pos + pep_length_max:
                # later ends only make the peptide longer
                break
            elif end_pos < start_pos + pep_length_min:
                # too short: try the next end position
                continue
            else:
                seq_list.append(sequence[start_pos:end_pos])
                miss_list.append(n_miss)
                if start_pos == 0:
                    nterm_list.append(True)
                else:
                    nterm_list.append(False)
                if end_pos == len(sequence):
                    cterm_list.append(True)
                else:
                    cterm_list.append(False)
    return seq_list, miss_list, nterm_list, cterm_list

class Digest(object):
    """
    In-silico digestion of protein sequences with a regex-defined protease.
    """
    def __init__(self,
        protease='trypsin',
        max_missed_cleavages:int=2,
        pep_length_min:int=6,
        pep_length_max:int=45,
    ):
        """
        Args:
            protease (str): key of `protease_dict`; the value is compiled
                as the cleavage regular expression.
            max_missed_cleavages (int): max number of missed cleavages.
            pep_length_min (int): min peptide length.
            pep_length_max (int): max peptide length.
        """
        self.n_miss_cleave = max_missed_cleavages
        self.pep_length_min = pep_length_min
        self.pep_length_max = pep_length_max
        self.regex_pattern = re.compile(
            protease_dict[protease]
        )

    def cleave_sequence(self,
        sequence:str,
    )->tuple:
        """
        Cleave a sequence.
        Args:
            sequence (str): the given (protein) sequence.
        Returns:
            list (str): cleaved peptide sequences with missed cleavages.
            list (int): number of missed cleavages of each peptide.
            list (bool): if the peptide is a protein N-term peptide.
            list (bool): if the peptide is a protein C-term peptide.
        """

        cut_pos = [0]
        cut_pos.extend(
            m.start()+1 for m in
            self.regex_pattern.finditer(sequence)
        )
        # A pattern that matches the last residue already yields
        # len(sequence); adding it again would duplicate C-term peptides
        # with a wrong missed-cleavage count.
        if cut_pos[-1] != len(sequence):
            cut_pos.append(len(sequence))
        cut_pos = np.array(cut_pos, dtype=np.int64)

        (
            seq_list, miss_list, nterm_list, cterm_list
        ) = cleave_sequence_with_cut_pos(
            sequence, cut_pos, 
            self.n_miss_cleave,
            self.pep_length_min,
            self.pep_length_max,
        )
        # Consider M loss at protein N-term
        if sequence.startswith('M'):
            # Iterate over a snapshot: the lists are appended to inside
            # the loop, and the derived peptides must not be processed again.
            for seq,miss,cterm in list(zip(
                seq_list,miss_list,cterm_list
            )):
                if (
                    sequence.startswith(seq) 
                    and len(seq)>self.pep_length_min
                ):
                    seq_list.append(seq[1:])
                    miss_list.append(miss)
                    nterm_list.append(True)
                    cterm_list.append(cterm)
        return seq_list, miss_list, nterm_list, cterm_list

In [7]:
#hide
# Digest a toy protein that starts with 'M' and check the returned lists.
seq = 'MABCDEKHIJKLNOPQRST'
digest = Digest()
seq_list, miss_list, nterm_list, cterm_list = digest.cleave_sequence(seq)
# the four lists are parallel: one entry per peptide
assert len(seq_list) == len(miss_list) == len(nterm_list) == len(cterm_list)
M_start_seqs = [seq for seq in seq_list if seq.startswith('M')]
# every peptide starting with 'M' has an M-loss counterpart, both flagged N-term
assert len(M_start_seqs)*2 == len([_ for _ in nterm_list if _])
# the M-loss peptides are appended at the end of the lists
assert np.all(nterm_list[-len(M_start_seqs):])
T_end_seqs = [seq for seq in seq_list if seq.endswith('T')]
# the protein ends with 'T', so exactly the peptides ending with 'T' are C-term
assert len(T_end_seqs) == len([_ for _ in cterm_list if _])

In [8]:
#export
def get_fix_mods(
    sequence:str,
    fix_mod_aas:str,
    fix_mod_dict:dict
)->tuple:
    """
    Put the fixed modification on every matching residue of a sequence.
    Args:
        sequence (str): peptide sequence.
        fix_mod_aas (str): the amino acids that carry a fixed modification.
        fix_mod_dict (dict): modification name for each amino acid
            in `fix_mod_aas`.
    Returns:
        str: modification names joined by ';'.
        str: 1-based modification sites joined by ';'.
    """
    hits = [
        (str(pos), fix_mod_dict[residue])
        for pos, residue in enumerate(sequence, 1)
        if residue in fix_mod_aas
    ]
    mod_names = ';'.join(name for _, name in hits)
    site_str = ';'.join(pos for pos, _ in hits)
    return mod_names, site_str

In [9]:
#hide
# 'C' occurs at the 1-based positions 2, 4 and 6 of the toy sequence.
seq = 'ACBCDCK'
_fix_mod_dict = {}
_fix_mod_dict['C'] = 'mod@C'
mods, mod_sites = get_fix_mods(seq, 'C', _fix_mod_dict)
assert mods==';'.join(['mod@C']*3)
assert mod_sites=='2;4;6'
# last expression: display the returned tuple
get_fix_mods(seq, 'C', _fix_mod_dict)

('mod@C;mod@C;mod@C', '2;4;6')

In [10]:
#export
def get_candidate_sites(
    sequence:str, target_mod_aas:str
)->list:
    """
    Find the residues of a sequence that can be modified.
    Args:
        sequence (str): peptide sequence.
        target_mod_aas (str): the amino acids that can be modified.
    Returns:
        list (int): 1-based sites (alphabase mod sites) of the residues
            that are in `target_mod_aas`.
    """
    return [
        i+1 #alphabase mod sites
        for i,aa in enumerate(sequence)
        if aa in target_mod_aas
    ]

def get_var_mod_sites(
    sequence:str,
    target_mod_aas:str,
    max_var_mod: int,
    max_combs: int
)->list:
    """
    Enumerate the site combinations for variable modifications.

    Combinations are listed by increasing number of modified sites
    (1, 2, ..., max_var_mod), each group in the order of
    `itertools.combinations`, and the list is truncated to at most
    `max_combs` combinations. The limit also applies to the
    single-site combinations.
    Args:
        sequence (str): peptide sequence.
        target_mod_aas (str): the amino acids that can be modified.
        max_var_mod (int): max number of modified sites per combination.
            No combination is returned if it is smaller than 1.
        max_combs (int): max number of returned combinations.
    Returns:
        list (tuple of int): site combinations, e.g. [(2,), (4,), (2, 4)].
    """
    candidate_sites = get_candidate_sites(
        sequence, target_mod_aas
    )
    mod_sites = []
    for n_var_mod in range(1, max_var_mod+1):
        n_remaining = max_combs-len(mod_sites)
        if n_remaining <= 0: break
        mod_sites.extend(
            itertools.islice(
                itertools.combinations(
                    candidate_sites, n_var_mod
                ),
                n_remaining
            )
        )
    return mod_sites

In [11]:
#hide
# M, M, S, T, Y sit at the 1-based positions 2, 4, 5, 6, 7 of the toy sequence.
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
assert np.all(np.array(candidate_sites)==np.array([2,4,5,6,7]))
# last expression: display up to 20 combinations of at most 3 sites
get_var_mod_sites(seq, 'MSTY', 3, 20)

[(2,),
 (4,),
 (5,),
 (6,),
 (7,),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (4, 5),
 (4, 6),
 (4, 7),
 (5, 6),
 (5, 7),
 (6, 7),
 (2, 4, 5),
 (2, 4, 6),
 (2, 4, 7),
 (2, 5, 6),
 (2, 5, 7)]

In [12]:
#export
import copy
def get_var_mods_per_sites_multi_mods_on_aa(
    sequence:str,
    mod_sites:tuple,
    var_mod_dict:dict
)->list:
    """
    Build the modification strings for one combination of sites when
    an amino acid can carry more than one variable modification.
    Args:
        sequence (str): peptide sequence.
        mod_sites (tuple of int): 1-based sites to be modified.
        var_mod_dict (dict): list of modification names for each
            modifiable amino acid.
    Returns:
        list (str): one ';'-joined modification string for every
            combination of the modifications possible at the given sites.
            [''] if `mod_sites` is empty.
    """
    mods_str_list = ['']
    for site in mod_sites:
        site_mods = var_mod_dict[sequence[site-1]]
        # Extend every string built so far with every modification of this
        # site. The strings are grouped by the modification of the current
        # site, which is the order downstream outputs rely on.
        mods_str_list = [
            mods_str+mod+';'
            for mod in site_mods
            for mods_str in mods_str_list
        ]
    # strip the trailing ';'
    return [mods_str[:-1] for mods_str in mods_str_list]

def get_var_mods_per_sites_single_mod_on_aa(
    sequence:str,
    mod_sites:tuple,
    var_mod_dict:dict
)->list:
    """
    Build the modification string for one combination of sites when
    every amino acid carries at most one variable modification.
    Args:
        sequence (str): peptide sequence.
        mod_sites (tuple of int): 1-based sites to be modified.
        var_mod_dict (dict): modification name (str) for each
            modifiable amino acid.
    Returns:
        list (str): a single-element list with the ';'-joined
            modification names of the given sites.
    """
    joined_mods = ';'.join(
        var_mod_dict[sequence[site-1]] for site in mod_sites
    )
    return [joined_mods]

# Module-level dispatcher used by `get_var_mods`; by default every amino acid
# is expected to carry at most one variable modification.
get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa

def get_var_mods(
    sequence:str,
    var_mod_aas:str,
    mod_dict:dict,
    max_var_mod:int,
    max_combs:int,
)->tuple:
    """
    Enumerate the variable modification forms of a sequence.
    Args:
        sequence (str): peptide sequence.
        var_mod_aas (str): the amino acids that can be modified.
        mod_dict (dict): modification(s) of each modifiable amino acid, in
            the format expected by the current `get_var_mods_per_sites`.
        max_var_mod (int): max number of modified sites per form.
        max_combs (int): max number of site combinations.
    Returns:
        list (str): ';'-joined modification names of each form.
        list (str): ';'-joined modification sites of each form,
            parallel to the first list.
    """
    all_mods = []
    all_sites = []
    for site_combo in get_var_mod_sites(
        sequence, var_mod_aas,
        max_var_mod, max_combs
    ):
        combo_mods = get_var_mods_per_sites(
            sequence, site_combo, mod_dict
        )
        sites_str = ';'.join(map(str, site_combo))
        all_mods += combo_mods
        # one sites string per modification string of this combination
        all_sites += [sites_str]*len(combo_mods)
    return all_mods, all_sites

In [13]:
#hide
# Switch the module-level dispatcher to the multi-mod implementation:
# the dict values below are lists of modification names.
get_var_mods_per_sites = get_var_mods_per_sites_multi_mods_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
_mod_dict = {
    'M':['mod@M'],
    'S':['mod@S','modX@S'],
    'T':['mod@T'],
    'Y':['mod@Y'],
}
# at most 3 modified sites and 16 site combinations
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

# both modifications of S must appear, each with the same sites string
assert 'mod@M;mod@M;mod@S' in _mods
assert _sites[_mods.index('mod@M;mod@M;mod@S')] == '2;4;5'
assert 'mod@M;mod@M;modX@S' in _mods
assert _sites[_mods.index('mod@M;mod@M;modX@S')] == '2;4;5'
assert 'mod@M;mod@S' in _mods
assert 'mod@M;modX@S' in _mods
# last expression: display the returned tuple
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'modX@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;modX@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'modX@S;mod@T',
  'mod@S;mod@Y',
  'modX@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S',
  'mod@M;mod@M;modX@S'],
 ['2',
  '4',
  '5',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;6',
  '5;7',
  '5;7',
  '6;7',
  '2;4;5',
  '2;4;5'])

In [14]:
#hide
# Switch the module-level dispatcher back to the single-mod implementation:
# the dict values below are plain modification names.
get_var_mods_per_sites = get_var_mods_per_sites_single_mod_on_aa
seq = 'AMCMSTYK'
candidate_sites = get_candidate_sites(seq, 'MSTY')
mod_sites_list = get_var_mod_sites(seq, 'MSTY', 3, 20)
_mod_dict = {
    'M':'mod@M',
    'S':'mod@S',
    'T':'mod@T',
    'Y':'mod@Y',
}
_mods, _sites = get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)
# one modification string per site combination, capped at max_combs=16
assert len(_mods) == len(_sites) == 16
assert _sites[_mods.index('mod@M;mod@M;mod@S')] == '2;4;5'
# last expression: display the returned tuple
get_var_mods(seq, 'MSTY', _mod_dict, 3, 16)

(['mod@M',
  'mod@M',
  'mod@S',
  'mod@T',
  'mod@Y',
  'mod@M;mod@M',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@M;mod@S',
  'mod@M;mod@T',
  'mod@M;mod@Y',
  'mod@S;mod@T',
  'mod@S;mod@Y',
  'mod@T;mod@Y',
  'mod@M;mod@M;mod@S'],
 ['2',
  '4',
  '5',
  '6',
  '7',
  '2;4',
  '2;5',
  '2;6',
  '2;7',
  '4;5',
  '4;6',
  '4;7',
  '5;6',
  '5;7',
  '6;7',
  '2;4;5'])

In [15]:
#export
def parse_term_mod(term_mod_name:str):
    """
    Split the location part of a terminal modification name.
    Args:
        term_mod_name (str): modification name such as
            'Acetyl@Protein N-term' or 'Gln->pyro-Glu@Q^Any N-term'.
    Returns:
        tuple: (site, term). `site` is the text between '@' and '^',
            or '' if the name contains no '^'; `term` is the rest.
    """
    _, location = term_mod_name.split('@')
    if '^' not in location:
        return '', location
    return tuple(location.split('^'))

In [16]:
#hide
# without '^' the site is '', otherwise it is the text between '@' and '^'
assert parse_term_mod('Acetyl@Protein N-term') == ('', 'Protein N-term')
assert parse_term_mod('Gln->pyro-Glu@Q^Any N-term') == ('Q', 'Any N-term')

In [17]:
#export
class PredictFastaSpecLib(PredictSpecLib):
    """
    Predicted spectral library whose peptides come from digested proteins
    (FASTA files or a protein dict) or from a given list of peptides.

    Peptides are generated with `Digest`, expanded with fixed and variable
    modifications and with charge states, and are then passed to the
    prediction methods inherited from `PredictSpecLib`.
    """
    def __init__(self,
        model_manager:ModelManager,
        charged_frag_types:list = ['b_z1','b_z2','y_z1','y_z2'],
        min_frag_mz = 50, max_frag_mz = 2000,
        min_precursor_mz = 400, max_precursor_mz = 1800,
        protease:str = 'trypsin',
        max_missed_cleavages:int = 2,
        pep_length_min:int = 7,
        pep_length_max:int = 30,
        min_charge:int = 2,
        max_charge:int = 4,
        var_mods:list = ['Acetyl@Protein N-term','Oxidation@M'],
        max_var_mod_num:int = 2,
        fix_mods:list = ['Carbamidomethyl@C'],
    ):
        """
        Args:
            model_manager (ModelManager): forwarded to `PredictSpecLib`.
            charged_frag_types (list): forwarded to `PredictSpecLib`.
            min_frag_mz, max_frag_mz: forwarded to `PredictSpecLib`.
            min_precursor_mz, max_precursor_mz: forwarded to
                `PredictSpecLib`.
            protease (str): protease name, see `Digest`.
            max_missed_cleavages (int): max number of missed cleavages.
            pep_length_min (int): min peptide length.
            pep_length_max (int): max peptide length.
            min_charge (int): lowest precursor charge state.
            max_charge (int): highest precursor charge state (inclusive).
            var_mods (list): variable modification names. A name with
                exactly one character after '@' is a modification of that
                amino acid; any other name is parsed as a terminal
                modification.
            max_var_mod_num (int): max number of amino-acid-specific
                variable modifications per peptide.
            fix_mods (list): fixed modification names, same naming as
                `var_mods`.
        """
        super().__init__(
            model_manager, charged_frag_types,
            min_frag_mz, max_frag_mz,
            min_precursor_mz, max_precursor_mz
        )
        self.protein_df = pd.DataFrame()
        self.max_mod_combs = 100
        self._digest = Digest(
            protease, max_missed_cleavages,
            pep_length_min, pep_length_max
        )
        self.min_charge = min_charge
        self.max_charge = max_charge

        # Copies: the default lists are shared between all calls and must
        # not be reachable (and mutable) through an instance.
        self.var_mods = list(var_mods)
        self.fix_mods = list(fix_mods)
        self.max_var_mod_num = max_var_mod_num

        self.fix_mod_aas = ''
        self.fix_mod_prot_nterm_dict = {}
        self.fix_mod_prot_cterm_dict = {}
        self.fix_mod_pep_nterm_dict = {}
        self.fix_mod_pep_cterm_dict = {}
        self.fix_mod_dict = {}

        def _set_term_mod(term_mod,
            prot_nterm, prot_cterm, pep_nterm, pep_cterm,
            allow_conflicts
        ):
            # Register a terminal modification in the dict of its terminus,
            # keyed by the site returned by `parse_term_mod`
            # ('' if the name has no '^').
            site, term = parse_term_mod(term_mod)
            term_dict = {
                'Any N-term': pep_nterm,
                'Protein N-term': prot_nterm,
                'Any C-term': pep_cterm,
                'Protein C-term': prot_cterm,
            }.get(term)
            if term_dict is None:
                # unknown terminus: ignored
                return
            if allow_conflicts:
                # several modifications may share one site
                term_dict.setdefault(site, []).append(term_mod)
            else:
                term_dict[site] = term_mod
        
        for mod in fix_mods:
            # exactly one character after '@': amino-acid-specific mod
            if mod.find('@')+2 == len(mod):
                self.fix_mod_aas += mod[-1]
                self.fix_mod_dict[mod[-1]] = mod
            else:
                _set_term_mod(
                    mod, 
                    self.fix_mod_prot_nterm_dict,
                    self.fix_mod_prot_cterm_dict,
                    self.fix_mod_pep_nterm_dict,
                    self.fix_mod_pep_cterm_dict,
                    allow_conflicts=False
                )

        self.var_mod_aas = ''
        self.var_mod_prot_nterm_dict = {}
        self.var_mod_prot_cterm_dict = {}
        self.var_mod_pep_nterm_dict = {}
        self.var_mod_pep_cterm_dict = {}
        self.var_mod_dict = {}

        multi_mods_on_aa = self._check_if_multi_mods_on_aa(var_mods)
        for mod in var_mods:
            if mod.find('@')+2 == len(mod):
                # a fixed modification on the same amino acid wins
                if mod[-1] in self.fix_mod_dict: continue
                self.var_mod_aas += mod[-1]
                if multi_mods_on_aa:
                    self.var_mod_dict.setdefault(mod[-1], []).append(mod)
                else:
                    self.var_mod_dict[mod[-1]] = mod

        # The value format of `var_mod_dict` (list or str) decides which
        # implementation builds the modification strings. It is kept on the
        # instance, so the module-level `get_var_mods_per_sites` dispatcher
        # is not touched and instances cannot interfere with each other.
        if multi_mods_on_aa:
            self._get_var_mods_per_sites = (
                get_var_mods_per_sites_multi_mods_on_aa
            )
        else:
            self._get_var_mods_per_sites = (
                get_var_mods_per_sites_single_mod_on_aa
            )
        
        for mod in var_mods:
            if mod.find('@')+2 < len(mod):
                _set_term_mod(
                    mod, 
                    self.var_mod_prot_nterm_dict,
                    self.var_mod_prot_cterm_dict,
                    self.var_mod_pep_nterm_dict,
                    self.var_mod_pep_cterm_dict,
                    allow_conflicts=True
                )

    def _check_if_multi_mods_on_aa(self, var_mods):
        """
        Return True if at least two amino-acid-specific modifications in
        `var_mods` target the same amino acid.
        """
        mod_set = set()
        for mod in var_mods:
            if mod.find('@')+2 == len(mod):
                if mod[-1] in mod_set: return True
                mod_set.add(mod[-1])
        return False

    def import_fasta(self, fasta_file_list:list):
        """Load the peptides of FASTA files and run all predictions."""
        self.from_fasta_list(fasta_file_list)
        self._predict_all_after_load_pep_seqs()

    def import_protein_dict(self, protein_dict:dict):
        """Load the peptides of a protein dict and run all predictions."""
        self.from_protein_dict(protein_dict)
        self._predict_all_after_load_pep_seqs()

    def import_peptide_sequences(self, 
        pep_seq_list:list, protein_list=None
    ):
        """Load the given peptide sequences and run all predictions."""
        self.from_peptide_sequence_list(pep_seq_list, protein_list)
        self._predict_all_after_load_pep_seqs()

    def _predict_all_after_load_pep_seqs(self):
        """
        Add modifications and charge states to the loaded peptides and
        call the prediction steps in their required order.
        """
        self.add_modifications()
        self.predict_rt()
        self.add_charge()
        self.predict_mobility()
        self.calc_fragment_mz_df()
        self.predict_fragment_intensity_df()

    def from_fasta_list(self, fasta_file_list:list):
        """Read the proteins of the FASTA files and digest them."""
        protein_dict = load_all_proteins(fasta_file_list)
        self.from_protein_dict(protein_dict)

    def from_protein_dict(self, protein_dict:dict):
        """
        Digest the given proteins into the (unmodified) precursor table.

        Sets `self.protein_df` (one row per protein) and `self._precursor_df`
        with the columns 'sequence', 'protein_idxes' (';'-joined row indices
        of `protein_df`), 'miss_cleavage', 'is_prot_nterm', 'is_prot_cterm',
        'mods' and 'mod_sites' (both empty).
        Args:
            protein_dict (dict): protein information dicts keyed by
                protein id; each value must have a 'sequence' entry.
        """
        pep_dict = {}

        self.protein_df = pd.DataFrame.from_dict(
            protein_dict, orient='index'
        ).reset_index(drop=True)

        for i,prot_seq in enumerate(
            self.protein_df.sequence.values
        ):
            prot_idx = str(i)
            (
                seq_list, miss_list, nterm_list, cterm_list
            ) = self._digest.cleave_sequence(prot_seq)
            for seq,miss,nterm,cterm in zip(
                seq_list,miss_list,nterm_list, cterm_list
            ):
                if seq in pep_dict:
                    pep_info = pep_dict[seq]
                    # Proteins are processed in order, so comparing with the
                    # last index is enough to list a protein only once even
                    # if the peptide occurs several times in it.
                    if pep_info[0][-1] != prot_idx:
                        pep_info[0].append(prot_idx)
                    if nterm:
                        pep_info[2] = nterm
                    if cterm:
                        pep_info[3] = cterm
                else:
                    pep_dict[seq] = [[prot_idx],miss,nterm,cterm]
        for pep_info in pep_dict.values():
            pep_info[0] = ';'.join(pep_info[0])
        self._precursor_df = pd.DataFrame.from_dict(
            pep_dict, orient='index', columns = [
                'protein_idxes','miss_cleavage',
                'is_prot_nterm','is_prot_cterm'
            ]
        )
        self._precursor_df.reset_index(drop=False, inplace=True)
        self._precursor_df.rename(
            columns={'index':'sequence'}, inplace=True
        )
        self._precursor_df['mods'] = ''
        self._precursor_df['mod_sites'] = ''
        self.sort_by_nAA()

    def from_peptide_sequence_list(self, 
        pep_seq_list:list,
        protein_list:list = None
    ):
        """
        Use the given peptide sequences as the precursor table.
        Args:
            pep_seq_list (list): peptide sequences.
            protein_list (list): optional protein names, one per peptide;
                stored in the 'protein_name' column.
        """
        self._precursor_df = pd.DataFrame()
        self._precursor_df['sequence'] = pep_seq_list
        if protein_list is not None:
            self._precursor_df['protein_name'] = protein_list
        # no protein context is known for plain peptide lists
        self._precursor_df['is_prot_nterm'] = False
        self._precursor_df['is_prot_cterm'] = False
        self.sort_by_nAA()

    def predict_rt(self):
        """Call `sort_by_nAA` and then the inherited `predict_rt`."""
        self.sort_by_nAA()
        super().predict_rt()

    def _get_var_mods(self, sequence:str)->tuple:
        """
        Enumerate the amino-acid-specific variable modification forms of
        a sequence with the dispatcher selected in `__init__`.
        Returns:
            list (str): ';'-joined modification names of each form.
            list (str): ';'-joined modification sites of each form.
        """
        var_mods_list = []
        var_mod_sites_list = []
        for mod_sites in get_var_mod_sites(
            sequence, self.var_mod_aas,
            self.max_var_mod_num, self.max_mod_combs-1, # 1 for unmodified
        ):
            _mods = self._get_var_mods_per_sites(
                sequence, mod_sites, self.var_mod_dict
            )
            mod_sites_str = ';'.join(str(i) for i in mod_sites)
            var_mods_list.extend(_mods)
            var_mod_sites_list.extend([mod_sites_str]*len(_mods))
        return var_mods_list, var_mod_sites_list

    def add_mods_for_one_seq(self, sequence:str, 
        is_prot_nterm, is_prot_cterm
    ):
        """
        Enumerate all modification forms of one peptide.

        Every form combines the fixed amino acid modifications, at most one
        N-terminal variable modification and one combination of
        amino-acid-specific variable modifications (including none).
        Args:
            sequence (str): peptide sequence.
            is_prot_nterm (bool): if the peptide is a protein N-term peptide.
            is_prot_cterm (bool): if the peptide is a protein C-term peptide
                (not used yet, see the TODOs).
        Returns:
            list (str): ';'-joined modification names of each form.
            list (str): ';'-joined modification sites of each form
                (site '0' is used for N-terminal modifications).
        """
        fix_mods, fix_mod_sites = get_fix_mods(
            sequence, self.fix_mod_aas, self.fix_mod_dict
        )
        #TODO add prot and pep C-term fix mods
        #TODO add prot and pep N-term fix mods

        if len(fix_mods) == 0:
            fix_mods = ['']
            fix_mod_sites = ['']
        else:
            fix_mods = [fix_mods]
            fix_mod_sites = [fix_mod_sites]

        var_mods_list, var_mod_sites_list = self._get_var_mods(sequence)
        # the form without any amino-acid-specific variable modification
        var_mods_list.append('')
        var_mod_sites_list.append('')

        # '' stands for "no N-terminal modification"
        nterm_var_mods = ['']
        nterm_var_mod_sites = ['']
        if is_prot_nterm and len(self.var_mod_prot_nterm_dict)>0:
            if '' in self.var_mod_prot_nterm_dict:
                nterm_var_mods.extend(self.var_mod_prot_nterm_dict[''])
            if sequence[0] in self.var_mod_prot_nterm_dict:
                nterm_var_mods.extend(self.var_mod_prot_nterm_dict[sequence[0]])
        if len(self.var_mod_pep_nterm_dict)>0:
            if '' in self.var_mod_pep_nterm_dict:
                nterm_var_mods.extend(self.var_mod_pep_nterm_dict[''])
            if sequence[0] in self.var_mod_pep_nterm_dict:
                nterm_var_mods.extend(self.var_mod_pep_nterm_dict[sequence[0]])
        nterm_var_mod_sites.extend(['0']*(len(nterm_var_mods)-1))

        #TODO add prot and pep C-term var mods

        # The two products run over lists of equal lengths in the same order,
        # so the resulting names and sites stay parallel; empty parts are
        # dropped before joining.
        return (
            list(
                ';'.join([i for i in items if i]) for items in itertools.product(
                    fix_mods, nterm_var_mods, var_mods_list
                )
            ),
            list(
                ';'.join([i for i in items if i]) for items in itertools.product(
                    fix_mod_sites, nterm_var_mod_sites, var_mod_sites_list
                )
            ),
        )

    def add_modifications(self):
        """
        Replace every peptide row of the precursor table by one row per
        modification form (see `add_mods_for_one_seq`).
        """
        (
            self._precursor_df['mods'],
            self._precursor_df['mod_sites']
        ) = zip(*self._precursor_df[
            ['sequence','is_prot_nterm','is_prot_cterm']
        ].apply(lambda x:
            self.add_mods_for_one_seq(*x), axis=1
        ))
        self._precursor_df = self._precursor_df.explode(
            ['mods','mod_sites']
        )
        self._precursor_df.reset_index(drop=True, inplace=True)

    def add_charge(self):
        """
        Replace every row of the precursor table by one row per charge
        state from `min_charge` to `max_charge` (inclusive).
        """
        self._precursor_df['charge'] = [
            np.arange(self.min_charge, self.max_charge+1)
        ]*len(self._precursor_df)
        self._precursor_df = self._precursor_df.explode('charge')
        self._precursor_df['charge'] = self._precursor_df.charge.astype(np.int8)
        self._precursor_df.reset_index(drop=True, inplace=True)

    def save_hdf(self, hdf_file):
        """
        Save the library with the inherited `save_hdf` and then store
        `protein_df` in the `library` group of the same HDF file.
        """
        super().save_hdf(hdf_file)
        _hdf = HDF_File(
            hdf_file,
            read_only=False,
            truncate=True,
            delete_existing=False
        )
        # NOTE(review): with truncate=True, assigning a dict to `library` may
        # replace the whole group written by super().save_hdf() instead of
        # adding 'protein_df' to it -- confirm against alphabase's HDF_File.
        _hdf.library = {
            'protein_df': self.protein_df,
        }

In [42]:
#export
def append_regular_modifications(df, 
    var_mods = ['Phospho@S','Phospho@T','Phospho@Y'], 
    max_mod_num=1, max_combs=100,
    keep_unmodified=True,
):
    """
    Append amino-acid-specific modifications to the peptides of a table.

    Every peptide with at least one modifiable site is replaced by one row
    per enumerated modification form; the new names and sites are appended
    to the existing 'mods' and 'mod_sites' values with ';'. The input
    DataFrame is not modified.
    Args:
        df (pd.DataFrame): table with the columns 'sequence', 'mods' and
            'mod_sites'.
        var_mods (list): modification names ending with the modified
            amino acid, e.g. 'Phospho@S'.
        max_mod_num (int): max number of appended modifications per row.
        max_combs (int): max number of site combinations per peptide.
        keep_unmodified (bool): if True, peptides without a modifiable
            site are kept unchanged; otherwise they are removed.
    Returns:
        pd.DataFrame: the new table with a fresh 0..n-1 index.
    """
    mod_dict = dict([(mod[-1],mod) for mod in var_mods])
    var_mod_aas = ''.join(mod_dict.keys())

    # Work on a copy: the helper columns and the row filtering below must
    # not leak into the caller's DataFrame.
    df = df.copy()
    if len(df) == 0:
        # nothing to enumerate (and zip(*...) below cannot unpack nothing)
        return df.reset_index(drop=True)

    (
        df['mods_app'],
        df['mod_sites_app']
    ) = zip(*df.sequence.apply(get_var_mods, 
            var_mod_aas=var_mod_aas, mod_dict=mod_dict, 
            max_var_mod=max_mod_num, max_combs=max_combs
        )
    )
    
    if not keep_unmodified:
        # boolean mask instead of dropping by label, which would remove
        # too many rows if the index contains duplicates
        df = df[(df.mods_app.apply(len) > 0).values]
    df = df.explode(
        ['mods_app','mod_sites_app']
    ).reset_index(drop=True)

    # explode() turns empty lists into NaN; only the modification columns
    # are filled so that NaNs of unrelated columns are left untouched.
    mod_cols = ['mods','mod_sites','mods_app','mod_sites_app']
    for col in mod_cols:
        df[col] = df[col].fillna('')

    df['mods'] = [
        ';'.join(i for i in x if i)
        for x in zip(df['mods'], df['mods_app'])
    ]
    df['mod_sites'] = [
        ';'.join(i for i in x if i)
        for x in zip(df['mod_sites'], df['mod_sites_app'])
    ]
    del df['mods_app']
    del df['mod_sites_app']
    return df

In [48]:
#hide
# keep_unmodified=True: 'ACDEFG' has no S/T/Y and is kept unchanged.
df = pd.DataFrame(
    {
        'sequence': ['ABSTY','ACXSX','ACDEFG'],
        'mods': ['', 'Acetyl@Protein N-term', ''],
        'mod_sites': ['', '0', '']
    }
)
df = append_regular_modifications(df, keep_unmodified=True)
# one row per S/T/Y site (the default max_mod_num is 1)
assert np.sum(df.sequence=='ABSTY')==3
assert np.sum(df.sequence=='ACXSX')==1
assert np.sum(df.sequence=='ACDEFG')==1
assert all(df[df.sequence=='ABSTY'].mods.values == np.array(['Phospho@S','Phospho@T','Phospho@Y']))
assert all(df[df.sequence=='ABSTY'].mod_sites.values == np.array(['3','4','5']))
# new modifications are appended to the existing ones
assert all(df[df.sequence=='ACXSX'].mods.values == np.array(['Acetyl@Protein N-term;Phospho@S']))
assert all(df[df.sequence=='ACXSX'].mod_sites.values == np.array(['0;4']))
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4
4,ACDEFG,,


In [44]:
#hide
# keep_unmodified=False: 'ACDEFG' has no S/T/Y and is removed.
df = pd.DataFrame(
    {
        'sequence': ['ABSTY','ACXSX','ACDEFG'],
        'mods': ['', 'Acetyl@Protein N-term', ''],
        'mod_sites': ['', '0', '']
    }
)
df = append_regular_modifications(df, keep_unmodified=False)
assert np.sum(df.sequence=='ABSTY')==3
assert np.sum(df.sequence=='ACXSX')==1
assert np.sum(df.sequence=='ACDEFG')==0
df

Unnamed: 0,sequence,mods,mod_sites
0,ABSTY,Phospho@S,3
1,ABSTY,Phospho@T,4
2,ABSTY,Phospho@Y,5
3,ACXSX,Acetyl@Protein N-term;Phospho@S,0;4


In [20]:
#hide
# The protein table must keep the keys of the protein entries as columns.
# No model manager is needed for digestion, hence None.
_lib = PredictFastaSpecLib(None)
protein_dict = {
    'xx': {
        'id': 'xx',
        'sequence': 'MABCDEKFGHIJKLMNOPQRST'
    },
    'yy': {
        'id': 'yy',
        'sequence': 'MABCDEKFGHIJKLM'
    }
}
_lib.from_protein_dict(protein_dict)
assert 'id' in _lib.protein_df.columns
assert 'sequence' in _lib.protein_df.columns
_lib.protein_df

Unnamed: 0,id,sequence
0,xx,MABCDEKFGHIJKLMNOPQRST
1,yy,MABCDEKFGHIJKLM


In [21]:
# Digest two proteins that share peptides: prot2 is a substring of prot1.
# `_lib`, `prot1` and `prot2` are reused by the following cells.
_lib = PredictFastaSpecLib(None)
prot1 = 'MABCDEKFGHIJKLMNOPQRST'
prot2 = 'FGHIJKLMNOPQR'
protein_dict = {
    'xx': {
        'id': 'xx',
        'sequence': prot1
    },
    'yy': {
        'id': 'yy',
        'sequence': prot2
    }
}
_lib.from_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,MABCDEK,0,0,True,False,,,7
1,LMNOPQR,0;1,0,False,True,,,7
2,LMNOPQRST,0,1,False,True,,,9
3,ABCDEKFGHIJK,0,1,True,False,,,12
4,MABCDEKFGHIJK,0,1,True,False,,,13
5,FGHIJKLMNOPQR,0;1,1,True,True,,,13
6,FGHIJKLMNOPQRST,0,2,False,True,,,15
7,ABCDEKFGHIJKLMNOPQR,0,2,True,False,,,19
8,MABCDEKFGHIJKLMNOPQR,0,2,True,False,,,20


In [22]:
#hide
# Check the terminal flags and protein indices of every digested peptide
# against the two protein sequences.
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    # test is_prot_nterm
    if prot1.startswith(seq) or prot2.startswith(seq):
        assert _lib.precursor_df.is_prot_nterm[i], seq
    elif prot1[1:].startswith(seq): # M.xxxxx
        assert _lib.precursor_df.is_prot_nterm[i], seq
    else:
        assert not _lib.precursor_df.is_prot_nterm[i], seq
    # test is_prot_cterm
    if prot1.endswith(seq) or prot2.endswith(seq):
        assert _lib.precursor_df.is_prot_cterm[i], seq
    else:
        assert not _lib.precursor_df.is_prot_cterm[i], seq
    # test protein_idxes
    if seq in prot1 and seq in prot2:
        assert _lib.precursor_df.protein_idxes[i] == '0;1'
    else:
        assert ';' not in _lib.precursor_df.protein_idxes[i]

In [23]:
# Expand the precursor table in place: one row per modification form.
# Running this cell again would expand the already expanded table.
_lib.add_modifications()
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA
0,MABCDEK,0,0,True,False,Carbamidomethyl@C;Oxidation@M,4;1,7
1,MABCDEK,0,0,True,False,Carbamidomethyl@C,4,7
2,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,4;0;1,7
3,MABCDEK,0,0,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,4;0,7
4,LMNOPQR,0;1,0,False,True,Oxidation@M,2,7
5,LMNOPQR,0;1,0,False,True,,,7
6,LMNOPQRST,0,1,False,True,Oxidation@M,2,9
7,LMNOPQRST,0,1,False,True,,,9
8,ABCDEKFGHIJK,0,1,True,False,Carbamidomethyl@C,3,12
9,ABCDEKFGHIJK,0,1,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,3;0,12


In [24]:
#hide
# Check the modification names and sites of every row after add_modifications().
for i in range(len(_lib.precursor_df)):
    seq = _lib.precursor_df.sequence[i]
    mods = _lib.precursor_df.mods[i]
    sites = _lib.precursor_df.mod_sites[i]
    # test fix mods
    if 'C' in seq:
        assert str(seq.find('C')+1) in sites
        assert 'Carbamidomethyl@C' in mods
    else:
        assert 'Carbamidomethyl@C' not in mods
    # test Acetyl@Protein N-term
    if 'Acetyl@Protein N-term' in mods:
        assert _lib.precursor_df.is_prot_nterm[i]
        assert '0' in sites
    # Site '0' marks an N-terminal modification. The sites are compared
    # as whole ';'-separated items so that e.g. '10' does not count as '0'.
    if '0' in sites.split(';'):
        assert _lib.precursor_df.is_prot_nterm[i]
        assert 'Acetyl@Protein N-term' in mods
    if not _lib.precursor_df.is_prot_nterm[i]:
        assert 'Acetyl@Protein N-term' not in mods
    # test Oxidation@M
    if 'Oxidation@M' in mods:
        assert 'M' in seq
        assert str(seq.find('M')+1) in sites
    # test unmodified
    if mods == '':
        assert sites == ''
    if sites == '':
        assert mods == ''
df = _lib.precursor_df
# at least one nterm peptide does not contain Acetyl@Protein N-term
assert not df[df.is_prot_nterm].mod_sites.str.contains('0').all()
# at least one nterm peptide contains Acetyl@Protein N-term
assert df[df.is_prot_nterm].mod_sites.str.contains('0').any()
# test var mod Oxidation@M
assert not df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').all()
assert df[df.sequence.str.contains('M')].mods.str.contains('Oxidation@M').any()
assert '' in df.mods.values

## Experimental

Using a suffix array and a longest-common-prefix (LCP) array for faster non-specific digestion.

In [25]:
#hide
# Experiment: enumerate substrings of '$'-delimited protein sequences with a
# suffix array and its LCP array.
import numpy as np
from pydivsufsort import divsufsort, kasai

# two toy proteins ('MSQVQVQV' and 'XQ') delimited by '$'
prot = "$MSQVQVQV$XQ$"
string_suffix_array = divsufsort(prot)
string_lcp_array = kasai(prot, string_suffix_array)
# argsort of the suffix array maps each text position to its rank, so this
# reorders the LCP values from suffix-array order to text-position order
ordered_lcp = string_lcp_array[np.argsort(string_suffix_array)]
for i in range(len(prot)):
    if prot[i] == '$': continue
    # Start at the LCP value + 1 -- presumably to skip prefixes shared with the
    # neighbouring suffix so that repeated substrings are listed only once;
    # TODO confirm against the pydivsufsort LCP convention.
    for seq_len in range(ordered_lcp[i]+1, len(prot)-i):
        # never extend a substring across a protein delimiter
        if prot[i+seq_len-1] == '$': break
        print('pos', i, ': ', prot[i:i+seq_len])

pos 1 :  M
pos 1 :  MS
pos 1 :  MSQ
pos 1 :  MSQV
pos 1 :  MSQVQ
pos 1 :  MSQVQV
pos 1 :  MSQVQVQ
pos 1 :  MSQVQVQV
pos 2 :  S
pos 2 :  SQ
pos 2 :  SQV
pos 2 :  SQVQ
pos 2 :  SQVQV
pos 2 :  SQVQVQ
pos 2 :  SQVQVQV
pos 3 :  Q
pos 3 :  QV
pos 3 :  QVQ
pos 3 :  QVQV
pos 3 :  QVQVQ
pos 3 :  QVQVQV
pos 4 :  V
pos 4 :  VQ
pos 4 :  VQV
pos 4 :  VQVQ
pos 4 :  VQVQV
pos 10 :  X
pos 10 :  XQ
