In [None]:
#| default_exp spec_lib.predict_lib

# Predict Spectral Library

Base functionality for predicting spectral libraries.

For most users, `peptdeep.pretrained_models.ModelManager`, `peptdeep.spec_lib.predict_lib.PredictSpecLib`, and `peptdeep.protein.fasta.PredictFastaSpecLib` cover most use cases.

Both `peptdeep.spec_lib.predict_lib.PredictSpecLib` and `peptdeep.protein.fasta.PredictFastaSpecLib` take a `peptdeep.pretrained_models.ModelManager` as the first positional argument.



In [None]:
#| export
import pandas as pd
import numpy as np
import torch

from alphabase.peptide.precursor import (
    calc_precursor_isotope_mp, calc_precursor_isotope
)
from alphabase.spectral_library.library_base import SpecLibBase
from alphabase.spectral_library.flat_library import FlatSpecLib
from alphabase.peptide.fragment import (
    flatten_fragments, concat_precursor_fragment_dataframes
)

from peptdeep.pretrained_models import ModelManager
from peptdeep.settings import global_settings
from peptdeep.utils import logging
from peptdeep.utils import process_bar

In [None]:
#| export

# The 'library' section of peptdeep's global settings.
lib_settings = global_settings['library']

class PredictSpecLib(SpecLibBase):
    """Spectral library whose RT/IM/MS2 values are predicted by a `ModelManager`.

    Parameters
    ----------
    model_manager : ModelManager, optional
        `ModelManager` used for the predictions. If None, a new
        `ModelManager(mask_modloss=False)` is created. By default None

    charged_frag_types : list, optional
        Charged fragment types, by default ['b_z1','b_z2','y_z1','y_z2'].
        The list is copied before it is handed to the base class.

    precursor_mz_min : float, optional
        precursor_mz_min, by default 400.0

    precursor_mz_max : float, optional
        precursor_mz_max, by default 2000.0

    generate_precursor_isotopes : bool, optional
        If True, `predict_all()` also calculates isotope information
        for the precursors, by default False

    decoy : str, optional
        Decoy choice, see `alphabase.spec_lib.decoy_library`, 
        by default 'pseudo_reverse'
    """
    def __init__(self,
        model_manager: ModelManager = None,
        charged_frag_types = ['b_z1','b_z2','y_z1','y_z2'],
        precursor_mz_min:float = 400.0, 
        precursor_mz_max:float = 2000.0,
        generate_precursor_isotopes:bool = False,
        decoy:str = 'pseudo_reverse'
    ):
        # Copy the list so that instances never share the default list
        # object of the signature (or alias the caller's list).
        if charged_frag_types is not None:
            charged_frag_types = list(charged_frag_types)
        super().__init__(
            charged_frag_types,
            precursor_mz_min=precursor_mz_min,
            precursor_mz_max=precursor_mz_max,
            decoy = decoy
        )
        self.generate_precursor_isotopes = generate_precursor_isotopes
        # Controls the progress messages logged by `predict_all()`.
        self.verbose = True
        if model_manager is None:
            self.model_manager = ModelManager(
                mask_modloss=False
            )
        else:
            self.model_manager = model_manager

        self._precursor_df = pd.DataFrame()
        self._fragment_intensity_df = pd.DataFrame()
        self._fragment_mz_df = pd.DataFrame()

    def _keep_charged_frag_columns(self,
        fragment_df: pd.DataFrame
    ) -> pd.DataFrame:
        """Return `fragment_df` without the columns missing from `self.charged_frag_types`."""
        # Non-inplace drop: the DataFrame passed in by the caller is left untouched.
        return fragment_df.drop(columns=[
            col for col in fragment_df.columns
            if col not in self.charged_frag_types
        ])

    def set_precursor_and_fragment(self,
        *,
        precursor_df: pd.DataFrame,
        fragment_mz_df: pd.DataFrame,
        fragment_intensity_df: pd.DataFrame,
    ):
        """Set the precursor and fragment dataframes of this library.

        Columns of the fragment dataframes that are not listed in
        `self.charged_frag_types` are removed from the stored dataframes;
        the dataframes given by the caller are not modified.

        Parameters
        ----------
        precursor_df : pd.DataFrame
            Stored as `self._precursor_df`
        fragment_mz_df : pd.DataFrame
            Fragment m/z dataframe, stored (column-filtered) as `self._fragment_mz_df`
        fragment_intensity_df : pd.DataFrame
            Fragment intensity dataframe, stored (column-filtered)
            as `self._fragment_intensity_df`
        """
        self._precursor_df = precursor_df
        self._fragment_intensity_df = self._keep_charged_frag_columns(
            fragment_intensity_df
        )
        self._fragment_mz_df = self._keep_charged_frag_columns(
            fragment_mz_df
        )

    def rt_to_irt_pred(self):
        """ Add 'irt_pred' into columns based on 'rt_pred' """
        return self.model_manager.rt_model.add_irt_column_to_precursor_df(self._precursor_df)

    def predict_all(self, 
        min_required_precursor_num_for_mp:int=2000,
    ):
        """
        1. Calculate precursor m/z values (`self.calc_precursor_mz()`)
        2. Calculate isotope information in self._precursor_df
           (only if `self.generate_precursor_isotopes` is True)
        3. Predict RT/IM/MS2 for self._precursor_df

        Parameters
        ----------
        min_required_precursor_num_for_mp : int, optional
            With fewer precursors than this, the isotope calculation uses
            `calc_precursor_isotope`, otherwise `calc_precursor_isotope_mp`.
            The value is also forwarded to `ModelManager.predict_all()`.
            By default 2000
        """
        self.calc_precursor_mz()
        if self.generate_precursor_isotopes:
            if self.verbose:
                logging.info('Calculating precursor isotope distributions ...')
            if len(self.precursor_df) < min_required_precursor_num_for_mp:
                self._precursor_df = calc_precursor_isotope(
                    self._precursor_df
                )
            else:
                self._precursor_df = calc_precursor_isotope_mp(
                    self._precursor_df, process_bar=process_bar
                )
        if self.verbose:
            logging.info('Predicting RT/IM/MS2 ...')
        res = self.model_manager.predict_all(
            self._precursor_df,
            predict_items=['rt','mobility','ms2'],
            frag_types=self.charged_frag_types,
            min_required_precursor_num_for_mp=min_required_precursor_num_for_mp
        )
        # `res` is unpacked as the keyword arguments of
        # `set_precursor_and_fragment()`.
        self.set_precursor_and_fragment(**res)
        if self.verbose:
            logging.info('End Predicting RT/IM/MS2')
        

In [None]:
#|hide
from nbdev.showdoc import show_doc

In [None]:
# Render the documentation of `PredictSpecLib.rt_to_irt_pred`.
show_doc(PredictSpecLib.rt_to_irt_pred)

---

[source](https://github.com/MannLabs/alphapeptdeep/blob/main/peptdeep/spec_lib/predict_lib.py#L99){target="_blank" style="float:right; font-size:smaller"}

### PredictSpecLib.rt_to_irt_pred

>      PredictSpecLib.rt_to_irt_pred ()

Add 'irt_pred' into columns based on 'rt_pred'

In [None]:
# Render the documentation of `PredictSpecLib.predict_all`.
show_doc(PredictSpecLib.predict_all)

---

[source](https://github.com/MannLabs/alphapeptdeep/blob/main/peptdeep/spec_lib/predict_lib.py#L103){target="_blank" style="float:right; font-size:smaller"}

### PredictSpecLib.predict_all

>      PredictSpecLib.predict_all (min_required_precursor_num_for_mp:int=2000)

1. Predict RT/IM/MS2 for self._precursor_df
2. Calculate isotope information in self._precursor_df

In [None]:
#| export

class FlatPredictSpecLib(FlatSpecLib):
    """ 
    Flatten the predicted spectral library, the key feature is to 
    predict and flatten fragments in batch with `predict_and_parse_lib_in_batch()`

    Parameters
    ----------
    min_fragment_intensity : float, optional
        minimal intensity to keep, by default 0.001
    keep_top_k_fragments : int, optional
        top k highest peaks to keep, by default 1000
    custom_fragment_df_columns : list, optional
        custom columns forwarded to `FlatSpecLib` and to `flatten_fragments()`,
        by default ['type','number','position','charge','loss_type'].
        The list is copied before it is handed to the base class.
    **kwargs
        Accepted for call compatibility; not forwarded to `FlatSpecLib`.
    """
    def __init__(self, 
        min_fragment_intensity:float = 0.001,
        keep_top_k_fragments:int = 1000,
        custom_fragment_df_columns:list = [
            'type','number','position','charge','loss_type'
        ],
        **kwargs,
    ):
        # Copy the list so that instances never share the default list
        # object of the signature (or alias the caller's list).
        if custom_fragment_df_columns is not None:
            custom_fragment_df_columns = list(custom_fragment_df_columns)
        super().__init__(
            min_fragment_intensity=min_fragment_intensity,
            keep_top_k_fragments=keep_top_k_fragments,
            custom_fragment_df_columns=custom_fragment_df_columns
        )

    def predict_and_parse_lib_in_batch(self, 
        predict_lib:PredictSpecLib, 
        batch_size:int = 200000
    ):
        """Predict and flatten fragments in batch

        If `predict_lib` has at most `batch_size` precursors, it is predicted
        in one go and parsed with `parse_base_library()`. Otherwise the
        precursors are predicted and flattened slice by slice and the
        flattened results are concatenated into this library.

        In the batched case `predict_lib.verbose` is switched off while
        predicting and restored afterwards, and the full precursor dataframe
        of `predict_lib` is put back even if a batch fails.

        Parameters
        ----------
        predict_lib : PredictSpecLib
            spectral library to be predicted and flattened
        batch_size : int, optional
            the batch size, by default 200000

        Raises
        ------
        ValueError
            If batching is required but `batch_size` is not positive.
        """
        if len(predict_lib.precursor_df) <= batch_size:
            predict_lib.predict_all()
            self.parse_base_library(predict_lib)
            return

        if batch_size <= 0:
            raise ValueError(
                f"`batch_size` must be a positive integer, got {batch_size}"
            )

        verbose_before = predict_lib.verbose
        # Avoid one set of log messages per batch.
        predict_lib.verbose = False
        try:
            predict_lib.refine_df()
            precursor_df = predict_lib.precursor_df
            precursor_df_list = []
            fragment_df_list = []
            try:
                for i in range(0, len(precursor_df), batch_size):
                    # Temporarily point the library at one slice of precursors.
                    predict_lib._precursor_df = precursor_df.iloc[i:i+batch_size].copy()
                    predict_lib.predict_all()
                    df, frag_df = flatten_fragments(
                        predict_lib.precursor_df,
                        predict_lib.fragment_mz_df,
                        predict_lib.fragment_intensity_df,
                        min_fragment_intensity = self.min_fragment_intensity,
                        keep_top_k_fragments = self.keep_top_k_fragments,
                        custom_columns=self.custom_fragment_df_columns
                    )
                    precursor_df_list.append(df)
                    fragment_df_list.append(frag_df)
            finally:
                # Always hand the complete precursor dataframe back to the library.
                predict_lib._precursor_df = precursor_df
        finally:
            predict_lib.verbose = verbose_before

        self._precursor_df, self._fragment_df = concat_precursor_fragment_dataframes(
            precursor_df_list, fragment_df_list
        )



In [None]:
# Render the documentation of `FlatPredictSpecLib.predict_and_parse_lib_in_batch`.
show_doc(FlatPredictSpecLib.predict_and_parse_lib_in_batch)

---

[source](https://github.com/MannLabs/alphapeptdeep/blob/main/peptdeep/spec_lib/predict_lib.py#L161){target="_blank" style="float:right; font-size:smaller"}

### FlatPredictSpecLib.predict_and_parse_lib_in_batch

>      FlatPredictSpecLib.predict_and_parse_lib_in_batch
>                                                         (predict_lib:__main__.
>                                                         PredictSpecLib,
>                                                         batch_size:int=200000)

Predict and flatten fragments in batch

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| predict_lib | PredictSpecLib |  | spectral library to be predicted and flatten |
| batch_size | int | 200000 | the batch size, by default 200000 |

In [None]:
from peptdeep.protein.fasta import PredictFastaSpecLib
from alphabase.peptide.fragment import get_charged_frag_types

In [None]:
# Two toy protein sequences, keyed by their protein ids.
prot1 = 'MACDESTYKAKFGHIKLMNPQRST'
prot2 = 'FGHIKLMNPQR'
protein_dict = {
    protein_id: {'protein_id': protein_id, 'sequence': sequence}
    for protein_id, sequence in [('xx', prot1), ('yy', prot2)]
}

# CPU model manager with the installed models loaded and verbose output disabled.
model_mgr = ModelManager(device='cpu', mask_modloss=False)
model_mgr.load_installed_models()
model_mgr.verbose = False

# Library for b/y fragments (with and without modloss), second argument 2.
_lib = PredictFastaSpecLib(
    model_mgr,
    charged_frag_types=get_charged_frag_types(
        ['b','y','b_modloss','y_modloss'], 2
    ),
    I_to_L=False,
    decoy='pseudo_reverse',
)
_lib.import_and_process_protein_dict(protein_dict)
_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge
0,AKFGHIK,0,1,False,False,,,7,0,2
1,AKFGHIK,0,1,False,False,,,7,0,3
2,AKFGHIK,0,1,False,False,,,7,0,4
3,IHGFKAK,0,1,False,False,,,7,1,2
4,IHGFKAK,0,1,False,False,,,7,1,3
...,...,...,...,...,...,...,...,...,...,...
169,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,13;0;15,16,1,3
170,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,13;0;15,16,1,4
171,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,13;0,16,1,2
172,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,13;0,16,1,3


In [None]:
# Disable verbose output of the source library before predicting.
_lib.verbose = False

# Flat library that keeps only 'type' as a custom fragment column;
# batch_size=100 is smaller than the number of precursors shown above,
# so the batched code path is used.
flat_lib = FlatPredictSpecLib(custom_fragment_df_columns=['type'])
flat_lib.predict_and_parse_lib_in_batch(predict_lib=_lib, batch_size=100)
flat_lib.precursor_df

Unnamed: 0,sequence,protein_idxes,miss_cleavage,is_prot_nterm,is_prot_cterm,mods,mod_sites,nAA,decoy,charge,precursor_mz,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,frag_end_idx,nce,instrument,frag_start_idx
0,AKFGHIK,0,1,False,False,,,7,0,2,400.742505,0.029719,0.029719,315.498627,0.774353,11,30.0,Lumos,0
1,AKFGHIK,0,1,False,False,,,7,0,3,267.497429,0.029719,0.029719,389.692902,0.637649,23,30.0,Lumos,11
2,AKFGHIK,0,1,False,False,,,7,0,4,200.874891,0.029719,0.029719,456.406067,0.560120,32,30.0,Lumos,23
3,IHGFKAK,0,1,False,False,,,7,1,2,400.742505,0.018621,0.018621,313.312408,0.768987,44,30.0,Lumos,32
4,IHGFKAK,0,1,False,False,,,7,1,3,267.497429,0.018621,0.018621,376.615753,0.616251,53,30.0,Lumos,44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,13;0;15,16,1,3,648.637850,0.213083,0.213083,543.556580,0.898357,2623,30.0,Lumos,2593
170,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term;Oxidat...,13;0;15,16,1,4,486.730206,0.213083,0.213083,671.651733,0.832552,2648,30.0,Lumos,2623
171,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,13;0,16,1,2,964.455679,0.270350,0.270350,469.423584,1.163679,2673,30.0,Lumos,2648
172,IHGFKAKYTSEDCAMK,0,2,True,False,Carbamidomethyl@C;Acetyl@Protein N-term,13;0,16,1,3,643.306211,0.270350,0.270350,543.796631,0.898701,2702,30.0,Lumos,2673


In [None]:
# Display the flattened fragment dataframe produced by the batch prediction.
flat_lib.fragment_df

Unnamed: 0,mz,intensity,type
0,729.440621,0.049367,121
1,365.223949,0.133035,121
2,200.139353,0.194481,98
3,601.345658,1.000000,121
4,347.207767,0.045913,98
...,...,...,...
2720,509.221052,0.737302,121
2721,255.114164,0.082723,121
2722,349.190403,0.549802,121
2723,278.153289,0.576329,121
