In [None]:
#| default_exp pretrained_models

# Integrated functionalities for MS2/RT/CCS models

`peptdeep.pretrained_models` handles the pretrained models, including downloading, installing, and loading the models.

## 1. Downloading and installing the models
For continuous model deployment, we uploaded several pretrained models (compressed as a ZIP file) to an online storage. peptdeep will automatically download the ZIP file to `global_settings['PEPTDEEP_HOME']/pretrained_models/pretrained_models.zip` when `peptdeep.pretrained_models` is imported. The models are downloaded only once; to update them to the latest version, call `download_models(overwrite=True)`.

In [None]:
#| export
import os
import pathlib
import io
import pandas as pd
import torch
import urllib
import socket
import logging
import shutil
import ssl
from pickle import UnpicklingError
import torch.multiprocessing as mp
from typing import Dict
from zipfile import ZipFile
from tarfile import TarFile
from typing import Tuple

from alphabase.peptide.fragment import (
    create_fragment_mz_dataframe,
    get_charged_frag_types,
    concat_precursor_fragment_dataframes
)
from alphabase.peptide.precursor import (
    refine_precursor_df,
    update_precursor_mz
)
from alphabase.peptide.mobility import (
    mobility_to_ccs_for_df,
    ccs_to_mobility_for_df
)

from peptdeep.settings import global_settings
from peptdeep.utils import logging, process_bar
from peptdeep.settings import global_settings

from peptdeep.model.ms2 import (
    pDeepModel, normalize_training_intensities
)
from peptdeep.model.rt import AlphaRTModel
from peptdeep.model.ccs import AlphaCCSModel
from peptdeep.utils import uniform_sampling

from peptdeep.settings import global_settings

In [None]:
#| export
# Directory that holds the local model archive:
# `<PEPTDEEP_HOME>/pretrained_models`, with a leading `~` expanded.
pretrain_dir = os.path.join(
    os.path.expanduser(global_settings['PEPTDEEP_HOME']),
    "pretrained_models",
)

# Create the directory the first time the module is imported.
if not os.path.exists(pretrain_dir):
    os.makedirs(pretrain_dir)

# Source location of the archive and the file name used for the local copy.
model_url = global_settings['model_url']
model_zip_name = global_settings['local_model_zip_name']

# Full path of the local model archive.
model_zip = os.path.join(pretrain_dir, model_zip_name)

def is_model_zip(downloaded_zip):
    """Check whether a ZIP archive contains the generic MS2 model.

    Parameters
    ----------
    downloaded_zip : str or file-like
        Passed as-is to `zipfile.ZipFile`.

    Returns
    -------
    bool
        True if the archive has an entry named exactly 'generic/ms2.pth'.
    """
    with ZipFile(downloaded_zip) as archive:
        return 'generic/ms2.pth' in archive.namelist()

def download_models(
    url:str=model_url, overwrite=True
):
    """Download (or copy) the pretrained model archive to `model_zip`.

    Parameters
    ----------
    url : str, optional
        Remote or local path. If it points to an existing local file,
        the file is copied instead of downloaded.
        Defaults to `peptdeep.pretrained_models.model_url`

    overwrite : bool, optional
        Overwrite an existing local archive. If False and `model_zip`
        already exists, nothing is downloaded or copied.
        Defaults to True.

    Raises
    ------
    FileNotFoundError
        If remote url is not accessible.
    """
    # `import urllib` alone does not guarantee that the `request` and
    # `error` submodules are loaded, so import them explicitly here.
    import urllib.request
    import urllib.error

    if not overwrite and os.path.isfile(model_zip):
        logging.info(
            f'The pretrained models already exist in {model_zip}, '
            'skip downloading.'
        )
        return

    if not os.path.isfile(url):
        logging.info(f'Downloading {model_zip_name} ...')
        try:
            # NOTE(review): certificate verification is disabled for this
            # download (private `ssl` API). Kept as in the original code;
            # consider a verified context if the server allows it.
            context = ssl._create_unverified_context()
            with urllib.request.urlopen(
                url, context=context, timeout=10
            ) as response:
                # Read everything before touching `model_zip`, so that a
                # failed download never leaves a truncated archive behind.
                data = response.read()
        except (
            socket.timeout, 
            urllib.error.URLError, 
            urllib.error.HTTPError
        ) as e:
            raise FileNotFoundError(
                'Downloading model failed! Please download the '
                f'zip or tar file by yourself from "{url}",'
                ' and use \n'
                f'"peptdeep --install-model /path/to/{model_zip_name}"\n'
                ' to install the models'
            ) from e
        with open(model_zip, 'wb') as f:
            f.write(data)
    else:
        # Copying a file onto itself raises shutil.SameFileError; in that
        # case the archive is already where it should be.
        already_in_place = (
            os.path.exists(model_zip)
            and os.path.samefile(url, model_zip)
        )
        if not already_in_place:
            shutil.copy(
                url, model_zip
            )
    logging.info(f'The pretrained models had been downloaded in {model_zip}')

In [None]:
#| export
# Runs at import time: fetch the model archive if there is no local copy yet.
if not os.path.exists(model_zip):
    download_models()

In [None]:
#| hide
# Sanity check: the local archive must contain 'generic/ms2.pth'.
assert is_model_zip(model_zip)

## 2. Loading the models
peptdeep provides convenient APIs to load models from ZIP files. 

`load_models()` will load the generic models for unmodified peptides, and `load_phos_models()` will load the phospho models. Note that the MS2 and CCS prediction models are the same for generic and phospho models because these models were trained on both generic and phospho peptides.

In [None]:
#| export
# Shortcut to the 'model_mgr' section of the global settings.
# It is also used below to fill default argument values.
model_mgr_settings = global_settings['model_mgr']

def count_mods(psm_df)->pd.DataFrame:
    """Count in how many PSMs each modification occurs.

    `psm_df.mods` holds ';'-separated modification names. A modification
    is counted at most once per PSM. Names of the form `XXX->YYYYY`
    (3 characters, '->', 5 characters) are pooled into one 'mutation' row.

    Args:
        psm_df (pd.DataFrame): PSM dataframe with a string column 'mods'.

    Returns:
        pd.DataFrame: columns 'mod' and 'spec_count', sorted by
            'spec_count' in descending order. The 'mutation' row is
            always present, possibly with a count of zero.
    """
    has_mods = psm_df.mods.str.len() > 0
    counts = {'mutation': {'spec_count': 0}}

    for mod_str in psm_df[has_mods].mods.values:
        # set(): a modification occurring twice in one PSM counts once
        for mod_name in set(mod_str.split(';')):
            parts = mod_name.split('->')
            looks_like_mutation = (
                len(parts)==2
                and len(parts[0])==3
                and len(parts[1])==5
            )
            if looks_like_mutation:
                counts['mutation']['spec_count'] += 1
            elif mod_name in counts:
                counts[mod_name]['spec_count'] += 1
            else:
                counts[mod_name] = {'spec_count': 1}

    mod_df = pd.DataFrame.from_dict(counts, orient='index')
    mod_df = mod_df.reset_index(drop=False).rename(
        columns={'index':'mod'}
    )
    mod_df = mod_df.sort_values('spec_count',ascending=False)
    return mod_df.reset_index(drop=True)

def psm_sampling_with_important_mods(
    psm_df, n_sample, 
    top_n_mods = 10,
    n_sample_each_mod = 0, 
    uniform_sampling_column = None,
    random_state=1337,
):
    """Sample PSMs and optionally add extra samples for frequent modifications.

    The result is the concatenation (with a fresh 0..N-1 index) of:
    one sample of `n_sample` PSMs from `psm_df`, and, if
    `n_sample_each_mod > 0`, one sample of `n_sample_each_mod` PSMs for
    each of the `top_n_mods` most frequent modifications reported by
    `count_mods` (the pooled 'mutation' entry is ignored). The same PSM
    may therefore appear more than once.

    Args:
        psm_df (pd.DataFrame): PSM dataframe with a 'mods' column.
        n_sample (int): number of PSMs in the main sample.
        top_n_mods (int, optional): number of most frequent modifications
            to sample for. Defaults to 10.
        n_sample_each_mod (int, optional): number of PSMs to sample for each
            of these modifications; 0 disables it. Defaults to 0.
        uniform_sampling_column (str, optional): if given, sampling is
            delegated to `uniform_sampling` with this column as target;
            otherwise `DataFrame.sample` without replacement is used and
            a frame with no more than `n` rows is copied as a whole.
            Defaults to None.
        random_state (int, optional): seed passed to the sampler.
            Defaults to 1337.

    Returns:
        pd.DataFrame: the concatenated samples.
    """
    def _sample(df, n):
        if uniform_sampling_column is not None:
            return uniform_sampling(
                df, target=uniform_sampling_column,
                n_train = n, random_state=random_state
            )
        if n < len(df):
            return df.sample(
                n, replace=False,
                random_state=random_state
            ).copy()
        return df.copy()

    sampled_dfs = [_sample(psm_df, n_sample)]

    if n_sample_each_mod > 0:
        mod_df = count_mods(psm_df)
        # count_mods sorts by frequency, so the head holds the top mods
        top_mods = mod_df.loc[
            mod_df['mod']!='mutation', 'mod'
        ].values[:top_n_mods]
        sampled_dfs.extend(
            _sample(
                psm_df[psm_df.mods.str.contains(mod, regex=False)],
                n_sample_each_mod,
            )
            for mod in top_mods
        )

    return pd.concat(sampled_dfs, ignore_index=True)

def load_phos_models(mask_modloss=True):
    """Load the installed phospho MS2/RT/CCS models from `model_zip`.

    Only the RT model is phospho-specific ('phospho/rt_phos.pth'). The MS2
    and CCS models are the generic ones, which matches what
    `ModelManager.load_installed_models` loads for the 'phospho' model type.

    Args:
        mask_modloss (bool, optional): passed to `pDeepModel`.
            Defaults to True.

    Returns:
        tuple: (ms2_model, rt_model, ccs_model)
    """
    ms2_model = pDeepModel(mask_modloss=mask_modloss)
    # Same MS2 file as ModelManager uses for phospho; the archive check in
    # this notebook does not list a separate 'phospho/ms2_phos.pth'.
    ms2_model.load(model_zip, model_path_in_zip='generic/ms2.pth')
    rt_model = AlphaRTModel()
    rt_model.load(model_zip, model_path_in_zip='phospho/rt_phos.pth')
    ccs_model = AlphaCCSModel()
    ccs_model.load(model_zip, model_path_in_zip='generic/ccs.pth')
    return ms2_model, rt_model, ccs_model

def load_models(mask_modloss=True):
    """Load the installed generic MS2/RT/CCS models from `model_zip`.

    Args:
        mask_modloss (bool, optional): passed to `pDeepModel`.
            Defaults to True.

    Returns:
        tuple: (ms2_model, rt_model, ccs_model), loaded from
            'generic/ms2.pth', 'generic/rt.pth' and 'generic/ccs.pth'.
    """
    return load_models_by_model_type_in_zip(
        'generic', mask_modloss=mask_modloss
    )

def load_models_by_model_type_in_zip(model_type_in_zip:str, mask_modloss=True):
    """Load MS2/RT/CCS models from one folder of `model_zip`.

    Args:
        model_type_in_zip (str): folder name inside the archive; the files
            '<folder>/ms2.pth', '<folder>/rt.pth' and '<folder>/ccs.pth'
            are loaded.
        mask_modloss (bool, optional): passed to `pDeepModel`.
            Defaults to True.

    Returns:
        tuple: (ms2_model, rt_model, ccs_model)
    """
    ms2_model = pDeepModel(mask_modloss=mask_modloss)
    rt_model = AlphaRTModel()
    ccs_model = AlphaCCSModel()

    models_and_paths = (
        (ms2_model, f'{model_type_in_zip}/ms2.pth'),
        (rt_model, f'{model_type_in_zip}/rt.pth'),
        (ccs_model, f'{model_type_in_zip}/ccs.pth'),
    )
    for model, path_in_zip in models_and_paths:
        model.load(model_zip, model_path_in_zip=path_in_zip)

    return ms2_model, rt_model, ccs_model


## 3. Using `ModelManager`

For users, the `ModelManager` class is all that is needed to manage models (loading, transfer learning, etc.). Depending on its `model_type` argument, `ModelManager.load_installed_models()` loads the generic, phospho, or digly models from the installed ZIP file. For external models, `ModelManager.load_external_models()` loads them from a file path or a file stream. Here is an example:

```
from zipfile import ZipFile

admodel = ModelManager()
ext_zip = 'external_models.zip' # model compressed in ZIP
rt_model_path = '/path/to/rt.pth' # model as file path
with ZipFile(ext_zip) as model_zip:
    with model_zip.open('generic/ms2.pth','r') as ms2_file:
        admodel.load_external_models(ms2_model_file=ms2_file, rt_model_file=rt_model_path)
```

Transfer learning for different models could also be done in `ModelManager` by using the given training dataframes.

In [None]:
#| export
def clear_error_modloss_intensities(
    fragment_mz_df, fragment_intensity_df
):
    """Zero modloss intensities wherever the matching modloss m/z is zero.

    For every column of `fragment_mz_df` whose name contains 'modloss',
    the rows with an m/z of 0 get an intensity of 0 in the column of the
    same name in `fragment_intensity_df`. The intensity dataframe is
    modified in place and nothing is returned.

    Args:
        fragment_mz_df (pd.DataFrame): fragment m/z values.
        fragment_intensity_df (pd.DataFrame): fragment intensities, row-aligned
            with `fragment_mz_df`.
    """
    modloss_columns = [
        name for name in fragment_mz_df.columns.values
        if 'modloss' in name
    ]
    for name in modloss_columns:
        no_fragment = fragment_mz_df[name]==0
        fragment_intensity_df.loc[no_fragment, name] = 0

class ModelManager(object):
    def __init__(self, 
        mask_modloss:bool=model_mgr_settings['mask_modloss'],
        device:str='gpu',
        mgr_settings:dict=model_mgr_settings,
    ):
        """ The manager class to access MS2/RT/CCS models.

        On construction, the installed models of type
        `mgr_settings['model_type']` are loaded first. After that, the
        external model files named in `mgr_settings` are loaded on top of
        them, if they are given.

        Args:
            mask_modloss (bool, optional): If modloss ions are masked to zeros
                in the ms2 model. `modloss` ions are mostly useful for phospho 
                MS2 prediction model. 
                Defaults to :py:data:`global_settings`['model_mgr']['mask_modloss'].
            device (str, optional): Device for DL models, could be 'gpu' ('cuda') or 'cpu'.
                if device=='gpu' but no GPUs are detected, it will automatically switch to 'cpu'.
                Defaults to 'gpu'.
            mgr_settings (dict, optional): Settings that all attributes below 
                are read from. Defaults to :py:data:`global_settings`['model_mgr'].
                
        Attributes:
            ms2_model (:py:class:`peptdeep.model.ms2.pDeepModel`): The MS2 
                prediction model.
            rt_model (:py:class:`peptdeep.model.rt.AlphaRTModel`): The RT prediction model.
            ccs_model (:py:class:`peptdeep.model.ccs.AlphaCCSModel`): The CCS prediction model.
            psm_num_to_train_ms2 (int): Number of PSMs to train the MS2 model. 
                Read from mgr_settings['transfer']['psm_num_to_train_ms2'].
            epoch_to_train_ms2 (int): Number of epochs to train the MS2 model. 
                Read from mgr_settings['transfer']['epoch_ms2'].
            psm_num_to_train_rt_ccs (int): Number of PSMs to train RT/CCS model. 
                Read from mgr_settings['transfer']['psm_num_to_train_rt_ccs'].
            epoch_to_train_rt_ccs (int): Number of epochs to train RT/CCS model. 
                Read from mgr_settings['transfer']['epoch_rt_ccs'].
            nce (float): Default NCE value for a precursor_df without the 'nce' column.
                Read from mgr_settings['default_nce'].
            instrument (str): Default instrument type for a precursor_df without the 'instrument' column.
                Read from mgr_settings['default_instrument'].
            use_grid_nce_search (bool): If self.ms2_model uses 
                :py:meth:`peptdeep.model.ms2.pDeepModel.grid_nce_search` to determine optimal
                NCE and instrument type. This will change `self.nce` and `self.instrument` values.
                Read from mgr_settings['transfer']['grid_nce_search'].
        """
        self.mgr_settings = mgr_settings

        # Create the models, then load installed weights followed by
        # external model files.
        self.ms2_model:pDeepModel = pDeepModel(mask_modloss=mask_modloss, device=device)
        self.rt_model:AlphaRTModel = AlphaRTModel(device=device)
        self.ccs_model:AlphaCCSModel = AlphaCCSModel(device=device)

        self.load_installed_models(mgr_settings['model_type'])
        self.load_external_models(
            ms2_model_file = mgr_settings['external_ms2_model'],
            rt_model_file = mgr_settings['external_rt_model'],
            ccs_model_file = mgr_settings['external_ccs_model'],
        )

        transfer_settings = mgr_settings['transfer']

        self.use_grid_nce_search = transfer_settings['grid_nce_search']

        # Transfer learning of the MS2 model
        self.psm_num_to_train_ms2 = transfer_settings["psm_num_to_train_ms2"]
        self.epoch_to_train_ms2 = transfer_settings['epoch_ms2']
        self.warmup_epoch_to_train_ms2 = transfer_settings['warmup_epoch_ms2']
        self.batch_size_to_train_ms2 = transfer_settings['batch_size_ms2']
        self.lr_to_train_ms2 = float(transfer_settings['lr_ms2'])

        # Transfer learning of the RT and CCS models (shared settings)
        self.psm_num_to_train_rt_ccs = transfer_settings["psm_num_to_train_rt_ccs"]
        self.epoch_to_train_rt_ccs = transfer_settings['epoch_rt_ccs']
        self.warmup_epoch_to_train_rt_ccs = transfer_settings['warmup_epoch_rt_ccs']
        self.batch_size_to_train_rt_ccs = transfer_settings['batch_size_rt_ccs']
        self.lr_to_train_rt_ccs = float(transfer_settings['lr_rt_ccs'])

        # Extra sampling of PSMs carrying the most frequent modifications
        self.psm_num_per_mod_to_train_ms2 = transfer_settings[
            "psm_num_per_mod_to_train_ms2"
        ]
        self.psm_num_per_mod_to_train_rt_ccs = transfer_settings[
            "psm_num_per_mod_to_train_rt_ccs"
        ]
        self.top_n_mods_to_train = transfer_settings["top_n_mods_to_train"]

        # Defaults for prediction; `instrument` goes through the property setter
        self.nce = mgr_settings['default_nce']
        self.instrument = mgr_settings['default_instrument']
        self.verbose = mgr_settings['predict']['verbose']
        self.train_verbose = transfer_settings['verbose']


    @property
    def instrument(self):
        """Default instrument name, as stored by the setter."""
        return self._instrument
    @instrument.setter
    def instrument(self, instrument_name:str):
        # The name is upper-cased and looked up in
        # mgr_settings['instrument_group']; unknown names fall back to 'Lumos'.
        group_key = instrument_name.upper()
        instrument_groups = self.mgr_settings['instrument_group']
        self._instrument = instrument_groups.get(group_key, 'Lumos')

    def set_default_nce_instrument(self, df):
        """Add missing 'nce' and 'instrument' columns to `df` in place.

        A missing 'nce' column is filled with `self.nce`, a missing
        'instrument' column with `self.instrument`. Existing columns are
        left untouched.

        Args:
            df (pd.DataFrame): precursor dataframe to complete.
        """
        columns = df.columns
        if 'nce' not in columns:
            df['nce'] = self.nce
        if 'instrument' not in columns:
            df['instrument'] = self.instrument

    def set_default_nce(self, df):
        """Same as `set_default_nce_instrument`: forwards `df` to it."""
        self.set_default_nce_instrument(df)

    def load_installed_models(self, 
        model_type:str=model_mgr_settings['model_type']
    ):
        """ Load built-in MS2/CCS/RT models from the installed model archive.

        The MS2 and CCS models are always the generic ones. Only the RT
        model depends on `model_type`, which is matched case-insensitively.

        Args:
            model_type (str, optional): To load the installed MS2/RT/CCS models 
                or phos MS2/RT/CCS models. It could be 'digly', 'phospho', 'HLA', or 'generic'.
                'HLA' loads the generic models. An unsupported value logs a 
                warning and loads the generic models as well.
                Defaults to `global_settings['model_mgr']['model_type']` ('generic').
        """
        phospho_aliases = ('phospho','phos','phosphorylation')
        digly_aliases = (
            'digly','glygly','ubiquitylation',
            'ubiquitination','ubiquitinylation',
        )
        generic_aliases = (
            'regular','common','generic',
            'hla','unspecific','non-specific','nonspecific',
        )

        model_key = model_type.lower()
        if model_key in phospho_aliases:
            rt_path_in_zip = 'phospho/rt_phos.pth'
        elif model_key in digly_aliases:
            rt_path_in_zip = 'digly/rt_digly.pth'
        else:
            if model_key not in generic_aliases:
                logging.warning(
                    f"model_type='{model_type}' is not supported, use 'generic' instead."
                )
            rt_path_in_zip = 'generic/rt.pth'

        self.ms2_model.load(
            model_zip, model_path_in_zip='generic/ms2.pth'
        )
        self.rt_model.load(
            model_zip, model_path_in_zip=rt_path_in_zip
        )
        self.ccs_model.load(
            model_zip, model_path_in_zip='generic/ccs.pth'
        )

    def load_external_models(self,
        *,
        ms2_model_file: Tuple[str, io.BytesIO]=model_mgr_settings['external_ms2_model'],
        rt_model_file: Tuple[str, io.BytesIO]=model_mgr_settings['external_rt_model'],
        ccs_model_file: Tuple[str, io.BytesIO]=model_mgr_settings['external_ccs_model'],
    ):
        """Load external MS2/RT/CCS models.

        Each argument is either a file path (str) or a binary stream. A str 
        that is not an existing file (for example '') is skipped, so the 
        corresponding model keeps its current weights. Every model file is 
        loaded exactly once.

        Args:
            ms2_model_file (Tuple[str, io.BytesIO], optional): ms2 model file or stream.
                Do nothing if the value is ''. Defaults to global_settings['model_mgr']['external_ms2_model'].
            rt_model_file (Tuple[str, io.BytesIO], optional): rt model file or stream.
                Do nothing if the value is ''. Defaults to global_settings['model_mgr']['external_rt_model'].
            ccs_model_file (Tuple[str, io.BytesIO], optional): ccs model or stream.
                Do nothing if the value is ''. Defaults to global_settings['model_mgr']['external_ccs_model'].
        """

        def _load_file(model, model_file):
            # A str is a file path: skip it when it does not exist.
            if isinstance(model_file, str) and not os.path.isfile(model_file):
                return
            try:
                # Single load for both paths and streams. The previous code
                # loaded an existing file path twice.
                model.load(model_file)
            except UnpicklingError:
                # Best effort: keep the weights that are already loaded.
                logging.info(f"Cannot load {model_file} as {model.__class__} model, peptdeep will use the pretrained model instead.")

        _load_file(self.ms2_model, ms2_model_file)
        _load_file(self.rt_model, rt_model_file)
        _load_file(self.ccs_model, ccs_model_file)

    def train_rt_model(self,
        psm_df:pd.DataFrame,
    ):
        """ Train/fine-tune the RT model. The fine-tuning will be skipped 
            if `self.psm_num_to_train_rt_ccs` is zero.

        If `psm_df` has more than `self.psm_num_to_train_rt_ccs` PSMs, the
        training set is drawn with `psm_sampling_with_important_mods`
        (uniform sampling on 'rt_norm'). Otherwise all PSMs are used.

        Args:
            psm_df (pd.DataFrame): training psm_df which contains 'rt_norm' column.
        """
        if self.psm_num_to_train_rt_ccs <= 0:
            return

        # Sample only when there are more PSMs than requested. This mirrors
        # `train_ms2_model`; the per-modification number is not the right
        # threshold here.
        if self.psm_num_to_train_rt_ccs < len(psm_df):
            tr_df = psm_sampling_with_important_mods(
                psm_df, self.psm_num_to_train_rt_ccs,
                self.top_n_mods_to_train,
                self.psm_num_per_mod_to_train_rt_ccs,
                uniform_sampling_column='rt_norm'
            )
        else:
            tr_df = psm_df

        if len(tr_df) > 0:
            self.rt_model.train(tr_df, 
                batch_size=self.batch_size_to_train_rt_ccs,
                epoch=self.epoch_to_train_rt_ccs,
                warmup_epoch=self.warmup_epoch_to_train_rt_ccs,
                lr=self.lr_to_train_rt_ccs,
                verbose=self.train_verbose,
            )

    def train_ccs_model(self,
        psm_df:pd.DataFrame,
    ):
        """ Train/fine-tune the CCS model. The fine-tuning will be skipped
            if `self.psm_num_to_train_rt_ccs` is zero, or if `psm_df` has 
            neither a 'ccs' nor a 'mobility' column.

        If only one of the two columns is present, the other one is computed 
        and added to `psm_df` in place before training.

        Args:
            psm_df (pd.DataFrame): training psm_df which contains 
            'ccs' or 'mobility' column.
        """
        has_ccs = 'ccs' in psm_df.columns
        has_mobility = 'mobility' in psm_df.columns

        # Nothing to train on only when BOTH columns are missing. The
        # previous `or` skipped training unless both were present and made
        # the conversions below unreachable.
        if not has_ccs and not has_mobility:
            return
        if not has_ccs:
            psm_df['ccs'] = mobility_to_ccs_for_df(
                psm_df, 'mobility'
            )
        elif not has_mobility:
            psm_df['mobility'] = ccs_to_mobility_for_df(
                psm_df, 'ccs'
            )

        if self.psm_num_to_train_rt_ccs <= 0:
            return

        # Sample only when there are more PSMs than requested. This mirrors
        # `train_ms2_model`; the per-modification number is not the right
        # threshold here.
        if self.psm_num_to_train_rt_ccs < len(psm_df):
            tr_df = psm_sampling_with_important_mods(
                psm_df, self.psm_num_to_train_rt_ccs,
                self.top_n_mods_to_train,
                self.psm_num_per_mod_to_train_rt_ccs,
                uniform_sampling_column='ccs'
            )
        else:
            tr_df = psm_df

        if len(tr_df) > 0:
            self.ccs_model.train(tr_df, 
                batch_size=self.batch_size_to_train_rt_ccs,
                epoch=self.epoch_to_train_rt_ccs,
                warmup_epoch=self.warmup_epoch_to_train_rt_ccs,
                lr=self.lr_to_train_rt_ccs,
                verbose=self.train_verbose,
            )

    def train_ms2_model(self,
        psm_df: pd.DataFrame,
        matched_intensity_df: pd.DataFrame,
    ):
        """Using matched_intensity_df to train/fine-tune the ms2 model. 
        1. It will sample `n=self.psm_num_to_train_ms2` PSMs into training dataframe (`tr_df`) for fine-tuning.
        2. This method will also consider some important PTMs (`n=self.top_n_mods_to_train`) into `tr_df` for fine-tuning. 
        3. If `self.use_grid_nce_search==True`, this method will call `self.ms2_model.grid_nce_search` to find the best NCE and instrument.

        Nothing is done if `self.psm_num_to_train_ms2` is zero or the 
        training dataframe is empty. If `psm_df` has no more PSMs than 
        `self.psm_num_to_train_ms2`, it is used directly as `tr_df`.

        Args:
            psm_df (pd.DataFrame): PSM dataframe for fine-tuning.
            matched_intensity_df (pd.DataFrame): The matched fragment intensities for `psm_df`.
        """
        if self.psm_num_to_train_ms2 <= 0:
            return

        if self.psm_num_to_train_ms2 < len(psm_df):
            tr_df = psm_sampling_with_important_mods(
                psm_df, self.psm_num_to_train_ms2,
                self.top_n_mods_to_train,
                self.psm_num_per_mod_to_train_ms2
            )
        else:
            tr_df = psm_df
        if len(tr_df) == 0:
            return

        tr_df, frag_df = normalize_training_intensities(
            tr_df, matched_intensity_df
        )
        # One column per fragment type of the model, in the model's order.
        # Fragment types without matched intensities become all-zero columns.
        # reindex is used because assigning a scalar to a column of a still
        # empty DataFrame gives NaN once later columns add the rows.
        tr_inten_df = frag_df.reindex(
            columns=list(self.ms2_model.charged_frag_types),
            fill_value=0,
        )

        if self.use_grid_nce_search:
            # The search result becomes the new default NCE/instrument and
            # overrides these columns in the training data.
            self.nce, self.instrument = self.ms2_model.grid_nce_search(
                tr_df, tr_inten_df,
                nce_first=self.mgr_settings['transfer'][
                    'grid_nce_first'
                ],
                nce_last=self.mgr_settings['transfer'][
                    'grid_nce_last'
                ],
                nce_step=self.mgr_settings['transfer'][
                    'grid_nce_step'
                ],
                search_instruments=self.mgr_settings['transfer'][
                    'grid_instrument'
                ],
            )
            tr_df['nce'] = self.nce
            tr_df['instrument'] = self.instrument
        else:
            self.set_default_nce_instrument(tr_df)

        self.ms2_model.train(tr_df, 
            fragment_intensity_df=tr_inten_df,
            batch_size=self.batch_size_to_train_ms2,
            epoch=self.epoch_to_train_ms2,
            warmup_epoch=self.warmup_epoch_to_train_ms2,
            lr=self.lr_to_train_ms2,
            verbose=self.train_verbose,
        )

    def predict_ms2(self, precursor_df:pd.DataFrame, 
        *, 
        batch_size:int=model_mgr_settings[
            'predict'
        ]['batch_size_ms2'],
        reference_frag_df:pd.DataFrame = None,
    )->pd.DataFrame:
        """Predict MS2 fragment intensities for `precursor_df`.

        Missing 'nce' and 'instrument' columns are first added to
        `precursor_df` in place, using `self.nce` and `self.instrument`.

        Args:
            precursor_df (pd.DataFrame): precursor dataframe for MS2 prediction.
            batch_size (int, optional): Batch size for prediction. 
              Defaults to mgr_settings[ 'predict' ]['batch_size_ms2'].
            reference_frag_df (pd.DataFrame, optional): 
              If precursor_df has 'frag_start_idx' pointing to reference_frag_df. 
              Defaults to None.

        Returns:
            pd.DataFrame: predicted fragment intensity dataframe. 
              If there are no such two columns in precursor_df, 
              it will insert 'frag_start_idx' and `frag_end_idx` in 
              precursor_df pointing to this predicted fragment dataframe.
        """
        self.set_default_nce_instrument(precursor_df)
        if self.verbose:
            logging.info('Predicting MS2 ...')
        predicted_intensity_df = self.ms2_model.predict(
            precursor_df,
            batch_size=batch_size,
            reference_frag_df=reference_frag_df,
            verbose=self.verbose,
        )
        return predicted_intensity_df

    def predict_rt(self, precursor_df:pd.DataFrame,
        *, 
        batch_size:int=model_mgr_settings[
            'predict'
        ]['batch_size_rt_ccs']
    )->pd.DataFrame:
        """ Predict RT for `precursor_df` with `self.rt_model`.

        Args:
            precursor_df (pd.DataFrame): precursor_df for RT prediction
            batch_size (int, optional): Batch size for prediction. 
              Defaults to mgr_settings[ 'predict' ]['batch_size_rt_ccs']. 
              mgr_settings=peptdeep.settings.global_settings['model_mgr'].

        Returns:
            pd.DataFrame: the dataframe returned by `self.rt_model.predict`, 
              with 'rt_pred' and an added 'rt_norm_pred' column that holds 
              the same values.
        """
        if self.verbose:
            logging.info("Predicting RT ...")
        rt_df = self.rt_model.predict(
            precursor_df,
            batch_size=batch_size,
            verbose=self.verbose,
        )
        rt_df['rt_norm_pred'] = rt_df['rt_pred']
        return rt_df

    def predict_mobility(self, precursor_df:pd.DataFrame,
        *, 
        batch_size:int=model_mgr_settings[
            'predict'
        ]['batch_size_rt_ccs']
    )->pd.DataFrame:
        """ Predict CCS with `self.ccs_model` and convert it to mobility.

        Args:
            precursor_df (pd.DataFrame): precursor_df for CCS/mobility prediction
            batch_size (int, optional): Batch size for prediction. 
              Defaults to mgr_settings[ 'predict' ]['batch_size_rt_ccs']. 
              mgr_settings=peptdeep.settings.global_settings['model_mgr'].

        Returns:
            pd.DataFrame: df with 'ccs_pred' and 'mobility_pred' columns, 
              as returned by `self.ccs_model.ccs_to_mobility_pred`.
        """
        if self.verbose:
            logging.info("Predicting mobility ...")
        ccs_df = self.ccs_model.predict(
            precursor_df,
            batch_size=batch_size,
            verbose=self.verbose,
        )
        return self.ccs_model.ccs_to_mobility_pred(ccs_df)

    def _predict_all_for_mp(self, arg_dict):
        """Internal worker entry point for multiprocessing.

        Calls `predict_all` with the keyword arguments in `arg_dict` and
        with multiprocessing switched off.
        """
        return self.predict_all(**arg_dict, multiprocessing=False)

    def predict_all(self, precursor_df:pd.DataFrame,
        *, 
        predict_items:list = [
            'rt' ,'mobility' ,'ms2'
        ], 
        frag_types:list =  None,
        multiprocessing:bool = model_mgr_settings['predict']['multiprocessing'],
        min_required_precursor_num_for_mp:int = 3000,
        process_num:int = global_settings['thread_num'],
        mp_batch_size:int = 100000,
    )->Dict[str, pd.DataFrame]:
        """ predict all items defined by `predict_items`, 
        which may include rt, mobility, fragment_mz 
        and fragment_intensity.

        Multiprocessing is only used when `multiprocessing` is True, the 
        MS2 model runs on 'cpu', and `precursor_df` has at least 
        `min_required_precursor_num_for_mp` rows. Otherwise everything is 
        predicted in this process and `precursor_df` is modified in place.

        Args:
            precursor_df (pd.DataFrame): precursor dataframe contains 
              `sequence`, `mods`, `mod_sites`, `charge` ... columns. 
            predict_items (list, optional): items ('rt', 'mobility', 
              'ms2') to predict.
              Defaults to ['rt' ,'mobility' ,'ms2'].
            frag_types (list, optional): fragment types to predict. If it is None,
              it then depends on `self.ms2_model.charged_frag_types` and 
              `self.ms2_model.model._mask_modloss`.
              Defaults to None.
            multiprocessing (bool, optional): if use multiprocessing.
              Defaults to global_settings['model_mgr']['predict']['multiprocessing'].
            process_num (int, optional): Defaults to global_settings['thread_num']
            min_required_precursor_num_for_mp (int, optional): It will not use 
              multiprocessing when the number of precursors in precursor_df 
              is lower than this value. Defaults to 3000.
            mp_batch_size (int, optional): Splitting data into batches 
                for multiprocessing. Defaults to 100000.
              
        Returns:
            Dict[str, pd.DataFrame]: {'precursor_df': precursor_df}
              if 'ms2' in predict_items, it also contains:
              {
                  'fragment_mz_df': fragment_mz_df,
                  'fragment_intensity_df': fragment_intensity_df
              }
        """
        def refine_df(df):
            # `drop_frag_idx=False` is passed only when MS2 is not predicted.
            if 'ms2' in predict_items:
                refine_precursor_df(df)
            else:
                refine_precursor_df(df, drop_frag_idx=False)

        if frag_types is None:
            if self.ms2_model.model._mask_modloss:
                frag_types = [
                    frag for frag in self.ms2_model.charged_frag_types
                    if 'modloss' not in frag
                ]
            else:
                frag_types = self.ms2_model.charged_frag_types

        if 'precursor_mz' not in precursor_df.columns:
            update_precursor_mz(precursor_df)

        if (
            self.ms2_model.device_type!='cpu' or not multiprocessing
            or len(precursor_df) < min_required_precursor_num_for_mp
        ):
            # ---- single-process prediction ----
            refine_df(precursor_df)
            if 'rt' in predict_items:
                self.predict_rt(precursor_df)
            if 'mobility' in predict_items:
                self.predict_mobility(precursor_df)
            if 'ms2' not in predict_items:
                return {'precursor_df': precursor_df}

            fragment_mz_df = create_fragment_mz_dataframe(
                precursor_df, frag_types
            )

            # NOTE(review): 'frag_start_idx' is dropped before predicting,
            # presumably so that the MS2 model assigns the fragment
            # indices itself - confirm against pDeepModel.predict.
            precursor_df.drop(
                columns=['frag_start_idx'], inplace=True
            )
            
            fragment_intensity_df = self.predict_ms2(
                precursor_df
            )

            # Keep only the requested fragment types.
            fragment_intensity_df.drop(
                columns=[
                    col for col in fragment_intensity_df.columns
                    if col not in frag_types
                ], inplace=True
            )

            clear_error_modloss_intensities(
                fragment_mz_df, fragment_intensity_df
            )

            return {
                'precursor_df': precursor_df, 
                'fragment_mz_df': fragment_mz_df,
                'fragment_intensity_df': fragment_intensity_df, 
            }

        # ---- multiprocessing prediction ----
        logging.info("Using multiprocessing ...")
        self.ms2_model.model.share_memory()
        self.rt_model.model.share_memory()
        self.ccs_model.model.share_memory()

        # Batches are built per peptide length ('nAA'), each with at most
        # `mp_batch_size` precursors.
        df_groupby = precursor_df.groupby('nAA')

        def get_batch_num_mp(df_groupby):
            return sum(
                len(range(0, group_len, mp_batch_size))
                for group_len in df_groupby.size().values
            )

        def mp_param_generator(df_groupby):
            for nAA, df in df_groupby:
                for i in range(0, len(df), mp_batch_size):
                    yield {
                        'precursor_df': df.iloc[i:i+mp_batch_size,:],
                        'predict_items': predict_items,
                        'frag_types': frag_types,
                    }

        predict_fragments = 'ms2' in predict_items
        precursor_df_list = []
        fragment_mz_df_list = [] if predict_fragments else None
        fragment_intensity_df_list = [] if predict_fragments else None

        if self.verbose:
            logging.info(
                f'Predicting {",".join(predict_items)} ...'
            )

        # Silence the per-batch logging while the pool is running, and
        # always restore the flag, even if a worker raises.
        verbose_bak = self.verbose
        self.verbose = False
        try:
            with mp.Pool(process_num) as p:
                for ret_dict in process_bar(
                    p.imap_unordered(
                        self._predict_all_for_mp, 
                        mp_param_generator(df_groupby)
                    ), 
                    get_batch_num_mp(df_groupby)
                ):
                    precursor_df_list.append(ret_dict['precursor_df'])
                    if predict_fragments:
                        fragment_mz_df_list.append(
                            ret_dict['fragment_mz_df']
                        )
                        fragment_intensity_df_list.append(
                            ret_dict['fragment_intensity_df']
                        )
        finally:
            self.verbose = verbose_bak

        if predict_fragments:
            (
                precursor_df, fragment_mz_df, fragment_intensity_df
            ) = concat_precursor_fragment_dataframes(
                precursor_df_list,
                fragment_mz_df_list,
                fragment_intensity_df_list,
            )
            
            return {
                'precursor_df': precursor_df, 
                'fragment_mz_df': fragment_mz_df,
                'fragment_intensity_df': fragment_intensity_df, 
            }

        precursor_df = pd.concat(precursor_df_list)
        precursor_df.reset_index(drop=True, inplace=True)
        
        return {'precursor_df': precursor_df} 


In [None]:
#| hide
# The local archive must exist and every model file that
# `ModelManager.load_installed_models` may load must be readable from it.
assert os.path.isfile(model_zip)
with ZipFile(model_zip) as _zip:
    for _path_in_zip in (
        'generic/ms2.pth',
        'generic/rt.pth',
        'generic/ccs.pth',
        'digly/rt_digly.pth',
        'phospho/rt_phos.pth',
    ):
        with _zip.open(_path_in_zip):
            pass

In [None]:
#| hide
from io import StringIO

In [None]:
#| hide

# Reference intensity table for the correlation check further down in this
# cell. The CSV text has an unnamed first column (0-15) that becomes the index
# and eight columns named b_z1 ... y_modloss_z2.
# NOTE(review): presumably these are intensities matched from a measured
# spectrum of 'ANEKTESSSAQQVAVSR' -- confirm the provenance.
matched_df = pd.read_csv(
    StringIO(',b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2\r\n'
        '0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n1,0.13171915994341352,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '2,0.09560456716002332,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '3,0.032392355556351476,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '4,0.06267661211925589,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '5,0.10733421416437268,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '6,0.07955175724673087,0.0,0.0,0.0,0.0,0.0,0.0,0.0\r\n'
        '7,0.08283861204882843,0.0,0.03294760940125559,0.0,0.0,0.0,0.0,0.0\r\n'
        '8,0.0914959582993716,0.0,0.09471333271745186,0.0,0.0,0.0,0.0,0.0\r\n'
        '9,0.10283525167783934,0.0,0.29624251030302834,0.0,0.0,0.0,0.0,0.0\r\n'
        '10,0.02220051360812495,0.0272619351931404,0.8077539764174795,0.0,0.0,0.0,0.0,0.0\r\n'
        '11,0.0,0.02411148245999131,0.851474013001872,0.0,0.0,0.0,0.0,0.0\r\n'
        '12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0\r\n13,0.0,0.0,0.22244818653184315,0.0,0.0,0.0,0.0,0.0\r\n'
        '14,0.0,0.0,0.21824010319946407,0.0,0.0,0.0,0.0,0.0\r\n'
        '15,0.0,0.0,0.16690493688692923,0.0,0.0,0.0,0.0,0.0\r\n'),
    index_col=0
)

model_mgr = ModelManager(mask_modloss=True)
model_mgr.verbose = False


def pred_one(seq, mods, mod_sites, charge):
    """Predict the fragment intensities of a single precursor.

    A one-row precursor table (NCE 35, instrument "Lumos") is built from
    the arguments and passed to the module-level `model_mgr`, with
    multiprocessing switched off.

    Parameters
    ----------
    seq : str
        Value stored in the "sequence" column.
    mods : str
        Value stored in the "mods" column.
    mod_sites : str
        Value stored in the "mod_sites" column.
    charge : int
        Value stored in the "charge" column.

    Returns
    -------
    The 'fragment_intensity_df' entry of the dict returned by
    `model_mgr.predict_all`.
    """
    one_precursor_df = pd.DataFrame({
        "sequence": [seq],
        "mods": [mods],
        "mod_sites": [mod_sites],
        "charge": [charge],
        "nce": [35],
        "instrument": ["Lumos"],
    })
    return model_mgr.predict_all(
        one_precursor_df,
        predict_items=['mobility','rt','ms2'],
        multiprocessing=False,
    )['fragment_intensity_df']


pred_df = pred_one('ANEKTESSSAQQVAVSR', '', '', 3)

def get_pcc(matched_df, pred_df):
    """Pearson correlation between a reference and a predicted table.

    The reference is first restricted/reordered to the columns of
    `pred_df`. Both tables are then centred on their own overall mean and
    flattened; the cosine similarity of the two centred vectors is the
    Pearson correlation coefficient over all entries.

    Parameters
    ----------
    matched_df : pd.DataFrame
        Reference table; must contain every column of `pred_df`.
    pred_df : pd.DataFrame
        Predicted table.

    Returns
    -------
    torch.Tensor
        Zero-dimensional tensor holding the correlation.
    """
    reference_df = matched_df[pred_df.columns.values]

    def _centered_flat(frame):
        # Subtract the global mean of the table, then flatten to 1-D.
        values = frame.values
        return torch.tensor((values - values.mean()).reshape(-1))

    return torch.nn.functional.cosine_similarity(
        _centered_flat(pred_df),
        _centered_flat(reference_df),
        dim=0,
    )
# The predicted intensities must correlate with the reference table
# (Pearson correlation above 0.95).
assert get_pcc(matched_df, pred_df) > 0.95

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

In [None]:
# Rebind `model_mgr` to a manager that keeps modloss fragments unmasked and
# load the installed 'phos' models into it.
model_mgr = ModelManager(mask_modloss=False)
model_mgr.load_installed_models('phos')
# Predict RT for the iRT peptide table; the return value is not used here.
# NOTE(review): the table displayed below already carries `rt_pred`, so
# `predict_rt` presumably writes into IRT_PEPTIDE_DF in place -- confirm.
model_mgr.predict_rt(IRT_PEPTIDE_DF)
# Last expression of the cell, so its result is what gets displayed: the
# table with `rt_pred`, `rt_norm_pred` and `irt_pred` next to the `irt` column.
model_mgr.rt_model.add_irt_column_to_precursor_df(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,rt_pred,rt_norm_pred,irt_pred
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.184235,0.184235,-26.123537
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.250092,0.250092,4.2381
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.266133,0.266133,11.63312
3,YILAGVENSK,RT-pep d,19.79,,,10,0.290495,0.290495,22.864811
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.303847,0.303847,29.020259
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.316514,0.316514,34.860122
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.324423,0.324423,38.506308
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.345197,0.345197,48.08389
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.394248,0.394248,70.697474
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.434775,0.434775,89.38115


In [None]:
#| hide
model_mgr.verbose = False


def pred_one(seq, mods, mod_sites, charge):
    """Predict the fragment intensities of a single precursor.

    A one-row precursor table (NCE 30, instrument "Lumos") is built from
    the arguments and passed to the module-level `model_mgr`, with
    multiprocessing switched off.

    Parameters
    ----------
    seq : str
        Value stored in the "sequence" column.
    mods : str
        Value stored in the "mods" column.
    mod_sites : str
        Value stored in the "mod_sites" column.
    charge : int
        Value stored in the "charge" column.

    Returns
    -------
    The 'fragment_intensity_df' entry of the dict returned by
    `model_mgr.predict_all`.
    """
    one_precursor_df = pd.DataFrame({
        "sequence": [seq],
        "mods": [mods],
        "mod_sites": [mod_sites],
        "charge": [charge],
        "nce": [30],
        "instrument": ["Lumos"],
    })
    return model_mgr.predict_all(
        one_precursor_df,
        predict_items=['mobility','rt','ms2'],
        multiprocessing=False,
    )['fragment_intensity_df']

# Phospho on S or T: at least one singly charged y modloss fragment is
# expected to be predicted with an intensity above 0.5.
for _seq, _mod, _site in (
    ('ANEKTESSSAQQVAVSR', 'Phospho@S', '9'),
    ('ANEKTESSTAQQVAVSR', 'Phospho@T', '9'),
    ('ANEKTESSSAQQVAVSR', 'Phospho@S', '16'),
):
    pred_df = pred_one(_seq, _mod, _site, 2)
    assert (pred_df.y_modloss_z1.values > 0.5).any()

# Phospho on Y: no singly charged y modloss intensity may be predicted at all.
pred_df = pred_one('ANEKTESSYAQQVAVSR', 'Phospho@Y', '9', 2)
assert (pred_df.y_modloss_z1.values <= 0).all()

In [None]:
#| hide
# Run the RT and CCS training entry points of `model_mgr` on the iRT table.
# The `irt` values are copied into the `rt_norm` and `ccs` columns.
# NOTE(review): presumably these are placeholder training targets for a quick
# smoke test rather than meaningful values -- confirm.
# These assignments add columns to the imported IRT_PEPTIDE_DF object itself.
IRT_PEPTIDE_DF['rt_norm'] = IRT_PEPTIDE_DF['irt']
IRT_PEPTIDE_DF['ccs'] = IRT_PEPTIDE_DF['irt']
# A single epoch, set before both training calls.
model_mgr.epoch_to_train_rt_ccs = 1
model_mgr.train_rt_model(IRT_PEPTIDE_DF)
model_mgr.train_ccs_model(IRT_PEPTIDE_DF)