In [None]:
#| default_exp spec_lib.library_factory

# Library Factory

Factory classes for predicting libraries from different input sources (FASTA files, or tables of sequences, peptides or precursors).

In [None]:
#| export

import pandas as pd
import numpy as np
from typing import Union

from alphabase.peptide.fragment import get_charged_frag_types

from peptdeep.settings import global_settings
from peptdeep.protein.fasta import PredictFastaSpecLib
from peptdeep.spec_lib.translate import (
    speclib_to_single_df, mod_to_unimod_dict,
    translate_to_tsv, mod_to_modname_dict
)

from peptdeep.pretrained_models import ModelManager
from peptdeep.utils import logging

In [None]:
#| export

class PredictLibraryMakerBase(object):
    """
    Base class to predict libraries

    Sub-classes re-implement `_input()` (and optionally `_check_df()`) to
    load their input into `self.spec_lib`; `make_library()` then runs the
    shared pipeline on it.

    Parameters
    ----------
    settings : dict, optional
        By default `global_settings`

    model_manager : ModelManager, optional
        By default None
    """
    def __init__(self, 
        settings:dict = global_settings,
        model_manager:ModelManager = None,
    ):
        self._settings = settings
        in_settings = settings['library']['input']
        fasta_settings = in_settings['fasta']
        self.spec_lib = PredictFastaSpecLib(
            model_manager=model_manager,
            charged_frag_types = get_charged_frag_types(
                in_settings['frag_types'],
                in_settings['max_frag_charge'],
            ),
            protease = fasta_settings['protease'],
            max_missed_cleavages = fasta_settings['max_miss_cleave'],
            peptide_length_min = in_settings['min_peptide_len'],
            peptide_length_max = in_settings['max_peptide_len'],
            precursor_charge_min = in_settings['min_precursor_charge'],
            precursor_charge_max = in_settings['max_precursor_charge'],
            precursor_mz_min = in_settings['min_precursor_mz'], 
            precursor_mz_max = in_settings['max_precursor_mz'],
            var_mods = in_settings['var_mods'],
            max_var_mod_num = in_settings['max_var_mod_num'],
            fix_mods = in_settings['fix_mods'],
            decoy = in_settings['decoy'],
            I_to_L=False,
        )

    def _check_df(self)->None:
        """Hook called after `_input()`; does nothing in the base class.

        Sub-classes may re-implement it to validate the loaded DataFrame.
        """
        pass

    def _input(self, _input):
        """Virtual method to be re-implemented by sub-classes"""
        raise NotImplementedError("All sub-classes must re-implement '_input()' method")

    def _predict(self):
        """Run `predict_all()` on `self.spec_lib`."""
        self.spec_lib.predict_all()
        
    def _set_df(self):
        """Expose the DataFrames of `self.spec_lib` as attributes of this maker."""
        self.precursor_df = self.spec_lib.precursor_df
        self.fragment_mz_df = self.spec_lib.fragment_mz_df
        self.fragment_intensity_df = self.spec_lib.fragment_intensity_df

    def _ensure_protein_names(self):
        """Call `spec_lib.append_protein_name()` if the 'proteins' column is missing."""
        if 'proteins' not in self.spec_lib._precursor_df.columns:
            self.spec_lib.append_protein_name()

    def _output_filter_kwargs(self)->dict:
        """Fragment filter keyword arguments shared by both translate methods.

        The values are read from `settings['library']['output_tsv']`.
        """
        out_settings = self._settings['library']['output_tsv']
        return dict(
            keep_k_highest_fragments=out_settings['keep_higest_k_peaks'],
            min_frag_intensity=out_settings['min_relative_intensity'],
            min_frag_mz=out_settings['min_fragment_mz'],
            max_frag_mz=out_settings['max_fragment_mz'],
        )

    def make_library(self, _input):
        """Predict a library for the `_input`, 
        this function runs the following methods.

        - self._input(_input)
        - self._check_df()
        - self._predict()
        - self._set_df()

        Parameters
        ----------
        _input
            _input file or source

        Raises
        ------
        NotImplementedError
            If the sub-class did not re-implement `_input()`.
        ValueError
            Propagated unchanged from any of the steps above, for example
            when a sub-class `_check_df()` rejects the input.
        """
        logging.info("Generating the library...")
        # Exceptions propagate as-is: catching ValueError only to re-raise
        # it added nothing but an extra traceback frame.
        self._input(_input)
        self._check_df()
        self._predict()
        self._set_df()
    
    def translate_to_tsv(self, 
        tsv_path:str, 
        translate_mod_dict:dict=mod_to_modname_dict
    ):
        """Translate the predicted DataFrames into a TSV file

        Parameters
        ----------
        tsv_path : str
            Path of the TSV file to write.

        translate_mod_dict : dict, optional
            Passed through to `translate_to_tsv`,
            by default `mod_to_modname_dict`
        """
        logging.info(f"Translating to {tsv_path} for DiaNN/Spectronaut...")
        self._ensure_protein_names()

        # Inside a method the bare name resolves to the module-level
        # `translate_to_tsv` function, not to this method.
        translate_to_tsv(
            self.spec_lib, 
            tsv_path,
            batch_size=self._settings['library']['output_tsv'][
                'translate_batch_size'
            ],
            translate_mod_dict=translate_mod_dict,
            **self._output_filter_kwargs(),
        )
    
    def translate_library(self, 
        translate_mod_dict:dict=mod_to_modname_dict
    )->pd.DataFrame:
        """Translate predicted DataFrames into 
        a single DataFrame in SWATH library format

        Parameters
        ----------
        translate_mod_dict : dict, optional
            Passed through to `speclib_to_single_df`,
            by default `mod_to_modname_dict`

        Returns
        -------
        pd.DataFrame
            The result of `speclib_to_single_df` for `self.spec_lib`.
        """
        logging.info("Translating library for DiaNN/Spectronaut...")
        self._ensure_protein_names()

        return speclib_to_single_df(
            self.spec_lib, 
            translate_mod_dict=translate_mod_dict,
            **self._output_filter_kwargs(),
        )

In [None]:
#|hide
from nbdev.showdoc import show_doc

In [None]:
# Render the documentation of the `_input` hook that sub-classes re-implement.
show_doc(PredictLibraryMakerBase._input)

---

[source](https://github.com/MannLabs/alphapeptdeep/blob/main/peptdeep/spec_lib/library_factory.py#L68){target="_blank" style="float:right; font-size:smaller"}

### PredictLibraryMakerBase._input

>      PredictLibraryMakerBase._input (_input)

Virtual method to be re-implemented by sub-classes

In [None]:
# Render the documentation of the `make_library` entry point.
show_doc(PredictLibraryMakerBase.make_library)

  else: warn(msg)


---

[source](https://github.com/MannLabs/alphapeptdeep/blob/main/peptdeep/spec_lib/library_factory.py#L80){target="_blank" style="float:right; font-size:smaller"}

### PredictLibraryMakerBase.make_library

>      PredictLibraryMakerBase.make_library (_input)

Predict a library for the `_input`, 
this function runs the following methods.

- self._input(_input)
- self._check_df()
- self._predict()
- self._set_df()

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| _input |  | _input file or source |

In [None]:
#| export

class PrecursorLibraryMaker(PredictLibraryMakerBase):
    """Library maker for an input DataFrame of charged modified sequences"""
    def _input(self, precursor_df:pd.DataFrame):
        """Use `precursor_df` as the precursor table, then call
        `append_decoy_sequence()` on the spectral library."""
        spec_lib = self.spec_lib
        spec_lib._precursor_df = precursor_df
        spec_lib.append_decoy_sequence()

    def _check_df(self):
        """Validate and normalize the precursor table.

        - 'charge' is required and is cast to `np.int8`.
        - 'mods' and 'mod_sites' are cast to `str` when both exist;
          otherwise both are set to the empty string.
        - `protein_df` is reset to an empty DataFrame.

        Raises
        ------
        ValueError
            If the precursor table has no 'charge' column.
        """
        spec_lib = self.spec_lib
        if 'charge' not in spec_lib.precursor_df.columns:
            raise ValueError('self.spec_lib.precursor_df must contain the "charge" column.')
        spec_lib.precursor_df['charge'] = (
            spec_lib.precursor_df['charge'].astype(np.int8)
        )

        mod_columns = ('mods', 'mod_sites')
        has_both_mod_columns = all(
            column in spec_lib.precursor_df.columns
            for column in mod_columns
        )
        if has_both_mod_columns:
            for column in mod_columns:
                spec_lib.precursor_df[column] = (
                    spec_lib.precursor_df[column].astype(str)
                )
        else:
            # If either column is missing, both are (re)set to empty strings.
            for column in mod_columns:
                spec_lib.precursor_df[column] = ''

        spec_lib.protein_df = pd.DataFrame()

class PeptideLibraryMaker(PrecursorLibraryMaker):
    """Library maker for an input DataFrame of modified sequences"""
    def _input(self, peptide_df:pd.DataFrame):
        """Use `peptide_df` as the precursor table, then call
        `append_decoy_sequence()` and `add_charge()` in that order."""
        spec_lib = self.spec_lib
        spec_lib._precursor_df = peptide_df
        spec_lib.append_decoy_sequence()
        spec_lib.add_charge()

class SequenceLibraryMaker(PeptideLibraryMaker):
    """Library maker for an input DataFrame of AA sequences"""
    def _input(self, sequence_df:pd.DataFrame):
        """Use `sequence_df` as the precursor table, then call
        `append_decoy_sequence()`, `add_modifications()` and
        `add_charge()` in that order."""
        spec_lib = self.spec_lib
        spec_lib._precursor_df = sequence_df
        spec_lib.append_decoy_sequence()
        spec_lib.add_modifications()
        spec_lib.add_charge()

class FastaLibraryMaker(PredictLibraryMakerBase):
    """Library maker for a fasta file or a list of fasta files"""
    def _input(self, fasta:Union[str,list]):
        """Load peptides via `get_peptides_from_fasta(fasta)`, then call
        `append_decoy_sequence()`, `add_modifications()` and
        `add_charge()` in that order."""
        spec_lib = self.spec_lib
        spec_lib.get_peptides_from_fasta(fasta)
        spec_lib.append_decoy_sequence()
        spec_lib.add_modifications()
        spec_lib.add_charge()

In [None]:
#| export
class LibraryMakerProvider:
    """
    Factory class for library makers

    Maker classes are stored in `library_maker_dict` under lower-cased
    names, so registration and lookup are case-insensitive.
    """
    def __init__(self):
        self.library_maker_dict = {}

    def register_maker(self, maker_name:str, maker_class):
        """Register `maker_class` under the lower-cased `maker_name`."""
        registry_key = maker_name.lower()
        self.library_maker_dict[registry_key] = maker_class

    def get_maker(self, maker_name:str, *, 
        settings:dict = global_settings,
        model_manager = None,
    )->PredictLibraryMakerBase:
        """Instantiate the maker registered under `maker_name`.

        Parameters
        ----------
        maker_name : str
            Registered name, matched case-insensitively.

        settings : dict, optional
            Passed to the maker class, by default `global_settings`

        model_manager : optional
            Passed to the maker class, by default None

        Raises
        ------
        ValueError
            If no maker is registered under `maker_name`.
        """
        maker_name = maker_name.lower()
        registry = self.library_maker_dict
        if maker_name not in registry:
            raise ValueError(f'library maker "{maker_name}" is not registered.')
        maker_class = registry[maker_name]
        return maker_class(settings, model_manager)

# Module-level provider holding the built-in makers; every maker class is
# registered under two names, in the order listed below.
library_maker_provider = LibraryMakerProvider()
for _maker_names, _maker_class in (
    (('precursor_table', 'precursor_library'), PrecursorLibraryMaker),
    (('peptide_table', 'peptide_library'), PeptideLibraryMaker),
    (('sequence_table', 'sequence_library'), SequenceLibraryMaker),
    (('fasta', 'fasta_library'), FastaLibraryMaker),
):
    for _maker_name in _maker_names:
        library_maker_provider.register_maker(_maker_name, _maker_class)
# Remove the loop helpers so the module namespace only gains the provider.
del _maker_names, _maker_class, _maker_name

### Registered library makers

In [None]:
# Display the name -> maker class registry of the module-level provider.
library_maker_provider.library_maker_dict

{'precursor_table': __main__.PrecursorLibraryMaker,
 'precursor_library': __main__.PrecursorLibraryMaker,
 'peptide_table': __main__.PeptideLibraryMaker,
 'peptide_library': __main__.PeptideLibraryMaker,
 'sequence_table': __main__.SequenceLibraryMaker,
 'sequence_library': __main__.SequenceLibraryMaker,
 'fasta': __main__.FastaLibraryMaker,
 'fasta_library': __main__.FastaLibraryMaker}

### Testing

In [None]:
from peptdeep.model.rt import irt_pep

In [None]:

# Smoke test: build a library from the iRT peptides with the peptide-table
# maker and then with the sequence-table maker. A copy is passed each time
# so `irt_pep` itself is not handed to the makers.
for maker_name in ('peptide_table', 'sequence_table'):
    lib_maker = library_maker_provider.get_maker(maker_name)
    lib_maker.make_library(irt_pep.copy())

# Show the precursor table of the last maker (sequence_table).
lib_maker.precursor_df

Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
2022-09-08 23:30:55> Generating the library...
2022-09-08 23:30:55> Calculating precursor isotope distributions ...
2022-09-08 23:30:56> Predicting RT/IM/MS2 ...
2022-09-08 23:30:56> Predicting RT ...


100%|██████████| 5/5 [00:00<00:00, 81.85it/s]

2022-09-08 23:30:56> Predicting mobility ...



100%|██████████| 5/5 [00:00<00:00, 90.21it/s]

2022-09-08 23:30:56> Predicting MS2 ...



100%|██████████| 5/5 [00:00<00:00, 30.79it/s]

2022-09-08 23:30:56> End Predicting RT/IM/MS2





Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
2022-09-08 23:30:59> Generating the library...
2022-09-08 23:30:59> Calculating precursor isotope distributions ...
2022-09-08 23:30:59> Predicting RT/IM/MS2 ...
2022-09-08 23:30:59> Predicting RT ...


100%|██████████| 5/5 [00:00<00:00, 96.70it/s]

2022-09-08 23:30:59> Predicting mobility ...



100%|██████████| 5/5 [00:00<00:00, 92.94it/s]

2022-09-08 23:30:59> Predicting MS2 ...



100%|██████████| 5/5 [00:00<00:00, 32.09it/s]

2022-09-08 23:30:59> End Predicting RT/IM/MS2





Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,decoy,is_prot_nterm,is_prot_cterm,charge,...,isotope_apex_mz,isotope_right_most_mz,rt_pred,rt_norm_pred,ccs_pred,mobility_pred,frag_end_idx,nce,instrument,frag_start_idx
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0,False,False,2,...,487.256705,487.758355,0.072804,0.072804,331.279816,0.815533,8,30.0,Lumos,0
1,LGGNEQVTR,RT-pep a,-24.92,,,9,0,False,False,3,...,325.173562,325.507996,0.072804,0.072804,382.416168,0.627622,16,30.0,Lumos,8
2,LGGNEQVTR,RT-pep a,-24.92,,,9,0,False,False,4,...,244.131991,244.382816,0.072804,0.072804,461.825714,0.568470,24,30.0,Lumos,16
3,TVQENGGLR,RT-pep a,-24.92,,,9,1,False,False,2,...,487.256705,487.758355,0.086820,0.086820,331.682037,0.816523,32,30.0,Lumos,24
4,TVQENGGLR,RT-pep a,-24.92,,,9,1,False,False,3,...,325.173562,325.507996,0.086820,0.086820,396.429779,0.650621,40,30.0,Lumos,32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,ADLGTVPESSGAGK,RT-pep b,0.00,,,14,1,False,False,3,...,430.217496,430.886363,0.232995,0.232995,449.663818,0.740515,686,30.0,Lumos,673
62,ADLGTVPESSGAGK,RT-pep b,0.00,,,14,1,False,False,4,...,322.914941,323.416591,0.232995,0.232995,533.650513,0.659125,699,30.0,Lumos,686
63,LFPSGQAGFQLFLK,RT-pep l,100.00,,,14,1,False,False,2,...,776.929751,777.933051,0.860533,0.860533,432.605652,1.070562,712,30.0,Lumos,699
64,LFPSGQAGFQLFLK,RT-pep l,100.00,,,14,1,False,False,3,...,518.288926,518.957793,0.860533,0.860533,501.384583,0.827184,725,30.0,Lumos,712


In [None]:
#| hide
# The precursor-table maker requires a "charge" column. This test expects
# `make_library` to reject `irt_pep` with the corresponding ValueError.
lib_maker = library_maker_provider.get_maker('precursor_table')
try:
    lib_maker.make_library(irt_pep.copy())
except ValueError as e:
    assert 'must contain the "charge" column' in str(e)
else:
    # Without this branch the test passed silently when nothing was raised.
    raise AssertionError(
        'make_library() was expected to raise ValueError for a missing "charge" column'
    )

Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
Device `gpu` is not available, set to `cpu`
2022-09-08 23:31:02> Generating the library...
