In [1]:
#| default_exp spectral_library.flat_library

In [2]:
#| export
import pandas as pd

from alphabase.spectral_library.library_base import (
    SpecLibBase
)
from alphabase.peptide.fragment import (
    flatten_fragments
)

from alphabase.io.hdf import HDF_File

import alphabase.peptide.precursor as precursor

In [3]:
#| export
class FlatSpecLib:
    """ 
    Flatten the spectral library (SpecLibBase) with `parse_base_library()`

    Parameters
    ----------
    min_fragment_intensity : float, optional
        minimal intensity to keep, by default 0.001

    keep_top_k_fragments : int, optional
        top k highest peaks to keep, by default 1000

    custom_fragment_df_columns : list, optional
        'mz' and 'intensity' columns are required. Others could be customized. 
        Defaults to None, which means 
        ['type','number','position','charge','loss_type']

    **kwargs
        Accepted for call compatibility; not used by this class.
    """
    def __init__(self,
        min_fragment_intensity:float = 0.001,
        keep_top_k_fragments:int = 1000,
        custom_fragment_df_columns:list = None,
        **kwargs
    ):
        self.min_fragment_intensity = min_fragment_intensity
        self.keep_top_k_fragments = keep_top_k_fragments

        # Numeric precursor columns that `save_hdf()` stores in
        # `flat_library/precursor_df` (together with the hash columns).
        self.key_numeric_columns = [
            'ccs_pred', 'charge', 
            'decoy',
            'frag_end_idx', 'frag_start_idx',
            'isotope_m1_intensity', 'isotope_m1_mz',
            'isotope_apex_mz', 'isotope_apex_intensity',
            'isotope_apex_index',
            'isotope_right_most_mz', 'isotope_right_most_intensity',
            'isotope_right_most_index',
            'miss_cleavage', 'mobility_pred',
            'nAA', 
            'precursor_mz', 
            'rt_pred', 'rt_norm_pred'
        ]

        # Build the default list per instance: a list literal in the
        # signature would be shared (and mutable) across all instances.
        if custom_fragment_df_columns is None:
            custom_fragment_df_columns = [
                'type','number','position','charge','loss_type'
            ]
        self.custom_fragment_df_columns = custom_fragment_df_columns

        # Start with empty frames so the property getters work
        # before `parse_base_library()` or `load_hdf()` is called.
        self._precursor_df = pd.DataFrame()
        self._fragment_df = pd.DataFrame()

    @property
    def precursor_df(self)->pd.DataFrame:
        """: pd.DataFrame : precursor dataframe with columns
        'sequence', 'mods', 'mod_sites', 'charge', ...
        Identical to `self.peptide_df`.
        """
        return self._precursor_df

    @precursor_df.setter
    def precursor_df(self, df:pd.DataFrame):
        self._precursor_df = df
        # Refine the assigned dataframe but keep the
        # frag_start_idx/frag_end_idx columns (drop_frag_idx=False).
        precursor.refine_precursor_df(
            self._precursor_df,
            drop_frag_idx=False,
            ensure_data_validity=True,
        )

    @property
    def peptide_df(self)->pd.DataFrame:
        """: pd.DataFrame : peptide dataframe with columns
        'sequence', 'mods', 'mod_sites', 'charge', ...
        Identical to `self.precursor_df`.
        """
        return self._precursor_df

    @peptide_df.setter
    def peptide_df(self, df:pd.DataFrame):
        # Delegate to the `precursor_df` setter so the same refinement runs.
        self.precursor_df = df

    @property
    def fragment_df(self)->pd.DataFrame:
        """: pd.DataFrame : The flattened fragment dataframe as returned by 
        `flatten_fragments()`: 'mz' and 'intensity' columns plus the 
        columns listed in `self.custom_fragment_df_columns`.
        """
        return self._fragment_df

    def hash_precursor_df(self):
        """Insert hash codes for peptides and precursors into 
        `self._precursor_df` by calling `precursor.hash_precursor_df()`.

        `save_hdf()` calls this when the 'mod_seq_charge_hash' 
        column is missing.
        """
        precursor.hash_precursor_df(self._precursor_df)

    def parse_base_library(self, library:"SpecLibBase"):
        """ Flatten a SpecLibBase object

        The results are stored in `self.precursor_df` 
        and `self.fragment_df`.

        Parameters
        ----------
        library : SpecLibBase
            the library with fragment_mz_df and fragment_intensity_df
        """
        self._precursor_df, self._fragment_df = flatten_fragments(
            library.precursor_df, 
            library.fragment_mz_df, 
            library.fragment_intensity_df,
            min_fragment_intensity=self.min_fragment_intensity,
            keep_top_k_fragments=self.keep_top_k_fragments,
            custom_columns=self.custom_fragment_df_columns,
        )

    def save_hdf(self, hdf_file:str):
        """Save library dataframes into hdf_file.
        For `self.precursor_df`, this method will save it into two hdf groups:
            hdf_file: `flat_library/precursor_df` and `flat_library/mod_seq_df`.

        `flat_library/precursor_df` contains all essential numeric columns that 
        can be loaded faster from hdf file into memory:
            'precursor_mz', 'charge', 'mod_seq_hash', 'mod_seq_charge_hash',
            'frag_start_idx', 'frag_end_idx', 'decoy', 'rt_pred', 'ccs_pred',
            'mobility_pred', 'miss_cleavage', 'nAA', 
            ['isotope_m1_mz', 'isotope_m1_intensity'], ...
        (see `self.key_numeric_columns` for the full list).

        `flat_library/mod_seq_df` contains all string columns and the other 
        not essential columns:
            'sequence','mods','mod_sites', ['proteins', 'genes']...
        as well as 'mod_seq_hash', 'mod_seq_charge_hash' columns to map 
        back to `precursor_df`

        `self.fragment_df` is saved into `flat_library/fragment_df`.

        Parameters
        ----------
        hdf_file : str
            the hdf file path to save
            
        """
        _hdf = HDF_File(
            hdf_file, 
            read_only=False, 
            truncate=True,
            delete_existing=True
        )
        # The hash columns link `mod_seq_df` rows back to `precursor_df`.
        if 'mod_seq_charge_hash' not in self._precursor_df.columns:
            self.hash_precursor_df()

        key_columns = self.key_numeric_columns+[
            'mod_seq_hash', 'mod_seq_charge_hash'
        ]

        _hdf.flat_library = {
            # Everything that is not a key numeric column; the hash
            # columns are not in `key_numeric_columns`, so they stay here too.
            'mod_seq_df': self._precursor_df[
                [
                    col for col in self._precursor_df.columns 
                    if col not in self.key_numeric_columns
                ]
            ],
            # Key numeric columns plus the two hash columns.
            'precursor_df': self._precursor_df[
                [
                    col for col in self._precursor_df.columns 
                    if col in key_columns
                ]
            ],
            'fragment_df': self._fragment_df,
        }
        
    def load_hdf(self, hdf_file:str, load_mod_seq:bool=False):
        """Load the hdf library from hdf_file

        `self.precursor_df` is read from `flat_library/precursor_df` and 
        `self.fragment_df` from `flat_library/fragment_df`.

        Parameters
        ----------
        hdf_file : str
            hdf library path to load

        load_mod_seq : bool, optional
            Whether to also load the columns of `flat_library/mod_seq_df` 
            into `self.precursor_df`. 
            Defaults to False.
            
        """
        _hdf = HDF_File(
            hdf_file,
        )
        self._precursor_df:pd.DataFrame = _hdf.flat_library.precursor_df.values
        if load_mod_seq:
            key_columns = self.key_numeric_columns+[
                'mod_seq_hash', 'mod_seq_charge_hash'
            ]
            mod_seq_df = _hdf.flat_library.mod_seq_df.values
            # Skip the columns that `precursor_df` already provides.
            cols = [
                col for col in mod_seq_df.columns 
                if col not in key_columns
            ]
            self._precursor_df[cols] = mod_seq_df[cols]
            
        self._fragment_df = _hdf.flat_library.fragment_df.values
        

In [4]:
#|hide 
from nbdev.showdoc import show_doc

In [5]:
# Render the documentation of `FlatSpecLib.parse_base_library`.
show_doc(FlatSpecLib.parse_base_library)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/spectral_library/flat_library.py#L101){target="_blank" style="float:right; font-size:smaller"}

### FlatSpecLib.parse_base_library

>      FlatSpecLib.parse_base_library
>                                      (library:alphabase.spectral_library.libra
>                                      ry_base.SpecLibBase)

Flatten a SpecLibBase object

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| library | SpecLibBase | the library with fragment_mz_df and fragment_intensity_df |

In [6]:
# Render the documentation of `FlatSpecLib.save_hdf`.
show_doc(FlatSpecLib.save_hdf)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/spectral_library/flat_library.py#L118){target="_blank" style="float:right; font-size:smaller"}

### FlatSpecLib.save_hdf

>      FlatSpecLib.save_hdf (hdf_file:str)

Save library dataframes into hdf_file.
For `self.precursor_df`, this method will save it into two hdf groups:
    hdf_file: `flat_library/precursor_df` and `flat_library/mod_seq_df`.

`flat_library/precursor_df` contains all essential numeric columns that 
can be loaded faster from hdf file into memory:
    'precursor_mz', 'charge', 'mod_seq_hash', 'mod_seq_charge_hash',
    'frag_start_idx', 'frag_end_idx', 'decoy', 'rt_pred', 'ccs_pred',
    'mobility_pred', 'miss_cleavage', 'nAA', 
    ['isotope_m1_mz', 'isotope_m1_intensity'], ...

`flat_library/mod_seq_df` contains all string columns and the other 
not essential columns:
    'sequence','mods','mod_sites', ['proteins', 'genes']...
as well as 'mod_seq_hash', 'mod_seq_charge_hash' columns to map 
back to `precursor_df`

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| hdf_file | str | the hdf file path to save |

In [7]:
# Render the documentation of `FlatSpecLib.load_hdf`.
show_doc(FlatSpecLib.load_hdf)

---

[source](https://github.com/MannLabs/alphabase/blob/main/alphabase/spectral_library/flat_library.py#L171){target="_blank" style="float:right; font-size:smaller"}

### FlatSpecLib.load_hdf

>      FlatSpecLib.load_hdf (hdf_file:str, load_mod_seq:bool=False)

Load the hdf library from hdf_file

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| hdf_file | str |  | hdf library path to load |
| load_mod_seq | bool | False | Whether to also load `mod_seq_df`. <br>Defaults to False. |

In [8]:
#| hide
from io import StringIO
from alphabase.spectral_library.library_reader import SWATHLibraryReader

In [9]:
#| hide
tsv_str = """PrecursorCharge	ModifiedPeptide	StrippedPeptide	iRT	LabeledPeptide	PrecursorMz	FragmentLossType	FragmentNumber	FragmentType	FragmentCharge	FragmentMz	RelativeIntensity	IonMobility
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	3	b	1	326.1710473	14.37029	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	3	y	1	361.2081611	37.7585	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	4	b	1	397.2081611	9.488808	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	4	y	1	432.2452749	100	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	5	b	1	496.276575	5.498003	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	5	y	1	545.3293389	74.56643	0.9
2	_DPLAVDK_	DPLAVDK	-15.0871	_DPLAVDK_	379.2081611	noloss	6	y	2	321.6946896	51.50719	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	3	y	1	411.1639269	6.911595	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	3	y	1	313.1870287	17.38582	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	4	y	1	510.2323409	10.65426	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	4	y	1	412.2554427	37.41231	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	noloss	5	y	1	609.3007548	45.03617	0.9
2	_AVVVS[Phospho (STY)]PK_	AVVVSPK	-22.84974	_AVVVS[Phospho (STY)]PK_	390.2067795	H3PO4	5	y	1	511.3238566	100	0.9
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	noloss	3	y	1	349.1717756	9.20575	0.9
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	noloss	6	y	1	686.2756622	10.37339	0.9
2	_MGS[Phospho (STY)]LDSK_	MGSLDSK	-27.5635	_MGS[Phospho (STY)]LDSK_	409.1617118	H3PO4	6	y	1	588.298764	100	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	3	y	1	347.2288965	88.27327	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	3	b	1	256.1291795	64.97146	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	4	y	1	494.2973105	100	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	4	b	1	403.1975934	35.17805	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	5	y	1	661.2956694	19.89741	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	5	b	1	490.2296218	40.04738	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	5	y	1	563.3187712	77.43164	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	noloss	6	b	1	701.290584	24.43497	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	H3PO4	6	b	1	603.3136858	63.09999	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	3	b	1	238.1186147	62.60851	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	5	b	1	472.219057	22.99903	0.9
1	_SVS[Phospho (STY)]FSLK_	SVSFSLK	35.01411	_SVS[Phospho (STY)]FSLK_	847.3961117	1(+H2+O)1(+H3+O4+P)	6	b	1	585.303121	66.30389	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	3	y	1	329.1931797	100	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	3	b	1	268.165565	5.755442	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	4	b	2	267.0740493	8.743931	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	4	y	1	496.1915387	27.69686	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	4	b	1	435.1639239	6.162673	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	2(+H3+O4+P)	4	b	1	337.1870258	10.84257	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	4	y	1	398.2146405	26.28527	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	5	y	1	497.2830544	28.41294	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	noloss	6	y	1	762.2583115	8.490795	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	H3PO4	6	y	1	664.2814133	32.87384	0.9
2	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	VSVSPGR	-23.93085	_VS[Phospho (STY)]VS[Phospho (STY)]PGR_	431.1670009	2(+H3+O4+P)	6	y	1	566.3045151	35.87218	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	3	y	1	331.1975964	49.20179	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	4	y	1	498.1959553	10.89141	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	4	y	1	400.2190571	27.99594	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	5	y	1	611.2800193	14.11057	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	5	y	1	513.3031211	70.5295	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	noloss	6	y	1	698.3120477	60.23455	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	H3PO4	6	y	1	600.3351495	100	0.9
2	_YSLS[Phospho (STY)]PSK_	YSLSPSK	-6.428198	_YSLS[Phospho (STY)]PSK_	431.1913264	1(+H2+O)1(+H3+O4+P)	6	y	1	582.3245847	5.233977	0.9
"""

# Read the TSV text in `tsv_str` through an in-memory file object.
reader = SWATHLibraryReader()
reader.import_file(StringIO(tsv_str))
# Flatten the imported library, keeping only 'type' as custom fragment column.
flat_lib = FlatSpecLib(custom_fragment_df_columns=['type'])
flat_lib.parse_base_library(reader)
# Last expression of the cell: displays the flattened fragment dataframe.
flat_lib.fragment_df

Unnamed: 0,mz,intensity,type
0,609.300755,0.450362,121
1,511.32386,1.0,121
2,510.232341,0.106543,121
3,412.255446,0.374123,121
4,411.163927,0.069116,121
5,313.187032,0.173858,121
6,321.69469,0.515072,121
7,545.329339,0.745664,121
8,326.171047,0.143703,98
9,432.245275,1.0,121


In [10]:
#| hide
# Display the precursor dataframe produced by `parse_base_library()` above.
flat_lib.precursor_df

Unnamed: 0,sequence,charge,rt,mobility,mods,mod_sites,nAA,frag_start_idx,frag_end_idx,rt_norm,precursor_mz,ccs
0,AVVVSPK,2,-22.84974,0.9,Phospho@S,5,7,0,6,0.075327,390.20678,366.858877
1,DPLAVDK,2,-15.0871,0.9,,,7,6,13,0.199375,379.208161,367.0431
2,MGSLDSK,2,-27.5635,0.9,Phospho@S,3,7,13,16,0.0,409.161712,366.564438
3,SVSFSLK,1,35.01411,0.9,Phospho@S,3,7,16,25,1.0,847.396112,183.178171
4,VSVSPGR,2,-23.93085,0.9,Phospho@S;Phospho@S,2;4,7,25,34,0.05805,431.167001,366.254833
5,YSLSPSK,2,-6.428198,0.9,Phospho@S,4,7,34,41,0.337745,431.191327,366.254509
