In [None]:
#default_exp spectrum_library.library_base

In [None]:
#export

import pandas as pd
import numpy as np
import typing

import alphabase.peptide.fragment as fragment
import alphabase.peptide.precursor as precursor
from alphabase.io.hdf import HDF_File

In [None]:
#export 
class SpecLibBase(object):
    """Base container for a spectral library.

    Holds three dataframes -- precursors, fragment m/z values and fragment
    intensities -- and offers helpers to clip precursors by m/z, append
    decoy sequences and save/load the library to/from an HDF file.
    """
    def __init__(self,
        # ['b_z1','b_z2','y_z1','y_modloss_z1', ...]; 
        # 'b_z1': 'b' is the fragment type and 
        # 'z1' is the charge state z=1.
        charged_frag_types:typing.List[str] = [
            'b_z1','b_z2','y_z1', 'y_z2'
        ], 
        min_precursor_mz = 400, max_precursor_mz = 6000,
        decoy:str = 'pseudo_reverse',
    ):
        """
        Args:
            charged_frag_types (List[str]): Fragment types with charge
              states, e.g. ['b_z1','b_z2','y_z1','y_z2'].
            min_precursor_mz: Lower precursor m/z bound used by
              `clip_by_precursor_mz_`. Defaults to 400.
            max_precursor_mz: Upper precursor m/z bound used by
              `clip_by_precursor_mz_`. Defaults to 6000.
            decoy (str): Key looked up in `decoy_lib_provider.decoy_dict`
              by `append_decoy_sequence`. Defaults to 'pseudo_reverse'.
        """
        # Store a copy: the default argument is a single list object shared
        # by every call, so keeping a reference would let one instance
        # silently change the default of all later instances.
        if charged_frag_types is not None:
            charged_frag_types = list(charged_frag_types)
        self.charged_frag_types = charged_frag_types
        self._precursor_df = pd.DataFrame()
        self._fragment_intensity_df = pd.DataFrame()
        self._fragment_mz_df = pd.DataFrame()
        self.min_precursor_mz = min_precursor_mz
        self.max_precursor_mz = max_precursor_mz

        # Columns that `save_hdf` stores in the separate 'mod_seq_df'
        # group instead of 'precursor_df'.
        self.mod_seq_df_columns = [
            'sequence', 'mods', 'mod_sites',
            'nce', 'instrument',
            'protein_idxes', 
            'proteins', 'genes', 'uniprot_ids',
            'is_prot_nterm', 'is_prot_cterm'
        ]
        self.decoy = decoy
    
    @property
    def precursor_df(self):
        """The precursor dataframe."""
        return self._precursor_df

    @precursor_df.setter
    def precursor_df(self, df):
        """Set the precursor dataframe and refine it via
        `precursor.refine_precursor_df` (keeping fragment indices and
        requesting data validity checks)."""
        self._precursor_df = df
        precursor.refine_precursor_df(
            self._precursor_df,
            drop_frag_idx=False,
            ensure_data_validity=True,
        )

    @property
    def fragment_mz_df(self):
        """The fragment m/z dataframe."""
        return self._fragment_mz_df

    @property
    def fragment_intensity_df(self):
        """The fragment intensity dataframe."""
        return self._fragment_intensity_df

    def refine_df(self):
        """Refine self._precursor_df with the default settings of
        `precursor.refine_precursor_df`."""
        precursor.refine_precursor_df(
            self._precursor_df
        )

    def append_decoy_sequence(self):
        """Append decoy precursors to self._precursor_df.

        Does nothing when `self.decoy` is not a key of
        `decoy_lib_provider.decoy_dict`. Otherwise a 'decoy' column is
        added (0 for the existing rows, 1 for the decoy rows), both frames
        are concatenated and the result is refined with `refine_df`.
        """
        # Imported here rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from alphabase.spectrum_library.decoy_library import (
            decoy_lib_provider
        )
        if self.decoy not in (
            decoy_lib_provider.decoy_dict
        ): return
        decoy_lib = (
            decoy_lib_provider.get_decoy_lib(
                self.decoy, self
            )
        )
        decoy_lib.decoy_sequence()
        self._precursor_df['decoy'] = 0
        decoy_lib._precursor_df['decoy'] = 1
        # NOTE(review): columns already present on the decoy frame (e.g.
        # hashes, precursor_mz) are concatenated as-is; confirm whether
        # they should be recomputed for the decoy sequences.
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` with
        # default arguments is its exact replacement (index labels kept).
        self._precursor_df = pd.concat(
            [self._precursor_df, decoy_lib._precursor_df]
        )
        self.refine_df()

    def clip_by_precursor_mz_(self):
        ''' 
        Clip self._precursor_df inplace: drop all rows whose
        'precursor_mz' is below `self.min_precursor_mz` or above
        `self.max_precursor_mz`, then reset the index.
        '''
        precursor_mzs = self._precursor_df['precursor_mz']
        out_of_range = (
            (precursor_mzs<self.min_precursor_mz)|
            (precursor_mzs>self.max_precursor_mz)
        )
        self._precursor_df.drop(
            self._precursor_df.index[out_of_range], inplace=True
        )
        self._precursor_df.reset_index(drop=True, inplace=True)


    def flatten_fragment_data(
        self
    )->typing.Tuple[np.ndarray, np.ndarray]:
        '''
        Create flattened (1-D, row-major) np.ndarray for fragment mz and 
        intensity dataframes, respectively. 

        `reshape(-1)` returns a view whenever numpy can do so without 
        copying (e.g. a single-dtype frame backed by a C-contiguous 
        2-D array); in that case:
          1. This method is fast; 
          2. Changing the array values will change the df values. 
        Otherwise numpy silently returns a copy, so do not rely on 
        write-through without checking (`np.shares_memory`).

        They can be unraveled back using:
          `array.reshape(-1, len(self._fragment_mz_df.columns))`

        Returns:
            np.ndarray: 1-D flattened mz array 
            np.ndarray: 1-D flattened intensity array 
        '''
        return (
            self._fragment_mz_df.values.reshape(-1),
            self._fragment_intensity_df.values.reshape(-1)
        )

    def calc_precursor_mz(self):
        """Calculate precursor mz for self._precursor_df via
        `fragment.update_precursor_mz`, then clip the dataframe to
        the [min_precursor_mz, max_precursor_mz] range."""
        fragment.update_precursor_mz(self._precursor_df)
        self.clip_by_precursor_mz_()

    def update_precursor_mz(self):
        """Calculate precursor mz for self._precursor_df"""
        self.calc_precursor_mz()

    def hash_precursor_df(self):
        """Apply `precursor.hash_precursor_df` to self._precursor_df."""
        precursor.hash_precursor_df(
            self._precursor_df
        )

    def _get_hdf_to_save(self, 
        hdf_file, 
        delete_existing=False
    ):
        """Open `hdf_file` writable and return its `library` group.

        Args:
            hdf_file (str): The hdf file name
            delete_existing (bool): Forwarded to `HDF_File`. 
              Defaults to False.
        """
        _hdf = HDF_File(
            hdf_file, 
            read_only=False, 
            truncate=True,
            delete_existing=delete_existing
        )
        return _hdf.library

    def _get_hdf_to_load(self,
        hdf_file, 
    ):
        """Open `hdf_file` with `HDF_File` defaults and return its
        `library` group."""
        _hdf = HDF_File(
            hdf_file,
        )
        return _hdf.library

    def save_df_to_hdf(self, 
        hdf_file:str, 
        df_key: str,
        df: pd.DataFrame,
        delete_existing=False
    ):
        """Save a single dataframe into the `library` group of hdf_file.

        Args:
            hdf_file (str): The hdf file name
            df_key (str): The dataset/dataframe name in the hdf file
            df (pd.DataFrame): The dataframe to save
            delete_existing (bool): Forwarded to `HDF_File`. 
              Defaults to False.
        """
        self._get_hdf_to_save(
            hdf_file, 
            delete_existing=delete_existing
        ).add_group(df_key, df)

    def load_df_from_hdf(self, 
        hdf_file:str, 
        df_name: str
    )->pd.DataFrame:
        """Load specific dataset (dataframe) from hdf_file.

        Args:
            hdf_file (str): The hdf file name
            df_name (str): The dataset/dataframe name in the hdf file

        Returns:
            pd.DataFrame: Loaded dataframe
        """
        return getattr(
            self._get_hdf_to_load(hdf_file), df_name
        ).values

    def save_hdf(self, hdf_file):
        """Save the whole library into hdf_file, replacing existing content.

        The precursor dataframe is split into two groups:
          - 'precursor_df': all columns not in `self.mod_seq_df_columns`
            (this includes the hash columns);
          - 'mod_seq_df': the `self.mod_seq_df_columns` that are present, 
            plus 'mod_seq_hash' and 'mod_seq_charge_hash'.
        Fragment mz and intensity dataframes are stored as 
        'fragment_mz_df' and 'fragment_intensity_df'.

        Args:
            hdf_file (str): The hdf file name
        """
        _hdf = HDF_File(
            hdf_file, 
            read_only=False, 
            truncate=True,
            delete_existing=True
        )
        # Hashes are only computed when missing; existing ones are reused.
        if 'mod_seq_charge_hash' not in self._precursor_df.columns:
            self.hash_precursor_df()

        mod_seq_cols = self.mod_seq_df_columns+[
            'mod_seq_hash', 'mod_seq_charge_hash'
        ]

        _hdf.library = {
            'precursor_df': self._precursor_df[
                [
                    col for col in self._precursor_df.columns 
                    if col not in self.mod_seq_df_columns
                ]
            ],
            'mod_seq_df': self._precursor_df[
                [
                    col for col in self._precursor_df.columns 
                    if col in mod_seq_cols
                ]
            ],
            'fragment_mz_df': self._fragment_mz_df,
            'fragment_intensity_df': self._fragment_intensity_df,
        }
        
    def load_hdf(self, hdf_file, load_mod_seq=False):
        """Load the library from hdf_file.

        Args:
            hdf_file (str): The hdf file name
            load_mod_seq (bool): If True, also copy the columns of 
              'mod_seq_df' that are listed in `self.mod_seq_df_columns` 
              into the precursor dataframe. Defaults to False.
        """
        _hdf = HDF_File(
            hdf_file,
        )
        self._precursor_df:pd.DataFrame = _hdf.library.precursor_df.values
        if load_mod_seq:
            mod_seq_df = _hdf.library.mod_seq_df.values
            cols = [
                col for col in mod_seq_df.columns 
                if col in self.mod_seq_df_columns
            ]
            self._precursor_df[cols] = mod_seq_df[cols]
        self._fragment_mz_df = _hdf.library.fragment_mz_df.values
        self._fragment_intensity_df = _hdf.library.fragment_intensity_df.values

In [None]:
#hide
# Only the 100 m/z precursor lies outside the default clipping bounds
# (400..6000), so exactly that row must be removed in place.
lib = SpecLibBase([])
lib._precursor_df = pd.DataFrame(
    {'precursor_mz': [100, 1000, 1500, 2000], 'charge': 2}
)
lib.clip_by_precursor_mz_()
kept_mz = lib.precursor_df['precursor_mz'].values
assert np.allclose(kept_mz, [1000, 1500, 2000])

In [None]:
import os

import pandas as pd

# Build a small precursor table: a modified 9-mer and an unmodified
# 14-mer, each repeated `repeat` times.
repeat = 3
peptides = ['AGHCEWQMK']*repeat
mods = ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M']*repeat
sites = ['0;4;8']*repeat
peptides += ['AGHCEWQMKAADER']*repeat
mods += ['']*repeat
sites += ['']*repeat

precursor_df = pd.DataFrame({
    'sequence': peptides,
    'mods': mods,
    'mod_sites': sites
})
precursor_df['nAA'] = precursor_df['sequence'].str.len()
precursor_df['charge'] = 2
target_lib = SpecLibBase(['b_z1','b_z2','y_z1','y_z2'])
# Assign the private attribute directly, bypassing the property setter
# (and thus its refine step); `precursor_df` stays the same object.
target_lib._precursor_df = precursor_df
target_lib.calc_precursor_mz()
target_lib._fragment_mz_df = pd.DataFrame()
target_lib._fragment_intensity_df = pd.DataFrame()

# `exist_ok=True` replaces the check-then-create pair.
os.makedirs('sandbox', exist_ok=True)
hdf_path = 'sandbox/test_lib.hdf'
try:
    target_lib.save_hdf(hdf_path)
    target_lib.save_df_to_hdf(hdf_path,'protein_df',pd.DataFrame(
        {
            'id':[1,2],
            'full_name': [1,2],
            'description': [1,2],
            'sequence': [1,2]
        })
    )

    # Default load: sequence-level columns stay in 'mod_seq_df',
    # the hash columns come along with 'precursor_df'.
    new_lib = SpecLibBase([])
    new_lib.load_hdf(hdf_path)

    assert len(new_lib.precursor_df) > 0
    assert len(new_lib.fragment_mz_df) == 0
    assert len(new_lib.fragment_intensity_df) == 0

    assert 'sequence' not in new_lib.precursor_df.columns
    assert 'mod_seq_hash' in new_lib.precursor_df.columns

    # With load_mod_seq=True the sequence-level columns are merged back.
    new_lib = SpecLibBase([])
    new_lib.load_hdf(hdf_path, load_mod_seq=True)
    assert 'sequence' in new_lib.precursor_df.columns
    assert 'mod_seq_hash' in new_lib.precursor_df.columns

    # Individual groups can be read back by name.
    df = target_lib.load_df_from_hdf(hdf_path, 'precursor_df')
    assert len(precursor_df)==len(df)
    df = target_lib.load_df_from_hdf(hdf_path, 'protein_df')
    assert len(df)==2
finally:
    # Remove the temporary file even when an assertion above fails.
    if os.path.exists(hdf_path):
        os.remove(hdf_path)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262
3,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095
4,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095
5,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095


In [None]:
# After appending decoys the library must hold twice as many rows as
# the original `precursor_df` (one decoy row per target row).
target_lib.append_decoy_sequence()
assert 2*len(precursor_df) == len(target_lib.precursor_df)
target_lib.precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,mod_seq_hash,mod_seq_charge_hash,decoy
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,0
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,0
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,0
3,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,1
4,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,1
5,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,-3804119042269154857,527285331510768262,1
6,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095,0
7,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095,0
8,AGHCEWQMKAADER,,,14,2,816.356299,1252374117656441545,-279089282710494095,0
9,EDAAKMQWECHGAR,,,14,2,816.356299,1252374117656441545,-279089282710494095,1
