In [None]:
#default_exp spectrum_library.decoy_library

In [None]:
#export
import copy
from alphabase.spectrum_library.library_base import SpecLibBase
from alphabase.io.hdf import HDF_File

class DecoyLib(SpecLibBase):
    """
    Decoy spectral library derived from a target library.

    The instance starts as a deep copy of the target library's state.
    `translate_to_decoy()` then rewrites the copied sequences (reversed
    by default) and recomputes the fragment m/z values for them.
    Subclasses override the `_decoy_*` hooks to implement other
    decoy strategies.
    """
    def __init__(self, 
        target_lib:SpecLibBase,
        fix_C_term = True,
    ):
        """
        target_lib: library whose state is cloned into this decoy library.
        fix_C_term: if True, the last residue stays in place when the
          sequence is reversed; if False, the whole sequence is reversed.
        """
        # Clone every attribute of the target instead of running the
        # base-class constructor; the deep copy keeps later decoy edits
        # from touching the target library's objects.
        self.__dict__ = copy.deepcopy(target_lib.__dict__)
        self.fix_C_term = fix_C_term

    def translate_to_decoy(self):
        """
        Run all decoy steps in order: sequences, modifications,
        meta data, then fragments (which depend on the new sequences).
        """
        self._decoy_seq()
        self._decoy_mod()
        self._decoy_meta()
        self._decoy_frag()

    def _decoy_meta(self):
        """
        Decoy for CCS/RT or other meta data.
        No-op in this class; hook for subclasses.
        """
        pass

    def _decoy_mod(self):
        """
        Decoy for modifications and modification sites.
        No-op in this class; hook for subclasses.
        """
        # NOTE(review): `mods`/`mod_sites` are left as copied from the
        # target, so after `_decoy_seq` the site indices refer to
        # different residues -- confirm this is intended.
        pass

    def _decoy_frag(self):
        """
        Decoy for fragment masses and intensities
        """
        self._decoy_fragment_mz()
        self._decoy_fragment_intensity()
    
    def _decoy_fragment_mz(self):
        """
        Drop the m/z and fragment-index columns copied from the target
        and recompute them with `calc_fragment_mz_df()`.
        """
        # The columns only exist if the target had its fragment m/z
        # computed before being copied; skip missing ones instead of
        # raising KeyError.
        for stale_col in ('precursor_mz', 'frag_start_idx', 'frag_end_idx'):
            if stale_col in self._precursor_df.columns:
                del self._precursor_df[stale_col]

        self.calc_fragment_mz_df()
        
    def _decoy_fragment_intensity(self):
        """
        Decoy for fragment intensities.
        No-op in this class; hook for subclasses.
        """
        pass

    def _decoy_seq(self):
        """
        Reverse every peptide sequence in `_precursor_df`.
        With `fix_C_term` the last residue keeps its position.
        """
        def _reverse_keep_C_term(seq):
            # `seq[-1:]` (not `seq[-1]`) keeps an empty sequence empty
            # instead of raising IndexError.
            return seq[:-1][::-1] + seq[-1:]

        def _reverse_all(seq):
            return seq[::-1]

        # Choose the strategy once rather than testing the flag per row.
        reverse = _reverse_keep_C_term if self.fix_C_term else _reverse_all
        self._precursor_df['sequence'] = (
            self._precursor_df.sequence.apply(reverse)
        )

    def save_hdf(self, hdf_file):
        """
        Write the precursor, fragment m/z and fragment intensity
        dataframes to the `library.decoy` group of `hdf_file`.
        """
        # NOTE(review): `delete_existing=False` presumably keeps data
        # already in the file (the notebook stores the target library
        # in the same file) -- confirm against HDF_File.
        _hdf = HDF_File(
            hdf_file, 
            read_only=False, 
            truncate=True,
            delete_existing=False
        )
        _hdf.library.decoy = {
            'precursor_df': self._precursor_df,
            'fragment_mz_df': self._fragment_mz_df,
            'fragment_intensity_df': self._fragment_intensity_df,
        }

    def load_hdf(self, hdf_file):
        """
        Replace the precursor, fragment m/z and fragment intensity
        dataframes with those stored in the `library.decoy` group
        of `hdf_file`.
        """
        _hdf = HDF_File(
            hdf_file,
        )
        _hdf_lib = _hdf.library
        self._precursor_df = _hdf_lib.decoy.precursor_df.values
        self._fragment_mz_df = _hdf_lib.decoy.fragment_mz_df.values
        self._fragment_intensity_df = _hdf_lib.decoy.fragment_intensity_df.values

class DiaNNDecoyLib(DecoyLib):
    """
    Decoy library that mutates residues instead of reversing sequences.

    The second and the second-to-last residue of every peptide are
    replaced according to the `raw_AAs` -> `mutated_AAs` mapping;
    the first and last residues always stay unchanged.
    """
    def __init__(self, 
        target_lib:SpecLibBase,
        fix_C_term = True,
        raw_AAs:str = 'GAVLIFMPWSCTYHKRQEND',
        mutated_AAs:str = 'LLLVVLLLLTSSSSLLNDQE', #DiaNN
    ):  
        """
        target_lib: library whose state is cloned into this decoy library.
        fix_C_term: stored by the base class; the mutation in
          `_decoy_seq` does not read it.
        raw_AAs: residues that can be mutated.
        mutated_AAs: replacement residues; position i replaces
          `raw_AAs[i]`.
        """
        super().__init__(target_lib, fix_C_term)
        self.raw_AAs = raw_AAs
        self.mutated_AAs = mutated_AAs

    def _mutate_AA(self, aa):
        """
        Return the replacement for a single residue.
        Raises ValueError if `aa` is not in `raw_AAs`.
        """
        return self.mutated_AAs[self.raw_AAs.index(aa)]

    def _mutate_seq(self, seq):
        """
        Mutate the second and second-to-last residue of `seq`;
        the result always has the same length as `seq`.
        """
        n_res = len(seq)
        if n_res < 3:
            # No residue lies strictly between the termini.
            return seq
        if n_res == 3:
            # Second and second-to-last residue are the same position;
            # mutate it once so the length is preserved.
            return seq[0] + self._mutate_AA(seq[1]) + seq[2]
        return (
            seq[0] + self._mutate_AA(seq[1]) +
            seq[2:-2] + self._mutate_AA(seq[-2]) + seq[-1]
        )

    def _decoy_seq(self):
        """
        Replace every sequence in `_precursor_df` by its mutated form.
        """
        self._precursor_df['sequence'] = (
            self._precursor_df.sequence.apply(self._mutate_seq)
        )

In [None]:
#export
class DecoyLibProvider(object):
    """
    Registry that maps a case-insensitive method name to a
    decoy library class and instantiates it on request.
    """
    def __init__(self):
        # lower-cased method name -> decoy class
        self.decoy_dict = {}

    def register(self, name, decoy_class):
        """
        Register `decoy_class` under `name` (stored lower-cased).
        Registering an existing name replaces the previous class.
        """
        self.decoy_dict[name.lower()] = decoy_class

    def get_decoy(self, name, 
        target_lib, fix_C_term=True
    )->'DecoyLib':
        """
        Instantiate the decoy class registered under `name`.

        name: registered method name, matched case-insensitively.
        target_lib: passed as first argument to the decoy class.
        fix_C_term: passed as second argument to the decoy class.

        Raises KeyError (listing the registered names) if `name`
        is unknown.
        """
        key = name.lower()
        if key not in self.decoy_dict:
            raise KeyError(
                "Unknown decoy method '{}'; registered methods: {}".format(
                    name, sorted(self.decoy_dict)
                )
            )
        return self.decoy_dict[key](
            target_lib, fix_C_term
        )

# Module-level registry with the built-in decoy strategies.
decoy_lib_provider = DecoyLibProvider()
# 'reverse': sequence reversal (DecoyLib).
decoy_lib_provider.register(name='reverse', decoy_class=DecoyLib)
# 'diann': residue mutation (DiaNNDecoyLib).
decoy_lib_provider.register(name='diann', decoy_class=DiaNNDecoyLib)

In [None]:
#hide
import pandas as pd
# Test fixture: `repeat` copies of a modified 9-mer followed by
# `repeat` copies of an unmodified 14-mer, all with charge 2.
repeat = 3
peptides = ['AGHCEWQMK'] * repeat + ['AGHCEWQMKAADER'] * repeat
mods = (
    ['Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M'] * repeat
    + [''] * repeat
)
sites = ['0;4;8'] * repeat + [''] * repeat

precursor_df = pd.DataFrame(
    {'sequence': peptides, 'mods': mods, 'mod_sites': sites}
).assign(
    # number of residues per peptide
    nAA=lambda frame: frame['sequence'].str.len(),
    charge=2,
)
precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge
0,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
1,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
2,AGHCEWQMK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2
3,AGHCEWQMKAADER,,,14,2
4,AGHCEWQMKAADER,,,14,2
5,AGHCEWQMKAADER,,,14,2


In [None]:
#hide
# Build the target library used by the decoy tests below.
# The list names the fragment types; they appear as the columns
# of the fragment m/z dataframe displayed at the end of this cell.
target_lib = SpecLibBase(['b_z1','b_z2','y_z1','y_z2'])
# Inject the fixture directly (same object, not a copy).
target_lib._precursor_df = precursor_df
target_lib.calc_fragment_mz_df()
# No intensities are needed here; an empty frame is stored so the
# attribute exists when the library is saved and later copied.
target_lib._fragment_intensity_df = pd.DataFrame()
# The decoy cells below write into this same file.
target_lib.save_hdf('../../sandbox/lib.hdf')
target_lib.fragment_mz_df

Unnamed: 0,b_z1,b_z2,y_z1,y_z2
0,114.054954,57.531115,1091.439711,546.223494
1,171.076418,86.041847,1034.418247,517.712762
2,308.135330,154.571303,897.359336,449.183306
3,468.165979,234.586627,737.328687,369.167981
4,597.208572,299.107924,608.286094,304.646685
...,...,...,...,...
58,1071.449882,536.228579,561.262715,281.134996
59,1142.486996,571.747136,490.225602,245.616439
60,1213.524110,607.265693,419.188488,210.097882
61,1328.551053,664.779164,304.161545,152.584410


In [None]:
#hide
# Reverse decoy with the default fix_C_term=True:
# everything but the last residue is reversed.
decoy_lib = decoy_lib_provider.get_decoy('reverse', target_lib)
decoy_lib.translate_to_decoy()
assert decoy_lib.precursor_df.sequence.tolist() == (
    ['MQWECHGAK']*3 + ['EDAAKMQWECHGAR']*3
), 'unexpected reversed decoy sequences'
# Decoys must keep the peptide length.
assert (
    decoy_lib.precursor_df.sequence.str.len()
    == decoy_lib.precursor_df.nAA
).all(), 'decoy sequence length differs from nAA'
decoy_lib.precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_end_idx
0,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,0,8
1,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,8,16
2,MQWECHGAK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,602.747333,16,24
3,EDAAKMQWECHGAR,,,14,2,816.356299,24,37
4,EDAAKMQWECHGAR,,,14,2,816.356299,37,50
5,EDAAKMQWECHGAR,,,14,2,816.356299,50,63


In [None]:
#hide
# DiaNN-style decoy: the second and second-to-last residues are mutated.
decoy_lib = decoy_lib_provider.get_decoy('diann', target_lib, fix_C_term=False)
decoy_lib.translate_to_decoy()
# Check before saving so a broken decoy is never written to disk.
assert decoy_lib.precursor_df.sequence.tolist() == (
    ['ALHCEWQLK']*3 + ['ALHCEWQMKAADDR']*3
), 'unexpected DiaNN decoy sequences'
assert (
    decoy_lib.precursor_df.sequence.str.len()
    == decoy_lib.precursor_df.nAA
).all(), 'decoy sequence length differs from nAA'
decoy_lib.save_hdf('../../sandbox/lib.hdf')
decoy_lib.precursor_df

Unnamed: 0,sequence,mods,mod_sites,nAA,charge,precursor_mz,frag_start_idx,frag_end_idx
0,ALHCEWQLK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,621.800422,0,8
1,ALHCEWQLK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,621.800422,8,16
2,ALHCEWQLK,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,0;4;8,9,2,621.800422,16,24
3,ALHCEWQMKAADDR,,,14,2,837.379774,24,37
4,ALHCEWQMKAADDR,,,14,2,837.379774,37,50
5,ALHCEWQMKAADDR,,,14,2,837.379774,50,63


In [None]:
#hide
# The target library must still be readable after the decoy
# library was saved into the same file.
_hdf = HDF_File('../../sandbox/lib.hdf')
assert _hdf.library.precursor_df.values.sequence.tolist() == (
    ['AGHCEWQMK']*3 + ['AGHCEWQMKAADER']*3
), 'target sequences in the HDF file changed after saving decoys'
_hdf.library.precursor_df.values

Unnamed: 0,charge,frag_end_idx,frag_start_idx,mod_sites,mods,nAA,precursor_mz,sequence
0,2,8,0,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,602.747333,AGHCEWQMK
1,2,16,8,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,602.747333,AGHCEWQMK
2,2,24,16,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,602.747333,AGHCEWQMK
3,2,37,24,,,14,816.356299,AGHCEWQMKAADER
4,2,50,37,,,14,816.356299,AGHCEWQMKAADER
5,2,63,50,,,14,816.356299,AGHCEWQMKAADER


In [None]:
#hide
# The decoy group holds the DiaNN decoys saved above.
assert _hdf.library.decoy.precursor_df.values.sequence.tolist() == (
    ['ALHCEWQLK']*3 + ['ALHCEWQMKAADDR']*3
), 'decoy sequences stored in the HDF file are not the DiaNN decoys'
_hdf.library.decoy.precursor_df.values

Unnamed: 0,charge,frag_end_idx,frag_start_idx,mod_sites,mods,nAA,precursor_mz,sequence
0,2,8,0,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
1,2,16,8,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
2,2,24,16,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
3,2,37,24,,,14,837.379774,ALHCEWQMKAADDR
4,2,50,37,,,14,837.379774,ALHCEWQMKAADDR
5,2,63,50,,,14,837.379774,ALHCEWQMKAADDR


In [None]:
#hide
# Round trip: a fresh DecoyLib starts as a copy of the target and
# load_hdf() replaces its dataframes with the stored decoy ones.
test_lib = DecoyLib(target_lib)
test_lib.load_hdf('../../sandbox/lib.hdf')
assert test_lib._precursor_df.sequence.tolist() == (
    ['ALHCEWQLK']*3 + ['ALHCEWQMKAADDR']*3
), 'load_hdf did not restore the saved decoy sequences'
test_lib._precursor_df

Unnamed: 0,charge,frag_end_idx,frag_start_idx,mod_sites,mods,nAA,precursor_mz,sequence
0,2,8,0,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
1,2,16,8,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
2,2,24,16,0;4;8,Acetyl@Protein N-term;Carbamidomethyl@C;Oxidat...,9,621.800422,ALHCEWQLK
3,2,37,24,,,14,837.379774,ALHCEWQMKAADDR
4,2,50,37,,,14,837.379774,ALHCEWQMKAADDR
5,2,63,50,,,14,837.379774,ALHCEWQMKAADDR
