In [2]:
%reload_ext autoreload
%autoreload 2


In [113]:
from peptdeep.model.ms2 import ModelMS2Transformer, ModelMS2Bert
from peptdeep.pretrained_models import ModelManager
from peptdeep.settings import global_settings
from peptdeep.model.rt import IRT_PEPTIDE_DF
import pandas as pd
from typing import IO, Tuple, List, Union

In [6]:
model_mgr = ModelManager(mask_modloss=True, device='cuda:2')

In [12]:
df = IRT_PEPTIDE_DF.copy()
# randomly add some modifications, this may change the real irt
df.loc[1,'mods'] = 'Phospho@S'
df.loc[1,'mod_sites'] = '5'

In [191]:
df

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,charge,nce,instrument,frag_start_idx,frag_stop_idx
0,LGGNEQVTR,RT-pep a,-24.92,,,9,2,30.0,Lumos,0,8
1,GAGSSEPVTGLDAK,RT-pep b,0.0,Phospho@S,5.0,14,2,30.0,Lumos,8,21
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,2,30.0,Lumos,21,33
3,YILAGVENSK,RT-pep d,19.79,,,10,2,30.0,Lumos,33,42
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,2,30.0,Lumos,42,53
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,2,30.0,Lumos,53,64
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,2,30.0,Lumos,64,76
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,2,30.0,Lumos,76,88
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,2,30.0,Lumos,88,99
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,2,30.0,Lumos,99,110


In [26]:
aa_num = 0
for i in df["sequence"]:
    aa_num += len(i) - 1
aa_num

123

In [14]:
df['charge'] = 2
inten_df = model_mgr.predict_ms2(df)
inten_df

2023-09-27 16:41:57> Predicting MS2 ...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 21.12it/s]


Unnamed: 0,b_z1,b_z2,y_z1,y_z2,b_modloss_z1,b_modloss_z2,y_modloss_z1,y_modloss_z2
0,0.000000,0.0,1.000000,0.004739,0.0,0.0,0.0,0.0
1,0.162034,0.0,0.360414,0.000000,0.0,0.0,0.0,0.0
2,0.046660,0.0,0.109920,0.005516,0.0,0.0,0.0,0.0
3,0.018628,0.0,0.203326,0.000000,0.0,0.0,0.0,0.0
4,0.013530,0.0,0.267507,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
118,0.000000,0.0,0.098116,0.000000,0.0,0.0,0.0,0.0
119,0.000000,0.0,0.647829,0.000000,0.0,0.0,0.0,0.0
120,0.000000,0.0,0.034361,0.000000,0.0,0.0,0.0,0.0
121,0.000000,0.0,0.130814,0.000000,0.0,0.0,0.0,0.0


In [188]:
ms2_model = ModelMS2Transformer(2)

In [189]:
df.head()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,charge,nce,instrument,frag_start_idx,frag_stop_idx
0,LGGNEQVTR,RT-pep a,-24.92,,,9,2,30.0,Lumos,0,8
1,GAGSSEPVTGLDAK,RT-pep b,0.0,Phospho@S,5.0,14,2,30.0,Lumos,8,21
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,2,30.0,Lumos,21,33
3,YILAGVENSK,RT-pep d,19.79,,,10,2,30.0,Lumos,33,42
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,2,30.0,Lumos,42,53


In [190]:
ms2_model(
    torch.randint(87,(1,11)),
    torch.rand(1,11,109, dtype=torch.float32),
    torch.rand(1,1,  dtype=torch.float32),
    torch.rand(1,1, dtype=torch.float32),
    torch.randint(2,(1,))
)

IndexError: index out of range in self

In [84]:
import torch
import peptdeep.model.building_block as building_block

class ModelMS2Transformer(torch.nn.Module):
    """Transformer model for MS2 prediction

    Parameters
    ----------
    num_frag_types : int
        Total number of fragment types of a fragmentation position to predict

    num_modloss_types : int, optional
        Number of fragment types of a fragmentation position to predict, by default 0

    mask_modloss : bool, optional
        If True, the modloss layer will be disabled, by default True

    dropout : float, optional
        Dropout, by default 0.1

    nlayers : int, optional
        Number of transformer layer, by default 4

    hidden : int, optional
        Hidden layer size, by default 256
    """

    def __init__(self,
        num_frag_types:int,
        num_modloss_types:int=0,
        mask_modloss:bool=True,
        dropout:float=0.1,
        nlayers:int=4,
        hidden:int=256,
        **kwargs,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self._num_modloss_types = num_modloss_types
        self._num_non_modloss = num_frag_types-num_modloss_types
        self._mask_modloss = mask_modloss
        if num_modloss_types == 0:
            self._mask_modloss = True

        meta_dim = 8
        self.input_nn = building_block.Input_26AA_Mod_PositionalEncoding(hidden-meta_dim)

        self.meta_nn = building_block.Meta_Embedding(meta_dim)

        self.hidden_nn = building_block.Hidden_Transformer(
            hidden, nlayers=nlayers, dropout=dropout
        )

        self.output_nn = building_block.Decoder_Linear(
            hidden,
            self._num_non_modloss,
        )
        
        if num_modloss_types > 0:
            # for transfer learning of modloss frags
            self.modloss_nn = torch.nn.ModuleList([
                building_block.Hidden_Transformer(
                    hidden, nlayers=1, dropout=dropout
                ),
                building_block.Decoder_Linear(
                    hidden, num_modloss_types,
                ),
            ])
        else:
            self.modloss_nn = None


    def forward(self,
        aa_indices,
        mod_x,
        charges:torch.Tensor,
        NCEs:torch.Tensor,
        instrument_indices,
    ):
        in_x = self.dropout(self.input_nn(
            aa_indices, mod_x
        ))
        print(in_x.shape)
        meta_x = self.meta_nn(
            charges, NCEs, instrument_indices
        ).unsqueeze(1).repeat(1,in_x.size(1),1)
        print(meta_x.shape)
        in_x = torch.cat((in_x, meta_x),2)

        hidden_x = self.hidden_nn(in_x)
        hidden_x = self.dropout(hidden_x+in_x*0.2)

        out_x = self.output_nn(
            hidden_x
        )

        if self._num_modloss_types > 0:
            if self._mask_modloss:
                out_x = torch.cat((out_x, torch.zeros(
                    *out_x.size()[:2],self._num_modloss_types,
                    device=in_x.device
                )), 2)
            else:
                modloss_x = self.modloss_nn[0](
                    in_x
                ) + hidden_x
                modloss_x = self.modloss_nn[-1](
                    modloss_x
                )
                out_x = torch.cat((
                    out_x, modloss_x
                ),2)

        return out_x[:,3:,:]


In [95]:
from peptdeep.settings import global_settings

global_settings['model_mgr']['default_nce']
global_settings['model_mgr']['default_instrument']

30.0

In [96]:
df

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,charge,nce,instrument,frag_start_idx,frag_stop_idx
0,LGGNEQVTR,RT-pep a,-24.92,,,9,2,30.0,Lumos,0,8
1,GAGSSEPVTGLDAK,RT-pep b,0.0,Phospho@S,5.0,14,2,30.0,Lumos,8,21
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,2,30.0,Lumos,21,33
3,YILAGVENSK,RT-pep d,19.79,,,10,2,30.0,Lumos,33,42
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,2,30.0,Lumos,42,53
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,2,30.0,Lumos,53,64
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,2,30.0,Lumos,64,76
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,2,30.0,Lumos,76,88
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,2,30.0,Lumos,88,99
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,2,30.0,Lumos,99,110


In [101]:
_grouped = df.groupby('nAA')

In [109]:
fixed_sequence_len = 0

In [125]:
def _get_features_from_batch_df(
        batch_df:pd.DataFrame, **kwargs,
    )->Union[torch.LongTensor, Tuple[torch.Tensor]]:
        """
        Any sub-class must re-implement this method:
        
        - Return `self._get_aa_features()` for sequence-level prediciton 
        - Return `self._get_aa_mod_features()` for modified sequence-level

        Parameters
        ----------
        batch_df : pd.DataFrame
            Batch of precursor dataframe.

        Returns
        -------
        Union[torch.LongTensor, Tuple[torch.Tensor]]: 
            A LongTensor if the sub-class call `self._get_aa_features(batch_df)` (default).
            Or a tuple of tensors if call `self._get_aa_mod_features(batch_df)`.
        """
        return _get_aa_features(batch_df)
    
def _get_aa_features( 
        batch_df:pd.DataFrame
    )->torch.LongTensor:
        """
        Get AA indices
        """
        if fixed_sequence_len == 0:
            return _get_aa_indice_features(batch_df)
        else:
            return _get_aa_indice_features_padding_zeros(batch_df)
        
def _get_aa_indice_features(
         batch_df:pd.DataFrame
    )->torch.LongTensor:
        """
        Get indices values for fixed length sequences 
        with 128 ascii codes.
        """
        return _as_tensor(
            get_ascii_indices(
                batch_df['sequence'].values.astype('U')
            ), 
            dtype=torch.long
        )

def _get_aa_indice_features_padding_zeros(
     batch_df:pd.DataFrame
)->torch.LongTensor:
    """
    Get indices values of variable length sequences 
    using 128 ascii codes
    """
    if fixed_sequence_len < 0:
        max_len = batch_df.nAA.max()
    else:
        max_len = fixed_sequence_len
    return _as_tensor(
        get_ascii_indices(
            batch_df['sequence'].apply(
                lambda seq: seq + chr(0)*(max_len-len(seq))
            ).values.astype('U')
        ), 
        dtype=torch.long
    )

def _as_tensor( 
        data:np.ndarray, 
        dtype:torch.dtype=torch.float32
    )->torch.Tensor:
        """Convert numerical np.array to pytorch tensor.
        The tensor will be stored in self.device

        Parameters
        ----------
        data : np.ndarray
            Numerical np.ndarray to be converted as a tensor
            
        dtype : torch.dtype, optional
            The dtype of the indices used for embedding should be `torch.long`. 
            Defaults to `torch.float32`

        Returns
        -------
        torch.Tensor
            The tensor stored in self.device
        """
        return torch.tensor(data, dtype=dtype, device="cpu")

In [126]:
import numpy as np
from peptdeep.model.featurize import (
    get_ascii_indices, get_batch_aa_indices,
    get_batch_mod_feature
)

In [142]:
for nAA, df_group in _grouped:
    for i in range(0, len(df_group), 1):
        batch_end = i+1
        
        batch_df = df_group.iloc[i:batch_end,:]
        features = _get_features_from_batch_df(
            batch_df
        )

In [139]:
features

tensor([[ 0, 76, 70, 76, 81, 70, 71, 65, 81, 71, 83, 80, 70, 76, 75,  0]])

In [85]:
ms2 = ModelMS2Transformer(2)

In [169]:
ms2(
    features,
    features.repeat(2,1),
    2,
    30, 
    "Lumos"
)

TypeError: forward() missing 1 required positional argument: 'instrument_indices'

In [92]:
in_seq = [df["sequence"].iloc[0]]
in_mod = torch.rand(1,2,2)
in_charge = torch.tensor(df["charge"].iloc[0]).unsqueeze(0)
in_nce = torch.tensor(df["nce"].iloc[0]).unsqueeze(0)
in_instrument = torch.rand(1,2)

In [146]:
in_mod.shape

torch.Size([1, 2, 2])

In [165]:
features.repeat(2,1).shape

torch.Size([2, 16])

In [157]:
features.unsqueeze(2).shape

torch.Size([1, 16, 1])