In [None]:
%reload_ext autoreload
%autoreload 2

## Building our own models for RT prediction

In [None]:
from peptdeep.model.featurize import (
    get_batch_aa_indices, 
    get_batch_mod_feature
)

from peptdeep.settings import model_const

import peptdeep.model.model_interface as model_base

# Length of the 'mod_elements' entry in peptdeep's model constants.
# NOTE(review): not referenced by any other cell visible in this notebook.
mod_feature_size = len(model_const['mod_elements'])

import torch
import pandas as pd

In [None]:
class RT_LSTM_Module(torch.nn.Module):
    """RT regression network built from peptdeep building blocks.

    The network chains ``model_base.Encoder_26AA_Mod_CNN_LSTM_AttnSum``,
    a dropout layer and ``model_base.Decoder_Linear``.

    Parameters
    ----------
    dropout : float, optional
        Probability used by the dropout layer that sits between the
        encoder and the decoder. Defaults to 0.2.
    hidden : int, optional
        Value passed to the encoder constructor and used as the first
        argument of the decoder constructor. Defaults to 128, which is
        the value that used to be hard-coded.
    """
    def __init__(self,
        dropout=0.2,
        hidden=128,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        # Encoder and decoder must agree on the same width, so both are
        # driven by the single `hidden` argument.
        self.rt_encoder = model_base.Encoder_26AA_Mod_CNN_LSTM_AttnSum(
            hidden
        )

        self.rt_decoder = model_base.Decoder_Linear(
            hidden,
            1
        )

    def forward(self,
        aa_indices,
        mod_x,
    ):
        """Run encoder -> dropout -> decoder and squeeze dimension 1.

        Parameters
        ----------
        aa_indices : torch.Tensor
            Amino-acid index tensor, forwarded unchanged to the encoder.
        mod_x : torch.Tensor
            Modification feature tensor, forwarded unchanged to the encoder.

        Returns
        -------
        torch.Tensor
            Decoder output with dimension 1 squeezed out; presumably of
            shape (batch,) because the decoder is built with a second
            argument of 1 -- TODO confirm against ``Decoder_Linear``.
        """
        x = self.rt_encoder(aa_indices, mod_x)
        x = self.dropout(x)

        return self.rt_decoder(x).squeeze(1)

In [None]:
class RT_Transformer_Module(torch.nn.Module):
    """Transformer-based RT regression network built from peptdeep blocks.

    The network chains ``model_base.Encoder_AA_Mod_Transformer_AttnSum``,
    a dropout layer and ``model_base.Decoder_Linear``.

    Parameters
    ----------
    dropout : float, optional
        Probability used by the dropout layer that sits between the
        encoder and the decoder. Defaults to 0.2.
    hidden : int, optional
        Value passed to the encoder constructor and used as the first
        argument of the decoder constructor. Defaults to 128, which is
        the value that used to be hard-coded.
    """
    def __init__(self,
        dropout=0.2,
        hidden=128,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        # Encoder and decoder must agree on the same width, so both are
        # driven by the single `hidden` argument.
        self.encoder = model_base.Encoder_AA_Mod_Transformer_AttnSum(
            hidden
        )

        self.decoder = model_base.Decoder_Linear(
            hidden,1
        )

    def forward(self,
        aa_indices,
        mod_x,
    ):
        """Run encoder -> dropout -> decoder and squeeze dimension 1.

        Parameters
        ----------
        aa_indices : torch.Tensor
            Amino-acid index tensor, forwarded unchanged to the encoder.
        mod_x : torch.Tensor
            Modification feature tensor, forwarded unchanged to the encoder.

        Returns
        -------
        torch.Tensor
            Decoder output with dimension 1 squeezed out; presumably of
            shape (batch,) because the decoder is built with a second
            argument of 1 -- TODO confirm against ``Decoder_Linear``.
        """
        x = self.encoder(aa_indices, mod_x)
        x = self.dropout(x)

        return self.decoder(x).squeeze(1)

In [None]:
class RT_Model(model_base.ModelInterface):
    """RT model wrapper on top of ``model_base.ModelInterface``.

    It supplies the pieces specific to RT prediction: the network class,
    an L1 loss, the name of the prediction column, and the conversion of
    a batch DataFrame into feature and target tensors.
    """

    def __init__(
        self,
        model_class:torch.nn.Module=RT_LSTM_Module,
        dropout=0.1,
    ):
        """Build the network and attach the loss function.

        Parameters
        ----------
        model_class : torch.nn.Module subclass, optional
            The module *class* (not an instance) handed to ``self.build``.
            Defaults to ``RT_LSTM_Module``.
        dropout : float, optional
            Forwarded to ``self.build`` as the ``dropout`` keyword.
            Defaults to 0.1.
        """
        super().__init__()
        self.build(model_class, dropout=dropout)
        # Mean absolute error between predicted and normalised RT.
        self.loss_func = torch.nn.L1Loss()

    def _prepare_predict_data_df(self, precursor_df:pd.DataFrame):
        """Register 'rt_pred' as the output column and initialise it to 0.

        The column is written into ``precursor_df`` itself (no copy is
        made), and that same frame is stored as ``self.predict_df``.
        """
        self._predict_column_in_df = 'rt_pred'
        precursor_df[self._predict_column_in_df] = 0.
        self.predict_df = precursor_df

    def _get_features_from_batch_df(self, batch_df: pd.DataFrame):
        """Turn a batch DataFrame into the ``(aa_indices, mod_x)`` tensors.

        ``aa_indices`` comes from the 'sequence' column cast to a unicode
        array and passed through ``get_batch_aa_indices``; ``mod_x`` comes
        from ``get_batch_mod_feature`` applied to the whole batch frame.
        """
        sequences = batch_df['sequence'].values.astype('U')
        aa_indices = torch.LongTensor(get_batch_aa_indices(sequences))

        mod_features = get_batch_mod_feature(batch_df)
        mod_x = torch.Tensor(mod_features)

        return aa_indices, mod_x

    def _get_targets_from_batch_df(
        self,
        batch_df: pd.DataFrame,
    ) -> torch.Tensor:
        """Return the 'rt_norm' column of the batch as a ``torch.Tensor``."""
        rt_norm_values = batch_df['rt_norm'].values
        return torch.Tensor(rt_norm_values)

## Testing the RT model

### Prepare training data

In [None]:
from peptdeep.model.rt import irt_pep
# Min-max scale the iRT values into 'rt_norm': (irt - min) / (max - min).
# The column is added to the imported `irt_pep` frame itself.
irt_pep['rt_norm'] = irt_pep.irt.sub(irt_pep.irt.min()).div(
    irt_pep.irt.max() - irt_pep.irt.min()
)
irt_pep

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,rt_norm
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


In [None]:
# Wrap the LSTM-based module in RT_Model; RT_Model's default dropout (0.1) is used.
rt_model = RT_Model(model_class=RT_LSTM_Module)

### Test the untrained model

In [None]:
# Predict before any training; the result is displayed with an added 'rt_pred' column.
rt_model.predict(irt_pep)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,rt_norm,rt_pred
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.0
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.0
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.0
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.0
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.0
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.0
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.0
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.0
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.0


### Test if training works

In [None]:
# Fit the model on the iRT peptides for 100 epochs (targets come from 'rt_norm'), with verbose output off.
rt_model.train(irt_pep, epoch=100, verbose=False)

### Test if the model fits the irt_pep data

In [None]:
# Predict on the training peptides again; compare 'rt_pred' against 'rt_norm' to check the fit.
rt_model.predict(irt_pep)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,rt_norm,rt_pred
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.002638
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.187863
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.277881
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.351155
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.405445
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.448743
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.515632
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.621282
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.737075
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.865838


### Get number of model parameters

In [None]:
# Parameter count reported by the model interface for the LSTM-based model.
rt_model.get_parameter_num()

232448

### It is easy to switch the model to a Transformer
Users can add more `nn.Module`s without re-designing the AA/PTM feature extraction parts.

In [None]:
# Rebind `rt_model` to a Transformer-based model; the LSTM model built earlier is replaced.
rt_model = RT_Model(model_class=RT_Transformer_Module)
# Train for 50 epochs, passing warmup_epoch=20 to the training routine.
rt_model.train(irt_pep, epoch=50, warmup_epoch=20)
# Predict on the training peptides to check the fit.
rt_model.predict(irt_pep)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,rt_norm,rt_pred
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.165758
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.262574
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.334137
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.399647
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.462391
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.599279
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.645319
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.797928
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.913671


In [None]:
# Parameter count reported by the model interface for the Transformer-based model.
rt_model.get_parameter_num()

804984