In [None]:
#default_exp model.model_shop

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#export
import torch
import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface
from peptdeep.model.featurize import (
    get_ascii_indices, get_batch_mod_feature
)
import pandas as pd
import numpy as np

ASCII_NUM=128

# Regression models for predicting a scalar value for a given amino acid sequence

In [None]:
#export

class ScalarRegression_LSTM_Model_for_AASeq(torch.nn.Module):
    """
    Scalar regressor for unmodified amino-acid sequences.

    The whole network is one `torch.nn.Sequential` stored in `self.nn`,
    chaining (in this order): ASCII embedding -> SeqCNN -> dropout ->
    SeqLSTM -> SeqAttentionSum -> dropout -> Linear(hidden_dim, 64) ->
    GELU -> Linear(64, 1).

    Parameters
    ----------
    hidden_dim : int
        Width of the LSTM / attention stage. The embedding and CNN are
        created with `hidden_dim//4`.
        NOTE(review): presumably `SeqCNN` widens `hidden_dim//4` back to
        `hidden_dim` before the LSTM -- confirm in `building_block`.
    n_lstm_layers : int
        Forwarded to `building_block.SeqLSTM` as `rnn_layer`.
    dropout : float
        Probability of the single `torch.nn.Dropout` module, which is
        inserted at two positions of the stack.
    **kwargs
        Accepted and ignored.
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        embed_dim = hidden_dim//4
        # Order matters: the position inside the Sequential defines the
        # parameter names (`nn.0`, `nn.1`, ...) used by saved weights.
        layers = [
            building_block.ascii_embedding(embed_dim),
            building_block.SeqCNN(embed_dim),
            self.dropout,
            building_block.SeqLSTM(
                hidden_dim, hidden_dim,
                rnn_layer=n_lstm_layers,
            ),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.nn = torch.nn.Sequential(*layers)

    def forward(self, aa_x):
        """Run `aa_x` through the stack and drop the trailing size-1 axis."""
        prediction = self.nn(aa_x)
        return prediction.squeeze(-1)

class ScalarRegression_Transformer_Model_for_AASeq(torch.nn.Module):
    """
    Transformer-based scalar regressor for unmodified amino-acid sequences.

    Pipeline: ASCII embedding (`input_nn`) -> dropout ->
    `building_block.HFace_Transformer_with_PositionalEncoder` (`hidden_nn`,
    described by the original author as based on Huggingface's BertEncoder)
    -> dropout -> `output_nn` (SeqAttentionSum, PReLU, dropout, Linear to 1).

    After every `forward` call, `self.attentions` holds the second element
    returned by `hidden_nn` when `output_attentions` is true, else `None`.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Parameters
        ----------
        hidden_dim : int
            Embedding / transformer width.
        nlayers : int
            Forwarded to the transformer block.
        output_attentions : bool
            Stored on this module and also forwarded to the transformer block.
        dropout : float
            Used for this module's `torch.nn.Dropout` and forwarded to the
            transformer block.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        embedding = building_block.ascii_embedding(hidden_dim)
        self.input_nn = torch.nn.Sequential(embedding)

        # Goes through the property setter below.
        self.output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = [
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self)->bool:
        """Whether `forward` stores the transformer's attentions."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        # NOTE(review): only this wrapper's flag is updated; `hidden_nn` keeps
        # whatever it was constructed with -- confirm that toggling after
        # construction is supported.
        self._output_attentions = val

    def forward(self, aa_x):
        """Return one scalar per sequence in the batch."""
        embedded = self.dropout(self.input_nn(aa_x))

        encoder_out = self.hidden_nn(embedded)
        # Element 0 is the hidden state; element 1 is only read when
        # attentions were requested.
        self.attentions = encoder_out[1] if self.output_attentions else None
        hidden = self.dropout(encoder_out[0])

        return self.output_nn(hidden).squeeze(1)

class ScalarRegression_ModelInterface_for_AASeq(ModelInterface):
    """
    `ModelInterface` wrapper for scalar regression on amino-acid sequences.

    Builds the network via `self.build(model_class, ...)`, installs an L1
    loss, and sets the training / prediction column names to
    `detected_property` / `predicted_property`.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=ScalarRegression_LSTM_Model_for_AASeq, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Parameters
        ----------
        dropout : float
            Forwarded to `self.build` as `dropout`.
        model_class : torch.nn.Module subclass
            The model class itself (not an instance); handed to `self.build`.
        device : str
            Forwarded to `ModelInterface.__init__`.
        **kwargs
            Forwarded unchanged to `self.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # Assigned after build().
        # NOTE(review): presumably replaces a default loss set by the base class -- confirm.
        self.loss_func = torch.nn.L1Loss() # for regression

        # DataFrame columns written on prediction / read on training.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

#### Building a RT model for only sequences based on `ScalarRegression_LSTM_Model_for_AASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Work on a copy so the frame held by `peptdeep.model.rt` is not modified.
IRT_PEPTIDE_DF = IRT_PEPTIDE_DF.copy()

# Min-max scale iRT into [0, 1]; the result is the regression target column.
irt_min = IRT_PEPTIDE_DF.irt.min()
irt_range = IRT_PEPTIDE_DF.irt.max() - irt_min
IRT_PEPTIDE_DF['detected_property'] = (IRT_PEPTIDE_DF.irt - irt_min) / irt_range

model = ScalarRegression_ModelInterface_for_AASeq(
    model_class=ScalarRegression_LSTM_Model_for_AASeq,
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_property`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_property,predicted_property
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.012358
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.219922
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.317224
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.418843
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.439064
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.470105
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.658076
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.691136
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.739845
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.942851


#### Building a RT model for only sequences based on `ScalarRegression_Transformer_Model_for_AASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Work on a copy so the frame held by `peptdeep.model.rt` is not modified.
IRT_PEPTIDE_DF = IRT_PEPTIDE_DF.copy()

# Min-max scale iRT into [0, 1]; the result is the regression target column.
irt_min = IRT_PEPTIDE_DF.irt.min()
irt_range = IRT_PEPTIDE_DF.irt.max() - irt_min
IRT_PEPTIDE_DF['detected_property'] = (IRT_PEPTIDE_DF.irt - irt_min) / irt_range

model = ScalarRegression_ModelInterface_for_AASeq(
    model_class=ScalarRegression_Transformer_Model_for_AASeq,
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_property`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_property,predicted_property
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.0
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.0
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.186036
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.271055
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.25523
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.303514
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.270619
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.542062
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.63497


# Binary classification models for a given amino acid sequence

In [None]:
#export
class BinaryClassification_LSTM_Model_for_AASeq(
    ScalarRegression_LSTM_Model_for_AASeq
):
    """
    Binary classifier for unmodified amino-acid sequences.

    Reuses the complete `ScalarRegression_LSTM_Model_for_AASeq` network and
    applies a sigmoid to its scalar output, so `forward` returns one value in
    (0, 1) per sequence.
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        Parameters
        ----------
        hidden_dim, n_lstm_layers, dropout
            Forwarded unchanged to the parent constructor.
        **kwargs
            Also forwarded to the parent constructor, matching what the other
            BinaryClassification_* model classes in this module do.
        """
        super().__init__(
            hidden_dim=hidden_dim,
            n_lstm_layers=n_lstm_layers,
            dropout=dropout,
            **kwargs,
        )

    def forward(self, aa_x):
        """Return sigmoid of the parent's scalar regression output."""
        x = super().forward(aa_x)
        return torch.sigmoid(x)

class BinaryClassification_Transformer_Model_for_AASeq(
    ScalarRegression_Transformer_Model_for_AASeq
):
    """
    Binary classifier for unmodified amino-acid sequences.

    Reuses the complete `ScalarRegression_Transformer_Model_for_AASeq`
    network and applies a sigmoid to its scalar output.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer architecture from
        Huggingface's BertEncoder class.

        All arguments, including `**kwargs`, are forwarded unchanged to
        `ScalarRegression_Transformer_Model_for_AASeq.__init__`.
        """
        super().__init__(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_attentions=output_attentions,
            dropout=dropout,
            **kwargs,
        )

    def forward(self, aa_x):
        # Parent returns one unbounded scalar per sequence; sigmoid maps it into (0, 1).
        x = super().forward(aa_x)
        return torch.sigmoid(x)

class BinaryClassification_ModelInterface_for_AASeq(ModelInterface):
    """
    `ModelInterface` wrapper for binary classification of amino-acid sequences.

    Builds the network via `self.build(model_class, ...)`, installs a binary
    cross-entropy loss, and sets the training / prediction column names to
    `detected_prob` / `predicted_prob`.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=BinaryClassification_LSTM_Model_for_AASeq, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Class to predict a binary-class probability for each sequence in a
        dataframe.

        Parameters
        ----------
        dropout : float
            Forwarded to `self.build` as `dropout`.
        model_class : torch.nn.Module subclass
            The model class itself (not an instance); handed to `self.build`.
        device : str
            Forwarded to `ModelInterface.__init__`.
        **kwargs
            Forwarded unchanged to `self.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # BCELoss expects probabilities, which is why the models above end in a sigmoid.
        self.loss_func = torch.nn.BCELoss() # for binary classification
        # DataFrame columns written on prediction / read on training.
        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

#### A sequence classification model using `BinaryClassification_LSTM_Model_for_AASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Copy first so the toy labels are added to a local frame, not the imported one.
IRT_PEPTIDE_DF=IRT_PEPTIDE_DF.copy()
# Toy labels: start with every peptide in class 0 ...
IRT_PEPTIDE_DF['detected_prob'] = 0
# ... then mark rows labelled 0..5 as class 1 (`.loc` label slices include the end label).
IRT_PEPTIDE_DF.loc[:5,'detected_prob']=1
model = BinaryClassification_ModelInterface_for_AASeq(
    model_class=BinaryClassification_LSTM_Model_for_AASeq
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_prob`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_prob,predicted_prob
0,LGGNEQVTR,RT-pep a,-24.92,,,9,1,0.990362
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,1,0.990646
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,1,0.990164
3,YILAGVENSK,RT-pep d,19.79,,,10,1,0.989409
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,1,0.989448
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,1,0.985974
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0,0.373001
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0,0.376261
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0,0.38411
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0,0.371762


#### A sequence classification model using `BinaryClassification_Transformer_Model_for_AASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Copy first so the toy labels are added to a local frame, not the imported one.
IRT_PEPTIDE_DF=IRT_PEPTIDE_DF.copy()
# Toy labels: start with every peptide in class 0 ...
IRT_PEPTIDE_DF['detected_prob'] = 0
# ... then mark rows labelled 0..5 as class 1 (`.loc` label slices include the end label).
IRT_PEPTIDE_DF.loc[:5,'detected_prob']=1
model = BinaryClassification_ModelInterface_for_AASeq(
    model_class=BinaryClassification_Transformer_Model_for_AASeq
)
# Note: this demo trains for 10 epochs, while the LSTM demos use 20.
model.train(IRT_PEPTIDE_DF, epoch=10)
# Last expression of the cell: displays the frame with `predicted_prob`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_prob,predicted_prob
0,LGGNEQVTR,RT-pep a,-24.92,,,9,1,0.995044
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,1,0.986316
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,1,0.991748
3,YILAGVENSK,RT-pep d,19.79,,,10,1,0.994141
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,1,0.984417
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,1,0.976034
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0,0.032328
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0,0.039279
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0,0.01227
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0,0.00877


# Regression models for predicting a scalar value for a given amino acid sequence and site-specific PTMs

In [None]:
#export
class ScalarRegression_LSTM_Model_for_ModAASeq(torch.nn.Module):
    """
    Scalar regressor for amino-acid sequences with site-specific modifications.

    `encoder_nn` (`building_block.Encoder_AsciiAA_Mod_CNN_LSTM_AttnSum`)
    consumes the sequence and modification inputs; `output_nn` (dropout ->
    Linear(hidden_dim, 64) -> GELU -> Linear(64, 1)) turns the encoder output
    into a single value per peptide.

    Parameters
    ----------
    hidden_dim : int
        First positional argument of the encoder and input width of the head.
    n_lstm_layers : int
        Forwarded to the encoder.
    dropout : float
        Probability of the dropout layer at the start of the head.
    **kwargs
        Accepted and ignored.
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.encoder_nn = building_block.Encoder_AsciiAA_Mod_CNN_LSTM_AttnSum(
            hidden_dim, n_lstm_layers=n_lstm_layers,
        )

        # Positions in the Sequential define the parameter names
        # (`output_nn.1`, `output_nn.3`), so the order must stay fixed.
        head_layers = [
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    def forward(self, aa_x, mod_x):
        """Encode `(aa_x, mod_x)` and return the head output without its last axis."""
        encoded = self.encoder_nn(aa_x, mod_x)
        prediction = self.output_nn(encoded)
        return prediction.squeeze(-1)

class ScalarRegression_Transformer_Model_for_ModAASeq(torch.nn.Module):
    """
    Transformer-based scalar regressor for modified amino-acid sequences.

    Pipeline: `building_block.AA_Mod_Embedding` (`input_nn`) -> dropout ->
    `building_block.HFace_Transformer_with_PositionalEncoder` (`hidden_nn`,
    described by the original author as based on Huggingface's BertEncoder)
    -> scaled residual with the embedding -> dropout -> `output_nn`
    (SeqAttentionSum, PReLU, dropout, Linear to 1).

    After every `forward` call, `self.attentions` holds the second element
    returned by `hidden_nn` when `output_attentions` is true, else `None`.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Parameters
        ----------
        hidden_dim : int
            Embedding / transformer width.
        nlayers : int
            Forwarded to the transformer block.
        output_attentions : bool
            Stored on this module and also forwarded to the transformer block.
        dropout : float
            Used for this module's `torch.nn.Dropout` and forwarded to the
            transformer block.
        **kwargs
            Accepted and ignored.
        """
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn = building_block.AA_Mod_Embedding(hidden_dim)

        self._output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = [
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self)->bool:
        """Whether `forward` stores the transformer's attentions."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        # NOTE(review): only this wrapper's flag is updated; `hidden_nn` keeps
        # whatever it was constructed with -- confirm that toggling after
        # construction is supported.
        self._output_attentions = val

    def forward(self, 
        aa_indices, 
        mod_x,
    ):
        """Return one scalar per peptide in the batch."""
        embedded = self.dropout(self.input_nn(aa_indices, mod_x))

        encoder_out = self.hidden_nn(embedded)
        # Element 1 is only read when attentions were requested.
        self.attentions = encoder_out[1] if self.output_attentions else None

        # Residual connection: add 20% of the embedding to the encoder output.
        mixed = self.dropout(encoder_out[0]+embedded*0.2)

        return self.output_nn(mixed).squeeze(1)

class ScalarRegression_ModelInterface_for_ModAASeq(ModelInterface):
    """
    `ModelInterface` wrapper for scalar regression on modified sequences.

    Builds the network via `self.build(model_class, ...)`, installs an L1
    loss, sets the training / prediction column names to
    `detected_property` / `predicted_property`, and overrides feature
    extraction to return the result of `self._get_aa_mod_features`.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=ScalarRegression_LSTM_Model_for_ModAASeq, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Parameters
        ----------
        dropout : float
            Forwarded to `self.build` as `dropout`.
        model_class : torch.nn.Module subclass
            The model class itself (not an instance); handed to `self.build`.
        device : str
            Forwarded to `ModelInterface.__init__`.
        **kwargs
            Forwarded unchanged to `self.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        self.loss_func = torch.nn.L1Loss() # for regression

        # DataFrame columns written on prediction / read on training.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the model inputs for one batch.

        Delegates to `self._get_aa_mod_features(batch_df)`; `**kwargs` is
        accepted but not used.
        NOTE(review): presumably this yields the `(aa_x, mod_x)` pair that the
        ModAASeq models' `forward` expects -- confirm in `ModelInterface`.
        """
        return self._get_aa_mod_features(batch_df)

#### Scalar regression model (RT) with modified AA sequences using `ScalarRegression_LSTM_Model_for_ModAASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Work on a copy so the frame held by `peptdeep.model.rt` is not modified.
IRT_PEPTIDE_DF = IRT_PEPTIDE_DF.copy()

# Min-max scale iRT into [0, 1]; the result is the regression target column.
irt_min = IRT_PEPTIDE_DF.irt.min()
irt_range = IRT_PEPTIDE_DF.irt.max() - irt_min
IRT_PEPTIDE_DF['detected_property'] = (IRT_PEPTIDE_DF.irt - irt_min) / irt_range

model = ScalarRegression_ModelInterface_for_ModAASeq(
    model_class=ScalarRegression_LSTM_Model_for_ModAASeq,
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_property`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_property,predicted_property
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,9.4e-05
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.177103
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.207272
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.275805
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.360522
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.387341
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.423155
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.488202
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.679493
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.791727


#### Scalar regression model (RT) with modified AA sequences using `ScalarRegression_Transformer_Model_for_ModAASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Work on a copy so the frame held by `peptdeep.model.rt` is not modified.
IRT_PEPTIDE_DF = IRT_PEPTIDE_DF.copy()

# Min-max scale iRT into [0, 1]; the result is the regression target column.
irt_min = IRT_PEPTIDE_DF.irt.min()
irt_range = IRT_PEPTIDE_DF.irt.max() - irt_min
IRT_PEPTIDE_DF['detected_property'] = (IRT_PEPTIDE_DF.irt - irt_min) / irt_range

model = ScalarRegression_ModelInterface_for_ModAASeq(
    model_class=ScalarRegression_Transformer_Model_for_ModAASeq,
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_property`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_property,predicted_property
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.090805
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.251882
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.212338
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.44349
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.419389
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.423449
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.533382
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.485162
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.778576
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.836544


# Binary Classification Models for Given Amino Acid Sequence and Site-specific PTMs

In [None]:
#export
class BinaryClassification_LSTM_Model_for_ModAASeq(
    ScalarRegression_LSTM_Model_for_ModAASeq
):
    """
    Binary classifier for modified amino-acid sequences.

    Reuses the complete `ScalarRegression_LSTM_Model_for_ModAASeq` network
    and applies a sigmoid to its scalar output.
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        All arguments, including `**kwargs`, are forwarded unchanged to
        `ScalarRegression_LSTM_Model_for_ModAASeq.__init__`.
        """
        super().__init__(
            hidden_dim=hidden_dim,
            n_lstm_layers=n_lstm_layers,
            dropout=dropout,
            **kwargs,
        )

    def forward(self, aa_x, mod_x):
        # Parent returns one unbounded scalar per peptide; sigmoid maps it into (0, 1).
        x = super().forward(aa_x, mod_x)
        return torch.sigmoid(x)

class BinaryClassification_Transformer_Model_for_ModAASeq(
    ScalarRegression_Transformer_Model_for_ModAASeq
):
    """
    Binary classifier for modified amino-acid sequences.

    Reuses the complete `ScalarRegression_Transformer_Model_for_ModAASeq`
    network and applies a sigmoid to its scalar output. The
    `output_attentions` property (getter and setter) is inherited from the
    parent class and therefore not re-declared here.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer architecture from
        Huggingface's BertEncoder class.

        All arguments, including `**kwargs`, are forwarded unchanged to
        `ScalarRegression_Transformer_Model_for_ModAASeq.__init__`.
        """
        super().__init__(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_attentions=output_attentions,
            dropout=dropout,
            **kwargs
        )

    def forward(self, 
        aa_indices, 
        mod_x,
    ):
        """Return sigmoid of the parent's scalar regression output."""
        x = super().forward(aa_indices, mod_x)
        return torch.sigmoid(x)

class BinaryClassification_ModelInterface_for_ModAASeq(ModelInterface):
    """
    `ModelInterface` wrapper for binary classification of modified sequences.

    Builds the network via `self.build(model_class, ...)`, installs a binary
    cross-entropy loss, sets the training / prediction column names to
    `detected_prob` / `predicted_prob`, and overrides feature extraction to
    return the result of `self._get_aa_mod_features`.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=BinaryClassification_LSTM_Model_for_ModAASeq, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Parameters
        ----------
        dropout : float
            Forwarded to `self.build` as `dropout`.
        model_class : torch.nn.Module subclass
            The model class itself (not an instance); handed to `self.build`.
        device : str
            Forwarded to `ModelInterface.__init__`.
        **kwargs
            Forwarded unchanged to `self.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # BCELoss expects probabilities, which is why the models end in a sigmoid.
        self.loss_func = torch.nn.BCELoss() # for binary classification

        # DataFrame columns written on prediction / read on training.
        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the model inputs for one batch.

        Delegates to `self._get_aa_mod_features(batch_df)`. `**kwargs` is
        accepted (and ignored) so this override has the same signature as the
        one in `ScalarRegression_ModelInterface_for_ModAASeq` and does not
        raise if a caller passes extra keyword arguments.
        """
        return self._get_aa_mod_features(batch_df)

#### Binary classification model with modified AA sequences using `BinaryClassification_LSTM_Model_for_ModAASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Copy first so the toy labels are added to a local frame, not the imported one.
IRT_PEPTIDE_DF=IRT_PEPTIDE_DF.copy()
# Toy labels: start with every peptide in class 0 ...
IRT_PEPTIDE_DF['detected_prob'] = 0
# ... then mark rows labelled 0..5 as class 1 (`.loc` label slices include the end label).
IRT_PEPTIDE_DF.loc[:5,'detected_prob']=1
model = BinaryClassification_ModelInterface_for_ModAASeq(
    model_class=BinaryClassification_LSTM_Model_for_ModAASeq
)
model.train(IRT_PEPTIDE_DF, epoch=20)
# Last expression of the cell: displays the frame with `predicted_prob`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_prob,predicted_prob
0,LGGNEQVTR,RT-pep a,-24.92,,,9,1,0.994669
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,1,0.994937
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,1,0.994977
3,YILAGVENSK,RT-pep d,19.79,,,10,1,0.994884
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,1,0.99459
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,1,0.992918
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0,0.389787
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0,0.394886
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0,0.396302
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0,0.390515


#### Binary classification model with modified AA sequences using `BinaryClassification_Transformer_Model_for_ModAASeq`

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

# Copy first so the toy labels are added to a local frame, not the imported one.
IRT_PEPTIDE_DF=IRT_PEPTIDE_DF.copy()
# Toy labels: start with every peptide in class 0 ...
IRT_PEPTIDE_DF['detected_prob'] = 0
# ... then mark rows labelled 0..5 as class 1 (`.loc` label slices include the end label).
IRT_PEPTIDE_DF.loc[:5,'detected_prob']=1
model = BinaryClassification_ModelInterface_for_ModAASeq(
    model_class=BinaryClassification_Transformer_Model_for_ModAASeq
)
# Note: this demo trains for 10 epochs, while the LSTM demos use 20.
model.train(IRT_PEPTIDE_DF, epoch=10)
# Last expression of the cell: displays the frame with `predicted_prob`.
model.predict(IRT_PEPTIDE_DF)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,detected_prob,predicted_prob
0,LGGNEQVTR,RT-pep a,-24.92,,,9,1,0.993933
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,1,0.989393
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,1,0.986015
3,YILAGVENSK,RT-pep d,19.79,,,10,1,0.993998
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,1,0.990084
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,1,0.976695
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0,0.01592
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0,0.023054
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0,0.015328
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0,0.010076
