In [None]:
#default_exp model.generic_property_prediction

In [None]:
%reload_ext autoreload
%autoreload 2

# Generic Property Prediction

## Description
### Scope of the generic property prediction
One focus of the AlphaPeptDeep framework is the prediction of retention times, MS2 spectra and collisional cross sections of peptides. The models that predict these properties are defined in the notebooks `rt.ipynb`, `ms2.ipynb` and `ccs.ipynb`, respectively. In this notebook, we define more generalized models, which in principle allow arbitrary properties of peptides to be predicted or classified. The classes allow users to create new deep learning predictions of peptide properties, even without prior expertise in deep learning.
### How to build new models, even without experience in deep learning
In order to predict or classify novel properties of peptides, the user simply needs to provide a list of peptides with a corresponding property (e.g. 'binding affinity') for each peptide. The user only needs to initialize one of the `ModelInterface` classes below and specify the name of the `target_column_to_train` and, optionally, the `target_column_to_predict`. The generic model classes will try to derive predictions or classifications of the property from the peptide sequence. Additional classes are also provided that complement the peptide sequence information with site-specific PTM information.

## Imports

In [None]:
#export
import torch
import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface
import pandas as pd
import numpy as np

# 128 is the size of the 7-bit ASCII table. The constant is not referenced by
# any other cell of this notebook -- presumably it is the vocabulary size for
# ASCII-encoded sequences; TODO confirm against its users.
ASCII_NUM=128

## Regression models for predicting a scalar value for a given amino acid sequence

In [None]:
#export

class Model_for_Generic_AASeq_Regression_LSTM(torch.nn.Module):
    """
    Sequence-only regression network that returns one scalar per input row.

    The layers are chained in a single `torch.nn.Sequential` (`self.nn`):
    ASCII embedding -> `SeqCNN` -> dropout -> `SeqLSTM` -> `SeqAttentionSum`
    -> dropout -> Linear(hidden_dim, 64) -> GELU -> Linear(64, 1).
    The same `Dropout` module instance is used at both dropout positions.
    """
    def __init__(self,
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        hidden_dim: width of the LSTM and of the input to the output head;
            the embedding and the CNN are built with `hidden_dim//4`
            (presumably the CNN widens its input fourfold -- confirm in
            `building_block.SeqCNN`).
        n_lstm_layers: passed to `SeqLSTM` as `rnn_layer`.
        dropout: probability used by the shared `Dropout` module.
        **kwargs: accepted but not used by this class.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        embed_dim = hidden_dim // 4
        # The layers are instantiated in pipeline order so that parameter
        # initialisation happens in the same sequence as the module indices.
        layers = [
            building_block.ascii_embedding(embed_dim),
            building_block.SeqCNN(embed_dim),
            self.dropout,
            building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=n_lstm_layers),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.nn = torch.nn.Sequential(*layers)

    def forward(self, aa_x):
        """Run `aa_x` through `self.nn` and drop the trailing size-1 axis."""
        prediction = self.nn(aa_x)
        return prediction.squeeze(-1)

class Model_for_Generic_AASeq_Regression_Transformer(torch.nn.Module):
    """
    Transformer regression model that returns one scalar per input sequence.

    Pipeline: ASCII embedding -> dropout -> `hidden_nn` (transformer with
    positional encoder) -> dropout -> `SeqAttentionSum` -> PReLU -> dropout
    -> Linear(hidden_dim, 1).

    `self.attentions` is None until a forward pass has run with
    `output_attentions` enabled; it then holds the second element returned
    by `hidden_nn`.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer Architecture from 
        Huggingface's BertEncoder class.

        hidden_dim: embedding size and transformer width.
        nlayers: number of transformer layers, passed to `hidden_nn`.
        output_attentions: whether `hidden_nn` is asked to return attentions.
        dropout: probability for the shared `Dropout` module and `hidden_nn`.
        **kwargs: accepted but not used by this class.
        """
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn =  torch.nn.Sequential(
            building_block.ascii_embedding(hidden_dim),
        )

        # Store the flag directly: the property setter forwards the value to
        # `hidden_nn`, which has not been created yet at this point.
        self._output_attentions = output_attentions
        # Defined up front so the attribute can be read before the first forward pass.
        self.attentions = None
        
        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim, nlayers=nlayers, dropout=dropout,
            output_attentions=output_attentions
        )

        self.output_nn = torch.nn.Sequential(
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        )

    @property
    def output_attentions(self)->bool:
        """Whether `forward` stores the attentions returned by `hidden_nn`."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        self._output_attentions = val
        # Keep the transformer block in sync; otherwise `forward` would index
        # an attentions entry that the block was never asked to return.
        # NOTE(review): assumes `hidden_nn` reads its own `output_attentions`
        # attribute at call time -- confirm in `building_block`.
        self.hidden_nn.output_attentions = val

    def forward(self, aa_x):
        """Return the prediction for `aa_x` with the size-1 output axis removed."""
        aa_x = self.dropout(self.input_nn(aa_x))

        # `hidden_nn` returns a sequence: element 0 is the hidden state and
        # element 1 the attentions (only read when they were requested).
        aa_x = self.hidden_nn(aa_x)
        if self.output_attentions:
            self.attentions = aa_x[1]
        else:
            self.attentions = None
        aa_x = self.dropout(aa_x[0])

        return self.output_nn(aa_x).squeeze(1)


class ModelInterface_for_Generic_AASeq_Regression(ModelInterface):
    """
    `ModelInterface` for regressing one scalar property per peptide from its
    amino acid sequence.

    Defaults set by the constructor: L1 loss, training column
    'detected_property' and prediction column 'predicted_property'. The
    example cells of this notebook reassign both column names after
    construction.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=Model_for_Generic_AASeq_Regression_LSTM, #one of the two models defined above can be specified, default LSTM
        device:str='gpu',
        **kwargs,
    ):
        """
        dropout: forwarded to `build` as the model's `dropout` keyword.
        model_class: the model class itself (not an instance), handed to `build`.
        device: forwarded unchanged to `ModelInterface.__init__`.
        **kwargs: forwarded to `build` -- presumably they end up as
            keyword arguments of `model_class`; confirm in `ModelInterface.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # Assigned after `build`, so this is the loss that stays in effect.
        self.loss_func = torch.nn.L1Loss() # for regression

        # Default column names; callers are expected to overwrite them.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

### Examples

#### Define example Table

In [None]:
def create_example_input_dataframe_normalized_irt():
    """
    Return a copy of `IRT_PEPTIDE_DF` with an extra `normalized_irt` column.

    `normalized_irt` is the `irt` column min-max scaled:
    (irt - min(irt)) / (max(irt) - min(irt)).
    """
    from peptdeep.model.rt import IRT_PEPTIDE_DF

    # Work on a copy so the table owned by `peptdeep.model.rt` stays untouched.
    irt_df = IRT_PEPTIDE_DF.copy()
    irt_min = irt_df.irt.min()
    irt_range = irt_df.irt.max() - irt_min
    irt_df['normalized_irt'] = (irt_df.irt - irt_min) / irt_range
    return irt_df

# Build the example table; as the last expression of the cell it is rendered as the cell output.
create_example_input_dataframe_normalized_irt()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


#### Building an RT model based on `Model_for_Generic_AASeq_Regression_LSTM`

In [None]:
# Example table: iRT peptides with a min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()
display(example_df)

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_LSTM
)
# Name of the column that holds the values to train on.
model.target_column_to_train = 'normalized_irt'
# Name of the column the predictions are written to (optional).
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.280662
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.316936
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.401811
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.528301
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.506149
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.723796
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.650295
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.701642
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.79289


#### Building an RT model for only sequences based on `Model_for_Generic_AASeq_Regression_Transformer`

In [None]:
# Example table: iRT peptides with a min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()
display(example_df)

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_Transformer
)
# Name of the column that holds the values to train on.
model.target_column_to_train = 'normalized_irt'
# Name of the column the predictions are written to (optional).
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.445867
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.691615
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.673369
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.665059
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.795825
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.789972
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.865027
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.87835
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,1.012773
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,1.074674


## Regression models for predicting a scalar value for a given amino acid sequence and site-specific PTMs

In [None]:
#export
class Model_for_Generic_Mod_AASeq_Regression_LSTM(torch.nn.Module):
    """
    Regression network that takes an amino acid input and a modification
    input and returns one scalar per row.

    `encoder_nn` (`Encoder_AsciiAA_Mod_CNN_LSTM_AttnSum`) encodes both inputs;
    `output_nn` is the head: dropout -> Linear(hidden_dim, 64) -> GELU ->
    Linear(64, 1).
    """
    def __init__(self,
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        hidden_dim: size handed to the encoder and input width of the head.
        n_lstm_layers: forwarded to the encoder under the same keyword.
        dropout: probability of the `Dropout` module in front of the head.
        **kwargs: accepted but not used by this class.
        """
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.encoder_nn = building_block.Encoder_AsciiAA_Mod_CNN_LSTM_AttnSum(
            hidden_dim, n_lstm_layers=n_lstm_layers,
        )

        # Head layers, listed in the order they are applied.
        head_layers = [
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    def forward(self, aa_x, mod_x):
        """Encode both inputs, apply the head and drop the trailing size-1 axis."""
        encoded = self.encoder_nn(aa_x, mod_x)
        prediction = self.output_nn(encoded)
        return prediction.squeeze(-1)

class Model_for_Generic_Mod_AASeq_Regression_Transformer(torch.nn.Module):
    """
    Transformer regression model that takes amino acid indices and a
    modification input and returns one scalar per row.

    Pipeline: `AA_Mod_Embedding` -> dropout -> `hidden_nn` (transformer with
    positional encoder) -> add 0.2 x the embedded input -> dropout ->
    `SeqAttentionSum` -> PReLU -> dropout -> Linear(hidden_dim, 1).

    `self.attentions` is None until a forward pass has run with
    `output_attentions` enabled; it then holds the second element returned
    by `hidden_nn`.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer Architecture from 
        Huggingface's BertEncoder class.

        hidden_dim: embedding size and transformer width.
        nlayers: number of transformer layers, passed to `hidden_nn`.
        output_attentions: whether `hidden_nn` is asked to return attentions.
        dropout: probability for the shared `Dropout` module and `hidden_nn`.
        **kwargs: accepted but not used by this class.
        """
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn = building_block.AA_Mod_Embedding(hidden_dim)

        # Stored directly (not through the property): the setter forwards the
        # value to `hidden_nn`, which has not been created yet at this point.
        self._output_attentions = output_attentions
        # Defined up front so the attribute can be read before the first forward pass.
        self.attentions = None
        
        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim, nlayers=nlayers, dropout=dropout,
            output_attentions=output_attentions
        )

        self.output_nn = torch.nn.Sequential(
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        )

    @property
    def output_attentions(self)->bool:
        """Whether `forward` stores the attentions returned by `hidden_nn`."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        self._output_attentions = val
        # Keep the transformer block in sync; otherwise `forward` would index
        # an attentions entry that the block was never asked to return.
        # NOTE(review): assumes `hidden_nn` reads its own `output_attentions`
        # attribute at call time -- confirm in `building_block`.
        self.hidden_nn.output_attentions = val

    def forward(self, 
        aa_indices, 
        mod_x,
    ):
        """Return the prediction with the size-1 output axis removed."""
        x = self.dropout(self.input_nn(
            aa_indices, mod_x
        ))

        # `hidden_nn` returns a sequence: element 0 is the hidden state and
        # element 1 the attentions (only read when they were requested).
        hidden_x = self.hidden_nn(x)
        if self.output_attentions:
            self.attentions = hidden_x[1]
        else:
            self.attentions = None
        # Residual connection: the embedded input is added back, scaled by 0.2.
        x = self.dropout(hidden_x[0]+x*0.2)

        return self.output_nn(x).squeeze(1)

class ModelInterface_for_Generic_Mod_AASeq_Regression(ModelInterface):
    """
    `ModelInterface` for regressing one scalar property per peptide from its
    amino acid sequence together with its modifications.

    Defaults set by the constructor: L1 loss, training column
    'detected_property' and prediction column 'predicted_property'. The
    example cells of this notebook reassign both column names after
    construction.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=Model_for_Generic_Mod_AASeq_Regression_LSTM, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        dropout: forwarded to `build` as the model's `dropout` keyword.
        model_class: the model class itself (not an instance), handed to `build`.
        device: forwarded unchanged to `ModelInterface.__init__`.
        **kwargs: forwarded to `build` -- presumably they end up as
            keyword arguments of `model_class`; confirm in `ModelInterface.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # Assigned after `build`, so this is the loss that stays in effect.
        self.loss_func = torch.nn.L1Loss() # for regression

        # Default column names; callers are expected to overwrite them.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the model inputs for one batch by delegating to the inherited
        `_get_aa_mod_features`. `**kwargs` is accepted but ignored.
        NOTE(review): presumably yields the (aa, mod) pair that the `Mod`
        models' `forward` expects -- confirm in `ModelInterface`.
        """
        return self._get_aa_mod_features(batch_df)

#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_Mod_AASeq_Regression_LSTM`

In [None]:
# Example table: iRT peptides with a min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_Mod_AASeq_Regression(
    model_class=Model_for_Generic_Mod_AASeq_Regression_LSTM
)
# Name of the column that holds the values to train on.
model.target_column_to_train = 'normalized_irt'
# Name of the column the predictions are written to (optional).
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.202916
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.1554
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.291
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.324772
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.271285
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.725914
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.595412
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.723404
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.914466


#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_Mod_AASeq_Regression_Transformer`

In [None]:
# Example table: iRT peptides with a min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_Mod_AASeq_Regression(
    model_class=Model_for_Generic_Mod_AASeq_Regression_Transformer
)
# Name of the column that holds the values to train on.
model.target_column_to_train = 'normalized_irt'
# Name of the column the predictions are written to (optional).
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.0
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.00701
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.136426
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.324049
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.340585
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.279297
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.418784
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.605927
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.682211


## Binary classification models for a given amino acid sequence

In [None]:
#export
class Model_for_Generic_AASeq_BinaryClassification_LSTM(
    Model_for_Generic_AASeq_Regression_LSTM
):
    """
    Binary classifier built on `Model_for_Generic_AASeq_Regression_LSTM`:
    the parent's scalar output is passed through a sigmoid, so `forward`
    returns values in (0, 1).
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        All arguments are handed to the parent constructor unchanged.
        Extra keyword arguments are forwarded too, as the other
        classification subclasses in this notebook do, instead of being
        dropped here.
        """
        super().__init__(
            hidden_dim=hidden_dim,
            n_lstm_layers=n_lstm_layers,
            dropout=dropout,
            **kwargs,
        )

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for `aa_x`."""
        x = super().forward(aa_x)
        return torch.sigmoid(x)

class Model_for_Generic_AASeq_BinaryClassification_Transformer(
    Model_for_Generic_AASeq_Regression_Transformer
):
    """
    Binary classifier built on `Model_for_Generic_AASeq_Regression_Transformer`:
    the parent's scalar output is passed through a sigmoid, so `forward`
    returns values in (0, 1).
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Transformer-based model; every argument, including any extra
        keyword argument, is handed to the parent constructor unchanged.
        """
        parent_args = dict(
            hidden_dim=hidden_dim,
            nlayers=nlayers,
            output_attentions=output_attentions,
            dropout=dropout,
        )
        super().__init__(**parent_args, **kwargs)

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for `aa_x`."""
        return torch.sigmoid(super().forward(aa_x))

class ModelInterface_for_Generic_AASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for binary classification of peptides from their amino
    acid sequence.

    Defaults set by the constructor: binary cross-entropy loss (`BCELoss`),
    training column 'detected_prob' and prediction column 'predicted_prob'.
    The example cells of this notebook reassign both column names after
    construction.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=Model_for_Generic_AASeq_BinaryClassification_LSTM, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        Build the classification model and set the loss and default columns.

        dropout: forwarded to `build` as the model's `dropout` keyword.
        model_class: the model class itself (not an instance), handed to `build`.
        device: forwarded unchanged to `ModelInterface.__init__`.
        **kwargs: forwarded to `build` -- presumably they end up as
            keyword arguments of `model_class`; confirm in `ModelInterface.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # `BCELoss` expects probabilities; the classification models defined
        # in this notebook apply a sigmoid in their `forward`.
        self.loss_func = torch.nn.BCELoss() # for binary classification
        # Default column names; callers are expected to overwrite them.
        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

### Examples

In [None]:
def create_example_input_dataframe_classification_rt():
    """
    Return the normalized-iRT example table with an extra 0/1 label column
    `is_in_first_half_of_column`.
    """
    label_column = 'is_in_first_half_of_column'
    rt_df = create_example_input_dataframe_normalized_irt()
    # Every row starts with label 0 ...
    rt_df[label_column] = 0
    # ... then index labels up to and including 5 are set to 1
    # (`.loc` slices by label and includes the end point).
    rt_df.loc[:5, label_column] = 1
    return rt_df

#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_LSTM`

In [None]:
# Example table with the 0/1 label column `is_in_first_half_of_column`.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_LSTM
)
# Name of the column that holds the 0/1 labels to train on.
model.target_column_to_train = 'is_in_first_half_of_column'
# Name of the column the predicted probabilities are written to (optional).
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.990433
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.989925
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.989627
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.990521
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.990736
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.990266
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.356433
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.36448
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.359525
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.356143


#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_Transformer`

In [None]:
# Example table with the 0/1 label column `is_in_first_half_of_column`.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer
)
# Name of the column that holds the 0/1 labels to train on.
model.target_column_to_train = 'is_in_first_half_of_column'
# Name of the column the predicted probabilities are written to (optional).
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.997611
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.996842
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.99494
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.997572
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.99551
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.994901
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.003404
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.003937
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.003203
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.003172


## Binary Classification Models for Given Amino Acid Sequence and Site-specific PTMs

In [None]:
#export
class Model_for_Generic_Mod_AASeq_BinaryClassification_LSTM(
    Model_for_Generic_Mod_AASeq_Regression_LSTM
):
    """
    Binary classifier built on `Model_for_Generic_Mod_AASeq_Regression_LSTM`:
    the parent's scalar output is passed through a sigmoid, so `forward`
    returns values in (0, 1).
    """
    def __init__(self, 
        *,
        hidden_dim=256,
        n_lstm_layers=4,
        dropout=0.1,
        **kwargs,
    ):
        """
        Every argument, including any extra keyword argument, is handed to
        the parent constructor unchanged.
        """
        parent_args = dict(
            hidden_dim=hidden_dim,
            n_lstm_layers=n_lstm_layers,
            dropout=dropout,
        )
        super().__init__(**parent_args, **kwargs)

    def forward(self, aa_x, mod_x):
        """Return the sigmoid of the parent model's output for the two inputs."""
        return torch.sigmoid(super().forward(aa_x, mod_x))


class Model_for_Generic_Mod_AASeq_BinaryClassification_Transformer(
    Model_for_Generic_Mod_AASeq_Regression_Transformer
):
    """
    Binary classifier built on
    `Model_for_Generic_Mod_AASeq_Regression_Transformer`: the parent's scalar
    output is passed through a sigmoid, so `forward` returns values in (0, 1).
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer Architecture from 
        Huggingface's BertEncoder class.

        Every argument, including any extra keyword argument, is handed to
        the parent constructor unchanged.
        """
        super().__init__(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_attentions=output_attentions,
            dropout=dropout,
            **kwargs
        )

    # The `output_attentions` property (getter and setter) is inherited from
    # the parent class. Re-declaring an identical copy here would only shadow
    # the parent's accessor and let the two drift apart.

    def forward(self, 
        aa_indices, 
        mod_x,
    ):
        """Return the sigmoid of the parent model's output for the two inputs."""
        x = super().forward(aa_indices, mod_x)
        return torch.sigmoid(x)


class ModelInterface_for_Generic_Mod_AASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for binary classification of peptides from their amino
    acid sequence together with their modifications.

    Defaults set by the constructor: binary cross-entropy loss (`BCELoss`),
    training column 'detected_prob' and prediction column 'predicted_prob'.
    The example cells of this notebook reassign both column names after
    construction.
    """
    def __init__(self, 
        dropout=0.1,
        model_class:torch.nn.Module=Model_for_Generic_Mod_AASeq_BinaryClassification_LSTM, #model defined above
        device:str='gpu',
        **kwargs,
    ):
        """
        dropout: forwarded to `build` as the model's `dropout` keyword.
        model_class: the model class itself (not an instance), handed to `build`.
        device: forwarded unchanged to `ModelInterface.__init__`.
        **kwargs: forwarded to `build` -- presumably they end up as
            keyword arguments of `model_class`; confirm in `ModelInterface.build`.
        """
        super().__init__(device=device)
        self.build(
            model_class,
            dropout=dropout,
            **kwargs
        )
        # `BCELoss` expects probabilities; the classification models defined
        # in this notebook apply a sigmoid in their `forward`.
        self.loss_func = torch.nn.BCELoss() # for binary classification

        # Default column names; callers are expected to overwrite them.
        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the model inputs for one batch by delegating to the inherited
        `_get_aa_mod_features`.

        `**kwargs` is accepted and ignored so the signature matches the same
        override in `ModelInterface_for_Generic_Mod_AASeq_Regression`; a
        caller that passes extra keyword arguments no longer gets a TypeError.
        """
        return self._get_aa_mod_features(batch_df)

### Examples

In [None]:
# NOTE(review): this cell repeats, name and body, the helper already defined in
# the "Examples" cell of the sequence-only binary classification section; when
# the notebook is run top to bottom this definition replaces that one.
# Consider keeping a single definition.
def create_example_input_dataframe_classification_rt():
    """
    Return the normalized-iRT example table with an extra 0/1 label column
    `is_in_first_half_of_column`.
    """
    label_column = 'is_in_first_half_of_column'
    rt_df = create_example_input_dataframe_normalized_irt()
    # Every row starts with label 0 ...
    rt_df[label_column] = 0
    # ... then index labels up to and including 5 are set to 1
    # (`.loc` slices by label and includes the end point).
    rt_df.loc[:5, label_column] = 1
    return rt_df

#### A sequence classification model using `Model_for_Generic_Mod_AASeq_BinaryClassification_LSTM`

In [None]:
# Example table with the 0/1 label column `is_in_first_half_of_column`.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_Mod_AASeq_BinaryClassification(
    model_class=Model_for_Generic_Mod_AASeq_BinaryClassification_LSTM
)
# Name of the column that holds the 0/1 labels to train on.
model.target_column_to_train = 'is_in_first_half_of_column'
# Name of the column the predicted probabilities are written to (optional).
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.990758
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.990021
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.991787
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.990449
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.989052
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.986801
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.329738
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.33201
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.341253
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.332452


#### A sequence classification model using `Model_for_Generic_Mod_AASeq_BinaryClassification_Transformer`

In [None]:
# Example table with the 0/1 label column `is_in_first_half_of_column`.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and choose which model class it should build.
model = ModelInterface_for_Generic_Mod_AASeq_BinaryClassification(
    model_class=Model_for_Generic_Mod_AASeq_BinaryClassification_Transformer
)
# Name of the column that holds the 0/1 labels to train on.
model.target_column_to_train = 'is_in_first_half_of_column'
# Name of the column the predicted probabilities are written to (optional).
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs on the example table, then predict for the same rows.
# No random seed is set in this cell, so the numbers may differ from the saved output.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.997072
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.997123
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.996509
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.997287
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.99714
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.997106
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.00653
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.00374
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.003015
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.002974
