In [None]:
#| default_exp model.generic_property_prediction

In [None]:
%reload_ext autoreload
%autoreload 2

# Generic Property Prediction

## Description
### Scope of the generic property prediction
One focus of the AlphaPeptDeep framework is the prediction of retention times, MS2 spectra and collisional cross sections of peptides. The models that predict these properties are defined in the notebooks `rt.ipynb`, `ms2.ipynb` and `ccs.ipynb`, respectively. In this notebook, we define a more generalized model which, in principle, allows users to predict or classify arbitrary properties of peptides. The classes allow users to create new deep learning predictors of peptide properties, even without prior expertise in deep learning.
### How to build new models, even without experience in deep learning
In order to predict or classify novel properties of peptides, the user simply needs to provide a list of peptides with a corresponding property (e.g. 'binding affinity') for each peptide. The user only needs to initialize one of the `ModelInterface` classes below and specify the name of the `target_column_to_train` and, optionally, the `target_column_to_predict`. The generic model classes will try to derive predictions or classifications of the property from the peptide sequence. Additional classes are also provided that complement the peptide sequence information with site-specific PTM information.

## Imports

In [None]:
#| export
import torch
import pandas as pd
import numpy as np

import peptdeep.model.building_block as building_block
from peptdeep.model.model_interface import ModelInterface

In [None]:
#| export

# 128 is the number of code points in the 7-bit ASCII table (0-127).
# NOTE(review): this constant is not referenced by any other cell visible in
# this notebook; it is exported, so confirm nothing imports it before removing.
ASCII_NUM=128

## Regression models for predicting a scalar value for a given amino acid sequence

In [None]:
#| export

class Model_for_Generic_AASeq_Regression_LSTM(torch.nn.Module):
    """
    Generic LSTM regression model for AA sequence.

    The whole network is one `torch.nn.Sequential` stored in `self.nn`:
    ASCII embedding, sequence CNN, LSTM, attention-sum layer and a
    two-layer linear head whose last layer has a single output unit.

    Parameters
    ----------
    hidden_dim : int, optional
        Size given to the LSTM, the attention-sum layer and the first
        linear layer. The embedding and the CNN are built with
        `hidden_dim//4`. Defaults to 256.
    nlayers : int, optional
        Passed to `building_block.SeqLSTM` as `rnn_layer`. Defaults to 4.
    dropout : float, optional
        Probability of the `torch.nn.Dropout` layer. Defaults to 0.1.
    **kwargs
        Accepted but not used.
    """
    def __init__(self,
        *,
        hidden_dim=256,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        embedding_dim = hidden_dim//4
        # The layers are instantiated in stacking order; the same Dropout
        # instance is placed at two positions of the stack.
        stacked_layers = [
            building_block.ascii_embedding(embedding_dim),
            building_block.SeqCNN(embedding_dim),
            self.dropout,
            building_block.SeqLSTM(hidden_dim, hidden_dim, rnn_layer=nlayers),
            building_block.SeqAttentionSum(hidden_dim),
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.nn = torch.nn.Sequential(*stacked_layers)

    def forward(self, aa_x):
        """
        Run `aa_x` through the stack and drop the last dimension if it has size 1.

        `aa_x` is whatever `building_block.ascii_embedding` accepts
        (presumably a tensor of ASCII codes per residue -- TODO confirm).
        """
        stack_output = self.nn(aa_x)
        return stack_output.squeeze(-1)

class Model_for_Generic_AASeq_Regression_Transformer(torch.nn.Module):
    """
    Generic transformer regression model for AA sequence.

    Three stages are applied in `forward`: `input_nn` (ASCII embedding),
    `hidden_nn` (`building_block.HFace_Transformer_with_PositionalEncoder`)
    and `output_nn` (attention-sum layer, PReLU, dropout, one-unit linear layer).

    Parameters
    ----------
    hidden_dim : int, optional
        Size given to the embedding, the transformer, the attention-sum
        layer and the final linear layer. Defaults to 256.
    nlayers : int, optional
        Passed to the transformer block as `nlayers`. Defaults to 4.
    output_attentions : bool, optional
        Passed to the transformer block and stored as the
        `output_attentions` flag that `forward` reads. Defaults to False.
    dropout : float, optional
        Probability of the shared `torch.nn.Dropout` layer; also passed to
        the transformer block. Defaults to 0.1.
    **kwargs
        Accepted but not used.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        embedding = building_block.ascii_embedding(hidden_dim)
        self.input_nn = torch.nn.Sequential(embedding)

        self.output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = [
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self)->bool:
        """Flag read by `forward` to decide whether `self.attentions` is filled."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        # NOTE(review): only the local flag is updated here; `hidden_nn` got its
        # own `output_attentions` value at construction time. Whether it also
        # has to be updated is not visible from this notebook -- confirm.
        self._output_attentions = val

    def forward(self, aa_x):
        """
        Embed `aa_x`, run the transformer block and reduce to one value per item.

        Side effect: `self.attentions` is overwritten on every call with the
        second element of the transformer output when `output_attentions`
        is truthy, otherwise with `None`.
        """
        embedded = self.dropout(self.input_nn(aa_x))

        transformer_out = self.hidden_nn(embedded)
        self.attentions = transformer_out[1] if self.output_attentions else None
        encoded = self.dropout(transformer_out[0])

        prediction = self.output_nn(encoded)
        return prediction.squeeze(1)


class ModelInterface_for_Generic_AASeq_Regression(ModelInterface):
    """
    `ModelInterface` for Generic_AASeq_Regression models.

    Parameters
    ----------
    model_class : torch.nn.Module, optional
        Model class handed to `self.build`.
        Defaults to `Model_for_Generic_AASeq_Regression_LSTM`.
    dropout : float, optional
        Forwarded to `self.build`. Defaults to 0.1.
    device : str, optional
        Forwarded to `ModelInterface.__init__`. Defaults to 'gpu'.
    hidden_dim : int, optional
        Forwarded to `self.build`. Defaults to 256.
    nlayers : int, optional
        Forwarded to `self.build`. Defaults to 4.
    **kwargs
        Any further keyword arguments are forwarded to `self.build`.
    """
    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_Regression_LSTM,
        dropout=0.1,
        device: str = 'gpu',
        hidden_dim=256,
        nlayers=4,
        **kwargs,
    ):
        super().__init__(device=device)

        model_settings = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_settings, **kwargs)

        # L1 (mean absolute error) loss for the regression target
        self.loss_func = torch.nn.L1Loss()

        # Default column names; the examples below overwrite both.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

### Examples

#### Define example Table

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

In [None]:
def create_example_input_dataframe_normalized_irt():
    """
    Return a copy of `IRT_PEPTIDE_DF` with an added `normalized_irt` column.

    `normalized_irt` is the `irt` column min-max scaled, i.e. the smallest
    `irt` maps to 0 and the largest to 1. `IRT_PEPTIDE_DF` itself is not
    modified because the work is done on a copy.
    """
    irt_df = IRT_PEPTIDE_DF.copy()
    irt_lowest = irt_df.irt.min()
    irt_highest = irt_df.irt.max()
    irt_df['normalized_irt'] = (irt_df.irt-irt_lowest)/(irt_highest-irt_lowest)
    return irt_df

# Bare last expression: the returned DataFrame is rendered as the cell output.
create_example_input_dataframe_normalized_irt()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


#### Building an RT model based on `Model_for_Generic_AASeq_Regression_LSTM`

In [None]:
# Example table; its `normalized_irt` column is the training target below
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column that receives the predictions
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.263999
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.222347
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.341881
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.37949
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.4022
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.616874
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.667358
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.765801
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.88929


#### Building an RT model for only sequences based on `Model_for_Generic_AASeq_Regression_Transformer`

In [None]:
# Example table; its `normalized_irt` column is the training target below
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column that receives the predictions
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.000834
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.215954
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.29814
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.435898
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.487245
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.509875
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.667695
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.605467
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.916113
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,1.017125


## Regression models for predicting a scalar value for a given amino acid sequence and site-specific PTMs

In [None]:
#| export
class Model_for_Generic_ModAASeq_Regression_LSTM(torch.nn.Module):
    """
    Generic LSTM regression model for modified sequence.

    `encoder_nn` (`building_block.Encoder_AA_Mod_CNN_LSTM_AttnSum`) encodes
    the sequence together with its modifications; `output_nn` (dropout and
    two linear layers with a GELU in between) maps the encoding to one value.

    Parameters
    ----------
    hidden_dim : int, optional
        Size given to the encoder and to the first linear layer.
        Defaults to 256.
    nlayers : int, optional
        Passed to the encoder as `n_lstm_layers`. Defaults to 4.
    dropout : float, optional
        Probability of the `torch.nn.Dropout` layer. Defaults to 0.1.
    **kwargs
        Accepted but not used.
    """
    def __init__(self,
        *,
        hidden_dim=256,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        super().__init__()
        self.dropout = torch.nn.Dropout(dropout)

        self.encoder_nn = building_block.Encoder_AA_Mod_CNN_LSTM_AttnSum(
            hidden_dim, n_lstm_layers=nlayers
        )

        head_layers = [
            self.dropout,
            torch.nn.Linear(hidden_dim, 64),
            torch.nn.GELU(),
            torch.nn.Linear(64, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    def forward(self, aa_x, mod_x):
        """
        Encode `(aa_x, mod_x)` and return the head output with a trailing
        size-1 dimension removed.
        """
        encoded = self.encoder_nn(aa_x, mod_x)
        prediction = self.output_nn(encoded)
        return prediction.squeeze(-1)

class Model_for_Generic_ModAASeq_Regression_Transformer(torch.nn.Module):
    """
    Generic transformer regression model for modified sequence.

    `input_nn` (`building_block.AA_Mod_Embedding`) embeds residues and
    modifications, `hidden_nn` is the transformer block, and `output_nn`
    (attention-sum layer, PReLU, dropout, one-unit linear layer) produces
    the prediction.

    Parameters
    ----------
    hidden_dim : int, optional
        Size given to the embedding, the transformer, the attention-sum
        layer and the final linear layer. Defaults to 256.
    nlayers : int, optional
        Passed to the transformer block as `nlayers`. Defaults to 4.
    output_attentions : bool, optional
        Passed to the transformer block and stored as the flag that
        `forward` reads. Defaults to False.
    dropout : float, optional
        Probability of the shared `torch.nn.Dropout` layer; also passed to
        the transformer block. Defaults to 0.1.
    **kwargs
        Accepted but not used.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        super().__init__()

        self.dropout = torch.nn.Dropout(dropout)

        self.input_nn = building_block.AA_Mod_Embedding(hidden_dim)

        self._output_attentions = output_attentions

        self.hidden_nn = building_block.HFace_Transformer_with_PositionalEncoder(
            hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
            output_attentions=output_attentions,
        )

        head_layers = [
            building_block.SeqAttentionSum(hidden_dim),
            torch.nn.PReLU(),
            self.dropout,
            torch.nn.Linear(hidden_dim, 1),
        ]
        self.output_nn = torch.nn.Sequential(*head_layers)

    @property
    def output_attentions(self)->bool:
        """Flag read by `forward` to decide whether `self.attentions` is filled."""
        return self._output_attentions

    @output_attentions.setter
    def output_attentions(self, val:bool):
        # NOTE(review): only the local flag changes; `hidden_nn` keeps the value
        # it was constructed with -- confirm whether that is intended.
        self._output_attentions = val

    def forward(self,
        aa_indices,
        mod_x,
    ):
        """
        Embed the inputs, run the transformer block, add a scaled copy of the
        embedding back onto its output and reduce to one value per item.

        Side effect: `self.attentions` is overwritten on every call with the
        second element of the transformer output when `output_attentions`
        is truthy, otherwise with `None`.
        """
        embedded = self.dropout(self.input_nn(aa_indices, mod_x))

        transformer_out = self.hidden_nn(embedded)
        self.attentions = transformer_out[1] if self.output_attentions else None

        # The embedding re-enters the signal path with a fixed weight of 0.2.
        encoded = self.dropout(transformer_out[0]+embedded*0.2)

        prediction = self.output_nn(encoded)
        return prediction.squeeze(1)

class ModelInterface_for_Generic_ModAASeq_Regression(ModelInterface):
    """
    `ModelInterface` for all Generic_ModAASeq_Regression models.

    Parameters
    ----------
    model_class : torch.nn.Module, optional
        Model class handed to `self.build`.
        Defaults to `Model_for_Generic_ModAASeq_Regression_LSTM`.
    dropout : float, optional
        Forwarded to `self.build`. Defaults to 0.1.
    device : str, optional
        Forwarded to `ModelInterface.__init__`. Defaults to 'gpu'.
    hidden_dim : int, optional
        Forwarded to `self.build`. Defaults to 256.
    nlayers : int, optional
        Forwarded to `self.build`. Defaults to 4.
    **kwargs
        Any further keyword arguments are forwarded to `self.build`.
    """
    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_ModAASeq_Regression_LSTM,
        dropout=0.1,
        device: str = 'gpu',
        hidden_dim=256,
        nlayers=4,
        **kwargs,
    ):
        super().__init__(device=device)

        model_settings = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_settings, **kwargs)

        # L1 (mean absolute error) loss for the regression target
        self.loss_func = torch.nn.L1Loss()

        # Default column names; the examples below overwrite both.
        self.target_column_to_predict = 'predicted_property'
        self.target_column_to_train = 'detected_property'

    def _get_features_from_batch_df(
        self,
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the features for `batch_df` as produced by
        `self._get_aa_mod_features`; extra keyword arguments are ignored.
        """
        aa_mod_features = self._get_aa_mod_features(batch_df)
        return aa_mod_features

#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_LSTM`

In [None]:
# Example table; its `normalized_irt` column is the training target below
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_ModAASeq_Regression(
    model_class=Model_for_Generic_ModAASeq_Regression_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column that receives the predictions
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.218646
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.238523
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.338038
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.389952
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.414328
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.542586
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.578945
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.741118
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.851133


#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_Transformer`

In [None]:
# Example table; its `normalized_irt` column is the training target below
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_ModAASeq_Regression(
    model_class=Model_for_Generic_ModAASeq_Regression_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column that receives the predictions
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.023732
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.371864
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.371675
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.444706
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.599888
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.647195
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.642657
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.730952
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.957414
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,1.022258


## Binary classification models for a given amino acid sequence

In [None]:
#| export
class Model_for_Generic_AASeq_BinaryClassification_LSTM(
    Model_for_Generic_AASeq_Regression_LSTM
):
    """
    Generic LSTM classification model for AA sequence.

    Same network as `Model_for_Generic_AASeq_Regression_LSTM`; `forward`
    additionally applies a sigmoid so the output lies between 0 and 1.

    Parameters
    ----------
    hidden_dim : int, optional
        Forwarded to the parent class. Defaults to 256.
    nlayers : int, optional
        Forwarded to the parent class. Defaults to 4.
    dropout : float, optional
        Forwarded to the parent class. Defaults to 0.1.
    **kwargs
        Accepted but not passed on to the parent class.
    """
    def __init__(self,
        *,
        hidden_dim=256,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        parent_settings = dict(
            hidden_dim=hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
        )
        super().__init__(**parent_settings)

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for `aa_x`."""
        return torch.sigmoid(super().forward(aa_x))

class Model_for_Generic_AASeq_BinaryClassification_Transformer(
    Model_for_Generic_AASeq_Regression_Transformer
):
    """
    Generic transformer classification model for AA sequence.

    Same network as `Model_for_Generic_AASeq_Regression_Transformer`;
    `forward` additionally applies a sigmoid so the output lies between
    0 and 1.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        """
        Model based on a transformer architecture from
        Huggingface's BertEncoder class.

        All arguments, including `**kwargs`, are forwarded unchanged to the
        parent class.
        """
        parent_settings = dict(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_attentions=output_attentions,
            dropout=dropout,
        )
        super().__init__(**parent_settings, **kwargs)

    def forward(self, aa_x):
        """Return the sigmoid of the parent model's output for `aa_x`."""
        return torch.sigmoid(super().forward(aa_x))

class ModelInterface_for_Generic_AASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for all Generic_AASeq_BinaryClassification models
    """
    def __init__(
        self,
        model_class: torch.nn.Module = Model_for_Generic_AASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device: str = 'gpu',
        hidden_dim=256,
        nlayers=4,
        **kwargs,
    ):
        """
        Build a binary classifier for amino acid sequences.

        Parameters
        ----------
        model_class : torch.nn.Module, optional
            Model class handed to `self.build`. Defaults to
            `Model_for_Generic_AASeq_BinaryClassification_LSTM`.
        dropout : float, optional
            Forwarded to `self.build`. Defaults to 0.1.
        device : str, optional
            Forwarded to `ModelInterface.__init__`. Defaults to 'gpu'.
        hidden_dim : int, optional
            Forwarded to `self.build`. Defaults to 256.
        nlayers : int, optional
            Forwarded to `self.build`. Defaults to 4.
        **kwargs
            Any further keyword arguments are forwarded to `self.build`.
        """
        super().__init__(device=device)

        model_settings = dict(
            dropout=dropout,
            hidden_dim=hidden_dim,
            nlayers=nlayers,
        )
        self.build(model_class, **model_settings, **kwargs)

        # Binary cross entropy; the classification models in this notebook
        # end with a sigmoid, so their outputs are probabilities.
        self.loss_func = torch.nn.BCELoss()
        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

### Examples

In [None]:
def create_example_input_dataframe_classification_rt():
    """
    Return the normalized-iRT example table with an added binary column
    `is_in_first_half_of_column`: 1 for the rows up to and including index
    label 5, 0 for all other rows.
    """
    label_column = 'is_in_first_half_of_column'
    rt_df = create_example_input_dataframe_normalized_irt()
    rt_df[label_column] = 0
    # `.loc` slices by label and includes the end label, so label 5 is set too.
    rt_df.loc[:5, label_column] = 1
    return rt_df

#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_LSTM`

In [None]:
# Example table; its binary `is_in_first_half_of_column` column is the training target below
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column that receives the predicted probabilities
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.990723
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.990067
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.99126
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.990588
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.990547
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.986322
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.362422
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.357596
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.364087
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.360649


#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_Transformer`

In [None]:
# Example table; its binary `is_in_first_half_of_column` column is the training target below
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column'
# ... and the name of the column that receives the predicted probabilities
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.997154
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.997196
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.994428
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.997426
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.997273
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.997214
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.007716
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.007046
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.00514
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.004675


## Binary classification models for given amino acid sequence and site-specific PTMs

In [None]:
#| export
class Model_for_Generic_ModAASeq_BinaryClassification_LSTM(
    Model_for_Generic_ModAASeq_Regression_LSTM
):
    """
    Generic LSTM classification model for modified sequence.

    Same network as `Model_for_Generic_ModAASeq_Regression_LSTM`; `forward`
    additionally applies a sigmoid so the output lies between 0 and 1.

    Parameters
    ----------
    hidden_dim : int, optional
        Forwarded to the parent class. Defaults to 256.
    nlayers : int, optional
        Forwarded to the parent class. Defaults to 4.
    dropout : float, optional
        Forwarded to the parent class. Defaults to 0.1.
    **kwargs
        Forwarded to the parent class.
    """
    def __init__(self,
        *,
        hidden_dim=256,
        nlayers=4,
        dropout=0.1,
        **kwargs,
    ):
        parent_settings = dict(
            hidden_dim=hidden_dim,
            nlayers=nlayers,
            dropout=dropout,
        )
        super().__init__(**parent_settings, **kwargs)

    def forward(self, aa_x, mod_x):
        """Return the sigmoid of the parent model's output for `(aa_x, mod_x)`."""
        return torch.sigmoid(super().forward(aa_x, mod_x))


class Model_for_Generic_ModAASeq_BinaryClassification_Transformer(
    Model_for_Generic_ModAASeq_Regression_Transformer
):
    """
    Generic transformer classification model for modified sequence.

    Same network as `Model_for_Generic_ModAASeq_Regression_Transformer`;
    `forward` additionally applies a sigmoid so the output lies between
    0 and 1.

    The `output_attentions` property (getter and setter) is inherited from
    the parent class; it is not re-declared here because the parent's
    implementation is exactly what this class needs.

    Parameters
    ----------
    hidden_dim : int, optional
        Forwarded to the parent class. Defaults to 256.
    nlayers : int, optional
        Forwarded to the parent class. Defaults to 4.
    output_attentions : bool, optional
        Forwarded to the parent class. Defaults to False.
    dropout : float, optional
        Forwarded to the parent class. Defaults to 0.1.
    **kwargs
        Forwarded to the parent class.
    """
    def __init__(self,
        *,
        hidden_dim = 256,
        nlayers = 4,
        output_attentions=False,
        dropout = 0.1,
        **kwargs,
    ):
        super().__init__(
            nlayers=nlayers,
            hidden_dim=hidden_dim,
            output_attentions=output_attentions,
            dropout=dropout,
            **kwargs
        )

    def forward(self, 
        aa_indices, 
        mod_x,
    ):
        """Return the sigmoid of the parent model's output for `(aa_indices, mod_x)`."""
        x = super().forward(aa_indices, mod_x)
        return torch.sigmoid(x)


class ModelInterface_for_Generic_ModAASeq_BinaryClassification(ModelInterface):
    """
    `ModelInterface` for Generic_ModAASeq_BinaryClassification

    Parameters
    ----------
    model_class : torch.nn.Module, optional
        Model class handed to `self.build`. Defaults to
        `Model_for_Generic_ModAASeq_BinaryClassification_LSTM`.
    dropout : float, optional
        Forwarded to `self.build`. Defaults to 0.1.
    device : str, optional
        Forwarded to `ModelInterface.__init__`. Defaults to 'gpu'.
    hidden_dim : int, optional
        Forwarded to `self.build`. Defaults to 256.
    nlayers : int, optional
        Forwarded to `self.build`. Defaults to 4.
    **kwargs
        Any further keyword arguments are forwarded to `self.build`.
    """
    def __init__(self, 
        model_class:torch.nn.Module=Model_for_Generic_ModAASeq_BinaryClassification_LSTM,
        dropout=0.1,
        device:str='gpu',
        hidden_dim=256,
        nlayers=4,
        **kwargs,
    ):
        super().__init__(device=device)
        # `hidden_dim` and `nlayers` must be passed on explicitly: they are
        # named parameters of this method and therefore never end up in
        # `kwargs`, so leaving them out would silently discard the caller's
        # values.
        self.build(
            model_class,
            dropout=dropout,
            hidden_dim=hidden_dim,
            nlayers=nlayers,
            **kwargs
        )
        self.loss_func = torch.nn.BCELoss() # for binary classification

        self.target_column_to_predict = 'predicted_prob'
        self.target_column_to_train = 'detected_prob'

    def _get_features_from_batch_df(self, 
        batch_df: pd.DataFrame,
        **kwargs,
    ):
        """
        Return the features for `batch_df` as produced by
        `self._get_aa_mod_features`.

        `**kwargs` is accepted and ignored so the signature matches the one
        of `ModelInterface_for_Generic_ModAASeq_Regression`.
        """
        return self._get_aa_mod_features(batch_df)

### Examples

In [None]:
def create_example_input_dataframe_classification_rt():
    """
    Return the normalized-iRT example table with an added binary column
    `is_in_first_half_of_column`: 1 for the rows up to and including index
    label 5, 0 for all other rows.

    Note: a helper with this name and the same behavior is already defined
    in an earlier cell of this notebook; this cell re-defines it.
    """
    label_column = 'is_in_first_half_of_column'
    rt_df = create_example_input_dataframe_normalized_irt()
    rt_df[label_column] = 0
    # `.loc` slices by label and includes the end label, so label 5 is set too.
    rt_df.loc[:5, label_column] = 1
    return rt_df

#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_LSTM`

In [None]:
# Example table; its binary `is_in_first_half_of_column` column is the training target below
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(
    model_class=Model_for_Generic_ModAASeq_BinaryClassification_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column that receives the predicted probabilities
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.991679
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.991799
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.991879
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.991361
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.990937
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.987399
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.399395
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.400205
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.407074
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.400328


#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_Transformer`

In [None]:
# Example table; its binary `is_in_first_half_of_column` column is the training target below
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(
    model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column that receives the predicted probabilities
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the same table that was used for training
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.99698
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.994647
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.995371
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.99705
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.995105
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.994481
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.00275
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.003557
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.002703
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.00248
