In [None]:
#---#| default_exp model.generic_property_prediction

In [None]:
%reload_ext autoreload
%autoreload 2

# Generic Property Prediction

## Description
### Scope of the generic property prediction
One focus of the AlphaPeptDeep framework is the prediction of retention times, MS2 spectra and collisional cross sections of peptides. The models that predict these properties are defined in the notebooks `rt.ipynb`, `ms2.ipynb` and `ccs.ipynb`, respectively. In this notebook, we define a more generalized model, which in principle makes it possible to predict or classify arbitrary properties of peptides. The classes allow users to create new deep learning predictions of peptide properties, even without prior expertise in deep learning.
### How to build new models, even without experience in deep learning
To predict or classify novel properties of peptides, the user simply needs to provide a list of peptides with a corresponding property (e.g. 'binding affinity') for each peptide. The user only needs to initialize one of the `ModelInterface` classes below and specify the name of the `target_column_to_train` and, optionally, the `target_column_to_predict`. The generic model classes will try to derive predictions or classifications of the property from the peptide sequence. Additional classes are also provided, which complement the peptide sequence information with site-specific PTM information.

## Imports

In [None]:
from peptdeep.model.generic_property_prediction import *

### Examples

#### Define example Table

In [None]:
from peptdeep.model.rt import IRT_PEPTIDE_DF

In [None]:
def create_example_input_dataframe_normalized_irt():
    """Return a copy of `IRT_PEPTIDE_DF` with an extra `normalized_irt` column.

    The `irt` column is min-max scaled: the smallest `irt` value maps to 0
    and the largest maps to 1. `IRT_PEPTIDE_DF` itself is left untouched
    because the scaling is applied to a copy.
    """
    normalized_df = IRT_PEPTIDE_DF.copy()
    irt_min = normalized_df.irt.min()
    irt_max = normalized_df.irt.max()
    # Shift by the minimum first, then divide by the full range.
    shifted_irt = normalized_df.irt - irt_min
    normalized_df['normalized_irt'] = shifted_irt / (irt_max - irt_min)
    return normalized_df

# Display the example table as the cell output.
create_example_input_dataframe_normalized_irt()

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775


#### Building an RT model based on `Model_for_Generic_AASeq_Regression_LSTM`

In [None]:
# Build the example table; it contains the min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
# (here: the LSTM-based sequence regression model).
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.217062
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.262668
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.35069
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.454322
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.58535
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.885816
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.825349
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.819246
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,1.027452


#### Building an RT model for only sequences based on `Model_for_Generic_AASeq_Regression_Transformer`

In [None]:
# Build the example table; it contains the min-max scaled `normalized_irt` column.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
# (here: the transformer-based sequence regression model).
model = ModelInterface_for_Generic_AASeq_Regression(
    model_class=Model_for_Generic_AASeq_Regression_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.011669
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.175132
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.229852
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.426767
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.375254
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.434823
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.562113
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.611791
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.682872
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.858451


## Regression models for predicting a scalar value for a given amino acid sequence and site-specific PTMs

#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_LSTM`

In [None]:
# Build the example table; it contains the min-max scaled `normalized_irt` column.
# NOTE(review): the `mods`/`mod_sites` columns appear empty in the displayed table,
# so this example presumably runs without any actual PTMs — confirm.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
# (here: the LSTM-based regression model for modified sequences).
model = ModelInterface_for_Generic_ModAASeq_Regression(
    model_class=Model_for_Generic_ModAASeq_Regression_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.015872
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.201153
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.194088
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.32385
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.352255
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.383529
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.637659
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.648361
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.781518
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.905771


#### Scalar regression model (RT) with modified AA sequences using `Model_for_Generic_ModAASeq_Regression_Transformer`

In [None]:
# Build the example table; it contains the min-max scaled `normalized_irt` column.
# NOTE(review): the `mods`/`mod_sites` columns appear empty in the displayed table,
# so this example presumably runs without any actual PTMs — confirm.
example_df = create_example_input_dataframe_normalized_irt()

# Initialize the model interface and specify which of the models to use
# (here: the transformer-based regression model for modified sequences).
model = ModelInterface_for_Generic_ModAASeq_Regression(
    model_class=Model_for_Generic_ModAASeq_Regression_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'normalized_irt'
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_normalized_irt'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,predicted_normalized_irt
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,0.0
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,0.0
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,0.151904
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,0.046422
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,0.258884
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,0.389873
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0.455479
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0.468062
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0.593896
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0.828839


## Binary classification models for a given amino acid sequence

### Examples

In [None]:
def create_example_input_dataframe_classification_rt():
    """Return the normalized iRT example table with a binary label column.

    The column `is_in_first_half_of_column` is initialised to 0 for every row
    and then set to 1 for the rows whose index labels are 0 through 5.
    """
    label_column = 'is_in_first_half_of_column'
    classification_df = create_example_input_dataframe_normalized_irt()
    classification_df[label_column] = 0
    # `.loc` slices by label and includes the end label, so rows 0..5 are marked.
    classification_df.loc[:5, label_column] = 1
    return classification_df

#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_LSTM`

In [None]:
# Build the example table with the binary `is_in_first_half_of_column` label.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
# (here: the LSTM-based sequence classification model).
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.996315
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.994864
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.995806
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.996244
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.995376
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.993511
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.417006
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.420201
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.412116
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.410713


#### A sequence classification model using `Model_for_Generic_AASeq_BinaryClassification_Transformer`

In [None]:
# Build the example table with the binary `is_in_first_half_of_column` label.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
# (here: the transformer-based sequence classification model).
model = ModelInterface_for_Generic_AASeq_BinaryClassification(
    model_class=Model_for_Generic_AASeq_BinaryClassification_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column'
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.997119
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.996818
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.994436
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.997186
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.996343
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.996218
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.005464
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.005231
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.004986
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.004768


## Binary classification models for given amino acid sequence and site-specific PTMs

### Examples

In [None]:
def create_example_input_dataframe_classification_rt():
    """Return the normalized iRT example table with a binary label column.

    The column `is_in_first_half_of_column` is initialised to 0 for every row
    and then set to 1 for the rows whose index labels are 0 through 5.
    """
    # NOTE(review): this repeats the definition from the section
    # "Binary classification models for a given amino acid sequence";
    # consider defining the helper only once.
    label_column = 'is_in_first_half_of_column'
    classification_df = create_example_input_dataframe_normalized_irt()
    classification_df[label_column] = 0
    # `.loc` slices by label and includes the end label, so rows 0..5 are marked.
    classification_df.loc[:5, label_column] = 1
    return classification_df

#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_LSTM`

In [None]:
# Build the example table with the binary `is_in_first_half_of_column` label.
# NOTE(review): the `mods`/`mod_sites` columns appear empty in the displayed table,
# so this example presumably runs without any actual PTMs — confirm.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
# (here: the LSTM-based classification model for modified sequences).
model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(
    model_class=Model_for_Generic_ModAASeq_BinaryClassification_LSTM
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.992248
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.991091
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.991896
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.992127
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.992189
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.988601
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.423455
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.416458
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.418717
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.415309


#### A sequence classification model using `Model_for_Generic_ModAASeq_BinaryClassification_Transformer`

In [None]:
# Build the example table with the binary `is_in_first_half_of_column` label.
# NOTE(review): the `mods`/`mod_sites` columns appear empty in the displayed table,
# so this example presumably runs without any actual PTMs — confirm.
example_df = create_example_input_dataframe_classification_rt()

# Initialize the model interface and specify which of the models to use
# (here: the transformer-based classification model for modified sequences).
model = ModelInterface_for_Generic_ModAASeq_BinaryClassification(
    model_class=Model_for_Generic_ModAASeq_BinaryClassification_Transformer
)
# Specify the name of the column you want to use for training ...
model.target_column_to_train = 'is_in_first_half_of_column' 
# ... and the name of the column in which the predictions are reported.
model.target_column_to_predict = 'predicted_will_be_in_first_half_of_column'
# Train for 20 epochs, then predict on the very same table. This only
# illustrates the API; it is not an evaluation on held-out data.
model.train(example_df, epoch=20)
model.predict(example_df)

Unnamed: 0,sequence,pep_name,irt,mods,mod_sites,nAA,normalized_irt,is_in_first_half_of_column,predicted_will_be_in_first_half_of_column
0,LGGNEQVTR,RT-pep a,-24.92,,,9,0.0,1,0.997684
1,GAGSSEPVTGLDAK,RT-pep b,0.0,,,14,0.199488,1,0.994951
2,VEATFGVDESNAK,RT-pep c,12.39,,,13,0.298671,1,0.995613
3,YILAGVENSK,RT-pep d,19.79,,,10,0.357909,1,0.997709
4,TPVISGGPYEYR,RT-pep e,28.71,,,12,0.429315,1,0.995338
5,TPVITGAPYEYR,RT-pep f,33.38,,,12,0.466699,1,0.995143
6,DGLDAASYYAPVR,RT-pep g,42.26,,,13,0.537784,0,0.004144
7,ADVTPADFSEWSK,RT-pep h,54.62,,,13,0.636728,0,0.004095
8,GTFIIDPGGVIR,RT-pep i,70.52,,,12,0.764009,0,0.002779
9,GTFIIDPAAVIR,RT-pep k,87.23,,,12,0.897775,0,0.002644
