In [None]:
# @title Install AutoPeptideML
%%capture
!pip install autopeptideml
!pip install hestia-ood
!wget https://mmseqs.com/latest/mmseqs-linux-avx2.tar.gz
!tar xvfz /content/mmseqs-linux-avx2.tar.gz
!cp /content/mmseqs/bin/mmseqs /bin/
%env mmseqs=/bin/mmseqs


In [None]:
# @title Import AutoPeptideML
import pandas as pd

from autopeptideml import AutoPeptideML, RepresentationEngine
# AVAILABLE_MODELS and SYNONYMS are used by the "Inputs" cell to check the
# selected representation model name.
from autopeptideml.utils.embeddings import AVAILABLE_MODELS, SYNONYMS

# Single AutoPeptideML instance shared by the later cells (fold partitioning,
# representation computation, training and evaluation).
apml = AutoPeptideML(verbose=True)

# Hyperparameter Space

You can define the hyperparameter search space for a single model (`hpo_single`), for an ensemble of models (`hpo_ensemble`), or for the UniDL4BioPep architecture. All three options are provided in the cells below; because each cell assigns the same `hpo_space` variable, execute only the one you prefer to use (if you run more than one, the last one executed takes effect). The search spaces can be modified at will; more information is available in the project documentation: https://ibm.github.io/AutoPeptideML/.

In [None]:
# @title HPO single (model selection)
# Search space for model selection: "trials" is set once at the top level and
# every entry of "model_selection" describes one candidate model together with
# the hyperparameters to explore for it.
# Entries that are commented out are optional templates; uncomment to enable.
# NOTE(review): the "log" flags are the strings "True"/"False", not booleans.
# Confirm how AutoPeptideML interprets them before changing their type.
hpo_space = {
    "trials": 100,
    "model_selection": [
        {
            "model": "K-Nearest Neighbours",
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "n_neighbors",
                    "type": "int",
                    "min": 1,
                    "max": 30,
                    "log": "False"
                },
                {
                    "name": "weights",
                    "type": "categorical",
                    "values": ["uniform", "distance"]
                }
            ]
        },
        # The numeric step size is declared as `learning_rate_init`: assuming
        # "mlp" maps to scikit-learn's MLPClassifier (as the other parameter
        # names suggest), `learning_rate` there is a categorical schedule
        # ("constant", "invscaling", "adaptive") and cannot take a float range.
        # {
        #     "model": "mlp",
        #     "optimization_metric": "test_matthews_corrcoef",
        #     "hyperparameter-space": [
        #         {
        #             "name": "learning_rate_init",
        #             "type": "float",
        #             "min": 1e-7,
        #             "max": 1,
        #             "log": "True"
        #         },
        #         {
        #             "name": "activation",
        #             "type": "categorical",
        #             "values": ["identity", "logistic", "tanh", "relu"]
        #         },
        #         {
        #             "name": "solver",
        #             "type": "categorical",
        #             "values": ["adam", "sgd"]
        #         },
        #         {
        #             "name": "hidden_layer_sizes",
        #             "type": "categorical",
        #             "values": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]
        #         }
        #     ]
        # },
        # {
        #     "model": "XGBoost",
        #     "optimization_metric": "test_matthews_corrcoef",
        #     "hyperparameter-space": [
        #         {
        #             "name": "learning_rate",
        #             "type": "float",
        #             "min": 1e-5,
        #             "max": 1,
        #             "log": "True"
        #         },
        #         {
        #             "name": "n_estimators",
        #             "type": "int",
        #             "min": 1,
        #             "max": 100,
        #             "log": "False"
        #         },
        #         {
        #             "name": "max_depth",
        #             "type": "int",
        #             "min": 1,
        #             "max": 10,
        #             "log": "False"
        #         },
        #     ]
        # },
        {
            "model": "RFC",
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "max_depth",
                    "type": "int",
                    "min": 2,
                    "max": 20,
                    "log": "False"
                },
                {
                    "name": "n_estimators",
                    "type": "int",
                    "min": 10,
                    "max": 100,
                    "log": "False"
                }
            ]
        },
        {
            "model": "LightGBM",
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "max_depth",
                    "type": "int",
                    "min": 1,
                    "max": 30,
                    "log": "True"
                },
                {
                    "name": "num_leaves",
                    "type": "int",
                    "min": 5,
                    "max": 50,
                    "log": "True"
                },
                {
                    "name": "learning_rate",
                    "type": "float",
                    "min": 0.001,
                    "max": 0.3,
                    "log": "True"
                },
                {
                    "name": "verbose",
                    "type": "fixed",
                    "value": -1
                }
            ]
        }
    ]
}

In [None]:
# @title HPO UniDL4BioPep
# Search space with a single "ensemble" member, the "unidl4biopep" model.
# "epochs" is a fixed value; the only searched quantity is the optimizer "lr".

# Float range explored for the optimizer's "lr" entry.
_lr_search = {"name": "lr", "type": "float", "min": 1e-7, "max": 0.1}

# Complete description of the single ensemble member.
_unidl4biopep_entry = {
    "model": "unidl4biopep",
    "trials": 100,
    "optimization_metric": "test_matthews_corrcoef",
    "hyperparameter-space": {"epochs": 20, "optimizer": [_lr_search]},
}

hpo_space = {"ensemble": [_unidl4biopep_entry]}

In [None]:
# @title HPO ensemble
# Search space for an ensemble: every entry of "ensemble" describes one member
# model with its own "trials" budget and the hyperparameters to explore.
# Entries that are commented out are optional templates; uncomment to enable.
# NOTE(review): the "log" flags are the strings "True"/"False", not booleans.
# Confirm how AutoPeptideML interprets them before changing their type. The
# commented MLP template uses the same string form as every other entry.
hpo_space = {
    "ensemble": [
        {
            "model": "K-Nearest Neighbours",
            "trials": 30,
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "n_neighbors",
                    "type": "int",
                    "min": 1,
                    "max": 30,
                    "log": "False"
                },
                {
                    "name": "weights",
                    "type": "categorical",
                    "values": ["uniform", "distance"]
                }
            ]
        },
        # {
        #     "model": "mlp",
        #     "trials": 30,
        #     "optimization_metric": "test_matthews_corrcoef",
        #     "hyperparameter-space": [
        #         {
        #             "name": "learning_rate",
        #             "type": "categorical",
        #             "values": ["constant", "invscaling", "adaptive"]
        #         },
        #         {
        #             "name": "activation",
        #             "type": "categorical",
        #             "values": ["identity", "logistic", "tanh", "relu"]
        #         },
        #         {
        #             "name": "learning_rate_init",
        #             "type": "float",
        #             "min": 1e-7,
        #             "max": 1e-1,
        #             "log": "True"
        #         },
        #         {
        #             "name": "solver",
        #             "type": "categorical",
        #             "values": ["adam", "sgd"]
        #         },
        #         {
        #             "name": "hidden_layer_sizes",
        #             "type": "categorical",
        #             "values": [[12, 12], [120, 120], [12, 12, 12], [120, 120, 120], [12, 12, 12, 12]]
        #         }
        #     ]
        # },
        # {
        #     "model": "XGBoost",
        #     "trials": 30,
        #     "optimization_metric": "test_matthews_corrcoef",
        #     "hyperparameter-space": [
        #         {
        #             "name": "learning_rate",
        #             "type": "float",
        #             "min": 1e-5,
        #             "max": 1,
        #             "log": "True"
        #         },
        #         {
        #             "name": "n_estimators",
        #             "type": "int",
        #             "min": 1,
        #             "max": 100,
        #             "log": "False"
        #         },
        #         {
        #             "name": "max_depth",
        #             "type": "int",
        #             "min": 1,
        #             "max": 10,
        #             "log": "False"
        #         },
        #     ]
        # },
        {
            "model": "RFC",
            "trials": 30,
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "max_depth",
                    "type": "int",
                    "min": 2,
                    "max": 20,
                    "log": "False"
                },
                {
                    "name": "n_estimators",
                    "type": "int",
                    "min": 10,
                    "max": 100,
                    "log": "False"
                }
            ]
        },
        {
            "model": "LightGBM",
            "trials": 30,
            "optimization_metric": "test_matthews_corrcoef",
            "hyperparameter-space": [
                {
                    "name": "max_depth",
                    "type": "int",
                    "min": 1,
                    "max": 30,
                    "log": "True"
                },
                {
                    "name": "num_leaves",
                    "type": "int",
                    "min": 5,
                    "max": 50,
                    "log": "True"
                },
                {
                    "name": "learning_rate",
                    "type": "float",
                    "min": 0.001,
                    "max": 0.3,
                    "log": "True"
                },
                {
                    "name": "verbose",
                    "type": "fixed",
                    "value": -1
                }
            ]
        }
    ]
}


# Load your data

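Upload a CSV file containing your peptide sequences and their labels; the column names are set in the **Inputs** cell below. Please partition your files through the [Hestia webserver](http://peptide.ucd.ie/Hestia).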


In [None]:
# @title Upload dataset
from google.colab import files
import io

# Opens the Colab upload dialog; the result maps each file name to its bytes.
uploaded = files.upload()

# Fail with a clear message instead of an IndexError when the dialog is
# cancelled or no file is selected.
if not uploaded:
    raise ValueError('No file was uploaded; re-run this cell and select a CSV file.')

# Only the first uploaded file is read.
uploaded_bytes = next(iter(uploaded.values()))

# 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order mark
# (common in spreadsheet exports), which would otherwise be glued to the first
# column name and break the column lookups in the later cells.
df = pd.read_csv(io.StringIO(uploaded_bytes.decode('utf-8-sig')))
df.head()

In [None]:
# @title Inputs

# Colab form fields; the trailing `@param` annotations bind each variable to
# its form widget and must stay on the assignment line.
field_name = 'sequence' # @param{type: 'string'}
id_field = None # @param{type: 'raw'}
label_name = 'Y' #@param{type: 'string'}
alignment_algorithm = 'mmseqs+prefilter' #@param{type: 'string'}
threshold = 0.3 #@param
plm_model = 'esm2-8m' #@param {type: 'string'}

# Report (without stopping the cell) a model name that is neither a known
# model nor a known synonym.
if not (plm_model in AVAILABLE_MODELS or plm_model in SYNONYMS):
  print(f'Model: {plm_model} is not supported, please use one of the following: {list(SYNONYMS.keys())}')

# The 'id' column is the row index unless a dedicated id column was chosen.
df['id'] = df.index if id_field is None else df[id_field]

df.head()

In [None]:
# @title Split dataset
from hestia.partition import ccpart, random_partition

# Keyword settings forwarded unchanged to `ccpart`; only `threshold` comes from
# the "Inputs" cell, the remaining values are fixed here.
partition_options = dict(
    denominator='shortest',
    test_size=0.2, valid_size=0.0, threshold=threshold,
    verbose=0, data_type='protein',
    distance='tanimoto', representation='3di+aa',
    bits=1024, radius=2, config=None, sim_df=None,
)

train, test = ccpart(df, alignment_algorithm, field_name, label_name,
                     **partition_options)

# The two values returned by `ccpart` are used as positional row selectors;
# each subset gets a fresh 0-based index.
datasets = {
    split_name: df.iloc[rows].reset_index(drop=True)
    for split_name, rows in (('train', train), ('test', test))
}


In [None]:
# @title Preparing cross-validation folds

# Settings for splitting the training set into 10 folds.
# NOTE(review): `threshold` and `alignment` are fixed literals here, not the
# values chosen in the "Inputs" cell — confirm this is intended.
fold_options = dict(
    method='random',
    threshold=0.4,
    alignment='mmseqs+prefilter',
    n_folds=10,
    outputdir='results/folds',
)

folds = apml.train_val_partition(datasets['train'], **fold_options)

In [None]:
# @title What Representation Model do you want to use?
# @markdown It is recommended to set the runtime to GPU in order to accelerate embedding computation.

# NOTE(review): the variable name `re` is also the name of Python's standard
# regular-expression module; an `import re` in a later cell would replace it.
re = RepresentationEngine(plm_model, 12)

# Compute the representations for `datasets` and store each one as a NumPy
# array under its original key.
id2rep = {
    sample_id: representation.numpy()
    for sample_id, representation in apml.compute_representations(datasets, re).items()
}

In [None]:
# @title Train models

# Runs AutoPeptideML's `hpo_train` on the training split with the `hpo_space`
# assigned by whichever HPO cell was executed last, using the precomputed
# representations (`id2rep`) and the cross-validation `folds`.
# NOTE(review): 'results' is presumably the output directory — confirm against
# the AutoPeptideML documentation.
model = apml.hpo_train(
    hpo_space, datasets['train'], id2rep, folds, 'results'
)

In [None]:
# @title Evaluate models

# Evaluates the trained `model` on the held-out test split using the same
# precomputed representations, then prints whatever `evaluate_model` returns.
# NOTE(review): 'results' is presumably the output directory — confirm against
# the AutoPeptideML documentation.
results = apml.evaluate_model(
    model, datasets['test'], id2rep, 'results'
)
print(results)