In [None]:
import pandas as pd
from autopeptideml import AutoPeptideML

# Load data
# Two source files: the canonical set supplies the 'sequence' column and the
# non-canonical set supplies the 'SMILES' column.
df = pd.read_csv('antibacterial_data_canonical.csv')
df2 = pd.read_csv('antibacterial_data_noncanonical.csv')
# Fix: both columns used to be read from `df`, which left `df2` unused and
# entered the canonical rows twice (once per column).
# NOTE(review): assumes the non-canonical CSV has a 'SMILES' column — confirm.
all_inputs = df['sequence'].tolist() + df2['SMILES'].tolist()

# Initialise AutoPeptideML
# All inputs are passed as one flat list; results are written under 'demo'.
apml = AutoPeptideML(
    data=all_inputs,
    outputdir='demo'
)

# Preprocess
# Runs the 'to-smiles' pipeline with 5 workers and progress output enabled.
apml.preprocess_data(
    pipeline='to-smiles',
    n_jobs=5,
    verbose=True
)

# Build models
# No `build_models` call in this cell. At this point negatives have not been
# sampled yet (`apml.sample_negatives` runs in a later cell), so a
# classification search here would run on positives only, and it would repeat
# the 200-trial search that a later cell performs with the Hestia partitions.
# Model building happens once, in that later cell.

Executing preprocessing step 1 of 2: to-smiles-1
Executing preprocessing step 1 of 2: to-smiles-1a
Executing preprocessing step 1 of 3: filter-smiles


100%|██████████| 19.6k/19.6k [00:03<00:00, 5.05kit/s] 


Executing preprocessing step 2 of 3: canonical-cleaner


100%|██████████| 9.78k/9.78k [00:02<00:00, 3.52kit/s]


Executing preprocessing step 3 of 3: sequence-to-smiles


100%|██████████| 9.78k/9.78k [00:03<00:00, 2.67kit/s]


Executing preprocessing step 2 of 2: to-smiles-1b
Executing preprocessing step 1 of 1: filter-smiles


100%|██████████| 19.6k/19.6k [00:03<00:00, 5.16kit/s] 


Executing preprocessing step 2 of 2: canonicalize-smiles


100%|██████████| 19.6k/19.6k [00:05<00:00, 3.49kit/s] 


In [None]:
# Sample negatives
# The sampling options are gathered in one mapping so the configuration can
# be read (and tweaked) separately from the call itself.
negative_sampling_kwargs = dict(
    target_db='both',
    activities_to_exclude='Antibacterial',
    desired_ratio=1.0,
    sample_by='mw',
    n_jobs=10,
)
apml.sample_negatives(**negative_sampling_kwargs)

: 

In [None]:
# Build model
from hestia import HestiaGenerator, SimArguments

# Similarity settings handed to Hestia: ECFP fingerprints, radius 4, computed
# on the column named by `apml.sequence_field`.
sim_args = SimArguments(
    data_type='small molecule',
    fingerprint='ecfp',
    radius=4,
    min_threshold=0.1,
    verbose=3,
    field_name=apml.sequence_field,
)

# Compute the partitions on the AutoPeptideML dataframe, using the label
# column named by `apml.label_field`.
hdg = HestiaGenerator(apml.df, verbose=True)
hdg.calculate_partitions(
    sim_args=sim_args,
    label_name=apml.label_field,
    min_threshold=0.1,
    threshold_step=0.1,
)

# Model search configuration; the Hestia generator built above is passed in.
model_search_kwargs = dict(
    split_strategy='min',
    task='class',
    hestia_generator=hdg,
    reps=['chemberta-2', 'ecfp', 'esm2-8m', 'peptideclm'],
    device='mps',
    n_trials=200,
)
apml.build_models(**model_search_kwargs)

Initialising Hestia Dataset Generator
Number of items in data: 39,112
Calculating similarity...
Calculating molecular similarities using ecfp with 1,024 bits, radius 4 and tanimoto index...


Query FPs: 100%|██████████| 39.1k/39.1k [00:33<00:00, 1.15kit/s]
Similarity calculation:  51%|█████     | 19.8k/39.1k [12:07<04:51, 66.3it/s]   

In [None]:
from autopeptideml.utils.hpo_plots import plot_optimization_history

# Plot the optimisation history stored on the trainer. The call is the cell's
# last expression, so whatever it returns is what the notebook renders.
# NOTE(review): presumably requires `apml.build_models(...)` to have run first
# so that `apml.trainer.history` exists — confirm.
plot_optimization_history(apml.trainer.history)

In [None]:
from autopeptideml.utils.hpo_plots import plot_model_vs_rep

# Second plot built from the same trainer history. The call is the cell's
# last expression, so its return value is what the notebook renders.
# NOTE(review): judging by the name this compares models against
# representations — confirm against the plotting helper.
plot_model_vs_rep(apml.trainer.history)

In [None]:
# NOTE(review): this calls an underscore-prefixed (private by convention)
# method directly; if AutoPeptideML exposes a public evaluation entry point,
# prefer that — confirm against the library.
apml._evaluating()

In [None]:
# Display the trainer's recorded history as the cell output.
# NOTE(review): if this object is large, consider showing only a slice or a
# summary to keep the notebook output readable.
apml.trainer.history

In [None]:
from os import path as osp

# Bind the best model once instead of repeating the attribute chain.
best_model = apml.trainer.best_model

# For every representation the best model uses, keep only the first ten
# entries of the corresponding inputs and run a prediction on that slice.
input_trial = dict(
    (rep, apml.x[rep][:10]) for rep in best_model.reps
)
preds = best_model.predict(input_trial)

# Save the best model under <outputdir>/ensemble.
ensemble_dir = osp.join(apml.outputdir, 'ensemble')
best_model.save(ensemble_dir)

In [None]:
import numpy as np
from autopeptideml.train.architectures import VotingEnsemble

# Reload the ensemble from the <outputdir>/ensemble location.
ensemble = VotingEnsemble.load(osp.join(apml.outputdir, 'ensemble'))

# Predict on the same input slice with the in-memory model and with the
# reloaded one; only the first element of each result is kept.
preds = apml.trainer.best_model.predict(input_trial)[0]
preds2 = ensemble.predict(input_trial)[0]

# Round-trip check: the two predictions used to be computed and then left
# uncompared, so a broken save/load would have gone unnoticed. Fail loudly if
# the reloaded ensemble disagrees with the model it was saved from.
np.testing.assert_allclose(
    preds2, preds, rtol=1e-5, atol=1e-8,
    err_msg='Reloaded ensemble disagrees with the in-memory best model',
)
