In [1]:
!pip install transformers




[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader

# use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForSequenceClassification, AdamW

cpu


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# define a class for the AMP data that will correctly format the sequence information
# for fine-tuning with huggingface API
# the input dataframe columns must be formatted the same way as the given example

class amp_data():
    """Indexable dataset that tokenizes peptide sequences for fine-tuning.

    The input dataframe must provide an 'aa_seq' column (amino-acid sequence
    strings) and an 'AMP' column (values castable to int, used as the class
    label). Each item is a dict of tensors built from the tokenizer output
    plus a 'labels' entry.

    Args:
        df: dataframe holding the 'aa_seq' and 'AMP' columns.
        tokenizer_name: checkpoint name handed to AutoTokenizer.from_pretrained.
        max_len: length every tokenized sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        # keep the frame that was passed in, so the instance never depends on
        # a notebook-global variable that happens to be called `df`
        self.df = df

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len

        self.seqs, self.labels = self.get_seqs_labels()

    def get_seqs_labels(self):
        """Return (sequences, labels) as two plain Python lists taken from self.df."""
        seqs = self.df['aa_seq'].tolist()
        # .tolist() yields built-in ints regardless of the platform's default
        # NumPy integer width
        labels = self.df['AMP'].astype(int).tolist()

        return seqs, labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example `idx` and return it as a dict of tensors."""
        # drop any whitespace inside the sequence, then put exactly one space
        # between consecutive characters before tokenizing
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        # class-index targets are stored explicitly as int64
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

        return sample

In [4]:
# load the training table and wrap it in the amp_data class used for fine-tuning

data_url = 'https://raw.githubusercontent.com/GIST-CSBL/AMP-BERT/main/all_veltri.csv'

# the first CSV column becomes the index; sampling with frac=1 draws every row,
# i.e. a shuffle that is reproducible through the fixed random_state
df = pd.read_csv(data_url, index_col=0).sample(frac=1, random_state=0)
print(df.head(7))

train_dataset = amp_data(df)

                                                            aa_seq  aa_len  \
AP02151          YEALVTSILGKLTGLWHNDSVDFMGHICYFRRRPKIRRFKLYHEGK...      95   
AP01951                                          FLPLVLGALSGILPKIL      17   
AP00972                                        FLSLIPHAINAVGVHAKHF      19   
AP01261                                           IIEKLVNTALGLLSGL      16   
AP01298                                       GLFTLIKCAYQLIAPTVACN      20   
AP01802                                     RPWAGNGSVHRYTVLSPRLKTQ      22   
UniRef50_Q9UTR1                                SKENSYVEKLLYKQRFYAS      19   

                   AMP  
AP02151           True  
AP01951           True  
AP00972           True  
AP01261           True  
AP01298           True  
AP01802           True  
UniRef50_Q9UTR1  False  


In [5]:
# define the necessary metrics for performance evaluation

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    `pred` must expose `label_ids` (reference labels) and `predictions`;
    the predicted class is the argmax over the last axis of `predictions`
    (presumably per-class logits -- confirm against the caller).
    Precision/recall/F1 use scikit-learn's 'binary' averaging.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # the tuple is (precision, recall, f1, support); support is not reported
    prf = precision_recall_fscore_support(y_true, y_hat, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [6]:
# define the model initializing function for Trainer in huggingface

def model_init():
    """Build a new BertForSequenceClassification from the 'Rostlab/prot_bert_bfd' checkpoint."""
    checkpoint = 'Rostlab/prot_bert_bfd'
    model = BertForSequenceClassification.from_pretrained(checkpoint)
    return model

In [7]:
!pip install accelerate -U





[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
# show the kernel's current working directory
os.getcwd()

'd:\\Faks\\4. godina\\Evolucijsko računarstvo\\Projekt\\antimicrobial\\amp_bert'

In [19]:
# Where the Trainer writes its outputs and its logs. Both are relative to the
# kernel's working directory, so the notebook is not tied to one machine.
output_dir = './results'
logging_dir = './logs'

# Create the directories up front. exist_ok=True keeps the cell re-runnable and
# still raises if either path exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(logging_dir, exist_ok=True)

# NOTE(review): the TypeError recorded under this cell was not traced to a
# specific argument -- re-run after this clean-up and confirm.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=15,
    learning_rate=5e-5,
    # one sequence per step, gradients accumulated over 64 steps before each
    # optimizer update
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir=logging_dir,
    logging_steps=100,
    do_train=True,
    do_eval=True,
    # no evaluation during training: the evaluation strategy is left at the
    # library default ("no"), which avoids the deprecated keyword spelling
    save_strategy='no',
    fp16=False,
    fp16_opt_level="O2",
    run_name="AMP-BERT",
    seed=0,
    # nothing is evaluated or checkpointed during training, so there is never
    # a "best" checkpoint that could be reloaded at the end
    load_best_model_at_end=False
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: expected str, bytes or os.PathLike object, not NoneType

In [12]:
# debugging probe: check that this absolute logging directory exists on the local machine
os.path.isdir("D:\\Faks\\4. godina\\Evolucijsko računarstvo\\Projekt\\antimicrobial\\amp_bert\\logs") 

True

In [None]:
# score the fine-tuned model on the very data it was trained on;
# trainer.predict returns (predictions, label_ids, metrics)

(
    predictions,
    label_ids,
    metrics,
) = trainer.predict(train_dataset)

# last expression of the cell: display the metrics
metrics

NameError: name 'trainer' is not defined

In [None]:
# save the model, if desired

# from google.colab import drive
# drive.mount('/content/drive')

# trainer.save_model('/content/drive/MyDrive/Colab Notebooks/AMP-BERT/Fine-tuned_model/')

In [None]:
# load the tokenizer and the fine-tuned model used for single-sequence predictions

# IMPORTANT:
# a fine-tuned model must already be stored at `model_dir` before running this cell.
# On Google Colab the default path lives on Google Drive, which is mounted below;
# when running elsewhere, point `model_dir` at the local directory of the saved model.
model_dir = "/content/drive/MyDrive/Colab Notebooks/AMP-BERT/Fine-tuned_model"

try:
    from google.colab import drive
except ImportError:
    # not running on Colab: there is no Drive to mount
    pass
else:
    drive.mount('/content/drive')

# load appropriate tokenizer and fine-tuned model
tokenizer = AutoTokenizer.from_pretrained('Rostlab/prot_bert_bfd', do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(model_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/config.json
Model config BertConfig {
  "_name_or_path": "Rostlab/prot_bert_bfd",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 40000,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 30,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--Rostlab--prot_bert_bfd/snapshots/6c5c8a55a52ff08a664dfd584aa1773f125a0487/vocab.txt
loading file tokenizer.json from cac

In [None]:
# predict AMP/non-AMP for a single example (default ex. is from external test data: DRAMP00126)

#@markdown **Input peptide sequence (upper case only)**
input_seq = 'FNRGGYNFGKSVRHVVDAIGSVAGIRGILKSIR' #@param {type:"string"}
# put one space between consecutive residues, then map the letters U, Z, O and B to X
input_seq_spaced = re.sub(r'[UZOB]', 'X', ' '.join(input_seq))
input_seq_tok = tokenizer(input_seq_spaced, return_tensors = 'pt')

# inference only: no gradients are needed
with torch.no_grad():
    output = model(**input_seq_tok)
logits = output[0]

# extract AMP class probability and make binary prediction.
# The classification head emits one logit per class, so the class probabilities
# come from a softmax over the last axis; column 1 is the AMP class (label 1).
# Thresholding the softmax probability at 0.5 agrees with the argmax decision
# used for the evaluation metrics, which a per-logit sigmoid does not.
y_prob = torch.softmax(logits, dim=-1)[:, 1].cpu().numpy()
y_pred = y_prob > 0.5
# a single sequence was passed in, so y_pred holds exactly one boolean
input_class = 'AMP' if y_pred.item() else 'non-AMP'

print('Input peptide sequence: ' + input_seq)
print('Class prediction: ' + input_class)

Input peptide sequence: FNRGGYNFGKSVRHVVDAIGSVAGIRGILKSIR
Class prediction: AMP
