In [None]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, hamming_loss, jaccard_score
from torch.utils.data import Dataset, DataLoader

torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForSequenceClassification, AdamW

# Define a class for the AMP data that will correctly format the sequence information
# for fine-tuning with the Huggingface API
# The input DataFrame columns must be formatted the same way as the given example

class amp_data(Dataset):
    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len

        self.seqs, self.labels = self.get_seqs_labels(df)

    def get_seqs_labels(self, df):
        # Isolate the amino acid sequences and their respective AMP labels
        seqs = list(df['Sequence'])
        labels = list(df[['Antibacterial', 'Antiviral', 'Antiparasitic', 'Antifungal']].values)
        labels = torch.tensor(labels, dtype=torch.float32)
        return seqs, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample

# Read in the train dataset
# Create an amp_data class of the dataset

data_url = 'https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/treinamento'
df = pd.read_csv(data_url, index_col=None)  # Use index_col=None to prevent treating "Numero" as an index column
df = df.sample(frac=1, random_state=0)
print(df.head(7))
train_dataset = amp_data(df)

# Define the necessary metrics for performance evaluation

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions > 0.5  # 假设你的模型输出的是概率或者经过阈值处理的概率结果
    hamming_loss_val = hamming_loss(labels, preds)
    jaccard_score_val = jaccard_score(labels, preds, average='samples')  # 使用 'samples' 平均指标
    return {
        'hamming_loss': hamming_loss_val,
        'jaccard_score_samples': jaccard_score_val,
    }


# Define the model initializing function for Trainer in Huggingface

def model_init():
    return BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd', num_labels=4)

# Read in the evaluation dataset
eval_data_url = 'https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/teste'
eval_df = pd.read_csv(eval_data_url, index_col=None)
eval_df = eval_df.sample(frac=1, random_state=0)
eval_dataset = amp_data(eval_df)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    gradient_accumulation_steps=64,
    fp16=True,
    fp16_opt_level="O2",
    run_name="AMP-BERT",
    seed=0,
    load_best_model_at_end=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


cuda
                                        Sequence  Antibacterial  Antiviral  \
15                                     RWRWWWRVY              1          0   
3262                   CNIAPASIVSRNIVYTRAQPNQDIA              0          1   
499                       YPGPQAKEDSEGPSQGPASREK              1          0   
4132                               FIPLVSGLFSRLL              1          0   
4560                       DWTFANWSCLVCDDCSVNLTV              1          0   
608               ILQKAVLDCLKAAGSSLSKAAITAIYNKIT              1          0   
1362  GALWGAPAGGVGALPGAFVGAHVGAIAGGFACMGGMIGNKFN              1          0   

      Antiparasitic  Antifungal  Numero  
15                0           0       9  
3262              0           0      25  
499               0           0      22  
4132              0           1      13  
4560              0           1      21  
608               0           0      30  
1362              0           0      42  


  labels = torch.tensor(labels, dtype=torch.float32)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  sample['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,Loss,Jaccard Score Samples,Runtime,Samples Per Second,Steps Per Second
0,No log,0.298975,0.541473,0.502743,28.1233,65.888,8.249
1,0.499800,0.34687,0.523102,0.046096,28.1019,65.938,8.256
2,0.508900,0.212628,0.452154,0.645305,28.2752,65.535,8.205
3,0.446600,0.240826,0.475572,0.554236,28.2455,65.603,8.214
4,0.455200,0.248246,0.481494,0.58873,28.0937,65.958,8.258
5,0.469300,0.290205,0.475523,0.219194,27.8736,66.479,8.323


  sample['labels'] = torch.tensor(self.labels[idx])
  sample['labels'] = torch.tensor(self.labels[idx])
  sample['labels'] = torch.tensor(self.labels[idx])
  sample['labels'] = torch.tensor(self.labels[idx])
  sample['labels'] = torch.tensor(self.labels[idx])


In [2]:
!pip install transformers[torch] accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (fro