In [2]:
import os
import pandas as pd
import numpy as np
import torch
import re

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, hamming_loss, jaccard_score, f1_score, precision_score, recall_score
from torch.utils.data import Dataset, DataLoader

torch.cuda.is_available()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

from transformers import AutoTokenizer, Trainer, TrainingArguments, BertForSequenceClassification, AdamW

# Definir uma classe para os dados do AMP que formate corretamente as informações de sequência para o ajuste fino com a API do Huggingface.
# As colunas de entrada do DataFrame devem ser formatadas da mesma maneira que o exemplo fornecido

class amp_data(Dataset):
    def __init__(self, df, tokenizer_name='Rostlab/prot_bert_bfd', max_len=200):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.max_len = max_len

        self.seqs, self.labels = self.get_seqs_labels(df)

    def get_seqs_labels(self, df):
      # Isolar as sequências de aminoácidos e seus respectivos rótulos AMP
      seqs = df['Sequence'].tolist()
      labels = df[['Antibacterial', 'Antiviral', 'Antiparasitic', 'Antifungal']].values
      labels = torch.tensor(labels, dtype=torch.float32)
      return seqs, labels


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        seq = " ".join("".join(self.seqs[idx].split()))
        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_len)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = self.labels[idx].clone().detach()

        return sample

# Ler o conjunto de dados de treinamento

data_url = 'https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/treinamento'
df = pd.read_csv(data_url, index_col=None)  # Use index_col=None to prevent treating "Numero" as an index column
df = df.sample(frac=1, random_state=0)
print(df.head(7))
train_dataset = amp_data(df)

# Definir as métricas necessárias para avaliação de desempenho

from sklearn.metrics import hamming_loss, jaccard_score, accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions > 0.5

    hamming_loss_val = hamming_loss(labels, preds)
    jaccard_score_val = jaccard_score(labels, preds, average='samples')

    # Para a classificação de múltiplas etiquetas, é necessário calcular essas métricas para cada etiqueta individualmente e, 
    # em seguida, fazer a média ou média ponderada delas.
    accuracy_val = accuracy_score(labels, preds)
    f1_val = f1_score(labels, preds, average='samples')
    precision_val = precision_score(labels, preds, average='samples')
    recall_val = recall_score(labels, preds, average='samples')

    return {
        'hamming_loss': hamming_loss_val,
        'jaccard_score_samples': jaccard_score_val,
        'accuracy': accuracy_val,
        'f1': f1_val,
        'precision': precision_val,
        'recall': recall_val,
    }



# Definir a função de inicialização do modelo para o Trainer no Huggingface

def model_init():
    return BertForSequenceClassification.from_pretrained('Rostlab/prot_bert_bfd', num_labels=4)

# Ler o conjunto de dados de avaliação
eval_data_url = 'https://raw.githubusercontent.com/Kevinzhn/AMP-BERT-Multilabel/main/teste'
eval_df = pd.read_csv(eval_data_url, index_col=None)
eval_df = eval_df.sample(frac=1, random_state=0)
eval_dataset = amp_data(eval_df)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    warmup_steps=0,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=100,
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    gradient_accumulation_steps=64,
    fp16=True,
    fp16_opt_level="O2",
    run_name="AMP-BERT",
    seed=0,
    load_best_model_at_end=True
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


cuda
                                        Sequence  Antibacterial  Antiviral  \
15                                     RWRWWWRVY              1          0   
3262                   CNIAPASIVSRNIVYTRAQPNQDIA              0          1   
499                       YPGPQAKEDSEGPSQGPASREK              1          0   
4132                               FIPLVSGLFSRLL              1          0   
4560                       DWTFANWSCLVCDDCSVNLTV              1          0   
608               ILQKAVLDCLKAAGSSLSKAAITAIYNKIT              1          0   
1362  GALWGAPAGGVGALPGAFVGAHVGAIAGGFACMGGMIGNKFN              1          0   

      Antiparasitic  Antifungal  Numero  
15                0           0       9  
3262              0           0      25  
499               0           0      22  
4132              0           1      13  
4560              0           1      21  
608               0           0      30  
1362              0           0      42  


Downloading (…)okenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Rostlab/prot_bert_bfd and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Loss,Jaccard Score Samples,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second,Steps Per Second
0,No log,0.199002,0.427536,0.667071,0.566109,0.709516,0.821371,0.667071,29.4626,62.893,7.874
1,0.479700,0.191986,0.41647,0.677325,0.573125,0.720939,0.835402,0.677325,29.4668,62.884,7.873
2,0.403200,0.180923,0.390748,0.681912,0.577982,0.725256,0.83864,0.681912,29.4466,62.928,7.879
3,0.379800,0.17701,0.383349,0.70224,0.594172,0.747113,0.864004,0.70224,29.6673,62.459,7.82
4,0.366600,0.172963,0.376397,0.706062,0.599568,0.750225,0.865084,0.706062,29.6563,62.483,7.823


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=430, training_loss=0.403947011814561, metrics={'train_runtime': 3574.5343, 'train_samples_per_second': 7.738, 'train_steps_per_second': 0.12, 'total_flos': 1.2514196063232e+16, 'train_loss': 0.403947011814561, 'epoch': 4.97})

In [1]:
!pip install transformers[torch] accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from